/******************************************************************************
 *
 * Copyright (C) 1997-2015 by Dimitri van Heesch.
 *
 * Permission to use, copy, modify, and distribute this software and its
 * documentation under the terms of the GNU General Public License is hereby
 * granted. No representations are made about the suitability of this software
 * for any purpose. It is provided "as is" without express or implied warranty.
 * See the GNU General Public License for more details.
 *
 * Documents produced by Doxygen are derivative works derived from the
 * input used in their production; they are not affected by this license.
 *
 */

// STL includes
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <algorithm>
#include <sstream>

// Qtools includes
#include <qregexp.h>
#include <qxml.h>
#include <qfile.h>
#include <qfileinfo.h>

// Xapian include
#include <xapian.h>

/** Longest term (in bytes) that is handed to Xapian; longer terms are
 *  skipped by safeAddTerm().
 *  NOTE(review): 245 presumably mirrors the term length limit of Xapian's
 *  disk based backends - confirm against the Xapian version in use.
 */
#define MAX_TERM_LENGTH 245

/** Directory separator of the host platform. It is appended to the output
 *  directory given on the command line when that does not end with it yet.
 */
#if defined(_WIN32) && !defined(__CYGWIN__)
static const char pathSep = '\\';
#else
static const char pathSep = '/';
#endif

static void safeAddTerm(const std::string &term,Xapian::Document &doc,int wfd)
Packit 1c1d7e
{
Packit 1c1d7e
  if (term.length()<=MAX_TERM_LENGTH) doc.add_term(term,wfd);
Packit 1c1d7e
}
Packit 1c1d7e
Packit 1c1d7e
/** Trims the characters listed in \a whitespace from the start and end of
 *  string \a str.
 *
 *  @param str        the string to trim.
 *  @param whitespace set of characters that are treated as whitespace.
 *  @return a copy of \a str without leading and trailing whitespace, or an
 *          empty string if \a str consists of whitespace only.
 */
static std::string trim(const std::string& str,
                        const std::string& whitespace = " \t")
{
  const std::string::size_type strBegin = str.find_first_not_of(whitespace);
  if (strBegin == std::string::npos)
    return ""; // no content

  // a first non-whitespace character exists, so a last one exists as well;
  // the range is kept in size_type to avoid narrowing it to int
  const std::string::size_type strEnd = str.find_last_not_of(whitespace);
  return str.substr(strBegin, strEnd - strBegin + 1);
}

/** trims \a whitespace from start and end and replace occurrences of 
Packit 1c1d7e
 *  \a whitespace with \a fill.
Packit 1c1d7e
 */
Packit 1c1d7e
static std::string reduce(const std::string& str,
Packit 1c1d7e
    const std::string& fill = " ",
Packit 1c1d7e
    const std::string& whitespace = " \t")
Packit 1c1d7e
{
Packit 1c1d7e
  // trim first
Packit 1c1d7e
  std::string result = trim(str, whitespace);
Packit 1c1d7e
Packit 1c1d7e
  // replace sub ranges
Packit 1c1d7e
  size_t beginSpace = result.find_first_of(whitespace);
Packit 1c1d7e
  while (beginSpace != std::string::npos)
Packit 1c1d7e
  {
Packit 1c1d7e
    size_t endSpace = result.find_first_not_of(whitespace, beginSpace);
Packit 1c1d7e
    int range = endSpace - beginSpace;
Packit 1c1d7e
Packit 1c1d7e
    result.replace(beginSpace, range, fill);
Packit 1c1d7e
Packit 1c1d7e
    size_t newStart = beginSpace + fill.length();
Packit 1c1d7e
    beginSpace = result.find_first_of(whitespace, newStart);
Packit 1c1d7e
  }
Packit 1c1d7e
Packit 1c1d7e
  return result;
Packit 1c1d7e
}
Packit 1c1d7e
Packit 1c1d7e
/** Adds all words in \a s to document \a doc with weight \a wfd */
Packit 1c1d7e
static void addWords(const std::string &s,Xapian::Document &doc,int wfd)
Packit 1c1d7e
{
Packit 1c1d7e
  std::istringstream iss(s);
Packit 1c1d7e
  std::istream_iterator<std::string> begin(iss),end,it;
Packit 1c1d7e
  for (it=begin;it!=end;++it)
Packit 1c1d7e
  {
Packit 1c1d7e
    std::string word = *it;
Packit 1c1d7e
    std::string lword = word;
Packit 1c1d7e
    std::transform(lword.begin(), lword.end(), lword.begin(), ::tolower);
Packit 1c1d7e
    safeAddTerm(word,doc,wfd);
Packit 1c1d7e
    if (lword!=word)
Packit 1c1d7e
    {
Packit 1c1d7e
      safeAddTerm(lword,doc,wfd);
Packit 1c1d7e
    }
Packit 1c1d7e
  }
Packit 1c1d7e
}
Packit 1c1d7e
Packit 1c1d7e
/** Adds all identifiers in \a s to document \a doc with weight \a wfd */
Packit 1c1d7e
static void addIdentifiers(const std::string &s,Xapian::Document &doc,int wfd)
Packit 1c1d7e
{
Packit 1c1d7e
  QRegExp re("[A-Z_a-z][A-Z_a-z0-9]*");
Packit 1c1d7e
  int i,l,p=0;
Packit 1c1d7e
  QCString qs = s.c_str();
Packit 1c1d7e
  while ((i=re.match(qs,p,&l))!=-1)
Packit 1c1d7e
  {
Packit 1c1d7e
    safeAddTerm(qs.mid(p,i-p).data(),doc,wfd);
Packit 1c1d7e
    p=i+l;
Packit 1c1d7e
  }
Packit 1c1d7e
}
Packit 1c1d7e
Packit 1c1d7e
/** Replaces all occurrences of \a old with \a repl in string \a str.
 *
 *  The string is modified in place. Text inserted by a replacement is not
 *  scanned again, so \a repl may contain \a old. An empty \a old matches
 *  nothing and leaves \a str untouched.
 */
static void replace_all(std::string& str, const std::string& old, const std::string& repl) 
{
  // an empty pattern is found at every position and would never terminate
  if (old.empty()) return;

  std::string::size_type pos = 0;
  while ((pos = str.find(old, pos)) != std::string::npos) 
  {
    str.replace(pos, old.length(), repl);
    pos += repl.length(); // skip the text that was just inserted
  }
}

/** Replaces all XML entities in \a s with their unescaped representation */
Packit 1c1d7e
static std::string unescapeXmlEntities(const std::string &s)
Packit 1c1d7e
{
Packit 1c1d7e
  std::string result=s;
Packit 1c1d7e
  replace_all(result,">",">");
Packit 1c1d7e
  replace_all(result,"<","<");
Packit 1c1d7e
  replace_all(result,"'","'");
Packit 1c1d7e
  replace_all(result,""","\"");
Packit 1c1d7e
  replace_all(result,"&","&";;
Packit 1c1d7e
  return result;
Packit 1c1d7e
}
Packit 1c1d7e
Packit 1c1d7e
/** This class is a wrapper around SAX style XML parser, which
Packit 1c1d7e
 *  parses the file without first building a DOM tree in memory.
Packit 1c1d7e
 */
Packit 1c1d7e
class XMLContentHandler : public QXmlDefaultHandler
Packit 1c1d7e
{
Packit 1c1d7e
  public:
Packit 1c1d7e
    /** Handler for parsing XML data */
Packit 1c1d7e
    XMLContentHandler(const QString &path) 
Packit 1c1d7e
      : m_db((path+"doxysearch.db").utf8().data(),Xapian::DB_CREATE_OR_OVERWRITE), 
Packit 1c1d7e
        m_stemmer("english")
Packit 1c1d7e
    {
Packit 1c1d7e
      m_curFieldName = UnknownField;
Packit 1c1d7e
      m_indexer.set_stemmer(m_stemmer);
Packit 1c1d7e
      m_indexer.set_document(m_doc);
Packit 1c1d7e
    }
Packit 1c1d7e
Packit 1c1d7e
    /** Free data handler */
Packit 1c1d7e
   ~XMLContentHandler()
Packit 1c1d7e
    {
Packit 1c1d7e
      m_db.commit();
Packit 1c1d7e
    }
Packit 1c1d7e
Packit 1c1d7e
  private:
Packit 1c1d7e
    enum FieldNames
Packit 1c1d7e
    {
Packit 1c1d7e
      UnknownField = 0,
Packit 1c1d7e
      TypeField    = 1,
Packit 1c1d7e
      NameField    = 2,
Packit 1c1d7e
      ArgsField    = 3,
Packit 1c1d7e
      TagField     = 4,
Packit 1c1d7e
      UrlField     = 5,
Packit 1c1d7e
      KeywordField = 6,
Packit 1c1d7e
      TextField    = 7
Packit 1c1d7e
    };
Packit 1c1d7e
Packit 1c1d7e
    /** Handler for a start tag. Called for <doc> and <field> tags */
Packit 1c1d7e
    bool startElement(const QString &, const QString &,
Packit 1c1d7e
        const QString &name, const QXmlAttributes &attrib)
Packit 1c1d7e
    {
Packit 1c1d7e
      m_data="";
Packit 1c1d7e
      if (name=="field")
Packit 1c1d7e
      {
Packit 1c1d7e
        QString fieldName = attrib.value("name");
Packit 1c1d7e
        if      (fieldName=="type")     m_curFieldName=TypeField;
Packit 1c1d7e
        else if (fieldName=="name")     m_curFieldName=NameField;
Packit 1c1d7e
        else if (fieldName=="args")     m_curFieldName=ArgsField;
Packit 1c1d7e
        else if (fieldName=="tag")      m_curFieldName=TagField;
Packit 1c1d7e
        else if (fieldName=="url")      m_curFieldName=UrlField;
Packit 1c1d7e
        else if (fieldName=="keywords") m_curFieldName=KeywordField;
Packit 1c1d7e
        else if (fieldName=="text")     m_curFieldName=TextField;
Packit 1c1d7e
        else m_curFieldName=UnknownField;
Packit 1c1d7e
      }
Packit 1c1d7e
      return TRUE;
Packit 1c1d7e
    }
Packit 1c1d7e
Packit 1c1d7e
    /** Handler for an end tag. Called for </doc> and </field> tags */
Packit 1c1d7e
    bool endElement(const QString &, const QString &, const QString &name)
Packit 1c1d7e
    {
Packit 1c1d7e
      if (name=="doc") // </doc>
Packit 1c1d7e
      {
Packit 1c1d7e
        std::string term = m_doc.get_value(NameField);
Packit 1c1d7e
        std::string partTerm;
Packit 1c1d7e
        size_t pos = term.rfind("::");
Packit 1c1d7e
        if (pos!=std::string::npos)
Packit 1c1d7e
        {
Packit 1c1d7e
          partTerm = term.substr(pos+2);
Packit 1c1d7e
        }
Packit 1c1d7e
        if (m_doc.get_value(TypeField)=="class" || 
Packit 1c1d7e
            m_doc.get_value(TypeField)=="file" || 
Packit 1c1d7e
            m_doc.get_value(TypeField)=="namespace") // containers get highest prio
Packit 1c1d7e
        {
Packit 1c1d7e
          safeAddTerm(term,m_doc,1000);
Packit 1c1d7e
          if (!partTerm.empty())
Packit 1c1d7e
          {
Packit 1c1d7e
            safeAddTerm(partTerm,m_doc,500);
Packit 1c1d7e
          }
Packit 1c1d7e
        }
Packit 1c1d7e
        else // members and others get lower prio
Packit 1c1d7e
        {
Packit 1c1d7e
          safeAddTerm(m_doc.get_value(NameField),m_doc,100);
Packit 1c1d7e
          if (!partTerm.empty())
Packit 1c1d7e
          {
Packit 1c1d7e
            safeAddTerm(partTerm,m_doc,50);
Packit 1c1d7e
          }
Packit 1c1d7e
        }
Packit 1c1d7e
        m_db.add_document(m_doc);
Packit 1c1d7e
        m_doc.clear_values();
Packit 1c1d7e
        m_doc.clear_terms();
Packit 1c1d7e
      }
Packit 1c1d7e
      else if (name=="field" && m_curFieldName!=UnknownField) // </field>
Packit 1c1d7e
      {
Packit 1c1d7e
        // strip whitespace from m_data
Packit 1c1d7e
        m_data = reduce(m_data);
Packit 1c1d7e
        // replace XML entities
Packit 1c1d7e
        m_data = unescapeXmlEntities(m_data);
Packit 1c1d7e
        // add data to the document
Packit 1c1d7e
        m_doc.add_value(m_curFieldName,m_data); 
Packit 1c1d7e
        switch (m_curFieldName)
Packit 1c1d7e
        {
Packit 1c1d7e
          case TypeField:    
Packit 1c1d7e
          case NameField:    
Packit 1c1d7e
          case TagField:     
Packit 1c1d7e
          case UrlField:     
Packit 1c1d7e
            // meta data that is not searchable
Packit 1c1d7e
            break;
Packit 1c1d7e
          case KeywordField: 
Packit 1c1d7e
            addWords(m_data,m_doc,50);
Packit 1c1d7e
            break;
Packit 1c1d7e
          case ArgsField:    
Packit 1c1d7e
            addIdentifiers(m_data,m_doc,10);
Packit 1c1d7e
            break;
Packit 1c1d7e
          case TextField:    
Packit 1c1d7e
            addWords(m_data,m_doc,2);
Packit 1c1d7e
            break;
Packit 1c1d7e
          default:
Packit 1c1d7e
            break;
Packit 1c1d7e
        }
Packit 1c1d7e
        m_data="";
Packit 1c1d7e
        m_curFieldName=UnknownField;
Packit 1c1d7e
      }
Packit 1c1d7e
      // reset m_data
Packit 1c1d7e
      return TRUE;
Packit 1c1d7e
    }
Packit 1c1d7e
Packit 1c1d7e
    /** Handler for inline text */
Packit 1c1d7e
    bool characters(const QString& ch) 
Packit 1c1d7e
    {
Packit 1c1d7e
      m_data += ch.utf8();
Packit 1c1d7e
      return TRUE;
Packit 1c1d7e
    }
Packit 1c1d7e
Packit 1c1d7e
    // internal state
Packit 1c1d7e
    Xapian::WritableDatabase m_db;
Packit 1c1d7e
    Xapian::Document m_doc;
Packit 1c1d7e
    Xapian::TermGenerator m_indexer;
Packit 1c1d7e
    Xapian::Stem m_stemmer;
Packit 1c1d7e
    std::string m_data;
Packit 1c1d7e
    FieldNames m_curFieldName;
Packit 1c1d7e
};
Packit 1c1d7e
Packit 1c1d7e
/** Class for handling error during XML parsing */
Packit 1c1d7e
class XMLErrorHandler : public QXmlErrorHandler
Packit 1c1d7e
{
Packit 1c1d7e
  public:
Packit 1c1d7e
    virtual ~XMLErrorHandler() {}
Packit 1c1d7e
    bool warning( const QXmlParseException & )
Packit 1c1d7e
    {
Packit 1c1d7e
      return FALSE;
Packit 1c1d7e
    }
Packit 1c1d7e
    bool error( const QXmlParseException & )
Packit 1c1d7e
    {
Packit 1c1d7e
      return FALSE;
Packit 1c1d7e
    }
Packit 1c1d7e
    bool fatalError( const QXmlParseException &exception )
Packit 1c1d7e
    {
Packit 1c1d7e
      std::cerr << "Fatal error at line " << exception.lineNumber() 
Packit 1c1d7e
                << " column " << exception.columnNumber() << ": "
Packit 1c1d7e
                << exception.message().utf8() << std::endl;
Packit 1c1d7e
      return FALSE;
Packit 1c1d7e
    }
Packit 1c1d7e
    QString errorString() { return ""; }
Packit 1c1d7e
Packit 1c1d7e
  private:
Packit 1c1d7e
    QString errorMsg;
Packit 1c1d7e
};
Packit 1c1d7e
Packit 1c1d7e
/** Prints the command line syntax to standard error and terminates the
 *  program with exit code 1.
 *  @param name program name shown in the message.
 */
static void usage(const char *name)
{
  std::cerr << "Usage: " << name << " [-o output_dir] searchdata.xml [searchdata2.xml ...]" << std::endl;
  exit(1);
}

/** main function to index data */
Packit 1c1d7e
int main(int argc,const char **argv)
Packit 1c1d7e
{
Packit 1c1d7e
  if (argc<2)
Packit 1c1d7e
  {
Packit 1c1d7e
    usage(argv[0]);
Packit 1c1d7e
  }
Packit 1c1d7e
  QString outputDir;
Packit 1c1d7e
  for (int i=1;i
Packit 1c1d7e
  {
Packit 1c1d7e
    if (std::string(argv[i])=="-o")
Packit 1c1d7e
    {
Packit 1c1d7e
      if (i>=argc-1)
Packit 1c1d7e
      {
Packit 1c1d7e
        std::cerr << "Error: missing parameter for -o option" << std::endl;
Packit 1c1d7e
        usage(argv[0]);
Packit 1c1d7e
      }
Packit 1c1d7e
      else
Packit 1c1d7e
      {
Packit 1c1d7e
        i++;
Packit 1c1d7e
        outputDir=argv[i];
Packit 1c1d7e
        QFileInfo fi(outputDir);
Packit 1c1d7e
        if (!fi.exists() || !fi.isDir())
Packit 1c1d7e
        {
Packit 1c1d7e
          std::cerr << "Error: specified output directory does not exist!" << std::endl;
Packit 1c1d7e
          usage(argv[0]);
Packit 1c1d7e
        }
Packit 1c1d7e
      }
Packit 1c1d7e
    }
Packit 1c1d7e
    else if (std::string(argv[i])=="-h" || std::string(argv[i])=="--help")
Packit 1c1d7e
    {
Packit 1c1d7e
      usage(argv[0]);
Packit 1c1d7e
    }
Packit 1c1d7e
  }
Packit 1c1d7e
Packit 1c1d7e
  try
Packit 1c1d7e
  {
Packit 1c1d7e
    if (!outputDir.isEmpty() && outputDir.at(outputDir.length()-1)!=pathSep)
Packit 1c1d7e
    {
Packit 1c1d7e
      outputDir+=pathSep;
Packit 1c1d7e
    }
Packit 1c1d7e
    XMLContentHandler handler(outputDir);
Packit 1c1d7e
    XMLErrorHandler errorHandler;
Packit 1c1d7e
    for (int i=1;i
Packit 1c1d7e
    {
Packit 1c1d7e
      if (std::string(argv[i])=="-o")
Packit 1c1d7e
      {
Packit 1c1d7e
        i++;
Packit 1c1d7e
      }
Packit 1c1d7e
      else
Packit 1c1d7e
      {
Packit 1c1d7e
        QString xmlFileName = argv[i];
Packit 1c1d7e
        std::cout << "Processing " << xmlFileName.utf8() << "..." << std::endl;
Packit 1c1d7e
        QFile xmlFile(xmlFileName);
Packit 1c1d7e
        QXmlInputSource source(xmlFile);
Packit 1c1d7e
        QXmlSimpleReader reader;
Packit 1c1d7e
        reader.setContentHandler(&handler);
Packit 1c1d7e
        reader.setErrorHandler(&errorHandler);
Packit 1c1d7e
        reader.parse(source);
Packit 1c1d7e
      }
Packit 1c1d7e
    }
Packit 1c1d7e
  }
Packit 1c1d7e
  catch(const Xapian::Error &e) 
Packit 1c1d7e
  {
Packit 1c1d7e
    std::cerr << "Caught exception: " << e.get_description() << std::endl;
Packit 1c1d7e
  }
Packit 1c1d7e
  catch(...)
Packit 1c1d7e
  {
Packit 1c1d7e
    std::cerr << "Caught an unknown exception" << std::endl;
Packit 1c1d7e
  }
Packit 1c1d7e
Packit 1c1d7e
  return 0;
Packit 1c1d7e
}