/****************************************************************************** * * Copyright (C) 1997-2015 by Dimitri van Heesch. * * Permission to use, copy, modify, and distribute this software and its * documentation under the terms of the GNU General Public License is hereby * granted. No representations are made about the suitability of this software * for any purpose. It is provided "as is" without express or implied warranty. * See the GNU General Public License for more details. * * Documents produced by Doxygen are derivative works derived from the * input used in their production; they are not affected by this license. * */ // STL includes #include #include #include #include #include #include // Qtools includes #include #include #include #include // Xapian include #include #define MAX_TERM_LENGTH 245 #if defined(_WIN32) && !defined(__CYGWIN__) static char pathSep = '\\'; #else static char pathSep = '/'; #endif static void safeAddTerm(const std::string &term,Xapian::Document &doc,int wfd) { if (term.length()<=MAX_TERM_LENGTH) doc.add_term(term,wfd); } /** trims \a whitespace characters from the start and end of string \a str. */ static std::string trim(const std::string& str, const std::string& whitespace = " \t") { size_t strBegin = str.find_first_not_of(whitespace); if (strBegin == std::string::npos) return ""; // no content size_t strEnd = str.find_last_not_of(whitespace); int strRange = strEnd - strBegin + 1; return str.substr(strBegin, strRange); } /** trims \a whitespace from start and end and replace occurrences of * \a whitespace with \a fill. 
*/ static std::string reduce(const std::string& str, const std::string& fill = " ", const std::string& whitespace = " \t") { // trim first std::string result = trim(str, whitespace); // replace sub ranges size_t beginSpace = result.find_first_of(whitespace); while (beginSpace != std::string::npos) { size_t endSpace = result.find_first_not_of(whitespace, beginSpace); int range = endSpace - beginSpace; result.replace(beginSpace, range, fill); size_t newStart = beginSpace + fill.length(); beginSpace = result.find_first_of(whitespace, newStart); } return result; } /** Adds all words in \a s to document \a doc with weight \a wfd */ static void addWords(const std::string &s,Xapian::Document &doc,int wfd) { std::istringstream iss(s); std::istream_iterator begin(iss),end,it; for (it=begin;it!=end;++it) { std::string word = *it; std::string lword = word; std::transform(lword.begin(), lword.end(), lword.begin(), ::tolower); safeAddTerm(word,doc,wfd); if (lword!=word) { safeAddTerm(lword,doc,wfd); } } } /** Adds all identifiers in \a s to document \a doc with weight \a wfd */ static void addIdentifiers(const std::string &s,Xapian::Document &doc,int wfd) { QRegExp re("[A-Z_a-z][A-Z_a-z0-9]*"); int i,l,p=0; QCString qs = s.c_str(); while ((i=re.match(qs,p,&l))!=-1) { safeAddTerm(qs.mid(p,i-p).data(),doc,wfd); p=i+l; } } /** Replaces all occurrences of \a old with \a repl in string \a str */ static void replace_all(std::string& str, const std::string& old, const std::string& repl) { size_t pos = 0; while ((pos = str.find(old, pos)) != std::string::npos) { str.replace(pos, old.length(), repl); pos += repl.length(); } } /** Replaces all XML entities in \a s with their unescaped representation */ static std::string unescapeXmlEntities(const std::string &s) { std::string result=s; replace_all(result,">",">"); replace_all(result,"<","<"); replace_all(result,"'","'"); replace_all(result,""","\""); replace_all(result,"&","&"); return result; } /** This class is a wrapper around SAX 
style XML parser, which * parses the file without first building a DOM tree in memory. */ class XMLContentHandler : public QXmlDefaultHandler { public: /** Handler for parsing XML data */ XMLContentHandler(const QString &path) : m_db((path+"doxysearch.db").utf8().data(),Xapian::DB_CREATE_OR_OVERWRITE), m_stemmer("english") { m_curFieldName = UnknownField; m_indexer.set_stemmer(m_stemmer); m_indexer.set_document(m_doc); } /** Free data handler */ ~XMLContentHandler() { m_db.commit(); } private: enum FieldNames { UnknownField = 0, TypeField = 1, NameField = 2, ArgsField = 3, TagField = 4, UrlField = 5, KeywordField = 6, TextField = 7 }; /** Handler for a start tag. Called for and tags */ bool startElement(const QString &, const QString &, const QString &name, const QXmlAttributes &attrib) { m_data=""; if (name=="field") { QString fieldName = attrib.value("name"); if (fieldName=="type") m_curFieldName=TypeField; else if (fieldName=="name") m_curFieldName=NameField; else if (fieldName=="args") m_curFieldName=ArgsField; else if (fieldName=="tag") m_curFieldName=TagField; else if (fieldName=="url") m_curFieldName=UrlField; else if (fieldName=="keywords") m_curFieldName=KeywordField; else if (fieldName=="text") m_curFieldName=TextField; else m_curFieldName=UnknownField; } return TRUE; } /** Handler for an end tag. 
Called for and tags */ bool endElement(const QString &, const QString &, const QString &name) { if (name=="doc") // { std::string term = m_doc.get_value(NameField); std::string partTerm; size_t pos = term.rfind("::"); if (pos!=std::string::npos) { partTerm = term.substr(pos+2); } if (m_doc.get_value(TypeField)=="class" || m_doc.get_value(TypeField)=="file" || m_doc.get_value(TypeField)=="namespace") // containers get highest prio { safeAddTerm(term,m_doc,1000); if (!partTerm.empty()) { safeAddTerm(partTerm,m_doc,500); } } else // members and others get lower prio { safeAddTerm(m_doc.get_value(NameField),m_doc,100); if (!partTerm.empty()) { safeAddTerm(partTerm,m_doc,50); } } m_db.add_document(m_doc); m_doc.clear_values(); m_doc.clear_terms(); } else if (name=="field" && m_curFieldName!=UnknownField) // { // strip whitespace from m_data m_data = reduce(m_data); // replace XML entities m_data = unescapeXmlEntities(m_data); // add data to the document m_doc.add_value(m_curFieldName,m_data); switch (m_curFieldName) { case TypeField: case NameField: case TagField: case UrlField: // meta data that is not searchable break; case KeywordField: addWords(m_data,m_doc,50); break; case ArgsField: addIdentifiers(m_data,m_doc,10); break; case TextField: addWords(m_data,m_doc,2); break; default: break; } m_data=""; m_curFieldName=UnknownField; } // reset m_data return TRUE; } /** Handler for inline text */ bool characters(const QString& ch) { m_data += ch.utf8(); return TRUE; } // internal state Xapian::WritableDatabase m_db; Xapian::Document m_doc; Xapian::TermGenerator m_indexer; Xapian::Stem m_stemmer; std::string m_data; FieldNames m_curFieldName; }; /** Class for handling error during XML parsing */ class XMLErrorHandler : public QXmlErrorHandler { public: virtual ~XMLErrorHandler() {} bool warning( const QXmlParseException & ) { return FALSE; } bool error( const QXmlParseException & ) { return FALSE; } bool fatalError( const QXmlParseException &exception ) { std::cerr << 
"Fatal error at line " << exception.lineNumber()
                << " column " << exception.columnNumber()
                << ": " << exception.message().utf8() << std::endl;
      return FALSE;
    }
    /** Returns an empty string; this handler provides no error text. */
    QString errorString() { return ""; }

  private:
    QString errorMsg; // not referenced anywhere in this part of the file
};

/** Prints the command line syntax of the program to standard error and
 *  terminates the process with exit code 1.
 *  @param name the program name to show in the usage line.
 */
static void usage(const char *name)
{
  std::cerr << "Usage: " << name << " [-o output_dir] searchdata.xml [searchdata2.xml ...]" << std::endl;
  exit(1);
}

/** main function to index data */
int main(int argc,const char **argv)
{
  if (argc<2)
  {
    usage(argv[0]);
  }
  QString outputDir;
  for (int i=1;i=argc-1)
      {
        std::cerr << "Error: missing parameter for -o option" << std::endl;
        usage(argv[0]);
      }
      else
      {
        i++;
        outputDir=argv[i];
        QFileInfo fi(outputDir);
        if (!fi.exists() || !fi.isDir())
        {
          std::cerr << "Error: specified output directory does not exist!" << std::endl;
          usage(argv[0]);
        }
      }
    }
    else if (std::string(argv[i])=="-h" || std::string(argv[i])=="--help")
    {
      usage(argv[0]);
    }
  }
  try
  {
    if (!outputDir.isEmpty() && outputDir.at(outputDir.length()-1)!=pathSep)
    {
      outputDir+=pathSep;
    }
    XMLContentHandler handler(outputDir);
    XMLErrorHandler errorHandler;
    for (int i=1;i