Blame doc/index.py

Packit 423ecb
#!/usr/bin/python -u
Packit 423ecb
#
Packit 423ecb
# imports the API description and fills up a database with
Packit 423ecb
# name relevance to modules, functions or web pages
Packit 423ecb
#
Packit 423ecb
# Operation needed:
Packit 423ecb
# =================
Packit 423ecb
#
Packit 423ecb
# install mysqld, the python wrappers for mysql and libxml2, start mysqld
Packit 423ecb
# Change the root passwd of mysql:
Packit 423ecb
#    mysqladmin -u root password new_password
Packit 423ecb
# Create the new database xmlsoft
Packit 423ecb
#    mysqladmin -p create xmlsoft
Packit 423ecb
# Create a database user 'veillard' and give him password access
Packit 423ecb
# change veillard and abcde with the right user name and passwd
Packit 423ecb
#    mysql -p
Packit 423ecb
#    password:
Packit 423ecb
#    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
Packit 423ecb
#           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
Packit 423ecb
#
Packit 423ecb
# As the user check the access:
Packit 423ecb
#    mysql -p xmlsoft
Packit 423ecb
#    Enter password:
Packit 423ecb
#    Welcome to the MySQL monitor....
Packit 423ecb
#    mysql> use xmlsoft
Packit 423ecb
#    Database changed
Packit 423ecb
#    mysql> quit
Packit 423ecb
#    Bye
Packit 423ecb
#
Packit 423ecb
# Then run the script in the doc subdir, it will create the symbols and
Packit 423ecb
# word tables and populate them with information extracted from
Packit 423ecb
# the libxml2-api.xml API description, and make them accessible read-only
Packit 423ecb
# by nobody@localhost, the user Apache is expected to run as
Packit 423ecb
#
Packit 423ecb
# On the Apache configuration, make sure you have php support enabled
Packit 423ecb
#
Packit 423ecb
Packit 423ecb
import MySQLdb
Packit 423ecb
import libxml2
Packit 423ecb
import sys
Packit 423ecb
import string
Packit 423ecb
import os
Packit 423ecb
Packit 423ecb
#
Packit 423ecb
# We are not interested in parsing errors here
Packit 423ecb
#
Packit 423ecb
def callback(ctx, str):
    """No-op libxml2 error handler: whatever the parser reports is dropped."""
    pass

# Install the silent handler so parsing errors are not reported.
libxml2.registerErrorHandler(callback, None)
Packit 423ecb
Packit 423ecb
#
Packit 423ecb
# The dictionary of tables required and the SQL command needed
Packit 423ecb
# to create them
Packit 423ecb
#
Packit 423ecb
# Maps each required table name to the CREATE TABLE statement building it.
TABLES = {}

# One row per API symbol, keyed by its name.
TABLES["symbols"] = """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
	   module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
	   descr varchar(255),
	   UNIQUE KEY name (name),
	   KEY module (module))"""

# Relevance of a word for a symbol.
TABLES["words"] = """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
	   symbol varchar(255) BINARY NOT NULL,
           relevance int,
	   KEY name (name),
	   KEY symbol (symbol),
	   UNIQUE KEY ID (name, symbol))"""

# Relevance of a word for a resource, with its section and id.
TABLES["wordsHTML"] = """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
	   resource varchar(255) BINARY NOT NULL,
	   section varchar(255),
	   id varchar(50),
           relevance int,
	   KEY name (name),
	   KEY resource (resource),
	   UNIQUE KEY ref (name, resource))"""

# Relevance of a word for an entry of the archives table.
TABLES["wordsArchive"] = """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
	   ID int(11) NOT NULL,
           relevance int,
	   KEY name (name),
	   UNIQUE KEY ref (name, ID))"""

# Title of each indexed resource.
TABLES["pages"] = """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY name (resource))"""

# Archived resources, identified by an auto-incremented ID.
TABLES["archives"] = """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY id (ID,resource(255)),
	   INDEX (ID),
	   INDEX (resource))"""

# Query counters; checkTables() grants nobody@localhost write access here.
TABLES["Queries"] = """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))"""

# Same layout as Queries.
TABLES["AllQueries"] = """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))"""
Packit 423ecb
Packit 423ecb
#
Packit 423ecb
# The XML API description file to parse
Packit 423ecb
#
Packit 423ecb
# Name of the XML API description file.
API = "libxml2-api.xml"

# Shared MySQL connection; stays None until openMySQL() assigns it.
DB = None
Packit 423ecb
Packit 423ecb
#########################################################################
Packit 423ecb
#									#
Packit 423ecb
#                  MySQL database interfaces				#
Packit 423ecb
#									#
Packit 423ecb
#########################################################################
Packit 423ecb
def createTable(db, name):
    """(Re)create one of the tables described in TABLES.

    Any existing table called `name` is dropped first, then the CREATE
    statement registered under `name` in TABLES is executed.

    db   -- open database connection (anything providing cursor())
    name -- key of the table in TABLES

    Returns the value of the CREATE execute() call, or -1 when an argument
    is None, `name` is not a TABLES key, or the statement fails.
    """
    global TABLES

    if db is None or name is None:
        return -1
    c = db.cursor()

    # Table names cannot be bound as SQL parameters; the caller visible in
    # this file (checkTables) only passes TABLES keys.
    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except Exception:
        # Covers an unknown table name as well as a database failure, but
        # no longer swallows KeyboardInterrupt/SystemExit as the bare
        # except did.
        print("Failed to create table %s" % (name))
        return -1
    return ret
Packit 423ecb
Packit 423ecb
def checkTables(db, verbose = 1):
    """Make sure every table of TABLES exists and can be counted.

    Missing tables are created through createTable(); a table whose row
    count cannot be read is repaired and counted again.  Finally read
    access (plus write access to Queries) is granted to nobody@localhost,
    ignoring any failure of the GRANT statements.

    db      -- open database connection
    verbose -- when true, print the table and record counts

    Returns -1 when db is None, 0 otherwise.
    """
    global TABLES

    if db is None:
        return -1
    cursor = db.cursor()
    found = cursor.execute("show tables")
    if verbose:
        print("Found %d tables" % (found))

    # collect the names of the tables currently in the database
    existing = {}
    fetched = 0
    while fetched < found:
        row = cursor.fetchone()
        existing[row[0]] = {}
        fetched = fetched + 1

    for table in TABLES.keys():
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        count_query = "SELECT count(*) from %s" % table
        try:
            cursor.execute(count_query)
            row = cursor.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except:
            # any failure while counting triggers a repair attempt
            print("Troubles with table %s : repairing" % (table))
            ret = cursor.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            cursor.execute(count_query)
            row = cursor.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        cursor.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        cursor.execute("GRANT INSERT,SELECT,UPDATE  ON xmlsoft.Queries TO nobody@localhost")
    except:
        pass
    return 0
Packit 423ecb
    
Packit 423ecb
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to MySQL, store the connection in DB and check the tables.

    db      -- name of the database to open
    passwd  -- password; when None it is read from the MySQL_PASS
               environment variable and the script exits with status 1 if
               that variable is not set
    verbose -- forwarded to checkTables()

    Returns the result of checkTables(), or -1 if no connection object
    was obtained.
    """
    global DB

    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
Packit 423ecb
Packit 423ecb
def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for `symbol` in the words table.

    An INSERT is tried first; if it fails (typically because the
    (name, symbol) pair already exists) the existing row is updated.
    The connection is opened on demand through openMySQL().

    Values are passed to the driver as bound parameters rather than being
    pasted into the SQL text, so quotes in the data cannot break or alter
    the statement.

    Returns the execute() result, or -1 on missing connection, None
    arguments or database failure.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        update = "UPDATE words SET relevance = %s where name = %s and symbol = %s"
        args = (relevance, name, symbol)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("%s %r" % (update, args))
            # sys.exc_type/exc_value are gone in Python 3; exc_info() works everywhere
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
Packit 423ecb
Packit 423ecb
def updateSymbol(name, module, type, desc):
    """Register symbol `name` of kind `type` from `module` in the symbols table.

    The symbol name is first indexed as a word for itself with relevance
    50.  Only the first sentence of `desc` (text before the first '.'),
    cut to 99 characters, is stored; a `desc` that is not a string is
    stored as "".  An INSERT is tried first, falling back to an UPDATE of
    the existing row.

    Values are sent as bound parameters instead of being interpolated
    into the SQL text.

    Returns the execute() result, or -1 on missing connection, None
    name/module/type or database failure.
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or module is None or type is None:
        return -1

    try:
        # Quotes are still blanked so the stored text stays the same as
        # before, even though bound parameters no longer require it.
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except Exception:
        desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) "
            "VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        update = "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s"
        args = (module, type, desc, name)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("%s %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
Packit 423ecb
        
Packit 423ecb
def addFunction(name, module, desc = ""):
    """Record `name` from `module` as a 'function' symbol; returns updateSymbol()'s result."""
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
Packit 423ecb
Packit 423ecb
def addMacro(name, module, desc = ""):
    """Record `name` from `module` as a 'macro' symbol; returns updateSymbol()'s result."""
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
Packit 423ecb
Packit 423ecb
def addEnum(name, module, desc = ""):
    """Record `name` from `module` as an 'enum' symbol; returns updateSymbol()'s result."""
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
Packit 423ecb
Packit 423ecb
def addStruct(name, module, desc = ""):
    """Record `name` from `module` as a 'struct' symbol; returns updateSymbol()'s result."""
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
Packit 423ecb
Packit 423ecb
def addConst(name, module, desc = ""):
    """Record `name` from `module` as a 'const' symbol; returns updateSymbol()'s result."""
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
Packit 423ecb
Packit 423ecb
def addType(name, module, desc = ""):
    """Record `name` from `module` as a 'type' symbol; returns updateSymbol()'s result."""
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
Packit 423ecb
Packit 423ecb
def addFunctype(name, module, desc = ""):
    """Record `name` from `module` as a 'functype' symbol; returns updateSymbol()'s result."""
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
Packit 423ecb
Packit 423ecb
def addPage(resource, title):
    """Record the title of page `resource` in the pages table.

    An INSERT is tried first; if it fails the title of the existing row
    is updated.  Values are sent as bound parameters, so a quote in the
    title can no longer break the statement.

    The failure message used to format the names `name` and `module`,
    which do not exist in this function, so a database error ended in a
    NameError; it now reports the arguments actually in use.

    Returns the execute() result, or -1 on missing connection, None
    resource or database failure.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1
    if title is None:
        # pages.title is declared NOT NULL, a bound None would be sent as NULL
        title = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        update = "UPDATE pages SET title=%s WHERE resource=%s"
        args = (title, resource)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update page (%s, %s) failed command" % (resource, title))
            print("%s %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
Packit 423ecb
Packit 423ecb
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for `resource` in wordsHTML.

    `desc` is stored in the section column after blanking single quotes
    and cutting it to 99 characters ("" when None or not a string); a
    None `id` is stored as "".  An INSERT is tried first, falling back to
    an UPDATE of the existing (name, resource) row.

    Values are sent as bound parameters instead of being interpolated
    into the SQL text.

    Returns the execute() result, or -1 on missing connection, None
    name/resource or database failure.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            # kept so the stored text is the same as before
            desc = desc.replace("'", " ")[0:99]
        except Exception:
            desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) "
            "VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        update = ("UPDATE wordsHTML SET section=%s, id=%s, relevance=%s "
                  "where name=%s and resource=%s")
        args = (desc, id, relevance, name, resource)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
            print("%s %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
Packit 423ecb
Packit 423ecb
def checkXMLMsgArchive(url):
    """Look up the ID recorded in the archives table for `url`.

    The URL is sent as a bound parameter instead of being interpolated
    into the SQL text.

    Returns the ID column of the first matching row, or -1 when there is
    no connection, `url` is None, the query fails or no row matches.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1

    return row[0]
Packit 423ecb
    
Packit 423ecb
def addXMLMsgArchive(url, title):
    """Insert `url` into the archives table and return its ID.

    The title is stored after blanking single quotes and cutting it to 99
    characters ("" when None).  After the INSERT the ID is read back with
    a SELECT on the resource.  Values are sent as bound parameters
    instead of being interpolated into the SQL text.

    Returns the ID as an int, or -1 when there is no connection, `url` is
    None, a statement fails or the ID cannot be read back.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        # kept so the stored text is the same as before
        title = title.replace("'", " ")[0:99]

    c = DB.cursor()
    # cmd always holds the statement being run, for the failure message
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
    except Exception:
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1
    if row is None:
        print("addXMLMsgArchive failed to get the ID: %s" % (url))
        return -1

    return int(row[0])
Packit 423ecb
Packit 423ecb
def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for archive entry `id`.

    An INSERT into wordsArchive is tried first; if it fails the relevance
    of the existing (name, ID) row is updated.  Values are sent as bound
    parameters instead of being interpolated into the SQL text.

    Returns the execute() result, or -1 on missing connection, None
    name/id or database failure.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or id is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        update = "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s"
        args = (relevance, name, id)
        try:
            ret = c.execute(update, args)
        except Exception:
            # %s rather than %d: the report must not fail on odd values
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("%s %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
Packit 423ecb
Packit 423ecb
#########################################################################
Packit 423ecb
#									#
Packit 423ecb
#                  Word dictionary and analysis routines		#
Packit 423ecb
#									#
Packit 423ecb
#########################################################################
Packit 423ecb
Packit 423ecb
#
Packit 423ecb
# Top 100 English words, minus those shorter than 3 letters, plus our own set
Packit 423ecb
#
Packit 423ecb
# Words that are never indexed; only the keys matter, every value is 0.
dropWords = dict.fromkeys((
    # common English words
    'the', 'this', 'can', 'man', 'had', 'him', 'only',
    'and', 'not', 'been', 'other', 'even', 'are', 'was',
    'new', 'most', 'but', 'when', 'some', 'made', 'from',
    'who', 'could', 'after', 'that', 'will', 'time', 'also',
    'have', 'more', 'these', 'did', 'two', 'many',
    'they', 'may', 'before', 'for', 'which', 'out', 'then',
    'must', 'one', 'through', 'with', 'you', 'said',
    'first', 'back', 'were', 'what', 'any', 'years', 'his',
    'her', 'where', 'all', 'its', 'now', 'much', 'she',
    'about', 'such', 'your', 'there', 'into', 'like',
    'would', 'than', 'our', 'well', 'their', 'them', 'over',
    'down',
    # own additions
    'net', 'www', 'bad', 'Okay', 'bin', 'cur',
), 0)
Packit 423ecb
Packit 423ecb
# word -> {(module, symbol): relevance}; addWord() replaces the inner dict
# with None once a word has more than 500 entries
wordsDict = dict()
# word -> {resource: (relevance, id, section)}, filled by addWordHTML()
wordsDictHTML = dict()
# word -> {archive id: relevance}, filled by addWordArchive()
wordsDictArchive = dict()
Packit 423ecb
Packit 423ecb
def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks turned into spaces.

    Each listed character is replaced by exactly one space, so the length
    of the text does not change and runs of spaces are left as they are.
    """
    separators = (
        ".", "!", "?", ",", "'", '"', ";",
        "(", ")", "{", "}", "<", ">",
        "=", "/", "*", ":", "#", "\\",
        "\n", "\r",
        # presumably the two bytes of a UTF-8 non-breaking space
        "\xc2", "\xa0",
    )
    for separator in separators:
        str = str.replace(separator, " ")
    return str
Packit 423ecb
    
Packit 423ecb
def cleanupDescrString(str):
    """Return `str` as a single line of words separated by one space.

    Single quotes, line breaks and the "\\xc2"/"\\xa0" characters are
    turned into spaces, then runs of whitespace are collapsed and leading
    and trailing whitespace is dropped.

    The previous code split the text into words but then joined the
    original string instead of the word list, which returned every
    character separated by a space.
    """
    for unwanted in ("'", "\n", "\r", "\xc2", "\xa0"):
        str = str.replace(unwanted, " ")
    return " ".join(str.split())
Packit 423ecb
Packit 423ecb
def splitIdentifier(str):
    """Split an identifier into a list of lowercase words.

    A word starts at a letter, continues over a run of uppercase letters
    and then a run of lowercase letters; digits directly following the
    word are dropped.  Characters that are not letters are skipped, e.g.
    "xmlParseFile" gives ["xml", "parse", "file"].
    """
    words = []
    pos = 0
    end = len(str)
    while pos < end:
        word = str[pos].lower()
        pos += 1
        if not ('a' <= word <= 'z'):
            # not a letter: cannot start a word
            continue
        while pos < end and 'A' <= str[pos] <= 'Z':
            word += str[pos].lower()
            pos += 1
        while pos < end and 'a' <= str[pos] <= 'z':
            word += str[pos]
            pos += 1
        while pos < end and '0' <= str[pos] <= '9':
            pos += 1
        words.append(word)
    return words
Packit 423ecb
Packit 423ecb
def addWord(word, module, symbol, relevance):
    """Add `relevance` to the score of `word` for (module, symbol) in wordsDict.

    Returns -1 for a None or shorter than 3 characters word or a None
    module/symbol, 0 when the word is ignored (listed in dropWords, first
    character code above 0x80, or word disabled), otherwise the
    accumulated relevance.  A word found with more than 500 entries is
    disabled by setting its wordsDict entry to None.
    """
    global wordsDict

    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    key = (module, symbol)
    if word not in wordsDict:
        wordsDict[word] = {}
    else:
        refs = wordsDict[word]
        if refs is None:
            return 0
        if len(refs) > 500:
            wordsDict[word] = None
            return 0
        try:
            relevance = relevance + refs[key]
        except:
            # no previous score for this (module, symbol)
            pass
    wordsDict[word][key] = relevance
    return relevance
Packit 423ecb
    
Packit 423ecb
def addString(str, module, symbol, relevance):
    """Index every word of `str` for (module, symbol) through addWord().

    The text is cleaned with cleanupWordsString() and split on
    whitespace; words longer than 2 characters are passed to addWord()
    with `relevance`.  The parameter used to be ignored in favour of a
    hard-coded 5, unlike addStringHTML() and addStringArchive().

    Returns -1 when `str` is None or shorter than 3 characters, otherwise
    the sum of the addWord() results.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            ret = ret + addWord(word, module, symbol, relevance)

    return ret
Packit 423ecb
Packit 423ecb
def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` to the score of `word` for `resource` in wordsDictHTML.

    When the word is already recorded for the resource, a non-None id and
    section from the existing entry are kept and the relevance is added
    to the previous one.

    Returns -1 for a None or shorter than 3 characters word or a None
    resource/section, 0 when the word is ignored, otherwise the
    accumulated relevance.
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word not in wordsDictHTML:
        wordsDictHTML[word] = {}
    else:
        known = wordsDictHTML[word]
        if known is None:
            print("skipped %s" % (word))
            return 0
        try:
            (r, i, s) = known[resource]
            if i != None:
                id = i
            if s != None:
                section = s
            relevance = relevance + r
        except:
            # word not seen for this resource yet
            pass
    entries = wordsDictHTML[word]
    entries[resource] = (relevance, id, section)
    return relevance
Packit 423ecb
    
Packit 423ecb
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for `resource` through addWordHTML().

    The text is cleaned with cleanupWordsString() and split on
    whitespace; only words longer than 2 characters are indexed.
    Failures are reported on stdout and do not stop the loop.

    Returns -1 when `str` is None or shorter than 3 characters, otherwise
    the sum of the addWordHTML() results (negative ones included).
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordHTML(word, resource, id, section, relevance)
            if r < 0:
                print("addWordHTML failed: %s %s" % (word, resource))
            total = total + r
        except:
            print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])

    return total
Packit 423ecb
Packit 423ecb
def addWordArchive(word, id, relevance):
    """Add `relevance` to the score of `word` for archive `id` in wordsDictArchive.

    Returns -1 for a None or shorter than 3 characters word or an id that
    is None or -1, 0 when the word is ignored, otherwise the accumulated
    relevance.
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word not in wordsDictArchive:
        wordsDictArchive[word] = {}
    else:
        known = wordsDictArchive[word]
        if known is None:
            print("skipped %s" % (word))
            return 0
        try:
            relevance = relevance + known[id]
        except:
            # word not seen for this archive entry yet
            pass
    entries = wordsDictArchive[word]
    entries[id] = relevance
    return relevance
Packit 423ecb
    
Packit 423ecb
def addStringArchive(str, id, relevance):
    """Index every word of `str` for archive `id` through addWordArchive().

    The text is cleaned with cleanupWordsString() and split on
    whitespace; only words longer than 2 characters are indexed.
    Failures are reported on stdout and left out of the total.

    Returns -1 when `str` is None or shorter than 3 characters, otherwise
    the sum of the non-negative addWordArchive() results.
    """
    if str is None or len(str) < 3:
        return -1
    total = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordArchive(word, id, relevance)
            if r < 0:
                print("addWordArchive failed: %s %s" % (word, id))
            else:
                total = total + r
        except:
            print("addWordArchive failed: %s %s %d" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return total
Packit 423ecb
Packit 423ecb
#########################################################################
Packit 423ecb
#									#
Packit 423ecb
#                  XML API description analysis				#
Packit 423ecb
#									#
Packit 423ecb
#########################################################################
Packit 423ecb
Packit 423ecb
def loadAPI(filename):
    """Parse `filename` with libxml2, report it on stdout and return the document."""
    api_doc = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return api_doc
Packit 423ecb
Packit 423ecb
def foundExport(file, symbol):
    """Register `symbol` exported by `file` as a function and index its words.

    Each word of the identifier is indexed with relevance 10.
    Returns 0 when either argument is None, 1 otherwise.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
Packit 423ecb
     
Packit 423ecb
def analyzeAPIFile(top):
    """Process the children of one <file> element of the API description.

    Every <exports> child is handed to foundExport() with the file name
    and its "symbol" attribute; text nodes are skipped and any other
    element is reported on stdout.

    The report used a format string with two placeholders but a single
    value, so an unexpected element raised TypeError instead of being
    reported.

    Returns the sum of the foundExport() results.
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                print("unexpected element %s in API doc <file name='%s'>" %
                      (cur.name, name))
        cur = cur.next
    return count
Packit 423ecb
Packit 423ecb
def analyzeAPIFiles(top):
    """Process the children of the <files> element of the API description.

    Every <file> child is handed to analyzeAPIFile(); text nodes are
    skipped and any other element is reported on stdout.

    Returns the sum of the analyzeAPIFile() results.
    """
    total = 0
    node = top.children
    while node is not None:
        if node.type != 'text':
            if node.name == "file":
                total = total + analyzeAPIFile(node)
            else:
                print("unexpected element %s in API doc <files>" % (node.name))
        node = node.next
    return total
Packit 423ecb
Packit 423ecb
def analyzeAPIEnum(top):
    """Register the enum described by element `top` and index its name.

    The "file" and "name" attributes give the module and the symbol;
    each word of the identifier is indexed with relevance 10.
    Returns 0 when either attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addEnum(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    return 1
Packit 423ecb
Packit 423ecb
def analyzeAPIConst(top):
    """Register the constant described by element `top` and index its name.

    The "file" and "name" attributes give the module and the symbol;
    each word of the identifier is indexed with relevance 10.
    Returns 0 when either attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addConst(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    return 1
Packit 423ecb
Packit 423ecb
def analyzeAPIType(top):
    """Register the type described by element `top` and index its name.

    The "file" and "name" attributes give the module and the symbol;
    each word of the identifier is indexed with relevance 10.
    Returns 0 when either attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addType(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
Packit 423ecb
Packit 423ecb
def analyzeAPIFunctype(top):
    """Register the function type described by element `top` and index its name.

    The "file" and "name" attributes give the module and the symbol;
    each word of the identifier is indexed with relevance 10.
    Returns 0 when either attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addFunctype(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)
    return 1
Packit 423ecb
Packit 423ecb
def analyzeAPIStruct(top):
    """Register the struct described by element `top` and index its words.

    The "file" and "name" attributes give the module and the symbol.
    Words of the identifier are indexed with relevance 10; words longer
    than 2 characters from the optional "info" attribute (single quotes
    blanked) are indexed with relevance 5.
    Returns 0 when "file" or "name" is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, module)
    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    info = top.prop("info")
    if info is None:
        return 1
    for word in info.replace("'", " ").strip().split():
        if len(word) > 2:
            addWord(word, module, symbol, 5)
    return 1
Packit 423ecb
Packit 423ecb
def analyzeAPIMacro(top):
    """Index one <macro> element of the API description.

    The macro name (single quotes replaced by spaces, then stripped) is
    split with splitIdentifier() and its components indexed with
    relevance 10.  The description is the content of the first non-text
    child named "info"; its words longer than two characters are indexed
    with relevance 5.

    Returns 1 when a description was found, 0 when the "file" or "name"
    attribute is missing.  A macro without <info> is still handed to
    addMacro(), but a warning is printed and 0 is returned.
    """
    module = top.prop("file")
    if module is None:
        return 0
    name = top.prop("name")
    if name is None:
        return 0
    name = name.replace("'", " ").strip()

    # Look for the first element child called <info>.
    info = None
    child = top.children
    while child is not None:
        if child.type != 'text' and child.name == "info":
            info = child.content
            break
        child = child.next

    for piece in splitIdentifier(name):
        addWord(piece, module, name, 10)

    if info is None:
        addMacro(name, module)
        print("Macro %s description has no <info>" % (name))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(name, module, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, module, name, 5)
    return 1
Packit 423ecb
Packit 423ecb
def analyzeAPIFunction(top):
    """Index one <function> element of the API description.

    Walks the element children, skipping text nodes:
      - <info>   : its content becomes the function description (the last
                   one wins, the walk does not stop at the first match);
      - <return> : its "info" attribute is indexed via addString(), weight 7;
      - <arg>    : its "info" attribute is indexed via addString(), weight 5,
                   and its "name" attribute via addWord(), weight 7.
    Single quotes are replaced by spaces and the text stripped before any
    string is handed over.

    The function is then passed to addFunction() (with an empty description
    and a printed warning when no <info> was found) and the components of
    its identifier are indexed with relevance 10.

    Returns 0 when the "file" or "name" attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    name = top.prop("name")
    if name is None:
        return 0
    name = name.replace("'", " ").strip()

    info = None
    child = top.children
    while child is not None:
        if child.type != 'text':
            tag = child.name
            if tag == "info":
                info = child.content
            elif tag == "return":
                rinfo = child.prop("info")
                if rinfo is not None:
                    addString(rinfo.replace("'", " ").strip(), module, name, 7)
            elif tag == "arg":
                ainfo = child.prop("info")
                if ainfo is not None:
                    addString(ainfo.replace("'", " ").strip(), module, name, 5)
                argname = child.prop("name")
                if argname is not None:
                    addWord(argname.replace("'", " ").strip(), module, name, 7)
        child = child.next

    if info is None:
        print("Function %s description has no <info>" % (name))
        addFunction(name, module, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(name, module, info)
        addString(info, module, name, 5)

    for piece in splitIdentifier(name):
        addWord(piece, module, name, 10)

    return 1
Packit 423ecb
Packit 423ecb
def analyzeAPISymbols(top):
    """Dispatch every child element of <symbols> to its analyzer.

    Text nodes are skipped.  Elements named macro, function, const,
    typedef, struct, enum and functype are passed to the matching
    analyzeAPI*() function; any other element is reported on stdout.

    Returns the sum of the values returned by the analyzers.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type != 'text':
            kind = cur.name
            if kind == "macro":
                count = count + analyzeAPIMacro(cur)
            elif kind == "function":
                count = count + analyzeAPIFunction(cur)
            elif kind == "const":
                count = count + analyzeAPIConst(cur)
            elif kind == "typedef":
                count = count + analyzeAPIType(cur)
            elif kind == "struct":
                count = count + analyzeAPIStruct(cur)
            elif kind == "enum":
                count = count + analyzeAPIEnum(cur)
            elif kind == "functype":
                count = count + analyzeAPIFunctype(cur)
            else:
                # The message used to name <files>, although this function
                # walks the children of <symbols>.
                print("unexpected element %s in API doc <symbols>" % (kind))
        cur = cur.next
    return count
Packit 423ecb
Packit 423ecb
def analyzeAPI(doc):
    """Analyze a parsed API description document.

    The root element must be called "api".  Among its element children,
    <symbols> is handed to analyzeAPISymbols(), <files> is deliberately
    ignored, and anything else is reported on stdout.

    Returns the accumulated result of analyzeAPISymbols(), or -1 when doc
    is None or the root element has an unexpected name.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1

    total = 0
    node = root.children
    while node is not None:
        if node.type != 'text':
            if node.name == "symbols":
                total = total + analyzeAPISymbols(node)
            elif node.name != "files":
                # <files> sections are accepted but not indexed.
                print("unexpected element %s in API doc" % (node.name))
        node = node.next
    return total
Packit 423ecb
Packit 423ecb
#########################################################################
Packit 423ecb
#									#
Packit 423ecb
#                  Web pages parsing and analysis			#
Packit 423ecb
#									#
Packit 423ecb
#########################################################################
Packit 423ecb
Packit 423ecb
import glob
Packit 423ecb
Packit 423ecb
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of a text node of an HTML page.

    The content of p is passed to addStringHTML() with relevance 5.
    Returns the value reported by addStringHTML(), or -1 if anything at
    all goes wrong.  The doc argument is not used.
    """
    try:
        indexed = addStringHTML(p.content, resource, id, section, 5)
        # Adding to 0 keeps rejecting a non-numeric result, as before.
        return 0 + indexed
    except:
        return -1
Packit 423ecb
Packit 423ecb
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of a <p> element of an HTML page.

    The content of p is passed to addStringHTML() with relevance 5.
    Returns the value reported by addStringHTML(), or -1 if anything at
    all goes wrong.  The doc argument is not used.
    """
    try:
        indexed = addStringHTML(p.content, resource, id, section, 5)
        # Adding to 0 keeps rejecting a non-numeric result, as before.
        return 0 + indexed
    except:
        return -1
Packit 423ecb
Packit 423ecb
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of a <pre> element of an HTML page.

    The content of p is passed to addStringHTML() with relevance 5.
    Returns the value reported by addStringHTML(), or -1 if anything at
    all goes wrong.  The doc argument is not used.
    """
    try:
        indexed = addStringHTML(p.content, resource, id, section, 5)
        # Adding to 0 keeps rejecting a non-numeric result, as before.
        return 0 + indexed
    except:
        return -1
Packit 423ecb
Packit 423ecb
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of an arbitrary node of an HTML page.

    NOTE(review): a second function named analyzeHTML(doc, resource) is
    defined right after this one and rebinds the name, so this
    five-argument variant cannot be reached by name once the module is
    loaded.

    Returns the value reported by addStringHTML(), or -1 if anything at
    all goes wrong.  The doc argument is not used.
    """
    try:
        indexed = addStringHTML(p.content, resource, id, section, 5)
        # Adding to 0 keeps rejecting a non-numeric result, as before.
        return 0 + indexed
    except:
        return -1
Packit 423ecb
Packit 423ecb
def analyzeHTML(doc, resource):
    """Index the text of one parsed HTML page.

    The page is registered with addPage() under the content of its
    //head/title element, or "Page <resource>" when that cannot be read.
    Every h1/h2/h3 element updates the current section title and anchor
    ("id" attribute, else "name" attribute); every text node is indexed
    through analyzeHTMLText() against the current section and anchor.

    A failure while walking the page is reported on stdout and ends the
    walk; the count reached so far is still returned.

    doc      -- parsed libxml2 document; it is not freed here
    resource -- name of the page, used as its key in the index

    Returns the number of nodes handed to the analyzers.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            title = ctxt.xpathEval("//head/title")[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)

        try:
            section = title
            anchor = ""
            for item in ctxt.xpathEval("//h1 | //h2 | //h3 | //text()"):
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        anchor = item.prop("id")
                    elif item.prop("name"):
                        anchor = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, anchor)
                    para = para + 1
                # The XPath expression above only selects headings and text
                # nodes, so the branches below are kept for safety only.
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, anchor)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, anchor)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource,
                                                              item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # XPath contexts of the libxml2 bindings are not garbage collected;
        # without this every analyzed page leaked one.
        ctxt.xpathFreeContext()

    return para
Packit 423ecb
Packit 423ecb
def analyzeHTMLPages():
    """Index the HTML pages of the current directory and of tutorial/.

    Pages whose path starts with "API" and the page "xml.html" are
    skipped.  Each remaining file is parsed as XML first and, if that
    fails, with the HTML parser; a page that neither parser accepts, or
    that analyzeHTML() fails on, is reported on stdout and skipped instead
    of aborting the whole run.

    Returns the number of pages successfully analyzed.
    """
    ret = 0
    for html in glob.glob("*.html") + glob.glob("tutorial/*.html"):
        if html.startswith("API") or html == "xml.html":
            continue

        try:
            try:
                doc = libxml2.parseFile(html)
            except Exception:
                # Not well-formed XML: retry with the forgiving HTML parser.
                doc = libxml2.htmlParseFile(html, None)
        except Exception:
            print("could not parse %s" % (html))
            continue

        try:
            try:
                res = analyzeHTML(doc, html)
                print("Parsed %s : %d paragraphs" % (html, res))
                ret = ret + 1
            except Exception:
                print("could not parse %s" % (html))
        finally:
            # libxml2 documents must be released explicitly.
            doc.freeDoc()
    return ret
Packit 423ecb
Packit 423ecb
#########################################################################
Packit 423ecb
#									#
Packit 423ecb
#                  Mail archives parsing and analysis			#
Packit 423ecb
#									#
Packit 423ecb
#########################################################################
Packit 423ecb
Packit 423ecb
import time
Packit 423ecb
Packit 423ecb
def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of the xml list archive.

    t -- time in seconds since the epoch selecting the month (interpreted
         in UTC); defaults to the current time.

    The month is spelled with a fixed English name rather than
    strftime("%B"), whose output follows the process locale and would
    produce a wrong URL under a non-English LC_TIME.
    """
    if t is None:
        t = time.time()
    T = time.gmtime(t)
    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")
    # T[0] is the year, T[1] the month number in 1..12.
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (
        T[0], months[T[1] - 1])
Packit 423ecb
Packit 423ecb
def scanXMLMsgArchive(url, title, force = 0):
    """Fetch one archived mail message and index its title and body.

    url   -- address of the message page
    title -- subject of the message, indexed with relevance 20
    force -- when 0, a message for which checkXMLMsgArchive() returns an
             ID other than -1 is skipped; otherwise it is scanned again

    The text nodes found below <pre> elements are indexed with
    relevance 5.

    Returns 1 when the message was loaded and indexed, 0 when it was
    skipped, could not be registered or could not be parsed.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    # Documents and XPath contexts of the libxml2 bindings have to be
    # released explicitly; previously both leaked for every message.
    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return 1
Packit 423ecb
Packit 423ecb
def scanXMLDateArchive(t = None, force = 0):
    """Scan the by-date index of one archive month and index its messages.

    t     -- time selecting the month, passed to getXMLDateArchive()
    force -- passed on to scanXMLMsgArchive()

    The global wordsDictArchive is reset to an empty dictionary first.
    Every anchor whose href starts with "msg" is resolved against the
    index URL; a leading "Re: " and then a leading "[xml] " are removed
    from its text, which is used as the message title.  A failure on one
    message is reported on stdout and does not stop the scan.

    Returns the number of messages indexed, or -1 when the index page
    could not be loaded.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    # The document and the XPath context must be freed explicitly.
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # Best effort: keep going, but no longer silently.
                    print("Failed to index message %s:" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
Packit 423ecb
    
Packit 423ecb
Packit 423ecb
#########################################################################
Packit 423ecb
#									#
Packit 423ecb
#          Main code: open the DB, the API XML and analyze it		#
Packit 423ecb
#									#
Packit 423ecb
#########################################################################
Packit 423ecb
def analyzeArchives(t = None, force = 0):
    """Index one month of the mail archive and store the associations.

    t     -- time selecting the month, passed to scanXMLDateArchive()
    force -- passed to scanXMLDateArchive()

    After the scan, every (word, message id, relevance) association found
    in the global wordsDictArchive is handed to updateWordArchive().
    Progress is reported on stdout; nothing is returned.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive),
                                                    ret))

    i = 0
    # Iterate over snapshots of the keys, as .keys() did.
    for word in list(wordsDictArchive):
        refs = wordsDictArchive[word]
        if refs is None:
            continue
        for id in list(refs):
            updateWordArchive(word, id, refs[id])
            i = i + 1

    # This summary used to say "HTML pages", copied from analyzeHTMLTop().
    print("Found %d associations in archive pages" % (i))
Packit 423ecb
Packit 423ecb
def analyzeHTMLTop():
    """Index the HTML pages and store the associations that were found.

    Runs analyzeHTMLPages(), then hands every entry of the global
    wordsDictHTML to updateWordHTML().  Each entry maps a word to a
    dictionary keyed by resource whose values are
    (relevance, id, section) tuples.  Progress is reported on stdout;
    nothing is returned.
    """
    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    found = 0
    # Walk snapshots of the keys, as the .keys() lists did.
    for word in list(wordsDictHTML):
        refs = wordsDictHTML[word]
        if refs is None:
            continue
        for resource in list(refs):
            relevance, anchor, section = refs[resource]
            updateWordHTML(word, resource, section, anchor, relevance)
            found = found + 1

    print("Found %d associations in HTML pages" % (found))
Packit 423ecb
Packit 423ecb
def analyzeAPITop():
    """Load and analyze the API description, then store the associations.

    The document named by the global API is loaded with loadAPI() and
    analyzed with analyzeAPI(); any failure is reported on stdout and
    terminates the program with status 1.  Afterwards every entry of the
    global wordsDict, a mapping from word to a dictionary keyed by
    (module, symbol), is handed to updateWord().
    """
    try:
        doc = loadAPI(API)
        blocks = analyzeAPI(doc)
        print("Analyzed %d blocs" % (blocks))
        doc.freeDoc()
    except:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    found = 0
    skipped = 0
    # Walk snapshots of the keys, as the .keys() lists did.
    for word in list(wordsDict):
        refs = wordsDict[word]
        if refs is None:
            skipped = skipped + 1
            continue
        for key in list(refs):
            module, symbol = key
            updateWord(word, symbol, refs[key])
            found = found + 1

    print("Found %d associations, skipped %d words" % (found, skipped))
Packit 423ecb
Packit 423ecb
def usage():
    """Print the command-line synopsis on stdout and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive]  [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
Packit 423ecb
Packit 423ecb
def _indexArchiveMonth(spec, force):
    """Index the archive of the month named by spec, e.g. "2003-January".

    Failures are reported on stdout and do not propagate.
    """
    try:
        T = time.strptime(spec, "%Y-%B")
        # Aim ten days into the month so that the UTC conversion done by
        # getXMLDateArchive() cannot fall back into the previous month.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])


def main():
    """Command-line entry point: open the database and run the options.

    Options are processed from left to right, so --force only affects
    the archive options that follow it:
      --force                  rescan messages that are already registered
      --archive                index the archive of the current month
      --archive-year YEAR      index the twelve months of YEAR
      --archive-month SPEC     index one month, SPEC being "YEAR-Month"
      --API                    index the API description
      --docs                   index the HTML pages

    An unknown option, a missing option value or an empty command line
    prints the usage and exits with status 1, as does a failure to open
    the database.
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")
    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg in ('--archive-year', '--archive-month'):
            i = i + 1
            if i >= len(args):
                # The value is missing: this used to end in an IndexError.
                usage()
            if arg == '--archive-year':
                for month in months:
                    _indexArchiveMonth("%s-%s" % (args[i], month), force)
            else:
                _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
Packit 423ecb
Packit 423ecb
if __name__ == "__main__":
    # Run the indexer only when executed as a script, not when imported.
    main()