Blame mythes.cxx

Packit 1184b9
#include "COPYING"
Packit 1184b9
#include <stdio.h>
Packit 1184b9
#include <string.h>
Packit 1184b9
#include <stdlib.h>
Packit 1184b9
#include <errno.h>
Packit 1184b9
#include <limits>
Packit 1184b9
#include <vector>
Packit 1184b9
Packit 1184b9
#include "mythes.hxx"
Packit 1184b9
Packit 1184b9
// Builds a thesaurus object from an index file (idxpath) and a data
// file (datpath).  When loading fails the problem is reported on stderr
// and thCleanup() is run on whatever had been set up so far.
MyThes::MyThes(const char* idxpath, const char * datpath)
{
    // start from a known-empty state so that thCleanup() is always safe
    pdfile = NULL;
    offst = NULL;
    list = NULL;
    encoding = NULL;
    nw = 0;

    const bool loaded = (thInitialize(idxpath, datpath) == 1);
    if (loaded) return;

    fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath);
    fflush(stderr);
    thCleanup();
    // did not initialize properly - throw exception?
}
Packit 1184b9
Packit 1184b9
Packit 1184b9
// All teardown work (closing the data file, freeing the index tables)
// is delegated to thCleanup().
MyThes::~MyThes()
{
    thCleanup();
}
Packit 1184b9
Packit 1184b9
Packit 1184b9
int MyThes::thInitialize(const char* idxpath, const char* datpath)
Packit 1184b9
{
Packit 1184b9
Packit 1184b9
    // open the index file
Packit 1184b9
    FILE * pifile = fopen(idxpath,"r");
Packit 1184b9
    if (!pifile) {
Packit 1184b9
        return 0;
Packit 1184b9
    } 
Packit 1184b9
Packit 1184b9
    // parse in encoding and index size */    
Packit 1184b9
    std::vector<char> buffer(MAX_WD_LEN);
Packit 1184b9
    char * wrd = &buffer[0];
Packit 1184b9
    readLine(pifile,wrd,MAX_WD_LEN);
Packit 1184b9
    encoding = mystrdup(wrd);
Packit 1184b9
    readLine(pifile,wrd,MAX_WD_LEN);
Packit 1184b9
    int idxsz = atoi(wrd); 
Packit 1184b9
   
Packit 1184b9
    if (idxsz <= 0 || idxsz > std::numeric_limits<int>::max() / sizeof(sizeof(char*))) {
Packit 1184b9
       fprintf(stderr,"Error - bad index %d\n", idxsz);
Packit 1184b9
       fclose(pifile);
Packit 1184b9
       return 0;
Packit 1184b9
    }
Packit 1184b9
Packit 1184b9
    // now allocate list, offst for the given size
Packit 1184b9
    list = (char**)   calloc(idxsz,sizeof(char*));
Packit 1184b9
    offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int));
Packit 1184b9
Packit 1184b9
    if ( (!(list)) || (!(offst)) ) {
Packit 1184b9
       fprintf(stderr,"Error - bad memory allocation\n");
Packit 1184b9
       fclose(pifile);
Packit 1184b9
       return 0;
Packit 1184b9
    }
Packit 1184b9
Packit 1184b9
    // now parse the remaining lines of the index
Packit 1184b9
    int len = readLine(pifile,wrd,MAX_WD_LEN);
Packit 1184b9
    while (len > 0)
Packit 1184b9
    { 
Packit 1184b9
        int np = mystr_indexOfChar(wrd,'|');
Packit 1184b9
        if (nw < idxsz) {
Packit 1184b9
            if (np >= 0) {          
Packit 1184b9
                *(wrd+np) = '\0';
Packit 1184b9
                list[nw] = (char *)calloc(1,(np+1));
Packit 1184b9
                if (!list[nw]) {
Packit 1184b9
                    fprintf(stderr,"Error - bad memory allocation\n");
Packit 1184b9
                    fflush(stderr);
Packit 1184b9
                    fclose(pifile);
Packit 1184b9
                    return 0;
Packit 1184b9
                }
Packit 1184b9
                memcpy((list[nw]),wrd,np);
Packit 1184b9
                offst[nw] = atoi(wrd+np+1);
Packit 1184b9
                nw++;
Packit 1184b9
            }
Packit 1184b9
        }
Packit 1184b9
        len = readLine(pifile,wrd,MAX_WD_LEN);
Packit 1184b9
    }
Packit 1184b9
Packit 1184b9
    fclose(pifile);
Packit 1184b9
Packit 1184b9
    /* next open the data file */
Packit 1184b9
    pdfile = fopen(datpath,"r");
Packit 1184b9
    if (!pdfile) {
Packit 1184b9
        return 0;
Packit 1184b9
    } 
Packit 1184b9
        
Packit 1184b9
    return 1;        
Packit 1184b9
}
Packit 1184b9
Packit 1184b9
Packit 1184b9
void MyThes::thCleanup()
Packit 1184b9
{
Packit 1184b9
    /* first close the data file */
Packit 1184b9
    if (pdfile) {
Packit 1184b9
        fclose(pdfile);
Packit 1184b9
        pdfile=NULL;
Packit 1184b9
    }
Packit 1184b9
Packit 1184b9
    if (list)
Packit 1184b9
    {
Packit 1184b9
        /* now free up all the allocated strings on the list */
Packit 1184b9
        for (int i=0; i < nw; i++) 
Packit 1184b9
        {
Packit 1184b9
            if (list[i]) {
Packit 1184b9
                free(list[i]);
Packit 1184b9
                list[i] = 0;
Packit 1184b9
            }
Packit 1184b9
        }
Packit 1184b9
        free((void*)list);
Packit 1184b9
    }
Packit 1184b9
Packit 1184b9
    if (encoding) free((void*)encoding);
Packit 1184b9
    if (offst) free((void*)offst);
Packit 1184b9
Packit 1184b9
    encoding = NULL;
Packit 1184b9
    list = NULL;
Packit 1184b9
    offst = NULL;
Packit 1184b9
    nw = 0;
Packit 1184b9
}
Packit 1184b9
Packit 1184b9
Packit 1184b9
Packit 1184b9
// lookup text in index and count of meanings and a list of meaning entries
Packit 1184b9
// with each entry having a synonym count and pointer to an 
Packit 1184b9
// array of char * (i.e the synonyms)
Packit 1184b9
// 
Packit 1184b9
// note: calling routine should call CleanUpAfterLookup with the original
Packit 1184b9
// meaning point and count to properly deallocate memory
Packit 1184b9
Packit 1184b9
int MyThes::Lookup(const char * pText, int len, mentry** pme)
Packit 1184b9
{ 
Packit 1184b9
Packit 1184b9
    *pme = NULL;
Packit 1184b9
Packit 1184b9
    // handle the case of missing file or file related errors
Packit 1184b9
    if (! pdfile) return 0;
Packit 1184b9
Packit 1184b9
    long offset = 0;
Packit 1184b9
Packit 1184b9
    /* copy search word and make sure null terminated */
Packit 1184b9
    std::vector<char> buffer(len+1);
Packit 1184b9
    char * wrd = &buffer[0];
Packit 1184b9
    memcpy(wrd,pText,len);
Packit 1184b9
  
Packit 1184b9
    /* find it in the list */
Packit 1184b9
    int idx = nw > 0 ? binsearch(wrd,list,nw) : -1;
Packit 1184b9
    if (idx < 0) return 0;
Packit 1184b9
Packit 1184b9
    // now seek to the offset
Packit 1184b9
    offset = (long) offst[idx];
Packit 1184b9
    int rc = fseek(pdfile,offset,SEEK_SET);
Packit 1184b9
    if (rc) {
Packit 1184b9
       return 0;
Packit 1184b9
    }
Packit 1184b9
Packit 1184b9
    // grab the count of the number of meanings
Packit 1184b9
    // and allocate a list of meaning entries
Packit 1184b9
    char * buf = NULL;
Packit 1184b9
    buf  = (char *) malloc( MAX_LN_LEN );
Packit 1184b9
    if (!buf) return 0;
Packit 1184b9
    readLine(pdfile, buf, (MAX_LN_LEN-1));
Packit 1184b9
    int np = mystr_indexOfChar(buf,'|');
Packit 1184b9
    if (np < 0) {
Packit 1184b9
         free(buf);
Packit 1184b9
         return 0;
Packit 1184b9
    }          
Packit 1184b9
    int nmeanings = atoi(buf+np+1);
Packit 1184b9
    if (nmeanings < 0 || nmeanings > std::numeric_limits<int>::max() / sizeof(mentry))
Packit 1184b9
        nmeanings = 0;
Packit 1184b9
    *pme = (mentry*)(nmeanings ? malloc(nmeanings * sizeof(mentry)) : NULL);
Packit 1184b9
    if (!(*pme)) {
Packit 1184b9
        free(buf);
Packit 1184b9
        return 0;
Packit 1184b9
    }
Packit 1184b9
Packit 1184b9
    // now read in each meaning and parse it to get defn, count and synonym lists
Packit 1184b9
    mentry* pm = *(pme);
Packit 1184b9
    char dfn[MAX_WD_LEN];
Packit 1184b9
Packit 1184b9
    for (int j = 0; j < nmeanings; j++) {
Packit 1184b9
        readLine(pdfile, buf, (MAX_LN_LEN-1));
Packit 1184b9
Packit 1184b9
        pm->count = 0;
Packit 1184b9
        pm->psyns = NULL;
Packit 1184b9
        pm->defn = NULL;
Packit 1184b9
Packit 1184b9
        // store away the part of speech for later use
Packit 1184b9
        char * p = buf;
Packit 1184b9
        char * pos = NULL;
Packit 1184b9
        np = mystr_indexOfChar(p,'|');
Packit 1184b9
        if (np >= 0) {
Packit 1184b9
           *(buf+np) = '\0';
Packit 1184b9
	   pos = mystrdup(p);
Packit 1184b9
           p = p + np + 1;
Packit 1184b9
	} else {
Packit 1184b9
          pos = mystrdup("");
Packit 1184b9
        }
Packit 1184b9
        
Packit 1184b9
        // count the number of fields in the remaining line
Packit 1184b9
        int nf = 1;
Packit 1184b9
        char * d = p;
Packit 1184b9
        np = mystr_indexOfChar(d,'|');        
Packit 1184b9
        while ( np >= 0 ) {
Packit 1184b9
	  nf++;
Packit 1184b9
          d = d + np + 1;
Packit 1184b9
          np = mystr_indexOfChar(d,'|');          
Packit 1184b9
	}
Packit 1184b9
	pm->count = nf;
Packit 1184b9
        pm->psyns = (char **) malloc(nf*sizeof(char*)); 
Packit 1184b9
        
Packit 1184b9
        // fill in the synonym list
Packit 1184b9
        d = p;
Packit 1184b9
        for (int jj = 0; jj < nf; jj++) 
Packit 1184b9
        {
Packit 1184b9
            np = mystr_indexOfChar(d,'|');
Packit 1184b9
            if (np > 0) 
Packit 1184b9
            {
Packit 1184b9
                *(d+np) = '\0';
Packit 1184b9
                pm->psyns[jj] = mystrdup(d);
Packit 1184b9
                d = d + np + 1;
Packit 1184b9
            } 
Packit 1184b9
            else 
Packit 1184b9
            {
Packit 1184b9
              pm->psyns[jj] = mystrdup(d);
Packit 1184b9
            }            
Packit 1184b9
        }
Packit 1184b9
Packit 1184b9
        // add pos to first synonym to create the definition
Packit 1184b9
        if (pm->psyns[0])
Packit 1184b9
	{
Packit 1184b9
            int k = strlen(pos);
Packit 1184b9
            int m = strlen(pm->psyns[0]);
Packit 1184b9
            if ((k+m) < (MAX_WD_LEN - 1)) {
Packit 1184b9
                 strncpy(dfn,pos,k);
Packit 1184b9
                 *(dfn+k) = ' ';
Packit 1184b9
                 strncpy((dfn+k+1),(pm->psyns[0]),m+1);
Packit 1184b9
                 pm->defn = mystrdup(dfn);
Packit 1184b9
            } else {
Packit 1184b9
                pm->defn = mystrdup(pm->psyns[0]);
Packit 1184b9
            }
Packit 1184b9
        }
Packit 1184b9
        free(pos);
Packit 1184b9
        pm++;
Packit 1184b9
Packit 1184b9
    }
Packit 1184b9
    free(buf);
Packit 1184b9
   
Packit 1184b9
    return nmeanings;
Packit 1184b9
} 
Packit 1184b9
Packit 1184b9
Packit 1184b9
Packit 1184b9
// Frees a result array handed out through a mentry** : every synonym
// string, each synonym array, each definition and finally the entry
// array itself, after which *pme is set to NULL.  Does nothing when
// nmeanings is 0 or *pme is NULL.
void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings)
{
    if (nmeanings == 0 || (*pme) == NULL) return;

    mentry * const entries = *pme;

    for (int i = 0; i < nmeanings; i++) {
        mentry & entry = entries[i];

        const int nsyn = entry.count;
        for (int s = 0; s < nsyn; s++) {
            free(entry.psyns[s]);    // free(NULL) is a no-op
            entry.psyns[s] = NULL;
        }

        free(entry.psyns);
        entry.psyns = NULL;

        free(entry.defn);
        entry.defn = NULL;

        entry.count = 0;
    }

    free(entries);
    *pme = NULL;
}
Packit 1184b9
Packit 1184b9
Packit 1184b9
// read a line of text from a text file stripping
Packit 1184b9
// off the line terminator and replacing it with
Packit 1184b9
// a null string terminator.
Packit 1184b9
// returns:  -1 on error or the number of characters in
Packit 1184b9
//             in the returning string
Packit 1184b9
Packit 1184b9
// A maximum of nc characters will be returned
Packit 1184b9
Packit 1184b9
int MyThes::readLine(FILE * pf, char * buf, int nc)
Packit 1184b9
{
Packit 1184b9
    
Packit 1184b9
  if (fgets(buf,nc,pf)) {
Packit 1184b9
    mychomp(buf);
Packit 1184b9
    return strlen(buf);
Packit 1184b9
  }
Packit 1184b9
  return -1;
Packit 1184b9
}
Packit 1184b9
Packit 1184b9
Packit 1184b9
 
Packit 1184b9
//  performs a binary search on null terminated character
Packit 1184b9
//  strings
Packit 1184b9
//
Packit 1184b9
//  returns: -1 on not found
Packit 1184b9
//           index of wrd in the list[]
Packit 1184b9
Packit 1184b9
int MyThes::binsearch(char * sw, char* _list[], int nlst) 
Packit 1184b9
{
Packit 1184b9
    int lp, up, mp, j, indx;
Packit 1184b9
    lp = 0;
Packit 1184b9
    up = nlst-1;
Packit 1184b9
    indx = -1;
Packit 1184b9
    if (strcmp(sw,_list[lp]) < 0) return -1;
Packit 1184b9
    if (strcmp(sw,_list[up]) > 0) return -1;
Packit 1184b9
    while (indx < 0 ) {
Packit 1184b9
        mp = (int)((lp+up) >> 1);
Packit 1184b9
        j = strcmp(sw,_list[mp]);
Packit 1184b9
        if ( j > 0) {
Packit 1184b9
            lp = mp + 1;
Packit 1184b9
        } else if (j < 0 ) {
Packit 1184b9
            up = mp - 1;
Packit 1184b9
        } else {
Packit 1184b9
            indx = mp;
Packit 1184b9
        }
Packit 1184b9
        if (lp > up) return -1;      
Packit 1184b9
    }
Packit 1184b9
    return indx;
Packit 1184b9
}
Packit 1184b9
Packit 1184b9
char * MyThes::get_th_encoding()
Packit 1184b9
{
Packit 1184b9
  return encoding;
Packit 1184b9
}
Packit 1184b9
Packit 1184b9
Packit 1184b9
// string duplication routine
Packit 1184b9
char * MyThes::mystrdup(const char * s)
Packit 1184b9
{
Packit 1184b9
  char * d = NULL;
Packit 1184b9
  if (s) {
Packit 1184b9
    int sl = strlen(s)+1;
Packit 1184b9
    d = (char *) malloc(sl);
Packit 1184b9
    if (d) memcpy(d,s,sl);
Packit 1184b9
  }
Packit 1184b9
  return d;
Packit 1184b9
}
Packit 1184b9
Packit 1184b9
// remove cross-platform text line end characters
Packit 1184b9
void MyThes::mychomp(char * s)
Packit 1184b9
{
Packit 1184b9
  int k = strlen(s);
Packit 1184b9
  if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
Packit 1184b9
  if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
Packit 1184b9
}
Packit 1184b9
Packit 1184b9
Packit 1184b9
// return index of char in string
Packit 1184b9
int MyThes::mystr_indexOfChar(const char * d, int c)
Packit 1184b9
{
Packit 1184b9
  char * p = strchr((char *)d,c);
Packit 1184b9
  if (p) return (int)(p-d);
Packit 1184b9
  return -1;
Packit 1184b9
}
Packit 1184b9