/* dbdimp_tokenizer.inc */

Packit 723767
typedef struct perl_tokenizer {
Packit 723767
    sqlite3_tokenizer base;
Packit 723767
    SV *coderef;                 /* the perl tokenizer is a coderef that takes
Packit 723767
                                    a string and returns a cursor coderef */
Packit 723767
} perl_tokenizer;
Packit 723767
Packit 723767
typedef struct perl_tokenizer_cursor {
Packit 723767
    sqlite3_tokenizer_cursor base;
Packit 723767
    SV *coderef;                 /* ref to the closure that returns terms */
Packit 723767
    char *pToken;                /* storage for a copy of the last token */
Packit 723767
    int nTokenAllocated;         /* space allocated to pToken buffer */
Packit 723767
Packit 723767
    /* members below are only used if the input string is in utf8 */
Packit 723767
    const char *pInput;          /* input we are tokenizing */
Packit 723767
    const char *lastByteOffset;  /* offset into pInput */
Packit 723767
    int lastCharOffset;          /* char offset corresponding to lastByteOffset */
Packit 723767
} perl_tokenizer_cursor;
Packit 723767
Packit 723767
/*
Packit 723767
** Create a new tokenizer instance.
Packit 723767
** Will be called whenever a FTS3 table is created with
Packit 723767
**   CREATE .. USING fts3( ... , tokenize=perl qualified::function::name)
Packit 723767
** where qualified::function::name is a fully qualified perl function
Packit 723767
*/
Packit 723767
static int perl_tokenizer_Create(
Packit 723767
    int argc, const char * const *argv,
Packit 723767
    sqlite3_tokenizer **ppTokenizer
Packit 723767
){
Packit 723767
    dTHX;
Packit 723767
    dSP;
Packit 723767
    int n_retval;
Packit 723767
    SV *retval;
Packit 723767
    perl_tokenizer *t;
Packit 723767
Packit 723767
    if (!argc) {
Packit 723767
        return SQLITE_ERROR;
Packit 723767
    }
Packit 723767
Packit 723767
    t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t));
Packit 723767
    if( t==NULL ) return SQLITE_NOMEM;
Packit 723767
    memset(t, 0, sizeof(*t));
Packit 723767
Packit 723767
    ENTER;
Packit 723767
    SAVETMPS;
Packit 723767
Packit 723767
    /* call the qualified::function::name */
Packit 723767
    PUSHMARK(SP);
Packit 723767
    PUTBACK;
Packit 723767
    n_retval = call_pv(argv[0], G_SCALAR);
Packit 723767
    SPAGAIN;
Packit 723767
Packit 723767
    /* store a copy of the returned coderef into the tokenizer structure */
Packit 723767
    if (n_retval != 1) {
Packit 723767
        warn("tokenizer_Create returned %d arguments", n_retval);
Packit 723767
    }
Packit 723767
    retval = POPs;
Packit 723767
    t->coderef   = newSVsv(retval);
Packit 723767
    *ppTokenizer = &t->base;
Packit 723767
Packit 723767
    PUTBACK;
Packit 723767
    FREETMPS;
Packit 723767
    LEAVE;
Packit 723767
Packit 723767
    return SQLITE_OK;
Packit 723767
}
Packit 723767
Packit 723767
/*
Packit 723767
** Destroy a tokenizer
Packit 723767
*/
Packit 723767
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
Packit 723767
    dTHX;
Packit 723767
    perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
Packit 723767
    sv_free(t->coderef);
Packit 723767
    sqlite3_free(t);
Packit 723767
    return SQLITE_OK;
Packit 723767
}
Packit 723767
Packit 723767
/*
Packit 723767
** Prepare to begin tokenizing a particular string.  The input
Packit 723767
** string to be tokenized is supposed to be pInput[0..nBytes-1] ..
Packit 723767
** except that nBytes passed by fts3 is -1 (don't know why) !
Packit 723767
** This is passed to the tokenizer instance, which then returns a
Packit 723767
** closure implementing the cursor (so the cursor is again a coderef).
Packit 723767
*/
Packit 723767
static int perl_tokenizer_Open(
Packit 723767
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
Packit 723767
    const char *pInput, int nBytes,      /* Input buffer */
Packit 723767
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
Packit 723767
){
Packit 723767
    dTHX;
Packit 723767
    dSP;
Packit 723767
    dMY_CXT;
Packit 723767
    U32 flags;
Packit 723767
    SV *perl_string;
Packit 723767
    int n_retval;
Packit 723767
Packit 723767
    perl_tokenizer *t = (perl_tokenizer *)pTokenizer;
Packit 723767
Packit 723767
    /* allocate and initialize the cursor struct */
Packit 723767
    perl_tokenizer_cursor *c;
Packit 723767
    c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
Packit 723767
    memset(c, 0, sizeof(*c));
Packit 723767
    *ppCursor = &c->base;
Packit 723767
Packit 723767
    /* flags for creating the Perl SV containing the input string */
Packit 723767
    flags = SVs_TEMP; /* will call sv_2mortal */
Packit 723767
Packit 723767
    /* special handling if working with utf8 strings */
Packit 723767
    if (MY_CXT.last_dbh_is_unicode) {
Packit 723767
Packit 723767
        /* data to keep track of byte offsets */
Packit 723767
        c->lastByteOffset = c->pInput = pInput;
Packit 723767
        c->lastCharOffset = 0;
Packit 723767
Packit 723767
        /* string passed to Perl needs to be flagged as utf8 */
Packit 723767
        flags |= SVf_UTF8;
Packit 723767
    }
Packit 723767
Packit 723767
    ENTER;
Packit 723767
    SAVETMPS;
Packit 723767
Packit 723767
    /* build a Perl copy of the input string */
Packit 723767
    if (nBytes < 0) { /* we get -1 from fts3. Don't know why ! */
Packit 723767
        nBytes = strlen(pInput);
Packit 723767
    }
Packit 723767
    perl_string = newSVpvn_flags(pInput, nBytes, flags);
Packit 723767
Packit 723767
    /* call the tokenizer coderef */
Packit 723767
    PUSHMARK(SP);
Packit 723767
    XPUSHs(perl_string);
Packit 723767
    PUTBACK;
Packit 723767
    n_retval = call_sv(t->coderef, G_SCALAR);
Packit 723767
    SPAGAIN;
Packit 723767
Packit 723767
    /* store the cursor coderef returned by the tokenizer */
Packit 723767
    if (n_retval != 1) {
Packit 723767
        warn("tokenizer returned %d arguments", n_retval);
Packit 723767
    }
Packit 723767
    c->coderef = newSVsv(POPs);
Packit 723767
Packit 723767
    PUTBACK;
Packit 723767
    FREETMPS;
Packit 723767
    LEAVE;
Packit 723767
    return SQLITE_OK;
Packit 723767
}
Packit 723767
Packit 723767
/*
Packit 723767
** Close a tokenization cursor previously opened by a call to
Packit 723767
** perl_tokenizer_Open() above.
Packit 723767
*/
Packit 723767
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
Packit 723767
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
Packit 723767
Packit 723767
    dTHX;
Packit 723767
    sv_free(c->coderef);
Packit 723767
    if (c->pToken) sqlite3_free(c->pToken);
Packit 723767
    sqlite3_free(c);
Packit 723767
    return SQLITE_OK;
Packit 723767
}
Packit 723767
Packit 723767
/*
Packit 723767
** Extract the next token from a tokenization cursor.  The cursor must
Packit 723767
** have been opened by a prior call to perl_tokenizer_Open().
Packit 723767
*/
Packit 723767
static int perl_tokenizer_Next(
Packit 723767
    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
Packit 723767
    const char **ppToken,               /* OUT: *ppToken is the token text */
Packit 723767
    int *pnBytes,                       /* OUT: Number of bytes in token */
Packit 723767
    int *piStartOffset,                 /* OUT: Starting offset of token */
Packit 723767
    int *piEndOffset,                   /* OUT: Ending offset of token */
Packit 723767
    int *piPosition                     /* OUT: Position integer of token */
Packit 723767
){
Packit 723767
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
Packit 723767
    int result;
Packit 723767
    int n_retval;
Packit 723767
    char *token;
Packit 723767
    char *byteOffset;
Packit 723767
    STRLEN n_a; /* this is required for older perls < 5.8.8 */
Packit 723767
    I32 hop;
Packit 723767
Packit 723767
    dTHX;
Packit 723767
    dSP;
Packit 723767
Packit 723767
    ENTER;
Packit 723767
    SAVETMPS;
Packit 723767
Packit 723767
    /* call the cursor */
Packit 723767
    PUSHMARK(SP);
Packit 723767
    PUTBACK;
Packit 723767
    n_retval = call_sv(c->coderef, G_ARRAY);
Packit 723767
    SPAGAIN;
Packit 723767
Packit 723767
    /* if we get back an empty list, there is no more token */
Packit 723767
    if (n_retval == 0) {
Packit 723767
        result = SQLITE_DONE;
Packit 723767
    }
Packit 723767
    /* otherwise, get token details from the return list */
Packit 723767
    else {
Packit 723767
        if (n_retval != 5) {
Packit 723767
            warn("tokenizer cursor returned %d arguments", n_retval);
Packit 723767
        }
Packit 723767
        *piPosition    = POPi;
Packit 723767
        *piEndOffset   = POPi;
Packit 723767
        *piStartOffset = POPi;
Packit 723767
        *pnBytes       = POPi;
Packit 723767
        token          = POPpx;
Packit 723767
Packit 723767
        if (c->pInput) { /* if working with utf8 data */
Packit 723767
Packit 723767
            /* recompute *pnBytes in bytes, not in chars */
Packit 723767
            *pnBytes = strlen(token);
Packit 723767
Packit 723767
            /* recompute start/end offsets in bytes, not in chars */
Packit 723767
            hop            = *piStartOffset - c->lastCharOffset;
Packit 723767
            byteOffset     = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
Packit 723767
            hop            = *piEndOffset - *piStartOffset;
Packit 723767
            *piStartOffset = byteOffset - c->pInput;
Packit 723767
            byteOffset     = (char*)utf8_hop((U8*)byteOffset, hop);
Packit 723767
            *piEndOffset   = byteOffset - c->pInput;
Packit 723767
Packit 723767
            /* remember where we are for next round */
Packit 723767
            c->lastCharOffset = *piEndOffset,
Packit 723767
            c->lastByteOffset = byteOffset;
Packit 723767
        }
Packit 723767
Packit 723767
        /* make sure we have enough storage for copying the token */
Packit 723767
        if (*pnBytes > c->nTokenAllocated ){
Packit 723767
            char *pNew;
Packit 723767
            c->nTokenAllocated = *pnBytes + 20;
Packit 723767
            pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
Packit 723767
            if( !pNew ) return SQLITE_NOMEM;
Packit 723767
            c->pToken = pNew;
Packit 723767
        }
Packit 723767
Packit 723767
        /* need to copy the token into the C cursor before perl frees that
Packit 723767
           memory */
Packit 723767
        memcpy(c->pToken, token, *pnBytes);
Packit 723767
        *ppToken  = c->pToken;
Packit 723767
Packit 723767
        result = SQLITE_OK;
Packit 723767
    }
Packit 723767
Packit 723767
    PUTBACK;
Packit 723767
    FREETMPS;
Packit 723767
    LEAVE;
Packit 723767
Packit 723767
    return result;
Packit 723767
}
Packit 723767
Packit 723767
/*
Packit 723767
** The set of routines that implement the perl tokenizer
Packit 723767
*/
Packit 723767
sqlite3_tokenizer_module perl_tokenizer_Module = {
Packit 723767
    0,
Packit 723767
    perl_tokenizer_Create,
Packit 723767
    perl_tokenizer_Destroy,
Packit 723767
    perl_tokenizer_Open,
Packit 723767
    perl_tokenizer_Close,
Packit 723767
    perl_tokenizer_Next
Packit 723767
};
Packit 723767
Packit 723767
/*
Packit 723767
** Register the perl tokenizer with FTS3
Packit 723767
*/
Packit 723767
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
Packit 723767
{
Packit 723767
    D_imp_dbh(dbh);
Packit 723767
Packit 723767
    int rc;
Packit 723767
    sqlite3_stmt *pStmt;
Packit 723767
    const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
Packit 723767
    sqlite3_tokenizer_module *p = &perl_tokenizer_Module;
Packit 723767
Packit 723767
    if (!DBIc_ACTIVE(imp_dbh)) {
Packit 723767
        sqlite_error(dbh, -2, "attempt to register fts3 tokenizer on inactive database handle");
Packit 723767
        return FALSE;
Packit 723767
    }
Packit 723767
Packit 723767
#if SQLITE_VERSION_NUMBER >= 3012000
Packit 723767
    rc = sqlite3_db_config(imp_dbh->db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
Packit 723767
    if( rc!=SQLITE_OK ){
Packit 723767
        return rc;
Packit 723767
    }
Packit 723767
#endif
Packit 723767
Packit 723767
    rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
Packit 723767
    if( rc!=SQLITE_OK ){
Packit 723767
        return rc;
Packit 723767
    }
Packit 723767
Packit 723767
    sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
Packit 723767
    sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
Packit 723767
    sqlite3_step(pStmt);
Packit 723767
Packit 723767
    return sqlite3_finalize(pStmt);
Packit 723767
}