typedef struct perl_tokenizer {
    sqlite3_tokenizer base;
    SV *coderef;                 /* the perl tokenizer is a coderef that takes
                                    a string and returns a cursor coderef */
} perl_tokenizer;

typedef struct perl_tokenizer_cursor {
    sqlite3_tokenizer_cursor base;
    SV *coderef;                 /* ref to the closure that returns terms */
    char *pToken;                /* storage for a copy of the last token */
    int nTokenAllocated;         /* space allocated to pToken buffer */

    /* members below are only used if the input string is in utf8 */
    const char *pInput;          /* input we are tokenizing */
    const char *lastByteOffset;  /* offset into pInput */
    int lastCharOffset;          /* char offset corresponding to lastByteOffset */
} perl_tokenizer_cursor;

/*
** Create a new tokenizer instance.
** Will be called whenever a FTS3 table is created with
**   CREATE .. USING fts3( ... , tokenize=perl qualified::function::name)
** where qualified::function::name is a fully qualified perl function
*/
static int perl_tokenizer_Create(
    int argc, const char * const *argv,
    sqlite3_tokenizer **ppTokenizer
){
    dTHX;
    dSP;
    int n_retval;
    SV *retval;
    perl_tokenizer *t;

    if (!argc) {
        return SQLITE_ERROR;
    }

    t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t));
    if( t==NULL ) return SQLITE_NOMEM;
    memset(t, 0, sizeof(*t));

    ENTER;
    SAVETMPS;

    /* call the qualified::function::name */
    PUSHMARK(SP);
    PUTBACK;
    n_retval = call_pv(argv[0], G_SCALAR);
    SPAGAIN;

    /* store a copy of the returned coderef into the tokenizer structure */
    if (n_retval != 1) {
        warn("tokenizer_Create returned %d arguments", n_retval);
    }
    retval = POPs;
    t->coderef = newSVsv(retval);
    *ppTokenizer = &t->base;

    PUTBACK;
    FREETMPS;
    LEAVE;

    return SQLITE_OK;
}

/*
** Destroy a tokenizer
*/
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
    dTHX;
    perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
    sv_free(t->coderef);
    sqlite3_free(t);
    return SQLITE_OK;
}
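/*
** For illustration only (not compiled): the Perl side of the protocol
** implemented below. perl_tokenizer_Create() calls the registered function
** with no arguments and expects a tokenizer coderef; perl_tokenizer_Open()
** calls that coderef with the string to tokenize and expects a cursor
** coderef; perl_tokenizer_Next() calls the cursor and expects either an
** empty list (no more tokens) or a 5-element list
** (term, length, start_offset, end_offset, position).
** A minimal sketch of such a tokenizer -- the name "my_tokenizer" and the
** \w+ pattern are hypothetical, not mandated by this file -- might be:
**
**   sub my_tokenizer {
**       return sub {                     # tokenizer: string -> cursor
**           my $string = shift;
**           my $pos    = 0;
**           return sub {                 # cursor: () -> next token, or ()
**               $string =~ /\w+/g or return;
**               my ($start, $end) = ($-[0], $+[0]);
**               my $term = substr($string, $start, $end - $start);
**               return ($term, length($term), $start, $end, $pos++);
**           };
**       };
**   }
*/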
/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is supposed to be pInput[0..nBytes-1] ..
** except that nBytes passed by fts3 is -1 (don't know why) !
** This is passed to the tokenizer instance, which then returns a
** closure implementing the cursor (so the cursor is again a coderef).
*/
static int perl_tokenizer_Open(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
){
    dTHX;
    dSP;
    dMY_CXT;
    U32 flags;
    SV *perl_string;
    int n_retval;

    perl_tokenizer *t = (perl_tokenizer *)pTokenizer;

    /* allocate and initialize the cursor struct */
    perl_tokenizer_cursor *c;
    c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
    if( c==NULL ) return SQLITE_NOMEM;
    memset(c, 0, sizeof(*c));
    *ppCursor = &c->base;

    /* flags for creating the Perl SV containing the input string */
    flags = SVs_TEMP; /* will call sv_2mortal */

    /* special handling if working with utf8 strings */
    if (MY_CXT.last_dbh_is_unicode) {

        /* data to keep track of byte offsets */
        c->lastByteOffset = c->pInput = pInput;
        c->lastCharOffset = 0;

        /* string passed to Perl needs to be flagged as utf8 */
        flags |= SVf_UTF8;
    }

    ENTER;
    SAVETMPS;

    /* build a Perl copy of the input string */
    if (nBytes < 0) { /* we get -1 from fts3. Don't know why ! */
        nBytes = strlen(pInput);
    }
    perl_string = newSVpvn_flags(pInput, nBytes, flags);

    /* call the tokenizer coderef */
    PUSHMARK(SP);
    XPUSHs(perl_string);
    PUTBACK;
    n_retval = call_sv(t->coderef, G_SCALAR);
    SPAGAIN;

    /* store the cursor coderef returned by the tokenizer */
    if (n_retval != 1) {
        warn("tokenizer returned %d arguments", n_retval);
    }
    c->coderef = newSVsv(POPs);

    PUTBACK;
    FREETMPS;
    LEAVE;

    return SQLITE_OK;
}

/*
** Close a tokenization cursor previously opened by a call to
** perl_tokenizer_Open() above.
*/
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;

    dTHX;
    sv_free(c->coderef);
    if (c->pToken) sqlite3_free(c->pToken);
    sqlite3_free(c);

    return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor.  The cursor must
** have been opened by a prior call to perl_tokenizer_Open().
*/
static int perl_tokenizer_Next(
    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
    const char **ppToken,               /* OUT: *ppToken is the token text */
    int *pnBytes,                       /* OUT: Number of bytes in token */
    int *piStartOffset,                 /* OUT: Starting offset of token */
    int *piEndOffset,                   /* OUT: Ending offset of token */
    int *piPosition                     /* OUT: Position integer of token */
){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
    int result;
    int n_retval;
    char *token;
    char *byteOffset;
    STRLEN n_a; /* this is required for older perls < 5.8.8 */
    I32 hop;

    dTHX;
    dSP;

    ENTER;
    SAVETMPS;

    /* call the cursor */
    PUSHMARK(SP);
    PUTBACK;
    n_retval = call_sv(c->coderef, G_ARRAY);
    SPAGAIN;

    /* if we get back an empty list, there are no more tokens */
    if (n_retval == 0) {
        result = SQLITE_DONE;
    }
    /* otherwise, get token details from the return list */
    else {
        if (n_retval != 5) {
            warn("tokenizer cursor returned %d arguments", n_retval);
        }
        *piPosition    = POPi;
        *piEndOffset   = POPi;
        *piStartOffset = POPi;
        *pnBytes       = POPi;
        token          = POPpx;

        if (c->pInput) { /* if working with utf8 data */

            /* recompute *pnBytes in bytes, not in chars */
            *pnBytes = strlen(token);

            /* recompute start/end offsets in bytes, not in chars */
            hop            = *piStartOffset - c->lastCharOffset;
            byteOffset     = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
            hop            = *piEndOffset - *piStartOffset;
            *piStartOffset = byteOffset - c->pInput;
            byteOffset     = (char*)utf8_hop((U8*)byteOffset, hop);
            *piEndOffset   = byteOffset - c->pInput;

            /* remember where we are for next round */
            c->lastCharOffset = *piEndOffset;
            c->lastByteOffset = byteOffset;
        }

        /* make sure we have enough storage for copying the token */
        if (*pnBytes > c->nTokenAllocated) {
            char *pNew;
            c->nTokenAllocated = *pnBytes + 20;
            pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
            if( !pNew ) return SQLITE_NOMEM;
            c->pToken = pNew;
        }

        /* need to copy the token into the C cursor before perl frees that
           memory */
        memcpy(c->pToken, token, *pnBytes);
        *ppToken = c->pToken;

        result = SQLITE_OK;
    }

    PUTBACK;
    FREETMPS;
    LEAVE;

    return result;
}
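/*
** Illustration only (hypothetical values, not part of the compiled code):
** with a \w+-style cursor over the string "Hello world", successive calls
** to the cursor would return roughly
**     ("Hello", 5, 0,  5, 0)
**     ("world", 5, 6, 11, 1)
** and then an empty list, which perl_tokenizer_Next() maps to SQLITE_DONE.
** For utf8 input such as "caf\xc3\xa9 noir" ("café noir"), the Perl side
** reports character offsets, so perl_tokenizer_Next() converts them into
** byte offsets with utf8_hop(): the second term starts at character 5 but
** at byte 6, because "é" occupies two bytes.
*/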
/*
** The set of routines that implement the perl tokenizer
*/
sqlite3_tokenizer_module perl_tokenizer_Module = {
    0,
    perl_tokenizer_Create,
    perl_tokenizer_Destroy,
    perl_tokenizer_Open,
    perl_tokenizer_Close,
    perl_tokenizer_Next
};

/*
** Register the perl tokenizer with FTS3
*/
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
{
    D_imp_dbh(dbh);
    int rc;
    sqlite3_stmt *pStmt;
    const char zSql[] = "SELECT fts3_tokenizer(?, ?)";

    sqlite3_tokenizer_module *p = &perl_tokenizer_Module;

    if (!DBIc_ACTIVE(imp_dbh)) {
        sqlite_error(dbh, -2, "attempt to register fts3 tokenizer on inactive database handle");
        return FALSE;
    }

#if SQLITE_VERSION_NUMBER >= 3012000
    /* recent SQLite disables the two-argument form of fts3_tokenizer() by
       default; re-enable it so the SELECT below can register the module */
    rc = sqlite3_db_config(imp_dbh->db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
    if( rc!=SQLITE_OK ){
        return rc;
    }
#endif

    rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
    if( rc!=SQLITE_OK ){
        return rc;
    }

    sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
    sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
    sqlite3_step(pStmt);

    return sqlite3_finalize(pStmt);
}
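/*
** Typical usage from the Perl side, once the tokenizer module has been
** registered for the connection (illustration only; "main::my_tokenizer"
** refers to the hypothetical function sketched above, and the table and
** column names are made up):
**
**   $dbh->do(q{
**       CREATE VIRTUAL TABLE articles
**           USING fts3(content, tokenize=perl 'main::my_tokenizer')
**   });
*/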