sqlite3使用fts3虚拟表支持全文搜索,默认支持simple和porter两种分词器,并提供接口自定义分词器。 这里用mmseg构建自定义中文分词器。
sqlite在fts3_tokenizer.h中提供了各种用于用户定制分词器的接口,但不提供用于注册用户定制分词器的c函数。 分词器的注册必须使用sql语句进行。
SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
其中 tokenizer-name 是分词器的名称，sqlite3_tokenizer_module ptr 是一个指向 sqlite3_tokenizer_module 结构的指针，以 SQL blob 的形式传入。以下是官方提供的注册函数。
int registerTokenizer (
sqlite3 *db数据库,
char *zName,
const sqlite3 _ tokenizer _ module * p
() )。
intrc;
sqlite3_stmt*pStmt;
const char * zsql=' select FTS3_ tokenizer? () );
RC=SQLite3_prepare_v2(db,zSql,-1,pStmt,0 );
if(RC!=SQLITE_OK ) {
返回RC;
}
SQLite3_bind_text(pstmt,1,zName,-1,SQLITE_STATIC );
SQLite3_bind_blob(pstmt,2,p,sizeof(p ),SQLITE_STATIC );
sqlite3_step(pstmt;
returnSQLite3_finalize(pstmt;
}
实现定制分词器最重要的是得到指向sqlite3_tokenizer_module结构的指针。 sqlite3_tokenizer_module结构定义如下:
struct sqlite3 _ tokenizer _ module {
int iVersion; //版本号,需要设定为0
int(*xcreate )//创建虚拟表时自动调用并创建分词器
intargc,const char*const*argv,sqlite3_tokenizer**ppTokenizer;
int(xdestroy ) ) sqlite3 _ tokenizer (p tokenizer ); //数据库连接关闭时自动调用并销毁资源
插入或搜索int(*xopen )//数据时自动调用并分开写
sqlite3_tokenizer*pTokenizer,const char*pInput,intnBytes,sqlite3 _ tokenizer _ cursor * * pcursor );
int(xclose ) ) sqlite3 _ tokenizer _ cursor * pcursor ); //分词结果提取结束后自动调用
int(*xnext ) )//将分词结果逐一提取
sqlite3 _ tokenizer _ cursor * pcursor、const char**ppToken、int *pnBytes,
int*piStartOffset,int*piEndOffset,int * pi位置;
(;
有几点需要注意：
1、分词器是通过 SQL 语句注册的，这意味着每次建立 sqlite 连接时都必须重新注册分词器；对于需要加载词典的中文分词器来说，这也意味着巨大的内存消耗。
2、检索时，分词结果的提取与查询表达式的语法解析是交替进行的。例如，检索 'kanif OR sqlite' 时，引擎首先把整个字符串传递给分词器，调用一次 xNext 取得词 kanif；然后再把 sqlite 传递给分词器，如此反复，直到全部解析完成。
3、由于汉语分词本身的特殊性，例如“北京市”很可能被当作一个完整的词，导致搜索“北京”时没有结果。如果分词器支持将“北京市”拆分为“北京市”和“北京”，或者将“11月”拆分为“11月”和“11”，则需要注意 xNext 函数返回的 piStartOffset 和 piEndOffset 这两个参数。经过测试，插入数据时这两个参数没有实际作用，但在查询时，这两个参数决定了后续传给分词器的输入：
附件:
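/* NOTE(review): the six header names on these #include lines were lost in
   extraction. The headers below are the ones the visible code needs
   (printf; strlen/memset/memcpy; the sqlite3_* API) - confirm against the
   original listing. */
#include <stdio.h>
#include <string.h>
#include <sqlite3.h>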
#include "fts3_tokenizer.h"
#include "mmseg/mmseg.cpp"
/* True until initmmseg() has loaded the dictionaries; cleared afterwards so
   that they are loaded only once per process. */
static bool loadDic = true;
typedef struct cus_tokenizer{
sql
ite3_tokenizer base;} cus_tokenizer;
/*
** Per-input cursor state for the mmseg-backed tokenizer.
*/
typedef struct cus_tokenizer_cursor cus_tokenizer_cursor;

struct cus_tokenizer_cursor {
  /* Base cursor; kept first because the code casts between
     sqlite3_tokenizer_cursor* and cus_tokenizer_cursor*. */
  sqlite3_tokenizer_cursor base;

  /* Only ever set to NULL by the visible code; freed in cusClose(). */
  char *pInput;

  /* Length in bytes of the input passed to cusOpen(). */
  int nBytes;

  /* Position assigned to the next token returned by cusNext(). */
  int iToken;

  /* Heap copy of the most recently returned token (NUL-terminated). */
  char *pToken;

  /* mmseg segmentation state created in cusOpen(). */
  rmmseg::Algorithm *pAlgor;
};
/*
** Load the mmseg dictionaries "chars.dic" and "words.dic" the first time
** this is called; later calls return immediately because loadDic has been
** cleared.
**
** NOTE(review): both files are loaded through mmseg_load_words(); confirm
** that this is intended for the character dictionary as well.
*/
void initmmseg(void){
  if( !loadDic ){
    return;
  }
  mmseg_load_words("chars.dic");
  mmseg_load_words("words.dic");
  loadDic = false;
}
static int cusCreate(
int argc, const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
cus_tokenizer *t;
t = (cus_tokenizer *) sqlite3_malloc(sizeof(*t));
if( t==NULL ) return SQLITE_NOMEM;
memset(t, 0, sizeof(*t));
initmmseg();
*ppTokenizer = &t->base;
return SQLITE_OK;
}
/*
** xDestroy callback: release the tokenizer object allocated by cusCreate().
** Always returns SQLITE_OK.
*/
static int cusDestroy(sqlite3_tokenizer *pTokenizer){
  sqlite3_free(pTokenizer);
  return SQLITE_OK;
}
static int cusOpen(
sqlite3_tokenizer*pTokenizer, const char *pInput, intnBytes, sqlite3_tokenizer_cursor**ppCursor ){
cus_tokenizer_cursor *c;
if(pInput == 0){
nBytes =0;
}else if(nBytes < 0)
nBytes = (int)strlen(pInput);
c = (cus_tokenizer_cursor *)sqlite3_malloc(sizeof(*c));
if(c == NULL)
return SQLITE_NOMEM;
c->iToken =c->nBytes = 0;
c->pInput = c->pToken =NULL;
c->pAlgor = mmseg_algor_create(pInput,nBytes);
c->nBytes = nBytes;
*ppCursor = &c->base;
return SQLITE_OK;
}
/*
** xClose callback: release everything owned by a cursor created by
** cusOpen(), then the cursor itself. Always returns SQLITE_OK.
*/
static int cusClose(sqlite3_tokenizer_cursor *pCursor){
  cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor;

  /* sqlite3_free() is a harmless no-op on NULL, so no guards are needed. */
  sqlite3_free(c->pInput);
  sqlite3_free(c->pToken);

  if( c->pAlgor!=NULL ){
    mmseg_algor_destroy(c->pAlgor);
  }

  sqlite3_free(c);
  return SQLITE_OK;
}
/*
** xNext callback: fetch the next token from mmseg.
**
** On SQLITE_OK the outputs are set as follows:
**   *ppToken        NUL-terminated heap copy of the token, owned by the
**                   cursor and released on the next call or in cusClose()
**   *pnBytes        token length as reported by mmseg
**   *piStartOffset  token.offset
**   *piEndOffset    token.offset + token.length
**   *piPosition     running token index, starting at 0
**
** Returns SQLITE_DONE when mmseg reports a zero-length token, or
** SQLITE_NOMEM if the token copy cannot be allocated.
*/
static int cusNext(
  sqlite3_tokenizer_cursor *pCursor,
  const char **ppToken, int *pnBytes,
  int *piStartOffset, int *piEndOffset, int *piPosition
){
  cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor;

  /* Drop the copy handed out by the previous call. */
  if( c->pToken!=NULL ){
    sqlite3_free(c->pToken);
    c->pToken = NULL;
  }

  struct Token token = mmseg_next_token(c->pAlgor);
  if( token.length!=0 ){
    int l = token.length;

    c->pToken = (char *)sqlite3_malloc(l+1);
    if( c->pToken==NULL ){
      return SQLITE_NOMEM;
    }
    c->pToken[l] = 0;
    memcpy(c->pToken, token.text, l);

    *ppToken = c->pToken;
    *pnBytes = l;
    *piStartOffset = token.offset;
    *piEndOffset = token.offset + token.length;
    *piPosition = c->iToken++;
    return SQLITE_OK;
  }

  /* Generally this point is only reached when inserting data. */
  return SQLITE_DONE;
}
/*
** Callback table for the mmseg-backed tokenizer; main() passes its address
** to registerTokenizer().
*/
static const sqlite3_tokenizer_module cusTokenizerModule = {
  0,            /* iVersion */
  cusCreate,    /* xCreate  */
  cusDestroy,   /* xDestroy */
  cusOpen,      /* xOpen    */
  cusClose,     /* xClose   */
  cusNext       /* xNext    */
};
/*
** Register the tokenizer implementation p under the name zName on the
** database connection db, by evaluating "SELECT fts3_tokenizer(?, ?)".
**
** Returns SQLITE_OK on success. Otherwise returns the SQLite error code
** from preparing the statement, binding a parameter, or finalizing the
** statement.
*/
int registerTokenizer(
  sqlite3 *db,
  char *zName,
  const sqlite3_tokenizer_module *p
){
  int rc;
  sqlite3_stmt *pStmt = NULL;
  const char *zSql = "SELECT fts3_tokenizer(?, ?)";

  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  rc = sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  if( rc==SQLITE_OK ){
    /* The blob carries the pointer value itself, hence &p and sizeof(p).
       p is a local, but it outlives the statement, which is finalized
       before this function returns. */
    rc = sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  }
  if( rc!=SQLITE_OK ){
    sqlite3_finalize(pStmt);
    return rc;
  }

  /* The step result is not checked here; the code returned by
     sqlite3_finalize() below is used as the function result. */
  sqlite3_step(pStmt);
  return sqlite3_finalize(pStmt);
}
int main(){
constsqlite3_tokenizer_module *ptr =&cusTokenizerModule;
sqlite3*pDB;
sqlite3_stmt* stmt;
char *errMsg = NULL;
const char*zTail;
int rc =sqlite3_open("test.sqlite3", &pDB);
if(rc){
printf("create error. %sn",sqlite3_errmsg(pDB));
return rc;
}
chartoken_name[] = "custoken";
registerTokenizer(pDB, token_name, ptr);
rc =sqlite3_exec(pDB, "CREATE VIRTUAL TABLE foo USINGfts3(tokenize=custoken)", 0, 0, &errMsg); if(rc !=SQLITE_OK){ printf("create virtual error, %sn", errMsg); if(rc !=SQLITE_OK){ printf("create virtual error, %sn", errMsg); return rc; } rc =sqlite3_exec(pDB, "INSERT INTO fooVALUES('xe5x8cx97xe4xbaxacxe5xb8x82')", 0, 0,&errMsg); if(rc !=SQLITE_OK){ printf("insert value error, %sn", errMsg); return rc; } int nrow =0, ncolumn = 0; char**azResult; //二维数组存放结果 sqlite3_get_table(pDB , "SELECT * FROM foo WHERE content MATCH'xe5x8cx97xe4xbaxacxe5xb8x82'" , &azResult, &nrow , &ncolumn ,&errMsg ); int i = 0; printf("row:%d column=%d n" , nrow , ncolumn ); printf("nThe result of querying is : n" ); for( i=0 ;i