Index: ext/misc/regexp.c ================================================================== --- ext/misc/regexp.c +++ ext/misc/regexp.c @@ -74,27 +74,27 @@ #define RE_EOF 0 /* End of input */ /* The NFA is implemented as sequence of opcodes taken from the following ** set. Each opcode has a single integer argument. */ -#define RE_OP_MATCH 1 /* Match the one character in the argument */ -#define RE_OP_ANY 2 /* Match any one character. (Implements ".") */ -#define RE_OP_ANYSTAR 3 /* Special optimized version of .* */ -#define RE_OP_FORK 4 /* Continue to both next and opcode at iArg */ -#define RE_OP_GOTO 5 /* Jump to opcode at iArg */ -#define RE_OP_ACCEPT 6 /* Halt and indicate a successful match */ -#define RE_OP_CC_INC 7 /* Beginning of a [...] character class */ -#define RE_OP_CC_EXC 8 /* Beginning of a [^...] character class */ -#define RE_OP_CC_VALUE 9 /* Single value in a character class */ -#define RE_OP_CC_RANGE 10 /* Range of values in a character class */ -#define RE_OP_WORD 11 /* Perl word character [A-Za-z0-9_] */ -#define RE_OP_NOTWORD 12 /* Not a perl word character */ -#define RE_OP_DIGIT 13 /* digit: [0-9] */ -#define RE_OP_NOTDIGIT 14 /* Not a digit */ -#define RE_OP_SPACE 15 /* space: [ \t\n\r\v\f] */ -#define RE_OP_NOTSPACE 16 /* Not a digit */ -#define RE_OP_BOUNDARY 17 /* Boundary between word and non-word */ +#define RE_OP_MATCH 0 /* Match the one character in the argument */ +#define RE_OP_ANY 1 /* Match any one character. (Implements ".") */ +#define RE_OP_CC_INC 2 /* Beginning of a [...] character class */ +#define RE_OP_CC_EXC 3 /* Beginning of a [^...] character class */ +#define RE_OP_CC_VALUE 4 /* Single value in a character class */ +#define RE_OP_CC_RANGE 5 /* Range of values in a character class */ +#define RE_OP_WORD 6 /* Perl word character [A-Za-z0-9_] */ +#define RE_OP_NOTWORD 7 /* Not a perl word character */ +#define RE_OP_DIGIT 8 /* digit: [0-9] */ +#define RE_OP_NOTDIGIT 9 /* Not a digit */ +#define RE_OP_SPACE 10 /* space: [ \t\n\r\v\f] */ +#define RE_OP_NOTSPACE 11 /* Not a digit */ +#define RE_IMMEDIATE 12 /* OPs >= this evaluate immediately */ +#define RE_OP_BOUNDARY 12 /* Boundary between word and non-word */ +#define RE_OP_FORK 13 /* Continue to both next and opcode at iArg */ +#define RE_OP_GOTO 14 /* Jump to opcode at iArg */ +#define RE_OP_ACCEPT 15 /* Halt and indicate a successful match */ /* Each opcode is a "state" in the NFA */ typedef unsigned short ReStateNumber; /* Because this is an NFA and not a DFA, multiple states can be active at @@ -116,30 +116,127 @@ int mx; /* EOF when i>=mx */ }; /* A compiled NFA (or an NFA that is in the process of being compiled) is ** an instance of the following object. +** +** The application must serialize access to this object. The code here +** is threadsafe, as long as each thread is using its own ReCompiled object. +** But a single ReCompiled object is neither threadsafe nor reentrant. +** +** The iBegin value is the earliest possible start of the match. The +** true start of the match might be later. */ typedef struct ReCompiled ReCompiled; struct ReCompiled { - ReInput sIn; /* Regular expression text */ + ReInput sIn; /* Regexp text, or match string text */ const char *zErr; /* Error message to return */ char *aOp; /* Operators for the virtual machine */ int *aArg; /* Arguments to each operator */ + ReStateSet a, b; /* Used by re_match() */ unsigned (*xNextChar)(ReInput*); /* Next character function */ + unsigned char nInit; /* Number of characters in zInit */ + unsigned char bFlexStart; /* Input regexp omits the initial "^" */ + unsigned char bKeepSpan; /* When matching, remember start and end */ + unsigned char bMultiBegin; /* iBegin might not be accurate */ + unsigned int iBegin; /* Offset to first character in the match */ + unsigned int iEnd; /* Offset past the end of last match char */ + unsigned int nState; /* Number of entries in aOp[] and aArg[] */ + unsigned int nAlloc; /* Slots allocated for aOp[] and aArg[] */ unsigned char zInit[12]; /* Initial text to match */ - int nInit; /* Number of characters in zInit */ - unsigned nState; /* Number of entries in aOp[] and aArg[] */ - unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */ }; +#if SQLITE_REGEXP_DEBUG +/************************************************************************ +** The following interfaces are used for development and debugging only +** and are commented out for deployment. +*/ +#include +#include + +/* Render one character. */ +static char *re_char(int c){ + static char zBuf[20]; + if( isprint(c) && (c==' ' || !isspace(c)) ){ + sqlite3_snprintf(sizeof(zBuf),zBuf,"'%c'",c); + }else{ + sqlite3_snprintf(sizeof(zBuf),zBuf,"0x%02x",c); + } + return zBuf; +} + +/* +** Print out a regexp program. +*/ +static void re_print(ReCompiled *pRe){ + int i; + for(i=0; inState; i++){ + printf("%3d ", i); + switch( pRe->aOp[i] ){ + case RE_OP_MATCH: printf("MATCH %s\n", re_char(pRe->aArg[i])); + break; + case RE_OP_ANY: printf("ANY\n"); break; + case RE_OP_WORD: printf("WORD\n"); break; + case RE_OP_NOTWORD: printf("NOTWORD\n"); break; + case RE_OP_DIGIT: printf("DIGIT\n"); break; + case RE_OP_NOTDIGIT: printf("NOTDIGIT\n"); break; + case RE_OP_SPACE: printf("SPACE\n"); break; + case RE_OP_NOTSPACE: printf("NOTSPACE\n"); break; + case RE_OP_BOUNDARY: printf("BOUNDARY\n"); break; + case RE_OP_FORK: printf("FORK %d\n", i+pRe->aArg[i]); break; + case RE_OP_GOTO: printf("GOTO %d\n", i+pRe->aArg[i]); break; + case RE_OP_ACCEPT: printf("ACCEPT\n"); break; + case RE_OP_CC_INC: + case RE_OP_CC_EXC: { + printf("%s\n", pRe->aOp[i]==RE_OP_CC_INC ? "CC_INC" : "CC_EXC"); + int j; + int n = pRe->aArg[i]; + for(j=1; j>0 && jaOp[i+j]==RE_OP_CC_VALUE ? "VALUE" : "RANGE", + re_char(pRe->aArg[i+j])); + } + i += n-1; + break; + } + } + } +} +/* +** Print out a ReStateSet +*/ +static void restateset_print(ReStateSet *pSet){ + int i; + for(i=0; inState; i++){ + if( i ) printf(" "); + printf(" %2d", pSet->aState[i]); + } +} +/* End of debugging utilities +*****************************************************************************/ +#endif /* SQLITE_REGEXP_DEBUG */ + /* Add a state to the given state set if it is not already there */ -static void re_add_state(ReStateSet *pSet, int newState){ +static void re_add_state_to_set(ReStateSet *pSet, int newState){ unsigned i; for(i=0; inState; i++) if( pSet->aState[i]==newState ) return; pSet->aState[pSet->nState++] = (ReStateNumber)newState; } + +/* Add a state to the appropriate state set. States for immediate +** evaluation are added into pRe->a. If another input token is required +** before the state can be evaluated, then it is added to pRe->b. +*/ +static void re_add_state(ReCompiled *pRe, int newState){ + ReStateSet *pSet; + if( pRe->aOp[newState]>=RE_IMMEDIATE ){ + pSet = &pRe->a; + }else{ + pSet = &pRe->b; + } + re_add_state_to_set(pSet, newState); +} /* Extract the next unicode character from *pzIn and return it. Advance ** *pzIn to the first byte past the end of the character returned. To ** be clear: this routine converts utf8 to unicode. This routine is ** optimized for the common case where the next character is a single byte. @@ -188,116 +285,106 @@ /* Return true if c is a perl "space" character: [ \t\r\n\v\f] */ static int re_space_char(int c){ return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f'; } + +/* Interchange pRe->a and pRe->b */ +static void re_swap(ReCompiled *pRe){ + ReStateSet t = pRe->a; + pRe->a = pRe->b; + pRe->b = t; +} /* Run a compiled regular expression on the zero-terminated input ** string zIn[]. Return true on a match and false if there is no match. */ static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){ - ReStateSet aStateSet[2], *pThis, *pNext; - ReStateNumber aSpace[100]; - ReStateNumber *pToFree; unsigned int i = 0; - unsigned int iSwap = 0; int c = RE_EOF+1; - int cPrev = 0; int rc = 0; - ReInput in; + int iIdx = -1; + int iBegin = -1; + int iReset = -1; + ReInput *pIn = &pRe->sIn; - in.z = zIn; - in.i = 0; - in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn); + pIn->z = zIn; + pIn->i = 0; + pIn->mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn); + pRe->bMultiBegin = 0; /* Look for the initial prefix match, if there is one. */ if( pRe->nInit ){ unsigned char x = pRe->zInit[0]; - while( in.i+pRe->nInit<=in.mx - && (zIn[in.i]!=x || - strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0) + while( pIn->i+pRe->nInit<=pIn->mx + && (zIn[pIn->i]!=x || + strncmp((const char*)zIn+pIn->i, + (const char*)pRe->zInit,pRe->nInit)!=0) ){ - in.i++; - } - if( in.i+pRe->nInit>in.mx ) return 0; - } - - if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){ - pToFree = 0; - aStateSet[0].aState = aSpace; - }else{ - pToFree = sqlite3_malloc( sizeof(ReStateNumber)*2*pRe->nState ); - if( pToFree==0 ) return -1; - aStateSet[0].aState = pToFree; - } - aStateSet[1].aState = &aStateSet[0].aState[pRe->nState]; - pNext = &aStateSet[1]; - pNext->nState = 0; - re_add_state(pNext, 0); - while( c!=RE_EOF && pNext->nState>0 ){ - cPrev = c; - c = pRe->xNextChar(&in); - pThis = pNext; - pNext = &aStateSet[iSwap]; - iSwap = 1 - iSwap; - pNext->nState = 0; - for(i=0; inState; i++){ - int x = pThis->aState[i]; + pIn->i++; + } + if( pIn->i+pRe->nInit>pIn->mx ) return 0; + } + +re_print(pRe); +printf("INPUT: [%.*s]\n", nIn, zIn); + pRe->b.nState = 0; + pRe->a.nState = 0; + re_add_state(pRe, 0); + iReset = -1; + c = 0; + while(1){ + for(i=0; ia.nState; i++){ + int x = pRe->a.aState[i]; +printf("%2d,%-5s %d.%d = ", pRe->sIn.i, re_char(c), i, x); +restateset_print(&pRe->a); +printf(" => "); +restateset_print(&pRe->b); +printf("\n"); switch( pRe->aOp[x] ){ case RE_OP_MATCH: { - if( pRe->aArg[x]==c ) re_add_state(pNext, x+1); - break; + if( pRe->aArg[x]!=c ) break; + /* Fall thru */ } case RE_OP_ANY: { - re_add_state(pNext, x+1); + add_next: + re_add_state(pRe, x+1); break; } case RE_OP_WORD: { - if( re_word_char(c) ) re_add_state(pNext, x+1); + if( re_word_char(c) ) goto add_next; break; } case RE_OP_NOTWORD: { - if( !re_word_char(c) ) re_add_state(pNext, x+1); + if( !re_word_char(c) ) goto add_next; break; } case RE_OP_DIGIT: { - if( re_digit_char(c) ) re_add_state(pNext, x+1); + if( re_digit_char(c) ) goto add_next; break; } case RE_OP_NOTDIGIT: { - if( !re_digit_char(c) ) re_add_state(pNext, x+1); + if( !re_digit_char(c) ) goto add_next; break; } case RE_OP_SPACE: { - if( re_space_char(c) ) re_add_state(pNext, x+1); + if( re_space_char(c) ) goto add_next; break; } case RE_OP_NOTSPACE: { - if( !re_space_char(c) ) re_add_state(pNext, x+1); + if( !re_space_char(c) ) goto add_next; break; } case RE_OP_BOUNDARY: { - if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1); - break; - } - case RE_OP_ANYSTAR: { - re_add_state(pNext, x); - re_add_state(pThis, x+1); - break; - } - case RE_OP_FORK: { - re_add_state(pThis, x+pRe->aArg[x]); - re_add_state(pThis, x+1); - break; - } - case RE_OP_GOTO: { - re_add_state(pThis, x+pRe->aArg[x]); - break; - } - case RE_OP_ACCEPT: { - rc = 1; - goto re_match_end; + int iCur = pIn->i; + int cNext = pRe->xNextChar(pIn); + pIn->i = iCur; + if( re_word_char(c)!=re_word_char(cNext) ){ + if( iReset>=0 ) goto add_next; + re_add_state_to_set(&pRe->b, x); + } + break; } case RE_OP_CC_INC: case RE_OP_CC_EXC: { int j = 1; int n = pRe->aArg[x]; @@ -316,21 +403,55 @@ j++; } } } if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit; - if( hit ) re_add_state(pNext, x+n); + if( hit ) re_add_state(pRe, x+n); break; } + case RE_OP_FORK: { + re_add_state(pRe, x+1); + /* Fall thru */ + } + case RE_OP_GOTO: { + re_add_state(pRe, x+pRe->aArg[x]); + break; + } + case RE_OP_ACCEPT: { + if( !pRe->bKeepSpan ) return 1; + pRe->iEnd = pIn->i; +printf("ACCEPT %d\n", pIn->i); + rc = 1; + break; + } + } /* End of switch( pRe->aOp[x] ) */ + } /* End of loop over states in pRe->a.aState[] */ + if( iReset<0 ){ + if( pRe->bFlexStart ){ + iReset = pRe->b.nState; + memcpy(pRe->a.aState, pRe->b.aState, iReset*sizeof(ReStateNumber)); + }else{ + iReset = 0; + } +printf("iReset=%d\n", iReset); + }else if( c==RE_EOF || pRe->b.nState==0 ){ + break; + }else if( pRe->b.nState>iReset ){ + if( iBegin<0 ){ + iBegin = iIdx; +printf("Begin at %d\n", iIdx); + }else{ + pRe->bMultiBegin = 1; +printf("Another possible beginning: %d\n", iIdx); } } - } - for(i=0; inState; i++){ - if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; } + re_swap(pRe); + pRe->b.nState = iReset; + iIdx = pIn->i; + c = pRe->xNextChar(pIn); } -re_match_end: - sqlite3_free(pToFree); + pRe->iBegin = iBegin; return rc; } /* Resize the opcode and argument arrays for an RE under construction. */ @@ -491,16 +612,11 @@ if( rePeek(p)!=')' ) return "unmatched '('"; p->sIn.i++; break; } case '.': { - if( rePeek(p)=='*' ){ - re_append(p, RE_OP_ANYSTAR, 0); - p->sIn.i++; - }else{ - re_append(p, RE_OP_ANY, 0); - } + re_append(p, RE_OP_ANY, 0); break; } case '*': { if( iPrev<0 ) return "'*' without operand"; re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1); @@ -519,16 +635,16 @@ } case '{': { int m = 0, n = 0; int sz, j; if( iPrev<0 ) return "'{m,n}' without operand"; - while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0'; p->sIn.i++; } + while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0';p->sIn.i++; } n = m; if( c==',' ){ p->sIn.i++; n = 0; - while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0'; p->sIn.i++; } + while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0';p->sIn.i++; } } if( c!='}' ) return "unmatched '{'"; if( n>0 && nsIn.i++; sz = p->nState - iPrev; @@ -612,10 +728,12 @@ */ void re_free(ReCompiled *pRe){ if( pRe ){ sqlite3_free(pRe->aOp); sqlite3_free(pRe->aArg); + sqlite3_free(pRe->a.aState); + sqlite3_free(pRe->b.aState); sqlite3_free(pRe); } } /* @@ -641,11 +759,11 @@ return "out of memory"; } if( zIn[0]=='^' ){ zIn++; }else{ - re_append(pRe, RE_OP_ANYSTAR, 0); + pRe->bFlexStart = 1; } pRe->sIn.z = (unsigned char*)zIn; pRe->sIn.i = 0; pRe->sIn.mx = (int)strlen(zIn); zErr = re_subcompile_re(pRe); @@ -662,21 +780,27 @@ *ppRe = pRe; }else{ re_free(pRe); return "unrecognized character"; } + pRe->a.aState = sqlite3_malloc64( sizeof(pRe->a.aState[0])*pRe->nState ); + pRe->b.aState = sqlite3_malloc64( sizeof(pRe->b.aState[0])*pRe->nState ); + if( pRe->a.aState==0 || pRe->b.aState==0 ){ + re_free(pRe); + return "out of memory"; + } /* The following is a performance optimization. If the regex begins with ** ".*" (if the input regex lacks an initial "^") and afterwards there are ** one or more matching characters, enter those matching characters into ** zInit[]. The re_match() routine can then search ahead in the input ** string looking for the initial match without having to run the whole ** regex engine over the string. Do not worry able trying to match ** unicode characters beyond plane 0 - those are very rare and this is ** just an optimization. */ - if( pRe->aOp[0]==RE_OP_ANYSTAR ){ - for(j=0, i=1; jzInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ + if( pRe->bFlexStart && !noCase ){ + for(j=0, i=0; jzInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ unsigned x = pRe->aArg[i]; if( x<=127 ){ pRe->zInit[j++] = (unsigned char)x; }else if( x<=0xfff ){ pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6)); @@ -737,10 +861,72 @@ } if( setAux ){ sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free); } } + +/* +** Implementation of the regexp_test1() SQL function. +** +** regexp_test1(PATTERN, STRING) +** +** Return the part of STRING that matches PATTERN. Or return NULL +** if there is no match. +*/ +static void re_test1_func( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + ReCompiled *pRe; /* Compiled regular expression */ + const char *zPattern; /* The regular expression */ + const unsigned char *zStr;/* String being searched */ + const char *zErr; /* Compile error message */ + int setAux = 0; /* True to invoke sqlite3_set_auxdata() */ + int rc; + + pRe = sqlite3_get_auxdata(context, 0); + if( pRe==0 ){ + zPattern = (const char*)sqlite3_value_text(argv[0]); + if( zPattern==0 ) return; + zErr = re_compile(&pRe, zPattern, 0); + if( zErr ){ + re_free(pRe); + sqlite3_result_error(context, zErr, -1); + return; + } + if( pRe==0 ){ + sqlite3_result_error_nomem(context); + return; + } + setAux = 1; + } + zStr = (const unsigned char*)sqlite3_value_text(argv[1]); + if( zStr!=0 ){ + int iBegin, iEnd; + pRe->bKeepSpan = 1; + rc = re_match(pRe, zStr, -1); + if( !rc ) goto re_test1_end; + iBegin = pRe->iBegin; + iEnd = pRe->iEnd; + if( pRe->bMultiBegin ){ + pRe->bKeepSpan = 0; + while( re_match(pRe, zStr+iBegin, iEnd-iBegin)==0 ){ + if( zStr[++iBegin]>=0xc0 ){ + while( (zStr[iBegin]&0xc0)==0x80 ) iBegin++; + } + if( iBegin>=iEnd ) goto re_test1_end; + } + } + sqlite3_result_text(context, (const char*)zStr+iBegin, + iEnd - iBegin, SQLITE_TRANSIENT); + } +re_test1_end: + if( setAux ){ + sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free); + } +} /* ** Invoke this routine to register the regexp() function with the ** SQLite database connection. */ @@ -754,7 +940,11 @@ ){ int rc = SQLITE_OK; SQLITE_EXTENSION_INIT2(pApi); rc = sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0, re_sql_func, 0, 0); + if( rc==SQLITE_OK ){ + rc = sqlite3_create_function(db, "re_test1", 2, SQLITE_UTF8, 0, + re_test1_func, 0, 0); + } return rc; }