Index: ext/misc/regexp.c
==================================================================
--- ext/misc/regexp.c
+++ ext/misc/regexp.c
@@ -74,27 +74,27 @@
 #define RE_EOF            0    /* End of input */
 
 /* The NFA is implemented as sequence of opcodes taken from the following
 ** set.  Each opcode has a single integer argument.
 */
-#define RE_OP_MATCH       1    /* Match the one character in the argument */
-#define RE_OP_ANY         2    /* Match any one character.  (Implements ".") */
-#define RE_OP_ANYSTAR     3    /* Special optimized version of .* */
-#define RE_OP_FORK        4    /* Continue to both next and opcode at iArg */
-#define RE_OP_GOTO        5    /* Jump to opcode at iArg */
-#define RE_OP_ACCEPT      6    /* Halt and indicate a successful match */
-#define RE_OP_CC_INC      7    /* Beginning of a [...] character class */
-#define RE_OP_CC_EXC      8    /* Beginning of a [^...] character class */
-#define RE_OP_CC_VALUE    9    /* Single value in a character class */
-#define RE_OP_CC_RANGE   10    /* Range of values in a character class */
-#define RE_OP_WORD       11    /* Perl word character [A-Za-z0-9_] */
-#define RE_OP_NOTWORD    12    /* Not a perl word character */
-#define RE_OP_DIGIT      13    /* digit:  [0-9] */
-#define RE_OP_NOTDIGIT   14    /* Not a digit */
-#define RE_OP_SPACE      15    /* space:  [ \t\n\r\v\f] */
-#define RE_OP_NOTSPACE   16    /* Not a digit */
-#define RE_OP_BOUNDARY   17    /* Boundary between word and non-word */
+#define RE_OP_MATCH       0    /* Match the one character in the argument */
+#define RE_OP_ANY         1    /* Match any one character.  (Implements ".") */
+#define RE_OP_CC_INC      2    /* Beginning of a [...] character class */
+#define RE_OP_CC_EXC      3    /* Beginning of a [^...] character class */
+#define RE_OP_CC_VALUE    4    /* Single value in a character class */
+#define RE_OP_CC_RANGE    5    /* Range of values in a character class */
+#define RE_OP_WORD        6    /* Perl word character [A-Za-z0-9_] */
+#define RE_OP_NOTWORD     7    /* Not a perl word character */
+#define RE_OP_DIGIT       8    /* digit:  [0-9] */
+#define RE_OP_NOTDIGIT    9    /* Not a digit */
+#define RE_OP_SPACE      10    /* space:  [ \t\n\r\v\f] */
+#define RE_OP_NOTSPACE   11    /* Not a digit */
+#define RE_IMMEDIATE     12    /* OPs >= this evaluate immediately */
+#define RE_OP_BOUNDARY   12    /* Boundary between word and non-word */
+#define RE_OP_FORK       13    /* Continue to both next and opcode at iArg */
+#define RE_OP_GOTO       14    /* Jump to opcode at iArg */
+#define RE_OP_ACCEPT     15    /* Halt and indicate a successful match */
 
 /* Each opcode is a "state" in the NFA */
 typedef unsigned short ReStateNumber;
 
 /* Because this is an NFA and not a DFA, multiple states can be active at
@@ -116,30 +116,127 @@
   int mx;                  /* EOF when i>=mx */
 };
 
 /* A compiled NFA (or an NFA that is in the process of being compiled) is
 ** an instance of the following object.
+**
+** The application must serialize access to this object.  The code here
+** is threadsafe, as long as each thread is using its own ReCompiled object.
+** But a single ReCompiled object is neither threadsafe nor reentrant.
+**
+** The iBegin value is the earliest possible start of the match.  The
+** true start of the match might be later.
 */
 typedef struct ReCompiled ReCompiled;
 struct ReCompiled {
-  ReInput sIn;                /* Regular expression text */
+  ReInput sIn;                /* Regexp text, or match string text */
   const char *zErr;           /* Error message to return */
   char *aOp;                  /* Operators for the virtual machine */
   int *aArg;                  /* Arguments to each operator */
+  ReStateSet a, b;                  /* Used by re_match() */
   unsigned (*xNextChar)(ReInput*);  /* Next character function */
+  unsigned char nInit;              /* Number of characters in zInit */
+  unsigned char bFlexStart;   /* Input regexp omits the initial "^" */
+  unsigned char bKeepSpan;    /* When matching, remember start and end */
+  unsigned char bMultiBegin;  /* iBegin might not be accurate */
+  unsigned int iBegin;        /* Offset to first character in the match */
+  unsigned int iEnd;          /* Offset past the end of last match char */
+  unsigned int nState;        /* Number of entries in aOp[] and aArg[] */
+  unsigned int nAlloc;        /* Slots allocated for aOp[] and aArg[] */
   unsigned char zInit[12];    /* Initial text to match */
-  int nInit;                  /* Number of characters in zInit */
-  unsigned nState;            /* Number of entries in aOp[] and aArg[] */
-  unsigned nAlloc;            /* Slots allocated for aOp[] and aArg[] */
 };
 
+#if SQLITE_REGEXP_DEBUG
+/************************************************************************
+** The following interfaces are used for development and debugging only
+** and are commented out for deployment.
+*/
+#include <stdio.h>
+#include <ctype.h>
+
+/* Render one character. */
+static char *re_char(int c){
+  static char zBuf[20];
+  if( isprint(c) && (c==' ' || !isspace(c)) ){
+    sqlite3_snprintf(sizeof(zBuf),zBuf,"'%c'",c);
+  }else{
+    sqlite3_snprintf(sizeof(zBuf),zBuf,"0x%02x",c);
+  }
+  return zBuf;
+}
+
+/*
+** Print out a regexp program.
+*/
+static void re_print(ReCompiled *pRe){
+  int i;
+  for(i=0; i<pRe->nState; i++){
+    printf("%3d ", i);
+    switch( pRe->aOp[i] ){
+      case RE_OP_MATCH:    printf("MATCH     %s\n", re_char(pRe->aArg[i]));
+                           break;
+      case RE_OP_ANY:      printf("ANY\n"); break;
+      case RE_OP_WORD:     printf("WORD\n"); break;
+      case RE_OP_NOTWORD:  printf("NOTWORD\n"); break;
+      case RE_OP_DIGIT:    printf("DIGIT\n"); break;
+      case RE_OP_NOTDIGIT: printf("NOTDIGIT\n"); break;
+      case RE_OP_SPACE:    printf("SPACE\n"); break;
+      case RE_OP_NOTSPACE: printf("NOTSPACE\n"); break;
+      case RE_OP_BOUNDARY: printf("BOUNDARY\n"); break;
+      case RE_OP_FORK:     printf("FORK      %d\n", i+pRe->aArg[i]); break;
+      case RE_OP_GOTO:     printf("GOTO      %d\n", i+pRe->aArg[i]); break;
+      case RE_OP_ACCEPT:   printf("ACCEPT\n"); break;
+      case RE_OP_CC_INC: 
+      case RE_OP_CC_EXC: {
+        printf("%s\n", pRe->aOp[i]==RE_OP_CC_INC ? "CC_INC" : "CC_EXC");
+        int j;
+        int n = pRe->aArg[i];
+        for(j=1; j>0 && j<n; j++){
+          printf("%3d ... %s %s\n", i+j,
+           pRe->aOp[i+j]==RE_OP_CC_VALUE ? "VALUE" : "RANGE",
+           re_char(pRe->aArg[i+j]));
+        }
+        i += n-1;
+        break;
+      }
+    }
+  }
+}
+/*
+** Print out a ReStateSet
+*/
+static void restateset_print(ReStateSet *pSet){
+  int i;
+  for(i=0; i<pSet->nState; i++){
+    if( i ) printf(" ");
+    printf(" %2d", pSet->aState[i]);
+  }
+}
+/* End of debugging utilities
+*****************************************************************************/
+#endif /* SQLITE_REGEXP_DEBUG */
+
 /* Add a state to the given state set if it is not already there */
-static void re_add_state(ReStateSet *pSet, int newState){
+static void re_add_state_to_set(ReStateSet *pSet, int newState){
   unsigned i;
   for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
   pSet->aState[pSet->nState++] = (ReStateNumber)newState;
 }
+
+/* Add a state to the appropriate state set.  States for immediate
+** evaluation are added into pRe->a.  If another input token is required
+** before the state can be evaluated, then it is added to pRe->b.
+*/
+static void re_add_state(ReCompiled *pRe, int newState){
+  ReStateSet *pSet;
+  if( pRe->aOp[newState]>=RE_IMMEDIATE ){
+    pSet = &pRe->a;
+  }else{
+    pSet = &pRe->b;
+  }
+  re_add_state_to_set(pSet, newState);
+}
 
 /* Extract the next unicode character from *pzIn and return it.  Advance
 ** *pzIn to the first byte past the end of the character returned.  To
 ** be clear:  this routine converts utf8 to unicode.  This routine is 
 ** optimized for the common case where the next character is a single byte.
@@ -188,116 +285,106 @@
 
 /* Return true if c is a perl "space" character:  [ \t\r\n\v\f] */
 static int re_space_char(int c){
   return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
 }
+
+/* Interchange pRe->a and pRe->b */
+static void re_swap(ReCompiled *pRe){
+  ReStateSet t = pRe->a;
+  pRe->a = pRe->b;
+  pRe->b = t;
+}
 
 /* Run a compiled regular expression on the zero-terminated input
 ** string zIn[].  Return true on a match and false if there is no match.
 */
 static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
-  ReStateSet aStateSet[2], *pThis, *pNext;
-  ReStateNumber aSpace[100];
-  ReStateNumber *pToFree;
   unsigned int i = 0;
-  unsigned int iSwap = 0;
   int c = RE_EOF+1;
-  int cPrev = 0;
   int rc = 0;
-  ReInput in;
+  int iIdx = -1;
+  int iBegin = -1;
+  int iReset = -1;
+  ReInput *pIn = &pRe->sIn;
 
-  in.z = zIn;
-  in.i = 0;
-  in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
+  pIn->z = zIn;
+  pIn->i = 0;
+  pIn->mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
+  pRe->bMultiBegin = 0;
 
   /* Look for the initial prefix match, if there is one. */
   if( pRe->nInit ){
     unsigned char x = pRe->zInit[0];
-    while( in.i+pRe->nInit<=in.mx 
-     && (zIn[in.i]!=x ||
-         strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0)
+    while( pIn->i+pRe->nInit<=pIn->mx 
+     && (zIn[pIn->i]!=x ||
+         strncmp((const char*)zIn+pIn->i,
+                 (const char*)pRe->zInit,pRe->nInit)!=0)
     ){
-      in.i++;
-    }
-    if( in.i+pRe->nInit>in.mx ) return 0;
-  }
-
-  if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
-    pToFree = 0;
-    aStateSet[0].aState = aSpace;
-  }else{
-    pToFree = sqlite3_malloc( sizeof(ReStateNumber)*2*pRe->nState );
-    if( pToFree==0 ) return -1;
-    aStateSet[0].aState = pToFree;
-  }
-  aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];
-  pNext = &aStateSet[1];
-  pNext->nState = 0;
-  re_add_state(pNext, 0);
-  while( c!=RE_EOF && pNext->nState>0 ){
-    cPrev = c;
-    c = pRe->xNextChar(&in);
-    pThis = pNext;
-    pNext = &aStateSet[iSwap];
-    iSwap = 1 - iSwap;
-    pNext->nState = 0;
-    for(i=0; i<pThis->nState; i++){
-      int x = pThis->aState[i];
+      pIn->i++;
+    }
+    if( pIn->i+pRe->nInit>pIn->mx ) return 0;
+  }
+
+re_print(pRe);
+printf("INPUT: [%.*s]\n", nIn, zIn);
+  pRe->b.nState = 0;
+  pRe->a.nState = 0;
+  re_add_state(pRe, 0);
+  iReset = -1;
+  c = 0;
+  while(1){
+    for(i=0; i<pRe->a.nState; i++){
+      int x = pRe->a.aState[i];
+printf("%2d,%-5s %d.%d = ", pRe->sIn.i, re_char(c), i, x);
+restateset_print(&pRe->a);
+printf("  => ");
+restateset_print(&pRe->b);
+printf("\n");
       switch( pRe->aOp[x] ){
         case RE_OP_MATCH: {
-          if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);
-          break;
+          if( pRe->aArg[x]!=c ) break;
+          /* Fall thru */
         }
         case RE_OP_ANY: {
-          re_add_state(pNext, x+1);
+        add_next:
+          re_add_state(pRe, x+1);
           break;
         }
         case RE_OP_WORD: {
-          if( re_word_char(c) ) re_add_state(pNext, x+1);
+          if( re_word_char(c) ) goto add_next;
           break;
         }
         case RE_OP_NOTWORD: {
-          if( !re_word_char(c) ) re_add_state(pNext, x+1);
+          if( !re_word_char(c) ) goto add_next;
           break;
         }
         case RE_OP_DIGIT: {
-          if( re_digit_char(c) ) re_add_state(pNext, x+1);
+          if( re_digit_char(c) ) goto add_next;
           break;
         }
         case RE_OP_NOTDIGIT: {
-          if( !re_digit_char(c) ) re_add_state(pNext, x+1);
+          if( !re_digit_char(c) ) goto add_next;
           break;
         }
         case RE_OP_SPACE: {
-          if( re_space_char(c) ) re_add_state(pNext, x+1);
+          if( re_space_char(c) ) goto add_next;
           break;
         }
         case RE_OP_NOTSPACE: {
-          if( !re_space_char(c) ) re_add_state(pNext, x+1);
+          if( !re_space_char(c) ) goto add_next;
           break;
         }
         case RE_OP_BOUNDARY: {
-          if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);
-          break;
-        }
-        case RE_OP_ANYSTAR: {
-          re_add_state(pNext, x);
-          re_add_state(pThis, x+1);
-          break;
-        }
-        case RE_OP_FORK: {
-          re_add_state(pThis, x+pRe->aArg[x]);
-          re_add_state(pThis, x+1);
-          break;
-        }
-        case RE_OP_GOTO: {
-          re_add_state(pThis, x+pRe->aArg[x]);
-          break;
-        }
-        case RE_OP_ACCEPT: {
-          rc = 1;
-          goto re_match_end;
+          int iCur = pIn->i;
+          int cNext = pRe->xNextChar(pIn);
+          pIn->i = iCur;
+          if( re_word_char(c)!=re_word_char(cNext) ){
+            if( iReset>=0 ) goto add_next;
+            re_add_state_to_set(&pRe->b, x);
+          }
+          break;
         }
         case RE_OP_CC_INC:
         case RE_OP_CC_EXC: {
           int j = 1;
           int n = pRe->aArg[x];
@@ -316,21 +403,55 @@
                 j++;
               }
             }
           }
           if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;
-          if( hit ) re_add_state(pNext, x+n);
+          if( hit ) re_add_state(pRe, x+n);
           break;            
         }
+        case RE_OP_FORK: {
+          re_add_state(pRe, x+1);
+          /* Fall thru */
+        }
+        case RE_OP_GOTO: {
+          re_add_state(pRe, x+pRe->aArg[x]);
+          break;
+        }
+        case RE_OP_ACCEPT: {
+          if( !pRe->bKeepSpan ) return 1;
+          pRe->iEnd = pIn->i;
+printf("ACCEPT %d\n", pIn->i);
+          rc = 1;
+          break;
+        }
+      } /* End of switch( pRe->aOp[x] ) */
+    } /* End of loop over states in pRe->a.aState[] */
+    if( iReset<0 ){
+      if( pRe->bFlexStart ){
+        iReset = pRe->b.nState;
+        memcpy(pRe->a.aState, pRe->b.aState, iReset*sizeof(ReStateNumber));
+      }else{
+        iReset = 0;
+      }
+printf("iReset=%d\n", iReset);
+    }else if( c==RE_EOF || pRe->b.nState==0 ){
+      break;
+    }else if( pRe->b.nState>iReset ){
+      if( iBegin<0 ){
+        iBegin = iIdx;
+printf("Begin at %d\n", iIdx);
+      }else{
+        pRe->bMultiBegin = 1;
+printf("Another possible beginning: %d\n", iIdx);
       }
     }
-  }
-  for(i=0; i<pNext->nState; i++){
-    if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; }
+    re_swap(pRe);
+    pRe->b.nState = iReset;
+    iIdx = pIn->i;
+    c = pRe->xNextChar(pIn);
   }
-re_match_end:
-  sqlite3_free(pToFree);
+  pRe->iBegin = iBegin;
   return rc;
 }
 
 /* Resize the opcode and argument arrays for an RE under construction.
 */
@@ -491,16 +612,11 @@
         if( rePeek(p)!=')' ) return "unmatched '('";
         p->sIn.i++;
         break;
       }
       case '.': {
-        if( rePeek(p)=='*' ){
-          re_append(p, RE_OP_ANYSTAR, 0);
-          p->sIn.i++;
-        }else{ 
-          re_append(p, RE_OP_ANY, 0);
-        }
+        re_append(p, RE_OP_ANY, 0);
         break;
       }
       case '*': {
         if( iPrev<0 ) return "'*' without operand";
         re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);
@@ -519,16 +635,16 @@
       }
       case '{': {
         int m = 0, n = 0;
         int sz, j;
         if( iPrev<0 ) return "'{m,n}' without operand";
-        while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0'; p->sIn.i++; }
+        while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0';p->sIn.i++; }
         n = m;
         if( c==',' ){
           p->sIn.i++;
           n = 0;
-          while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0'; p->sIn.i++; }
+          while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0';p->sIn.i++; }
         }
         if( c!='}' ) return "unmatched '{'";
         if( n>0 && n<m ) return "n less than m in '{m,n}'";
         p->sIn.i++;
         sz = p->nState - iPrev;
@@ -612,10 +728,12 @@
 */
 void re_free(ReCompiled *pRe){
   if( pRe ){
     sqlite3_free(pRe->aOp);
     sqlite3_free(pRe->aArg);
+    sqlite3_free(pRe->a.aState);
+    sqlite3_free(pRe->b.aState);
     sqlite3_free(pRe);
   }
 }
 
 /*
@@ -641,11 +759,11 @@
     return "out of memory";
   }
   if( zIn[0]=='^' ){
     zIn++;
   }else{
-    re_append(pRe, RE_OP_ANYSTAR, 0);
+    pRe->bFlexStart = 1;
   }
   pRe->sIn.z = (unsigned char*)zIn;
   pRe->sIn.i = 0;
   pRe->sIn.mx = (int)strlen(zIn);
   zErr = re_subcompile_re(pRe);
@@ -662,21 +780,27 @@
     *ppRe = pRe;
   }else{
     re_free(pRe);
     return "unrecognized character";
   }
+  pRe->a.aState = sqlite3_malloc64( sizeof(pRe->a.aState[0])*pRe->nState );
+  pRe->b.aState = sqlite3_malloc64( sizeof(pRe->b.aState[0])*pRe->nState );
+  if( pRe->a.aState==0 || pRe->b.aState==0 ){
+    re_free(pRe);
+    return "out of memory";
+  }
 
   /* The following is a performance optimization.  If the regex begins with
   ** ".*" (if the input regex lacks an initial "^") and afterwards there are
   ** one or more matching characters, enter those matching characters into
   ** zInit[].  The re_match() routine can then search ahead in the input 
   ** string looking for the initial match without having to run the whole
   ** regex engine over the string.  Do not worry able trying to match
   ** unicode characters beyond plane 0 - those are very rare and this is
   ** just an optimization. */
-  if( pRe->aOp[0]==RE_OP_ANYSTAR ){
-    for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
+  if( pRe->bFlexStart && !noCase ){
+    for(j=0, i=0; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
       unsigned x = pRe->aArg[i];
       if( x<=127 ){
         pRe->zInit[j++] = (unsigned char)x;
       }else if( x<=0xfff ){
         pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
@@ -737,10 +861,72 @@
   }
   if( setAux ){
     sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
   }
 }
+
+/*
+** Implementation of the regexp_test1() SQL function.
+**
+**         regexp_test1(PATTERN, STRING)
+**
+** Return the part of STRING that matches PATTERN.  Or return NULL
+** if there is no match.
+*/
+static void re_test1_func(
+  sqlite3_context *context, 
+  int argc, 
+  sqlite3_value **argv
+){
+  ReCompiled *pRe;          /* Compiled regular expression */
+  const char *zPattern;     /* The regular expression */
+  const unsigned char *zStr;/* String being searched */
+  const char *zErr;         /* Compile error message */
+  int setAux = 0;           /* True to invoke sqlite3_set_auxdata() */
+  int rc;
+
+  pRe = sqlite3_get_auxdata(context, 0);
+  if( pRe==0 ){
+    zPattern = (const char*)sqlite3_value_text(argv[0]);
+    if( zPattern==0 ) return;
+    zErr = re_compile(&pRe, zPattern, 0);
+    if( zErr ){
+      re_free(pRe);
+      sqlite3_result_error(context, zErr, -1);
+      return;
+    }
+    if( pRe==0 ){
+      sqlite3_result_error_nomem(context);
+      return;
+    }
+    setAux = 1;
+  }
+  zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
+  if( zStr!=0 ){
+    int iBegin, iEnd;
+    pRe->bKeepSpan = 1;
+    rc = re_match(pRe, zStr, -1);
+    if( !rc ) goto re_test1_end;
+    iBegin = pRe->iBegin;
+    iEnd = pRe->iEnd;
+    if( pRe->bMultiBegin ){
+      pRe->bKeepSpan = 0;
+      while( re_match(pRe, zStr+iBegin, iEnd-iBegin)==0 ){
+        if( zStr[++iBegin]>=0xc0 ){
+          while( (zStr[iBegin]&0xc0)==0x80 ) iBegin++;
+        }
+        if( iBegin>=iEnd ) goto re_test1_end;
+      }
+    }
+    sqlite3_result_text(context, (const char*)zStr+iBegin,
+                        iEnd - iBegin, SQLITE_TRANSIENT);
+  }
+re_test1_end:
+  if( setAux ){
+    sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
+  }
+}
 
 /*
 ** Invoke this routine to register the regexp() function with the
 ** SQLite database connection.
 */
@@ -754,7 +940,11 @@
 ){
   int rc = SQLITE_OK;
   SQLITE_EXTENSION_INIT2(pApi);
   rc = sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0,
                                  re_sql_func, 0, 0);
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_create_function(db, "re_test1", 2, SQLITE_UTF8, 0,
+                                 re_test1_func, 0, 0);
+  }
   return rc;
 }