Index: ext/fts3/unicode/mkunicode.tcl
==================================================================
--- ext/fts3/unicode/mkunicode.tcl
+++ ext/fts3/unicode/mkunicode.tcl
@@ -1,79 +1,7 @@
 
-#
-# Parameter $zName must be a path to the file UnicodeData.txt. This command
-# reads the file and returns a list of mappings required to remove all
-# diacritical marks from a unicode string. Each mapping is itself a list
-# consisting of two elements - the unicode codepoint and the single ASCII
-# character that it should be replaced with, or an empty string if the 
-# codepoint should simply be removed from the input. Examples:
-#
-#   { 224 a  }     (replace codepoint 224 to "a")
-#   { 769 "" }     (remove codepoint 769 from input)
-#
-# Mappings are only returned for non-upper case codepoints. It is assumed
-# that the input has already been folded to lower case.
-#
-proc rd_load_unicodedata_text {zName} {
-  global tl_lookup_table
-
-  set fd [open $zName]
-  set lField {
-    code
-    character_name
-    general_category
-    canonical_combining_classes
-    bidirectional_category
-    character_decomposition_mapping
-    decimal_digit_value
-    digit_value
-    numeric_value
-    mirrored
-    unicode_1_name
-    iso10646_comment_field
-    uppercase_mapping
-    lowercase_mapping
-    titlecase_mapping
-  }
-  set lRet [list]
-
-  while { ![eof $fd] } {
-    set line [gets $fd]
-    if {$line == ""} continue
-
-    set fields [split $line ";"]
-    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
-    foreach $lField $fields {}
-    if { [llength $character_decomposition_mapping]!=2
-      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
-    } {
-      continue
-    }
-
-    set iCode  [expr "0x$code"]
-    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
-    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
-
-    if {[info exists tl_lookup_table($iCode)]} continue
-
-    if { ($iAscii >= 97 && $iAscii <= 122)
-      || ($iAscii >= 65 && $iAscii <= 90)
-    } {
-      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
-      set dia($iDia) 1
-    }
-  }
-
-  foreach d [array names dia] {
-    lappend lRet [list $d ""]
-  }
-  set lRet [lsort -integer -index 0 $lRet]
-
-  close $fd
-  set lRet
-}
-
+source [file join [file dirname [info script]] parseunicode.tcl]
 
 proc print_rd {map} {
   global tl_lookup_table
   set aChar [list]
   set lRange [list]
@@ -202,57 +130,10 @@
 }
 
 
 #-------------------------------------------------------------------------
 
-# Parameter $zName must be a path to the file UnicodeData.txt. This command
-# reads the file and returns a list of codepoints (integers). The list
-# contains all codepoints in the UnicodeData.txt assigned to any "General
-# Category" that is not a "Letter" or "Number".
-#
-proc an_load_unicodedata_text {zName} {
-  set fd [open $zName]
-  set lField {
-    code
-    character_name
-    general_category
-    canonical_combining_classes
-    bidirectional_category
-    character_decomposition_mapping
-    decimal_digit_value
-    digit_value
-    numeric_value
-    mirrored
-    unicode_1_name
-    iso10646_comment_field
-    uppercase_mapping
-    lowercase_mapping
-    titlecase_mapping
-  }
-  set lRet [list]
-
-  while { ![eof $fd] } {
-    set line [gets $fd]
-    if {$line == ""} continue
-
-    set fields [split $line ";"]
-    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
-    foreach $lField $fields {}
-
-    set iCode [expr "0x$code"]
-    set bAlnum [expr {
-         [lsearch {L N} [string range $general_category 0 0]] >= 0
-      || $general_category=="Co"
-    }]
-
-    if { !$bAlnum } { lappend lRet $iCode }
-  }
-
-  close $fd
-  set lRet
-}
-
 proc an_load_separator_ranges {} {
   global unicodedata.txt
   set lSep [an_load_unicodedata_text ${unicodedata.txt}]
   unset -nocomplain iFirst 
   unset -nocomplain nRange 
@@ -438,33 +319,10 @@
   puts "\}"
 }
 
 #-------------------------------------------------------------------------
 
-proc tl_load_casefolding_txt {zName} {
-  global tl_lookup_table
-
-  set fd [open $zName]
-  while { ![eof $fd] } {
-    set line [gets $fd]
-    if {[string range $line 0 0] == "#"} continue
-    if {$line == ""} continue
-
-    foreach x {a b c d} {unset -nocomplain $x}
-    foreach {a b c d} [split $line ";"] {}
-
-    set a2 [list]
-    set c2 [list]
-    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
-    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
-    set b [string trim $b]
-    set d [string trim $d]
-
-    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
-  }
-}
-
 proc tl_create_records {} {
   global tl_lookup_table
 
   set iFirst ""
   set nOff 0
@@ -633,14 +491,16 @@
   assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
 
   if( c<128 ){
     if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
   }else if( c<65536 ){
+    const struct TableEntry *p;
     int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
     int iLo = 0;
     int iRes = -1;
 
+    assert( c>aEntry[0].iCode );
     while( iHi>=iLo ){
       int iTest = (iHi + iLo) / 2;
       int cmp = (c - aEntry[iTest].iCode);
       if( cmp>=0 ){
         iRes = iTest;
@@ -647,18 +507,16 @@
         iLo = iTest+1;
       }else{
         iHi = iTest-1;
       }
     }
-    assert( iRes<0 || c>=aEntry[iRes].iCode );
-
-    if( iRes>=0 ){
-      const struct TableEntry *p = &aEntry[iRes];
-      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
-        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
-        assert( ret>0 );
-      }
+
+    assert( iRes>=0 && c>=aEntry[iRes].iCode );
+    p = &aEntry[iRes];
+    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
+      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
+      assert( ret>0 );
     }
 
     if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
   }
   }]

ADDED   ext/fts3/unicode/parseunicode.tcl
Index: ext/fts3/unicode/parseunicode.tcl
==================================================================
--- /dev/null
+++ ext/fts3/unicode/parseunicode.tcl
@@ -0,0 +1,146 @@
+
+#--------------------------------------------------------------------------
+# Parameter $zName must be a path to the file UnicodeData.txt. This command
+# reads the file and returns a list of mappings required to remove all
+# diacritical marks from a unicode string. Each mapping is itself a list
+# consisting of two elements - the unicode codepoint and the single ASCII
+# character that it should be replaced with, or an empty string if the
+# codepoint should simply be removed from the input. Examples:
+#
+#   { 224 a  }     (replace codepoint 224 with "a")
+#   { 769 "" }     (remove codepoint 769 from input)
+#
+# Codepoints present in the global tl_lookup_table array (as populated by
+# tl_load_casefolding_txt) are skipped, as input is assumed to be already
+# case-folded. The returned list is sorted by codepoint.
+proc rd_load_unicodedata_text {zName} {
+  global tl_lookup_table
+
+  set fd [open $zName]
+  set lField {
+    code
+    character_name
+    general_category
+    canonical_combining_classes
+    bidirectional_category
+    character_decomposition_mapping
+    decimal_digit_value
+    digit_value
+    numeric_value
+    mirrored
+    unicode_1_name
+    iso10646_comment_field
+    uppercase_mapping
+    lowercase_mapping
+    titlecase_mapping
+  }
+  set lRet [list]
+
+  while { ![eof $fd] } {
+    set line [gets $fd]
+    if {$line == ""} continue
+
+    set fields [split $line ";"]
+    if {[llength $fields] != [llength $lField]} {
+      close $fd                      ;# do not leak the handle on error
+      error "parse error: $line"
+    }
+    foreach $lField $fields {}
+    set lMap $character_decomposition_mapping
+    if {[llength $lMap]!=2 || ![string is xdigit [lindex $lMap 0]]} continue
+    # A two-element hex decomposition: base character, then the diacritic.
+    set iCode  [expr "0x$code"]
+    set iAscii [expr "0x[lindex $lMap 0]"]
+    set iDia   [expr "0x[lindex $lMap 1]"]
+
+    if {[info exists tl_lookup_table($iCode)]} continue
+    # Only decompositions whose base is an ASCII letter yield mappings.
+    if { ($iAscii >= 97 && $iAscii <= 122)
+      || ($iAscii >= 65 && $iAscii <= 90)
+    } {
+      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
+      set dia($iDia) 1
+    }
+  }
+  # Each diacritic recorded above is mapped to "" so that it is removed.
+  foreach d [array names dia] {
+    lappend lRet [list $d ""]
+  }
+  set lRet [lsort -integer -index 0 $lRet]
+
+  close $fd
+  set lRet
+}
+
+#-------------------------------------------------------------------------
+# Parameter $zName must be a path to the file UnicodeData.txt. This command
+# reads the file and returns a list of codepoints (integers). The list
+# contains all codepoints in the UnicodeData.txt assigned to any "General
+# Category" that is neither a "Letter" (L*) nor a "Number" (N*), excluding
+# category "Co", which is treated as alphanumeric.
+proc an_load_unicodedata_text {zName} {
+  set fd [open $zName]
+  set lField {
+    code
+    character_name
+    general_category
+    canonical_combining_classes
+    bidirectional_category
+    character_decomposition_mapping
+    decimal_digit_value
+    digit_value
+    numeric_value
+    mirrored
+    unicode_1_name
+    iso10646_comment_field
+    uppercase_mapping
+    lowercase_mapping
+    titlecase_mapping
+  }
+  set lRet [list]
+
+  while { ![eof $fd] } {
+    set line [gets $fd]
+    if {$line == ""} continue
+
+    set fields [split $line ";"]
+    if {[llength $fields] != [llength $lField]} {
+      close $fd                      ;# do not leak the handle on error
+      error "parse error: $line"
+    }
+    foreach $lField $fields {}
+    set iCode [expr "0x$code"]
+    set c1 [string index $general_category 0]
+    if {$c1 ne "L" && $c1 ne "N" && $general_category ne "Co"} {
+      lappend lRet $iCode
+    }
+  }
+
+  close $fd
+  set lRet
+}
+
+# Read CaseFolding.txt file $zName and store every status "C" or "S" entry
+# in global array tl_lookup_table (hex fields are converted to integers).
+proc tl_load_casefolding_txt {zName} {
+  global tl_lookup_table
+
+  set fd [open $zName]
+  while { ![eof $fd] } {
+    set line [gets $fd]
+    if {[string range $line 0 0] == "#"} continue
+    if {$line == ""} continue
+
+    # Split on ";" into a (code), b (status), c (mapping) and d (unused).
+    foreach {a b c d} [split $line ";"] {}
+    set a2 [list]
+    set c2 [list]
+    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
+    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
+    set b [string trim $b]
+    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
+  }
+  close $fd
+}
+
+

Index: ext/fts5/fts5Int.h
==================================================================
--- ext/fts5/fts5Int.h
+++ ext/fts5/fts5Int.h
@@ -629,6 +629,17 @@
 
 /*
 ** End of interface to code in fts5_vocab.c.
 **************************************************************************/
 
+
+/**************************************************************************
+** Interface to automatically generated code in fts5_unicode2.c. 
+*/
+int sqlite3Fts5UnicodeIsalnum(int c);
+int sqlite3Fts5UnicodeIsdiacritic(int c);
+int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic);
+/*
+** End of interface to code in fts5_unicode2.c.
+**************************************************************************/
+
 #endif

Index: ext/fts5/fts5_expr.c
==================================================================
--- ext/fts5/fts5_expr.c
+++ ext/fts5/fts5_expr.c
@@ -1615,10 +1615,49 @@
   int nArg,                       /* Number of args */
   sqlite3_value **apVal           /* Function arguments */
 ){
   fts5ExprFunction(pCtx, nArg, apVal, 1);
 }
+
+/*
+** Implementation of the SQL scalar function fts5_isalnum(C). C is read as
+** an integer unicode code point. The result is 1 if the code point is
+** alpha-numeric according to sqlite3Fts5UnicodeIsalnum(), otherwise 0.
+*/
+static void fts5ExprIsAlnum(
+  sqlite3_context *pCtx,          /* Function call context */
+  int nArg,                       /* Number of args */
+  sqlite3_value **apVal           /* Function arguments */
+){
+  /* Exactly one argument is accepted; anything else is an SQL error. */
+  if( nArg!=1 ){
+    sqlite3_result_error(pCtx,
+        "wrong number of arguments to function fts5_isalnum", -1
+    );
+    return;
+  }
+  sqlite3_result_int(pCtx,
+      sqlite3Fts5UnicodeIsalnum(sqlite3_value_int(apVal[0])));
+}
+
+/* fts5_fold(C[,X]): SQL wrapper for sqlite3Fts5UnicodeFold(C,X); X defaults 0 */
+static void fts5ExprFold(
+  sqlite3_context *pCtx,          /* Function call context */
+  int nArg,                       /* Number of args */
+  sqlite3_value **apVal           /* Function arguments */
+){
+  int iCode;
+  int bRemoveDiacritics = 0;
+  if( nArg<1 || nArg>2 ){
+    sqlite3_result_error(pCtx,
+        "wrong number of arguments to function fts5_fold", -1);
+    return;
+  }
+  iCode = sqlite3_value_int(apVal[0]);
+  if( nArg==2 ) bRemoveDiacritics = sqlite3_value_int(apVal[1]);
+  sqlite3_result_int(pCtx, sqlite3Fts5UnicodeFold(iCode, bRemoveDiacritics));
+}
 
 /*
 ** This is called during initialization to register the fts5_expr() scalar
 ** UDF with the SQLite handle passed as the only argument.
 */
@@ -1625,12 +1664,14 @@
 int sqlite3Fts5ExprInit(Fts5Global *pGlobal, sqlite3 *db){
   struct Fts5ExprFunc {
     const char *z;
     void (*x)(sqlite3_context*,int,sqlite3_value**);
   } aFunc[] = {
-    { "fts5_expr", fts5ExprFunctionHr },
+    { "fts5_expr",     fts5ExprFunctionHr },
     { "fts5_expr_tcl", fts5ExprFunctionTcl },
+    { "fts5_isalnum",  fts5ExprIsAlnum },
+    { "fts5_fold",     fts5ExprFold },
   };
   int i;
   int rc = SQLITE_OK;
   void *pCtx = (void*)pGlobal;
 

Index: ext/fts5/fts5_tokenize.c
==================================================================
--- ext/fts5/fts5_tokenize.c
+++ ext/fts5/fts5_tokenize.c
@@ -172,17 +172,10 @@
 
 /**************************************************************************
 ** Start of unicode61 tokenizer implementation.
 */
 
-/*
-** Functions in fts5_unicode2.c. 
-*/
-int sqlite3Fts5UnicodeIsalnum(int c);
-int sqlite3Fts5UnicodeIsdiacritic(int c);
-int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic);
-
 
 /*
 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
 ** from the sqlite3 source file utf.c. If this file is compiled as part
 ** of the amalgamation, they are not required.

Index: ext/fts5/fts5_unicode2.c
==================================================================
--- ext/fts5/fts5_unicode2.c
+++ ext/fts5/fts5_unicode2.c
@@ -325,14 +325,16 @@
   assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
 
   if( c<128 ){
     if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
   }else if( c<65536 ){
+    const struct TableEntry *p;
     int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
     int iLo = 0;
     int iRes = -1;
 
+    assert( c>aEntry[0].iCode );
     while( iHi>=iLo ){
       int iTest = (iHi + iLo) / 2;
       int cmp = (c - aEntry[iTest].iCode);
       if( cmp>=0 ){
         iRes = iTest;
@@ -339,18 +341,16 @@
         iLo = iTest+1;
       }else{
         iHi = iTest-1;
       }
     }
-    assert( iRes<0 || c>=aEntry[iRes].iCode );
-
-    if( iRes>=0 ){
-      const struct TableEntry *p = &aEntry[iRes];
-      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
-        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
-        assert( ret>0 );
-      }
+
+    assert( iRes>=0 && c>=aEntry[iRes].iCode );
+    p = &aEntry[iRes];
+    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
+      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
+      assert( ret>0 );
     }
 
     if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
   }
   

ADDED   ext/fts5/test/fts5unicode3.test
Index: ext/fts5/test/fts5unicode3.test
==================================================================
--- /dev/null
+++ ext/fts5/test/fts5unicode3.test
@@ -0,0 +1,122 @@
+# 2014 Dec 20
+#
+# The author disclaims copyright to this source code.  In place of
+# a legal notice, here is a blessing:
+#
+#    May you do good and not evil.
+#    May you find forgiveness for yourself and forgive others.
+#    May you share freely, never taking more than you give.
+#
+#***********************************************************************
+#
+# Tests checking fts5_fold(), fts5_isalnum() and unicode61 against Unicode data
+#
+
+proc fts3_unicode_path {file} {
+  return [file join [file dirname [info script]] .. .. fts3 unicode $file]
+}
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+source [fts3_unicode_path parseunicode.tcl]
+set testprefix fts5unicode3
+
+set CF [fts3_unicode_path CaseFolding.txt]
+set UD [fts3_unicode_path UnicodeData.txt]
+
+tl_load_casefolding_txt $CF
+foreach x [an_load_unicodedata_text $UD] {
+  set aNotAlnum($x) 1
+}
+
+# aDiacritic(code) is the ASCII code of the replacement character, or 0 if
+# the codepoint is simply removed (empty replacement).
+foreach y [rd_load_unicodedata_text $UD] {
+  set code  [lindex $y 0]
+  set ascii [lindex $y 1]
+  set int 0
+  if {$ascii!=""} { binary scan $ascii c int }
+  set aDiacritic($code) $int
+}
+
+# Reference fold: tl_lookup_table first, then (optionally) aDiacritic.
+proc tcl_fold {i {bRemoveDiacritic 0}} {
+  global tl_lookup_table aDiacritic
+  set iOut $i
+  if {[info exists tl_lookup_table($iOut)]} {
+    set iOut $tl_lookup_table($iOut)
+  }
+  if {$bRemoveDiacritic && [info exists aDiacritic($iOut)]} {
+    set iOut $aDiacritic($iOut)
+  }
+  expr $iOut
+}
+db func tcl_fold tcl_fold
+
+# Return 1 unless codepoint $i is listed in the global array aNotAlnum.
+proc tcl_isalnum {i} {
+  expr {[info exists ::aNotAlnum($i)] ? 0 : 1}
+}
+db func tcl_isalnum tcl_isalnum
+
+
+do_catchsql_test 1.0.1 {
+  SELECT fts5_isalnum(1, 2, 3);
+} {1 {wrong number of arguments to function fts5_isalnum}}
+do_catchsql_test 1.0.2 {
+  SELECT fts5_fold();
+} {1 {wrong number of arguments to function fts5_fold}}
+do_catchsql_test 1.0.3 {
+  SELECT fts5_fold(1,2,3);
+} {1 {wrong number of arguments to function fts5_fold}}
+
+do_execsql_test 1.1 {
+  WITH ii(i) AS (
+    SELECT -1
+    UNION ALL
+    SELECT i+1 FROM ii WHERE i<100000
+  )
+  SELECT count(*), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int);
+} {0 {}}
+
+do_execsql_test 1.2 {
+  WITH ii(i) AS (
+    SELECT -1
+    UNION ALL
+    SELECT i+1 FROM ii WHERE i<100000
+  )
+  SELECT count(*), min(i) FROM ii 
+  WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int);
+} {0 {}}
+
+do_execsql_test 1.3 {
+  WITH ii(i) AS (
+    SELECT -1
+    UNION ALL
+    SELECT i+1 FROM ii WHERE i<100000
+  )
+  SELECT count(*), min(i) FROM ii 
+  WHERE fts5_isalnum(i)!=CAST(tcl_isalnum(i) AS int);
+} {0 {}}
+
+do_test 1.4 {
+  set str {CREATE VIRTUAL TABLE f3 USING fts5(a, tokenize=}
+  append str {"unicode61 separators '}
+  for {set i 700} {$i<900} {incr i} {
+    append str [format %c $i]
+  }
+  append str {'");}
+  execsql $str
+} {}
+do_test 1.5 {
+  set str {CREATE VIRTUAL TABLE f5 USING fts5(a, tokenize=}
+  append str {"unicode61 tokenchars '}
+  for {set i 700} {$i<900} {incr i} {
+    append str [format %c $i]
+  }
+  append str {'");}
+  execsql $str
+} {}
+
+
+finish_test
+