Index: ext/fts3/unicode/mkunicode.tcl ================================================================== --- ext/fts3/unicode/mkunicode.tcl +++ ext/fts3/unicode/mkunicode.tcl @@ -1,79 +1,7 @@ -# -# Parameter $zName must be a path to the file UnicodeData.txt. This command -# reads the file and returns a list of mappings required to remove all -# diacritical marks from a unicode string. Each mapping is itself a list -# consisting of two elements - the unicode codepoint and the single ASCII -# character that it should be replaced with, or an empty string if the -# codepoint should simply be removed from the input. Examples: -# -# { 224 a } (replace codepoint 224 to "a") -# { 769 "" } (remove codepoint 769 from input) -# -# Mappings are only returned for non-upper case codepoints. It is assumed -# that the input has already been folded to lower case. -# -proc rd_load_unicodedata_text {zName} { - global tl_lookup_table - - set fd [open $zName] - set lField { - code - character_name - general_category - canonical_combining_classes - bidirectional_category - character_decomposition_mapping - decimal_digit_value - digit_value - numeric_value - mirrored - unicode_1_name - iso10646_comment_field - uppercase_mapping - lowercase_mapping - titlecase_mapping - } - set lRet [list] - - while { ![eof $fd] } { - set line [gets $fd] - if {$line == ""} continue - - set fields [split $line ";"] - if {[llength $fields] != [llength $lField]} { error "parse error: $line" } - foreach $lField $fields {} - if { [llength $character_decomposition_mapping]!=2 - || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 - } { - continue - } - - set iCode [expr "0x$code"] - set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] - set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] - - if {[info exists tl_lookup_table($iCode)]} continue - - if { ($iAscii >= 97 && $iAscii <= 122) - || ($iAscii >= 65 && $iAscii <= 90) - } { - lappend lRet [list $iCode [string tolower [format %c $iAscii]]] - set dia($iDia) 1 - } - } - - foreach d [array names dia] { - lappend lRet [list $d ""] - } - set lRet [lsort -integer -index 0 $lRet] - - close $fd - set lRet -} - +source [file join [file dirname [info script]] parseunicode.tcl] proc print_rd {map} { global tl_lookup_table set aChar [list] set lRange [list] @@ -202,57 +130,10 @@ } #------------------------------------------------------------------------- -# Parameter $zName must be a path to the file UnicodeData.txt. This command -# reads the file and returns a list of codepoints (integers). The list -# contains all codepoints in the UnicodeData.txt assigned to any "General -# Category" that is not a "Letter" or "Number". -# -proc an_load_unicodedata_text {zName} { - set fd [open $zName] - set lField { - code - character_name - general_category - canonical_combining_classes - bidirectional_category - character_decomposition_mapping - decimal_digit_value - digit_value - numeric_value - mirrored - unicode_1_name - iso10646_comment_field - uppercase_mapping - lowercase_mapping - titlecase_mapping - } - set lRet [list] - - while { ![eof $fd] } { - set line [gets $fd] - if {$line == ""} continue - - set fields [split $line ";"] - if {[llength $fields] != [llength $lField]} { error "parse error: $line" } - foreach $lField $fields {} - - set iCode [expr "0x$code"] - set bAlnum [expr { - [lsearch {L N} [string range $general_category 0 0]] >= 0 - || $general_category=="Co" - }] - - if { !$bAlnum } { lappend lRet $iCode } - } - - close $fd - set lRet -} - proc an_load_separator_ranges {} { global unicodedata.txt set lSep [an_load_unicodedata_text ${unicodedata.txt}] unset -nocomplain iFirst unset -nocomplain nRange @@ -438,33 +319,10 @@ puts "\}" } #------------------------------------------------------------------------- -proc tl_load_casefolding_txt {zName} { - global tl_lookup_table - - set fd [open $zName] - while { ![eof $fd] } { - set line [gets $fd] - if {[string range $line 0 0] == "#"} continue - if {$line == ""} continue - - foreach x {a b c d} {unset -nocomplain $x} - foreach {a b c d} [split $line ";"] {} - - set a2 [list] - set c2 [list] - foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } - foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } - set b [string trim $b] - set d [string trim $d] - - if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } - } -} - proc tl_create_records {} { global tl_lookup_table set iFirst "" set nOff 0 @@ -633,14 +491,16 @@ assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); if( c<128 ){ if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); }else if( c<65536 ){ + const struct TableEntry *p; int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; int iLo = 0; int iRes = -1; + assert( c>aEntry[0].iCode ); while( iHi>=iLo ){ int iTest = (iHi + iLo) / 2; int cmp = (c - aEntry[iTest].iCode); if( cmp>=0 ){ iRes = iTest; @@ -647,18 +507,16 @@ iLo = iTest+1; }else{ iHi = iTest-1; } } - assert( iRes<0 || c>=aEntry[iRes].iCode ); - - if( iRes>=0 ){ - const struct TableEntry *p = &aEntry[iRes]; - if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ - ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; - assert( ret>0 ); - } + + assert( iRes>=0 && c>=aEntry[iRes].iCode ); + p = &aEntry[iRes]; + if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ + ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; + assert( ret>0 ); } if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret); } }] ADDED ext/fts3/unicode/parseunicode.tcl Index: ext/fts3/unicode/parseunicode.tcl ================================================================== --- /dev/null +++ ext/fts3/unicode/parseunicode.tcl @@ -0,0 +1,146 @@ + +#-------------------------------------------------------------------------- +# Parameter $zName must be a path to the file UnicodeData.txt. This command +# reads the file and returns a list of mappings required to remove all +# diacritical marks from a unicode string. Each mapping is itself a list +# consisting of two elements - the unicode codepoint and the single ASCII +# character that it should be replaced with, or an empty string if the +# codepoint should simply be removed from the input. Examples: +# +# { 224 a } (replace codepoint 224 to "a") +# { 769 "" } (remove codepoint 769 from input) +# +# Mappings are only returned for non-upper case codepoints. It is assumed +# that the input has already been folded to lower case. +# +proc rd_load_unicodedata_text {zName} { + global tl_lookup_table + + set fd [open $zName] + set lField { + code + character_name + general_category + canonical_combining_classes + bidirectional_category + character_decomposition_mapping + decimal_digit_value + digit_value + numeric_value + mirrored + unicode_1_name + iso10646_comment_field + uppercase_mapping + lowercase_mapping + titlecase_mapping + } + set lRet [list] + + while { ![eof $fd] } { + set line [gets $fd] + if {$line == ""} continue + + set fields [split $line ";"] + if {[llength $fields] != [llength $lField]} { error "parse error: $line" } + foreach $lField $fields {} + if { [llength $character_decomposition_mapping]!=2 + || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 + } { + continue + } + + set iCode [expr "0x$code"] + set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] + set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] + + if {[info exists tl_lookup_table($iCode)]} continue + + if { ($iAscii >= 97 && $iAscii <= 122) + || ($iAscii >= 65 && $iAscii <= 90) + } { + lappend lRet [list $iCode [string tolower [format %c $iAscii]]] + set dia($iDia) 1 + } + } + + foreach d [array names dia] { + lappend lRet [list $d ""] + } + set lRet [lsort -integer -index 0 $lRet] + + close $fd + set lRet +} + +#------------------------------------------------------------------------- +# Parameter $zName must be a path to the file UnicodeData.txt. This command +# reads the file and returns a list of codepoints (integers). The list +# contains all codepoints in the UnicodeData.txt assigned to any "General +# Category" that is not a "Letter" or "Number". +# +proc an_load_unicodedata_text {zName} { + set fd [open $zName] + set lField { + code + character_name + general_category + canonical_combining_classes + bidirectional_category + character_decomposition_mapping + decimal_digit_value + digit_value + numeric_value + mirrored + unicode_1_name + iso10646_comment_field + uppercase_mapping + lowercase_mapping + titlecase_mapping + } + set lRet [list] + + while { ![eof $fd] } { + set line [gets $fd] + if {$line == ""} continue + + set fields [split $line ";"] + if {[llength $fields] != [llength $lField]} { error "parse error: $line" } + foreach $lField $fields {} + + set iCode [expr "0x$code"] + set bAlnum [expr { + [lsearch {L N} [string range $general_category 0 0]] >= 0 + || $general_category=="Co" + }] + + if { !$bAlnum } { lappend lRet $iCode } + } + + close $fd + set lRet +} + +proc tl_load_casefolding_txt {zName} { + global tl_lookup_table + + set fd [open $zName] + while { ![eof $fd] } { + set line [gets $fd] + if {[string range $line 0 0] == "#"} continue + if {$line == ""} continue + + foreach x {a b c d} {unset -nocomplain $x} + foreach {a b c d} [split $line ";"] {} + + set a2 [list] + set c2 [list] + foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } + foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } + set b [string trim $b] + set d [string trim $d] + + if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } + } +} + + Index: ext/fts5/fts5Int.h ================================================================== --- ext/fts5/fts5Int.h +++ ext/fts5/fts5Int.h @@ -629,6 +629,17 @@ /* ** End of interface to code in fts5_vocab.c. **************************************************************************/ + +/************************************************************************** +** Interface to automatically generated code in fts5_unicode2.c. +*/ +int sqlite3Fts5UnicodeIsalnum(int c); +int sqlite3Fts5UnicodeIsdiacritic(int c); +int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic); +/* +** End of interface to code in fts5_unicode2.c. +**************************************************************************/ + #endif Index: ext/fts5/fts5_expr.c ================================================================== --- ext/fts5/fts5_expr.c +++ ext/fts5/fts5_expr.c @@ -1615,10 +1615,49 @@ int nArg, /* Number of args */ sqlite3_value **apVal /* Function arguments */ ){ fts5ExprFunction(pCtx, nArg, apVal, 1); } + +/* +** The implementation of an SQLite user-defined-function that accepts a +** single integer as an argument. If the integer is an alpha-numeric +** unicode code point, 1 is returned. Otherwise 0. +*/ +static void fts5ExprIsAlnum( + sqlite3_context *pCtx, /* Function call context */ + int nArg, /* Number of args */ + sqlite3_value **apVal /* Function arguments */ +){ + int iCode; + if( nArg!=1 ){ + sqlite3_result_error(pCtx, + "wrong number of arguments to function fts5_isalnum", -1 + ); + return; + } + iCode = sqlite3_value_int(apVal[0]); + sqlite3_result_int(pCtx, sqlite3Fts5UnicodeIsalnum(iCode)); +} + +static void fts5ExprFold( + sqlite3_context *pCtx, /* Function call context */ + int nArg, /* Number of args */ + sqlite3_value **apVal /* Function arguments */ +){ + if( nArg!=1 && nArg!=2 ){ + sqlite3_result_error(pCtx, + "wrong number of arguments to function fts5_fold", -1 + ); + }else{ + int iCode; + int bRemoveDiacritics = 0; + iCode = sqlite3_value_int(apVal[0]); + if( nArg==2 ) bRemoveDiacritics = sqlite3_value_int(apVal[1]); + sqlite3_result_int(pCtx, sqlite3Fts5UnicodeFold(iCode, bRemoveDiacritics)); + } +} /* ** This is called during initialization to register the fts5_expr() scalar ** UDF with the SQLite handle passed as the only argument. */ @@ -1625,12 +1664,14 @@ int sqlite3Fts5ExprInit(Fts5Global *pGlobal, sqlite3 *db){ struct Fts5ExprFunc { const char *z; void (*x)(sqlite3_context*,int,sqlite3_value**); } aFunc[] = { - { "fts5_expr", fts5ExprFunctionHr }, + { "fts5_expr", fts5ExprFunctionHr }, { "fts5_expr_tcl", fts5ExprFunctionTcl }, + { "fts5_isalnum", fts5ExprIsAlnum }, + { "fts5_fold", fts5ExprFold }, }; int i; int rc = SQLITE_OK; void *pCtx = (void*)pGlobal; Index: ext/fts5/fts5_tokenize.c ================================================================== --- ext/fts5/fts5_tokenize.c +++ ext/fts5/fts5_tokenize.c @@ -172,17 +172,10 @@ /************************************************************************** ** Start of unicode61 tokenizer implementation. */ -/* -** Functions in fts5_unicode2.c. -*/ -int sqlite3Fts5UnicodeIsalnum(int c); -int sqlite3Fts5UnicodeIsdiacritic(int c); -int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic); - /* ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied ** from the sqlite3 source file utf.c. If this file is compiled as part ** of the amalgamation, they are not required. Index: ext/fts5/fts5_unicode2.c ================================================================== --- ext/fts5/fts5_unicode2.c +++ ext/fts5/fts5_unicode2.c @@ -325,14 +325,16 @@ assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); if( c<128 ){ if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); }else if( c<65536 ){ + const struct TableEntry *p; int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; int iLo = 0; int iRes = -1; + assert( c>aEntry[0].iCode ); while( iHi>=iLo ){ int iTest = (iHi + iLo) / 2; int cmp = (c - aEntry[iTest].iCode); if( cmp>=0 ){ iRes = iTest; @@ -339,18 +341,16 @@ iLo = iTest+1; }else{ iHi = iTest-1; } } - assert( iRes<0 || c>=aEntry[iRes].iCode ); - - if( iRes>=0 ){ - const struct TableEntry *p = &aEntry[iRes]; - if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ - ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; - assert( ret>0 ); - } + + assert( iRes>=0 && c>=aEntry[iRes].iCode ); + p = &aEntry[iRes]; + if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ + ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; + assert( ret>0 ); } if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret); } ADDED ext/fts5/test/fts5unicode3.test Index: ext/fts5/test/fts5unicode3.test ================================================================== --- /dev/null +++ ext/fts5/test/fts5unicode3.test @@ -0,0 +1,122 @@ +# 2014 Dec 20 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# +# Tests focusing on the fts5 tokenizers +# + +proc fts3_unicode_path {file} { + file join [file dirname [info script]] .. .. fts3 unicode $file +} + +source [file join [file dirname [info script]] fts5_common.tcl] +source [fts3_unicode_path parseunicode.tcl] +set testprefix fts5unicode3 + +set CF [fts3_unicode_path CaseFolding.txt] +set UD [fts3_unicode_path UnicodeData.txt] + +tl_load_casefolding_txt $CF +foreach x [an_load_unicodedata_text $UD] { + set aNotAlnum($x) 1 +} + +foreach {y} [rd_load_unicodedata_text $UD] { + foreach {code ascii} $y {} + if {$ascii==""} { + set int 0 + } else { + binary scan $ascii c int + } + set aDiacritic($code) $int +} + +proc tcl_fold {i {bRemoveDiacritic 0}} { + global tl_lookup_table + global aDiacritic + + if {[info exists tl_lookup_table($i)]} { + set i $tl_lookup_table($i) + } + if {$bRemoveDiacritic && [info exists aDiacritic($i)]} { + set i $aDiacritic($i) + } + expr $i +} +db func tcl_fold tcl_fold + +proc tcl_isalnum {i} { + global aNotAlnum + expr {![info exists aNotAlnum($i)]} +} +db func tcl_isalnum tcl_isalnum + + +do_catchsql_test 1.0.1 { + SELECT fts5_isalnum(1, 2, 3); +} {1 {wrong number of arguments to function fts5_isalnum}} +do_catchsql_test 1.0.2 { + SELECT fts5_fold(); +} {1 {wrong number of arguments to function fts5_fold}} +do_catchsql_test 1.0.3 { + SELECT fts5_fold(1,2,3); +} {1 {wrong number of arguments to function fts5_fold}} + +do_execsql_test 1.1 { + WITH ii(i) AS ( + SELECT -1 + UNION ALL + SELECT i+1 FROM ii WHERE i<100000 + ) + SELECT count(*), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int); +} {0 {}} + +do_execsql_test 1.2 { + WITH ii(i) AS ( + SELECT -1 + UNION ALL + SELECT i+1 FROM ii WHERE i<100000 + ) + SELECT count(*), min(i) FROM ii + WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int); +} {0 {}} + +do_execsql_test 1.3 { + WITH ii(i) AS ( + SELECT -1 + UNION ALL + SELECT i+1 FROM ii WHERE i<100000 + ) + SELECT count(*), min(i) FROM ii + WHERE fts5_isalnum(i)!=CAST(tcl_isalnum(i) AS int); +} {0 {}} + +do_test 1.4 { + set str {CREATE VIRTUAL TABLE f3 USING fts5(a, tokenize=} + append str {"unicode61 separators '} + for {set i 700} {$i<900} {incr i} { + append str [format %c $i] + } + append str {'");} + execsql $str +} {} +do_test 1.5 { + set str {CREATE VIRTUAL TABLE f5 USING fts5(a, tokenize=} + append str {"unicode61 tokenchars '} + for {set i 700} {$i<900} {incr i} { + append str [format %c $i] + } + append str {'");} + execsql $str +} {} + + +finish_test +