/ Check-in [fea8a4db9d]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improve test coverage of fts5_unicode2.c.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1: fea8a4db9d8c7b9a946017a0dc984cbca6ce240e
User & Date: dan 2015-05-22 06:08:25
Context
2015-05-22
07:44
Increase test coverage of fts5_vocab.c. check-in: 065ab83a6c user: dan tags: fts5
06:08
Improve test coverage of fts5_unicode2.c. check-in: fea8a4db9d user: dan tags: fts5
2015-05-20
09:27
Improve test coverage of fts5_tokenize.c. check-in: 0e91a6a520 user: dan tags: fts5
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts3/unicode/mkunicode.tcl.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74

75
76
77
78
79
80
81
...
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
...
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
...
631
632
633
634
635
636
637

638
639
640
641

642
643
644
645
646
647
648
649
650
651
652
653
654

655
656
657
658
659
660
661
662
663
664
665
666

#
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of mappings required to remove all
# diacritical marks from a unicode string. Each mapping is itself a list
# consisting of two elements - the unicode codepoint and the single ASCII
# character that it should be replaced with, or an empty string if the 
# codepoint should simply be removed from the input. Examples:
#
#   { 224 a  }     (replace codepoint 224 to "a")
#   { 769 "" }     (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case.
#
proc rd_load_unicodedata_text {zName} {
  global tl_lookup_table

  set fd [open $zName]
  set lField {
    code
    character_name
    general_category
    canonical_combining_classes
    bidirectional_category
    character_decomposition_mapping
    decimal_digit_value
    digit_value
    numeric_value
    mirrored
    unicode_1_name
    iso10646_comment_field
    uppercase_mapping
    lowercase_mapping
    titlecase_mapping
  }
  set lRet [list]

  while { ![eof $fd] } {
    set line [gets $fd]
    if {$line == ""} continue

    set fields [split $line ";"]
    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
    foreach $lField $fields {}
    if { [llength $character_decomposition_mapping]!=2
      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
    } {
      continue
    }

    set iCode  [expr "0x$code"]
    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]

    if {[info exists tl_lookup_table($iCode)]} continue

    if { ($iAscii >= 97 && $iAscii <= 122)
      || ($iAscii >= 65 && $iAscii <= 90)
    } {
      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
      set dia($iDia) 1
    }
  }

  foreach d [array names dia] {
    lappend lRet [list $d ""]
  }
  set lRet [lsort -integer -index 0 $lRet]

  close $fd
  set lRet
}



proc print_rd {map} {
  global tl_lookup_table
  set aChar [list]
  set lRange [list]

  set nRange 1
................................................................................
  puts "      (mask1 & (1 << (c-$iFirst-32)));"
  puts "\}"
}


#-------------------------------------------------------------------------

# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of codepoints (integers). The list
# contains all codepoints in the UnicodeData.txt assigned to any "General
# Category" that is not a "Letter" or "Number".
#
proc an_load_unicodedata_text {zName} {
  set fd [open $zName]
  set lField {
    code
    character_name
    general_category
    canonical_combining_classes
    bidirectional_category
    character_decomposition_mapping
    decimal_digit_value
    digit_value
    numeric_value
    mirrored
    unicode_1_name
    iso10646_comment_field
    uppercase_mapping
    lowercase_mapping
    titlecase_mapping
  }
  set lRet [list]

  while { ![eof $fd] } {
    set line [gets $fd]
    if {$line == ""} continue

    set fields [split $line ";"]
    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
    foreach $lField $fields {}

    set iCode [expr "0x$code"]
    set bAlnum [expr {
         [lsearch {L N} [string range $general_category 0 0]] >= 0
      || $general_category=="Co"
    }]

    if { !$bAlnum } { lappend lRet $iCode }
  }

  close $fd
  set lRet
}

proc an_load_separator_ranges {} {
  global unicodedata.txt
  set lSep [an_load_unicodedata_text ${unicodedata.txt}]
  unset -nocomplain iFirst 
  unset -nocomplain nRange 
  set lRange [list]
  foreach sep $lSep {
................................................................................
  }]
  puts "  return 0;"
  puts "\}"
}

#-------------------------------------------------------------------------

proc tl_load_casefolding_txt {zName} {
  global tl_lookup_table

  set fd [open $zName]
  while { ![eof $fd] } {
    set line [gets $fd]
    if {[string range $line 0 0] == "#"} continue
    if {$line == ""} continue

    foreach x {a b c d} {unset -nocomplain $x}
    foreach {a b c d} [split $line ";"] {}

    set a2 [list]
    set c2 [list]
    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
    set b [string trim $b]
    set d [string trim $d]

    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
  }
}

proc tl_create_records {} {
  global tl_lookup_table

  set iFirst ""
  set nOff 0
  set nRange 0
  set nIncr 0
................................................................................

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){

    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;


    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( iRes<0 || c>=aEntry[iRes].iCode );

    if( iRes>=0 ){

      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }

    if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
  }
  }]

  foreach entry $lHigh {

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
>







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







>




>










<

<
>
|
|
|
|
<







1









































































2
3
4
5
6
7
8
9
...
128
129
130
131
132
133
134















































135
136
137
138
139
140
141
...
317
318
319
320
321
322
323























324
325
326
327
328
329
330
...
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511

512

513
514
515
516
517

518
519
520
521
522
523
524










































































source [file join [file dirname [info script]] parseunicode.tcl]

proc print_rd {map} {
  global tl_lookup_table
  set aChar [list]
  set lRange [list]

  set nRange 1
................................................................................
  puts "      (mask1 & (1 << (c-$iFirst-32)));"
  puts "\}"
}


#-------------------------------------------------------------------------
















































proc an_load_separator_ranges {} {
  global unicodedata.txt
  set lSep [an_load_unicodedata_text ${unicodedata.txt}]
  unset -nocomplain iFirst 
  unset -nocomplain nRange 
  set lRange [list]
  foreach sep $lSep {
................................................................................
  }]
  puts "  return 0;"
  puts "\}"
}

#-------------------------------------------------------------------------
























proc tl_create_records {} {
  global tl_lookup_table

  set iFirst ""
  set nOff 0
  set nRange 0
  set nIncr 0
................................................................................

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    const struct TableEntry *p;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    assert( c>aEntry[0].iCode );
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }



    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );

    }

    if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
  }
  }]

  foreach entry $lHigh {

Added ext/fts3/unicode/parseunicode.tcl.





































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

#--------------------------------------------------------------------------
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of mappings required to remove all
# diacritical marks from a unicode string. Each mapping is itself a list
# consisting of two elements - the unicode codepoint and the single ASCII
# character that it should be replaced with, or an empty string if the 
# codepoint should simply be removed from the input. Examples:
#
#   { 224 a  }     (replace codepoint 224 to "a")
#   { 769 "" }     (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case.
#
proc rd_load_unicodedata_text {zName} {
  global tl_lookup_table

  set fd [open $zName]
  set lField {
    code
    character_name
    general_category
    canonical_combining_classes
    bidirectional_category
    character_decomposition_mapping
    decimal_digit_value
    digit_value
    numeric_value
    mirrored
    unicode_1_name
    iso10646_comment_field
    uppercase_mapping
    lowercase_mapping
    titlecase_mapping
  }
  set lRet [list]

  while { ![eof $fd] } {
    set line [gets $fd]
    if {$line == ""} continue

    set fields [split $line ";"]
    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
    foreach $lField $fields {}
    if { [llength $character_decomposition_mapping]!=2
      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
    } {
      continue
    }

    set iCode  [expr "0x$code"]
    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]

    if {[info exists tl_lookup_table($iCode)]} continue

    if { ($iAscii >= 97 && $iAscii <= 122)
      || ($iAscii >= 65 && $iAscii <= 90)
    } {
      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
      set dia($iDia) 1
    }
  }

  foreach d [array names dia] {
    lappend lRet [list $d ""]
  }
  set lRet [lsort -integer -index 0 $lRet]

  close $fd
  set lRet
}

#-------------------------------------------------------------------------
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of codepoints (integers). The list
# contains all codepoints in the UnicodeData.txt assigned to any "General
# Category" that is not a "Letter" or "Number".
#
proc an_load_unicodedata_text {zName} {
  set fd [open $zName]
  set lField {
    code
    character_name
    general_category
    canonical_combining_classes
    bidirectional_category
    character_decomposition_mapping
    decimal_digit_value
    digit_value
    numeric_value
    mirrored
    unicode_1_name
    iso10646_comment_field
    uppercase_mapping
    lowercase_mapping
    titlecase_mapping
  }
  set lRet [list]

  while { ![eof $fd] } {
    set line [gets $fd]
    if {$line == ""} continue

    set fields [split $line ";"]
    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
    foreach $lField $fields {}

    set iCode [expr "0x$code"]
    set bAlnum [expr {
         [lsearch {L N} [string range $general_category 0 0]] >= 0
      || $general_category=="Co"
    }]

    if { !$bAlnum } { lappend lRet $iCode }
  }

  close $fd
  set lRet
}

proc tl_load_casefolding_txt {zName} {
  global tl_lookup_table

  set fd [open $zName]
  while { ![eof $fd] } {
    set line [gets $fd]
    if {[string range $line 0 0] == "#"} continue
    if {$line == ""} continue

    foreach x {a b c d} {unset -nocomplain $x}
    foreach {a b c d} [split $line ";"] {}

    set a2 [list]
    set c2 [list]
    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
    set b [string trim $b]
    set d [string trim $d]

    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
  }
}


Changes to ext/fts5/fts5Int.h.

627
628
629
630
631
632
633











634

int sqlite3Fts5VocabInit(Fts5Global*, sqlite3*);

/*
** End of interface to code in fts5_vocab.c.
**************************************************************************/












#endif







>
>
>
>
>
>
>
>
>
>
>

627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645

int sqlite3Fts5VocabInit(Fts5Global*, sqlite3*);

/*
** End of interface to code in fts5_vocab.c.
**************************************************************************/


/**************************************************************************
** Interface to automatically generated code in fts5_unicode2.c. 
*/
int sqlite3Fts5UnicodeIsalnum(int c);
int sqlite3Fts5UnicodeIsdiacritic(int c);
int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic);
/*
** End of interface to code in fts5_unicode2.c.
**************************************************************************/

#endif

Changes to ext/fts5/fts5_expr.c.

1613
1614
1615
1616
1617
1618
1619







































1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631


1632
1633
1634
1635
1636
1637
1638
static void fts5ExprFunctionTcl(
  sqlite3_context *pCtx,          /* Function call context */
  int nArg,                       /* Number of args */
  sqlite3_value **apVal           /* Function arguments */
){
  fts5ExprFunction(pCtx, nArg, apVal, 1);
}








































/*
** This is called during initialization to register the fts5_expr() scalar
** UDF with the SQLite handle passed as the only argument.
*/
int sqlite3Fts5ExprInit(Fts5Global *pGlobal, sqlite3 *db){
  struct Fts5ExprFunc {
    const char *z;
    void (*x)(sqlite3_context*,int,sqlite3_value**);
  } aFunc[] = {
    { "fts5_expr", fts5ExprFunctionHr },
    { "fts5_expr_tcl", fts5ExprFunctionTcl },


  };
  int i;
  int rc = SQLITE_OK;
  void *pCtx = (void*)pGlobal;

  for(i=0; rc==SQLITE_OK && i<(sizeof(aFunc) / sizeof(aFunc[0])); i++){
    struct Fts5ExprFunc *p = &aFunc[i];







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>










|

>
>







1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
static void fts5ExprFunctionTcl(
  sqlite3_context *pCtx,          /* Function call context */
  int nArg,                       /* Number of args */
  sqlite3_value **apVal           /* Function arguments */
){
  fts5ExprFunction(pCtx, nArg, apVal, 1);
}

/*
** The implementation of an SQLite user-defined-function that accepts a
** single integer as an argument. If the integer is an alpha-numeric 
** unicode code point, 1 is returned. Otherwise 0.
*/
static void fts5ExprIsAlnum(
  sqlite3_context *pCtx,          /* Function call context */
  int nArg,                       /* Number of args */
  sqlite3_value **apVal           /* Function arguments */
){
  int iCode;
  if( nArg!=1 ){
    sqlite3_result_error(pCtx, 
        "wrong number of arguments to function fts5_isalnum", -1
    );
    return;
  }
  iCode = sqlite3_value_int(apVal[0]);
  sqlite3_result_int(pCtx, sqlite3Fts5UnicodeIsalnum(iCode));
}

static void fts5ExprFold(
  sqlite3_context *pCtx,          /* Function call context */
  int nArg,                       /* Number of args */
  sqlite3_value **apVal           /* Function arguments */
){
  if( nArg!=1 && nArg!=2 ){
    sqlite3_result_error(pCtx, 
        "wrong number of arguments to function fts5_fold", -1
    );
  }else{
    int iCode;
    int bRemoveDiacritics = 0;
    iCode = sqlite3_value_int(apVal[0]);
    if( nArg==2 ) bRemoveDiacritics = sqlite3_value_int(apVal[1]);
    sqlite3_result_int(pCtx, sqlite3Fts5UnicodeFold(iCode, bRemoveDiacritics));
  }
}

/*
** This is called during initialization to register the fts5_expr() scalar
** UDF with the SQLite handle passed as the only argument.
*/
int sqlite3Fts5ExprInit(Fts5Global *pGlobal, sqlite3 *db){
  struct Fts5ExprFunc {
    const char *z;
    void (*x)(sqlite3_context*,int,sqlite3_value**);
  } aFunc[] = {
    { "fts5_expr",     fts5ExprFunctionHr },
    { "fts5_expr_tcl", fts5ExprFunctionTcl },
    { "fts5_isalnum",  fts5ExprIsAlnum },
    { "fts5_fold",     fts5ExprFold },
  };
  int i;
  int rc = SQLITE_OK;
  void *pCtx = (void*)pGlobal;

  for(i=0; rc==SQLITE_OK && i<(sizeof(aFunc) / sizeof(aFunc[0])); i++){
    struct Fts5ExprFunc *p = &aFunc[i];

Changes to ext/fts5/fts5_tokenize.c.

170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
  return rc;
}

/**************************************************************************
** Start of unicode61 tokenizer implementation.
*/

/*
** Functions in fts5_unicode2.c. 
*/
int sqlite3Fts5UnicodeIsalnum(int c);
int sqlite3Fts5UnicodeIsdiacritic(int c);
int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic);


/*
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
** from the sqlite3 source file utf.c. If this file is compiled as part
** of the amalgamation, they are not required.
*/
#ifndef SQLITE_AMALGAMATION







<
<
<
<
<
<
<







170
171
172
173
174
175
176







177
178
179
180
181
182
183
  return rc;
}

/**************************************************************************
** Start of unicode61 tokenizer implementation.
*/









/*
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
** from the sqlite3 source file utf.c. If this file is compiled as part
** of the amalgamation, they are not required.
*/
#ifndef SQLITE_AMALGAMATION

Changes to ext/fts5/fts5_unicode2.c.

323
324
325
326
327
328
329

330
331
332
333

334
335
336
337
338
339
340
341
342
343
344
345
346

347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){

    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;


    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( iRes<0 || c>=aEntry[iRes].iCode );

    if( iRes>=0 ){

      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }

    if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}
#endif /* defined(SQLITE_ENABLE_FTS5) */







>




>










<

<
>
|
|
|
|
<












323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345

346

347
348
349
350
351

352
353
354
355
356
357
358
359
360
361
362
363

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    const struct TableEntry *p;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    assert( c>aEntry[0].iCode );
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }



    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );

    }

    if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}
#endif /* defined(SQLITE_ENABLE_FTS5) */

Added ext/fts5/test/fts5unicode3.test.





















































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# 2014 Dec 20
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the fts5 tokenizers
#

proc fts3_unicode_path {file} {
  file join [file dirname [info script]] .. .. fts3 unicode $file
}

source [file join [file dirname [info script]] fts5_common.tcl]
source [fts3_unicode_path parseunicode.tcl]
set testprefix fts5unicode3

set CF [fts3_unicode_path CaseFolding.txt]
set UD [fts3_unicode_path UnicodeData.txt]

tl_load_casefolding_txt $CF
foreach x [an_load_unicodedata_text $UD] {
  set aNotAlnum($x) 1
}

foreach {y} [rd_load_unicodedata_text $UD] {
  foreach {code ascii} $y {}
  if {$ascii==""} {
    set int 0
  } else {
    binary scan $ascii c int
  }
  set aDiacritic($code) $int
}

proc tcl_fold {i {bRemoveDiacritic 0}} {
  global tl_lookup_table
  global aDiacritic

  if {[info exists tl_lookup_table($i)]} {
    set i $tl_lookup_table($i)
  }
  if {$bRemoveDiacritic && [info exists aDiacritic($i)]} {
    set i $aDiacritic($i)
  }
  expr $i
}
db func tcl_fold tcl_fold

proc tcl_isalnum {i} {
  global aNotAlnum
  expr {![info exists aNotAlnum($i)]}
}
db func tcl_isalnum tcl_isalnum


do_catchsql_test 1.0.1 {
  SELECT fts5_isalnum(1, 2, 3);
} {1 {wrong number of arguments to function fts5_isalnum}}
do_catchsql_test 1.0.2 {
  SELECT fts5_fold();
} {1 {wrong number of arguments to function fts5_fold}}
do_catchsql_test 1.0.3 {
  SELECT fts5_fold(1,2,3);
} {1 {wrong number of arguments to function fts5_fold}}

do_execsql_test 1.1 {
  WITH ii(i) AS (
    SELECT -1
    UNION ALL
    SELECT i+1 FROM ii WHERE i<100000
  )
  SELECT count(*), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int);
} {0 {}}

do_execsql_test 1.2 {
  WITH ii(i) AS (
    SELECT -1
    UNION ALL
    SELECT i+1 FROM ii WHERE i<100000
  )
  SELECT count(*), min(i) FROM ii 
  WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int);
} {0 {}}

do_execsql_test 1.3 {
  WITH ii(i) AS (
    SELECT -1
    UNION ALL
    SELECT i+1 FROM ii WHERE i<100000
  )
  SELECT count(*), min(i) FROM ii 
  WHERE fts5_isalnum(i)!=CAST(tcl_isalnum(i) AS int);
} {0 {}}

do_test 1.4 {
  set str {CREATE VIRTUAL TABLE f3 USING fts5(a, tokenize=}
  append str {"unicode61 separators '}
  for {set i 700} {$i<900} {incr i} {
    append str [format %c $i]
  }
  append str {'");}
  execsql $str
} {}
do_test 1.5 {
  set str {CREATE VIRTUAL TABLE f5 USING fts5(a, tokenize=}
  append str {"unicode61 tokenchars '}
  for {set i 700} {$i<900} {incr i} {
    append str [format %c $i]
  }
  append str {'");}
  execsql $str
} {}


finish_test