1 files changed, 205 insertions, 0 deletions
diff --git a/ext/fts3/unicode/parseunicode.tcl b/ext/fts3/unicode/parseunicode.tcl
new file mode 100644
index 0000000..7c246a4
--- /dev/null
+++ b/ext/fts3/unicode/parseunicode.tcl
@@ -0,0 +1,205 @@
+
+#--------------------------------------------------------------------------
+# Parameter $zName must be a path to the file UnicodeData.txt. This command
+# reads the file and returns a list of mappings required to remove all
+# diacritical marks from a unicode string. Each mapping is itself a list
+# consisting of two elements - the unicode codepoint and the single ASCII
+# character that it should be replaced with, or an empty string if the 
+# codepoint should simply be removed from the input. Examples:
+#
+#   { 224 a  0 }     (replace codepoint 224 to "a")
+#   { 769 "" 0 }     (remove codepoint 769 from input)
+#
+# Mappings are only returned for non-upper case codepoints. It is assumed
+# that the input has already been folded to lower case.
+#
+# The third value in the list is always either 0 or 1. 0 if the 
+# UnicodeData.txt file maps the codepoint to a single ASCII character and
+# a diacritic, or 1 if the mapping is indirect. For example, consider the 
+# two entries:
+#
+# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
+# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
+#
+# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a 
+# diacritic). The second is an indirect mapping, as it maps to the
+# first codepoint plus 0302 (a diacritic).
+#
+proc rd_load_unicodedata_text {zName} {
+  global tl_lookup_table
+
+  set fd [open $zName]
+  set lField {
+    code
+    character_name
+    general_category
+    canonical_combining_classes
+    bidirectional_category
+    character_decomposition_mapping
+    decimal_digit_value
+    digit_value
+    numeric_value
+    mirrored
+    unicode_1_name
+    iso10646_comment_field
+    uppercase_mapping
+    lowercase_mapping
+    titlecase_mapping
+  }
+  set lRet [list]
+
+  while { ![eof $fd] } {
+    set line [gets $fd]
+    if {$line == ""} continue
+
+    set fields [split $line ";"]
+    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
+    foreach $lField $fields {}
+    if { [llength $character_decomposition_mapping]!=2
+      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
+    } {
+      continue
+    }
+
+    set iCode  [expr "0x$code"]
+    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
+    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
+
+    # Filter out upper-case characters, as they will be mapped to their
+    # lower-case equivalents before this data is used.
+    if {[info exists tl_lookup_table($iCode)]} continue
+
+    # Check if this is an indirect mapping. If so, set bIndirect to true
+    # and change $iAscii to the indirectly mappped ASCII character.
+    set bIndirect 0
+    if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
+      set iAscii $mapping($iAscii)
+      set bIndirect 1
+    }
+
+    if { ($iAscii >= 97 && $iAscii <= 122)
+      || ($iAscii >= 65 && $iAscii <= 90)
+    } {
+      lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
+      set mapping($iCode) $iAscii
+      set dia($iDia) 1
+    }
+  }
+
+  foreach d [array names dia] {
+    lappend lRet [list $d "" 0]
+  }
+  set lRet [lsort -integer -index 0 $lRet]
+
+  close $fd
+  set lRet
+}
+
+#-------------------------------------------------------------------------
+# Parameter $zName must be a path to the file UnicodeData.txt. This command
+# reads the file and returns a list of codepoints (integers). The list
+# contains all codepoints in the UnicodeData.txt assigned to any "General
+# Category" that is not a "Letter" or "Number".
+#
+proc an_load_unicodedata_text {zName} {
+  set fd [open $zName]
+  set lField {
+    code
+    character_name
+    general_category
+    canonical_combining_classes
+    bidirectional_category
+    character_decomposition_mapping
+    decimal_digit_value
+    digit_value
+    numeric_value
+    mirrored
+    unicode_1_name
+    iso10646_comment_field
+    uppercase_mapping
+    lowercase_mapping
+    titlecase_mapping
+  }
+  set lRet [list]
+
+  while { ![eof $fd] } {
+    set line [gets $fd]
+    if {$line == ""} continue
+
+    set fields [split $line ";"]
+    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
+    foreach $lField $fields {}
+
+    set iCode [expr "0x$code"]
+    set bAlnum [expr {
+         [lsearch {L N} [string range $general_category 0 0]] >= 0
+      || $general_category=="Co"
+    }]
+
+    if { !$bAlnum } { lappend lRet $iCode }
+  }
+
+  close $fd
+  set lRet
+}
+
+proc tl_load_casefolding_txt {zName} {
+  global tl_lookup_table
+
+  set fd [open $zName]
+  while { ![eof $fd] } {
+    set line [gets $fd]
+    if {[string range $line 0 0] == "#"} continue
+    if {$line == ""} continue
+
+    foreach x {a b c d} {unset -nocomplain $x}
+    foreach {a b c d} [split $line ";"] {}
+
+    set a2 [list]
+    set c2 [list]
+    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
+    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
+    set b [string trim $b]
+    set d [string trim $d]
+
+    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
+  }
+}
+
+proc cc_load_unicodedata_text {zName} {
+  set fd [open $zName]
+  set lField {
+    code
+    character_name
+    general_category
+    canonical_combining_classes
+    bidirectional_category
+    character_decomposition_mapping
+    decimal_digit_value
+    digit_value
+    numeric_value
+    mirrored
+    unicode_1_name
+    iso10646_comment_field
+    uppercase_mapping
+    lowercase_mapping
+    titlecase_mapping
+  }
+  set lRet [list]
+
+  while { ![eof $fd] } {
+    set line [gets $fd]
+    if {$line == ""} continue
+
+    set fields [split $line ";"]
+    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
+    foreach $lField $fields {}
+
+    lappend lRet [list $code $general_category]
+  }
+
+  close $fd
+  set lRet
+}
+
+