diff options
Diffstat (limited to 'ext/fts3/unicode/parseunicode.tcl')
-rw-r--r-- | ext/fts3/unicode/parseunicode.tcl | 205 |
1 files changed, 205 insertions, 0 deletions
diff --git a/ext/fts3/unicode/parseunicode.tcl b/ext/fts3/unicode/parseunicode.tcl new file mode 100644 index 0000000..7c246a4 --- /dev/null +++ b/ext/fts3/unicode/parseunicode.tcl @@ -0,0 +1,205 @@ + +#-------------------------------------------------------------------------- +# Parameter $zName must be a path to the file UnicodeData.txt. This command +# reads the file and returns a list of mappings required to remove all +# diacritical marks from a unicode string. Each mapping is itself a list +# consisting of two elements - the unicode codepoint and the single ASCII +# character that it should be replaced with, or an empty string if the +# codepoint should simply be removed from the input. Examples: +# +# { 224 a 0 } (replace codepoint 224 to "a") +# { 769 "" 0 } (remove codepoint 769 from input) +# +# Mappings are only returned for non-upper case codepoints. It is assumed +# that the input has already been folded to lower case. +# +# The third value in the list is always either 0 or 1. 0 if the +# UnicodeData.txt file maps the codepoint to a single ASCII character and +# a diacritic, or 1 if the mapping is indirect. For example, consider the +# two entries: +# +# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC +# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8 +# +# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a +# diacritic). The second is an indirect mapping, as it maps to the +# first codepoint plus 0302 (a diacritic). +# +proc rd_load_unicodedata_text {zName} { + global tl_lookup_table + + set fd [open $zName] + set lField { + code + character_name + general_category + canonical_combining_classes + bidirectional_category + character_decomposition_mapping + decimal_digit_value + digit_value + numeric_value + mirrored + unicode_1_name + iso10646_comment_field + uppercase_mapping + lowercase_mapping + titlecase_mapping + } + set lRet [list] + + while { ![eof $fd] } { + set line [gets $fd] + if {$line == ""} continue + + set fields [split $line ";"] + if {[llength $fields] != [llength $lField]} { error "parse error: $line" } + foreach $lField $fields {} + if { [llength $character_decomposition_mapping]!=2 + || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 + } { + continue + } + + set iCode [expr "0x$code"] + set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] + set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] + + # Filter out upper-case characters, as they will be mapped to their + # lower-case equivalents before this data is used. + if {[info exists tl_lookup_table($iCode)]} continue + + # Check if this is an indirect mapping. If so, set bIndirect to true + # and change $iAscii to the indirectly mappped ASCII character. + set bIndirect 0 + if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} { + set iAscii $mapping($iAscii) + set bIndirect 1 + } + + if { ($iAscii >= 97 && $iAscii <= 122) + || ($iAscii >= 65 && $iAscii <= 90) + } { + lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect] + set mapping($iCode) $iAscii + set dia($iDia) 1 + } + } + + foreach d [array names dia] { + lappend lRet [list $d "" 0] + } + set lRet [lsort -integer -index 0 $lRet] + + close $fd + set lRet +} + +#------------------------------------------------------------------------- +# Parameter $zName must be a path to the file UnicodeData.txt. This command +# reads the file and returns a list of codepoints (integers). The list +# contains all codepoints in the UnicodeData.txt assigned to any "General +# Category" that is not a "Letter" or "Number". +# +proc an_load_unicodedata_text {zName} { + set fd [open $zName] + set lField { + code + character_name + general_category + canonical_combining_classes + bidirectional_category + character_decomposition_mapping + decimal_digit_value + digit_value + numeric_value + mirrored + unicode_1_name + iso10646_comment_field + uppercase_mapping + lowercase_mapping + titlecase_mapping + } + set lRet [list] + + while { ![eof $fd] } { + set line [gets $fd] + if {$line == ""} continue + + set fields [split $line ";"] + if {[llength $fields] != [llength $lField]} { error "parse error: $line" } + foreach $lField $fields {} + + set iCode [expr "0x$code"] + set bAlnum [expr { + [lsearch {L N} [string range $general_category 0 0]] >= 0 + || $general_category=="Co" + }] + + if { !$bAlnum } { lappend lRet $iCode } + } + + close $fd + set lRet +} + +proc tl_load_casefolding_txt {zName} { + global tl_lookup_table + + set fd [open $zName] + while { ![eof $fd] } { + set line [gets $fd] + if {[string range $line 0 0] == "#"} continue + if {$line == ""} continue + + foreach x {a b c d} {unset -nocomplain $x} + foreach {a b c d} [split $line ";"] {} + + set a2 [list] + set c2 [list] + foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } + foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } + set b [string trim $b] + set d [string trim $d] + + if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } + } +} + +proc cc_load_unicodedata_text {zName} { + set fd [open $zName] + set lField { + code + character_name + general_category + canonical_combining_classes + bidirectional_category + character_decomposition_mapping + decimal_digit_value + digit_value + numeric_value + mirrored + unicode_1_name + iso10646_comment_field + uppercase_mapping + lowercase_mapping + titlecase_mapping + } + set lRet [list] + + while { ![eof $fd] } { + set line [gets $fd] + if {$line == ""} continue + + set fields [split $line ";"] + if {[llength $fields] != [llength $lField]} { error "parse error: $line" } + foreach $lField $fields {} + + lappend lRet [list $code $general_category] + } + + close $fd + set lRet +} + + |