diff options
Diffstat (limited to '')
-rw-r--r-- | runtime/tools/unicode.vim | 471 |
1 files changed, 471 insertions, 0 deletions
diff --git a/runtime/tools/unicode.vim b/runtime/tools/unicode.vim new file mode 100644 index 0000000..630a581 --- /dev/null +++ b/runtime/tools/unicode.vim @@ -0,0 +1,471 @@
" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2020 Aug 24

" Split every line of the current buffer on ';' (surrounding white space is
" dropped) and append each resulting list of fields to the list "props".
"   props          list that receives one list of fields per parsed line
"   expected       number of fields every parsed line must have
"   skip_comments  when non-zero, lines starting with '#' and blank lines
"                  are ignored
" Parsing stops at the first line with an unexpected number of fields, after
" reporting it.  The lines parsed before it stay in "props".
func! s:ParseLines(props, expected, skip_comments)
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if a:skip_comments && (text =~# '^#' || text =~# '^\s*$')
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != a:expected
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected ' . a:expected
      return
    endif
    call add(a:props, fields)
  endfor
endfunc

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Every line of the current buffer must have 15 fields.
func! ParseDataToProps()
  let s:dataprops = []
  call s:ParseLines(s:dataprops, 15, 0)
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Comment and blank lines are skipped, other lines must have 4 fields.
func! ParseFoldProps()
  let s:foldprops = []
  call s:ParseLines(s:foldprops, 4, 1)
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Comment and blank lines are skipped, other lines must have 2 fields.
func! ParseWidthProps()
  let s:widthprops = []
  call s:ParseLines(s:widthprops, 2, 1)
endfunc

" Turn a list of [code, mapped-code] number pairs into convertStruct lines.
" Consecutive pairs are merged into one entry while the offset to the mapped
" code stays the same and the distance between the codes stays the same.
" Returns the list of formatted lines produced by Range().
func! s:CaseRanges(pairs)
  let ranges = []
  let first = -1
  let last = -1
  let step = 0
  let delta = -1
  for [n, nl] in a:pairs
    if first >= 0 && delta == nl - n && (step == 0 || n - last == step)
      " continue with same range.
      let step = n - last
      let last = n
    else
      if first >= 0
        " produce previous range
        call Range(ranges, first, last, step, delta)
      endif
      let first = n
      let last = n
      let step = 0
      let delta = nl - n
    endif
  endfor
  if first >= 0
    call Range(ranges, first, last, step, delta)
  endif
  return ranges
endfunc

" Put a table in a new buffer named "bufname" and go back to the previous
" window.
"   indent  text put before the declaration line and the braces
"   decl    declaration text; " =" is appended to it
"   ranges  list of table rows, each ending in a comma
func! s:WriteTable(bufname, indent, decl, ranges)
  new
  exe 'file ' . a:bufname
  call setline(1, a:indent . a:decl . ' =')
  call setline(2, a:indent . '{')
  call append('$', a:ranges)
  if !empty(a:ranges)
    " remove last comma; with no rows the last line is the opening brace,
    " which must be left alone
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, a:indent . '};')
  wincmd p
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   name   suffix of the table name, the table is called "to{name}"
"   index  index of the UnicodeData.txt field that holds the mapped character
func! BuildCaseTable(name, index)
  let pairs = []
  for p in s:dataprops
    if p[a:index] !=# ''
      call add(pairs, [str2nr(p[0], 16), str2nr(p[a:index], 16)])
    endif
  endfor
  call s:WriteTable('to' . a:name, '', 'static convertStruct to' . a:name . '[]', s:CaseRanges(pairs))
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.  Only entries with status 'C' or 'S' are used.
func! BuildFoldTable()
  let pairs = []
  for p in s:foldprops
    if p[1] ==# 'C' || p[1] ==# 'S'
      call add(pairs, [str2nr(p[0], 16), str2nr(p[2], 16)])
    endif
  endfor
  call s:WriteTable('foldCase', '', 'static convertStruct foldCase[]', s:CaseRanges(pairs))
endfunc

" Append one convertStruct row for the range "start" to "end" to the list
" "ranges".  A "step" of zero (a range with a single character) is written
" as -1.  "add" is the offset to the mapped character.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table.
" Uses s:dataprops.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    " The 'Mc' property was removed, it does take up space.
    if p[2] ==# 'Mn' || p[2] ==# 'Me'
      let n = str2nr(p[0], 16)
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  call s:WriteTable('combining', ' ', 'static struct interval combining[]', ranges)
endfunc

" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
"   pattern    pattern matched against the first character of the width
"              property; 'A' selects the ambiguous table
"   tableName  name of the table and of the buffer it is put in
" The intervals found are also appended to s:ambitable (when "pattern" is
" 'A') or to s:doubletable (otherwise); the caller must have created that
" list.
func! BuildWidthTable(pattern, tableName)
  let is_ambiguous = a:pattern ==# 'A'
  " Account for indentation differences between ambiguous and doublewidth
  " table in mbyte.c
  if is_ambiguous
    let spc = ' '
    let indent = ''
  else
    let spc = "\t"
    let indent = ' '
  endif

  let start = -1
  let end = -1
  let intervals = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] !~# a:pattern
      continue
    endif
    if p[0] =~# '\.\.'
      " It is a range.  we don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Skip the entry, its end points cannot be used.
        continue
      endif
      let n = str2nr(rng[0], 16)
      let n_last = str2nr(rng[1], 16)
    else
      let n = str2nr(p[0], 16)
      let n_last = n
    endif

    " Find this char in the data table.  Stop at the end of the table, going
    " past it would never terminate.
    let dn = -1
    while dataidx < datalen
      let dn = str2nr(s:dataprops[dataidx][0], 16)
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile
    if dn != n && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif

    let is_range = n_last > n
    if !is_range && dataidx >= datalen
      " There is no data table entry left to look at for this character.
      continue
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    if is_range || index(['Mn', 'Mc', 'Me'], s:dataprops[dataidx][2]) < 0
      if start < 0 || end + 1 != n
        if start >= 0
          " produce previous range
          call add(intervals, [start, end])
        endif
        let start = n
      endif
      let end = n_last
    endif
  endfor
  if start >= 0
    call add(intervals, [start, end])
  endif

  let ranges = []
  for [lo, hi] in intervals
    call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, lo, hi))
  endfor
  if !empty(intervals)
    call extend(is_ambiguous ? s:ambitable : s:doubletable, intervals)
  endif

  call s:WriteTable(a:tableName, indent, 'static struct interval ' . a:tableName . '[]', ranges)
endfunc


" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
" and put them in dictionary "chardict"
"   lines     list of strings; each starts with a hex character number or
"             with two hex numbers separated by ".."
"   chardict  dictionary that gets an entry with value 1 for every character
"             number in the given ranges
" Empty lines are skipped.
func! AddLinesToCharDict(lines, chardict)
  for line in a:lines
    let tokens = split(line, '\.\.')
    if empty(tokens)
      " Nothing to parse, e.g. an empty line.
      continue
    endif
    let first = str2nr(tokens[0], 16)
    if len(tokens) == 1
      let last = first
    else
      let last = str2nr(tokens[1], 16)
    endif
    for nr in range(first, last)
      let a:chardict[nr] = 1
    endfor
  endfor
endfunc

" Self test for AddLinesToCharDict().  Returns 1 when it fails, 0 otherwise.
func! Test_AddLinesToCharDict()
  let dict = {}
  call AddLinesToCharDict([
        \ '1234 blah blah',
        \ '1235 blah blah',
        \ '12a0..12a2 blah blah',
        \ '12a1 blah blah',
        \ ], dict)
  call assert_equal({0x1234: 1, 0x1235: 1,
        \ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
        \ }, dict)
  if !empty(v:errors)
    echoerr 'AddLinesToCharDict' v:errors
    return 1
  endif
  return 0
endfunc


" Turn the keys of "chardict" into a sorted list of [low, high] pairs, where
" each pair covers a run of consecutive character numbers.
" Returns an empty list when "chardict" is empty.
func! CharDictToPairList(chardict)
  let result = []
  if empty(a:chardict)
    return result
  endif
  let codes = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  let low = codes[0]
  let high = codes[0]
  for code in codes
    if code > high + 1
      call add(result, [low, high])
      let low = code
      let high = code
    else
      let high = code
    endif
  endfor
  call add(result, [low, high])
  return result
endfunc

" Self test for CharDictToPairList().  Returns 1 when it fails, 0 otherwise.
func! Test_CharDictToPairList()
  let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
        \ 0x1024: 1,
        \ 0x2022: 1,
        \ 0x2024: 1, 0x2025: 1}
  call assert_equal([
        \ [0x1020, 0x1022],
        \ [0x1024, 0x1024],
        \ [0x2022, 0x2022],
        \ [0x2024, 0x2025],
        \ ], CharDictToPairList(dict))
  call assert_equal([], CharDictToPairList({}))
  if !empty(v:errors)
    echoerr 'CharDictToPairList' v:errors
    return 1
  endif
  return 0
endfunc


" Build the emoji width tables in new buffers, from the emoji data in the
" current buffer: "emoji_all" with all emojis and "emoji_wide" with the wide
" ones.
" Uses s:ambitable and s:doubletable, thus must be called after the
" "ambiguous" and "doublewidth" tables were built.
func! BuildEmojiTable()
  " First make the table for all emojis.
  let pattern = '; Emoji\s\+#\s'
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)
  let pairlist = CharDictToPairList(chardict)
  let allranges = map(pairlist, 'printf(" {0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_all'
  call setline(1, "static struct interval emoji_all[] =")
  call setline(2, "{")
  call append('$', allranges)
  if !empty(allranges)
    " remove last comma; without rows the last line is the opening brace
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p

  " Make the table for wide emojis.
  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for [low, high] in s:ambitable + s:doubletable
    for nr in range(low, high)
      if has_key(chardict, nr)
        call remove(chardict, nr)
      endif
    endfor
  endfor

  let pairlist = CharDictToPairList(chardict)
  let wide_ranges = map(pairlist, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_wide'
  call setline(1, " static struct interval emoji_wide[] =")
  call setline(2, " {")
  call append('$', wide_ranges)
  if !empty(wide_ranges)
    " remove last comma; without rows the last line is the opening brace
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc

" First test a few things
let v:errors = []
if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  finish
endif


" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table in a buffer named "toLower".
" 13 is the index of the UnicodeData.txt field used as the mapped character
" (presumably the lowercase mapping - confirm against the UCD documentation).
call BuildCaseTable("Lower", 13)

" Build the toUpper table in a buffer named "toUpper".
" 12 is the index of the UnicodeData.txt field used as the mapped character
" (presumably the uppercase mapping - confirm against the UCD documentation).
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
" The list must exist before the call, BuildWidthTable() appends the
" intervals it finds to it.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
" The list must exist before the call, BuildWidthTable() appends the
" intervals it finds to it.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
" commented out, because it drops too many characters
"edit https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
"
"" Build the emoji table. Ver. 1.0 - 6.0
"" Must come after the "ambiguous" and "doublewidth" tables
"call BuildEmojiTable()