diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 02:44:24 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 02:44:24 +0000 |
commit | 8baab3c8d7a6f22888bd581cd5c6098fd2e4b5a8 (patch) | |
tree | 3537e168b860f2742f6029d70501b5ed7d15d345 /runtime/tools/unicode.vim | |
parent | Initial commit. (diff) | |
download | vim-upstream.tar.xz vim-upstream.zip |
Adding upstream version 2:8.1.0875. upstream/2%8.1.0875 upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | runtime/tools/unicode.vim | 391 |
1 files changed, 391 insertions, 0 deletions
" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2010 Jan 12
"
" Overview: the Parse* functions read the lines of the current buffer into a
" script-local list of lists.  The Build* functions turn such a list into the
" lines of a C table, put them in a new named buffer and go back to the
" previous window.  The commands at the end of the file drive all of this.
"
" Property codes are compared with the case sensitive operators (==#, =~#) or
" with index(), so that the result does not depend on the 'ignorecase' option
" of whoever runs the script.

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Every line of the current buffer must split into exactly 15 fields at the
" ';' separators (empty fields are kept).  On the first line that does not,
" an error is given and parsing stops.
func! ParseDataToProps()
  let s:dataprops = []
  let lnum = 1
  while lnum <= line('$')
    let l = split(getline(lnum), '\s*;\s*', 1)
    if len(l) != 15
      echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, l)
    let lnum += 1
  endwhile
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Lines starting with '#' and blank lines are skipped.  Every other line must
" split into exactly 4 fields, otherwise an error is given and parsing stops.
func! ParseFoldProps()
  let s:foldprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~ '^#' && line !~ '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 4
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 4'
        return
      endif
      call add(s:foldprops, l)
    endif
    let lnum += 1
  endwhile
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Lines starting with '#' and blank lines are skipped.  Every other line must
" split into exactly 2 fields, otherwise an error is given and parsing stops.
func! ParseWidthProps()
  let s:widthprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~ '^#' && line !~ '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 2
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 2'
        return
      endif
      call add(s:widthprops, l)
    endif
    let lnum += 1
  endwhile
endfunc

" Put a finished table in a new buffer and go back to the previous window.
"   bufname  name for the new buffer
"   decl     first line of the buffer: the C declaration of the array
"   indent   text put in front of the "{" and "};" lines
"   ranges   list with the table lines, each one ending in a comma
" The comma of the last table line is removed.  When "ranges" is empty there
" is nothing to remove; stripping anyway would chop the "{" line instead.
" NOTE(review): the callers for the interval tables pass a one-space indent,
" exactly as found in the text this script was restored from; that text had
" its whitespace squeezed, so confirm the width against src/mbyte.c.
func! s:PutTable(bufname, decl, indent, ranges)
  new
  exe "file " . a:bufname
  call setline(1, a:decl)
  call setline(2, a:indent . "{")
  call append('$', a:ranges)
  if !empty(a:ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, a:indent . "};")
  wincmd p
endfunc

" Return TRUE when "category" (field 2 of a UnicodeData.txt line) is one of
" the categories this script treats as composing: Mn, Mc or Me.
" index() matches case by default.
func! s:IsComposing(category)
  return index(['Mn', 'Mc', 'Me'], a:category) >= 0
endfunc

" Turn a list of [char, converted-char] number pairs into convertStruct lines.
" Consecutive pairs are merged into one range as long as the offset
" (converted-char - char) stays the same and the distance between the chars
" equals the step of the range.  The step is fixed by the second char that
" joins a range.  Returns the list of lines produced by Range().
func! s:ConvertRanges(pairs)
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for [n, nl] in a:pairs
    if start >= 0 && add == nl - n && (step == 0 || n - end == step)
      " continue with same range.
      let step = n - end
      let end = n
    else
      if start >= 0
        " produce previous range
        call Range(ranges, start, end, step, add)
      endif
      let start = n
      let end = n
      let step = 0
      let add = nl - n
    endif
  endfor
  if start >= 0
    " produce the range that was still open
    call Range(ranges, start, end, step, add)
  endif
  return ranges
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   name   "Lower" or "Upper"; the table and the buffer are called "to{name}"
"   index  index of the UnicodeData.txt field that holds the converted char;
"          lines where this field is empty are skipped
func! BuildCaseTable(name, index)
  let pairs = []
  for p in s:dataprops
    if p[a:index] !=# ''
      " The fields are hex numbers without a prefix.
      call add(pairs, [('0x' . p[0]) + 0, ('0x' . p[a:index]) + 0])
    endif
  endfor

  call s:PutTable("to" . a:name, "static convertStruct to" . a:name . "[] =",
	\ "", s:ConvertRanges(pairs))
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.
" Only the entries with status 'C' or 'S' (field 1) are used.
func! BuildFoldTable()
  let pairs = []
  for p in s:foldprops
    if p[1] ==# 'C' || p[1] ==# 'S'
      call add(pairs, [('0x' . p[0]) + 0, ('0x' . p[2]) + 0])
    endif
  endfor

  call s:PutTable("foldCase", "static convertStruct foldCase[] =",
	\ "", s:ConvertRanges(pairs))
endfunc

" Append one convertStruct initializer to the list "ranges".
"   start, end  first and last char of the range
"   step        distance between chars; zero (single char) is written as -1
"   add         offset from a char to its converted char
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table.
" Uses s:dataprops.
" Chars with category Mn, Mc or Me that directly follow each other are put in
" one interval.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if s:IsComposing(p[2])
      let n = ('0x' . p[0]) + 0
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  call s:PutTable("combining", " static struct interval combining[] =",
	\ " ", ranges)
endfunc

" Add the interval [start, end] to the table lines in "ranges" and remember
" it for BuildEmojiTable(): in s:ambitable when "pattern" is 'A', otherwise
" in s:doubletable.
func! s:AddWidthRange(ranges, pattern, start, end)
  call add(a:ranges, printf("\t{0x%04x, 0x%04x},", a:start, a:end))
  if a:pattern ==# 'A'
    call add(s:ambitable, [a:start, a:end])
  else
    call add(s:doubletable, [a:start, a:end])
  endif
endfunc

" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
"   pattern    pattern matched against the first character of the width
"              property; 'A' selects s:ambitable as the list to remember the
"              intervals in, anything else selects s:doubletable
"   tableName  name of the C table and of the new buffer
" The list that is added to must already exist.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  for p in s:widthprops
    if p[1][0] =~# a:pattern
      if p[0] =~ '\.\.'
        " It is a range.  we don't check for composing char then.
        let rng = split(p[0], '\.\.')
        if len(rng) != 2
          echoerr "Cannot parse range: '" . p[0] . "' in width table"
          " Give up, like the Parse functions do; there is no usable range.
          return
        endif
        let n = ('0x' . rng[0]) + 0
        let n_last = ('0x' . rng[1]) + 0
      else
        let n = ('0x' . p[0]) + 0
        let n_last = n
      endif
      " Find this char in the data table.  The index is kept between
      " iterations, the search never goes back.  The loop only stops at an
      " entry that is not below "n", thus it relies on the data table having
      " one.
      while 1
        let dn = ('0x' . s:dataprops[dataidx][0]) + 0
        if dn >= n
          break
        endif
        let dataidx += 1
      endwhile
      if dn != n && n_last == n
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      " But use all chars from a range.
      let dp = s:dataprops[dataidx]
      if n_last > n || !s:IsComposing(dp[2])
        if start < 0 || end + 1 != n
          " Does not continue the open range.
          if start >= 0
            " produce previous range
            call s:AddWidthRange(ranges, a:pattern, start, end)
          endif
          let start = n
        endif
        let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call s:AddWidthRange(ranges, a:pattern, start, end)
  endif

  call s:PutTable(a:tableName,
	\ " static struct interval " . a:tableName . "[] =", " ", ranges)
endfunc

" Build the emoji width table in a new buffer.
" Reads the current buffer.  Uses s:ambitable and s:doubletable.
"   pattern    only lines matching this pattern are used
"   tableName  the buffers and tables are called "{tableName}_all" and
"              "{tableName}_width"
" The "_all" table gets every selected char.  The "_width" table only gets
" the chars from 0x1f000 upwards, minus what the ambiguous and double width
" tables cover.
func! BuildEmojiTable(pattern, tableName)
  let alltokens = []
  let widthtokens = []
  " Use the lines that start with 1-9 and match the pattern, and keep only
  " the leading non-blank text: a char or a "first..last" range.
  let lines = filter(getline(1, '$'), 'v:val=~"^[1-9]"')
  call filter(lines, 'v:val =~# a:pattern')
  call map(lines, 'matchstr(v:val,"^\\S\\+")')
  for line in lines
    let token = split(line, '\.\.')
    let first = ('0x' . token[0]) + 0
    let last = len(token) == 1 ? first : ('0x' . token[1]) + 0

    " Extend the previous interval when this one directly follows it.
    if len(alltokens) > 0 && first - 1 == alltokens[-1][1]
      let alltokens[-1][1] = last
    else
      call add(alltokens, [first, last])
    endif

    " Characters below 1F000 may be considered single width traditionally,
    " making them double width causes problems.
    if first < 0x1f000
      continue
    endif

    " exclude characters that are in the "ambiguous" or "doublewidth" table
    " An end that lies inside such an interval is moved just outside of it;
    " the ambiguous intervals are handled first.
    for known in s:ambitable + s:doubletable
      if first >= known[0] && first <= known[1]
        let first = known[1] + 1
      endif
      if last >= known[0] && last <= known[1]
        let last = known[0] - 1
      endif
    endfor

    " Nothing is left when the ends crossed each other.
    if first <= last
      if len(widthtokens) > 0 && first - 1 == widthtokens[-1][1]
        let widthtokens[-1][1] = last
      else
        call add(widthtokens, [first, last])
      endif
    endif
  endfor
  let allranges = map(alltokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  let widthranges = map(widthtokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  call s:PutTable(a:tableName . '_all',
	\ " static struct interval " . a:tableName . "_all[] =", " ", allranges)
  call s:PutTable(a:tableName . '_width',
	\ " static struct interval " . a:tableName . "_width[] =", " ", widthranges)
endfunc

" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
" The list must exist before the call, BuildWidthTable() adds to it.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
edit https://www.unicode.org/Public/emoji/11.0/emoji-data.txt
"edit http://www.unicode.org/Public/emoji/latest/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" table
call BuildEmojiTable('; Emoji\s\+#\s\+\d\+\.\d', 'emoji')