1 files changed, 471 insertions, 0 deletions
diff --git a/runtime/tools/unicode.vim b/runtime/tools/unicode.vim
new file mode 100644
index 0000000..630a581
--- /dev/null
+++ b/runtime/tools/unicode.vim
@@ -0,0 +1,471 @@
+" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
+" The format of the UnicodeData.txt file is explained here:
+" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
+" For the other files see the header.
+"
+" Might need to update the URL to the emoji-data.txt
+" Usage: Vim -S <this-file>
+"
+" Author: Bram Moolenaar
+" Last Update: 2020 Aug 24
+
+" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
+func! ParseDataToProps()
+  let s:dataprops = []
+  let lnum = 1
+  while lnum <= line('$')
+    let l = split(getline(lnum), '\s*;\s*', 1)
+    if len(l) != 15
+      echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 15'
+      return
+    endif
+    call add(s:dataprops, l)
+    let lnum += 1
+  endwhile
+endfunc
+
+" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
+func! ParseFoldProps()
+  let s:foldprops = []
+  let lnum = 1
+  while lnum <= line('$')
+    let line = getline(lnum)
+    if line !~ '^#' && line !~ '^\s*$'
+      let l = split(line, '\s*;\s*', 1)
+      if len(l) != 4
+        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 4'
+        return
+      endif
+      call add(s:foldprops, l)
+    endif
+    let lnum += 1
+  endwhile
+endfunc
+
+" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
+func! ParseWidthProps()
+  let s:widthprops = []
+  let lnum = 1
+  while lnum <= line('$')
+    let line = getline(lnum)
+    if line !~ '^#' && line !~ '^\s*$'
+      let l = split(line, '\s*;\s*', 1)
+      if len(l) != 2
+        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 2'
+        return
+      endif
+      call add(s:widthprops, l)
+    endif
+    let lnum += 1
+  endwhile
+endfunc
+
+" Build the toLower or toUpper table in a new buffer.
+" Uses s:dataprops.
+func! BuildCaseTable(name, index)
+  let start = -1
+  let end = -1
+  let step = 0
+  let add = -1
+  let ranges = []
+  for p in s:dataprops
+    if p[a:index] != ''
+      let n = ('0x' . p[0]) + 0
+      let nl = ('0x' . p[a:index]) + 0
+      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
+        " continue with same range.
+        let step = n - end
+        let end = n
+      else
+        if start >= 0
+          " produce previous range
+          call Range(ranges, start, end, step, add)
+        endif
+        let start = n
+        let end = n
+        let step = 0
+        let add = nl - n
+      endif
+    endif
+  endfor
+  if start >= 0
+    call Range(ranges, start, end, step, add)
+  endif
+
+  " New buffer to put the result in.
+  new
+  exe "file to" . a:name
+  call setline(1, "static convertStruct to" . a:name . "[] =")
+  call setline(2, "{")
+  call append('$', ranges)
+  call setline('$', getline('$')[:-2])  " remove last comma
+  call setline(line('$') + 1, "};")
+  wincmd p
+endfunc
+
+" Build the foldCase table in a new buffer.
+" Uses s:foldprops.
+func! BuildFoldTable()
+  let start = -1
+  let end = -1
+  let step = 0
+  let add = -1
+  let ranges = []
+  for p in s:foldprops
+    if p[1] == 'C' || p[1] == 'S'
+      let n = ('0x' . p[0]) + 0
+      let nl = ('0x' . p[2]) + 0
+      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
+        " continue with same range.
+        let step = n - end
+        let end = n
+      else
+        if start >= 0
+          " produce previous range
+          call Range(ranges, start, end, step, add)
+        endif
+        let start = n
+        let end = n
+        let step = 0
+        let add = nl - n
+      endif
+    endif
+  endfor
+  if start >= 0
+    call Range(ranges, start, end, step, add)
+  endif
+
+  " New buffer to put the result in.
+  new
+  file foldCase
+  call setline(1, "static convertStruct foldCase[] =")
+  call setline(2, "{")
+  call append('$', ranges)
+  call setline('$', getline('$')[:-2])  " remove last comma
+  call setline(line('$') + 1, "};")
+  wincmd p
+endfunc
+
+func! Range(ranges, start, end, step, add)
+  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
+  call add(a:ranges, s)
+endfunc
+
+" Build the combining table.
+" Uses s:dataprops.
+func! BuildCombiningTable()
+  let start = -1
+  let end = -1
+  let ranges = []
+  for p in s:dataprops
+    " The 'Mc' property was removed, it does take up space.
+    if p[2] == 'Mn' || p[2] == 'Me'
+      let n = ('0x' . p[0]) + 0
+      if start >= 0 && end + 1 == n
+        " continue with same range.
+        let end = n
+      else
+        if start >= 0
+          " produce previous range
+          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
+        endif
+        let start = n
+        let end = n
+      endif
+    endif
+  endfor
+  if start >= 0
+    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
+  endif
+
+  " New buffer to put the result in.
+  new
+  file combining
+  call setline(1, "    static struct interval combining[] =")
+  call setline(2, "    {")
+  call append('$', ranges)
+  call setline('$', getline('$')[:-2])  " remove last comma
+  call setline(line('$') + 1, "    };")
+  wincmd p
+endfunc
+
+" Build the double width or ambiguous width table in a new buffer.
+" Uses s:widthprops and s:dataprops.
+func! BuildWidthTable(pattern, tableName)
+  let start = -1
+  let end = -1
+  let ranges = []
+  let dataidx = 0
+  " Account for indentation differences between ambiguous and doublewidth
+  " table in mbyte.c
+  if a:pattern == 'A'
+    let spc = '    '
+  else
+    let spc = "\t"
+  endif
+  for p in s:widthprops
+    if p[1][0] =~ a:pattern
+      if p[0] =~ '\.\.'
+        " It is a range.  we don't check for composing char then.
+        let rng = split(p[0], '\.\.')
+        if len(rng) != 2
+          echoerr "Cannot parse range: '" . p[0] . "' in width table"
+        endif
+        let n = ('0x' . rng[0]) + 0
+        let n_last =  ('0x' . rng[1]) + 0
+      else
+        let n = ('0x' . p[0]) + 0
+        let n_last = n
+      endif
+      " Find this char in the data table.
+      while 1
+        let dn = ('0x' . s:dataprops[dataidx][0]) + 0
+        if dn >= n
+          break
+        endif
+        let dataidx += 1
+      endwhile
+      if dn != n && n_last == n
+        echoerr "Cannot find character " . n . " in data table"
+      endif
+      " Only use the char when it's not a composing char.
+      " But use all chars from a range.
+      let dp = s:dataprops[dataidx]
+      if n_last > n || (dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me')
+        if start >= 0 && end + 1 == n
+          " continue with same range.
+        else
+          if start >= 0
+            " produce previous range
+            call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
+	    if a:pattern == 'A'
+	      call add(s:ambitable, [start, end])
+	    else
+	      call add(s:doubletable, [start, end])
+	    endif
+          endif
+          let start = n
+        endif
+        let end = n_last
+      endif
+    endif
+  endfor
+  if start >= 0
+    call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
+    if a:pattern == 'A'
+      call add(s:ambitable, [start, end])
+    else
+      call add(s:doubletable, [start, end])
+    endif
+  endif
+
+  " New buffer to put the result in.
+  new
+  exe "file " . a:tableName
+  if a:pattern == 'A'
+    call setline(1, "static struct interval " . a:tableName . "[] =")
+    call setline(2, "{")
+  else
+    call setline(1, "    static struct interval " . a:tableName . "[] =")
+    call setline(2, "    {")
+  endif
+  call append('$', ranges)
+  call setline('$', getline('$')[:-2])  " remove last comma
+  if a:pattern == 'A'
+    call setline(line('$') + 1, "};")
+  else
+    call setline(line('$') + 1, "    };")
+  endif
+  wincmd p
+endfunc
+
+
+" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
+" and put them in dictionary "chardict"
+func AddLinesToCharDict(lines, chardict)
+  for line in a:lines
+    let tokens = split(line, '\.\.')
+    let first = str2nr(tokens[0], 16)
+    if len(tokens) == 1
+      let last = first
+    else
+      let last = str2nr(tokens[1], 16)
+    endif
+    for nr in range(first, last)
+      let a:chardict[nr] = 1
+    endfor
+  endfor
+endfunc
+
+func Test_AddLinesToCharDict()
+  let dict = {}
+  call AddLinesToCharDict([
+	\ '1234 blah blah',
+	\ '1235 blah blah',
+	\ '12a0..12a2 blah blah',
+	\ '12a1 blah blah',
+	\ ], dict)
+  call assert_equal({0x1234: 1, 0x1235: 1,
+	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
+	\ }, dict)
+  if v:errors != []
+    echoerr 'AddLinesToCharDict' v:errors
+    return 1
+  endif
+  return 0
+endfunc
+
+
+func CharDictToPairList(chardict)
+  let result = []
+  let keys = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
+  let low = keys[0]
+  let high = keys[0]
+  for key in keys
+    if key > high + 1
+      call add(result, [low, high])
+      let low = key
+      let high = key
+    else
+      let high = key
+    endif
+  endfor
+  call add(result, [low, high])
+  return result
+endfunc
+
+func Test_CharDictToPairList()
+  let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
+	\ 0x1024: 1,
+	\ 0x2022: 1,
+	\ 0x2024: 1, 0x2025: 1}
+  call assert_equal([
+	\ [0x1020, 0x1022],
+	\ [0x1024, 0x1024],
+	\ [0x2022, 0x2022],
+	\ [0x2024, 0x2025],
+	\ ], CharDictToPairList(dict))
+  if v:errors != []
+    echoerr 'CharDictToPairList' v:errors
+    return 1
+  endif
+  return 0
+endfunc
+
+
+" Build the amoji width table in a new buffer.
+func BuildEmojiTable()
+  " First make the table for all emojis.
+  let pattern = '; Emoji\s\+#\s'
+  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')
+
+  " Make a dictionary with an entry for each character.
+  let chardict = {}
+  call AddLinesToCharDict(lines, chardict)
+  let pairlist = CharDictToPairList(chardict)
+  let allranges = map(pairlist, 'printf("    {0x%04x, 0x%04x},", v:val[0], v:val[1])')
+
+  " New buffer to put the result in.
+  new
+  exe 'file emoji_all'
+  call setline(1, "static struct interval emoji_all[] =")
+  call setline(2, "{")
+  call append('$', allranges)
+  call setline('$', getline('$')[:-2])  " remove last comma
+  call setline(line('$') + 1, "};")
+  wincmd p
+
+  " Make the table for wide emojis.
+  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
+  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')
+
+  " Make a dictionary with an entry for each character.
+  let chardict = {}
+  call AddLinesToCharDict(lines, chardict)
+
+  " exclude characters that are in the "ambiguous" or "doublewidth" table
+  for ambi in s:ambitable
+    for nr in range(ambi[0], ambi[1])
+      if has_key(chardict, nr)
+	call remove(chardict, nr)
+      endif
+    endfor
+  endfor
+
+  for wide in s:doubletable
+    for nr in range(wide[0], wide[1])
+      if has_key(chardict, nr)
+	call remove(chardict, nr)
+      endif
+    endfor
+  endfor
+
+  let pairlist = CharDictToPairList(chardict)
+  let wide_ranges = map(pairlist, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
+
+  " New buffer to put the result in.
+  new
+  exe 'file emoji_wide'
+  call setline(1, "    static struct interval emoji_wide[] =")
+  call setline(2, "    {")
+  call append('$', wide_ranges)
+  call setline('$', getline('$')[:-2])  " remove last comma
+  call setline(line('$') + 1, "    };")
+  wincmd p
+endfunc
+
+" First test a few things
+let v:errors = []
+if Test_AddLinesToCharDict() || Test_CharDictToPairList()
+  finish
+endif
+
+
+" Try to avoid hitting E36
+set equalalways
+
+" Edit the Unicode text file.  Requires the netrw plugin.
+edit http://unicode.org/Public/UNIDATA/UnicodeData.txt
+
+" Parse each line, create a list of lists.
+call ParseDataToProps()
+
+" Build the toLower table.
+call BuildCaseTable("Lower", 13)
+
+" Build the toUpper table.
+call BuildCaseTable("Upper", 12)
+
+" Build the ranges of composing chars.
+call BuildCombiningTable()
+
+" Edit the case folding text file.  Requires the netrw plugin.
+edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
+
+" Parse each line, create a list of lists.
+call ParseFoldProps()
+
+" Build the foldCase table.
+call BuildFoldTable()
+
+" Edit the width text file.  Requires the netrw plugin.
+edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
+
+" Parse each line, create a list of lists.
+call ParseWidthProps()
+
+" Build the double width table.
+let s:doubletable = []
+call BuildWidthTable('[WF]', 'doublewidth')
+
+" Build the ambiguous width table.
+let s:ambitable = []
+call BuildWidthTable('A', 'ambiguous')
+
+" Edit the emoji text file.  Requires the netrw plugin.
+" commented out, because it drops too many characters
+"edit https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
+"
+"" Build the emoji table. Ver. 1.0 - 6.0
+"" Must come after the "ambiguous" and "doublewidth" tables
+"call BuildEmojiTable()