Diffstat (limited to 'fluent-bit/lib/onigmo/tool')
-rw-r--r--  fluent-bit/lib/onigmo/tool/.gitignore           |  18
-rw-r--r--  fluent-bit/lib/onigmo/tool/Makefile              |  48
-rwxr-xr-x  fluent-bit/lib/onigmo/tool/case-folding.rb       | 418
-rwxr-xr-x  fluent-bit/lib/onigmo/tool/convert-jis-props.sh  |  19
-rwxr-xr-x  fluent-bit/lib/onigmo/tool/download-ucd.sh       |  30
-rwxr-xr-x  fluent-bit/lib/onigmo/tool/enc-unicode.rb        | 548
-rwxr-xr-x  fluent-bit/lib/onigmo/tool/update-doc.py         | 145
7 files changed, 1226 insertions, 0 deletions
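The Makefile below ties the tools together: download-ucd.sh fetches the Unicode Character Database files, case-folding.rb and enc-unicode.rb regenerate casefold.h and name2ctype.h (both rely on gperf), convert-jis-props.sh rebuilds enc/jis/props.h, and update-doc.py refreshes doc/UnicodeProps.txt. A minimal sketch of a typical regeneration run, assuming wget, gperf, Ruby and Python are installed; note that RUBY and PYTHON are not set in the Makefile itself, so they are supplied on the command line here:

    cd fluent-bit/lib/onigmo/tool
    make download                          # fetch the 15.0.0 UCD files and emoji-data.txt
    make update RUBY=ruby PYTHON=python3   # regenerate casefold.h, name2ctype.h,
                                           # enc/jis/props.h and doc/UnicodeProps.txt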
diff --git a/fluent-bit/lib/onigmo/tool/.gitignore b/fluent-bit/lib/onigmo/tool/.gitignore new file mode 100644 index 00000000..981fe8fb --- /dev/null +++ b/fluent-bit/lib/onigmo/tool/.gitignore @@ -0,0 +1,18 @@ +# ignore UCD files +Blocks.txt +CaseFolding.txt +DerivedAge.txt +DerivedCoreProperties.txt +PropList.txt +PropertyAliases.txt +PropertyValueAliases.txt +Scripts.txt +SpecialCasing.txt +UnicodeData.txt +GraphemeBreakProperty.txt +emoji-data.txt + +# ignore generated files +casefold.h +name2ctype.h +name2ctype.kwd diff --git a/fluent-bit/lib/onigmo/tool/Makefile b/fluent-bit/lib/onigmo/tool/Makefile new file mode 100644 index 00000000..cca6e732 --- /dev/null +++ b/fluent-bit/lib/onigmo/tool/Makefile @@ -0,0 +1,48 @@ +UNICODE_VERSION = 15.0.0 +EMOJI_VERSION = 15.0.0 + +PROP_FILES = \ + $(UNICODE_VERSION)/Blocks.txt \ + $(UNICODE_VERSION)/DerivedAge.txt \ + $(UNICODE_VERSION)/DerivedCoreProperties.txt \ + $(UNICODE_VERSION)/PropertyAliases.txt \ + $(UNICODE_VERSION)/PropertyValueAliases.txt \ + $(UNICODE_VERSION)/PropList.txt \ + $(UNICODE_VERSION)/Scripts.txt \ + $(UNICODE_VERSION)/UnicodeData.txt \ + $(UNICODE_VERSION)/auxiliary/GraphemeBreakProperty.txt \ + $(UNICODE_VERSION)/emoji-data.txt + +CASEFOLD_FILES = \ + $(UNICODE_VERSION)/CaseFolding.txt \ + $(UNICODE_VERSION)/UnicodeData.txt \ + $(UNICODE_VERSION)/SpecialCasing.txt + +update: update-unicode-header update-jis-header update-doc + +update-unicode-header: casefold.h name2ctype.h + cp casefold.h name2ctype.h ../enc/unicode + +update-jis-header: ../enc/jis/props.kwd + cd .. && ./tool/convert-jis-props.sh enc/jis/props.kwd enc/jis/props.h && cd - + +update-doc: $(PROP_FILES) update-doc.py + $(PYTHON) ./update-doc.py $(UNICODE_VERSION) > ../doc/UnicodeProps.txt + +download: + ./download-ucd.sh $(UNICODE_VERSION) $(EMOJI_VERSION) + + +casefold.h: $(CASEFOLD_FILES) case-folding.rb + $(RUBY) ./case-folding.rb -m $(UNICODE_VERSION) -o casefold.h + +name2ctype.h: $(PROP_FILES) enc-unicode.rb + $(RUBY) ./enc-unicode.rb --header $(UNICODE_VERSION) > name2ctype.h || rm -f name2ctype.h + + +clean: + -rm -f casefold.h name2ctype.kwd name2ctype.h + -rm -f $(PROP_FILES) $(CASEFOLD_FILES) + -rm -f GraphemeBreakProperty.txt + -rmdir $(UNICODE_VERSION)/auxiliary + -rmdir $(UNICODE_VERSION) diff --git a/fluent-bit/lib/onigmo/tool/case-folding.rb b/fluent-bit/lib/onigmo/tool/case-folding.rb new file mode 100755 index 00000000..c299074f --- /dev/null +++ b/fluent-bit/lib/onigmo/tool/case-folding.rb @@ -0,0 +1,418 @@ +#!/usr/bin/ruby +require 'stringio' + +# Usage (for case folding only): +# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt +# $ ruby case-folding.rb CaseFolding.txt -o casefold.h +# or (for case folding and case mapping): +# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt +# $ wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt +# $ wget http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt +# $ ruby case-folding.rb -m . -o casefold.h +# using -d or --debug will include UTF-8 characters in comments for debugging + +class CaseFolding + module Util + module_function + + def hex_seq(v) + v.map { |i| "0x%04x" % i }.join(", ") + end + + def print_table_1(dest, type, mapping_data, data) + for k, v in data = data.sort + sk = (Array === k and k.length > 1) ? 
"{#{hex_seq(k)}}" : ("0x%04x" % k) + if type=='CaseUnfold_11' and v.length>1 + # reorder CaseUnfold_11 entries to avoid special treatment for U+03B9/U+03BC/U+A64B + item = mapping_data.map("%04X" % k[0]) + upper = item.upper if item + v = v.sort_by { |i| ("%04X"%i) == upper ? 0 : 1 } + end + ck = @debug ? ' /* ' + Array(k).pack("U*") + ' */' : '' + cv = @debug ? ' /* ' + Array(v).map{|c|[c].pack("U*")}.join(", ") + ' */' : '' + dest.print(" {#{sk}#{ck}, {#{v.length}#{mapping_data.flags(k, type, v)}, {#{hex_seq(v)}#{cv}}}},\n") + end + data + end + + def print_table(dest, type, mapping_data, data) + dest.print("static const #{type}_Type #{type}_Table[] = {\n") + i = 0 + ret = data.inject([]) do |a, (n, d)| + dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n") + i += d.size + a.concat(print_table_1(dest, type, mapping_data, d)) + end + dest.print("};\n\n") + ret + end + end + + include Util + + attr_reader :fold, :fold_locale, :unfold, :unfold_locale, :version + + def load(filename) + pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/ + + @fold = fold = {} + @unfold = unfold = [{}, {}, {}] + @debug = false + @version = nil + turkic = [] + + IO.foreach(filename, mode: "rb") do |line| + @version ||= line[/-([0-9.]+).txt/, 1] + next unless res = pattern.match(line) + ch_from = res[1].to_i(16) + + if res[2] == 'T' + # Turkic case folding + turkic << ch_from + next + end + + # store folding data + ch_to = res[3..6].inject([]) do |a, i| + break a unless i + a << i.to_i(16) + end + fold[ch_from] = ch_to + + # store unfolding data + i = ch_to.length - 1 + (unfold[i][ch_to] ||= []) << ch_from + end + + # move locale dependent data to (un)fold_locale + @fold_locale = fold_locale = {} + @unfold_locale = unfold_locale = [{}, {}] + for ch_from in turkic + key = fold[ch_from] + i = key.length - 1 + unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key) + fold_locale[ch_from] = fold.delete(ch_from) + end + self + end + + def range_check(code) + "#{code} <= MAX_CODE_VALUE && #{code} >= MIN_CODE_VALUE" + end + + def lookup_hash(key, type, data) + hash = "onigenc_unicode_#{key}_hash" + lookup = "onigenc_unicode_#{key}_lookup" + arity = Array(data[0][0]).size + gperf = %W"gperf -7 -k#{[*1..(arity*3)].join(',')} -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup} -n" + argname = arity > 1 ? "codes" : "code" + argdecl = "const OnigCodePoint #{arity > 1 ? "*": ""}#{argname}" + n = 7 + m = (1 << n) - 1 + min, max = data.map {|c, *|c}.flatten.minmax + src = IO.popen(gperf, "r+") {|f| + f << "short\n%%\n" + data.each_with_index {|(k, _), i| + k = Array(k) + ks = k.map {|j| [(j >> n*2) & m, (j >> n) & m, (j) & m]}.flatten.map {|c| "\\x%.2x" % c}.join("") + f.printf "\"%s\", ::::/*%s*/ %d\n", ks, k.map {|c| "0x%.4x" % c}.join(","), i + } + f << "%%\n" + f.close_write + f.read + } + src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { + name = $1 + body = $2 + body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_#{arity > 1 ? 'at' : 'of'}(#{argname}, \\1)") + "#{name}(#{argdecl})\n{\n#{body}}" + } + src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { + name = $1 + body = $2 + body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1") + body.gsub!(/(#{hash})\s*\(.*?\)/, "\\1(#{argname})") + body.gsub!(/\{"",-1}/, "-1") + body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1') + body.sub!(/(\s+if\s)\(len\b.*\)/) do + "#$1(" << + (arity > 1 ? 
(0...arity).map {|i| range_check("#{argname}[#{i}]")}.join(" &&\n ") : range_check(argname)) << + ")" + end + v = nil + body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) { + pre = $1 + indent = $2 + s = $3 + s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]') + v = $1 + s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code#{arity}_equal(#{argname}, #{key}_Table[#{v}].from))") + "#{pre}{#{s}\n#{indent}}" + } + body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;") + "static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}" + } + src + end + + def display(dest, mapping_data) + # print the header + dest.print("/* DO NOT EDIT THIS FILE. */\n") + dest.print("/* Generated by tool/case-folding.rb */\n\n") + + versions = version.scan(/\d+/) + dest.print("#if defined ONIG_UNICODE_VERSION_STRING && !( \\\n") + %w[MAJOR MINOR TEENY].zip(versions) do |n, v| + dest.print(" ONIG_UNICODE_VERSION_#{n} == #{v} && \\\n") + end + dest.print(" 1)\n") + dest.print("# error ONIG_UNICODE_VERSION_STRING mismatch\n") + dest.print("#endif\n") + dest.print("#define ONIG_UNICODE_VERSION_STRING #{version.dump}\n") + %w[MAJOR MINOR TEENY].zip(versions) do |n, v| + dest.print("#define ONIG_UNICODE_VERSION_#{n} #{v}\n") + end + dest.print("\n") + + # print folding data + + # CaseFold + CaseFold_Locale + name = "CaseFold_11" + data = print_table(dest, name, mapping_data, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale) + dest.print lookup_hash(name, "CodePointList3", data) + + # print unfolding data + + # CaseUnfold_11 + CaseUnfold_11_Locale + name = "CaseUnfold_11" + data = print_table(dest, name, mapping_data, name=>unfold[0], "#{name}_Locale"=>unfold_locale[0]) + dest.print lookup_hash(name, "CodePointList3", data) + + # CaseUnfold_12 + CaseUnfold_12_Locale + name = "CaseUnfold_12" + data = print_table(dest, name, mapping_data, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1]) + dest.print lookup_hash(name, "CodePointList2", data) + + # CaseUnfold_13 + name = "CaseUnfold_13" + data = print_table(dest, name, mapping_data, name=>unfold[2]) + dest.print lookup_hash(name, "CodePointList2", data) + + # TitleCase + dest.print mapping_data.specials_output + end + + def debug! + @debug = true + end + + def self.load(*args) + new.load(*args) + end +end + +class MapItem + attr_accessor :upper, :lower, :title, :code + + def initialize(code, upper, lower, title) + @code = code + @upper = upper unless upper == '' + @lower = lower unless lower == '' + @title = title unless title == '' + end +end + +class CaseMapping + attr_reader :filename, :version + + def initialize(mapping_directory) + @mappings = {} + @specials = [] + @specials_length = 0 + @version = nil + IO.foreach(File.join(mapping_directory, 'UnicodeData.txt'), mode: "rb") do |line| + next if line =~ /^</ + code, __1,__2,__3,__4,__5,__6,__7,__8,__9,__10,__11, upper, lower, title = line.chomp.split ';' + unless upper and lower and title and (upper+lower+title)=='' + @mappings[code] = MapItem.new(code, upper, lower, title) + end + end + + @filename = File.join(mapping_directory, 'SpecialCasing.txt') + IO.foreach(@filename, mode: "rb") do |line| + @version ||= line[/-([0-9.]+).txt/, 1] + line.chomp! 
+ line, comment = line.split(/ *#/) + next if not line or line == '' + code, lower, title, upper, conditions = line.split(/ *; */) + unless conditions + item = @mappings[code] + item.lower = lower + item.title = title + item.upper = upper + end + end + end + + def map (from) + @mappings[from] + end + + def flags(from, type, to) + # types: CaseFold_11, CaseUnfold_11, CaseUnfold_12, CaseUnfold_13 + flags = "" + from = Array(from).map {|i| "%04X" % i}.join(" ") + to = Array(to).map {|i| "%04X" % i}.join(" ") + item = map(from) + specials = [] + case type + when 'CaseFold_11' + flags += '|F' + if item + flags += '|U' if to==item.upper + flags += '|D' if to==item.lower + unless item.upper == item.title + if item.code == item.title + flags += '|IT' + swap = case item.code + when '01C5' then '0064 017D' + when '01C8' then '006C 004A' + when '01CB' then '006E 004A' + when '01F2' then '0064 005A' + else # Greek + to.split(' ').first + ' 0399' + end + specials << swap + else + flags += '|ST' + specials << item.title + end + end + unless item.lower.nil? or item.lower==from or item.lower==to + specials << item.lower + flags += '|SL' + end + unless item.upper.nil? or item.upper==from or item.upper==to + specials << item.upper + flags += '|SU' + end + end + when 'CaseUnfold_11' + to = to.split(/ /) + if item + case to.first + when item.upper then flags += '|U' + when item.lower then flags += '|D' + else + raise "Unpredicted case 0 in enc/unicode/case_folding.rb. Please contact https://bugs.ruby-lang.org/." + end + unless item.upper == item.title + if item.code == item.title + flags += '|IT' # was unpredicted case 1 + elsif item.title==to[1] + flags += '|ST' + else + raise "Unpredicted case 2 in enc/unicode/case_folding.rb. Please contact https://bugs.ruby-lang.org/." + end + end + end + end + unless specials.empty? + flags += "|I(#{@specials_length})" + @specials_length += specials.map { |s| s.split(/ /).length }.reduce(:+) + @specials << specials + end + flags + end + + def debug! + @debug = true + end + + def specials_output + "static const OnigCodePoint CaseMappingSpecials[] = {\n" + + @specials.map do |sps| + ' ' + sps.map do |sp| + chars = sp.split(/ /) + ct = ' /* ' + Array(chars).map{|c|[c.to_i(16)].pack("U*")}.join(", ") + ' */' if @debug + " L(#{chars.length})|#{chars.map {|c| "0x"+c }.join(', ')}#{ct}," + end.join + "\n" + end.join + "};\n" + end + + def self.load(*args) + new(*args) + end +end + +class CaseMappingDummy + def flags(from, type, to) + "" + end + + def titlecase_output() '' end + def debug!() end +end + +if $0 == __FILE__ + require 'optparse' + dest = nil + mapping_directory = nil + mapping_data = nil + debug = false + fold_1 = false + ARGV.options do |opt| + opt.banner << " [INPUT]" + opt.on("--output-file=FILE", "-o", "output to the FILE instead of STDOUT") {|output| + dest = (output unless output == '-') + } + opt.on('--mapping-data-directory=DIRECTORY', '-m', 'data DIRECTORY of mapping files') { |directory| + mapping_directory = directory + } + opt.on('--debug', '-d') { + debug = true + } + opt.parse! + abort(opt.to_s) if ARGV.size > 1 + end + if mapping_directory + if ARGV[0] + warn "Either specify directory or individual file, but not both." 
+ exit + end + filename = File.join(mapping_directory, 'CaseFolding.txt') + mapping_data = CaseMapping.load(mapping_directory) + end + filename ||= ARGV[0] || 'CaseFolding.txt' + data = CaseFolding.load(filename) + if mapping_data and data.version != mapping_data.version + abort "Unicode data version mismatch\n" \ + " #{filename} = #{data.version}\n" \ + " #{mapping_data.filename} = #{mapping_data.version}" + end + mapping_data ||= CaseMappingDummy.new + + if debug + data.debug! + mapping_data.debug! + end + f = StringIO.new + begin + data.display(f, mapping_data) + rescue Errno::ENOENT => e + raise unless /gperf/ =~ e.message + warn e.message + abort unless dest + File.utime(nil, nil, dest) # assume existing file is OK + exit + else + s = f.string + end + if dest + open(dest, "wb") do |file| + file.print(s) + end + else + STDOUT.print(s) + end +end diff --git a/fluent-bit/lib/onigmo/tool/convert-jis-props.sh b/fluent-bit/lib/onigmo/tool/convert-jis-props.sh new file mode 100755 index 00000000..476a5a53 --- /dev/null +++ b/fluent-bit/lib/onigmo/tool/convert-jis-props.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +# Convert props.kwd to props.h using GNU gperf. +# +# Usage: +# ./tool/convert-jis-props.sh enc/jis/props.kwd enc/jis/props.h + +GPERF_VERSION=`gperf -v | head -n1 | sed -e 's/^GNU gperf \([0-9]\+\)\.\([0-9]\+.*\)$/\1 \2/' | xargs printf '%02d%02d'` +if [ $GPERF_VERSION -ge '0301' ]; then + # static const struct enc_property *onig_jis_property(const char *str, unsigned int len); + GPERF_REPLACE='s/\(onig_jis_property([^,]\+, \).\+\( len)\)/\1size_t\2/' +else + GPERF_REPLACE='#' +fi + +JIS_PROPS_OPTIONS='-k1,3 -7 -c -j1 -i1 -t -C -P -t --ignore-case -H onig_jis_property_hash -Q onig_jis_property_pool -N onig_jis_property' + +gperf $JIS_PROPS_OPTIONS $1 | sed "$GPERF_REPLACE" | \ + sed 's/(int)(\(long\|size_t\))&((\([a-zA-Z_0-9 ]*[a-zA-Z_0-9]\) *\*)0)->\([a-zA-Z0-9_]*\),/(char)offsetof(\2, \3),/g' > $2 diff --git a/fluent-bit/lib/onigmo/tool/download-ucd.sh b/fluent-bit/lib/onigmo/tool/download-ucd.sh new file mode 100755 index 00000000..b6b46581 --- /dev/null +++ b/fluent-bit/lib/onigmo/tool/download-ucd.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +files='Blocks.txt CaseFolding.txt DerivedAge.txt DerivedCoreProperties.txt PropertyAliases.txt PropertyValueAliases.txt PropList.txt Scripts.txt SpecialCasing.txt UnicodeData.txt auxiliary/GraphemeBreakProperty.txt' +emoji_files='emoji-data.txt' + +if [ -z $1 ] || [ -z $2 ]; then + echo "usage: $0 UNICODE_VERSION EMOJI_VERSION" + exit 1 +fi +UNICODE_VERSION=$1 +EMOJI_VERSION=$2 + +# remove old files +if [ -d $UNICODE_VERSION ]; then + cd $UNICODE_VERSION + rm -f $files $emoji_files + rm -f GraphemeBreakProperty.txt + cd - +fi + +mkdir -p $UNICODE_VERSION/auxiliary +cd $UNICODE_VERSION + +for i in $files; do + echo http://www.unicode.org/Public/${UNICODE_VERSION}/ucd/$i +done | xargs wget +mv GraphemeBreakProperty.txt auxiliary +for i in $emoji_files; do + echo http://www.unicode.org/Public/${EMOJI_VERSION}/ucd/emoji/$i +done | xargs wget diff --git a/fluent-bit/lib/onigmo/tool/enc-unicode.rb b/fluent-bit/lib/onigmo/tool/enc-unicode.rb new file mode 100755 index 00000000..84f494e8 --- /dev/null +++ b/fluent-bit/lib/onigmo/tool/enc-unicode.rb @@ -0,0 +1,548 @@ +#!/usr/bin/env ruby + +# Creates the data structures needed by Oniguruma to map Unicode codepoints to +# property names and POSIX character classes +# +# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt, +# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt, +# 
DerivedAge.txt and Blocks.txt from unicode.org. +# (http://unicode.org/Public/UNIDATA/) And run following command. +# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd +# You can get source file for gperf. After this, simply make ruby. + +if ARGV[0] == "--header" + header = true + ARGV.shift +end +unless ARGV.size == 1 + abort "Usage: #{$0} data_directory" +end + +$unicode_version = File.basename(ARGV[0])[/\A[.\d]+\z/] + +POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print XPosixPunct Space Upper XDigit Word Alnum ASCII Punct] + +GPERF_VERSION = `gperf -v`.split("\n").first # /^GNU gperf (.+)$/ + .split.last + +def pair_codepoints(codepoints) + + # We have a sorted Array of codepoints that we wish to partition into + # ranges such that the start- and endpoints form an inclusive set of + # codepoints with property _property_. Note: It is intended that some ranges + # will begin with the value with which they end, e.g. 0x0020 -> 0x0020 + + codepoints.sort! + last_cp = codepoints.first + pairs = [[last_cp, nil]] + codepoints[1..-1].each do |codepoint| + next if last_cp == codepoint + + # If the current codepoint does not follow directly on from the last + # codepoint, the last codepoint represents the end of the current range, + # and the current codepoint represents the start of the next range. + if last_cp.next != codepoint + pairs[-1][-1] = last_cp + pairs << [codepoint, nil] + end + last_cp = codepoint + end + + # The final pair has as its endpoint the last codepoint for this property + pairs[-1][-1] = codepoints.last + pairs +end + +def parse_unicode_data(file) + last_cp = 0 + data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [], + 'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []} + beg_cp = nil + IO.foreach(file) do |line| + fields = line.split(';') + cp = fields[0].to_i(16) + + case fields[1] + when /\A<(.*),\s*First>\z/ + beg_cp = cp + next + when /\A<(.*),\s*Last>\z/ + cps = (beg_cp..cp).to_a + else + beg_cp = cp + cps = [cp] + end + + # The Cn category represents unassigned characters. These are not listed in + # UnicodeData.txt so we must derive them by looking for 'holes' in the range + # of listed codepoints. We increment the last codepoint seen and compare it + # with the current codepoint. If the current codepoint is less than + # last_cp.next we have found a hole, so we add the missing codepoint to the + # Cn category. + data['Cn'].concat((last_cp.next...beg_cp).to_a) + + # Assigned - Defined in unicode.c; interpreted as every character in the + # Unicode range minus the unassigned characters + data['Assigned'].concat(cps) + + # The third field denotes the 'General' category, e.g. Lu + (data[fields[2]] ||= []).concat(cps) + + # The 'Major' category is the first letter of the 'General' category, e.g. + # 'Lu' -> 'L' + (data[fields[2][0,1]] ||= []).concat(cps) + last_cp = cp + end + + # The last Cn codepoint should be 0x10ffff. If it's not, append the missing + # codepoints to Cn and C + cn_remainder = (last_cp.next..0x10ffff).to_a + data['Cn'] += cn_remainder + data['C'] += data['Cn'] + + # Special case for LC (Cased_Letter). LC = Ll + Lt + Lu + data['LC'] = data['Ll'] + data['Lt'] + data['Lu'] + + # Define General Category properties + gcps = data.keys.sort - POSIX_NAMES + + # Returns General Category Property names and the data + [gcps, data] +end + +def define_posix_props(data) + # We now derive the character classes (POSIX brackets), e.g. 
[[:alpha:]] + # + + data['Alpha'] = data['Alphabetic'] + data['Upper'] = data['Uppercase'] + data['Lower'] = data['Lowercase'] + data['Punct'] = data['Punctuation'] + data['XPosixPunct'] = data['Punctuation'] + [0x24, 0x2b, 0x3c, 0x3d, 0x3e, 0x5e, 0x60, 0x7c, 0x7e] + data['Digit'] = data['Decimal_Number'] + data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a + + (0x0061..0x0066).to_a + data['Alnum'] = data['Alpha'] + data['Digit'] + data['Space'] = data['White_Space'] + data['Blank'] = data['Space_Separator'] + [0x0009] + data['Cntrl'] = data['Cc'] + data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation'] + data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] - + data['Surrogate'] - data['Unassigned'] + data['Print'] = data['Graph'] + data['Space_Separator'] +end + +def parse_scripts(data, categories) + files = [ + {:fn => 'DerivedCoreProperties.txt', :title => 'Derived Property'}, + {:fn => 'Scripts.txt', :title => 'Script'}, + {:fn => 'PropList.txt', :title => 'Binary Property'}, + {:fn => 'emoji-data.txt', :title => 'Emoji'} + ] + current = nil + cps = [] + names = {} + files.each do |file| + data_foreach(file[:fn]) do |line| + if /^# Total (?:code points|elements): / =~ line + data[current] = cps + categories[current] = file[:title] + (names[file[:title]] ||= []) << current + cps = [] + elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line + current = $3 + $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16)) + end + end + end + # All code points not explicitly listed for Script + # have the value Unknown (Zzzz). + data['Unknown'] = (0..0x10ffff).to_a - data.values_at(*names['Script']).flatten + categories['Unknown'] = 'Script' + names.values.flatten << 'Unknown' +end + +def parse_aliases(data) + kv = {} + data_foreach('PropertyAliases.txt') do |line| + next unless /^(\w+)\s*; (\w+)/ =~ line + data[$1] = data[$2] + kv[normalize_propname($1)] = normalize_propname($2) + end + data_foreach('PropertyValueAliases.txt') do |line| + next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line + if $1 == 'gc' + data[$3] = data[$2] + data[$4] = data[$2] + kv[normalize_propname($3)] = normalize_propname($2) + kv[normalize_propname($4)] = normalize_propname($2) if $4 + else + data[$2] = data[$3] + data[$4] = data[$3] + kv[normalize_propname($2)] = normalize_propname($3) + kv[normalize_propname($4)] = normalize_propname($3) if $4 + end + end + kv +end + +# According to Unicode6.0.0/ch03.pdf, Section 3.1, "An update version +# never involves any additions to the character repertoire." Versions +# in DerivedAge.txt should always be /\d+\.\d+/ +def parse_age(data) + current = nil + last_constname = nil + cps = [] + ages = [] + data_foreach('DerivedAge.txt') do |line| + if /^# Total code points: / =~ line + constname = constantize_agename(current) + # each version matches all previous versions + cps.concat(data[last_constname]) if last_constname + data[constname] = cps + make_const(constname, cps, "Derived Age #{current}") + ages << current + last_constname = constname + cps = [] + elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line + current = $3 + $2 ? 
cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16)) + end + end + ages +end + +def parse_GraphemeBreakProperty(data) + current = nil + cps = [] + ages = [] + data_foreach('auxiliary/GraphemeBreakProperty.txt') do |line| + if /^# Total code points: / =~ line + constname = constantize_Grapheme_Cluster_Break(current) + data[constname] = cps + make_const(constname, cps, "Grapheme_Cluster_Break=#{current}") + ages << current + cps = [] + elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line + current = $3 + $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16)) + end + end + ages +end + +def parse_block(data) + cps = [] + blocks = [] + data_foreach('Blocks.txt') do |line| + if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line + cps = ($1.to_i(16)..$2.to_i(16)).to_a + constname = constantize_blockname($3) + data[constname] = cps + make_const(constname, cps, "Block") + blocks << constname + end + end + + # All code points not belonging to any of the named blocks + # have the value No_Block. + no_block = (0..0x10ffff).to_a - data.values_at(*blocks).flatten + constname = constantize_blockname("No_Block") + make_const(constname, no_block, "Block") + blocks << constname +end + +# shim for Ruby 1.8 +unless {}.respond_to?(:key) + class Hash + alias key index + end +end + +$const_cache = {} +# make_const(property, pairs, name): Prints a 'static const' structure for a +# given property, group of paired codepoints, and a human-friendly name for +# the group +def make_const(prop, data, name) + if name.empty? + puts "\n/* '#{prop}' */" + else + puts "\n/* '#{prop}': #{name} */" + end + if origprop = $const_cache.key(data) + puts "#define CR_#{prop} CR_#{origprop}" + else + $const_cache[prop] = data + pairs = pair_codepoints(data) + puts "static const OnigCodePoint CR_#{prop}[] = {" + # The first element of the constant is the number of pairs of codepoints + puts "\t#{pairs.size}," + pairs.each do |pair| + pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) } + puts "\t#{pair.first}, #{pair.last}," + end + puts "}; /* CR_#{prop} */" + end +end + +def normalize_propname(name) + name = name.downcase + name.delete!('- _') + name +end + +def constantize_agename(name) + "Age_#{name.sub(/\./, '_')}" +end + +def constantize_Grapheme_Cluster_Break(name) + "Grapheme_Cluster_Break_#{name}" +end + +def constantize_blockname(name) + "In_#{name.gsub(/\W/, '_')}" +end + +def get_file(name) + File.join(ARGV[0], name) +end + +def data_foreach(name, &block) + fn = get_file(name) + warn "Reading #{name}" + pat = /^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/ + File.open(fn, 'rb') do |f| + line = f.gets + unless /^emoji-/ =~ name + unless pat =~ line + raise ArgumentError, "#{name}: no Unicode version" + end + if !$unicode_version + $unicode_version = $1 + elsif $unicode_version != $1 + raise ArgumentError, "#{name}: Unicode version mismatch: #$1" + end + end + f.each(&block) + end +end + +# Write Data +class Unifdef + attr_accessor :output, :top, :stack, :stdout, :kwdonly + def initialize(out) + @top = @output = [] + @stack = [] + $stdout, @stdout = self, out + end + def restore + $stdout = @stdout + end + def ifdef(sym) + if @kwdonly + @stdout.puts "#ifdef #{sym}" + else + @stack << @top + @top << tmp = [sym] + @top = tmp + end + if block_given? 
+ begin + return yield + ensure + endif(sym) + end + end + end + def endif(sym) + if @kwdonly + @stdout.puts "#endif /* #{sym} */" + else + unless sym == @top[0] + restore + raise ArgumentError, "#{sym} unmatch to #{@top[0]}" + end + @top = @stack.pop + end + end + def show(dest, *syms) + _show(dest, @output, syms) + end + def _show(dest, ary, syms) + if Symbol === (sym = ary[0]) + unless syms.include?(sym) + return + end + end + ary.each do |e| + case e + when Array + _show(dest, e, syms) + when String + dest.print e + end + end + end + def write(str) + if @kwdonly + @stdout.write(str) + else + @top << str + end + self + end + alias << write +end + +output = Unifdef.new($stdout) +output.kwdonly = !header + +puts '%{' +props, data = parse_unicode_data(get_file('UnicodeData.txt')) +categories = {} +props.concat parse_scripts(data, categories) +aliases = parse_aliases(data) +ages = blocks = graphemeBreaks = nil +define_posix_props(data) +POSIX_NAMES.each do |name| + if name == 'XPosixPunct' + make_const(name, data[name], "[[:Punct:]]") + elsif name == 'Punct' + make_const(name, data[name], "") + else + make_const(name, data[name], "[[:#{name}:]]") + end +end +output.ifdef :USE_UNICODE_PROPERTIES do + props.each do |name| + category = categories[name] || + case name.size + when 1 then 'Major Category' + when 2 then 'General Category' + else '-' + end + make_const(name, data[name], category) + end + output.ifdef :USE_UNICODE_AGE_PROPERTIES do + ages = parse_age(data) + end + graphemeBreaks = parse_GraphemeBreakProperty(data) + blocks = parse_block(data) +end +puts(<<'__HEREDOC') + +static const OnigCodePoint* const CodeRanges[] = { +__HEREDOC +POSIX_NAMES.each{|name|puts" CR_#{name},"} +output.ifdef :USE_UNICODE_PROPERTIES do + props.each{|name| puts" CR_#{name},"} + output.ifdef :USE_UNICODE_AGE_PROPERTIES do + ages.each{|name| puts" CR_#{constantize_agename(name)},"} + end + graphemeBreaks.each{|name| puts" CR_#{constantize_Grapheme_Cluster_Break(name)},"} + blocks.each{|name|puts" CR_#{name},"} +end + +puts(<<"__HEREDOC") +}; +struct uniname2ctype_struct { + short name; + unsigned short ctype; +}; +#define uniname2ctype_offset(str) offsetof(struct uniname2ctype_pool_t, uniname2ctype_pool_##str) + +static const struct uniname2ctype_struct *uniname2ctype_p(const char *, #{ GPERF_VERSION >= '3.1' ? 
'size_t' : 'unsigned int' }); +%} +struct uniname2ctype_struct; +%% +__HEREDOC + +i = -1 +name_to_index = {} +POSIX_NAMES.each do |name| + i += 1 + next if name == 'NEWLINE' + name = normalize_propname(name) + name_to_index[name] = i + puts"%-40s %3d" % [name + ',', i] +end +output.ifdef :USE_UNICODE_PROPERTIES do + props.each do |name| + i += 1 + name = normalize_propname(name) + name_to_index[name] = i + puts "%-40s %3d" % [name + ',', i] + end + aliases.each_pair do |k, v| + next if name_to_index[k] + next unless v = name_to_index[v] + puts "%-40s %3d" % [k + ',', v] + end + output.ifdef :USE_UNICODE_AGE_PROPERTIES do + ages.each do |name| + i += 1 + name = "age=#{name}" + name_to_index[name] = i + puts "%-40s %3d" % [name + ',', i] + end + end + graphemeBreaks.each do |name| + i += 1 + name = "graphemeclusterbreak=#{name.delete('_').downcase}" + name_to_index[name] = i + puts "%-40s %3d" % [name + ',', i] + end + blocks.each do |name| + i += 1 + name = normalize_propname(name) + name_to_index[name] = i + puts "%-40s %3d" % [name + ',', i] + end +end +puts(<<'__HEREDOC') +%% +static int +uniname2ctype(const UChar *name, unsigned int len) +{ + const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len); + if (p) return p->ctype; + return -1; +} +__HEREDOC +versions = $unicode_version.scan(/\d+/) +print("#if defined ONIG_UNICODE_VERSION_STRING && !( \\\n") +%w[MAJOR MINOR TEENY].zip(versions) do |n, v| + print(" ONIG_UNICODE_VERSION_#{n} == #{v} && \\\n") +end +print(" 1)\n") +print("# error ONIG_UNICODE_VERSION_STRING mismatch\n") +print("#endif\n") +print("#define ONIG_UNICODE_VERSION_STRING #{$unicode_version.dump}\n") +%w[MAJOR MINOR TEENY].zip(versions) do |n, v| + print("#define ONIG_UNICODE_VERSION_#{n} #{v}\n") +end + +output.restore + +if header + require 'tempfile' + + NAME2CTYPE = %w[gperf -7 -c -j1 -i1 -t -C -P -T -H uniname2ctype_hash -Q uniname2ctype_pool -N uniname2ctype_p] + + fds = [] + syms = %i[USE_UNICODE_PROPERTIES USE_UNICODE_AGE_PROPERTIES] + begin + fds << (tmp = Tempfile.new(%w"name2ctype .h")) + IO.popen([*NAME2CTYPE, out: tmp], "w") {|f| output.show(f, *syms)} + end while syms.pop + fds.each(&:close) + IO.popen(%W[diff -DUSE_UNICODE_AGE_PROPERTIES #{fds[1].path} #{fds[0].path}], "r") {|age| + IO.popen(%W[diff -DUSE_UNICODE_PROPERTIES #{fds[2].path} -], "r", in: age) {|f| + f.each {|line| + line.gsub!(/\(int\)\((?:long|size_t)\)&\(\(struct uniname2ctype_pool_t \*\)0\)->uniname2ctype_pool_(str\d+),\s+/, + 'uniname2ctype_offset(\1), ') + puts line + } + } + } +end diff --git a/fluent-bit/lib/onigmo/tool/update-doc.py b/fluent-bit/lib/onigmo/tool/update-doc.py new file mode 100755 index 00000000..4126adff --- /dev/null +++ b/fluent-bit/lib/onigmo/tool/update-doc.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Usage: +# $ python update-doc.py UCD_DIR > ../doc/UnicodeProps.txt + +from __future__ import print_function +import sys +import os +import re +import datetime + +onig_ver = "6.2.2" +ucddir = "." 
+ +def print_list(arr, title): + print() + print("*", title) + for i in arr: + print(" " + i) + +def output_header(): + d = datetime.date.today() + print("Onigmo (Oniguruma-mod) Unicode Properties Version %s %04d/%02d/%02d" + % (onig_ver, d.year, d.month, d.day)) + + posix_brackets = [ + "Alpha", "Blank", "Cntrl", "Digit", "Graph", "Lower", "Print", + "Punct", "Space", "Upper", "XDigit", "Word", "Alnum", "ASCII", + "XPosixPunct" + ] + specials = ["Any", "Assigned"] + + print_list(posix_brackets, "POSIX brackets") + print_list(specials, "Special") + return set(posix_brackets) | set(specials) + +def output_categories(): + categories = set(["LC", "Cn"]) + pattern = re.compile('^.*?;.*?;(..);') + with open(ucddir + os.sep + 'UnicodeData.txt', 'r') as f: + for line in f: + res = pattern.match(line) + if not res: + continue + categories.add(res.group(1)) + categories.add(res.group(1)[0]) # Major category + print_list(sorted(categories), "Major and General Categories") + return categories + +def output_scripts(filename, title, add=[]): + scripts = set(add) + pattern = re.compile('^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (\w+) +# ') + with open(filename, 'r') as f: + for line in f: + res = pattern.match(line) + if not res: + continue + scripts.add(res.group(1)) + print_list(sorted(scripts), title) + return scripts + +def output_aliases(scripts): + aliases = set() + pattern = re.compile('^(\w+) *; (\w+)') + with open(ucddir + os.sep + 'PropertyAliases.txt', 'r') as f: + for line in f: + res = pattern.match(line) + if not res: + continue + if (res.group(2) in scripts) and (res.group(1) not in scripts): + aliases.add(res.group(1)) + print_list(sorted(aliases), "PropertyAliases") + return aliases + +def output_valuealiases(scripts): + scripts |= set(["cntrl", "digit", "punct"]) # exclude them + aliases = list() + aliases_sc = list() + pattern = re.compile('^(gc|sc) ; (\w+) *; (\w+)(?: *; (\w+))?') + with open(ucddir + os.sep + 'PropertyValueAliases.txt', 'r') as f: + for line in f: + res = pattern.match(line) + if not res: + continue + if (res.group(1) == "gc"): + if res.group(2) in scripts: + if res.group(3) not in scripts: + aliases.append(res.group(3)) + if res.group(4) and (res.group(4) not in scripts): + aliases.append(res.group(4)) + else: + if res.group(3) in scripts: + if res.group(2) not in scripts: + aliases_sc.append(res.group(2)) + if res.group(4) and (res.group(4) not in scripts): + aliases_sc.append(res.group(4)) + + print_list(aliases, "PropertyValueAliases (General_Category)") + print_list(aliases_sc, "PropertyValueAliases (Script)") + return set(aliases) | set(aliases_sc) + +def output_ages(): + ages = set() + pattern = re.compile('^[\dA-F.]+ *; ([\d.]+)') + with open(ucddir + os.sep + 'DerivedAge.txt', 'r') as f: + for line in f: + res = pattern.match(line) + if not res: + continue + ages.add("Age=" + res.group(1)) + print_list(sorted(ages), "DerivedAges") + return ages + +def output_blocks(): + blocks = list() + pattern = re.compile('^[\dA-F.]+ *; ([-\w ]+)') + with open(ucddir + os.sep + 'Blocks.txt', 'r') as f: + for line in f: + res = pattern.match(line) + if not res: + continue + blocks.append("In_" + re.sub('\W', '_', res.group(1))) + blocks.append("In_No_Block") + print_list(blocks, "Blocks") + return set(blocks) + +def main(): + global ucddir + if len(sys.argv) > 1: + ucddir = sys.argv[1] + scripts = set() + scripts |= output_header() + scripts |= output_categories() + scripts |= output_scripts(ucddir + os.sep + 'Scripts.txt', 'Scripts', ["Unknown"]) + scripts |= 
output_scripts(ucddir + os.sep + 'DerivedCoreProperties.txt', 'DerivedCoreProperties')
+    scripts |= output_scripts(ucddir + os.sep + 'PropList.txt', 'PropList')
+    scripts |= output_scripts(ucddir + os.sep + 'emoji-data.txt', 'Emoji')
+    output_aliases(scripts)
+    output_valuealiases(scripts)
+    output_ages()
+    output_blocks()
+
+if __name__ == '__main__':
+    main()
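For orientation, case-folding.rb above consumes CaseFolding.txt entries of the form "code; status; mapping; # name"; its pattern keeps the C (common), F (full) and T (Turkic) statuses and routes the T entries into the locale-dependent fold/unfold tables. A few representative entries from a stock CaseFolding.txt, shown purely for illustration (they are not part of this change):

    0041; C; 0061; # LATIN CAPITAL LETTER A
    00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
    0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE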