Diffstat (limited to 'fluent-bit/lib/onigmo/tool')
-rw-r--r--  fluent-bit/lib/onigmo/tool/.gitignore           |  18
-rw-r--r--  fluent-bit/lib/onigmo/tool/Makefile             |  48
-rwxr-xr-x  fluent-bit/lib/onigmo/tool/case-folding.rb      | 418
-rwxr-xr-x  fluent-bit/lib/onigmo/tool/convert-jis-props.sh |  19
-rwxr-xr-x  fluent-bit/lib/onigmo/tool/download-ucd.sh      |  30
-rwxr-xr-x  fluent-bit/lib/onigmo/tool/enc-unicode.rb       | 548
-rwxr-xr-x  fluent-bit/lib/onigmo/tool/update-doc.py        | 145
7 files changed, 0 insertions(+), 1226 deletions(-)
diff --git a/fluent-bit/lib/onigmo/tool/.gitignore b/fluent-bit/lib/onigmo/tool/.gitignore deleted file mode 100644 index 981fe8fb6..000000000 --- a/fluent-bit/lib/onigmo/tool/.gitignore +++ /dev/null @@ -1,18 +0,0 @@ -# ignore UCD files -Blocks.txt -CaseFolding.txt -DerivedAge.txt -DerivedCoreProperties.txt -PropList.txt -PropertyAliases.txt -PropertyValueAliases.txt -Scripts.txt -SpecialCasing.txt -UnicodeData.txt -GraphemeBreakProperty.txt -emoji-data.txt - -# ignore generated files -casefold.h -name2ctype.h -name2ctype.kwd diff --git a/fluent-bit/lib/onigmo/tool/Makefile b/fluent-bit/lib/onigmo/tool/Makefile deleted file mode 100644 index cca6e732c..000000000 --- a/fluent-bit/lib/onigmo/tool/Makefile +++ /dev/null @@ -1,48 +0,0 @@ -UNICODE_VERSION = 15.0.0 -EMOJI_VERSION = 15.0.0 - -PROP_FILES = \ - $(UNICODE_VERSION)/Blocks.txt \ - $(UNICODE_VERSION)/DerivedAge.txt \ - $(UNICODE_VERSION)/DerivedCoreProperties.txt \ - $(UNICODE_VERSION)/PropertyAliases.txt \ - $(UNICODE_VERSION)/PropertyValueAliases.txt \ - $(UNICODE_VERSION)/PropList.txt \ - $(UNICODE_VERSION)/Scripts.txt \ - $(UNICODE_VERSION)/UnicodeData.txt \ - $(UNICODE_VERSION)/auxiliary/GraphemeBreakProperty.txt \ - $(UNICODE_VERSION)/emoji-data.txt - -CASEFOLD_FILES = \ - $(UNICODE_VERSION)/CaseFolding.txt \ - $(UNICODE_VERSION)/UnicodeData.txt \ - $(UNICODE_VERSION)/SpecialCasing.txt - -update: update-unicode-header update-jis-header update-doc - -update-unicode-header: casefold.h name2ctype.h - cp casefold.h name2ctype.h ../enc/unicode - -update-jis-header: ../enc/jis/props.kwd - cd .. && ./tool/convert-jis-props.sh enc/jis/props.kwd enc/jis/props.h && cd - - -update-doc: $(PROP_FILES) update-doc.py - $(PYTHON) ./update-doc.py $(UNICODE_VERSION) > ../doc/UnicodeProps.txt - -download: - ./download-ucd.sh $(UNICODE_VERSION) $(EMOJI_VERSION) - - -casefold.h: $(CASEFOLD_FILES) case-folding.rb - $(RUBY) ./case-folding.rb -m $(UNICODE_VERSION) -o casefold.h - -name2ctype.h: $(PROP_FILES) enc-unicode.rb - $(RUBY) ./enc-unicode.rb --header $(UNICODE_VERSION) > name2ctype.h || rm -f name2ctype.h - - -clean: - -rm -f casefold.h name2ctype.kwd name2ctype.h - -rm -f $(PROP_FILES) $(CASEFOLD_FILES) - -rm -f GraphemeBreakProperty.txt - -rmdir $(UNICODE_VERSION)/auxiliary - -rmdir $(UNICODE_VERSION) diff --git a/fluent-bit/lib/onigmo/tool/case-folding.rb b/fluent-bit/lib/onigmo/tool/case-folding.rb deleted file mode 100755 index c299074f0..000000000 --- a/fluent-bit/lib/onigmo/tool/case-folding.rb +++ /dev/null @@ -1,418 +0,0 @@ -#!/usr/bin/ruby -require 'stringio' - -# Usage (for case folding only): -# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt -# $ ruby case-folding.rb CaseFolding.txt -o casefold.h -# or (for case folding and case mapping): -# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt -# $ wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt -# $ wget http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt -# $ ruby case-folding.rb -m . -o casefold.h -# using -d or --debug will include UTF-8 characters in comments for debugging - -class CaseFolding - module Util - module_function - - def hex_seq(v) - v.map { |i| "0x%04x" % i }.join(", ") - end - - def print_table_1(dest, type, mapping_data, data) - for k, v in data = data.sort - sk = (Array === k and k.length > 1) ? 
"{#{hex_seq(k)}}" : ("0x%04x" % k) - if type=='CaseUnfold_11' and v.length>1 - # reorder CaseUnfold_11 entries to avoid special treatment for U+03B9/U+03BC/U+A64B - item = mapping_data.map("%04X" % k[0]) - upper = item.upper if item - v = v.sort_by { |i| ("%04X"%i) == upper ? 0 : 1 } - end - ck = @debug ? ' /* ' + Array(k).pack("U*") + ' */' : '' - cv = @debug ? ' /* ' + Array(v).map{|c|[c].pack("U*")}.join(", ") + ' */' : '' - dest.print(" {#{sk}#{ck}, {#{v.length}#{mapping_data.flags(k, type, v)}, {#{hex_seq(v)}#{cv}}}},\n") - end - data - end - - def print_table(dest, type, mapping_data, data) - dest.print("static const #{type}_Type #{type}_Table[] = {\n") - i = 0 - ret = data.inject([]) do |a, (n, d)| - dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n") - i += d.size - a.concat(print_table_1(dest, type, mapping_data, d)) - end - dest.print("};\n\n") - ret - end - end - - include Util - - attr_reader :fold, :fold_locale, :unfold, :unfold_locale, :version - - def load(filename) - pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/ - - @fold = fold = {} - @unfold = unfold = [{}, {}, {}] - @debug = false - @version = nil - turkic = [] - - IO.foreach(filename, mode: "rb") do |line| - @version ||= line[/-([0-9.]+).txt/, 1] - next unless res = pattern.match(line) - ch_from = res[1].to_i(16) - - if res[2] == 'T' - # Turkic case folding - turkic << ch_from - next - end - - # store folding data - ch_to = res[3..6].inject([]) do |a, i| - break a unless i - a << i.to_i(16) - end - fold[ch_from] = ch_to - - # store unfolding data - i = ch_to.length - 1 - (unfold[i][ch_to] ||= []) << ch_from - end - - # move locale dependent data to (un)fold_locale - @fold_locale = fold_locale = {} - @unfold_locale = unfold_locale = [{}, {}] - for ch_from in turkic - key = fold[ch_from] - i = key.length - 1 - unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key) - fold_locale[ch_from] = fold.delete(ch_from) - end - self - end - - def range_check(code) - "#{code} <= MAX_CODE_VALUE && #{code} >= MIN_CODE_VALUE" - end - - def lookup_hash(key, type, data) - hash = "onigenc_unicode_#{key}_hash" - lookup = "onigenc_unicode_#{key}_lookup" - arity = Array(data[0][0]).size - gperf = %W"gperf -7 -k#{[*1..(arity*3)].join(',')} -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup} -n" - argname = arity > 1 ? "codes" : "code" - argdecl = "const OnigCodePoint #{arity > 1 ? "*": ""}#{argname}" - n = 7 - m = (1 << n) - 1 - min, max = data.map {|c, *|c}.flatten.minmax - src = IO.popen(gperf, "r+") {|f| - f << "short\n%%\n" - data.each_with_index {|(k, _), i| - k = Array(k) - ks = k.map {|j| [(j >> n*2) & m, (j >> n) & m, (j) & m]}.flatten.map {|c| "\\x%.2x" % c}.join("") - f.printf "\"%s\", ::::/*%s*/ %d\n", ks, k.map {|c| "0x%.4x" % c}.join(","), i - } - f << "%%\n" - f.close_write - f.read - } - src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { - name = $1 - body = $2 - body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_#{arity > 1 ? 'at' : 'of'}(#{argname}, \\1)") - "#{name}(#{argdecl})\n{\n#{body}}" - } - src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { - name = $1 - body = $2 - body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1") - body.gsub!(/(#{hash})\s*\(.*?\)/, "\\1(#{argname})") - body.gsub!(/\{"",-1}/, "-1") - body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1') - body.sub!(/(\s+if\s)\(len\b.*\)/) do - "#$1(" << - (arity > 1 ? 
(0...arity).map {|i| range_check("#{argname}[#{i}]")}.join(" &&\n ") : range_check(argname)) << - ")" - end - v = nil - body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) { - pre = $1 - indent = $2 - s = $3 - s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]') - v = $1 - s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code#{arity}_equal(#{argname}, #{key}_Table[#{v}].from))") - "#{pre}{#{s}\n#{indent}}" - } - body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;") - "static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}" - } - src - end - - def display(dest, mapping_data) - # print the header - dest.print("/* DO NOT EDIT THIS FILE. */\n") - dest.print("/* Generated by tool/case-folding.rb */\n\n") - - versions = version.scan(/\d+/) - dest.print("#if defined ONIG_UNICODE_VERSION_STRING && !( \\\n") - %w[MAJOR MINOR TEENY].zip(versions) do |n, v| - dest.print(" ONIG_UNICODE_VERSION_#{n} == #{v} && \\\n") - end - dest.print(" 1)\n") - dest.print("# error ONIG_UNICODE_VERSION_STRING mismatch\n") - dest.print("#endif\n") - dest.print("#define ONIG_UNICODE_VERSION_STRING #{version.dump}\n") - %w[MAJOR MINOR TEENY].zip(versions) do |n, v| - dest.print("#define ONIG_UNICODE_VERSION_#{n} #{v}\n") - end - dest.print("\n") - - # print folding data - - # CaseFold + CaseFold_Locale - name = "CaseFold_11" - data = print_table(dest, name, mapping_data, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale) - dest.print lookup_hash(name, "CodePointList3", data) - - # print unfolding data - - # CaseUnfold_11 + CaseUnfold_11_Locale - name = "CaseUnfold_11" - data = print_table(dest, name, mapping_data, name=>unfold[0], "#{name}_Locale"=>unfold_locale[0]) - dest.print lookup_hash(name, "CodePointList3", data) - - # CaseUnfold_12 + CaseUnfold_12_Locale - name = "CaseUnfold_12" - data = print_table(dest, name, mapping_data, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1]) - dest.print lookup_hash(name, "CodePointList2", data) - - # CaseUnfold_13 - name = "CaseUnfold_13" - data = print_table(dest, name, mapping_data, name=>unfold[2]) - dest.print lookup_hash(name, "CodePointList2", data) - - # TitleCase - dest.print mapping_data.specials_output - end - - def debug! - @debug = true - end - - def self.load(*args) - new.load(*args) - end -end - -class MapItem - attr_accessor :upper, :lower, :title, :code - - def initialize(code, upper, lower, title) - @code = code - @upper = upper unless upper == '' - @lower = lower unless lower == '' - @title = title unless title == '' - end -end - -class CaseMapping - attr_reader :filename, :version - - def initialize(mapping_directory) - @mappings = {} - @specials = [] - @specials_length = 0 - @version = nil - IO.foreach(File.join(mapping_directory, 'UnicodeData.txt'), mode: "rb") do |line| - next if line =~ /^</ - code, __1,__2,__3,__4,__5,__6,__7,__8,__9,__10,__11, upper, lower, title = line.chomp.split ';' - unless upper and lower and title and (upper+lower+title)=='' - @mappings[code] = MapItem.new(code, upper, lower, title) - end - end - - @filename = File.join(mapping_directory, 'SpecialCasing.txt') - IO.foreach(@filename, mode: "rb") do |line| - @version ||= line[/-([0-9.]+).txt/, 1] - line.chomp! 
- line, comment = line.split(/ *#/) - next if not line or line == '' - code, lower, title, upper, conditions = line.split(/ *; */) - unless conditions - item = @mappings[code] - item.lower = lower - item.title = title - item.upper = upper - end - end - end - - def map (from) - @mappings[from] - end - - def flags(from, type, to) - # types: CaseFold_11, CaseUnfold_11, CaseUnfold_12, CaseUnfold_13 - flags = "" - from = Array(from).map {|i| "%04X" % i}.join(" ") - to = Array(to).map {|i| "%04X" % i}.join(" ") - item = map(from) - specials = [] - case type - when 'CaseFold_11' - flags += '|F' - if item - flags += '|U' if to==item.upper - flags += '|D' if to==item.lower - unless item.upper == item.title - if item.code == item.title - flags += '|IT' - swap = case item.code - when '01C5' then '0064 017D' - when '01C8' then '006C 004A' - when '01CB' then '006E 004A' - when '01F2' then '0064 005A' - else # Greek - to.split(' ').first + ' 0399' - end - specials << swap - else - flags += '|ST' - specials << item.title - end - end - unless item.lower.nil? or item.lower==from or item.lower==to - specials << item.lower - flags += '|SL' - end - unless item.upper.nil? or item.upper==from or item.upper==to - specials << item.upper - flags += '|SU' - end - end - when 'CaseUnfold_11' - to = to.split(/ /) - if item - case to.first - when item.upper then flags += '|U' - when item.lower then flags += '|D' - else - raise "Unpredicted case 0 in enc/unicode/case_folding.rb. Please contact https://bugs.ruby-lang.org/." - end - unless item.upper == item.title - if item.code == item.title - flags += '|IT' # was unpredicted case 1 - elsif item.title==to[1] - flags += '|ST' - else - raise "Unpredicted case 2 in enc/unicode/case_folding.rb. Please contact https://bugs.ruby-lang.org/." - end - end - end - end - unless specials.empty? - flags += "|I(#{@specials_length})" - @specials_length += specials.map { |s| s.split(/ /).length }.reduce(:+) - @specials << specials - end - flags - end - - def debug! - @debug = true - end - - def specials_output - "static const OnigCodePoint CaseMappingSpecials[] = {\n" + - @specials.map do |sps| - ' ' + sps.map do |sp| - chars = sp.split(/ /) - ct = ' /* ' + Array(chars).map{|c|[c.to_i(16)].pack("U*")}.join(", ") + ' */' if @debug - " L(#{chars.length})|#{chars.map {|c| "0x"+c }.join(', ')}#{ct}," - end.join + "\n" - end.join + "};\n" - end - - def self.load(*args) - new(*args) - end -end - -class CaseMappingDummy - def flags(from, type, to) - "" - end - - def titlecase_output() '' end - def debug!() end -end - -if $0 == __FILE__ - require 'optparse' - dest = nil - mapping_directory = nil - mapping_data = nil - debug = false - fold_1 = false - ARGV.options do |opt| - opt.banner << " [INPUT]" - opt.on("--output-file=FILE", "-o", "output to the FILE instead of STDOUT") {|output| - dest = (output unless output == '-') - } - opt.on('--mapping-data-directory=DIRECTORY', '-m', 'data DIRECTORY of mapping files') { |directory| - mapping_directory = directory - } - opt.on('--debug', '-d') { - debug = true - } - opt.parse! - abort(opt.to_s) if ARGV.size > 1 - end - if mapping_directory - if ARGV[0] - warn "Either specify directory or individual file, but not both." 
- exit - end - filename = File.join(mapping_directory, 'CaseFolding.txt') - mapping_data = CaseMapping.load(mapping_directory) - end - filename ||= ARGV[0] || 'CaseFolding.txt' - data = CaseFolding.load(filename) - if mapping_data and data.version != mapping_data.version - abort "Unicode data version mismatch\n" \ - " #{filename} = #{data.version}\n" \ - " #{mapping_data.filename} = #{mapping_data.version}" - end - mapping_data ||= CaseMappingDummy.new - - if debug - data.debug! - mapping_data.debug! - end - f = StringIO.new - begin - data.display(f, mapping_data) - rescue Errno::ENOENT => e - raise unless /gperf/ =~ e.message - warn e.message - abort unless dest - File.utime(nil, nil, dest) # assume existing file is OK - exit - else - s = f.string - end - if dest - open(dest, "wb") do |file| - file.print(s) - end - else - STDOUT.print(s) - end -end diff --git a/fluent-bit/lib/onigmo/tool/convert-jis-props.sh b/fluent-bit/lib/onigmo/tool/convert-jis-props.sh deleted file mode 100755 index 476a5a532..000000000 --- a/fluent-bit/lib/onigmo/tool/convert-jis-props.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh - -# Convert props.kwd to props.h using GNU gperf. -# -# Usage: -# ./tool/convert-jis-props.sh enc/jis/props.kwd enc/jis/props.h - -GPERF_VERSION=`gperf -v | head -n1 | sed -e 's/^GNU gperf \([0-9]\+\)\.\([0-9]\+.*\)$/\1 \2/' | xargs printf '%02d%02d'` -if [ $GPERF_VERSION -ge '0301' ]; then - # static const struct enc_property *onig_jis_property(const char *str, unsigned int len); - GPERF_REPLACE='s/\(onig_jis_property([^,]\+, \).\+\( len)\)/\1size_t\2/' -else - GPERF_REPLACE='#' -fi - -JIS_PROPS_OPTIONS='-k1,3 -7 -c -j1 -i1 -t -C -P -t --ignore-case -H onig_jis_property_hash -Q onig_jis_property_pool -N onig_jis_property' - -gperf $JIS_PROPS_OPTIONS $1 | sed "$GPERF_REPLACE" | \ - sed 's/(int)(\(long\|size_t\))&((\([a-zA-Z_0-9 ]*[a-zA-Z_0-9]\) *\*)0)->\([a-zA-Z0-9_]*\),/(char)offsetof(\2, \3),/g' > $2 diff --git a/fluent-bit/lib/onigmo/tool/download-ucd.sh b/fluent-bit/lib/onigmo/tool/download-ucd.sh deleted file mode 100755 index b6b46581f..000000000 --- a/fluent-bit/lib/onigmo/tool/download-ucd.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -files='Blocks.txt CaseFolding.txt DerivedAge.txt DerivedCoreProperties.txt PropertyAliases.txt PropertyValueAliases.txt PropList.txt Scripts.txt SpecialCasing.txt UnicodeData.txt auxiliary/GraphemeBreakProperty.txt' -emoji_files='emoji-data.txt' - -if [ -z $1 ] || [ -z $2 ]; then - echo "usage: $0 UNICODE_VERSION EMOJI_VERSION" - exit 1 -fi -UNICODE_VERSION=$1 -EMOJI_VERSION=$2 - -# remove old files -if [ -d $UNICODE_VERSION ]; then - cd $UNICODE_VERSION - rm -f $files $emoji_files - rm -f GraphemeBreakProperty.txt - cd - -fi - -mkdir -p $UNICODE_VERSION/auxiliary -cd $UNICODE_VERSION - -for i in $files; do - echo http://www.unicode.org/Public/${UNICODE_VERSION}/ucd/$i -done | xargs wget -mv GraphemeBreakProperty.txt auxiliary -for i in $emoji_files; do - echo http://www.unicode.org/Public/${EMOJI_VERSION}/ucd/emoji/$i -done | xargs wget diff --git a/fluent-bit/lib/onigmo/tool/enc-unicode.rb b/fluent-bit/lib/onigmo/tool/enc-unicode.rb deleted file mode 100755 index 84f494e8d..000000000 --- a/fluent-bit/lib/onigmo/tool/enc-unicode.rb +++ /dev/null @@ -1,548 +0,0 @@ -#!/usr/bin/env ruby - -# Creates the data structures needed by Oniguruma to map Unicode codepoints to -# property names and POSIX character classes -# -# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt, -# PropertyAliases.txt, PropertyValueAliases.txt, 
DerivedCoreProperties.txt, -# DerivedAge.txt and Blocks.txt from unicode.org. -# (http://unicode.org/Public/UNIDATA/) And run following command. -# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd -# You can get source file for gperf. After this, simply make ruby. - -if ARGV[0] == "--header" - header = true - ARGV.shift -end -unless ARGV.size == 1 - abort "Usage: #{$0} data_directory" -end - -$unicode_version = File.basename(ARGV[0])[/\A[.\d]+\z/] - -POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print XPosixPunct Space Upper XDigit Word Alnum ASCII Punct] - -GPERF_VERSION = `gperf -v`.split("\n").first # /^GNU gperf (.+)$/ - .split.last - -def pair_codepoints(codepoints) - - # We have a sorted Array of codepoints that we wish to partition into - # ranges such that the start- and endpoints form an inclusive set of - # codepoints with property _property_. Note: It is intended that some ranges - # will begin with the value with which they end, e.g. 0x0020 -> 0x0020 - - codepoints.sort! - last_cp = codepoints.first - pairs = [[last_cp, nil]] - codepoints[1..-1].each do |codepoint| - next if last_cp == codepoint - - # If the current codepoint does not follow directly on from the last - # codepoint, the last codepoint represents the end of the current range, - # and the current codepoint represents the start of the next range. - if last_cp.next != codepoint - pairs[-1][-1] = last_cp - pairs << [codepoint, nil] - end - last_cp = codepoint - end - - # The final pair has as its endpoint the last codepoint for this property - pairs[-1][-1] = codepoints.last - pairs -end - -def parse_unicode_data(file) - last_cp = 0 - data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [], - 'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []} - beg_cp = nil - IO.foreach(file) do |line| - fields = line.split(';') - cp = fields[0].to_i(16) - - case fields[1] - when /\A<(.*),\s*First>\z/ - beg_cp = cp - next - when /\A<(.*),\s*Last>\z/ - cps = (beg_cp..cp).to_a - else - beg_cp = cp - cps = [cp] - end - - # The Cn category represents unassigned characters. These are not listed in - # UnicodeData.txt so we must derive them by looking for 'holes' in the range - # of listed codepoints. We increment the last codepoint seen and compare it - # with the current codepoint. If the current codepoint is less than - # last_cp.next we have found a hole, so we add the missing codepoint to the - # Cn category. - data['Cn'].concat((last_cp.next...beg_cp).to_a) - - # Assigned - Defined in unicode.c; interpreted as every character in the - # Unicode range minus the unassigned characters - data['Assigned'].concat(cps) - - # The third field denotes the 'General' category, e.g. Lu - (data[fields[2]] ||= []).concat(cps) - - # The 'Major' category is the first letter of the 'General' category, e.g. - # 'Lu' -> 'L' - (data[fields[2][0,1]] ||= []).concat(cps) - last_cp = cp - end - - # The last Cn codepoint should be 0x10ffff. If it's not, append the missing - # codepoints to Cn and C - cn_remainder = (last_cp.next..0x10ffff).to_a - data['Cn'] += cn_remainder - data['C'] += data['Cn'] - - # Special case for LC (Cased_Letter). LC = Ll + Lt + Lu - data['LC'] = data['Ll'] + data['Lt'] + data['Lu'] - - # Define General Category properties - gcps = data.keys.sort - POSIX_NAMES - - # Returns General Category Property names and the data - [gcps, data] -end - -def define_posix_props(data) - # We now derive the character classes (POSIX brackets), e.g. 
[[:alpha:]] - # - - data['Alpha'] = data['Alphabetic'] - data['Upper'] = data['Uppercase'] - data['Lower'] = data['Lowercase'] - data['Punct'] = data['Punctuation'] - data['XPosixPunct'] = data['Punctuation'] + [0x24, 0x2b, 0x3c, 0x3d, 0x3e, 0x5e, 0x60, 0x7c, 0x7e] - data['Digit'] = data['Decimal_Number'] - data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a + - (0x0061..0x0066).to_a - data['Alnum'] = data['Alpha'] + data['Digit'] - data['Space'] = data['White_Space'] - data['Blank'] = data['Space_Separator'] + [0x0009] - data['Cntrl'] = data['Cc'] - data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation'] - data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] - - data['Surrogate'] - data['Unassigned'] - data['Print'] = data['Graph'] + data['Space_Separator'] -end - -def parse_scripts(data, categories) - files = [ - {:fn => 'DerivedCoreProperties.txt', :title => 'Derived Property'}, - {:fn => 'Scripts.txt', :title => 'Script'}, - {:fn => 'PropList.txt', :title => 'Binary Property'}, - {:fn => 'emoji-data.txt', :title => 'Emoji'} - ] - current = nil - cps = [] - names = {} - files.each do |file| - data_foreach(file[:fn]) do |line| - if /^# Total (?:code points|elements): / =~ line - data[current] = cps - categories[current] = file[:title] - (names[file[:title]] ||= []) << current - cps = [] - elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line - current = $3 - $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16)) - end - end - end - # All code points not explicitly listed for Script - # have the value Unknown (Zzzz). - data['Unknown'] = (0..0x10ffff).to_a - data.values_at(*names['Script']).flatten - categories['Unknown'] = 'Script' - names.values.flatten << 'Unknown' -end - -def parse_aliases(data) - kv = {} - data_foreach('PropertyAliases.txt') do |line| - next unless /^(\w+)\s*; (\w+)/ =~ line - data[$1] = data[$2] - kv[normalize_propname($1)] = normalize_propname($2) - end - data_foreach('PropertyValueAliases.txt') do |line| - next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line - if $1 == 'gc' - data[$3] = data[$2] - data[$4] = data[$2] - kv[normalize_propname($3)] = normalize_propname($2) - kv[normalize_propname($4)] = normalize_propname($2) if $4 - else - data[$2] = data[$3] - data[$4] = data[$3] - kv[normalize_propname($2)] = normalize_propname($3) - kv[normalize_propname($4)] = normalize_propname($3) if $4 - end - end - kv -end - -# According to Unicode6.0.0/ch03.pdf, Section 3.1, "An update version -# never involves any additions to the character repertoire." Versions -# in DerivedAge.txt should always be /\d+\.\d+/ -def parse_age(data) - current = nil - last_constname = nil - cps = [] - ages = [] - data_foreach('DerivedAge.txt') do |line| - if /^# Total code points: / =~ line - constname = constantize_agename(current) - # each version matches all previous versions - cps.concat(data[last_constname]) if last_constname - data[constname] = cps - make_const(constname, cps, "Derived Age #{current}") - ages << current - last_constname = constname - cps = [] - elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line - current = $3 - $2 ? 
cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16)) - end - end - ages -end - -def parse_GraphemeBreakProperty(data) - current = nil - cps = [] - ages = [] - data_foreach('auxiliary/GraphemeBreakProperty.txt') do |line| - if /^# Total code points: / =~ line - constname = constantize_Grapheme_Cluster_Break(current) - data[constname] = cps - make_const(constname, cps, "Grapheme_Cluster_Break=#{current}") - ages << current - cps = [] - elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line - current = $3 - $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16)) - end - end - ages -end - -def parse_block(data) - cps = [] - blocks = [] - data_foreach('Blocks.txt') do |line| - if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line - cps = ($1.to_i(16)..$2.to_i(16)).to_a - constname = constantize_blockname($3) - data[constname] = cps - make_const(constname, cps, "Block") - blocks << constname - end - end - - # All code points not belonging to any of the named blocks - # have the value No_Block. - no_block = (0..0x10ffff).to_a - data.values_at(*blocks).flatten - constname = constantize_blockname("No_Block") - make_const(constname, no_block, "Block") - blocks << constname -end - -# shim for Ruby 1.8 -unless {}.respond_to?(:key) - class Hash - alias key index - end -end - -$const_cache = {} -# make_const(property, pairs, name): Prints a 'static const' structure for a -# given property, group of paired codepoints, and a human-friendly name for -# the group -def make_const(prop, data, name) - if name.empty? - puts "\n/* '#{prop}' */" - else - puts "\n/* '#{prop}': #{name} */" - end - if origprop = $const_cache.key(data) - puts "#define CR_#{prop} CR_#{origprop}" - else - $const_cache[prop] = data - pairs = pair_codepoints(data) - puts "static const OnigCodePoint CR_#{prop}[] = {" - # The first element of the constant is the number of pairs of codepoints - puts "\t#{pairs.size}," - pairs.each do |pair| - pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) } - puts "\t#{pair.first}, #{pair.last}," - end - puts "}; /* CR_#{prop} */" - end -end - -def normalize_propname(name) - name = name.downcase - name.delete!('- _') - name -end - -def constantize_agename(name) - "Age_#{name.sub(/\./, '_')}" -end - -def constantize_Grapheme_Cluster_Break(name) - "Grapheme_Cluster_Break_#{name}" -end - -def constantize_blockname(name) - "In_#{name.gsub(/\W/, '_')}" -end - -def get_file(name) - File.join(ARGV[0], name) -end - -def data_foreach(name, &block) - fn = get_file(name) - warn "Reading #{name}" - pat = /^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/ - File.open(fn, 'rb') do |f| - line = f.gets - unless /^emoji-/ =~ name - unless pat =~ line - raise ArgumentError, "#{name}: no Unicode version" - end - if !$unicode_version - $unicode_version = $1 - elsif $unicode_version != $1 - raise ArgumentError, "#{name}: Unicode version mismatch: #$1" - end - end - f.each(&block) - end -end - -# Write Data -class Unifdef - attr_accessor :output, :top, :stack, :stdout, :kwdonly - def initialize(out) - @top = @output = [] - @stack = [] - $stdout, @stdout = self, out - end - def restore - $stdout = @stdout - end - def ifdef(sym) - if @kwdonly - @stdout.puts "#ifdef #{sym}" - else - @stack << @top - @top << tmp = [sym] - @top = tmp - end - if block_given? 
- begin - return yield - ensure - endif(sym) - end - end - end - def endif(sym) - if @kwdonly - @stdout.puts "#endif /* #{sym} */" - else - unless sym == @top[0] - restore - raise ArgumentError, "#{sym} unmatch to #{@top[0]}" - end - @top = @stack.pop - end - end - def show(dest, *syms) - _show(dest, @output, syms) - end - def _show(dest, ary, syms) - if Symbol === (sym = ary[0]) - unless syms.include?(sym) - return - end - end - ary.each do |e| - case e - when Array - _show(dest, e, syms) - when String - dest.print e - end - end - end - def write(str) - if @kwdonly - @stdout.write(str) - else - @top << str - end - self - end - alias << write -end - -output = Unifdef.new($stdout) -output.kwdonly = !header - -puts '%{' -props, data = parse_unicode_data(get_file('UnicodeData.txt')) -categories = {} -props.concat parse_scripts(data, categories) -aliases = parse_aliases(data) -ages = blocks = graphemeBreaks = nil -define_posix_props(data) -POSIX_NAMES.each do |name| - if name == 'XPosixPunct' - make_const(name, data[name], "[[:Punct:]]") - elsif name == 'Punct' - make_const(name, data[name], "") - else - make_const(name, data[name], "[[:#{name}:]]") - end -end -output.ifdef :USE_UNICODE_PROPERTIES do - props.each do |name| - category = categories[name] || - case name.size - when 1 then 'Major Category' - when 2 then 'General Category' - else '-' - end - make_const(name, data[name], category) - end - output.ifdef :USE_UNICODE_AGE_PROPERTIES do - ages = parse_age(data) - end - graphemeBreaks = parse_GraphemeBreakProperty(data) - blocks = parse_block(data) -end -puts(<<'__HEREDOC') - -static const OnigCodePoint* const CodeRanges[] = { -__HEREDOC -POSIX_NAMES.each{|name|puts" CR_#{name},"} -output.ifdef :USE_UNICODE_PROPERTIES do - props.each{|name| puts" CR_#{name},"} - output.ifdef :USE_UNICODE_AGE_PROPERTIES do - ages.each{|name| puts" CR_#{constantize_agename(name)},"} - end - graphemeBreaks.each{|name| puts" CR_#{constantize_Grapheme_Cluster_Break(name)},"} - blocks.each{|name|puts" CR_#{name},"} -end - -puts(<<"__HEREDOC") -}; -struct uniname2ctype_struct { - short name; - unsigned short ctype; -}; -#define uniname2ctype_offset(str) offsetof(struct uniname2ctype_pool_t, uniname2ctype_pool_##str) - -static const struct uniname2ctype_struct *uniname2ctype_p(const char *, #{ GPERF_VERSION >= '3.1' ? 
'size_t' : 'unsigned int' }); -%} -struct uniname2ctype_struct; -%% -__HEREDOC - -i = -1 -name_to_index = {} -POSIX_NAMES.each do |name| - i += 1 - next if name == 'NEWLINE' - name = normalize_propname(name) - name_to_index[name] = i - puts"%-40s %3d" % [name + ',', i] -end -output.ifdef :USE_UNICODE_PROPERTIES do - props.each do |name| - i += 1 - name = normalize_propname(name) - name_to_index[name] = i - puts "%-40s %3d" % [name + ',', i] - end - aliases.each_pair do |k, v| - next if name_to_index[k] - next unless v = name_to_index[v] - puts "%-40s %3d" % [k + ',', v] - end - output.ifdef :USE_UNICODE_AGE_PROPERTIES do - ages.each do |name| - i += 1 - name = "age=#{name}" - name_to_index[name] = i - puts "%-40s %3d" % [name + ',', i] - end - end - graphemeBreaks.each do |name| - i += 1 - name = "graphemeclusterbreak=#{name.delete('_').downcase}" - name_to_index[name] = i - puts "%-40s %3d" % [name + ',', i] - end - blocks.each do |name| - i += 1 - name = normalize_propname(name) - name_to_index[name] = i - puts "%-40s %3d" % [name + ',', i] - end -end -puts(<<'__HEREDOC') -%% -static int -uniname2ctype(const UChar *name, unsigned int len) -{ - const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len); - if (p) return p->ctype; - return -1; -} -__HEREDOC -versions = $unicode_version.scan(/\d+/) -print("#if defined ONIG_UNICODE_VERSION_STRING && !( \\\n") -%w[MAJOR MINOR TEENY].zip(versions) do |n, v| - print(" ONIG_UNICODE_VERSION_#{n} == #{v} && \\\n") -end -print(" 1)\n") -print("# error ONIG_UNICODE_VERSION_STRING mismatch\n") -print("#endif\n") -print("#define ONIG_UNICODE_VERSION_STRING #{$unicode_version.dump}\n") -%w[MAJOR MINOR TEENY].zip(versions) do |n, v| - print("#define ONIG_UNICODE_VERSION_#{n} #{v}\n") -end - -output.restore - -if header - require 'tempfile' - - NAME2CTYPE = %w[gperf -7 -c -j1 -i1 -t -C -P -T -H uniname2ctype_hash -Q uniname2ctype_pool -N uniname2ctype_p] - - fds = [] - syms = %i[USE_UNICODE_PROPERTIES USE_UNICODE_AGE_PROPERTIES] - begin - fds << (tmp = Tempfile.new(%w"name2ctype .h")) - IO.popen([*NAME2CTYPE, out: tmp], "w") {|f| output.show(f, *syms)} - end while syms.pop - fds.each(&:close) - IO.popen(%W[diff -DUSE_UNICODE_AGE_PROPERTIES #{fds[1].path} #{fds[0].path}], "r") {|age| - IO.popen(%W[diff -DUSE_UNICODE_PROPERTIES #{fds[2].path} -], "r", in: age) {|f| - f.each {|line| - line.gsub!(/\(int\)\((?:long|size_t)\)&\(\(struct uniname2ctype_pool_t \*\)0\)->uniname2ctype_pool_(str\d+),\s+/, - 'uniname2ctype_offset(\1), ') - puts line - } - } - } -end diff --git a/fluent-bit/lib/onigmo/tool/update-doc.py b/fluent-bit/lib/onigmo/tool/update-doc.py deleted file mode 100755 index 4126adff4..000000000 --- a/fluent-bit/lib/onigmo/tool/update-doc.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Usage: -# $ python update-doc.py UCD_DIR > ../doc/UnicodeProps.txt - -from __future__ import print_function -import sys -import os -import re -import datetime - -onig_ver = "6.2.2" -ucddir = "." 
- -def print_list(arr, title): - print() - print("*", title) - for i in arr: - print(" " + i) - -def output_header(): - d = datetime.date.today() - print("Onigmo (Oniguruma-mod) Unicode Properties Version %s %04d/%02d/%02d" - % (onig_ver, d.year, d.month, d.day)) - - posix_brackets = [ - "Alpha", "Blank", "Cntrl", "Digit", "Graph", "Lower", "Print", - "Punct", "Space", "Upper", "XDigit", "Word", "Alnum", "ASCII", - "XPosixPunct" - ] - specials = ["Any", "Assigned"] - - print_list(posix_brackets, "POSIX brackets") - print_list(specials, "Special") - return set(posix_brackets) | set(specials) - -def output_categories(): - categories = set(["LC", "Cn"]) - pattern = re.compile('^.*?;.*?;(..);') - with open(ucddir + os.sep + 'UnicodeData.txt', 'r') as f: - for line in f: - res = pattern.match(line) - if not res: - continue - categories.add(res.group(1)) - categories.add(res.group(1)[0]) # Major category - print_list(sorted(categories), "Major and General Categories") - return categories - -def output_scripts(filename, title, add=[]): - scripts = set(add) - pattern = re.compile('^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (\w+) +# ') - with open(filename, 'r') as f: - for line in f: - res = pattern.match(line) - if not res: - continue - scripts.add(res.group(1)) - print_list(sorted(scripts), title) - return scripts - -def output_aliases(scripts): - aliases = set() - pattern = re.compile('^(\w+) *; (\w+)') - with open(ucddir + os.sep + 'PropertyAliases.txt', 'r') as f: - for line in f: - res = pattern.match(line) - if not res: - continue - if (res.group(2) in scripts) and (res.group(1) not in scripts): - aliases.add(res.group(1)) - print_list(sorted(aliases), "PropertyAliases") - return aliases - -def output_valuealiases(scripts): - scripts |= set(["cntrl", "digit", "punct"]) # exclude them - aliases = list() - aliases_sc = list() - pattern = re.compile('^(gc|sc) ; (\w+) *; (\w+)(?: *; (\w+))?') - with open(ucddir + os.sep + 'PropertyValueAliases.txt', 'r') as f: - for line in f: - res = pattern.match(line) - if not res: - continue - if (res.group(1) == "gc"): - if res.group(2) in scripts: - if res.group(3) not in scripts: - aliases.append(res.group(3)) - if res.group(4) and (res.group(4) not in scripts): - aliases.append(res.group(4)) - else: - if res.group(3) in scripts: - if res.group(2) not in scripts: - aliases_sc.append(res.group(2)) - if res.group(4) and (res.group(4) not in scripts): - aliases_sc.append(res.group(4)) - - print_list(aliases, "PropertyValueAliases (General_Category)") - print_list(aliases_sc, "PropertyValueAliases (Script)") - return set(aliases) | set(aliases_sc) - -def output_ages(): - ages = set() - pattern = re.compile('^[\dA-F.]+ *; ([\d.]+)') - with open(ucddir + os.sep + 'DerivedAge.txt', 'r') as f: - for line in f: - res = pattern.match(line) - if not res: - continue - ages.add("Age=" + res.group(1)) - print_list(sorted(ages), "DerivedAges") - return ages - -def output_blocks(): - blocks = list() - pattern = re.compile('^[\dA-F.]+ *; ([-\w ]+)') - with open(ucddir + os.sep + 'Blocks.txt', 'r') as f: - for line in f: - res = pattern.match(line) - if not res: - continue - blocks.append("In_" + re.sub('\W', '_', res.group(1))) - blocks.append("In_No_Block") - print_list(blocks, "Blocks") - return set(blocks) - -def main(): - global ucddir - if len(sys.argv) > 1: - ucddir = sys.argv[1] - scripts = set() - scripts |= output_header() - scripts |= output_categories() - scripts |= output_scripts(ucddir + os.sep + 'Scripts.txt', 'Scripts', ["Unknown"]) - scripts |= 
output_scripts(ucddir + os.sep + 'DerivedCoreProperties.txt', 'DerivedCoreProperties') - scripts |= output_scripts(ucddir + os.sep + 'PropList.txt', 'PropList') - scripts |= output_scripts(ucddir + os.sep + 'emoji-data.txt', 'Emoji') - output_aliases(scripts) - output_valuealiases(scripts) - output_ages() - output_blocks() - -if __name__ == '__main__': - main() |