summaryrefslogtreecommitdiffstats
path: root/fluent-bit/lib/onigmo/tool
diff options
context:
space:
mode:
Diffstat (limited to 'fluent-bit/lib/onigmo/tool')
-rw-r--r--fluent-bit/lib/onigmo/tool/.gitignore18
-rw-r--r--fluent-bit/lib/onigmo/tool/Makefile48
-rwxr-xr-xfluent-bit/lib/onigmo/tool/case-folding.rb418
-rwxr-xr-xfluent-bit/lib/onigmo/tool/convert-jis-props.sh19
-rwxr-xr-xfluent-bit/lib/onigmo/tool/download-ucd.sh30
-rwxr-xr-xfluent-bit/lib/onigmo/tool/enc-unicode.rb548
-rwxr-xr-xfluent-bit/lib/onigmo/tool/update-doc.py145
7 files changed, 1226 insertions, 0 deletions
diff --git a/fluent-bit/lib/onigmo/tool/.gitignore b/fluent-bit/lib/onigmo/tool/.gitignore
new file mode 100644
index 000000000..981fe8fb6
--- /dev/null
+++ b/fluent-bit/lib/onigmo/tool/.gitignore
@@ -0,0 +1,18 @@
+# ignore UCD files
+Blocks.txt
+CaseFolding.txt
+DerivedAge.txt
+DerivedCoreProperties.txt
+PropList.txt
+PropertyAliases.txt
+PropertyValueAliases.txt
+Scripts.txt
+SpecialCasing.txt
+UnicodeData.txt
+GraphemeBreakProperty.txt
+emoji-data.txt
+
+# ignore generated files
+casefold.h
+name2ctype.h
+name2ctype.kwd
diff --git a/fluent-bit/lib/onigmo/tool/Makefile b/fluent-bit/lib/onigmo/tool/Makefile
new file mode 100644
index 000000000..cca6e732c
--- /dev/null
+++ b/fluent-bit/lib/onigmo/tool/Makefile
@@ -0,0 +1,48 @@
+# Versions of the Unicode / emoji data; $(UNICODE_VERSION) is also the name of
+# the directory that holds the downloaded data files.
+UNICODE_VERSION = 15.0.0
+EMOJI_VERSION = 15.0.0
+
+# Interpreters used by the generator rules. They were previously undefined,
+# which only worked because the scripts are executable and carry shebangs.
+# Override on the command line or from the environment if needed.
+RUBY ?= ruby
+PYTHON ?= python3
+
+PROP_FILES = \
+	$(UNICODE_VERSION)/Blocks.txt \
+	$(UNICODE_VERSION)/DerivedAge.txt \
+	$(UNICODE_VERSION)/DerivedCoreProperties.txt \
+	$(UNICODE_VERSION)/PropertyAliases.txt \
+	$(UNICODE_VERSION)/PropertyValueAliases.txt \
+	$(UNICODE_VERSION)/PropList.txt \
+	$(UNICODE_VERSION)/Scripts.txt \
+	$(UNICODE_VERSION)/UnicodeData.txt \
+	$(UNICODE_VERSION)/auxiliary/GraphemeBreakProperty.txt \
+	$(UNICODE_VERSION)/emoji-data.txt
+
+CASEFOLD_FILES = \
+	$(UNICODE_VERSION)/CaseFolding.txt \
+	$(UNICODE_VERSION)/UnicodeData.txt \
+	$(UNICODE_VERSION)/SpecialCasing.txt
+
+# None of these targets produce a file of the same name.
+.PHONY: update update-unicode-header update-jis-header update-doc download clean
+
+update: update-unicode-header update-jis-header update-doc
+
+update-unicode-header: casefold.h name2ctype.h
+	cp casefold.h name2ctype.h ../enc/unicode
+
+update-jis-header: ../enc/jis/props.kwd
+	cd .. && ./tool/convert-jis-props.sh enc/jis/props.kwd enc/jis/props.h && cd -
+
+update-doc: $(PROP_FILES) update-doc.py
+	$(PYTHON) ./update-doc.py $(UNICODE_VERSION) > ../doc/UnicodeProps.txt
+
+download:
+	./download-ucd.sh $(UNICODE_VERSION) $(EMOJI_VERSION)
+
+
+casefold.h: $(CASEFOLD_FILES) case-folding.rb
+	$(RUBY) ./case-folding.rb -m $(UNICODE_VERSION) -o casefold.h
+
+# Remove the partial output when the generator fails, but still fail the rule:
+# a bare "|| rm -f" would report success and hide the error.
+name2ctype.h: $(PROP_FILES) enc-unicode.rb
+	$(RUBY) ./enc-unicode.rb --header $(UNICODE_VERSION) > name2ctype.h || { rm -f name2ctype.h; exit 1; }
+
+
+clean:
+	-rm -f casefold.h name2ctype.kwd name2ctype.h
+	-rm -f $(PROP_FILES) $(CASEFOLD_FILES)
+	-rm -f GraphemeBreakProperty.txt
+	-rmdir $(UNICODE_VERSION)/auxiliary
+	-rmdir $(UNICODE_VERSION)
diff --git a/fluent-bit/lib/onigmo/tool/case-folding.rb b/fluent-bit/lib/onigmo/tool/case-folding.rb
new file mode 100755
index 000000000..c299074f0
--- /dev/null
+++ b/fluent-bit/lib/onigmo/tool/case-folding.rb
@@ -0,0 +1,418 @@
+#!/usr/bin/ruby
+require 'stringio'
+
+# Usage (for case folding only):
+# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
+# $ ruby case-folding.rb CaseFolding.txt -o casefold.h
+# or (for case folding and case mapping):
+# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
+# $ wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
+# $ wget http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt
+# $ ruby case-folding.rb -m . -o casefold.h
+# using -d or --debug will include UTF-8 characters in comments for debugging
+
+# Loads a CaseFolding.txt-style file and prints the C tables and the
+# gperf-derived lookup functions for case folding / unfolding.
+class CaseFolding
+ # Formatting helpers used by the table printers.
+ module Util
+ module_function
+
+ # Formats every integer in +v+ as "0x%04x" and joins them with ", ".
+ def hex_seq(v)
+ v.map { |i| "0x%04x" % i }.join(", ")
+ end
+
+ # Prints one C initializer row per entry of +data+ (after sorting it) to
+ # +dest+ and returns the sorted array of [key, value] pairs.
+ # +mapping_data+ supplies the per-row flags (and, for CaseUnfold_11, the
+ # upper-case mapping used to order the values).
+ def print_table_1(dest, type, mapping_data, data)
+ for k, v in data = data.sort
+ sk = (Array === k and k.length > 1) ? "{#{hex_seq(k)}}" : ("0x%04x" % k)
+ if type=='CaseUnfold_11' and v.length>1
+ # reorder CaseUnfold_11 entries to avoid special treatment for U+03B9/U+03BC/U+A64B
+ item = mapping_data.map("%04X" % k[0])
+ upper = item.upper if item
+ v = v.sort_by { |i| ("%04X"%i) == upper ? 0 : 1 }
+ end
+ # In debug mode the characters themselves are added as C comments.
+ ck = @debug ? ' /* ' + Array(k).pack("U*") + ' */' : ''
+ cv = @debug ? ' /* ' + Array(v).map{|c|[c].pack("U*")}.join(", ") + ' */' : ''
+ dest.print(" {#{sk}#{ck}, {#{v.length}#{mapping_data.flags(k, type, v)}, {#{hex_seq(v)}#{cv}}}},\n")
+ end
+ data
+ end
+
+ # Prints the "static const <type>_Type <type>_Table[]" definition to +dest+.
+ # +data+ maps a sub-table name to its entries; each sub-table gets a
+ # #define that views its slice of the table (offset +i+, length d.size).
+ # Returns the concatenation of all sorted rows.
+ def print_table(dest, type, mapping_data, data)
+ dest.print("static const #{type}_Type #{type}_Table[] = {\n")
+ i = 0
+ ret = data.inject([]) do |a, (n, d)|
+ dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n")
+ i += d.size
+ a.concat(print_table_1(dest, type, mapping_data, d))
+ end
+ dest.print("};\n\n")
+ ret
+ end
+ end
+
+ include Util
+
+ attr_reader :fold, :fold_locale, :unfold, :unfold_locale, :version
+
+ # Reads +filename+ and fills @fold, @unfold, @fold_locale, @unfold_locale and
+ # @version. Returns self.
+ def load(filename)
+ # code; status (C, F or T); one to three mapped code points
+ pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/
+
+ @fold = fold = {}
+ # unfold[n - 1] holds the reverse mappings whose folded form has n code points
+ @unfold = unfold = [{}, {}, {}]
+ @debug = false
+ @version = nil
+ turkic = []
+
+ IO.foreach(filename, mode: "rb") do |line|
+ # the version is taken from the first line containing "-<version>.txt"
+ @version ||= line[/-([0-9.]+).txt/, 1]
+ next unless res = pattern.match(line)
+ ch_from = res[1].to_i(16)
+
+ if res[2] == 'T'
+ # Turkic case folding
+ turkic << ch_from
+ next
+ end
+
+ # store folding data
+ ch_to = res[3..6].inject([]) do |a, i|
+ break a unless i
+ a << i.to_i(16)
+ end
+ fold[ch_from] = ch_to
+
+ # store unfolding data
+ i = ch_to.length - 1
+ (unfold[i][ch_to] ||= []) << ch_from
+ end
+
+ # move locale dependent data to (un)fold_locale
+ @fold_locale = fold_locale = {}
+ @unfold_locale = unfold_locale = [{}, {}]
+ for ch_from in turkic
+ key = fold[ch_from]
+ i = key.length - 1
+ unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key)
+ fold_locale[ch_from] = fold.delete(ch_from)
+ end
+ self
+ end
+
+ # Returns the C expression that bounds-checks +code+ against the generated
+ # MIN_CODE_VALUE / MAX_CODE_VALUE constants.
+ def range_check(code)
+ "#{code} <= MAX_CODE_VALUE && #{code} >= MIN_CODE_VALUE"
+ end
+
+ # Runs the external gperf command on the keys of +data+ and rewrites the C
+ # source it produces with the regex substitutions below, so that the hash
+ # and lookup functions take code points instead of strings. Returns the
+ # rewritten source as a String.
+ # +key+ names the table (e.g. "CaseFold_11"), +type+ is the C return type.
+ def lookup_hash(key, type, data)
+ hash = "onigenc_unicode_#{key}_hash"
+ lookup = "onigenc_unicode_#{key}_lookup"
+ arity = Array(data[0][0]).size
+ gperf = %W"gperf -7 -k#{[*1..(arity*3)].join(',')} -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup} -n"
+ argname = arity > 1 ? "codes" : "code"
+ argdecl = "const OnigCodePoint #{arity > 1 ? "*": ""}#{argname}"
+ # every code point is fed to gperf as three 7-bit chunks
+ n = 7
+ m = (1 << n) - 1
+ min, max = data.map {|c, *|c}.flatten.minmax
+ src = IO.popen(gperf, "r+") {|f|
+ f << "short\n%%\n"
+ data.each_with_index {|(k, _), i|
+ k = Array(k)
+ ks = k.map {|j| [(j >> n*2) & m, (j >> n) & m, (j) & m]}.flatten.map {|c| "\\x%.2x" % c}.join("")
+ f.printf "\"%s\", ::::/*%s*/ %d\n", ks, k.map {|c| "0x%.4x" % c}.join(","), i
+ }
+ f << "%%\n"
+ f.close_write
+ f.read
+ }
+ # rewrite the hash function: new signature, str[N] accesses become bits_of/bits_at
+ src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
+ name = $1
+ body = $2
+ body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_#{arity > 1 ? 'at' : 'of'}(#{argname}, \\1)")
+ "#{name}(#{argdecl})\n{\n#{body}}"
+ }
+ # rewrite the lookup function: range check instead of length check, and
+ # the word list becomes a list of indexes into <key>_Table
+ src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
+ name = $1
+ body = $2
+ body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1")
+ body.gsub!(/(#{hash})\s*\(.*?\)/, "\\1(#{argname})")
+ body.gsub!(/\{"",-1}/, "-1")
+ body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1')
+ body.sub!(/(\s+if\s)\(len\b.*\)/) do
+ "#$1(" <<
+ (arity > 1 ? (0...arity).map {|i| range_check("#{argname}[#{i}]")}.join(" &&\n ") : range_check(argname)) <<
+ ")"
+ end
+ v = nil
+ body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) {
+ pre = $1
+ indent = $2
+ s = $3
+ s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]')
+ # v is the name of the local variable gperf generated for the word list entry
+ v = $1
+ s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code#{arity}_equal(#{argname}, #{key}_Table[#{v}].from))")
+ "#{pre}{#{s}\n#{indent}}"
+ }
+ body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;")
+ "static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}"
+ }
+ src
+ end
+
+ # Prints the whole generated header to +dest+: banner, Unicode version
+ # check/defines, the four tables with their lookup functions, and finally
+ # whatever mapping_data.specials_output returns.
+ def display(dest, mapping_data)
+ # print the header
+ dest.print("/* DO NOT EDIT THIS FILE. */\n")
+ dest.print("/* Generated by tool/case-folding.rb */\n\n")
+
+ versions = version.scan(/\d+/)
+ dest.print("#if defined ONIG_UNICODE_VERSION_STRING && !( \\\n")
+ %w[MAJOR MINOR TEENY].zip(versions) do |n, v|
+ dest.print(" ONIG_UNICODE_VERSION_#{n} == #{v} && \\\n")
+ end
+ dest.print(" 1)\n")
+ dest.print("# error ONIG_UNICODE_VERSION_STRING mismatch\n")
+ dest.print("#endif\n")
+ dest.print("#define ONIG_UNICODE_VERSION_STRING #{version.dump}\n")
+ %w[MAJOR MINOR TEENY].zip(versions) do |n, v|
+ dest.print("#define ONIG_UNICODE_VERSION_#{n} #{v}\n")
+ end
+ dest.print("\n")
+
+ # print folding data
+
+ # CaseFold + CaseFold_Locale
+ name = "CaseFold_11"
+ data = print_table(dest, name, mapping_data, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale)
+ dest.print lookup_hash(name, "CodePointList3", data)
+
+ # print unfolding data
+
+ # CaseUnfold_11 + CaseUnfold_11_Locale
+ name = "CaseUnfold_11"
+ data = print_table(dest, name, mapping_data, name=>unfold[0], "#{name}_Locale"=>unfold_locale[0])
+ dest.print lookup_hash(name, "CodePointList3", data)
+
+ # CaseUnfold_12 + CaseUnfold_12_Locale
+ name = "CaseUnfold_12"
+ data = print_table(dest, name, mapping_data, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1])
+ dest.print lookup_hash(name, "CodePointList2", data)
+
+ # CaseUnfold_13
+ name = "CaseUnfold_13"
+ data = print_table(dest, name, mapping_data, name=>unfold[2])
+ dest.print lookup_hash(name, "CodePointList2", data)
+
+ # TitleCase
+ dest.print mapping_data.specials_output
+ end
+
+ # Turns on the character comments written by print_table_1.
+ def debug!
+ @debug = true
+ end
+
+ # Convenience constructor: CaseFolding.load(filename) => loaded instance.
+ def self.load(*args)
+ new.load(*args)
+ end
+end
+
+class MapItem
+ attr_accessor :upper, :lower, :title, :code
+
+ def initialize(code, upper, lower, title)
+ @code = code
+ @upper = upper unless upper == ''
+ @lower = lower unless lower == ''
+ @title = title unless title == ''
+ end
+end
+
+# Case-mapping data read from UnicodeData.txt and SpecialCasing.txt. It
+# supplies the flag strings for the CaseFolding tables and collects the
+# "special" mappings that are printed as CaseMappingSpecials[].
+class CaseMapping
+ attr_reader :filename, :version
+
+ # Reads UnicodeData.txt and SpecialCasing.txt from +mapping_directory+.
+ def initialize(mapping_directory)
+ @mappings = {}
+ @specials = []
+ @specials_length = 0
+ @version = nil
+ IO.foreach(File.join(mapping_directory, 'UnicodeData.txt'), mode: "rb") do |line|
+ next if line =~ /^</
+ # fields 13-15 are the upper, lower and title mappings; fields 2-12 are unused
+ code, __1,__2,__3,__4,__5,__6,__7,__8,__9,__10,__11, upper, lower, title = line.chomp.split ';'
+ # Skipped only when all three mappings are present and empty. String#split
+ # drops trailing empty fields, so they are nil here whenever the line ends
+ # in empty fields, and such lines are still stored.
+ unless upper and lower and title and (upper+lower+title)==''
+ @mappings[code] = MapItem.new(code, upper, lower, title)
+ end
+ end
+
+ @filename = File.join(mapping_directory, 'SpecialCasing.txt')
+ IO.foreach(@filename, mode: "rb") do |line|
+ # the version is taken from the first line containing "-<version>.txt"
+ @version ||= line[/-([0-9.]+).txt/, 1]
+ line.chomp!
+ line, comment = line.split(/ *#/)
+ next if not line or line == ''
+ code, lower, title, upper, conditions = line.split(/ *; */)
+ # only unconditional entries override the UnicodeData.txt mappings
+ # NOTE(review): raises NoMethodError if +code+ has no entry in @mappings
+ unless conditions
+ item = @mappings[code]
+ item.lower = lower
+ item.title = title
+ item.upper = upper
+ end
+ end
+ end
+
+ # Returns the MapItem stored for the hex code string +from+, or nil.
+ def map (from)
+ @mappings[from]
+ end
+
+ # Returns the flag suffix (a string of "|X" items, possibly empty) for one
+ # table row. +from+ and +to+ are code points or arrays of code points and
+ # +type+ is the table name. Side effect: any special mappings found are
+ # appended to @specials and @specials_length is advanced by their total
+ # number of code points.
+ def flags(from, type, to)
+ # types: CaseFold_11, CaseUnfold_11, CaseUnfold_12, CaseUnfold_13
+ flags = ""
+ # both sides are compared as space-separated upper-case hex strings
+ from = Array(from).map {|i| "%04X" % i}.join(" ")
+ to = Array(to).map {|i| "%04X" % i}.join(" ")
+ item = map(from)
+ specials = []
+ case type
+ when 'CaseFold_11'
+ flags += '|F'
+ if item
+ flags += '|U' if to==item.upper
+ flags += '|D' if to==item.lower
+ unless item.upper == item.title
+ if item.code == item.title
+ flags += '|IT'
+ swap = case item.code
+ when '01C5' then '0064 017D'
+ when '01C8' then '006C 004A'
+ when '01CB' then '006E 004A'
+ when '01F2' then '0064 005A'
+ else # Greek
+ to.split(' ').first + ' 0399'
+ end
+ specials << swap
+ else
+ flags += '|ST'
+ specials << item.title
+ end
+ end
+ unless item.lower.nil? or item.lower==from or item.lower==to
+ specials << item.lower
+ flags += '|SL'
+ end
+ unless item.upper.nil? or item.upper==from or item.upper==to
+ specials << item.upper
+ flags += '|SU'
+ end
+ end
+ when 'CaseUnfold_11'
+ to = to.split(/ /)
+ if item
+ case to.first
+ when item.upper then flags += '|U'
+ when item.lower then flags += '|D'
+ else
+ raise "Unpredicted case 0 in enc/unicode/case_folding.rb. Please contact https://bugs.ruby-lang.org/."
+ end
+ unless item.upper == item.title
+ if item.code == item.title
+ flags += '|IT' # was unpredicted case 1
+ elsif item.title==to[1]
+ flags += '|ST'
+ else
+ raise "Unpredicted case 2 in enc/unicode/case_folding.rb. Please contact https://bugs.ruby-lang.org/."
+ end
+ end
+ end
+ end
+ unless specials.empty?
+ # I(n) records the offset of this row's specials
+ flags += "|I(#{@specials_length})"
+ @specials_length += specials.map { |s| s.split(/ /).length }.reduce(:+)
+ @specials << specials
+ end
+ flags
+ end
+
+ # Turns on the character comments written by specials_output.
+ def debug!
+ @debug = true
+ end
+
+ # Returns the C definition of CaseMappingSpecials[] built from the specials
+ # collected by #flags: one line per row, each special as "L(n)|0x..., 0x...,".
+ def specials_output
+ "static const OnigCodePoint CaseMappingSpecials[] = {\n" +
+ @specials.map do |sps|
+ ' ' + sps.map do |sp|
+ chars = sp.split(/ /)
+ ct = ' /* ' + Array(chars).map{|c|[c.to_i(16)].pack("U*")}.join(", ") + ' */' if @debug
+ " L(#{chars.length})|#{chars.map {|c| "0x"+c }.join(', ')}#{ct},"
+ end.join + "\n"
+ end.join + "};\n"
+ end
+
+ # Convenience constructor: CaseMapping.load(dir) is CaseMapping.new(dir).
+ def self.load(*args)
+ new(*args)
+ end
+end
+
+class CaseMappingDummy
+ def flags(from, type, to)
+ ""
+ end
+
+ def titlecase_output() '' end
+ def debug!() end
+end
+
+if $0 == __FILE__
+ require 'optparse'
+ dest = nil
+ mapping_directory = nil
+ mapping_data = nil
+ debug = false
+ fold_1 = false
+ ARGV.options do |opt|
+ opt.banner << " [INPUT]"
+ opt.on("--output-file=FILE", "-o", "output to the FILE instead of STDOUT") {|output|
+ dest = (output unless output == '-')
+ }
+ opt.on('--mapping-data-directory=DIRECTORY', '-m', 'data DIRECTORY of mapping files') { |directory|
+ mapping_directory = directory
+ }
+ opt.on('--debug', '-d') {
+ debug = true
+ }
+ opt.parse!
+ abort(opt.to_s) if ARGV.size > 1
+ end
+ if mapping_directory
+ if ARGV[0]
+ warn "Either specify directory or individual file, but not both."
+ exit
+ end
+ filename = File.join(mapping_directory, 'CaseFolding.txt')
+ mapping_data = CaseMapping.load(mapping_directory)
+ end
+ filename ||= ARGV[0] || 'CaseFolding.txt'
+ data = CaseFolding.load(filename)
+ if mapping_data and data.version != mapping_data.version
+ abort "Unicode data version mismatch\n" \
+ " #{filename} = #{data.version}\n" \
+ " #{mapping_data.filename} = #{mapping_data.version}"
+ end
+ mapping_data ||= CaseMappingDummy.new
+
+ if debug
+ data.debug!
+ mapping_data.debug!
+ end
+ f = StringIO.new
+ begin
+ data.display(f, mapping_data)
+ rescue Errno::ENOENT => e
+ raise unless /gperf/ =~ e.message
+ warn e.message
+ abort unless dest
+ File.utime(nil, nil, dest) # assume existing file is OK
+ exit
+ else
+ s = f.string
+ end
+ if dest
+ open(dest, "wb") do |file|
+ file.print(s)
+ end
+ else
+ STDOUT.print(s)
+ end
+end
diff --git a/fluent-bit/lib/onigmo/tool/convert-jis-props.sh b/fluent-bit/lib/onigmo/tool/convert-jis-props.sh
new file mode 100755
index 000000000..476a5a532
--- /dev/null
+++ b/fluent-bit/lib/onigmo/tool/convert-jis-props.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+# Convert props.kwd to props.h using GNU gperf.
+#
+# Usage:
+#   ./tool/convert-jis-props.sh enc/jis/props.kwd enc/jis/props.h
+
+if [ $# -ne 2 ]; then
+  echo "usage: $0 PROPS_KWD PROPS_H" >&2
+  exit 1
+fi
+
+# Major and minor gperf version as a zero-padded four-digit number.
+GPERF_VERSION=`gperf -v | head -n1 | sed -e 's/^GNU gperf \([0-9]\+\)\.\([0-9]\+.*\)$/\1 \2/' | xargs printf '%02d%02d'`
+if [ "$GPERF_VERSION" -ge '0301' ]; then
+  # static const struct enc_property *onig_jis_property(const char *str, unsigned int len);
+  GPERF_REPLACE='s/\(onig_jis_property([^,]\+, \).\+\( len)\)/\1size_t\2/'
+else
+  # a sed script consisting of a comment only: pass the input through unchanged
+  GPERF_REPLACE='#'
+fi
+
+JIS_PROPS_OPTIONS='-k1,3 -7 -c -j1 -i1 -t -C -P -t --ignore-case -H onig_jis_property_hash -Q onig_jis_property_pool -N onig_jis_property'
+
+# $JIS_PROPS_OPTIONS is left unquoted on purpose so that it splits into
+# separate options; the file names are quoted so paths with spaces work.
+gperf $JIS_PROPS_OPTIONS "$1" | sed "$GPERF_REPLACE" | \
+  sed 's/(int)(\(long\|size_t\))&((\([a-zA-Z_0-9 ]*[a-zA-Z_0-9]\) *\*)0)->\([a-zA-Z0-9_]*\),/(char)offsetof(\2, \3),/g' > "$2"
diff --git a/fluent-bit/lib/onigmo/tool/download-ucd.sh b/fluent-bit/lib/onigmo/tool/download-ucd.sh
new file mode 100755
index 000000000..b6b46581f
--- /dev/null
+++ b/fluent-bit/lib/onigmo/tool/download-ucd.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Download the Unicode Character Database files needed by the generators into
+# a directory named after the Unicode version.
+
+files='Blocks.txt CaseFolding.txt DerivedAge.txt DerivedCoreProperties.txt PropertyAliases.txt PropertyValueAliases.txt PropList.txt Scripts.txt SpecialCasing.txt UnicodeData.txt auxiliary/GraphemeBreakProperty.txt'
+emoji_files='emoji-data.txt'
+
+if [ -z "$1" ] || [ -z "$2" ]; then
+  echo "usage: $0 UNICODE_VERSION EMOJI_VERSION"
+  exit 1
+fi
+UNICODE_VERSION=$1
+EMOJI_VERSION=$2
+
+# remove old files
+if [ -d "$UNICODE_VERSION" ]; then
+  # never run the rm commands below in the wrong directory
+  cd "$UNICODE_VERSION" || exit 1
+  rm -f $files $emoji_files
+  rm -f GraphemeBreakProperty.txt
+  cd - || exit 1
+fi
+
+mkdir -p "$UNICODE_VERSION/auxiliary" || exit 1
+# abort instead of downloading into the current directory
+cd "$UNICODE_VERSION" || exit 1
+
+for i in $files; do
+  echo "http://www.unicode.org/Public/${UNICODE_VERSION}/ucd/$i"
+done | xargs wget
+# wget saves every file under its base name; move this one to its place
+mv GraphemeBreakProperty.txt auxiliary
+for i in $emoji_files; do
+  echo "http://www.unicode.org/Public/${EMOJI_VERSION}/ucd/emoji/$i"
+done | xargs wget
diff --git a/fluent-bit/lib/onigmo/tool/enc-unicode.rb b/fluent-bit/lib/onigmo/tool/enc-unicode.rb
new file mode 100755
index 000000000..84f494e8d
--- /dev/null
+++ b/fluent-bit/lib/onigmo/tool/enc-unicode.rb
@@ -0,0 +1,548 @@
+#!/usr/bin/env ruby
+
+# Creates the data structures needed by Oniguruma to map Unicode codepoints to
+# property names and POSIX character classes
+#
+# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
+# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
+# DerivedAge.txt and Blocks.txt from unicode.org.
+# (http://unicode.org/Public/UNIDATA/) And run following command.
+# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
+# You can get source file for gperf. After this, simply make ruby.
+
+if ARGV[0] == "--header"
+ header = true
+ ARGV.shift
+end
+unless ARGV.size == 1
+ abort "Usage: #{$0} data_directory"
+end
+
+$unicode_version = File.basename(ARGV[0])[/\A[.\d]+\z/]
+
+POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print XPosixPunct Space Upper XDigit Word Alnum ASCII Punct]
+
+GPERF_VERSION = `gperf -v`.split("\n").first # /^GNU gperf (.+)$/
+ .split.last
+
+def pair_codepoints(codepoints)
+
+ # We have a sorted Array of codepoints that we wish to partition into
+ # ranges such that the start- and endpoints form an inclusive set of
+ # codepoints with property _property_. Note: It is intended that some ranges
+ # will begin with the value with which they end, e.g. 0x0020 -> 0x0020
+
+ codepoints.sort!
+ last_cp = codepoints.first
+ pairs = [[last_cp, nil]]
+ codepoints[1..-1].each do |codepoint|
+ next if last_cp == codepoint
+
+ # If the current codepoint does not follow directly on from the last
+ # codepoint, the last codepoint represents the end of the current range,
+ # and the current codepoint represents the start of the next range.
+ if last_cp.next != codepoint
+ pairs[-1][-1] = last_cp
+ pairs << [codepoint, nil]
+ end
+ last_cp = codepoint
+ end
+
+ # The final pair has as its endpoint the last codepoint for this property
+ pairs[-1][-1] = codepoints.last
+ pairs
+end
+
+# Reads UnicodeData.txt from +file+ and returns [gcps, data]: +data+ maps a
+# property name to an Array of code points, and +gcps+ is the sorted list of
+# its keys without the POSIX_NAMES entries.
+def parse_unicode_data(file)
+ last_cp = 0
+ data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
+ 'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
+ beg_cp = nil
+ IO.foreach(file) do |line|
+ fields = line.split(';')
+ cp = fields[0].to_i(16)
+
+ # "<..., First>" / "<..., Last>" line pairs describe a whole range
+ case fields[1]
+ when /\A<(.*),\s*First>\z/
+ beg_cp = cp
+ next
+ when /\A<(.*),\s*Last>\z/
+ cps = (beg_cp..cp).to_a
+ else
+ beg_cp = cp
+ cps = [cp]
+ end
+
+ # The Cn category represents unassigned characters. These are not listed in
+ # UnicodeData.txt so we must derive them by looking for 'holes' in the range
+ # of listed codepoints. We increment the last codepoint seen and compare it
+ # with the current codepoint. If the current codepoint is less than
+ # last_cp.next we have found a hole, so we add the missing codepoint to the
+ # Cn category.
+ data['Cn'].concat((last_cp.next...beg_cp).to_a)
+
+ # Assigned - Defined in unicode.c; interpreted as every character in the
+ # Unicode range minus the unassigned characters
+ data['Assigned'].concat(cps)
+
+ # The third field denotes the 'General' category, e.g. Lu
+ (data[fields[2]] ||= []).concat(cps)
+
+ # The 'Major' category is the first letter of the 'General' category, e.g.
+ # 'Lu' -> 'L'
+ (data[fields[2][0,1]] ||= []).concat(cps)
+ last_cp = cp
+ end
+
+ # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
+ # codepoints to Cn and C
+ cn_remainder = (last_cp.next..0x10ffff).to_a
+ data['Cn'] += cn_remainder
+ data['C'] += data['Cn']
+
+ # Special case for LC (Cased_Letter). LC = Ll + Lt + Lu
+ data['LC'] = data['Ll'] + data['Lt'] + data['Lu']
+
+ # Define General Category properties
+ gcps = data.keys.sort - POSIX_NAMES
+
+ # Returns General Category Property names and the data
+ [gcps, data]
+end
+
+# Adds the POSIX bracket classes to +data+, derived from properties that are
+# already present in it. Plain assignments (e.g. 'Alpha') share the Array
+# object of the source property; the others build a new Array.
+def define_posix_props(data)
+ # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
+ #
+
+ data['Alpha'] = data['Alphabetic']
+ data['Upper'] = data['Uppercase']
+ data['Lower'] = data['Lowercase']
+ data['Punct'] = data['Punctuation']
+ # Punctuation plus the ASCII characters $ + < = > ^ ` | ~
+ data['XPosixPunct'] = data['Punctuation'] + [0x24, 0x2b, 0x3c, 0x3d, 0x3e, 0x5e, 0x60, 0x7c, 0x7e]
+ data['Digit'] = data['Decimal_Number']
+ # 0-9, A-F, a-f
+ data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
+ (0x0061..0x0066).to_a
+ data['Alnum'] = data['Alpha'] + data['Digit']
+ data['Space'] = data['White_Space']
+ data['Blank'] = data['Space_Separator'] + [0x0009]
+ data['Cntrl'] = data['Cc']
+ data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
+ data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
+ data['Surrogate'] - data['Unassigned']
+ data['Print'] = data['Graph'] + data['Space_Separator']
+end
+
+# Reads the four property files listed below. For every property it stores
+# the code points in +data+ and the file's title in +categories+. Returns the
+# names of all properties read, plus 'Unknown'.
+def parse_scripts(data, categories)
+ files = [
+ {:fn => 'DerivedCoreProperties.txt', :title => 'Derived Property'},
+ {:fn => 'Scripts.txt', :title => 'Script'},
+ {:fn => 'PropList.txt', :title => 'Binary Property'},
+ {:fn => 'emoji-data.txt', :title => 'Emoji'}
+ ]
+ current = nil
+ cps = []
+ names = {}
+ files.each do |file|
+ data_foreach(file[:fn]) do |line|
+ # a "# Total ..." line closes the property whose lines were just read
+ if /^# Total (?:code points|elements): / =~ line
+ data[current] = cps
+ categories[current] = file[:title]
+ (names[file[:title]] ||= []) << current
+ cps = []
+ elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
+ # "XXXX ; Name" or "XXXX..YYYY ; Name"
+ current = $3
+ $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
+ end
+ end
+ end
+ # All code points not explicitly listed for Script
+ # have the value Unknown (Zzzz).
+ data['Unknown'] = (0..0x10ffff).to_a - data.values_at(*names['Script']).flatten
+ categories['Unknown'] = 'Script'
+ names.values.flatten << 'Unknown'
+end
+
+def parse_aliases(data)
+ kv = {}
+ data_foreach('PropertyAliases.txt') do |line|
+ next unless /^(\w+)\s*; (\w+)/ =~ line
+ data[$1] = data[$2]
+ kv[normalize_propname($1)] = normalize_propname($2)
+ end
+ data_foreach('PropertyValueAliases.txt') do |line|
+ next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
+ if $1 == 'gc'
+ data[$3] = data[$2]
+ data[$4] = data[$2]
+ kv[normalize_propname($3)] = normalize_propname($2)
+ kv[normalize_propname($4)] = normalize_propname($2) if $4
+ else
+ data[$2] = data[$3]
+ data[$4] = data[$3]
+ kv[normalize_propname($2)] = normalize_propname($3)
+ kv[normalize_propname($4)] = normalize_propname($3) if $4
+ end
+ end
+ kv
+end
+
+# According to Unicode6.0.0/ch03.pdf, Section 3.1, "An update version
+# never involves any additions to the character repertoire." Versions
+# in DerivedAge.txt should always be /\d+\.\d+/
+# Reads DerivedAge.txt, stores the code points of every age in +data+ under
+# its Age_x_y name, prints the matching constant with make_const, and returns
+# the list of age strings in file order.
+def parse_age(data)
+ current = nil
+ last_constname = nil
+ cps = []
+ ages = []
+ data_foreach('DerivedAge.txt') do |line|
+ # a "# Total code points:" line closes the age whose lines were just read
+ if /^# Total code points: / =~ line
+ constname = constantize_agename(current)
+ # each version matches all previous versions
+ cps.concat(data[last_constname]) if last_constname
+ data[constname] = cps
+ make_const(constname, cps, "Derived Age #{current}")
+ ages << current
+ last_constname = constname
+ cps = []
+ elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line
+ # "XXXX ; major.minor" or "XXXX..YYYY ; major.minor"
+ current = $3
+ $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
+ end
+ end
+ ages
+end
+
+def parse_GraphemeBreakProperty(data)
+ current = nil
+ cps = []
+ ages = []
+ data_foreach('auxiliary/GraphemeBreakProperty.txt') do |line|
+ if /^# Total code points: / =~ line
+ constname = constantize_Grapheme_Cluster_Break(current)
+ data[constname] = cps
+ make_const(constname, cps, "Grapheme_Cluster_Break=#{current}")
+ ages << current
+ cps = []
+ elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
+ current = $3
+ $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
+ end
+ end
+ ages
+end
+
+def parse_block(data)
+ cps = []
+ blocks = []
+ data_foreach('Blocks.txt') do |line|
+ if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line
+ cps = ($1.to_i(16)..$2.to_i(16)).to_a
+ constname = constantize_blockname($3)
+ data[constname] = cps
+ make_const(constname, cps, "Block")
+ blocks << constname
+ end
+ end
+
+ # All code points not belonging to any of the named blocks
+ # have the value No_Block.
+ no_block = (0..0x10ffff).to_a - data.values_at(*blocks).flatten
+ constname = constantize_blockname("No_Block")
+ make_const(constname, no_block, "Block")
+ blocks << constname
+end
+
+# shim for Ruby 1.8
+# Where Hash#key is missing, define it as an alias of Hash#index;
+# make_const relies on Hash#key.
+unless {}.respond_to?(:key)
+ class Hash
+ alias key index
+ end
+end
+
+# Maps the name of every constant already printed to its code point Array, so
+# that a later property with equal data becomes a #define alias.
+$const_cache = {}
+# make_const(property, pairs, name): Prints a 'static const' structure for a
+# given property, group of paired codepoints, and a human-friendly name for
+# the group
+def make_const(prop, data, name)
+ if name.empty?
+ puts "\n/* '#{prop}' */"
+ else
+ puts "\n/* '#{prop}': #{name} */"
+ end
+ # reuse an earlier constant when its data compares equal to this one
+ if origprop = $const_cache.key(data)
+ puts "#define CR_#{prop} CR_#{origprop}"
+ else
+ $const_cache[prop] = data
+ # note: pair_codepoints sorts +data+ in place
+ pairs = pair_codepoints(data)
+ puts "static const OnigCodePoint CR_#{prop}[] = {"
+ # The first element of the constant is the number of pairs of codepoints
+ puts "\t#{pairs.size},"
+ pairs.each do |pair|
+ pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }
+ puts "\t#{pair.first}, #{pair.last},"
+ end
+ puts "}; /* CR_#{prop} */"
+ end
+end
+
+def normalize_propname(name)
+ name = name.downcase
+ name.delete!('- _')
+ name
+end
+
+def constantize_agename(name)
+ "Age_#{name.sub(/\./, '_')}"
+end
+
+def constantize_Grapheme_Cluster_Break(name)
+ "Grapheme_Cluster_Break_#{name}"
+end
+
+def constantize_blockname(name)
+ "In_#{name.gsub(/\W/, '_')}"
+end
+
+def get_file(name)
+ File.join(ARGV[0], name)
+end
+
+# Opens the data file +name+, checks the Unicode version in its first line
+# and yields every following line to the block. The first line itself is
+# never yielded.
+#
+# Raises ArgumentError when the first line carries no version, or a version
+# different from $unicode_version. Files whose name starts with "emoji-" are
+# exempt from the check.
+def data_foreach(name, &block)
+ fn = get_file(name)
+ warn "Reading #{name}"
+ # e.g. "Blocks.txt" => /^# Blocks-([\d.]+)\.txt/
+ pat = /^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/
+ File.open(fn, 'rb') do |f|
+ line = f.gets
+ unless /^emoji-/ =~ name
+ unless pat =~ line
+ raise ArgumentError, "#{name}: no Unicode version"
+ end
+ # the first version seen becomes the reference for all later files
+ if !$unicode_version
+ $unicode_version = $1
+ elsif $unicode_version != $1
+ raise ArgumentError, "#{name}: Unicode version mismatch: #$1"
+ end
+ end
+ f.each(&block)
+ end
+end
+
+# Write Data
+# Replacement for $stdout. With kwdonly set, everything is passed straight
+# through to the real stream, with #ifdef/#endif lines around conditional
+# parts. Otherwise the output is collected as nested Arrays, one nested Array
+# per ifdef whose first element is the symbol, so that #show can later print
+# it with a chosen set of symbols enabled.
+class Unifdef
+ attr_accessor :output, :top, :stack, :stdout, :kwdonly
+ # Installs the new object as $stdout and keeps +out+ as the real stream.
+ def initialize(out)
+ @top = @output = []
+ @stack = []
+ $stdout, @stdout = self, out
+ end
+ # Puts the real stream back into $stdout.
+ def restore
+ $stdout = @stdout
+ end
+ # Opens a section that depends on +sym+. When a block is given the section
+ # is closed again after the block, and the block's value is returned.
+ def ifdef(sym)
+ if @kwdonly
+ @stdout.puts "#ifdef #{sym}"
+ else
+ @stack << @top
+ @top << tmp = [sym]
+ @top = tmp
+ end
+ if block_given?
+ begin
+ return yield
+ ensure
+ endif(sym)
+ end
+ end
+ end
+ # Closes the section opened by ifdef(+sym+). In collecting mode it raises
+ # ArgumentError (after restoring $stdout) if +sym+ is not the innermost
+ # open section.
+ def endif(sym)
+ if @kwdonly
+ @stdout.puts "#endif /* #{sym} */"
+ else
+ unless sym == @top[0]
+ restore
+ raise ArgumentError, "#{sym} unmatch to #{@top[0]}"
+ end
+ @top = @stack.pop
+ end
+ end
+ # Prints the collected output to +dest+, leaving out every section whose
+ # symbol is not listed in +syms+.
+ def show(dest, *syms)
+ _show(dest, @output, syms)
+ end
+ # Recursive worker for #show.
+ def _show(dest, ary, syms)
+ # a leading Symbol marks +ary+ as a conditional section
+ if Symbol === (sym = ary[0])
+ unless syms.include?(sym)
+ return
+ end
+ end
+ ary.each do |e|
+ case e
+ when Array
+ _show(dest, e, syms)
+ when String
+ dest.print e
+ end
+ end
+ end
+ # IO-compatible write: passes +str+ through or collects it. Returns self.
+ def write(str)
+ if @kwdonly
+ @stdout.write(str)
+ else
+ @top << str
+ end
+ self
+ end
+ alias << write
+end
+
+# From here on everything written with puts/print goes through Unifdef.
+output = Unifdef.new($stdout)
+output.kwdonly = !header
+
+# Declarations section of the gperf input: the CR_* code range constants.
+puts '%{'
+props, data = parse_unicode_data(get_file('UnicodeData.txt'))
+categories = {}
+props.concat parse_scripts(data, categories)
+aliases = parse_aliases(data)
+ages = blocks = graphemeBreaks = nil
+define_posix_props(data)
+POSIX_NAMES.each do |name|
+ if name == 'XPosixPunct'
+ make_const(name, data[name], "[[:Punct:]]")
+ elsif name == 'Punct'
+ make_const(name, data[name], "")
+ else
+ make_const(name, data[name], "[[:#{name}:]]")
+ end
+end
+output.ifdef :USE_UNICODE_PROPERTIES do
+ props.each do |name|
+ # properties without a recorded category are labelled by name length
+ category = categories[name] ||
+ case name.size
+ when 1 then 'Major Category'
+ when 2 then 'General Category'
+ else '-'
+ end
+ make_const(name, data[name], category)
+ end
+ output.ifdef :USE_UNICODE_AGE_PROPERTIES do
+ ages = parse_age(data)
+ end
+ graphemeBreaks = parse_GraphemeBreakProperty(data)
+ blocks = parse_block(data)
+end
+# The CodeRanges[] table lists the constants in the same order in which the
+# keyword list below numbers them.
+puts(<<'__HEREDOC')
+
+static const OnigCodePoint* const CodeRanges[] = {
+__HEREDOC
+POSIX_NAMES.each{|name|puts" CR_#{name},"}
+output.ifdef :USE_UNICODE_PROPERTIES do
+ props.each{|name| puts" CR_#{name},"}
+ output.ifdef :USE_UNICODE_AGE_PROPERTIES do
+ ages.each{|name| puts" CR_#{constantize_agename(name)},"}
+ end
+ graphemeBreaks.each{|name| puts" CR_#{constantize_Grapheme_Cluster_Break(name)},"}
+ blocks.each{|name|puts" CR_#{name},"}
+end
+
+# NOTE(review): GPERF_VERSION >= '3.1' below is a String comparison, not a
+# numeric version comparison.
+puts(<<"__HEREDOC")
+};
+struct uniname2ctype_struct {
+ short name;
+ unsigned short ctype;
+};
+#define uniname2ctype_offset(str) offsetof(struct uniname2ctype_pool_t, uniname2ctype_pool_##str)
+
+static const struct uniname2ctype_struct *uniname2ctype_p(const char *, #{ GPERF_VERSION >= '3.1' ? 'size_t' : 'unsigned int' });
+%}
+struct uniname2ctype_struct;
+%%
+__HEREDOC
+
+# Keywords section of the gperf input: one "name, index" line per property,
+# where the index is the position of its constant in CodeRanges[].
+i = -1
+name_to_index = {}
+POSIX_NAMES.each do |name|
+ i += 1
+ # NEWLINE uses up an index but gets no keyword
+ next if name == 'NEWLINE'
+ name = normalize_propname(name)
+ name_to_index[name] = i
+ puts"%-40s %3d" % [name + ',', i]
+end
+output.ifdef :USE_UNICODE_PROPERTIES do
+ props.each do |name|
+ i += 1
+ name = normalize_propname(name)
+ name_to_index[name] = i
+ puts "%-40s %3d" % [name + ',', i]
+ end
+ # an alias is printed with the index of the property it stands for; aliases
+ # that are already listed, or whose target is not listed, are skipped
+ aliases.each_pair do |k, v|
+ next if name_to_index[k]
+ next unless v = name_to_index[v]
+ puts "%-40s %3d" % [k + ',', v]
+ end
+ output.ifdef :USE_UNICODE_AGE_PROPERTIES do
+ ages.each do |name|
+ i += 1
+ name = "age=#{name}"
+ name_to_index[name] = i
+ puts "%-40s %3d" % [name + ',', i]
+ end
+ end
+ graphemeBreaks.each do |name|
+ i += 1
+ name = "graphemeclusterbreak=#{name.delete('_').downcase}"
+ name_to_index[name] = i
+ puts "%-40s %3d" % [name + ',', i]
+ end
+ blocks.each do |name|
+ i += 1
+ name = normalize_propname(name)
+ name_to_index[name] = i
+ puts "%-40s %3d" % [name + ',', i]
+ end
+end
+# Functions section of the gperf input, followed by the Unicode version
+# check and defines.
+puts(<<'__HEREDOC')
+%%
+static int
+uniname2ctype(const UChar *name, unsigned int len)
+{
+ const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
+ if (p) return p->ctype;
+ return -1;
+}
+__HEREDOC
+versions = $unicode_version.scan(/\d+/)
+print("#if defined ONIG_UNICODE_VERSION_STRING && !( \\\n")
+%w[MAJOR MINOR TEENY].zip(versions) do |n, v|
+ print(" ONIG_UNICODE_VERSION_#{n} == #{v} && \\\n")
+end
+print(" 1)\n")
+print("# error ONIG_UNICODE_VERSION_STRING mismatch\n")
+print("#endif\n")
+print("#define ONIG_UNICODE_VERSION_STRING #{$unicode_version.dump}\n")
+%w[MAJOR MINOR TEENY].zip(versions) do |n, v|
+ print("#define ONIG_UNICODE_VERSION_#{n} #{v}\n")
+end
+
+# Back to the real $stdout; the collected output stays in +output+.
+output.restore
+
+# Header mode: run gperf over the collected output three times (both symbols
+# enabled, then only USE_UNICODE_PROPERTIES, then none) and merge the three
+# results into one header with `diff -D`.
+if header
+ require 'tempfile'
+
+ NAME2CTYPE = %w[gperf -7 -c -j1 -i1 -t -C -P -T -H uniname2ctype_hash -Q uniname2ctype_pool -N uniname2ctype_p]
+
+ fds = []
+ syms = %i[USE_UNICODE_PROPERTIES USE_UNICODE_AGE_PROPERTIES]
+ # the loop body runs once more after each successful pop, three times in all
+ begin
+ fds << (tmp = Tempfile.new(%w"name2ctype .h"))
+ IO.popen([*NAME2CTYPE, out: tmp], "w") {|f| output.show(f, *syms)}
+ end while syms.pop
+ fds.each(&:close)
+ # fds[0]: both symbols, fds[1]: USE_UNICODE_PROPERTIES only, fds[2]: none
+ IO.popen(%W[diff -DUSE_UNICODE_AGE_PROPERTIES #{fds[1].path} #{fds[0].path}], "r") {|age|
+ IO.popen(%W[diff -DUSE_UNICODE_PROPERTIES #{fds[2].path} -], "r", in: age) {|f|
+ f.each {|line|
+ # replace the generated offset expression by the uniname2ctype_offset() macro
+ line.gsub!(/\(int\)\((?:long|size_t)\)&\(\(struct uniname2ctype_pool_t \*\)0\)->uniname2ctype_pool_(str\d+),\s+/,
+ 'uniname2ctype_offset(\1), ')
+ puts line
+ }
+ }
+ }
+end
diff --git a/fluent-bit/lib/onigmo/tool/update-doc.py b/fluent-bit/lib/onigmo/tool/update-doc.py
new file mode 100755
index 000000000..4126adff4
--- /dev/null
+++ b/fluent-bit/lib/onigmo/tool/update-doc.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Usage:
+# $ python update-doc.py UCD_DIR > ../doc/UnicodeProps.txt
+
+from __future__ import print_function
+import sys
+import os
+import re
+import datetime
+
+onig_ver = "6.2.2"  # Version string inserted into the banner line printed by output_header().
+ucddir = "."  # Directory holding the UCD text files; main() replaces it with sys.argv[1] when an argument is given.
+
+def print_list(arr, title):
+ print()
+ print("*", title)
+ for i in arr:
+ print(" " + i)
+
+def output_header():
+ d = datetime.date.today()
+ print("Onigmo (Oniguruma-mod) Unicode Properties Version %s %04d/%02d/%02d"
+ % (onig_ver, d.year, d.month, d.day))
+
+ posix_brackets = [
+ "Alpha", "Blank", "Cntrl", "Digit", "Graph", "Lower", "Print",
+ "Punct", "Space", "Upper", "XDigit", "Word", "Alnum", "ASCII",
+ "XPosixPunct"
+ ]
+ specials = ["Any", "Assigned"]
+
+ print_list(posix_brackets, "POSIX brackets")
+ print_list(specials, "Special")
+ return set(posix_brackets) | set(specials)
+
+def output_categories():
+ categories = set(["LC", "Cn"])
+ pattern = re.compile('^.*?;.*?;(..);')
+ with open(ucddir + os.sep + 'UnicodeData.txt', 'r') as f:
+ for line in f:
+ res = pattern.match(line)
+ if not res:
+ continue
+ categories.add(res.group(1))
+ categories.add(res.group(1)[0]) # Major category
+ print_list(sorted(categories), "Major and General Categories")
+ return categories
+
+def output_scripts(filename, title, add=[]):
+ scripts = set(add)
+ pattern = re.compile('^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (\w+) +# ')
+ with open(filename, 'r') as f:
+ for line in f:
+ res = pattern.match(line)
+ if not res:
+ continue
+ scripts.add(res.group(1))
+ print_list(sorted(scripts), title)
+ return scripts
+
+def output_aliases(scripts):
+ aliases = set()
+ pattern = re.compile('^(\w+) *; (\w+)')
+ with open(ucddir + os.sep + 'PropertyAliases.txt', 'r') as f:
+ for line in f:
+ res = pattern.match(line)
+ if not res:
+ continue
+ if (res.group(2) in scripts) and (res.group(1) not in scripts):
+ aliases.add(res.group(1))
+ print_list(sorted(aliases), "PropertyAliases")
+ return aliases
+
+def output_valuealiases(scripts):
+ scripts |= set(["cntrl", "digit", "punct"]) # exclude them
+ aliases = list()
+ aliases_sc = list()
+ pattern = re.compile('^(gc|sc) ; (\w+) *; (\w+)(?: *; (\w+))?')
+ with open(ucddir + os.sep + 'PropertyValueAliases.txt', 'r') as f:
+ for line in f:
+ res = pattern.match(line)
+ if not res:
+ continue
+ if (res.group(1) == "gc"):
+ if res.group(2) in scripts:
+ if res.group(3) not in scripts:
+ aliases.append(res.group(3))
+ if res.group(4) and (res.group(4) not in scripts):
+ aliases.append(res.group(4))
+ else:
+ if res.group(3) in scripts:
+ if res.group(2) not in scripts:
+ aliases_sc.append(res.group(2))
+ if res.group(4) and (res.group(4) not in scripts):
+ aliases_sc.append(res.group(4))
+
+ print_list(aliases, "PropertyValueAliases (General_Category)")
+ print_list(aliases_sc, "PropertyValueAliases (Script)")
+ return set(aliases) | set(aliases_sc)
+
+def output_ages():
+ ages = set()
+ pattern = re.compile('^[\dA-F.]+ *; ([\d.]+)')
+ with open(ucddir + os.sep + 'DerivedAge.txt', 'r') as f:
+ for line in f:
+ res = pattern.match(line)
+ if not res:
+ continue
+ ages.add("Age=" + res.group(1))
+ print_list(sorted(ages), "DerivedAges")
+ return ages
+
+def output_blocks():
+ blocks = list()
+ pattern = re.compile('^[\dA-F.]+ *; ([-\w ]+)')
+ with open(ucddir + os.sep + 'Blocks.txt', 'r') as f:
+ for line in f:
+ res = pattern.match(line)
+ if not res:
+ continue
+ blocks.append("In_" + re.sub('\W', '_', res.group(1)))
+ blocks.append("In_No_Block")
+ print_list(blocks, "Blocks")
+ return set(blocks)
+
+def main():
+ global ucddir
+ if len(sys.argv) > 1:
+ ucddir = sys.argv[1]
+ scripts = set()
+ scripts |= output_header()
+ scripts |= output_categories()
+ scripts |= output_scripts(ucddir + os.sep + 'Scripts.txt', 'Scripts', ["Unknown"])
+ scripts |= output_scripts(ucddir + os.sep + 'DerivedCoreProperties.txt', 'DerivedCoreProperties')
+ scripts |= output_scripts(ucddir + os.sep + 'PropList.txt', 'PropList')
+ scripts |= output_scripts(ucddir + os.sep + 'emoji-data.txt', 'Emoji')
+ output_aliases(scripts)
+ output_valuealiases(scripts)
+ output_ages()
+ output_blocks()
+
+if __name__ == '__main__':  # Run main() only when the file is executed as a script, not when it is imported.
+ main()