From b485aab7e71c1625cfc27e0f92c9509f42378458 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 5 May 2024 13:19:16 +0200 Subject: Adding upstream version 1.45.3+dfsg. Signed-off-by: Daniel Baumann --- src/fluent-bit/lib/onigmo/tool/case-folding.rb | 418 +++++++++++++++++++++++++ 1 file changed, 418 insertions(+) create mode 100755 src/fluent-bit/lib/onigmo/tool/case-folding.rb (limited to 'src/fluent-bit/lib/onigmo/tool/case-folding.rb') diff --git a/src/fluent-bit/lib/onigmo/tool/case-folding.rb b/src/fluent-bit/lib/onigmo/tool/case-folding.rb new file mode 100755 index 000000000..c299074f0 --- /dev/null +++ b/src/fluent-bit/lib/onigmo/tool/case-folding.rb @@ -0,0 +1,418 @@ +#!/usr/bin/ruby +require 'stringio' + +# Usage (for case folding only): +# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt +# $ ruby case-folding.rb CaseFolding.txt -o casefold.h +# or (for case folding and case mapping): +# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt +# $ wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt +# $ wget http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt +# $ ruby case-folding.rb -m . -o casefold.h +# using -d or --debug will include UTF-8 characters in comments for debugging + +class CaseFolding + module Util + module_function + + def hex_seq(v) + v.map { |i| "0x%04x" % i }.join(", ") + end + + def print_table_1(dest, type, mapping_data, data) + for k, v in data = data.sort + sk = (Array === k and k.length > 1) ? "{#{hex_seq(k)}}" : ("0x%04x" % k) + if type=='CaseUnfold_11' and v.length>1 + # reorder CaseUnfold_11 entries to avoid special treatment for U+03B9/U+03BC/U+A64B + item = mapping_data.map("%04X" % k[0]) + upper = item.upper if item + v = v.sort_by { |i| ("%04X"%i) == upper ? 0 : 1 } + end + ck = @debug ? ' /* ' + Array(k).pack("U*") + ' */' : '' + cv = @debug ? ' /* ' + Array(v).map{|c|[c].pack("U*")}.join(", ") + ' */' : '' + dest.print(" {#{sk}#{ck}, {#{v.length}#{mapping_data.flags(k, type, v)}, {#{hex_seq(v)}#{cv}}}},\n") + end + data + end + + def print_table(dest, type, mapping_data, data) + dest.print("static const #{type}_Type #{type}_Table[] = {\n") + i = 0 + ret = data.inject([]) do |a, (n, d)| + dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n") + i += d.size + a.concat(print_table_1(dest, type, mapping_data, d)) + end + dest.print("};\n\n") + ret + end + end + + include Util + + attr_reader :fold, :fold_locale, :unfold, :unfold_locale, :version + + def load(filename) + pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/ + + @fold = fold = {} + @unfold = unfold = [{}, {}, {}] + @debug = false + @version = nil + turkic = [] + + IO.foreach(filename, mode: "rb") do |line| + @version ||= line[/-([0-9.]+).txt/, 1] + next unless res = pattern.match(line) + ch_from = res[1].to_i(16) + + if res[2] == 'T' + # Turkic case folding + turkic << ch_from + next + end + + # store folding data + ch_to = res[3..6].inject([]) do |a, i| + break a unless i + a << i.to_i(16) + end + fold[ch_from] = ch_to + + # store unfolding data + i = ch_to.length - 1 + (unfold[i][ch_to] ||= []) << ch_from + end + + # move locale dependent data to (un)fold_locale + @fold_locale = fold_locale = {} + @unfold_locale = unfold_locale = [{}, {}] + for ch_from in turkic + key = fold[ch_from] + i = key.length - 1 + unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key) + fold_locale[ch_from] = fold.delete(ch_from) + end + self + end + + def range_check(code) + "#{code} <= MAX_CODE_VALUE && #{code} >= MIN_CODE_VALUE" + end + + def lookup_hash(key, type, data) + hash = "onigenc_unicode_#{key}_hash" + lookup = "onigenc_unicode_#{key}_lookup" + arity = Array(data[0][0]).size + gperf = %W"gperf -7 -k#{[*1..(arity*3)].join(',')} -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup} -n" + argname = arity > 1 ? "codes" : "code" + argdecl = "const OnigCodePoint #{arity > 1 ? "*": ""}#{argname}" + n = 7 + m = (1 << n) - 1 + min, max = data.map {|c, *|c}.flatten.minmax + src = IO.popen(gperf, "r+") {|f| + f << "short\n%%\n" + data.each_with_index {|(k, _), i| + k = Array(k) + ks = k.map {|j| [(j >> n*2) & m, (j >> n) & m, (j) & m]}.flatten.map {|c| "\\x%.2x" % c}.join("") + f.printf "\"%s\", ::::/*%s*/ %d\n", ks, k.map {|c| "0x%.4x" % c}.join(","), i + } + f << "%%\n" + f.close_write + f.read + } + src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { + name = $1 + body = $2 + body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_#{arity > 1 ? 'at' : 'of'}(#{argname}, \\1)") + "#{name}(#{argdecl})\n{\n#{body}}" + } + src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { + name = $1 + body = $2 + body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1") + body.gsub!(/(#{hash})\s*\(.*?\)/, "\\1(#{argname})") + body.gsub!(/\{"",-1}/, "-1") + body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1') + body.sub!(/(\s+if\s)\(len\b.*\)/) do + "#$1(" << + (arity > 1 ? (0...arity).map {|i| range_check("#{argname}[#{i}]")}.join(" &&\n ") : range_check(argname)) << + ")" + end + v = nil + body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) { + pre = $1 + indent = $2 + s = $3 + s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]') + v = $1 + s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code#{arity}_equal(#{argname}, #{key}_Table[#{v}].from))") + "#{pre}{#{s}\n#{indent}}" + } + body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;") + "static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}" + } + src + end + + def display(dest, mapping_data) + # print the header + dest.print("/* DO NOT EDIT THIS FILE. */\n") + dest.print("/* Generated by tool/case-folding.rb */\n\n") + + versions = version.scan(/\d+/) + dest.print("#if defined ONIG_UNICODE_VERSION_STRING && !( \\\n") + %w[MAJOR MINOR TEENY].zip(versions) do |n, v| + dest.print(" ONIG_UNICODE_VERSION_#{n} == #{v} && \\\n") + end + dest.print(" 1)\n") + dest.print("# error ONIG_UNICODE_VERSION_STRING mismatch\n") + dest.print("#endif\n") + dest.print("#define ONIG_UNICODE_VERSION_STRING #{version.dump}\n") + %w[MAJOR MINOR TEENY].zip(versions) do |n, v| + dest.print("#define ONIG_UNICODE_VERSION_#{n} #{v}\n") + end + dest.print("\n") + + # print folding data + + # CaseFold + CaseFold_Locale + name = "CaseFold_11" + data = print_table(dest, name, mapping_data, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale) + dest.print lookup_hash(name, "CodePointList3", data) + + # print unfolding data + + # CaseUnfold_11 + CaseUnfold_11_Locale + name = "CaseUnfold_11" + data = print_table(dest, name, mapping_data, name=>unfold[0], "#{name}_Locale"=>unfold_locale[0]) + dest.print lookup_hash(name, "CodePointList3", data) + + # CaseUnfold_12 + CaseUnfold_12_Locale + name = "CaseUnfold_12" + data = print_table(dest, name, mapping_data, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1]) + dest.print lookup_hash(name, "CodePointList2", data) + + # CaseUnfold_13 + name = "CaseUnfold_13" + data = print_table(dest, name, mapping_data, name=>unfold[2]) + dest.print lookup_hash(name, "CodePointList2", data) + + # TitleCase + dest.print mapping_data.specials_output + end + + def debug! + @debug = true + end + + def self.load(*args) + new.load(*args) + end +end + +class MapItem + attr_accessor :upper, :lower, :title, :code + + def initialize(code, upper, lower, title) + @code = code + @upper = upper unless upper == '' + @lower = lower unless lower == '' + @title = title unless title == '' + end +end + +class CaseMapping + attr_reader :filename, :version + + def initialize(mapping_directory) + @mappings = {} + @specials = [] + @specials_length = 0 + @version = nil + IO.foreach(File.join(mapping_directory, 'UnicodeData.txt'), mode: "rb") do |line| + next if line =~ /^ 1 + end + if mapping_directory + if ARGV[0] + warn "Either specify directory or individual file, but not both." + exit + end + filename = File.join(mapping_directory, 'CaseFolding.txt') + mapping_data = CaseMapping.load(mapping_directory) + end + filename ||= ARGV[0] || 'CaseFolding.txt' + data = CaseFolding.load(filename) + if mapping_data and data.version != mapping_data.version + abort "Unicode data version mismatch\n" \ + " #{filename} = #{data.version}\n" \ + " #{mapping_data.filename} = #{mapping_data.version}" + end + mapping_data ||= CaseMappingDummy.new + + if debug + data.debug! + mapping_data.debug! + end + f = StringIO.new + begin + data.display(f, mapping_data) + rescue Errno::ENOENT => e + raise unless /gperf/ =~ e.message + warn e.message + abort unless dest + File.utime(nil, nil, dest) # assume existing file is OK + exit + else + s = f.string + end + if dest + open(dest, "wb") do |file| + file.print(s) + end + else + STDOUT.print(s) + end +end -- cgit v1.2.3