| | | |
|---|---|---|
| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-03-09 13:19:48 +0000 |
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-03-09 13:20:02 +0000 |
| commit | 58daab21cd043e1dc37024a7f99b396788372918 (patch) | |
| tree | 96771e43bb69f7c1c2b0b4f7374cb74d7866d0cb /fluent-bit/lib/onigmo/tool/enc-unicode.rb | |
| parent | Releasing debian version 1.43.2-1. (diff) | |
Merging upstream version 1.44.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fluent-bit/lib/onigmo/tool/enc-unicode.rb')
| mode | file | lines |
|---|---|---|
| -rwxr-xr-x | fluent-bit/lib/onigmo/tool/enc-unicode.rb | 548 |

1 file changed, 548 insertions, 0 deletions

```diff
diff --git a/fluent-bit/lib/onigmo/tool/enc-unicode.rb b/fluent-bit/lib/onigmo/tool/enc-unicode.rb
new file mode 100755
index 00000000..84f494e8
--- /dev/null
+++ b/fluent-bit/lib/onigmo/tool/enc-unicode.rb
@@ -0,0 +1,548 @@
+#!/usr/bin/env ruby
+
+# Creates the data structures needed by Oniguruma to map Unicode codepoints to
+# property names and POSIX character classes
+#
+# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
+# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
+# DerivedAge.txt and Blocks.txt from unicode.org.
+# (http://unicode.org/Public/UNIDATA/) And run following command.
+# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
+# You can get source file for gperf. After this, simply make ruby.
+
+if ARGV[0] == "--header"
+  header = true
+  ARGV.shift
+end
+unless ARGV.size == 1
+  abort "Usage: #{$0} data_directory"
+end
+
+$unicode_version = File.basename(ARGV[0])[/\A[.\d]+\z/]
+
+POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print XPosixPunct Space Upper XDigit Word Alnum ASCII Punct]
+
+GPERF_VERSION = `gperf -v`.split("\n").first # /^GNU gperf (.+)$/
+  .split.last
+
+def pair_codepoints(codepoints)
+
+  # We have a sorted Array of codepoints that we wish to partition into
+  # ranges such that the start- and endpoints form an inclusive set of
+  # codepoints with property _property_. Note: It is intended that some ranges
+  # will begin with the value with which they end, e.g. 0x0020 -> 0x0020
+
+  codepoints.sort!
+  last_cp = codepoints.first
+  pairs = [[last_cp, nil]]
+  codepoints[1..-1].each do |codepoint|
+    next if last_cp == codepoint
+
+    # If the current codepoint does not follow directly on from the last
+    # codepoint, the last codepoint represents the end of the current range,
+    # and the current codepoint represents the start of the next range.
+    if last_cp.next != codepoint
+      pairs[-1][-1] = last_cp
+      pairs << [codepoint, nil]
+    end
+    last_cp = codepoint
+  end
+
+  # The final pair has as its endpoint the last codepoint for this property
+  pairs[-1][-1] = codepoints.last
+  pairs
+end
+
+def parse_unicode_data(file)
+  last_cp = 0
+  data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
+    'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
+  beg_cp = nil
+  IO.foreach(file) do |line|
+    fields = line.split(';')
+    cp = fields[0].to_i(16)
+
+    case fields[1]
+    when /\A<(.*),\s*First>\z/
+      beg_cp = cp
+      next
+    when /\A<(.*),\s*Last>\z/
+      cps = (beg_cp..cp).to_a
+    else
+      beg_cp = cp
+      cps = [cp]
+    end
+
+    # The Cn category represents unassigned characters. These are not listed in
+    # UnicodeData.txt so we must derive them by looking for 'holes' in the range
+    # of listed codepoints. We increment the last codepoint seen and compare it
+    # with the current codepoint. If the current codepoint is less than
+    # last_cp.next we have found a hole, so we add the missing codepoint to the
+    # Cn category.
+    data['Cn'].concat((last_cp.next...beg_cp).to_a)
+
+    # Assigned - Defined in unicode.c; interpreted as every character in the
+    # Unicode range minus the unassigned characters
+    data['Assigned'].concat(cps)
+
+    # The third field denotes the 'General' category, e.g. Lu
+    (data[fields[2]] ||= []).concat(cps)
+
+    # The 'Major' category is the first letter of the 'General' category, e.g.
+    # 'Lu' -> 'L'
+    (data[fields[2][0,1]] ||= []).concat(cps)
+    last_cp = cp
+  end
+
+  # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
+  # codepoints to Cn and C
+  cn_remainder = (last_cp.next..0x10ffff).to_a
+  data['Cn'] += cn_remainder
+  data['C'] += data['Cn']
+
+  # Special case for LC (Cased_Letter). LC = Ll + Lt + Lu
+  data['LC'] = data['Ll'] + data['Lt'] + data['Lu']
+
+  # Define General Category properties
+  gcps = data.keys.sort - POSIX_NAMES
+
+  # Returns General Category Property names and the data
+  [gcps, data]
+end
+
+def define_posix_props(data)
+  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
+  #
+
+  data['Alpha'] = data['Alphabetic']
+  data['Upper'] = data['Uppercase']
+  data['Lower'] = data['Lowercase']
+  data['Punct'] = data['Punctuation']
+  data['XPosixPunct'] = data['Punctuation'] + [0x24, 0x2b, 0x3c, 0x3d, 0x3e, 0x5e, 0x60, 0x7c, 0x7e]
+  data['Digit'] = data['Decimal_Number']
+  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
+                   (0x0061..0x0066).to_a
+  data['Alnum'] = data['Alpha'] + data['Digit']
+  data['Space'] = data['White_Space']
+  data['Blank'] = data['Space_Separator'] + [0x0009]
+  data['Cntrl'] = data['Cc']
+  data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
+  data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
+    data['Surrogate'] - data['Unassigned']
+  data['Print'] = data['Graph'] + data['Space_Separator']
+end
+
+def parse_scripts(data, categories)
+  files = [
+    {:fn => 'DerivedCoreProperties.txt', :title => 'Derived Property'},
+    {:fn => 'Scripts.txt', :title => 'Script'},
+    {:fn => 'PropList.txt', :title => 'Binary Property'},
+    {:fn => 'emoji-data.txt', :title => 'Emoji'}
+  ]
+  current = nil
+  cps = []
+  names = {}
+  files.each do |file|
+    data_foreach(file[:fn]) do |line|
+      if /^# Total (?:code points|elements): / =~ line
+        data[current] = cps
+        categories[current] = file[:title]
+        (names[file[:title]] ||= []) << current
+        cps = []
+      elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
+        current = $3
+        $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
+      end
+    end
+  end
+  # All code points not explicitly listed for Script
+  # have the value Unknown (Zzzz).
+  data['Unknown'] = (0..0x10ffff).to_a - data.values_at(*names['Script']).flatten
+  categories['Unknown'] = 'Script'
+  names.values.flatten << 'Unknown'
+end
+
+def parse_aliases(data)
+  kv = {}
+  data_foreach('PropertyAliases.txt') do |line|
+    next unless /^(\w+)\s*; (\w+)/ =~ line
+    data[$1] = data[$2]
+    kv[normalize_propname($1)] = normalize_propname($2)
+  end
+  data_foreach('PropertyValueAliases.txt') do |line|
+    next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
+    if $1 == 'gc'
+      data[$3] = data[$2]
+      data[$4] = data[$2]
+      kv[normalize_propname($3)] = normalize_propname($2)
+      kv[normalize_propname($4)] = normalize_propname($2) if $4
+    else
+      data[$2] = data[$3]
+      data[$4] = data[$3]
+      kv[normalize_propname($2)] = normalize_propname($3)
+      kv[normalize_propname($4)] = normalize_propname($3) if $4
+    end
+  end
+  kv
+end
+
+# According to Unicode6.0.0/ch03.pdf, Section 3.1, "An update version
+# never involves any additions to the character repertoire."  Versions
+# in DerivedAge.txt should always be /\d+\.\d+/
+def parse_age(data)
+  current = nil
+  last_constname = nil
+  cps = []
+  ages = []
+  data_foreach('DerivedAge.txt') do |line|
+    if /^# Total code points: / =~ line
+      constname = constantize_agename(current)
+      # each version matches all previous versions
+      cps.concat(data[last_constname]) if last_constname
+      data[constname] = cps
+      make_const(constname, cps, "Derived Age #{current}")
+      ages << current
+      last_constname = constname
+      cps = []
+    elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line
+      current = $3
+      $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
+    end
+  end
+  ages
+end
+
+def parse_GraphemeBreakProperty(data)
+  current = nil
+  cps = []
+  ages = []
+  data_foreach('auxiliary/GraphemeBreakProperty.txt') do |line|
+    if /^# Total code points: / =~ line
+      constname = constantize_Grapheme_Cluster_Break(current)
+      data[constname] = cps
+      make_const(constname, cps, "Grapheme_Cluster_Break=#{current}")
+      ages << current
+      cps = []
+    elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
+      current = $3
+      $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
+    end
+  end
+  ages
+end
+
+def parse_block(data)
+  cps = []
+  blocks = []
+  data_foreach('Blocks.txt') do |line|
+    if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line
+      cps = ($1.to_i(16)..$2.to_i(16)).to_a
+      constname = constantize_blockname($3)
+      data[constname] = cps
+      make_const(constname, cps, "Block")
+      blocks << constname
+    end
+  end
+
+  # All code points not belonging to any of the named blocks
+  # have the value No_Block.
+  no_block = (0..0x10ffff).to_a - data.values_at(*blocks).flatten
+  constname = constantize_blockname("No_Block")
+  make_const(constname, no_block, "Block")
+  blocks << constname
+end
+
+# shim for Ruby 1.8
+unless {}.respond_to?(:key)
+  class Hash
+    alias key index
+  end
+end
+
+$const_cache = {}
+# make_const(property, pairs, name): Prints a 'static const' structure for a
+# given property, group of paired codepoints, and a human-friendly name for
+# the group
+def make_const(prop, data, name)
+  if name.empty?
+    puts "\n/* '#{prop}' */"
+  else
+    puts "\n/* '#{prop}': #{name} */"
+  end
+  if origprop = $const_cache.key(data)
+    puts "#define CR_#{prop} CR_#{origprop}"
+  else
+    $const_cache[prop] = data
+    pairs = pair_codepoints(data)
+    puts "static const OnigCodePoint CR_#{prop}[] = {"
+    # The first element of the constant is the number of pairs of codepoints
+    puts "\t#{pairs.size},"
+    pairs.each do |pair|
+      pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }
+      puts "\t#{pair.first}, #{pair.last},"
+    end
+    puts "}; /* CR_#{prop} */"
+  end
+end
+
+def normalize_propname(name)
+  name = name.downcase
+  name.delete!('- _')
+  name
+end
+
+def constantize_agename(name)
+  "Age_#{name.sub(/\./, '_')}"
+end
+
+def constantize_Grapheme_Cluster_Break(name)
+  "Grapheme_Cluster_Break_#{name}"
+end
+
+def constantize_blockname(name)
+  "In_#{name.gsub(/\W/, '_')}"
+end
+
+def get_file(name)
+  File.join(ARGV[0], name)
+end
+
+def data_foreach(name, &block)
+  fn = get_file(name)
+  warn "Reading #{name}"
+  pat = /^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/
+  File.open(fn, 'rb') do |f|
+    line = f.gets
+    unless /^emoji-/ =~ name
+      unless pat =~ line
+        raise ArgumentError, "#{name}: no Unicode version"
+      end
+      if !$unicode_version
+        $unicode_version = $1
+      elsif $unicode_version != $1
+        raise ArgumentError, "#{name}: Unicode version mismatch: #$1"
+      end
+    end
+    f.each(&block)
+  end
+end
+
+# Write Data
+class Unifdef
+  attr_accessor :output, :top, :stack, :stdout, :kwdonly
+  def initialize(out)
+    @top = @output = []
+    @stack = []
+    $stdout, @stdout = self, out
+  end
+  def restore
+    $stdout = @stdout
+  end
+  def ifdef(sym)
+    if @kwdonly
+      @stdout.puts "#ifdef #{sym}"
+    else
+      @stack << @top
+      @top << tmp = [sym]
+      @top = tmp
+    end
+    if block_given?
+      begin
+        return yield
+      ensure
+        endif(sym)
+      end
+    end
+  end
+  def endif(sym)
+    if @kwdonly
+      @stdout.puts "#endif /* #{sym} */"
+    else
+      unless sym == @top[0]
+        restore
+        raise ArgumentError, "#{sym} unmatch to #{@top[0]}"
+      end
+      @top = @stack.pop
+    end
+  end
+  def show(dest, *syms)
+    _show(dest, @output, syms)
+  end
+  def _show(dest, ary, syms)
+    if Symbol === (sym = ary[0])
+      unless syms.include?(sym)
+        return
+      end
+    end
+    ary.each do |e|
+      case e
+      when Array
+        _show(dest, e, syms)
+      when String
+        dest.print e
+      end
+    end
+  end
+  def write(str)
+    if @kwdonly
+      @stdout.write(str)
+    else
+      @top << str
+    end
+    self
+  end
+  alias << write
+end
+
+output = Unifdef.new($stdout)
+output.kwdonly = !header
+
+puts '%{'
+props, data = parse_unicode_data(get_file('UnicodeData.txt'))
+categories = {}
+props.concat parse_scripts(data, categories)
+aliases = parse_aliases(data)
+ages = blocks = graphemeBreaks = nil
+define_posix_props(data)
+POSIX_NAMES.each do |name|
+  if name == 'XPosixPunct'
+    make_const(name, data[name], "[[:Punct:]]")
+  elsif name == 'Punct'
+    make_const(name, data[name], "")
+  else
+    make_const(name, data[name], "[[:#{name}:]]")
+  end
+end
+output.ifdef :USE_UNICODE_PROPERTIES do
+  props.each do |name|
+    category = categories[name] ||
+      case name.size
+      when 1 then 'Major Category'
+      when 2 then 'General Category'
+      else '-'
+      end
+    make_const(name, data[name], category)
+  end
+  output.ifdef :USE_UNICODE_AGE_PROPERTIES do
+    ages = parse_age(data)
+  end
+  graphemeBreaks = parse_GraphemeBreakProperty(data)
+  blocks = parse_block(data)
+end
+puts(<<'__HEREDOC')
+
+static const OnigCodePoint* const CodeRanges[] = {
+__HEREDOC
+POSIX_NAMES.each{|name|puts"  CR_#{name},"}
+output.ifdef :USE_UNICODE_PROPERTIES do
+  props.each{|name| puts"  CR_#{name},"}
+  output.ifdef :USE_UNICODE_AGE_PROPERTIES do
+    ages.each{|name| puts"  CR_#{constantize_agename(name)},"}
+  end
+  graphemeBreaks.each{|name| puts"  CR_#{constantize_Grapheme_Cluster_Break(name)},"}
+  blocks.each{|name|puts"  CR_#{name},"}
+end
+
+puts(<<"__HEREDOC")
+};
+struct uniname2ctype_struct {
+  short name;
+  unsigned short ctype;
+};
+#define uniname2ctype_offset(str) offsetof(struct uniname2ctype_pool_t, uniname2ctype_pool_##str)
+
+static const struct uniname2ctype_struct *uniname2ctype_p(const char *, #{ GPERF_VERSION >= '3.1' ? 'size_t' : 'unsigned int' });
+%}
+struct uniname2ctype_struct;
+%%
+__HEREDOC
+
+i = -1
+name_to_index = {}
+POSIX_NAMES.each do |name|
+  i += 1
+  next if name == 'NEWLINE'
+  name = normalize_propname(name)
+  name_to_index[name] = i
+  puts"%-40s %3d" % [name + ',', i]
+end
+output.ifdef :USE_UNICODE_PROPERTIES do
+  props.each do |name|
+    i += 1
+    name = normalize_propname(name)
+    name_to_index[name] = i
+    puts "%-40s %3d" % [name + ',', i]
+  end
+  aliases.each_pair do |k, v|
+    next if name_to_index[k]
+    next unless v = name_to_index[v]
+    puts "%-40s %3d" % [k + ',', v]
+  end
+  output.ifdef :USE_UNICODE_AGE_PROPERTIES do
+    ages.each do |name|
+      i += 1
+      name = "age=#{name}"
+      name_to_index[name] = i
+      puts "%-40s %3d" % [name + ',', i]
+    end
+  end
+  graphemeBreaks.each do |name|
+    i += 1
+    name = "graphemeclusterbreak=#{name.delete('_').downcase}"
+    name_to_index[name] = i
+    puts "%-40s %3d" % [name + ',', i]
+  end
+  blocks.each do |name|
+    i += 1
+    name = normalize_propname(name)
+    name_to_index[name] = i
+    puts "%-40s %3d" % [name + ',', i]
+  end
+end
+puts(<<'__HEREDOC')
+%%
+static int
+uniname2ctype(const UChar *name, unsigned int len)
+{
+  const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
+  if (p) return p->ctype;
+  return -1;
+}
+__HEREDOC
+versions = $unicode_version.scan(/\d+/)
+print("#if defined ONIG_UNICODE_VERSION_STRING && !( \\\n")
+%w[MAJOR MINOR TEENY].zip(versions) do |n, v|
+  print("  ONIG_UNICODE_VERSION_#{n} == #{v} && \\\n")
+end
+print("  1)\n")
+print("# error ONIG_UNICODE_VERSION_STRING mismatch\n")
+print("#endif\n")
+print("#define ONIG_UNICODE_VERSION_STRING #{$unicode_version.dump}\n")
+%w[MAJOR MINOR TEENY].zip(versions) do |n, v|
+  print("#define ONIG_UNICODE_VERSION_#{n} #{v}\n")
+end
+
+output.restore
+
+if header
+  require 'tempfile'
+
+  NAME2CTYPE = %w[gperf -7 -c -j1 -i1 -t -C -P -T -H uniname2ctype_hash -Q uniname2ctype_pool -N uniname2ctype_p]
+
+  fds = []
+  syms = %i[USE_UNICODE_PROPERTIES USE_UNICODE_AGE_PROPERTIES]
+  begin
+    fds << (tmp = Tempfile.new(%w"name2ctype .h"))
+    IO.popen([*NAME2CTYPE, out: tmp], "w") {|f| output.show(f, *syms)}
+  end while syms.pop
+  fds.each(&:close)
+  IO.popen(%W[diff -DUSE_UNICODE_AGE_PROPERTIES #{fds[1].path} #{fds[0].path}], "r") {|age|
+    IO.popen(%W[diff -DUSE_UNICODE_PROPERTIES #{fds[2].path} -], "r", in: age) {|f|
+      f.each {|line|
+        line.gsub!(/\(int\)\((?:long|size_t)\)&\(\(struct uniname2ctype_pool_t \*\)0\)->uniname2ctype_pool_(str\d+),\s+/,
+                   'uniname2ctype_offset(\1), ')
+        puts line
+      }
+    }
+  }
+end
```