#! /bin/sh # # make-uniuni -- script for creating the file uniuni.cpp # # Copyright (C) 2005-2020 Free Software Foundation, Inc. # Written by Werner Lemberg # # This file is part of groff. # # groff is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free # Software Foundation, either version 3 of the License, or # (at your option) any later version. # # groff is distributed in the hope that it will be useful, but WITHOUT ANY # WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License # for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # # usage: # # make-uniuni < UnicodeData.txt > uniuni.cpp # # 'UnicodeData.txt' is the central database file from the Unicode standard. # Unfortunately, it doesn't contain a version number which must be thus # provided manually as a parameter to the filter. # # This program needs a C preprocessor. # CPP=cpp prog="$0" if test $# -ne 1; then echo "usage: $0 < UnicodeData.txt > uniuni.cpp" exit 1 fi version_string="$1" # Remove ranges and control characters, # then extract the decomposition field, # then remove lines without decomposition, # then remove all compatibility decompositions. sed -e '/^[^;]*; $$1 # Prepare input for running cpp. cat $$1 \ | sed -e 's/^\([^;]*\);/#define \1 /' \ -e 's/ / u/g' > $$2 cat $$1 \ | sed -e 's/^\([^;]*\);.*$/\1 u\1/' >> $$2 # Run C preprocessor to recursively decompose. $CPP $$2 $$3 # Convert it back to original format. cat $$3 \ | sed -e '/#/d' \ -e '/^$/d' \ -e 's/ \+/ /g' \ -e 's/ *$//' \ -e 's/u//g' \ -e 's/^\([^ ]*\) /\1;/' > $$4 # Write preamble. cat < This file is part of groff. groff is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. groff is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ // This code has been algorithmically derived from the file // UnicodeData.txt, version $version_string, available from unicode.org, // on `date '+%Y-%m-%d'`. #include "lib.h" #include "stringclass.h" #include "ptable.h" #include "unicode.h" struct unicode_decompose { char *value; }; declare_ptable(unicode_decompose) implement_ptable(unicode_decompose) PTABLE(unicode_decompose) unicode_decompose_table; // the first digit in the composite string gives the number of composites struct S { const char *key; const char *value; } unicode_decompose_list[] = { END # Emit Unicode data. cat $$4 \ | sed -e 's/ /_/g' \ -e 's/\(.*\);\(.*_.*_.*_.*\)$/ { "\1", "4\2" },/' \ -e 's/\(.*\);\(.*_.*_.*\)$/ { "\1", "3\2" },/' \ -e 's/\(.*\);\(.*_.*\)$/ { "\1", "2\2" },/' \ -e 's/\(.*\);\(.*\)$/ { "\1", "1\2" },/' # Write postamble. cat <value = (char *)unicode_decompose_list[i].value; unicode_decompose_table.define(unicode_decompose_list[i].key, dec); } } const char *decompose_unicode(const char *s) { unicode_decompose *result = unicode_decompose_table.lookup(s); return result ? result->value : 0; } END # Remove temporary files. rm $$1 $$2 $$3 $$4 # EOF