diff options
Diffstat (limited to 'vendor/bstr/scripts/generate-unicode-data')
-rwxr-xr-x | vendor/bstr/scripts/generate-unicode-data | 149 |
1 files changed, 149 insertions, 0 deletions
diff --git a/vendor/bstr/scripts/generate-unicode-data b/vendor/bstr/scripts/generate-unicode-data new file mode 100755 index 000000000..b8341c5a6 --- /dev/null +++ b/vendor/bstr/scripts/generate-unicode-data @@ -0,0 +1,149 @@ +#!/bin/sh + +set -e +D="$(dirname "$0")" + +# Convenience function for checking that a command exists. +requires() { + cmd="$1" + if ! command -v "$cmd" > /dev/null 2>&1; then + echo "DEPENDENCY MISSING: $cmd must be installed" >&2 + exit 1 + fi +} + +# Test if an array ($2) contains a particular element ($1). +array_exists() { + needle="$1" + shift + + for el in "$@"; do + if [ "$el" = "$needle" ]; then + return 0 + fi + done + return 1 +} + +graphemes() { + regex="$(sh "$D/regex/grapheme.sh")" + + echo "generating forward grapheme DFA" + ucd-generate dfa \ + --name GRAPHEME_BREAK_FWD \ + --sparse --minimize --anchored --state-size 2 \ + src/unicode/fsm/ \ + "$regex" + + echo "generating reverse grapheme DFA" + ucd-generate dfa \ + --name GRAPHEME_BREAK_REV \ + --reverse --longest \ + --sparse --minimize --anchored --state-size 2 \ + src/unicode/fsm/ \ + "$regex" +} + +words() { + regex="$(sh "$D/regex/word.sh")" + + echo "generating forward word DFA (this can take a while)" + ucd-generate dfa \ + --name WORD_BREAK_FWD \ + --sparse --minimize --anchored --state-size 4 \ + src/unicode/fsm/ \ + "$regex" +} + +sentences() { + regex="$(sh "$D/regex/sentence.sh")" + + echo "generating forward sentence DFA (this can take a while)" + ucd-generate dfa \ + --name SENTENCE_BREAK_FWD \ + --minimize \ + --sparse --anchored --state-size 4 \ + src/unicode/fsm/ \ + "$regex" +} + +regional_indicator() { + # For finding all occurrences of region indicators. This is used to handle + # regional indicators as a special case for the reverse grapheme iterator + # and the reverse word iterator. + echo "generating regional indicator DFA" + ucd-generate dfa \ + --name REGIONAL_INDICATOR_REV \ + --reverse \ + --classes --minimize --anchored --premultiply --state-size 1 \ + src/unicode/fsm/ \ + "\p{gcb=Regional_Indicator}" +} + +simple_word() { + echo "generating forward simple word DFA" + ucd-generate dfa \ + --name SIMPLE_WORD_FWD \ + --sparse --minimize --state-size 2 \ + src/unicode/fsm/ \ + "\w" +} + +whitespace() { + echo "generating forward whitespace DFA" + ucd-generate dfa \ + --name WHITESPACE_ANCHORED_FWD \ + --anchored --classes --premultiply --minimize --state-size 1 \ + src/unicode/fsm/ \ + "\s+" + + echo "generating reverse whitespace DFA" + ucd-generate dfa \ + --name WHITESPACE_ANCHORED_REV \ + --reverse \ + --anchored --classes --premultiply --minimize --state-size 2 \ + src/unicode/fsm/ \ + "\s+" +} + +main() { + if array_exists "-h" "$@" || array_exists "--help" "$@"; then + echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2 + exit + fi + + commands=" + graphemes + sentences + words + regional-indicator + simple-word + whitespace + " + if array_exists "--list-commands" "$@"; then + for cmd in $commands; do + echo "$cmd" + done + exit + fi + + # ucd-generate is used to compile regexes into DFAs. + requires ucd-generate + + mkdir -p src/unicode/fsm/ + + cmds=$* + if [ $# -eq 0 ] || array_exists "all" "$@"; then + cmds=$commands + fi + for cmd in $cmds; do + if array_exists "$cmd" $commands; then + fun="$(echo "$cmd" | sed 's/-/_/g')" + eval "$fun" + else + echo "unrecognized command: $cmd" >&2 + fi + done +} + +main "$@" |