diff options
Diffstat (limited to 'zishrink.awk')
-rw-r--r-- | zishrink.awk | 388 |
1 files changed, 388 insertions, 0 deletions
diff --git a/zishrink.awk b/zishrink.awk new file mode 100644 index 0000000..c98dc6a --- /dev/null +++ b/zishrink.awk @@ -0,0 +1,388 @@ +# Convert tzdata source into a smaller version of itself. + +# Contributed by Paul Eggert. This file is in the public domain. + +# This is not a general-purpose converter; it is designed for current tzdata. +# 'zic' should treat this script's output as if it were identical to +# this script's input. + +# Record a hash N for the new name NAME, checking for collisions. + +function record_hash(n, name) +{ + if (used_hashes[n]) { + printf "# ! collision: %s %s\n", used_hashes[n], name + exit 1 + } + used_hashes[n] = name +} + +# Return a shortened rule name representing NAME, +# and record this relationship to the hash table. + +function gen_rule_name(name, \ + n) +{ + # Use a simple mnemonic: the first two letters. + n = substr(name, 1, 2) + record_hash(n, name) + # printf "# %s = %s\n", n, name + return n +} + +function prehash_rule_names( \ + name) +{ + # Rule names are not part of the tzdb API, so substitute shorter + # ones. Shortening them consistently from one release to the next + # simplifies comparison of the output. That being said, the + # 1-letter names below are not standardized in any way, and can + # change arbitrarily from one release to the next, as the main goal + # here is compression not comparison. + + # Abbreviating these rules names to one letter saved the most space + # circa 2018e. + rule["Arg"] = "A" + rule["Brazil"] = "B" + rule["Canada"] = "C" + rule["Denmark"] = "D" + rule["EU"] = "E" + rule["France"] = "F" + rule["GB-Eire"] = "G" + rule["Halifax"] = "H" + rule["Italy"] = "I" + rule["Jordan"] = "J" + rule["Egypt"] = "K" # "Kemet" in ancient Egyptian + rule["Libya"] = "L" + rule["Morocco"] = "M" + rule["Neth"] = "N" + rule["Poland"] = "O" # arbitrary + rule["Palestine"] = "P" + rule["Cuba"] = "Q" # Its start sounds like "Q". + rule["Russia"] = "R" + rule["Syria"] = "S" + rule["Turkey"] = "T" + rule["Uruguay"] = "U" + rule["Vincennes"] = "V" + rule["Winn"] = "W" + rule["Mongol"] = "X" # arbitrary + rule["NT_YK"] = "Y" + rule["Zion"] = "Z" + rule["Austria"] = "a" + rule["Belgium"] = "b" + rule["C-Eur"] = "c" + rule["Algeria"] = "d" # country code DZ + rule["E-Eur"] = "e" + rule["Taiwan"] = "f" # Formosa + rule["Greece"] = "g" + rule["Hungary"] = "h" + rule["Iran"] = "i" + rule["StJohns"] = "j" + rule["Chatham"] = "k" # arbitrary + rule["Lebanon"] = "l" + rule["Mexico"] = "m" + rule["Tunisia"] = "n" # country code TN + rule["Moncton"] = "o" # arbitrary + rule["Port"] = "p" + rule["Albania"] = "q" # arbitrary + rule["Regina"] = "r" + rule["Spain"] = "s" + rule["Toronto"] = "t" + rule["US"] = "u" + rule["Louisville"] = "v" # ville + rule["Iceland"] = "w" # arbitrary + rule["Chile"] = "x" # arbitrary + rule["Para"] = "y" # country code PY + rule["Romania"] = "z" # arbitrary + rule["Macau"] = "_" # arbitrary + + # Use ISO 3166 alpha-2 country codes for remaining names that are countries. + # This is more systematic, and avoids collisions (e.g., Malta and Moldova). + rule["Armenia"] = "AM" + rule["Aus"] = "AU" + rule["Azer"] = "AZ" + rule["Barb"] = "BB" + rule["Dhaka"] = "BD" + rule["Bulg"] = "BG" + rule["Bahamas"] = "BS" + rule["Belize"] = "BZ" + rule["Swiss"] = "CH" + rule["Cook"] = "CK" + rule["PRC"] = "CN" + rule["Cyprus"] = "CY" + rule["Czech"] = "CZ" + rule["Germany"] = "DE" + rule["DR"] = "DO" + rule["Ecuador"] = "EC" + rule["Finland"] = "FI" + rule["Fiji"] = "FJ" + rule["Falk"] = "FK" + rule["Ghana"] = "GH" + rule["Guat"] = "GT" + rule["Hond"] = "HN" + rule["Haiti"] = "HT" + rule["Eire"] = "IE" + rule["Iraq"] = "IQ" + rule["Japan"] = "JP" + rule["Kyrgyz"] = "KG" + rule["ROK"] = "KR" + rule["Latvia"] = "LV" + rule["Lux"] = "LX" + rule["Moldova"] = "MD" + rule["Malta"] = "MT" + rule["Mauritius"] = "MU" + rule["Namibia"] = "NA" + rule["Nic"] = "NI" + rule["Norway"] = "NO" + rule["Peru"] = "PE" + rule["Phil"] = "PH" + rule["Pakistan"] = "PK" + rule["Sudan"] = "SD" + rule["Salv"] = "SV" + rule["Tonga"] = "TO" + rule["Vanuatu"] = "VU" + + # Avoid collisions. + rule["Detroit"] = "Dt" # De = Denver + + for (name in rule) { + record_hash(rule[name], name) + } +} + +function make_line(n, field, \ + f, r) +{ + r = field[1] + for (f = 2; f <= n; f++) + r = r " " field[f] + return r +} + +# Process the input line LINE and save it for later output. + +function process_input_line(line, \ + f, field, end, n, outline, r, \ + linkline, ruleline, zoneline) +{ + # Remove comments, normalize spaces, and append a space to each line. + sub(/#.*/, "", line) + line = line " " + gsub(/[\t ]+/, " ", line) + + # Abbreviate keywords and determine line type. + linkline = sub(/^Link /, "L ", line) + ruleline = sub(/^Rule /, "R ", line) + zoneline = sub(/^Zone /, "Z ", line) + + # Replace FooAsia rules with the same rules without "Asia", as they + # are duplicates. + if (match(line, /[^ ]Asia /)) { + if (ruleline) return + line = substr(line, 1, RSTART) substr(line, RSTART + 5) + } + + # Abbreviate times. + while (match(line, /[: ]0+[0-9]/)) + line = substr(line, 1, RSTART) substr(line, RSTART + RLENGTH - 1) + while (match(line, /:0[^:]/)) + line = substr(line, 1, RSTART - 1) substr(line, RSTART + 2) + + # Abbreviate weekday names. + while (match(line, / (last)?(Mon|Wed|Fri)[ <>]/)) { + end = RSTART + RLENGTH + line = substr(line, 1, end - 4) substr(line, end - 1) + } + while (match(line, / (last)?(Sun|Tue|Thu|Sat)[ <>]/)) { + end = RSTART + RLENGTH + line = substr(line, 1, end - 3) substr(line, end - 1) + } + + # Abbreviate "max", "min", "only" and month names. + # Although "max" and "min" can both be abbreviated to just "m", + # the longer forms "ma" and "mi" are needed with zic 2023d and earlier. + gsub(/ max /, dataform == "vanguard" ? " m " : " ma ", line) + gsub(/ min /, dataform == "vanguard" ? " m " : " mi ", line) + gsub(/ only /, " o ", line) + gsub(/ Jan /, " Ja ", line) + gsub(/ Feb /, " F ", line) + gsub(/ Apr /, " Ap ", line) + gsub(/ Aug /, " Au ", line) + gsub(/ Sep /, " S ", line) + gsub(/ Oct /, " O ", line) + gsub(/ Nov /, " N ", line) + gsub(/ Dec /, " D ", line) + + # Strip leading and trailing space. + sub(/^ /, "", line) + sub(/ $/, "", line) + + # Remove unnecessary trailing zero fields. + sub(/ 0+$/, "", line) + + # Remove unnecessary trailing days-of-month "1". + if (match(line, /[A-Za-z] 1$/)) + line = substr(line, 1, RSTART) + + # Remove unnecessary trailing " Ja" (for January). + sub(/ Ja$/, "", line) + + n = split(line, field) + + # Record which rule names are used, and generate their abbreviations. + f = zoneline ? 4 : linkline || ruleline ? 0 : 2 + r = field[f] + if (r ~ /^[^-+0-9]/) { + rule_used[r] = 1 + } + + if (zoneline) + zonename = startdef = field[2] + else if (linkline) + zonename = startdef = field[3] + else if (ruleline) + zonename = "" + + # Save the information for later output. + outline = make_line(n, field) + if (ruleline) + rule_output_line[nrule_out++] = outline + else if (linkline) { + # In vanguard format with Gawk, links are output sorted by destination. + if (dataform == "vanguard" && PROCINFO["version"]) + linkdef[zonename] = field[2] + else + link_output_line[nlink_out++] = outline + }else + zonedef[zonename] = (zoneline ? "" : zonedef[zonename] "\n") outline +} + +function omit_unused_rules( \ + i, field) +{ + for (i = 0; i < nrule_out; i++) { + split(rule_output_line[i], field) + if (!rule_used[field[2]]) + rule_output_line[i] = "" + } +} + +function abbreviate_rule_names( \ + abbr, f, field, i, n, newdef, newline, r, \ + zoneline, zonelines, zonename) +{ + for (i = 0; i < nrule_out; i++) { + n = split(rule_output_line[i], field) + if (n) { + r = field[2] + if (r ~ /^[^-+0-9]/) { + abbr = rule[r] + if (!abbr) { + rule[r] = abbr = gen_rule_name(r) + } + field[2] = abbr + rule_output_line[i] = make_line(n, field) + } + } + } + for (zonename in zonedef) { + zonelines = split(zonedef[zonename], zoneline, /\n/) + newdef = "" + for (i = 1; i <= zonelines; i++) { + newline = zoneline[i] + n = split(newline, field) + f = i == 1 ? 4 : 2 + r = rule[field[f]] + if (r) { + field[f] = r + newline = make_line(n, field) + } + newdef = (newdef ? newdef "\n" : "") newline + } + zonedef[zonename] = newdef + } +} + +function output_saved_lines( \ + i, zonename) +{ + for (i = 0; i < nrule_out; i++) + if (rule_output_line[i]) + print rule_output_line[i] + + # When using gawk, output zones sorted by name. + # This makes the output a bit more compressible. + PROCINFO["sorted_in"] = "@ind_str_asc" + for (zonename in zonedef) + print zonedef[zonename] + + if (nlink_out) + for (i = 0; i < nlink_out; i++) + print link_output_line[i] + else { + # When using gawk, output links sorted by destination. + # This also helps compressibility a bit. + PROCINFO["sorted_in"] = "@val_type_asc" + for (zonename in linkdef) + printf "L %s %s\n", linkdef[zonename], zonename + } +} + +BEGIN { + # Files that the output normally depends on. + default_dep["africa"] = 1 + default_dep["antarctica"] = 1 + default_dep["asia"] = 1 + default_dep["australasia"] = 1 + default_dep["backward"] = 1 + default_dep["etcetera"] = 1 + default_dep["europe"] = 1 + default_dep["factory"] = 1 + default_dep["northamerica"] = 1 + default_dep["southamerica"] = 1 + default_dep["ziguard.awk"] = 1 + default_dep["zishrink.awk"] = 1 + + # Output a version string from 'version' and related configuration variables + # supported by tzdb's Makefile. If you change the makefile or any other files + # that affect the output of this script, you should append '-SOMETHING' + # to the contents of 'version', where SOMETHING identifies what was changed. + + ndeps = split(deps, dep) + ddeps = "" + for (i = 1; i <= ndeps; i++) { + if (default_dep[dep[i]]) { + default_dep[dep[i]]++ + } else { + ddeps = ddeps " " dep[i] + } + } + for (d in default_dep) { + if (default_dep[d] == 1) { + ddeps = ddeps " !" d + } + } + print "# version", version + if (dataform != "main") { + print "# dataform", dataform + } + if (redo != "posix_right") { + print "# redo " redo + } + if (ddeps) { + print "# ddeps" ddeps + } + print "# This zic input file is in the public domain." + + prehash_rule_names() +} + +/^[\t ]*[^#\t ]/ { + process_input_line($0) +} + +END { + omit_unused_rules() + abbreviate_rule_names() + output_saved_lines() +} |