summaryrefslogtreecommitdiffstats
path: root/zishrink.awk
diff options
context:
space:
mode:
Diffstat (limited to 'zishrink.awk')
-rw-r--r--zishrink.awk388
1 files changed, 388 insertions, 0 deletions
diff --git a/zishrink.awk b/zishrink.awk
new file mode 100644
index 0000000..c98dc6a
--- /dev/null
+++ b/zishrink.awk
@@ -0,0 +1,388 @@
+# Convert tzdata source into a smaller version of itself.
+
+# Contributed by Paul Eggert. This file is in the public domain.
+
+# This is not a general-purpose converter; it is designed for current tzdata.
+# 'zic' should treat this script's output as if it were identical to
+# this script's input.
+
+# Register abbreviation N as belonging to rule NAME in the global
+# used_hashes table.  If N is already taken, print a "# ! collision"
+# comment line naming both claimants and exit with status 1.
+
+function record_hash(n, name)
+{
+  if (!used_hashes[n]) {
+    # First claimant: remember who owns this abbreviation.
+    used_hashes[n] = name
+    return
+  }
+  printf "# ! collision: %s %s\n", used_hashes[n], name
+  exit 1
+}
+
+# Derive an abbreviation for rule NAME: its first two characters.
+# The abbreviation is registered through record_hash (which exits on a
+# clash with an abbreviation already in use) and is then returned.
+# ABBR is a local variable.
+
+function gen_rule_name(name, \
+                       abbr)
+{
+  abbr = substr(name, 1, 2)
+  record_hash(abbr, name)
+  # Debugging aid: printf "# %s = %s\n", abbr, name
+  return abbr
+}
+
+# Fill the global rule[] table, which maps full rule names to their
+# preassigned abbreviations, then register every abbreviation with
+# record_hash.  NAME is a local variable used to walk the table.
+
+function prehash_rule_names( \
+ name)
+{
+ # Rule names are not part of the tzdb API, so substitute shorter
+ # ones. Shortening them consistently from one release to the next
+ # simplifies comparison of the output. That being said, the
+ # 1-letter names below are not standardized in any way, and can
+ # change arbitrarily from one release to the next, as the main goal
+ # here is compression not comparison.
+
+ # Abbreviating these rule names to one letter saved the most space
+ # circa 2018e.
+ rule["Arg"] = "A"
+ rule["Brazil"] = "B"
+ rule["Canada"] = "C"
+ rule["Denmark"] = "D"
+ rule["EU"] = "E"
+ rule["France"] = "F"
+ rule["GB-Eire"] = "G"
+ rule["Halifax"] = "H"
+ rule["Italy"] = "I"
+ rule["Jordan"] = "J"
+ rule["Egypt"] = "K" # "Kemet" in ancient Egyptian
+ rule["Libya"] = "L"
+ rule["Morocco"] = "M"
+ rule["Neth"] = "N"
+ rule["Poland"] = "O" # arbitrary
+ rule["Palestine"] = "P"
+ rule["Cuba"] = "Q" # Its start sounds like "Q".
+ rule["Russia"] = "R"
+ rule["Syria"] = "S"
+ rule["Turkey"] = "T"
+ rule["Uruguay"] = "U"
+ rule["Vincennes"] = "V"
+ rule["Winn"] = "W"
+ rule["Mongol"] = "X" # arbitrary
+ rule["NT_YK"] = "Y"
+ rule["Zion"] = "Z"
+ rule["Austria"] = "a"
+ rule["Belgium"] = "b"
+ rule["C-Eur"] = "c"
+ rule["Algeria"] = "d" # country code DZ
+ rule["E-Eur"] = "e"
+ rule["Taiwan"] = "f" # Formosa
+ rule["Greece"] = "g"
+ rule["Hungary"] = "h"
+ rule["Iran"] = "i"
+ rule["StJohns"] = "j"
+ rule["Chatham"] = "k" # arbitrary
+ rule["Lebanon"] = "l"
+ rule["Mexico"] = "m"
+ rule["Tunisia"] = "n" # country code TN
+ rule["Moncton"] = "o" # arbitrary
+ rule["Port"] = "p"
+ rule["Albania"] = "q" # arbitrary
+ rule["Regina"] = "r"
+ rule["Spain"] = "s"
+ rule["Toronto"] = "t"
+ rule["US"] = "u"
+ rule["Louisville"] = "v" # ville
+ rule["Iceland"] = "w" # arbitrary
+ rule["Chile"] = "x" # arbitrary
+ rule["Para"] = "y" # country code PY
+ rule["Romania"] = "z" # arbitrary
+ rule["Macau"] = "_" # arbitrary
+
+ # Use ISO 3166 alpha-2 country codes for remaining names that are countries.
+ # This is more systematic, and avoids collisions (e.g., Malta and Moldova).
+ rule["Armenia"] = "AM"
+ rule["Aus"] = "AU"
+ rule["Azer"] = "AZ"
+ rule["Barb"] = "BB"
+ rule["Dhaka"] = "BD"
+ rule["Bulg"] = "BG"
+ rule["Bahamas"] = "BS"
+ rule["Belize"] = "BZ"
+ rule["Swiss"] = "CH"
+ rule["Cook"] = "CK"
+ rule["PRC"] = "CN"
+ rule["Cyprus"] = "CY"
+ rule["Czech"] = "CZ"
+ rule["Germany"] = "DE"
+ rule["DR"] = "DO"
+ rule["Ecuador"] = "EC"
+ rule["Finland"] = "FI"
+ rule["Fiji"] = "FJ"
+ rule["Falk"] = "FK"
+ rule["Ghana"] = "GH"
+ rule["Guat"] = "GT"
+ rule["Hond"] = "HN"
+ rule["Haiti"] = "HT"
+ rule["Eire"] = "IE"
+ rule["Iraq"] = "IQ"
+ rule["Japan"] = "JP"
+ rule["Kyrgyz"] = "KG"
+ rule["ROK"] = "KR"
+ rule["Latvia"] = "LV"
+ rule["Lux"] = "LX" # NOTE(review): the ISO 3166 alpha-2 code for Luxembourg is LU; confirm that LX is intended.
+ rule["Moldova"] = "MD"
+ rule["Malta"] = "MT"
+ rule["Mauritius"] = "MU"
+ rule["Namibia"] = "NA"
+ rule["Nic"] = "NI"
+ rule["Norway"] = "NO"
+ rule["Peru"] = "PE"
+ rule["Phil"] = "PH"
+ rule["Pakistan"] = "PK"
+ rule["Sudan"] = "SD"
+ rule["Salv"] = "SV"
+ rule["Tonga"] = "TO"
+ rule["Vanuatu"] = "VU"
+
+ # Avoid collisions.
+ rule["Detroit"] = "Dt" # De = Denver, whose first two letters are also "De"
+
+ # Register every preassigned abbreviation.  record_hash exits if two
+ # table entries share an abbreviation, and it also lets gen_rule_name
+ # detect a clash with any name generated later.
+ for (name in rule) {
+ record_hash(rule[name], name)
+ }
+}
+
+# Return FIELD[1] through FIELD[N] joined by single spaces.
+# IDX and JOINED are local variables.
+
+function make_line(n, field, \
+                   idx, joined)
+{
+  joined = field[1]
+  idx = 1
+  while (++idx <= n)
+    joined = joined " " field[idx]
+  return joined
+}
+
+# Process the input line LINE and save it for later output.
+#
+# LINE is first compressed textually (comment removed, keywords, times,
+# weekday and month names abbreviated, redundant trailing fields dropped)
+# and is then filed into one of the global tables:
+#   rule_output_line[]            Rule lines, in input order
+#   linkdef[] / link_output_line[] Link lines
+#   zonedef[]                     Zone lines plus their continuation lines
+# Rule names referenced by Zone and continuation lines are noted in
+# rule_used[].  All parameters after LINE are local variables.
+
+function process_input_line(line, \
+ f, field, end, n, outline, r, \
+ linkline, ruleline, zoneline)
+{
+ # Remove comments, normalize spaces, and append a space to each line.
+ # The appended space lets the patterns below treat the last field like
+ # any other field (every field is followed by a space).
+ sub(/#.*/, "", line)
+ line = line " "
+ gsub(/[\t ]+/, " ", line)
+
+ # Abbreviate keywords and determine line type.
+ # sub returns the number of substitutions made (0 or 1), so at most one
+ # of these three flags is nonzero; all are zero for a continuation line.
+ linkline = sub(/^Link /, "L ", line)
+ ruleline = sub(/^Rule /, "R ", line)
+ zoneline = sub(/^Zone /, "Z ", line)
+
+ # Replace FooAsia rules with the same rules without "Asia", as they
+ # are duplicates.
+ # A Rule line that defines a FooAsia rule is dropped entirely; on any
+ # other line the reference is rewritten from FooAsia to Foo.
+ # RSTART is the character just before "Asia", so RSTART + 5 skips "Asia".
+ if (match(line, /[^ ]Asia /)) {
+ if (ruleline) return
+ line = substr(line, 1, RSTART) substr(line, RSTART + 5)
+ }
+
+ # Abbreviate times.
+ # First strip leading zeros from numbers that follow ':' or ' '
+ # (" 02" -> " 2", ":00" -> ":0"), keeping the separator and last digit.
+ while (match(line, /[: ]0+[0-9]/))
+ line = substr(line, 1, RSTART) substr(line, RSTART + RLENGTH - 1)
+ # Then delete each ":0" that is not followed by another ':'
+ # ("2:0 " -> "2 ").
+ while (match(line, /:0[^:]/))
+ line = substr(line, 1, RSTART - 1) substr(line, RSTART + 2)
+
+ # Abbreviate weekday names.
+ # END is the position just past the match, whose last character is the
+ # delimiter (' ', '<' or '>').  Mon, Wed and Fri keep only their first
+ # letter.
+ while (match(line, / (last)?(Mon|Wed|Fri)[ <>]/)) {
+ end = RSTART + RLENGTH
+ line = substr(line, 1, end - 4) substr(line, end - 1)
+ }
+ # Sun, Tue, Thu and Sat keep their first two letters.
+ while (match(line, / (last)?(Sun|Tue|Thu|Sat)[ <>]/)) {
+ end = RSTART + RLENGTH
+ line = substr(line, 1, end - 3) substr(line, end - 1)
+ }
+
+ # Abbreviate "max", "min", "only" and month names.
+ # Although "max" and "min" can both be abbreviated to just "m",
+ # the longer forms "ma" and "mi" are needed with zic 2023d and earlier.
+ # Mar, May, Jun and Jul are left unabbreviated.
+ gsub(/ max /, dataform == "vanguard" ? " m " : " ma ", line)
+ gsub(/ min /, dataform == "vanguard" ? " m " : " mi ", line)
+ gsub(/ only /, " o ", line)
+ gsub(/ Jan /, " Ja ", line)
+ gsub(/ Feb /, " F ", line)
+ gsub(/ Apr /, " Ap ", line)
+ gsub(/ Aug /, " Au ", line)
+ gsub(/ Sep /, " S ", line)
+ gsub(/ Oct /, " O ", line)
+ gsub(/ Nov /, " N ", line)
+ gsub(/ Dec /, " D ", line)
+
+ # Strip leading and trailing space.
+ sub(/^ /, "", line)
+ sub(/ $/, "", line)
+
+ # Remove unnecessary trailing zero fields.
+ sub(/ 0+$/, "", line)
+
+ # Remove unnecessary trailing days-of-month "1".
+ # Only done when the preceding field ends in a letter; the letter
+ # itself (at RSTART) is kept.
+ if (match(line, /[A-Za-z] 1$/))
+ line = substr(line, 1, RSTART)
+
+ # Remove unnecessary trailing " Ja" (for January).
+ sub(/ Ja$/, "", line)
+
+ n = split(line, field)
+
+ # Record which rule names are used, and generate their abbreviations.
+ # The rule field is field 4 on a Zone line and field 2 on a continuation
+ # line.  Link and Rule lines select index 0, which split never fills, so
+ # R is empty and nothing is recorded for them.  Values starting with
+ # '-', '+' or a digit are not treated as rule names.
+ f = zoneline ? 4 : linkline || ruleline ? 0 : 2
+ r = field[f]
+ if (r ~ /^[^-+0-9]/) {
+ rule_used[r] = 1
+ }
+
+ # zonename is a global, not a local: continuation lines use the value
+ # left behind by the preceding Zone line.
+ # NOTE(review): startdef is assigned here but is not read anywhere in
+ # this file.
+ if (zoneline)
+ zonename = startdef = field[2]
+ else if (linkline)
+ zonename = startdef = field[3]
+ else if (ruleline)
+ zonename = ""
+
+ # Save the information for later output.
+ outline = make_line(n, field)
+ if (ruleline)
+ rule_output_line[nrule_out++] = outline
+ else if (linkline) {
+ # In vanguard format with Gawk, links are output sorted by destination.
+ # Here linkdef maps field 3 of the Link line to its field 2; otherwise
+ # the whole line is kept in input order.
+ if (dataform == "vanguard" && PROCINFO["version"])
+ linkdef[zonename] = field[2]
+ else
+ link_output_line[nlink_out++] = outline
+ }else
+ # A Zone line starts a fresh definition; a continuation line is
+ # appended to the current zone's definition after a newline.
+ zonedef[zonename] = (zoneline ? "" : zonedef[zonename] "\n") outline
+}
+
+# Blank out every saved Rule line whose rule name (field 2) has no entry
+# set in rule_used[].  Blanked lines stay in rule_output_line[] as empty
+# strings.  IDX and PART are local variables.
+
+function omit_unused_rules( \
+                           idx, part)
+{
+  idx = -1
+  while (++idx < nrule_out) {
+    split(rule_output_line[idx], part)
+    if (rule_used[part[2]])
+      continue
+    rule_output_line[idx] = ""
+  }
+}
+
+# Replace full rule names by their abbreviations in the saved output.
+# Pass 1 rewrites field 2 of each non-empty line in rule_output_line[],
+# generating an abbreviation with gen_rule_name when rule[] has none.
+# Pass 2 rewrites the rule field of every line stored in zonedef[]
+# (field 4 on the first line of a definition, field 2 on later lines)
+# when rule[] holds an abbreviation for it.
+# All parameters are local variables.
+
+function abbreviate_rule_names( \
+                               col, idx, name, nfields, nlines, part, \
+                               piece, rebuilt, short, text, zone)
+{
+  for (idx = 0; idx < nrule_out; idx++) {
+    nfields = split(rule_output_line[idx], part)
+    if (!nfields)
+      continue                  # line was blanked out earlier
+    name = part[2]
+    if (name !~ /^[^-+0-9]/)
+      continue
+    if (!rule[name])
+      rule[name] = gen_rule_name(name)
+    part[2] = rule[name]
+    rule_output_line[idx] = make_line(nfields, part)
+  }
+
+  for (zone in zonedef) {
+    nlines = split(zonedef[zone], piece, /\n/)
+    rebuilt = ""
+    for (idx = 1; idx <= nlines; idx++) {
+      text = piece[idx]
+      nfields = split(text, part)
+      if (idx == 1)
+        col = 4
+      else
+        col = 2
+      short = rule[part[col]]
+      if (short) {
+        part[col] = short
+        text = make_line(nfields, part)
+      }
+      # Rejoin the lines with newlines between them.
+      if (rebuilt)
+        rebuilt = rebuilt "\n"
+      rebuilt = rebuilt text
+    }
+    zonedef[zone] = rebuilt
+  }
+}
+
+# Print everything saved by process_input_line: first the non-empty Rule
+# lines, then the zone definitions, then the links.  Links come from
+# link_output_line[] when any were stored there, otherwise from linkdef[].
+# IDX and KEY are local variables.
+
+function output_saved_lines( \
+                            idx, key)
+{
+  idx = 0
+  while (idx < nrule_out) {
+    if (rule_output_line[idx])
+      print rule_output_line[idx]
+    idx++
+  }
+
+  # When using gawk, output zones sorted by name.
+  # This makes the output a bit more compressible.
+  PROCINFO["sorted_in"] = "@ind_str_asc"
+  for (key in zonedef)
+    print zonedef[key]
+
+  if (!nlink_out) {
+    # When using gawk, output links sorted by destination (the values
+    # held in linkdef).  This also helps compressibility a bit.
+    PROCINFO["sorted_in"] = "@val_type_asc"
+    for (key in linkdef)
+      printf "L %s %s\n", linkdef[key], key
+    return
+  }
+
+  for (idx = 0; idx < nlink_out; idx++)
+    print link_output_line[idx]
+}
+
+# Startup: print the header comment lines and preload the rule table.
+# Reads the variables deps, version, dataform and redo, none of which is
+# set in this file.
+BEGIN {
+  # Files that the output normally depends on.
+  default_names = "africa antarctica asia australasia backward etcetera"
+  default_names = default_names " europe factory northamerica southamerica"
+  default_names = default_names " ziguard.awk zishrink.awk"
+  ndefault = split(default_names, default_name)
+  for (i = 1; i <= ndefault; i++)
+    default_dep[default_name[i]] = 1
+
+  # Output a version string from 'version' and related configuration variables
+  # supported by tzdb's Makefile.  If you change the makefile or any other files
+  # that affect the output of this script, you should append '-SOMETHING'
+  # to the contents of 'version', where SOMETHING identifies what was changed.
+
+  # Build ddeps, the deviation from the default dependency list: each
+  # extra file appears by name, each missing default file as "!name".
+  ndeps = split(deps, dep)
+  ddeps = ""
+  for (i = 1; i <= ndeps; i++) {
+    d = dep[i]
+    if (!default_dep[d]) {
+      ddeps = ddeps " " d
+      continue
+    }
+    default_dep[d]++            # seen: count is now greater than 1
+  }
+  for (d in default_dep)
+    if (default_dep[d] == 1)    # a default file that deps never mentioned
+      ddeps = ddeps " !" d
+
+  print "# version", version
+  if (dataform != "main")
+    print "# dataform", dataform
+  if (redo != "posix_right")
+    print "# redo " redo
+  if (ddeps)
+    print "# ddeps" ddeps
+  print "# This zic input file is in the public domain."
+
+  prehash_rule_names()
+}
+
+# Main rule: pass every input line whose first non-blank character is not
+# '#' to process_input_line.  Blank lines and comment-only lines do not
+# match the pattern and are skipped.
+/^[\t ]*[^#\t ]/ {
+ process_input_line($0)
+}
+
+# After all input has been read: blank out Rule lines whose rule was never
+# recorded as used, abbreviate the remaining rule names, then print
+# everything that was saved.
+END {
+ omit_unused_rules()
+ abbreviate_rule_names()
+ output_saved_lines()
+}