# zishrink.awk

# Convert tzdata source into a smaller version of itself.

# Contributed by Paul Eggert.  This file is in the public domain.

# This is not a general-purpose converter; it is designed for current tzdata.
# 'zic' should treat this script's output as if it were identical to
# this script's input.

# Register N as the abbreviation standing for NAME in the global
# used_hashes table.  If N is already taken, print a "collision"
# comment line naming both the earlier owner and NAME, then exit
# with status 1.

function record_hash(n, name)
{
  if (!used_hashes[n]) {
    # First use of this abbreviation: remember who owns it.
    used_hashes[n] = name
    return
  }
  printf "# ! collision: %s %s\n", used_hashes[n], name
  exit 1
}

# Derive a short replacement for the rule name NAME, register the
# pairing via record_hash (which exits on a collision), and return
# the short form.

function gen_rule_name(name, \
		       abbr)
{
  # The mnemonic is simply the first two characters of NAME.
  abbr = substr(name, 1, 2)
  record_hash(abbr, name)
  return abbr
}

# Fill the global rule[] table, which maps a full rule name to its
# fixed abbreviation, and then register every abbreviation with
# record_hash so that a duplicate abbreviation is reported.
# NAME is a local loop variable.

function prehash_rule_names( \
			    name)
{
  # Rule names are not part of the tzdb API, so substitute shorter
  # ones.  Shortening them consistently from one release to the next
  # simplifies comparison of the output.  That being said, the
  # 1-letter names below are not standardized in any way, and can
  # change arbitrarily from one release to the next, as the main goal
  # here is compression not comparison.

  # Abbreviating these rule names to one letter saved the most space
  # circa 2018e.
  rule["Arg"] = "A"
  rule["Brazil"] = "B"
  rule["Canada"] = "C"
  rule["Denmark"] = "D"
  rule["EU"] = "E"
  rule["France"] = "F"
  rule["GB-Eire"] = "G"
  rule["Halifax"] = "H"
  rule["Italy"] = "I"
  rule["Jordan"] = "J"
  rule["Egypt"] = "K" # "Kemet" in ancient Egyptian
  rule["Libya"] = "L"
  rule["Morocco"] = "M"
  rule["Neth"] = "N"
  rule["Poland"] = "O" # arbitrary
  rule["Palestine"] = "P"
  rule["Cuba"] = "Q" # Its start sounds like "Q".
  rule["Russia"] = "R"
  rule["Syria"] = "S"
  rule["Turkey"] = "T"
  rule["Uruguay"] = "U"
  rule["Vincennes"] = "V"
  rule["Winn"] = "W"
  rule["Mongol"] = "X" # arbitrary
  rule["NT_YK"] = "Y"
  rule["Zion"] = "Z"
  rule["Austria"] = "a"
  rule["Belgium"] = "b"
  rule["C-Eur"] = "c"
  rule["Algeria"] = "d" # country code DZ
  rule["E-Eur"] = "e"
  rule["Taiwan"] = "f" # Formosa
  rule["Greece"] = "g"
  rule["Hungary"] = "h"
  rule["Iran"] = "i"
  rule["StJohns"] = "j"
  rule["Chatham"] = "k" # arbitrary
  rule["Lebanon"] = "l"
  rule["Mexico"] = "m"
  rule["Tunisia"] = "n" # country code TN
  rule["Moncton"] = "o" # arbitrary
  rule["Port"] = "p"
  rule["Albania"] = "q" # arbitrary
  rule["Regina"] = "r"
  rule["Spain"] = "s"
  rule["Toronto"] = "t"
  rule["US"] = "u"
  rule["Louisville"] = "v" # ville
  rule["Iceland"] = "w" # arbitrary
  rule["Chile"] = "x" # arbitrary
  rule["Para"] = "y" # country code PY
  rule["Romania"] = "z" # arbitrary
  rule["Macau"] = "_" # arbitrary

  # Use ISO 3166 alpha-2 country codes for remaining names that are countries.
  # This is more systematic, and avoids collisions (e.g., Malta and Moldova).
  rule["Armenia"] = "AM"
  rule["Aus"] = "AU"
  rule["Azer"] = "AZ"
  rule["Barb"] = "BB"
  rule["Dhaka"] = "BD"
  rule["Bulg"] = "BG"
  rule["Bahamas"] = "BS"
  rule["Belize"] = "BZ"
  rule["Swiss"] = "CH"
  rule["Cook"] = "CK"
  rule["PRC"] = "CN"
  rule["Cyprus"] = "CY"
  rule["Czech"] = "CZ"
  rule["Germany"] = "DE"
  rule["DR"] = "DO"
  rule["Ecuador"] = "EC"
  rule["Finland"] = "FI"
  rule["Fiji"] = "FJ"
  rule["Falk"] = "FK"
  rule["Ghana"] = "GH"
  rule["Guat"] = "GT"
  rule["Hond"] = "HN"
  rule["Haiti"] = "HT"
  rule["Eire"] = "IE"
  rule["Iraq"] = "IQ"
  rule["Japan"] = "JP"
  rule["Kyrgyz"] = "KG"
  rule["ROK"] = "KR"
  rule["Latvia"] = "LV"
  rule["Lux"] = "LX" # NOTE(review): ISO 3166 alpha-2 for Luxembourg is LU, not LX -- confirm whether LX is intentional
  rule["Moldova"] = "MD"
  rule["Malta"] = "MT"
  rule["Mauritius"] = "MU"
  rule["Namibia"] = "NA"
  rule["Nic"] = "NI"
  rule["Norway"] = "NO"
  rule["Peru"] = "PE"
  rule["Phil"] = "PH"
  rule["Pakistan"] = "PK"
  rule["Sudan"] = "SD"
  rule["Salv"] = "SV"
  rule["Tonga"] = "TO"
  rule["Vanuatu"] = "VU"

  # Avoid collisions.
  rule["Detroit"] = "Dt" # De = Denver

  # Register every abbreviation above; record_hash exits with status 1
  # if two names were given the same abbreviation.
  for (name in rule) {
    record_hash(rule[name], name)
  }
}

# Return FIELD[1] through FIELD[N] joined by single spaces.
# When N is less than 2 the result is just FIELD[1].
# I and JOINED are locals.

function make_line(n, field, \
		   i, joined)
{
  joined = field[1]
  i = 1
  while (++i <= n)
    joined = joined " " field[i]
  return joined
}

# Process the input line LINE and save it for later output.
#
# LINE is stripped of comments, abbreviated in several ways, split into
# fields, and then stored in one of the global tables:
#   rule_output_line[] (with counter nrule_out) for Rule lines,
#   linkdef[] or link_output_line[] (with counter nlink_out) for Link lines,
#   zonedef[] for Zone lines and the lines that continue them.
# It also marks referenced rule names in rule_used[] and updates the
# globals zonename and startdef.  All names after the backslash are locals.

function process_input_line(line, \
			    f, field, end, n, outline, r, \
			    linkline, ruleline, zoneline)
{
  # Remove comments, normalize spaces, and append a space to each line.
  # The appended space lets the patterns below match a final field
  # with the same " word " form as an interior field.
  sub(/#.*/, "", line)
  line = line " "
  gsub(/[\t ]+/, " ", line)

  # Abbreviate keywords and determine line type.
  # sub() returns the number of substitutions, so each flag is 1 or 0.
  linkline = sub(/^Link /, "L ", line)
  ruleline = sub(/^Rule /, "R ", line)
  zoneline = sub(/^Zone /, "Z ", line)

  # Replace FooAsia rules with the same rules without "Asia", as they
  # are duplicates.
  # A Rule line naming FooAsia is dropped entirely; on any other line
  # the four characters "Asia" are cut out (RSTART is the character
  # just before "Asia", RSTART + 5 is the space just after it).
  if (match(line, /[^ ]Asia /)) {
    if (ruleline) return
    line = substr(line, 1, RSTART) substr(line, RSTART + 5)
  }

  # Abbreviate times.
  # First loop: after a ':' or space, drop leading zeros but keep the
  # last digit (e.g. " 02" becomes " 2", ":00" becomes ":0").
  while (match(line, /[: ]0+[0-9]/))
    line = substr(line, 1, RSTART) substr(line, RSTART + RLENGTH - 1)
  # Second loop: drop a ":0" component that is not followed by another ':'.
  while (match(line, /:0[^:]/))
    line = substr(line, 1, RSTART - 1) substr(line, RSTART + 2)

  # Abbreviate weekday names.
  # END is the index just past the match, so END - 1 is the terminating
  # space, '<' or '>'.  Mon/Wed/Fri keep only their first letter.
  while (match(line, / (last)?(Mon|Wed|Fri)[ <>]/)) {
    end = RSTART + RLENGTH
    line = substr(line, 1, end - 4) substr(line, end - 1)
  }
  # Sun/Tue/Thu/Sat keep their first two letters.
  while (match(line, / (last)?(Sun|Tue|Thu|Sat)[ <>]/)) {
    end = RSTART + RLENGTH
    line = substr(line, 1, end - 3) substr(line, end - 1)
  }

  # Abbreviate "max", "min", "only" and month names.
  # Although "max" and "min" can both be abbreviated to just "m",
  # the longer forms "ma" and "mi" are needed with zic 2023d and earlier.
  # Month names without a gsub below are left unchanged.
  gsub(/ max /, dataform == "vanguard" ? " m " : " ma ", line)
  gsub(/ min /, dataform == "vanguard" ? " m " : " mi ", line)
  gsub(/ only /, " o ", line)
  gsub(/ Jan /, " Ja ", line)
  gsub(/ Feb /, " F ", line)
  gsub(/ Apr /, " Ap ", line)
  gsub(/ Aug /, " Au ", line)
  gsub(/ Sep /, " S ", line)
  gsub(/ Oct /, " O ", line)
  gsub(/ Nov /, " N ", line)
  gsub(/ Dec /, " D ", line)

  # Strip leading and trailing space.
  sub(/^ /, "", line)
  sub(/ $/, "", line)

  # Remove unnecessary trailing zero fields.
  sub(/ 0+$/, "", line)

  # Remove unnecessary trailing days-of-month "1".
  # Only a " 1" that directly follows a letter is removed; the letter
  # at RSTART is kept.
  if (match(line, /[A-Za-z] 1$/))
    line = substr(line, 1, RSTART)

  # Remove unnecessary trailing " Ja" (for January).
  sub(/ Ja$/, "", line)

  n = split(line, field)

  # Record which rule names are used.  (Abbreviations are not generated
  # here; that happens later in abbreviate_rule_names.)
  # The rule name is taken from field 4 of a Zone line and field 2 of a
  # continuation line.  For Link and Rule lines F is 0, so R is empty and
  # nothing is recorded.  Values starting with '-', '+' or a digit are
  # not treated as rule names.
  f = zoneline ? 4 : linkline || ruleline ? 0 : 2
  r = field[f]
  if (r ~ /^[^-+0-9]/) {
    rule_used[r] = 1
  }

  # Track the current zone in the global zonename: field 2 of a Zone
  # line, field 3 of a Link line, and empty after a Rule line.
  # Continuation lines leave it unchanged.
  # NOTE(review): startdef is assigned here but not read anywhere in
  # the visible file.
  if (zoneline)
    zonename = startdef = field[2]
  else if (linkline)
    zonename = startdef = field[3]
  else if (ruleline)
    zonename = ""

  # Save the information for later output.
  outline = make_line(n, field)
  if (ruleline)
    rule_output_line[nrule_out++] = outline
  else if (linkline) {
    # In vanguard format with Gawk, links are output sorted by destination.
    # In that case only the pair is kept: linkdef[field 3] = field 2.
    if (dataform == "vanguard" && PROCINFO["version"])
      linkdef[zonename] = field[2]
    else
      link_output_line[nlink_out++] = outline
  }else
    # A Zone line starts a fresh zonedef entry; a continuation line is
    # appended to the current entry after a newline.
    zonedef[zonename] = (zoneline ? "" : zonedef[zonename] "\n") outline
}

# Blank out every saved Rule line whose rule name (its second field)
# was never marked in rule_used[].  Blanked entries stay in
# rule_output_line[] as empty strings.  IDX, PART and NAME are locals.

function omit_unused_rules( \
			   idx, part, name)
{
  idx = -1
  while (++idx < nrule_out) {
    split(rule_output_line[idx], part)
    name = part[2]
    if (rule_used[name])
      continue
    rule_output_line[idx] = ""
  }
}

# Replace full rule names with their abbreviations in the saved output.
# Pass 1 rewrites field 2 of each saved Rule line, generating a new
# abbreviation with gen_rule_name when rule[] has none yet.
# Pass 2 rewrites the rule-name field of each saved zone line (field 4
# of the first line of a zone, field 2 of later lines) when rule[] has
# an abbreviation for it.  All parameters are locals.

function abbreviate_rule_names( \
			       short, col, part, k, nparts, rebuilt, text, name, \
			       zline, nzlines, zone)
{
  # Pass 1: Rule lines.  Entries blanked earlier split into zero fields
  # and are skipped.
  for (k = 0; k < nrule_out; k++) {
    nparts = split(rule_output_line[k], part)
    if (!nparts)
      continue
    name = part[2]
    if (name !~ /^[^-+0-9]/)
      continue
    short = rule[name]
    if (!short)
      short = rule[name] = gen_rule_name(name)
    part[2] = short
    rule_output_line[k] = make_line(nparts, part)
  }

  # Pass 2: zone definitions, each stored as newline-separated lines.
  for (zone in zonedef) {
    nzlines = split(zonedef[zone], zline, /\n/)
    rebuilt = ""
    for (k = 1; k <= nzlines; k++) {
      text = zline[k]
      nparts = split(text, part)
      if (k == 1)
	col = 4
      else
	col = 2
      short = rule[part[col]]
      if (short) {
	part[col] = short
	text = make_line(nparts, part)
      }
      if (rebuilt)
	rebuilt = rebuilt "\n" text
      else
	rebuilt = text
    }
    zonedef[zone] = rebuilt
  }
}

# Print everything that was saved: first the non-empty Rule lines in
# input order, then the zone definitions, then the links.
# Links come from link_output_line[] when any were saved there,
# otherwise from linkdef[].  K and KEY are locals.

function output_saved_lines( \
			    k, key)
{
  k = 0
  while (k < nrule_out) {
    if (rule_output_line[k])
      print rule_output_line[k]
    k++
  }

  # With gawk this makes the loop visit zones sorted by name,
  # which makes the output a bit more compressible.
  PROCINFO["sorted_in"] = "@ind_str_asc"
  for (key in zonedef)
    print zonedef[key]

  if (!nlink_out) {
    # With gawk this makes the loop visit links sorted by destination,
    # which also helps compressibility a bit.
    PROCINFO["sorted_in"] = "@val_type_asc"
    for (key in linkdef)
      printf "L %s %s\n", linkdef[key], key
    return
  }

  for (k = 0; k < nlink_out; k++)
    print link_output_line[k]
}

BEGIN {
  # Files that the output normally depends on.  Each entry starts at 1
  # and is incremented below when the file is listed in 'deps'.
  default_dep["africa"] = 1
  default_dep["antarctica"] = 1
  default_dep["asia"] = 1
  default_dep["australasia"] = 1
  default_dep["backward"] = 1
  default_dep["etcetera"] = 1
  default_dep["europe"] = 1
  default_dep["factory"] = 1
  default_dep["northamerica"] = 1
  default_dep["southamerica"] = 1
  default_dep["ziguard.awk"] = 1
  default_dep["zishrink.awk"] = 1

  # Output a version string from 'version' and related configuration variables
  # supported by tzdb's Makefile.  If you change the makefile or any other files
  # that affect the output of this script, you should append '-SOMETHING'
  # to the contents of 'version', where SOMETHING identifies what was changed.

  # Build 'ddeps', the difference from the default dependency list:
  # each extra file in 'deps' is added as " FILE", and each default
  # file missing from 'deps' is added as " !FILE".
  ddeps = ""
  ndeps = split(deps, dep)
  i = 0
  while (++i <= ndeps) {
    if (!default_dep[dep[i]])
      ddeps = ddeps " " dep[i]
    else
      default_dep[dep[i]]++
  }
  for (d in default_dep)
    if (default_dep[d] == 1)
      ddeps = ddeps " !" d

  # Emit the header comments; non-default settings are reported only
  # when they differ from "main" and "posix_right" respectively.
  print "# version", version
  if (dataform != "main")
    print "# dataform", dataform
  if (redo != "posix_right")
    print "# redo " redo
  if (ddeps)
    print "# ddeps" ddeps
  print "# This zic input file is in the public domain."

  prehash_rule_names()
}

# Process every input line whose first character after optional tabs
# and spaces is not '#'.  Blank lines and comment-only lines do not
# match and are skipped.
/^[\t ]*[^#\t ]/ {
  process_input_line($0)
}

# After all input has been read: blank out Rule lines for rules that
# no zone referenced, abbreviate the remaining rule names, then print
# the saved rules, zones and links.
END {
  omit_unused_rules()
  abbreviate_rule_names()
  output_saved_lines()
}