1 files changed, 2008 insertions, 0 deletions
diff --git a/third_party/rust/encoding_rs/generate-encoding-data.py b/third_party/rust/encoding_rs/generate-encoding-data.py
new file mode 100644
index 0000000000..99cec1adc5
--- /dev/null
+++ b/third_party/rust/encoding_rs/generate-encoding-data.py
@@ -0,0 +1,2008 @@
+#!/usr/bin/python
+
+# Copyright Mozilla Foundation. See the COPYRIGHT
+# file at the top-level directory of this distribution.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+import json
+import subprocess
+import sys
+import os.path
+
+# Files this script expects to find in checkouts next to the encoding_rs
+# directory. Each entry pairs the paths that must all exist with the message
+# written to stderr when one of them is missing. Entries are checked in
+# order and the first failure terminates the script with status -1.
+required_checkouts = [
+  (("../encoding/encodings.json", "../encoding/indexes.json"),
+   "This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision f381389) next to the encoding_rs directory.\n"),
+  (("../encoding_c/src/lib.rs",),
+   "This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n"),
+  (("../codepage/src/lib.rs",),
+   "This script also writes the generated parts of the codepage crate and needs a clone of https://github.com/hsivonen/codepage next to the encoding_rs directory.\n"),
+]
+
+for (required_paths, complaint) in required_checkouts:
+  if not all(os.path.isfile(required_path) for required_path in required_paths):
+    sys.stderr.write(complaint)
+    sys.exit(-1)
+
+def cmp_from_end(one, other):
+  """Three-way compare two sequences by length, then item by item from the end.
+
+  Returns -1, 0 or 1. A shorter sequence orders before a longer one.
+  Sequences of equal length are compared starting from their last items
+  and moving towards the first; the first differing pair decides.
+  """
+  length_order = (len(one) > len(other)) - (len(one) < len(other))
+  if length_order:
+    return length_order
+  for (left, right) in zip(reversed(one), reversed(other)):
+    item_order = (left > right) - (left < right)
+    if item_order:
+      return item_order
+  return 0
+
+
+class Label:
+  """A label string paired with the value passed as `preferred`.
+
+  Instances order by their label text using cmp_from_end.
+  """
+
+  def __init__(self, label, preferred):
+    (self.label, self.preferred) = (label, preferred)
+
+  def __cmp__(self, other):
+    # Python 2 comparison hook; only the label text takes part in ordering.
+    own_text = self.label
+    other_text = other.label
+    return cmp_from_end(own_text, other_text)
+
+class CodePage:
+  """A code page number paired with the value passed as `preferred`.
+
+  Instances order by their code page number.
+  """
+
+  def __init__(self, code_page, preferred):
+    self.code_page = code_page
+    self.preferred = preferred
+
+  def __cmp__(self, other):
+    # __cmp__ must return an integer. The previous body returned the
+    # 2-tuple (self.code_page, other.code_page), which makes Python 2
+    # raise TypeError as soon as two instances are compared or sorted.
+    return (self.code_page > other.code_page) - (self.code_page < other.code_page)
+
+def static_u16_table(name, data):
+  """Write `data` to data_file as a `pub static` Rust `[u16; N]` array.
+
+  `name` is the Rust identifier of the array and N is len(data). Each item
+  is written on its own line as a four-digit uppercase hex literal.
+  """
+  opening = '''pub static %s: [u16; %d] = [
+  ''' % (name, len(data))
+  closing = '''];
+
+  '''
+  data_file.write(opening)
+  for value in data:
+    data_file.write('0x%04X,\n' % value)
+  data_file.write(closing)
+
+def static_u16_table_from_indexable(name, data, item, feature):
+  """Write one column of `data` to data_file as a Rust `[u16; N]` array.
+
+  For every entry of `data`, `entry[item]` is written as a four-digit
+  uppercase hex literal. The array is called `name` and is gated on the
+  cargo feature "less-slow-<feature>" being on while "fast-<feature>"
+  is off.
+  """
+  opening = '''#[cfg(all(
+    feature = "less-slow-%s",
+    not(feature = "fast-%s")
+))]
+static %s: [u16; %d] = [
+  ''' % (feature, feature, name, len(data))
+  closing = '''];
+
+  '''
+  data_file.write(opening)
+  for entry in data:
+    data_file.write('0x%04X,\n' % entry[item])
+  data_file.write(closing)
+
+def static_u8_pair_table_from_indexable(name, data, item, feature):
+  """Write one column of byte pairs to data_file as a Rust `[[u8; 2]; N]` array.
+
+  For every entry of `data`, `entry[item]` must be a 2-tuple; it is written
+  as a pair of two-digit uppercase hex literals. The array is called `name`
+  and is gated on the cargo feature "less-slow-<feature>" being on while
+  "fast-<feature>" is off.
+  """
+  opening = '''#[cfg(all(
+    feature = "less-slow-%s",
+    not(feature = "fast-%s")
+))]
+static %s: [[u8; 2]; %d] = [
+  ''' % (feature, feature, name, len(data))
+  closing = '''];
+
+  '''
+  data_file.write(opening)
+  for entry in data:
+    data_file.write('[0x%02X, 0x%02X],\n' % entry[item])
+  data_file.write(closing)
+
+def static_u8_pair_table(name, data, feature):
+  """Write `data` to data_file as a Rust `[[u8; 2]; N]` array gated on `feature`.
+
+  Every item of `data` is either a 2-tuple of byte values or a falsy
+  placeholder (such as None); placeholders are written as [0x00, 0x00].
+  """
+  opening = '''#[cfg(feature = "%s")]
+static %s: [[u8; 2]; %d] = [
+  ''' % (feature, name, len(data))
+  closing = '''];
+
+  '''
+  data_file.write(opening)
+  for pair in data:
+    data_file.write('[0x%02X, 0x%02X],\n' % (pair if pair else (0, 0)))
+  data_file.write(closing)
+
+# Encoding names, filled in from encodings.json further down.
+preferred = []
+
+# Result of to_dom_name() for each encoding name.
+dom = []
+
+# Label objects, one per label of every encoding.
+labels = []
+
+# Parsed encodings.json: a list of groups, each with a "heading" and a list
+# of "encodings". The file is closed as soon as it has been parsed instead
+# of leaving the handle open for the rest of the run.
+with open("../encoding/encodings.json", "r") as encodings_json:
+  data = json.load(encodings_json)
+
+# Parsed indexes.json: maps a lower-case index name to its index.
+with open("../encoding/indexes.json", "r") as indexes_json:
+  indexes = json.load(indexes_json)
+
+# Encodings of the "Legacy single-byte encodings" group.
+single_byte = []
+
+# Encodings of every other group.
+multi_byte = []
+
+def to_camel_name(name):
+  """Turn an encoding name into a CamelCase identifier.
+
+  Lower-case "iso-8859-8-i" becomes "Iso8I" and other names starting with
+  lower-case "iso-8859-" get that prefix replaced by "Iso". Every other
+  name is title-cased and then stripped of "X-", "-" and "_".
+  """
+  iso_prefix = u"iso-8859-"
+  if name == u"iso-8859-8-i":
+    return u"Iso8I"
+  if name.startswith(iso_prefix):
+    return name.replace(iso_prefix, u"Iso")
+  camel = name.title()
+  for dropped in (u"X-", u"-", u"_"):
+    camel = camel.replace(dropped, u"")
+  return camel
+
+def to_constant_name(name):
+  """Return `name` in upper case with every "-" replaced by "_"."""
+  shouted = name.upper()
+  return shouted.replace(u"-", u"_")
+
+def to_snake_name(name):
+  """Return `name` in lower case with every "-" replaced by "_"."""
+  lowered = name.lower()
+  return lowered.replace(u"-", u"_")
+
+def to_dom_name(name):
+  """Return `name` unchanged."""
+  return name
+
+# Guesstimate based on
+# https://w3techs.com/technologies/overview/character_encoding/all
+# whose methodology is known to be bogus, but the results are credible for
+# this purpose. UTF-16LE lifted up due to prevalence on Windows and
+# "ANSI codepages" prioritized.
+encodings_by_code_page_frequency = [
+  "UTF-8",    
+  "UTF-16LE",
+  "windows-1252",
+  "windows-1251",
+  "GBK",
+  "Shift_JIS",
+  "EUC-KR",
+  "windows-1250",
+  "windows-1256",
+  "windows-1254",
+  "Big5",
+  "windows-874",
+  "windows-1255",
+  "windows-1253",
+  "windows-1257",
+  "windows-1258",
+  "EUC-JP",
+  "ISO-8859-2",
+  "ISO-8859-15",
+  "ISO-8859-7",
+  "KOI8-R",
+  "gb18030",
+  "ISO-8859-5",
+  "ISO-8859-8-I",
+  "ISO-8859-4",
+  "ISO-8859-6",
+  "ISO-2022-JP",
+  "KOI8-U",
+  "ISO-8859-13",
+  "ISO-8859-3",
+  "UTF-16BE",
+  "IBM866",
+  "ISO-8859-10",
+  "ISO-8859-8",
+  "macintosh",
+  "x-mac-cyrillic",
+  "ISO-8859-14",
+  "ISO-8859-16",
+]
+
+encodings_by_code_page = {
+  932: "Shift_JIS",
+  936: "GBK",
+  949: "EUC-KR",
+  950: "Big5",
+  866: "IBM866",
+  874: "windows-874",
+  1200: "UTF-16LE",
+  1201: "UTF-16BE",
+  1250: "windows-1250",
+  1251: "windows-1251",
+  1252: "windows-1252",
+  1253: "windows-1253",
+  1254: "windows-1254",
+  1255: "windows-1255",
+  1256: "windows-1256",
+  1257: "windows-1257",
+  1258: "windows-1258",
+  10000: "macintosh",
+  10017: "x-mac-cyrillic",
+  20866: "KOI8-R",
+  20932: "EUC-JP",
+  21866: "KOI8-U",
+  28592: "ISO-8859-2",
+  28593: "ISO-8859-3",
+  28594: "ISO-8859-4",
+  28595: "ISO-8859-5",
+  28596: "ISO-8859-6",
+  28597: "ISO-8859-7",
+  28598: "ISO-8859-8",
+  28600: "ISO-8859-10",
+  28603: "ISO-8859-13",
+  28604: "ISO-8859-14",
+  28605: "ISO-8859-15",
+  28606: "ISO-8859-16",
+  38598: "ISO-8859-8-I",
+  50221: "ISO-2022-JP",
+  54936: "gb18030",
+  65001: "UTF-8",
+}
+
+# Reverse mapping of encodings_by_code_page: encoding name -> code page.
+code_pages_by_encoding = dict(
+  (encoding_name, page_number)
+  for (page_number, encoding_name) in encodings_by_code_page.iteritems()
+)
+
+# Additional code page numbers that map to an encoding which already has a
+# primary code page in encodings_by_code_page (or to "replacement").
+encoding_by_alias_code_page = {
+  951: "Big5",
+  10007: "x-mac-cyrillic",
+  20936: "GBK",
+  20949: "EUC-KR",
+  21010: "UTF-16LE", # Undocumented; needed by calamine for Excel compat
+  28591: "windows-1252",
+  28599: "windows-1254",
+  28601: "windows-874",
+  50220: "ISO-2022-JP",
+  50222: "ISO-2022-JP",
+  50225: "replacement", # ISO-2022-KR
+  50227: "replacement", # ISO-2022-CN
+  # EUC-JP is code page 51932. It was previously keyed as 51949, which
+  # duplicated the EUC-KR key below, so the EUC-JP alias was silently lost.
+  51932: "EUC-JP",
+  51936: "GBK",
+  51949: "EUC-KR",
+  52936: "replacement", # HZ
+}
+
+# Start with the primary code page of each encoding, in the order given by
+# encodings_by_code_page_frequency.
+code_pages = [code_pages_by_encoding[name] for name in encodings_by_code_page_frequency]
+
+encodings_by_code_page.update(encoding_by_alias_code_page)
+
+# Then append every code page not listed yet, in ascending numeric order.
+temp_keys = sorted(encodings_by_code_page.keys())
+for code_page in temp_keys:
+  if code_page in code_pages:
+    continue
+  code_pages.append(code_page)
+
+# The position in the index (0 is the first index entry,
+# i.e. byte value 0x80) that starts the longest run of
+# consecutive code points. Must not be in the first
+# quadrant. If the character to be encoded is not in this
+# run, the part of the index after the run is searched
+# forward. Then the part of the index from 32 to the start
+# of the run. The first quadrant is searched last.
+#
+# If there is no obviously most useful longest run,
+# the index here is just used to affect the search order.
+start_of_longest_run_in_single_byte = {
+  "IBM866": 96, # 0 would be longest, but we don't want to start in the first quadrant
+  "windows-874": 33,
+  "windows-1250": 92,
+  "windows-1251": 64,
+  "windows-1252": 32,
+  "windows-1253": 83,
+  "windows-1254": 95,
+  "windows-1255": 96,
+  "windows-1256": 65,
+  "windows-1257": 95, # not actually longest
+  "windows-1258": 95, # not actually longest
+  "macintosh": 106, # useless
+  "x-mac-cyrillic": 96,
+  "KOI8-R": 64, # not actually longest
+  "KOI8-U": 64, # not actually longest
+  "ISO-8859-2": 95, # not actually longest
+  "ISO-8859-3": 95, # not actually longest
+  "ISO-8859-4": 95, # not actually longest
+  "ISO-8859-5": 46,
+  "ISO-8859-6": 65,
+  "ISO-8859-7": 83,
+  "ISO-8859-8": 96,
+  "ISO-8859-10": 90, # not actually longest
+  "ISO-8859-13": 95, # not actually longest
+  "ISO-8859-14": 95,
+  "ISO-8859-15": 63,
+  "ISO-8859-16": 95, # not actually longest
+}
+
+#
+
+# Walk the groups of encodings.json: the group headed "Legacy single-byte
+# encodings" becomes single_byte, all other groups are pooled in multi_byte.
+# Every encoding name goes to `preferred` and every label becomes a Label
+# that remembers the name of its encoding.
+for group in data:
+  if group["heading"] == "Legacy single-byte encodings":
+    single_byte = group["encodings"]
+  else:
+    multi_byte.extend(group["encodings"])
+  for encoding in group["encodings"]:
+    preferred.append(encoding["name"])
+    for label in encoding["labels"]:
+      labels.append(Label(label, encoding["name"]))
+
+for name in preferred:
+  dom.append(to_dom_name(name))
+
+preferred.sort()
+# Label defines __cmp__ in terms of cmp_from_end, so labels and dom end up
+# sorted with the same comparison: by length, then from the last character.
+labels.sort()
+dom.sort(cmp=cmp_from_end)
+
+# Find the longest encoding name and the longest label. On ties the first
+# candidate in list order wins, because only strictly longer ones replace it.
+longest_label_length = 0
+longest_name_length = 0
+longest_label = None
+longest_name = None
+
+for name in preferred:
+  if len(name) <= longest_name_length:
+    continue
+  longest_name = name
+  longest_name_length = len(name)
+
+for label in labels:
+  if len(label.label) <= longest_label_length:
+    continue
+  longest_label = label.label
+  longest_label_length = len(label.label)
+
+def longest_run_for_single_byte(name):
+  """Measure the run of consecutive code points configured for `name`.
+
+  The run starts at the index position given for the encoding in
+  start_of_longest_run_in_single_byte and extends for as long as each
+  following index entry is exactly one greater than the one before it.
+  ISO-8859-8-I uses the ISO-8859-8 index and start position.
+
+  Returns (code point at the start of the run, start position, run length).
+  """
+  table_name = u"ISO-8859-8" if name == u"ISO-8859-8-I" else name
+  index = indexes[table_name.lower()]
+  run_byte_offset = start_of_longest_run_in_single_byte[table_name]
+  run_bmp_offset = index[run_byte_offset]
+  run_length = 1
+  previous = run_bmp_offset
+  for code_point in index[run_byte_offset + 1:]:
+    if code_point != previous + 1:
+      break
+    previous = code_point
+    run_length += 1
+  return (run_bmp_offset, run_byte_offset, run_length)
+
+def is_single_byte(name):
+  """Return True if an entry of single_byte has `name` as its "name"."""
+  return any(name == encoding["name"] for encoding in single_byte)
+
+def read_non_generated(path):
+  """Return the hand-written parts of the partially generated file at `path`.
+
+  The result is a (head, tail) tuple. `head` is everything up to and
+  including the begin marker; `tail` is everything from the end marker to
+  the end of the file. The text between the markers is not returned.
+
+  If either marker is missing, a message is written to stderr and the
+  script exits with status -1.
+  """
+  generated_begin = "// BEGIN GENERATED CODE. PLEASE DO NOT EDIT."
+  generated_end = "// END GENERATED CODE"
+
+  # `with` closes the file even when reading raises.
+  with open(path, "r") as partially_generated_file:
+    full = partially_generated_file.read()
+
+  generated_begin_index = full.find(generated_begin)
+  if generated_begin_index < 0:
+    sys.stderr.write("Can't find generated code start marker in %s. Exiting.\n" % path)
+    sys.exit(-1)
+  head_end = generated_begin_index + len(generated_begin)
+  # Only an end marker that follows the begin marker delimits the generated
+  # section; an earlier occurrence of the same text would make head and
+  # tail overlap.
+  generated_end_index = full.find(generated_end, head_end)
+  if generated_end_index < 0:
+    sys.stderr.write("Can't find generated code end marker in %s. Exiting.\n" % path)
+    sys.exit(-1)
+
+  return (full[0:head_end],
+          full[generated_end_index:])
+
+(lib_rs_begin, lib_rs_end) = read_non_generated("src/lib.rs")
+
+label_file = open("src/lib.rs", "w")
+
+label_file.write(lib_rs_begin)
+label_file.write("""
+// Instead, please regenerate using generate-encoding-data.py
+
+const LONGEST_LABEL_LENGTH: usize = %d; // %s
+
+""" % (longest_label_length, longest_label))
+
+for name in preferred:
+  variant = None
+  if is_single_byte(name):
+    (run_bmp_offset, run_byte_offset, run_length) = longest_run_for_single_byte(name)
+    variant = "SingleByte(&data::SINGLE_BYTE_DATA.%s, 0x%04X, %d, %d)" % (to_snake_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name), run_bmp_offset, run_byte_offset, run_length)
+  else:
+    variant = to_camel_name(name)
+
+  docfile = open("doc/%s.txt" % name, "r")
+  doctext = docfile.read()
+  docfile.close()
+
+  label_file.write('''/// The initializer for the [%s](static.%s.html) encoding.
+///
+/// For use only for taking the address of this form when
+/// Rust prohibits the use of the non-`_INIT` form directly,
+/// such as in initializers of other `static`s. If in doubt,
+/// use the corresponding non-`_INIT` reference-typed `static`.
+///
+/// This part of the public API will go away if Rust changes
+/// to make the referent of `pub const FOO: &'static Encoding`
+/// unique cross-crate or if Rust starts allowing static arrays
+/// to be initialized with `pub static FOO: &'static Encoding`
+/// items.
+pub static %s_INIT: Encoding = Encoding {
+    name: "%s",
+    variant: VariantEncoding::%s,
+};
+
+/// The %s encoding.
+///
+%s///
+/// This will change from `static` to `const` if Rust changes
+/// to make the referent of `pub const FOO: &'static Encoding`
+/// unique cross-crate, so don't take the address of this
+/// `static`.
+pub static %s: &'static Encoding = &%s_INIT;
+
+''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), doctext, to_constant_name(name), to_constant_name(name)))
+
+label_file.write("""static LABELS_SORTED: [&'static str; %d] = [
+""" % len(labels))
+
+for label in labels:
+  label_file.write('''"%s",\n''' % label.label)
+
+label_file.write("""];
+
+static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; %d] = [
+""" % len(labels))
+
+for label in labels:
+  label_file.write('''&%s_INIT,\n''' % to_constant_name(label.preferred))
+
+label_file.write('''];
+
+''')
+label_file.write(lib_rs_end)
+label_file.close()
+
+label_test_file = open("src/test_labels_names.rs", "w")
+label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the
+// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
+
+// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+// Instead, please regenerate using generate-encoding-data.py
+
+use super::*;
+
+#[test]
+fn test_all_labels() {
+''')
+
+for label in labels:
+  label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred)))
+
+label_test_file.write('''}
+''')
+label_test_file.close()
+
+def null_to_zero(code_point):
+  """Return `code_point`, or 0 if it is falsy (for example None)."""
+  return code_point if code_point else 0
+
+(data_rs_begin, data_rs_end) = read_non_generated("src/data.rs")
+
+data_file = open("src/data.rs", "w")
+data_file.write(data_rs_begin)
+data_file.write('''
+// Instead, please regenerate using generate-encoding-data.py
+
+#[repr(align(64))] // Align to cache lines
+pub struct SingleByteData {
+''')
+
+# Single-byte
+
+for encoding in single_byte:
+  name = encoding["name"]
+  if name == u"ISO-8859-8-I":
+    continue
+
+  data_file.write('''    pub %s: [u16; 128],
+''' % to_snake_name(name))
+
+data_file.write('''}
+
+pub static SINGLE_BYTE_DATA: SingleByteData = SingleByteData {
+''')
+
+for encoding in single_byte:
+  name = encoding["name"]
+  if name == u"ISO-8859-8-I":
+    continue
+
+  data_file.write('''    %s: [
+''' % to_snake_name(name))
+
+  for code_point in indexes[name.lower()]:
+    data_file.write('0x%04X,\n' % null_to_zero(code_point))
+
+  data_file.write('''],
+''')
+
+data_file.write('''};
+
+''')
+
+# Big5
+
+index = indexes["big5"]
+
+# For pointers 942..19781 of the Big5 index, split every code point into a
+# one-bit "is above U+FFFF" flag and its low 16 bits. Unmapped pointers
+# get 0 for both.
+astralness = []
+low_bits = []
+
+for code_point in index[942:19782]:
+  if code_point:
+    astralness.append(1 if code_point > 0xFFFF else 0)
+    low_bits.append(code_point & 0xFFFF)
+  else:
+    astralness.append(0)
+    low_bits.append(0)
+
+# Pad the length to a multiple of 32. `-len % 32` is 0 when the length is
+# already aligned, so no spurious all-zero word gets appended in that case.
+astralness.extend([0] * (-len(astralness) % 32))
+
+# Floor division keeps the array length an integer regardless of how `/`
+# behaves for integers.
+data_file.write('''#[cfg_attr(feature = "cargo-clippy", allow(unreadable_literal))]
+static BIG5_ASTRALNESS: [u32; %d] = [
+''' % (len(astralness) // 32))
+
+# Pack 32 flags into each u32; bit j of a word holds flag i + j.
+i = 0
+while i < len(astralness):
+  accu = 0
+  for j in xrange(32):
+    accu |= astralness[i + j] << j
+  data_file.write('0x%08X,\n' % accu)
+  i += 32
+
+data_file.write('''];
+
+''')
+
+static_u16_table("BIG5_LOW_BITS", low_bits)
+
+# Encoder table for Level 1 Hanzi
+# Note: If we were OK with doubling this table, we
+# could use a directly-indexable table instead...
+level1_hanzi_index = index[5495:10896]
+level1_hanzi_pairs = []
+for i in xrange(len(level1_hanzi_index)):
+  hanzi_lead = (i / 157) + 0xA4
+  hanzi_trail = (i % 157)
+  hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62
+  level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
+level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B)))
+level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D)))
+level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1)))
+level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2)))
+level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3)))
+level1_hanzi_pairs.sort(key=lambda x: x[0])
+
+static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "big5-hanzi-encode")
+static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "big5-hanzi-encode")
+
+# Fast Unified Ideograph encode
+big5_unified_ideograph_bytes = [None] * (0x9FCC - 0x4E00)
+for row in xrange(0x7E - 0x20):
+  for column in xrange(157):
+    pointer = 5024 + column + (row * 157)
+    code_point = index[pointer]
+    if code_point and code_point >= 0x4E00 and code_point <= 0x9FCB:
+      unified_offset = code_point - 0x4E00
+      unified_lead = 0xA1 + row
+      unified_trail = (0x40 if column < 0x3F else 0x62) + column
+      if code_point == 0x5341 or code_point == 0x5345 or not big5_unified_ideograph_bytes[unified_offset]:
+        big5_unified_ideograph_bytes[unified_offset] = (unified_lead, unified_trail)
+
+static_u8_pair_table("BIG5_UNIFIED_IDEOGRAPH_BYTES", big5_unified_ideograph_bytes, "fast-big5-hanzi-encode")
+
+# JIS0208
+
+index = indexes["jis0208"]
+
+# JIS 0208 Level 1 Kanji
+static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375])
+
+# JIS 0208 Level 2 Kanji and Additional Kanji
+static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808])
+
+# IBM Kanji
+static_u16_table("IBM_KANJI", index[8272:8632])
+
+# Check that the other instance is the same. `Error` is not defined in this
+# script, so a mismatch used to surface as a NameError; raise an exception
+# that says what is actually wrong.
+if index[8272:8632] != index[10744:11104]:
+  raise ValueError("jis0208 index: pointers 8272-8631 and 10744-11103 differ")
+
+# JIS 0208 symbols (all non-Kanji, non-range items)
+symbol_index = []
+symbol_triples = []
+pointers_to_scan = [
+  (0, 188),
+  (658, 691),
+  (1159, 1221),
+]
+in_run = False
+run_start_pointer = 0
+run_start_array_index = 0
+for (start, end) in pointers_to_scan:
+  for i in range(start, end):
+    code_point = index[i]
+    if in_run:
+      if code_point:
+        symbol_index.append(code_point)
+      else:
+        symbol_triples.append(run_start_pointer)
+        symbol_triples.append(i - run_start_pointer)
+        symbol_triples.append(run_start_array_index)
+        in_run = False
+    else:
+      if code_point:
+        in_run = True
+        run_start_pointer = i
+        run_start_array_index = len(symbol_index)
+        symbol_index.append(code_point)
+  if in_run:
+    symbol_triples.append(run_start_pointer)
+    symbol_triples.append(end - run_start_pointer)
+    symbol_triples.append(run_start_array_index)
+    in_run = False
+# Sanity check: every scanned range closes its last run before moving on.
+# (`Error` is not defined in this script, so name a real exception.)
+if in_run:
+  raise ValueError("JIS0208 symbol scan ended inside a run")
+
+# Now add manually the two overlapping slices of
+# index from the NEC/IBM extensions.
+run_start_array_index = len(symbol_index)
+symbol_index.extend(index[10736:10744])
+# Later
+symbol_triples.append(10736)
+symbol_triples.append(8)
+symbol_triples.append(run_start_array_index)
+# Earlier
+symbol_triples.append(8644)
+symbol_triples.append(4)
+symbol_triples.append(run_start_array_index)
+
+static_u16_table("JIS0208_SYMBOLS", symbol_index)
+static_u16_table("JIS0208_SYMBOL_TRIPLES", symbol_triples)
+
+# Write down the magic numbers needed when preferring the earlier case
+data_file.write('''const IBM_SYMBOL_START: usize = %d;''' % (run_start_array_index + 1))
+data_file.write('''const IBM_SYMBOL_END: usize = %d;''' % (run_start_array_index + 4))
+data_file.write('''const IBM_SYMBOL_POINTER_START: usize = %d;''' % 8645)
+
+# JIS 0208 ranges (excluding kana)
+range_triples = []
+pointers_to_scan = [
+  (188, 281),
+  (470, 657),
+  (1128, 1159),
+  (8634, 8644),
+  (10716, 10736),
+]
+in_run = False
+run_start_pointer = 0
+run_start_code_point = 0
+previous_code_point = 0
+for (start, end) in pointers_to_scan:
+  for i in range(start, end):
+    code_point = index[i]
+    if in_run:
+      if code_point:
+        if previous_code_point + 1 != code_point:
+          range_triples.append(run_start_pointer)
+          range_triples.append(i - run_start_pointer)
+          range_triples.append(run_start_code_point)
+          run_start_pointer = i
+          run_start_code_point = code_point
+        previous_code_point = code_point
+      else:
+          range_triples.append(run_start_pointer)
+          range_triples.append(i - run_start_pointer)
+          range_triples.append(run_start_code_point)
+          run_start_pointer = 0
+          run_start_code_point = 0
+          previous_code_point = 0
+          in_run = False
+    else:
+      if code_point:
+        in_run = True
+        run_start_pointer = i
+        run_start_code_point = code_point
+        previous_code_point = code_point
+  if in_run:
+    range_triples.append(run_start_pointer)
+    range_triples.append(end - run_start_pointer)
+    range_triples.append(run_start_code_point)
+    run_start_pointer = 0
+    run_start_code_point = 0
+    previous_code_point = 0
+    in_run = False
+# Sanity check: every scanned range closes its last run before moving on.
+# (`Error` is not defined in this script, so name a real exception.)
+if in_run:
+  raise ValueError("JIS0208 range scan ended inside a run")
+
+static_u16_table("JIS0208_RANGE_TRIPLES", range_triples)
+
+# Encoder table for Level 1 Kanji
+# Note: If we were OK with 30 KB more footprint, we
+# could use a directly-indexable table instead...
+level1_kanji_index = index[1410:4375]
+level1_kanji_pairs = []
+for i in xrange(len(level1_kanji_index)):
+  pointer = 1410 + i
+  (lead, trail) = divmod(pointer, 188)
+  lead += 0x81 if lead < 0x1F else 0xC1
+  trail += 0x40 if trail < 0x3F else 0x41
+  level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail)))
+level1_kanji_pairs.sort(key=lambda x: x[0])
+
+static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "kanji-encode")
+static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "kanji-encode")
+
+# Fast encoder table for Kanji
+kanji_bytes = [None] * (0x9FA1 - 0x4E00)
+for pointer in xrange(len(index)):
+  code_point = index[pointer]
+  if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0:
+    (lead, trail) = divmod(pointer, 188)
+    lead += 0x81 if lead < 0x1F else 0xC1
+    trail += 0x40 if trail < 0x3F else 0x41
+    # unset the high bit of lead if IBM Kanji
+    if pointer >= 8272:
+      lead = lead & 0x7F
+    kanji_bytes[code_point - 0x4E00] = (lead, trail)
+
+static_u8_pair_table("JIS0208_KANJI_BYTES", kanji_bytes, "fast-kanji-encode")
+
+# ISO-2022-JP half-width katakana
+
+# index is still jis0208
+half_width_index = indexes["iso-2022-jp-katakana"]
+
+data_file.write('''pub static ISO_2022_JP_HALF_WIDTH_TRAIL: [u8; %d] = [
+''' % len(half_width_index))
+
+for i in xrange(len(half_width_index)):
+  code_point = half_width_index[i]
+  pointer = index.index(code_point)
+  trail = pointer % 94 + 0x21
+  data_file.write('0x%02X,\n' % trail)
+
+data_file.write('''];
+
+''')
+
+# EUC-KR
+
+index = indexes["euc-kr"]
+
+# Unicode 1.1 Hangul above the old KS X 1001 block
+# Compressed form takes 35% of uncompressed form
+#
+# Code points in this area must be ascending. A (pointer, code point) pair
+# is recorded wherever the code point is not exactly one more than the
+# previous one. The recorded pointers are renumbered as if the two
+# six-column gaps of each row did not exist (178 columns per row).
+pointers = []
+offsets = []
+previous_code_point = 0
+for row in xrange(0x20):
+  for column in xrange(190):
+    i = column + (row * 190)
+    # Skip the gaps
+    if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
+      continue
+    code_point = index[i]
+    if previous_code_point > code_point:
+      # `Error` is not defined in this script; raise a real exception.
+      raise ValueError("euc-kr index is not ascending at pointer %d" % i)
+    if code_point - previous_code_point != 1:
+      adjustment = 0
+      if column >= 0x40:
+        adjustment = 12
+      elif column >= 0x20:
+        adjustment = 6
+      pointers.append(column - adjustment + (row * (190 - 12)))
+      offsets.append(code_point)
+    previous_code_point = code_point
+
+static_u16_table("CP949_TOP_HANGUL_POINTERS", pointers)
+static_u16_table("CP949_TOP_HANGUL_OFFSETS", offsets)
+
+# Unicode 1.1 Hangul to the left of the old KS X 1001 block
+#
+# Covers the first 96 columns of each row starting at pointer 6080. Code
+# points must be ascending. A (pointer, code point) pair is recorded
+# wherever the code point is not exactly one more than the previous one.
+# The recorded pointers are renumbered as if the two six-column gaps of
+# each row did not exist (84 columns per row).
+pointers = []
+offsets = []
+previous_code_point = 0
+for row in xrange(0x46 - 0x20):
+  for column in xrange(190 - 94):
+    i = 6080 + column + (row * 190)
+    # Skip the gaps
+    if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
+      continue
+    if i > 13127:
+      # Exclude unassigned on partial last row
+      break
+    code_point = index[i]
+    if previous_code_point > code_point:
+      # `Error` is not defined in this script; raise a real exception.
+      raise ValueError("euc-kr index is not ascending at pointer %d" % i)
+    if code_point - previous_code_point != 1:
+      adjustment = 0
+      if column >= 0x40:
+        adjustment = 12
+      elif column >= 0x20:
+        adjustment = 6
+      pointers.append(column - adjustment + (row * (190 - 94 - 12)))
+      offsets.append(code_point)
+    previous_code_point = code_point
+
+static_u16_table("CP949_LEFT_HANGUL_POINTERS", pointers)
+static_u16_table("CP949_LEFT_HANGUL_OFFSETS", offsets)
+
+# KS X 1001 Hangul
+#
+# Copies 94 columns from each of the rows starting at pointer 9026 into a
+# flat table, checking that the code points are strictly ascending.
+hangul_index = []
+previous_code_point = 0
+for row in xrange(0x48 - 0x2F):
+  for column in xrange(94):
+    code_point = index[9026 + column + (row * 190)]
+    if previous_code_point >= code_point:
+      # `Error` is not defined in this script; raise a real exception.
+      raise ValueError("euc-kr index is not strictly ascending at pointer %d" % (9026 + column + (row * 190)))
+    hangul_index.append(code_point)
+    previous_code_point = code_point
+
+static_u16_table("KSX1001_HANGUL", hangul_index)
+
+# KS X 1001 Hanja
+hanja_index = []
+for row in xrange(0x7D - 0x49):
+  for column in xrange(94):
+    hanja_index.append(index[13966 + column + (row * 190)])
+
+static_u16_table("KSX1001_HANJA", hanja_index)
+
+# KS X 1001 symbols
+symbol_index = []
+for i in range(6176, 6270):
+  symbol_index.append(index[i])
+for i in range(6366, 6437):
+  symbol_index.append(index[i])
+
+static_u16_table("KSX1001_SYMBOLS", symbol_index)
+
+# KS X 1001 Uppercase Latin
+subindex = []
+for i in range(7506, 7521):
+  subindex.append(null_to_zero(index[i]))
+
+static_u16_table("KSX1001_UPPERCASE", subindex)
+
+# KS X 1001 Lowercase Latin
+subindex = []
+for i in range(7696, 7712):
+  subindex.append(index[i])
+
+static_u16_table("KSX1001_LOWERCASE", subindex)
+
+# KS X 1001 Box drawing
+subindex = []
+for i in range(7126, 7194):
+  subindex.append(index[i])
+
+static_u16_table("KSX1001_BOX", subindex)
+
+# KS X 1001 other
+pointers = []
+offsets = []
+previous_code_point = 0
+for row in xrange(10):
+  for column in xrange(94):
+    i = 6556 + column + (row * 190)
+    code_point = index[i]
+    # Exclude ranges that were processed as lookup tables
+    # or that contain unmapped cells by filling them with
+    # ASCII. Upon encode, ASCII code points will
+    # never appear as the search key.
+    if (i >= 6946 and i <= 6950):
+      code_point = i - 6946
+    elif (i >= 6961 and i <= 6967):
+      code_point = i - 6961
+    elif (i >= 6992 and i <= 6999):
+      code_point = i - 6992
+    elif (i >= 7024 and i <= 7029):
+      code_point = i - 7024
+    elif (i >= 7126 and i <= 7219):
+      code_point = i - 7126
+    elif (i >= 7395 and i <= 7409):
+      code_point = i - 7395
+    elif (i >= 7506 and i <= 7521):
+      code_point = i - 7506
+    elif (i >= 7696 and i <= 7711):
+      code_point = i - 7696
+    elif (i >= 7969 and i <= 7979):
+      code_point = i - 7969
+    elif (i >= 8162 and i <= 8169):
+      code_point = i - 8162
+    elif (i >= 8299 and i <= 8313):
+      code_point = i - 8299
+    elif (i >= 8347 and i <= 8359):
+      code_point = i - 8347
+    if code_point - previous_code_point != 1:
+      pointers.append(column + (row * 94))
+      offsets.append(code_point)
+    previous_code_point = code_point
+
+static_u16_table("KSX1001_OTHER_POINTERS", pointers)
+# Omit the last offset, because the end of the last line
+# is unmapped, so we don't want to look at it.
+static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1])
+
+# Fast Hangul and Hanja encode
+hangul_bytes = [None] * (0xD7A4 - 0xAC00)
+hanja_unified_bytes = [None] * (0x9F9D - 0x4E00)
+hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900)
+for row in xrange(0x7D):
+  for column in xrange(190):
+    pointer = column + (row * 190)
+    code_point = index[pointer]
+    if code_point:
+      lead = 0x81 + row
+      trail = 0x41 + column
+      if code_point >= 0xAC00 and code_point < 0xD7A4:
+        hangul_bytes[code_point - 0xAC00] = (lead, trail)
+      elif code_point >= 0x4E00 and code_point < 0x9F9D:
+        hanja_unified_bytes[code_point - 0x4E00] = (lead, trail)
+      elif code_point >= 0xF900 and code_point < 0xFA0C:
+        hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail)
+
+static_u8_pair_table("CP949_HANGUL_BYTES", hangul_bytes, "fast-hangul-encode")
+static_u8_pair_table("KSX1001_UNIFIED_HANJA_BYTES", hanja_unified_bytes, "fast-hanja-encode")
+static_u8_pair_table("KSX1001_COMPATIBILITY_HANJA_BYTES", hanja_compatibility_bytes, "fast-hanja-encode")
+
+# JIS 0212
+
+index = indexes["jis0212"]
+
+# JIS 0212 Kanji
+static_u16_table("JIS0212_KANJI", index[1410:7211])
+
+# JIS 0212 accented (all non-Kanji, non-range items)
+symbol_index = []
+symbol_triples = []
+pointers_to_scan = [
+  (0, 596),
+  (608, 644),
+  (656, 1409),
+]
+in_run = False
+run_start_pointer = 0
+run_start_array_index = 0
+for (start, end) in pointers_to_scan:
+  for i in range(start, end):
+    code_point = index[i]
+    if in_run:
+      if code_point:
+        symbol_index.append(code_point)
+      elif index[i + 1]:
+        symbol_index.append(0)
+      else:
+        symbol_triples.append(run_start_pointer)
+        symbol_triples.append(i - run_start_pointer)
+        symbol_triples.append(run_start_array_index)
+        in_run = False
+    else:
+      if code_point:
+        in_run = True
+        run_start_pointer = i
+        run_start_array_index = len(symbol_index)
+        symbol_index.append(code_point)
+  if in_run:
+    symbol_triples.append(run_start_pointer)
+    symbol_triples.append(end - run_start_pointer)
+    symbol_triples.append(run_start_array_index)
+    in_run = False
+# Sanity check: every scanned range closes its last run before moving on.
+# (`Error` is not defined in this script, so name a real exception.)
+if in_run:
+  raise ValueError("JIS0212 accented scan ended inside a run")
+
+static_u16_table("JIS0212_ACCENTED", symbol_index)
+static_u16_table("JIS0212_ACCENTED_TRIPLES", symbol_triples)
+
+# gb18030
+
+index = indexes["gb18030"]
+
+# Unicode 1.1 ideographs above the old GB2312 block
+# Compressed form takes 63% of uncompressed form
+#
+# Code points of the first 6080 pointers must be ascending. A (pointer,
+# code point) pair is recorded wherever the code point is not exactly one
+# more than the previous one.
+pointers = []
+offsets = []
+previous_code_point = 0
+for i in xrange(6080):
+  code_point = index[i]
+  if previous_code_point > code_point:
+    # `Error` is not defined in this script; raise a real exception.
+    raise ValueError("gb18030 index is not ascending at pointer %d" % i)
+  if code_point - previous_code_point != 1:
+    pointers.append(i)
+    offsets.append(code_point)
+  previous_code_point = code_point
+
+static_u16_table("GBK_TOP_IDEOGRAPH_POINTERS", pointers)
+static_u16_table("GBK_TOP_IDEOGRAPH_OFFSETS", offsets)
+
+# Unicode 1.1 ideographs to the left of the old GB2312 block
+# Compressed form takes 40% of uncompressed form
+#
+# Covers the first 96 columns of each row starting at pointer 7790. Code
+# points must be ascending. A (pointer, code point) pair is recorded
+# wherever the code point is not exactly one more than the previous one;
+# recorded pointers count 96 columns per row.
+pointers = []
+offsets = []
+previous_code_point = 0
+for row in xrange(0x7D - 0x29):
+  for column in xrange(190 - 94):
+    i = 7790 + column + (row * 190)
+    if i > 23650:
+      # Exclude compatibility ideographs at the end
+      break
+    code_point = index[i]
+    if previous_code_point > code_point:
+      # `Error` is not defined in this script; raise a real exception.
+      raise ValueError("gb18030 index is not ascending at pointer %d" % i)
+    if code_point - previous_code_point != 1:
+      pointers.append(column + (row * (190 - 94)))
+      offsets.append(code_point)
+    previous_code_point = code_point
+
+static_u16_table("GBK_LEFT_IDEOGRAPH_POINTERS", pointers)
+static_u16_table("GBK_LEFT_IDEOGRAPH_OFFSETS", offsets)
+
+# GBK other (excl. Ext A, Compat & PUA at the bottom)
+pointers = []
+offsets = []
+previous_code_point = 0
+for row in xrange(0x29 - 0x20):
+  for column in xrange(190 - 94):
+    i = 6080 + column + (row * 190)
+    code_point = index[i]
+    if code_point - previous_code_point != 1:
+      pointers.append(column + (row * (190 - 94)))
+      offsets.append(code_point)
+    previous_code_point = code_point
+
+pointers.append((190 - 94) * (0x29 - 0x20))
+static_u16_table("GBK_OTHER_POINTERS", pointers)
+static_u16_table("GBK_OTHER_UNSORTED_OFFSETS", offsets)
+
+# GBK bottom: Compatibility ideographs, Ext A and PUA
+bottom_index = []
+# 5 compat following Unified Ideographs
+for i in range(23651, 23656):
+  bottom_index.append(index[i])
+# Last row
+for i in range(23750, 23846):
+  bottom_index.append(index[i])
+
+static_u16_table("GBK_BOTTOM", bottom_index)
+
+# GB2312 Hanzi
+# (and the 5 PUA code points in between Level 1 and Level 2)
+hanzi_index = []
+for row in xrange(0x77 - 0x2F):
+  for column in xrange(94):
+    hanzi_index.append(index[9026 + column + (row * 190)])
+
+static_u16_table("GB2312_HANZI", hanzi_index)
+
+# GB2312 symbols
+symbol_index = []
+for i in xrange(94):
+  symbol_index.append(index[6176 + i])
+
+static_u16_table("GB2312_SYMBOLS", symbol_index)
+
+# GB2312 symbols on Greek row (incl. PUA)
+symbol_index = []
+for i in xrange(22):
+  symbol_index.append(index[7189 + i])
+
+static_u16_table("GB2312_SYMBOLS_AFTER_GREEK", symbol_index)
+
+# GB2312 Pinyin
+pinyin_index = []
+for i in xrange(32):
+  pinyin_index.append(index[7506 + i])
+
+static_u16_table("GB2312_PINYIN", pinyin_index)
+
+# GB2312 other (excl. bottom PUA)
+pointers = []
+offsets = []
+previous_code_point = 0
+for row in xrange(14):
+  for column in xrange(94):
+    i = 6366 + column + (row * 190)
+    code_point = index[i]
+    # Exclude the two ranges that were processed as
+    # lookup tables above by filling them with
+    # ASCII. Upon encode, ASCII code points will
+    # never appear as the search key.
+    if (i >= 7189 and i < 7189 + 22):
+      code_point = i - 7189
+    elif (i >= 7506 and i < 7506 + 32):
+      code_point = i - 7506
+    if code_point - previous_code_point != 1:
+      pointers.append(column + (row * 94))
+      offsets.append(code_point)
+    previous_code_point = code_point
+
+pointers.append(14 * 94)
+static_u16_table("GB2312_OTHER_POINTERS", pointers)
+static_u16_table("GB2312_OTHER_UNSORTED_OFFSETS", offsets)
+
+# Non-gbk code points
+pointers = []
+offsets = []
+for pair in indexes["gb18030-ranges"]:
+  if pair[1] == 0x10000:
+    break # the last entry doesn't fit in u16
+  pointers.append(pair[0])
+  offsets.append(pair[1])
+
+static_u16_table("GB18030_RANGE_POINTERS", pointers)
+static_u16_table("GB18030_RANGE_OFFSETS", offsets)
+
+# Encoder table for Level 1 Hanzi
+# The units here really fit into 12 bits, but since we're
+# looking for speed here, let's use 16 bits per unit.
+# Once we use 16 bits per unit, we might as well precompute
+# the output bytes.
+level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)]
+level1_hanzi_pairs = []
+for i in xrange(len(level1_hanzi_index)):
+  hanzi_lead = (i / 94) + 0xB0
+  hanzi_trail = (i % 94) + 0xA1
+  level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
+level1_hanzi_pairs.sort(key=lambda x: x[0])
+
+static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "gb-hanzi-encode")
+static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "gb-hanzi-encode")
+
+# Fast Hanzi encoder table
+hanzi_bytes = [None] * (0x9FA7 - 0x4E00)
+for row in xrange(126):
+  for column in xrange(190):
+    pointer = column + (row * 190)
+    code_point = index[pointer]
+    if code_point and code_point >= 0x4E00 and code_point <= 0x9FA6:
+      hanzi_lead = 0x81 + row
+      hanzi_trail = column + (0x40 if column < 0x3F else 0x41)
+      hanzi_bytes[code_point - 0x4E00] = (hanzi_lead, hanzi_trail)
+
+static_u8_pair_table("GBK_HANZI_BYTES", hanzi_bytes, "fast-gb-hanzi-encode")
+
+data_file.write(data_rs_end)
+
+data_file.close()
+
+# Variant
+
+variant_file = open("src/variant.rs", "w")
+variant_file.write('''// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+// Instead, please regenerate using generate-encoding-data.py
+
+//! This module provides enums that wrap the various decoders and encoders.
+//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
+//! dispatch explicitly for a finite set of specialized decoders and encoders.
+//! Unfortunately, this means the compiler doesn't generate the dispatch code
+//! and it has to be written here instead.
+//!
+//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
+//! allocation in Rust code, including the convenience methods on `Encoding`.
+
+''')
+
+encoding_variants = [u"single-byte",]
+for encoding in multi_byte:
+  if encoding["name"] in [u"UTF-16LE", u"UTF-16BE"]:
+    continue
+  else:
+    encoding_variants.append(encoding["name"])
+encoding_variants.append(u"UTF-16")
+
+decoder_variants = []
+for variant in encoding_variants:
+  if variant == u"GBK":
+    continue
+  decoder_variants.append(variant)
+
+encoder_variants = []
+for variant in encoding_variants:
+  if variant in [u"replacement", u"GBK", u"UTF-16"]:
+    continue
+  encoder_variants.append(variant)
+
+for variant in decoder_variants:
+  variant_file.write("use %s::*;\n" % to_snake_name(variant))
+
+variant_file.write('''use super::*;
+
+pub enum VariantDecoder {
+''')
+
+for variant in decoder_variants:
+  variant_file.write("   %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant)))
+
+variant_file.write('''}
+
+impl VariantDecoder {
+''')
+
+def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind):
+  variant_file.write('''pub fn %s(&''' % name)
+  if mut:
+    variant_file.write('''mut ''')
+  variant_file.write('''self''')
+  for arg in arg_list:
+    variant_file.write(''', %s: %s''' % (arg[0], arg[1]))
+  variant_file.write(''')''')
+  if ret:
+    variant_file.write(''' -> %s''' % ret)
+  variant_file.write(''' {\nmatch *self {\n''')
+  for variant in variants:
+    variant_file.write('''Variant%s::%s(ref ''' % (kind, to_camel_name(variant)))
+    if mut:
+      variant_file.write('''mut ''')
+    if variant in excludes:
+      variant_file.write('''v) => (),''')
+      continue
+    variant_file.write('''v) => v.%s(''' % name)
+    first = True
+    for arg in arg_list:
+      if not first:
+        variant_file.write(''', ''')
+      first = False
+      variant_file.write(arg[0])
+    variant_file.write('''),\n''')
+  variant_file.write('''}\n}\n\n''')
+
+write_variant_method("max_utf16_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")
+
+write_variant_method("max_utf8_buffer_length_without_replacement", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")
+
+write_variant_method("max_utf8_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")
+
+write_variant_method("decode_to_utf16_raw", True, [("src", "&[u8]"),
+                           ("dst", "&mut [u16]"),
+                           ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")
+
+write_variant_method("decode_to_utf8_raw", True, [("src", "&[u8]"),
+                           ("dst", "&mut [u8]"),
+                           ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")
+
+variant_file.write('''
+
+    pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> {
+        match *self {
+            VariantDecoder::SingleByte(ref v) => {
+                return Some(v.latin1_byte_compatible_up_to(buffer));
+            }
+            VariantDecoder::Utf8(ref v) => {
+                if !v.in_neutral_state() {
+                    return None;
+                }
+            }
+            VariantDecoder::Gb18030(ref v) => {
+                if !v.in_neutral_state() {
+                    return None;
+                }
+            }
+            VariantDecoder::Big5(ref v) => {
+                if !v.in_neutral_state() {
+                    return None;
+                }
+            }
+            VariantDecoder::EucJp(ref v) => {
+                if !v.in_neutral_state() {
+                    return None;
+                }
+            }
+            VariantDecoder::Iso2022Jp(ref v) => {
+                if v.in_neutral_state() {
+                    return Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer));
+                }
+                return None;
+            }
+            VariantDecoder::ShiftJis(ref v) => {
+                if !v.in_neutral_state() {
+                    return None;
+                }
+            }
+            VariantDecoder::EucKr(ref v) => {
+                if !v.in_neutral_state() {
+                    return None;
+                }
+            }
+            VariantDecoder::UserDefined(_) => {}
+            VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => {
+                return None;
+            }
+        };
+        Some(Encoding::ascii_valid_up_to(buffer))
+    }
+}
+
+pub enum VariantEncoder {
+''')
+
+for variant in encoder_variants:
+  variant_file.write("   %s(%sEncoder),\n" % (to_camel_name(variant), to_camel_name(variant)))
+
+variant_file.write('''}
+
+impl VariantEncoder {
+    pub fn has_pending_state(&self) -> bool {
+        match *self {
+            VariantEncoder::Iso2022Jp(ref v) => {
+                v.has_pending_state()
+            }
+            _ => false,
+        }
+    }
+''')
+
+write_variant_method("max_buffer_length_from_utf16_without_replacement", False, [("u16_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")
+
+write_variant_method("max_buffer_length_from_utf8_without_replacement", False, [("byte_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")
+
+write_variant_method("encode_from_utf16_raw", True, [("src", "&[u16]"),
+                           ("dst", "&mut [u8]"),
+                           ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")
+
+write_variant_method("encode_from_utf8_raw", True, [("src", "&str"),
+                           ("dst", "&mut [u8]"),
+                           ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")
+
+
+variant_file.write('''}
+
+pub enum VariantEncoding {
+    SingleByte(&'static [u16; 128], u16, u8, u8),''')
+
+for encoding in multi_byte:
+  variant_file.write("%s,\n" % to_camel_name(encoding["name"]))
+
+variant_file.write('''}
+
+impl VariantEncoding {
+    pub fn new_variant_decoder(&self) -> VariantDecoder {
+        match *self {
+            VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
+            VariantEncoding::Utf8 => Utf8Decoder::new(),
+            VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
+            VariantEncoding::Big5 => Big5Decoder::new(),
+            VariantEncoding::EucJp => EucJpDecoder::new(),
+            VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
+            VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
+            VariantEncoding::EucKr => EucKrDecoder::new(),
+            VariantEncoding::Replacement => ReplacementDecoder::new(),
+            VariantEncoding::UserDefined => UserDefinedDecoder::new(),
+            VariantEncoding::Utf16Be => Utf16Decoder::new(true),
+            VariantEncoding::Utf16Le => Utf16Decoder::new(false),
+        }
+    }
+
+    pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
+        match *self {
+            VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length),
+            VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
+            VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
+            VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
+            VariantEncoding::Big5 => Big5Encoder::new(encoding),
+            VariantEncoding::EucJp => EucJpEncoder::new(encoding),
+            VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
+            VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
+            VariantEncoding::EucKr => EucKrEncoder::new(encoding),
+            VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
+            VariantEncoding::Utf16Be | VariantEncoding::Replacement |
+            VariantEncoding::Utf16Le => unreachable!(),
+        }
+    }
+
+    pub fn is_single_byte(&self) -> bool {
+        match *self {
+            VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
+            _ => false,
+        }
+    }
+}
+''')
+
+variant_file.close()
+
+(ffi_rs_begin, ffi_rs_end) = read_non_generated("../encoding_c/src/lib.rs")
+
+ffi_file = open("../encoding_c/src/lib.rs", "w")
+
+ffi_file.write(ffi_rs_begin)
+ffi_file.write("""
+// Instead, please regenerate using generate-encoding-data.py
+
+/// The minimum length of buffers that may be passed to `encoding_name()`.
+pub const ENCODING_NAME_MAX_LENGTH: usize = %d; // %s
+
+""" % (longest_name_length, longest_name))
+
+for name in preferred:
+  ffi_file.write('''/// The %s encoding.
+#[no_mangle]
+pub static %s_ENCODING: ConstEncoding = ConstEncoding(&%s_INIT);
+
+''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name)))
+
+ffi_file.write(ffi_rs_end)
+ffi_file.close()
+
+(single_byte_rs_begin, single_byte_rs_end) = read_non_generated("src/single_byte.rs")
+
+single_byte_file = open("src/single_byte.rs", "w")
+
+single_byte_file.write(single_byte_rs_begin)
+single_byte_file.write("""
+// Instead, please regenerate using generate-encoding-data.py
+
+    #[test]
+    fn test_single_byte_decode() {""")
+
+idx = 0 # for Miri, return after 2nd test
+for name in preferred:
+  if name == u"ISO-8859-8-I":
+    continue;
+  if is_single_byte(name):
+    single_byte_file.write("""
+        decode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
+    idx += 1
+    if idx == 2:
+      single_byte_file.write("""
+        if cfg!(miri) {
+            // Miri is too slow
+            return;
+        }""")
+
+single_byte_file.write("""
+    }
+
+    #[test]
+    fn test_single_byte_encode() {""")
+
+
+idx = 0 # for Miri, return after 2nd test
+for name in preferred:
+  if name == u"ISO-8859-8-I":
+    continue;
+  if is_single_byte(name):
+    single_byte_file.write("""
+        encode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
+    idx += 1
+    if idx == 2:
+      single_byte_file.write("""
+        if cfg!(miri) {
+            // Miri is too slow
+            return;
+        }""")
+
+
+single_byte_file.write("""
+    }
+""")
+
+single_byte_file.write(single_byte_rs_end)
+single_byte_file.close()
+
+static_file = open("../encoding_c/include/encoding_rs_statics.h", "w")
+
+static_file.write("""// Copyright Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+// Instead, please regenerate using generate-encoding-data.py
+
+// This file is not meant to be included directly. Instead, encoding_rs.h
+// includes this file.
+
+#ifndef encoding_rs_statics_h_
+#define encoding_rs_statics_h_
+
+#ifndef ENCODING_RS_ENCODING
+#define ENCODING_RS_ENCODING Encoding
+#ifndef __cplusplus
+typedef struct Encoding_ Encoding;
+#endif
+#endif
+
+#ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR
+#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING*
+#endif
+
+#ifndef ENCODING_RS_ENCODER
+#define ENCODING_RS_ENCODER Encoder
+#ifndef __cplusplus
+typedef struct Encoder_ Encoder;
+#endif
+#endif
+
+#ifndef ENCODING_RS_DECODER
+#define ENCODING_RS_DECODER Decoder
+#ifndef __cplusplus
+typedef struct Decoder_ Decoder;
+#endif
+#endif
+
+#define INPUT_EMPTY 0
+
+#define OUTPUT_FULL 0xFFFFFFFF
+
+// %s
+#define ENCODING_NAME_MAX_LENGTH %d
+
+""" % (longest_name, longest_name_length))
+
+for name in preferred:
+  static_file.write('''/// The %s encoding.
+extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const %s_ENCODING;
+
+''' % (to_dom_name(name), to_constant_name(name)))
+
+static_file.write("""#endif // encoding_rs_statics_h_
+""")
+static_file.close()
+
+(utf_8_rs_begin, utf_8_rs_end) = read_non_generated("src/utf_8.rs")
+
+utf_8_file = open("src/utf_8.rs", "w")
+
+utf_8_file.write(utf_8_rs_begin)
+utf_8_file.write("""
+// Instead, please regenerate using generate-encoding-data.py
+
+pub static UTF8_DATA: Utf8Data = Utf8Data {
+    table: [
+""")
+
+for i in range(256):
+  combined = (1 << 2) # invalid lead
+  if i < 0x80 or i > 0xBF:
+    combined |= (1 << 3) # normal trail
+  if i < 0xA0 or i > 0xBF:
+    combined |= (1 << 4) # three-byte special lower bound
+  if i < 0x80 or i > 0x9F:
+    combined |= (1 << 5) # three-byte special upper bound
+  if i < 0x90 or i > 0xBF:
+    combined |= (1 << 6) # four-byte special lower bound
+  if i < 0x80 or i > 0x8F:
+    combined |= (1 << 7) # four-byte special upper bound
+  utf_8_file.write("%d," % combined)
+
+for i in range(128, 256):
+  lane = (1 << 2) # invalid lead
+  if i >= 0xC2 and i <= 0xDF:
+    lane = (1 << 3) # normal trail
+  elif i == 0xE0:
+    lane = (1 << 4) # three-byte special lower bound
+  elif i >= 0xE1 and i <= 0xEC:
+    lane = (1 << 3) # normal trail
+  elif i == 0xED:
+    lane = (1 << 5) # three-byte special upper bound
+  elif i >= 0xEE and i <= 0xEF:
+    lane = (1 << 3) # normal trail
+  elif i == 0xF0:
+    lane = (1 << 6) # four-byte special lower bound
+  elif i >= 0xF1 and i <= 0xF3:
+    lane = (1 << 3) # normal trail
+  elif i == 0xF4:
+    lane = (1 << 7) # four-byte special upper bound
+  utf_8_file.write("%d," % lane)
+
+utf_8_file.write("""
+    ],
+};
+
+""")
+
+utf_8_file.write(utf_8_rs_end)
+utf_8_file.close()
+
+# Unit tests: TEST_HEADER is written at the top of every generated src/test_data/*.txt file below.
+
+TEST_HEADER = '''Any copyright to the test code below this comment is dedicated to the
+Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
+
+This is a generated file. Please do not edit.
+Instead, please regenerate using generate-encoding-data.py
+'''
+
+index = indexes["jis0208"]
+
+jis0208_in_file = open("src/test_data/jis0208_in.txt", "w")
+jis0208_in_file.write(TEST_HEADER)
+for pointer in range(0, 94 * 94):
+  (lead, trail) = divmod(pointer, 94)
+  lead += 0xA1
+  trail += 0xA1
+  jis0208_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
+jis0208_in_file.close()
+
+jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w")
+jis0208_in_ref_file.write(TEST_HEADER)
+for pointer in range(0, 94 * 94):
+  code_point = index[pointer]
+  if code_point:
+    jis0208_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+  else:
+    jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
+jis0208_in_ref_file.close()
+
+jis0208_out_file = open("src/test_data/jis0208_out.txt", "w")
+jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w")
+jis0208_out_file.write(TEST_HEADER)
+jis0208_out_ref_file.write(TEST_HEADER)
+for pointer in range(0, 94 * 94):
+  code_point = index[pointer]
+  if code_point:
+    revised_pointer = pointer
+    if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
+      revised_pointer = index.index(code_point)
+    (lead, trail) = divmod(revised_pointer, 94)
+    lead += 0xA1
+    trail += 0xA1
+    jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
+    jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+jis0208_out_file.close()
+jis0208_out_ref_file.close()
+
+shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w")
+shift_jis_in_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  (lead, trail) = divmod(pointer, 188)
+  lead += 0x81 if lead < 0x1F else 0xC1
+  trail += 0x40 if trail < 0x3F else 0x41
+  shift_jis_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
+shift_jis_in_file.close()
+
+shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w")
+shift_jis_in_ref_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer]
+  if code_point:
+    shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+  else:
+    trail = pointer % 188
+    trail += 0x40 if trail < 0x3F else 0x41
+    if trail < 0x80:
+      shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
+    else:
+      shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
+shift_jis_in_ref_file.close()
+
+shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w")
+shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w")
+shift_jis_out_file.write(TEST_HEADER)
+shift_jis_out_ref_file.write(TEST_HEADER)
+for pointer in range(0, 8272):
+  code_point = index[pointer]
+  if code_point:
+    revised_pointer = pointer
+    if revised_pointer >= 1207 and revised_pointer < 1220:
+      revised_pointer = index.index(code_point)
+    (lead, trail) = divmod(revised_pointer, 188)
+    lead += 0x81 if lead < 0x1F else 0xC1
+    trail += 0x40 if trail < 0x3F else 0x41
+    shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
+    shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+for pointer in range(8836, len(index)):
+  code_point = index[pointer]
+  if code_point:
+    revised_pointer = index.index(code_point)
+    if revised_pointer >= 8272 and revised_pointer < 8836:
+      revised_pointer = pointer
+    (lead, trail) = divmod(revised_pointer, 188)
+    lead += 0x81 if lead < 0x1F else 0xC1
+    trail += 0x40 if trail < 0x3F else 0x41
+    shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
+    shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+shift_jis_out_file.close()
+shift_jis_out_ref_file.close()
+
+iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w")
+iso_2022_jp_in_file.write(TEST_HEADER)
+for pointer in range(0, 94 * 94):
+  (lead, trail) = divmod(pointer, 94)
+  lead += 0x21
+  trail += 0x21
+  iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
+iso_2022_jp_in_file.close()
+
+iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w")
+iso_2022_jp_in_ref_file.write(TEST_HEADER)
+for pointer in range(0, 94 * 94):
+  code_point = index[pointer]
+  if code_point:
+    iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+  else:
+    iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
+iso_2022_jp_in_ref_file.close()
+
+iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w")
+iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w")
+iso_2022_jp_out_file.write(TEST_HEADER)
+iso_2022_jp_out_ref_file.write(TEST_HEADER)
+for pointer in range(0, 94 * 94):
+  code_point = index[pointer]
+  if code_point:
+    revised_pointer = pointer
+    if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
+      revised_pointer = index.index(code_point)
+    (lead, trail) = divmod(revised_pointer, 94)
+    lead += 0x21
+    trail += 0x21
+    iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
+    iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+for i in xrange(len(half_width_index)):
+  code_point = i + 0xFF61
+  normalized_code_point = half_width_index[i]
+  pointer = index.index(normalized_code_point)
+  (lead, trail) = divmod(pointer, 94)
+  lead += 0x21
+  trail += 0x21
+  iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
+  iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+iso_2022_jp_out_file.close()
+iso_2022_jp_out_ref_file.close()
+
+index = indexes["euc-kr"]
+
+euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w")
+euc_kr_in_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  (lead, trail) = divmod(pointer, 190)
+  lead += 0x81
+  trail += 0x41
+  euc_kr_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
+euc_kr_in_file.close()
+
+euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w")
+euc_kr_in_ref_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  code_point = index[pointer]
+  if code_point:
+    euc_kr_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+  else:
+    trail = pointer % 190
+    trail += 0x41
+    if trail < 0x80:
+      euc_kr_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
+    else:
+      euc_kr_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
+euc_kr_in_ref_file.close()
+
+euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w")
+euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w")
+euc_kr_out_file.write(TEST_HEADER)
+euc_kr_out_ref_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  code_point = index[pointer]
+  if code_point:
+    (lead, trail) = divmod(pointer, 190)
+    lead += 0x81
+    trail += 0x41
+    euc_kr_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
+    euc_kr_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+euc_kr_out_file.close()
+euc_kr_out_ref_file.close()
+
+index = indexes["gb18030"]
+
+gb18030_in_file = open("src/test_data/gb18030_in.txt", "w")
+gb18030_in_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  (lead, trail) = divmod(pointer, 190)
+  lead += 0x81
+  trail += 0x40 if trail < 0x3F else 0x41
+  gb18030_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
+gb18030_in_file.close()
+
+gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w")
+gb18030_in_ref_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  code_point = index[pointer]
+  if code_point:
+    gb18030_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+  else:
+    trail = pointer % 190
+    trail += 0x40 if trail < 0x3F else 0x41
+    if trail < 0x80:
+      gb18030_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
+    else:
+      gb18030_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
+gb18030_in_ref_file.close()
+
+gb18030_out_file = open("src/test_data/gb18030_out.txt", "w")
+gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w")
+gb18030_out_file.write(TEST_HEADER)
+gb18030_out_ref_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  if pointer == 6555:
+    continue
+  code_point = index[pointer]
+  if code_point:
+    (lead, trail) = divmod(pointer, 190)
+    lead += 0x81
+    trail += 0x40 if trail < 0x3F else 0x41
+    gb18030_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
+    gb18030_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+gb18030_out_file.close()
+gb18030_out_ref_file.close()
+
+index = indexes["big5"]
+
+big5_in_file = open("src/test_data/big5_in.txt", "w")
+big5_in_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  (lead, trail) = divmod(pointer, 157)
+  lead += 0x81
+  trail += 0x40 if trail < 0x3F else 0x62
+  big5_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
+big5_in_file.close()
+
+big5_two_characters = {
+  1133: u"\u00CA\u0304",
+  1135: u"\u00CA\u030C",
+  1164: u"\u00EA\u0304",
+  1166: u"\u00EA\u030C",
+}
+
+big5_in_ref_file = open("src/test_data/big5_in_ref.txt", "w")
+big5_in_ref_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  if pointer in big5_two_characters.keys():
+    big5_in_ref_file.write((u"%s\n" % big5_two_characters[pointer]).encode("utf-8"))
+    continue
+  code_point = index[pointer]
+  if code_point:
+    big5_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+  else:
+    trail = pointer % 157
+    trail += 0x40 if trail < 0x3F else 0x62
+    if trail < 0x80:
+      big5_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
+    else:
+      big5_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
+big5_in_ref_file.close()
+
+prefer_last = [
+  0x2550,
+  0x255E,
+  0x2561,
+  0x256A,
+  0x5341,
+  0x5345,
+]
+
+pointer_for_prefer_last = []
+
+for code_point in prefer_last:
+  # Python lists don't have .rindex() :-(
+  for i in xrange(len(index) - 1, -1, -1):
+    candidate = index[i]
+    if candidate == code_point:
+       pointer_for_prefer_last.append(i)
+       break
+
+big5_out_file = open("src/test_data/big5_out.txt", "w")
+big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w")
+big5_out_file.write(TEST_HEADER)
+big5_out_ref_file.write(TEST_HEADER)
+for pointer in range(((0xA1 - 0x81) * 157), len(index)):
+  code_point = index[pointer]
+  if code_point:
+    if code_point in prefer_last:
+      if pointer != pointer_for_prefer_last[prefer_last.index(code_point)]:
+        continue
+    else:
+      if pointer != index.index(code_point):
+        continue
+    (lead, trail) = divmod(pointer, 157)
+    lead += 0x81
+    trail += 0x40 if trail < 0x3F else 0x62
+    big5_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
+    big5_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+big5_out_file.close()
+big5_out_ref_file.close()
+
+index = indexes["jis0212"]
+
+jis0212_in_file = open("src/test_data/jis0212_in.txt", "w")
+jis0212_in_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  (lead, trail) = divmod(pointer, 94)
+  lead += 0xA1
+  trail += 0xA1
+  jis0212_in_file.write("\x8F%s%s\n" % (chr(lead), chr(trail)))
+jis0212_in_file.close()
+
+jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w")
+jis0212_in_ref_file.write(TEST_HEADER)
+for pointer in range(0, len(index)):
+  code_point = index[pointer]
+  if code_point:
+    jis0212_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
+  else:
+    jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
+jis0212_in_ref_file.close()
+
+(codepage_begin, codepage_end) = read_non_generated("../codepage/src/lib.rs")
+
+codepage_file = open("../codepage/src/lib.rs", "w")
+
+codepage_file.write(codepage_begin)
+codepage_file.write("""
+// Instead, please regenerate using generate-encoding-data.py
+
+/// Supported code page numbers in estimated order of usage frequency
+static CODE_PAGES: [u16; %d] = [
+""" % len(code_pages))
+
+for code_page in code_pages:
+  codepage_file.write("    %d,\n" % code_page)
+
+codepage_file.write("""];
+
+/// Encodings corresponding to the code page numbers in the same order
+static ENCODINGS: [&'static Encoding; %d] = [
+""" % len(code_pages))
+
+for code_page in code_pages:
+  name = encodings_by_code_page[code_page]
+  codepage_file.write("    &%s_INIT,\n" % to_constant_name(name))
+
+codepage_file.write("""];
+
+""")
+
+codepage_file.write(codepage_end)
+codepage_file.close()
+
+(codepage_test_begin, codepage_test_end) = read_non_generated("../codepage/src/tests.rs")
+
+codepage_test_file = open("../codepage/src/tests.rs", "w")
+
+codepage_test_file.write(codepage_test_begin)
+codepage_test_file.write("""
+// Instead, please regenerate using generate-encoding-data.py
+
+#[test]
+fn test_to_encoding() {
+    assert_eq!(to_encoding(0), None);
+
+""")
+
+for code_page in code_pages:
+  codepage_test_file.write("    assert_eq!(to_encoding(%d), Some(%s));\n" % (code_page, to_constant_name(encodings_by_code_page[code_page])))  
+
+codepage_test_file.write("""}
+
+#[test]
+fn test_from_encoding() {
+""")
+
+for name in preferred:
+  if code_pages_by_encoding.has_key(name):
+    codepage_test_file.write("    assert_eq!(from_encoding(%s), Some(%d));\n" % (to_constant_name(name), code_pages_by_encoding[name]))
+  else:
+    codepage_test_file.write("    assert_eq!(from_encoding(%s), None);\n" % to_constant_name(name))
+
+codepage_test_file.write("""}
+""")
+
+codepage_test_file.write(codepage_test_end)
+codepage_test_file.close()
+
+subprocess.call(["cargo", "fmt"])  # Runs `cargo fmt` in the current working directory; its exit status is not checked.