summaryrefslogtreecommitdiffstats
path: root/netwerk/dns/prepare_tlds.py
diff options
context:
space:
mode:
Diffstat (limited to 'netwerk/dns/prepare_tlds.py')
-rw-r--r--netwerk/dns/prepare_tlds.py150
1 files changed, 150 insertions, 0 deletions
diff --git a/netwerk/dns/prepare_tlds.py b/netwerk/dns/prepare_tlds.py
new file mode 100644
index 0000000000..adebcec487
--- /dev/null
+++ b/netwerk/dns/prepare_tlds.py
@@ -0,0 +1,150 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import codecs
+import encodings.idna
+import re
+import sys
+
+from make_dafsa import words_to_bin, words_to_cxx
+
+"""
+Processes a file containing effective TLD data. See the following URL for a
+description of effective TLDs and of the file format that this script
+processes (although for the latter you're better off just reading this file's
+short source code).
+
+http://wiki.mozilla.org/Gecko:Effective_TLD_Service
+"""
+
+
+def getEffectiveTLDs(path):
+ file = codecs.open(path, "r", "UTF-8")
+ domains = set()
+ for line in file:
+ # line always contains a line terminator unless the file is empty
+ if len(line) == 0:
+ raise StopIteration
+ line = line.rstrip()
+ # comment, empty, or superfluous line for explicitness purposes
+ if line.startswith("//") or not line.strip():
+ continue
+ line = re.split(r"[ \t\n]", line, 1)[0]
+ entry = EffectiveTLDEntry(line)
+ domain = entry.domain()
+ assert domain not in domains, "repeating domain %s makes no sense" % domain
+ domains.add(domain)
+ yield entry
+
+
+def _normalizeHostname(domain):
+ """
+ Normalizes the given domain, component by component. ASCII components are
+ lowercased, while non-ASCII components are processed using the ToASCII
+ algorithm.
+ """
+
+ def convertLabel(label):
+ if _isASCII(label):
+ return label.lower()
+ return encodings.idna.ToASCII(label).decode("utf-8")
+
+ return ".".join(map(convertLabel, domain.split(".")))
+
+
+def _isASCII(s):
+ "True if s consists entirely of ASCII characters, false otherwise."
+ for c in s:
+ if ord(c) > 127:
+ return False
+ return True
+
+
+class EffectiveTLDEntry:
+ """
+ Stores an entry in an effective-TLD name file.
+ """
+
+ _exception = False
+ _wild = False
+
+ def __init__(self, line):
+ """
+ Creates a TLD entry from a line of data, which must have been stripped of
+ the line ending.
+ """
+ if line.startswith("!"):
+ self._exception = True
+ domain = line[1:]
+ elif line.startswith("*."):
+ self._wild = True
+ domain = line[2:]
+ else:
+ domain = line
+ self._domain = _normalizeHostname(domain)
+
+ def domain(self):
+ "The domain this represents."
+ return self._domain
+
+ def exception(self):
+ "True if this entry's domain denotes does not denote an effective TLD."
+ return self._exception
+
+ def wild(self):
+ "True if this entry represents a class of effective TLDs."
+ return self._wild
+
+
+#################
+# DO EVERYTHING #
+#################
+
+
+def main(output, effective_tld_filename, output_format="cxx"):
+ """
+ effective_tld_filename is the effective TLD file to parse.
+ based on the output format, either a C++ array of a binary representation
+ of a DAFSA representing the eTLD file is then printed to standard output
+ or a binary file is written to disk.
+ """
+
+ def typeEnum(etld):
+ """
+ Maps the flags to the DAFSA's enum types.
+ """
+ if etld.exception():
+ return 1
+ elif etld.wild():
+ return 2
+ else:
+ return 0
+
+ def dafsa_words():
+ """
+ make_dafsa expects lines of the form "<domain_name><enum_value>"
+ """
+ for etld in getEffectiveTLDs(effective_tld_filename):
+ yield "%s%d" % (etld.domain(), typeEnum(etld))
+
+ """ words_to_bin() returns a bytes while words_to_cxx() returns string """
+ if output_format == "bin":
+ output.write(words_to_bin(dafsa_words()))
+ else:
+ output.write(words_to_cxx(dafsa_words()))
+
+
+if __name__ == "__main__":
+ """
+ This program can output the DAFSA in two formats:
+ as C++ code that will be included and compiled at build time
+ or as a binary file that will be published in Remote Settings.
+
+ Flags for format options:
+ "cxx" -> C++ array [default]
+ "bin" -> Binary file
+ """
+
+ output_format = "bin" if "--bin" in sys.argv else "cxx"
+ main(sys.stdout, sys.argv[1], output_format=output_format)