summaryrefslogtreecommitdiffstats
path: root/netwerk/dns/prepare_tlds.py
blob: adebcec4870539206b469db0c266fea0ccd1631b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import codecs
import encodings.idna
import re
import sys

from make_dafsa import words_to_bin, words_to_cxx

"""
Processes a file containing effective TLD data.  See the following URL for a
description of effective TLDs and of the file format that this script
processes (although for the latter you're better off just reading this file's
short source code).

http://wiki.mozilla.org/Gecko:Effective_TLD_Service
"""


def getEffectiveTLDs(path):
    file = codecs.open(path, "r", "UTF-8")
    domains = set()
    for line in file:
        # line always contains a line terminator unless the file is empty
        if len(line) == 0:
            raise StopIteration
        line = line.rstrip()
        # comment, empty, or superfluous line for explicitness purposes
        if line.startswith("//") or not line.strip():
            continue
        line = re.split(r"[ \t\n]", line, 1)[0]
        entry = EffectiveTLDEntry(line)
        domain = entry.domain()
        assert domain not in domains, "repeating domain %s makes no sense" % domain
        domains.add(domain)
        yield entry


def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component.  ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.
    """

    def convertLabel(label):
        if _isASCII(label):
            return label.lower()
        return encodings.idna.ToASCII(label).decode("utf-8")

    return ".".join(map(convertLabel, domain.split(".")))


def _isASCII(s):
    "True if s consists entirely of ASCII characters, false otherwise."
    for c in s:
        if ord(c) > 127:
            return False
    return True


class EffectiveTLDEntry:
    """
    Stores an entry in an effective-TLD name file.
    """

    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Creates a TLD entry from a line of data, which must have been stripped of
        the line ending.
        """
        if line.startswith("!"):
            self._exception = True
            domain = line[1:]
        elif line.startswith("*."):
            self._wild = True
            domain = line[2:]
        else:
            domain = line
        self._domain = _normalizeHostname(domain)

    def domain(self):
        "The domain this represents."
        return self._domain

    def exception(self):
        "True if this entry's domain denotes does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry represents a class of effective TLDs."
        return self._wild


#################
# DO EVERYTHING #
#################


def main(output, effective_tld_filename, output_format="cxx"):
    """
    effective_tld_filename is the effective TLD file to parse.
    based on the output format, either a C++ array of a binary representation
    of a DAFSA representing the eTLD file is then printed to standard output
    or a binary file is written to disk.
    """

    def typeEnum(etld):
        """
        Maps the flags to the DAFSA's enum types.
        """
        if etld.exception():
            return 1
        elif etld.wild():
            return 2
        else:
            return 0

    def dafsa_words():
        """
        make_dafsa expects lines of the form "<domain_name><enum_value>"
        """
        for etld in getEffectiveTLDs(effective_tld_filename):
            yield "%s%d" % (etld.domain(), typeEnum(etld))

    """ words_to_bin() returns a bytes while words_to_cxx() returns string """
    if output_format == "bin":
        output.write(words_to_bin(dafsa_words()))
    else:
        output.write(words_to_cxx(dafsa_words()))


if __name__ == "__main__":
    """
    This program can output the DAFSA in two formats:
    as C++ code that will be included and compiled at build time
    or as a binary file that will be published in Remote Settings.

    Flags for format options:
    "cxx" -> C++ array [default]
    "bin" -> Binary file
    """

    output_format = "bin" if "--bin" in sys.argv else "cxx"
    main(sys.stdout, sys.argv[1], output_format=output_format)