1 files changed, 192 insertions, 0 deletions
diff --git a/misc/xml-tokens/gen-odf-tokens.py b/misc/xml-tokens/gen-odf-tokens.py
new file mode 100755
index 0000000..2838604
--- /dev/null
+++ b/misc/xml-tokens/gen-odf-tokens.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+import xml.parsers.expat
+import sys
+import argparse
+from pathlib import Path
+
+import token_util
+
+
+NS_RNG = "http://relaxng.org/ns/structure/1.0"  # RELAX NG structure namespace URI
+
+
+class TokenParser:
+    """Collect name parts and value texts from RELAX NG schema XML."""
+
+    def __init__(self, strm):
+        self.__strm = strm  # schema content handed to expat in parse()
+        self.__elem = None  # name of the most recently opened element
+        self.tokens = set()  # replaced by a sorted list in parse()
+
+    def start_element(self, name, attrs):
+        """Record both halves of a "prefix:local" element/attribute name."""
+        self.__elem = name
+        if name in {f"{NS_RNG}:element", f"{NS_RNG}:attribute"} and "name" in attrs:
+            parts = attrs["name"].split(":")
+            if len(parts) != 2:
+                # Anything but exactly one colon ends the process with status 1.
+                sys.stderr.write("unrecognized token type: " + attrs["name"] + "\n")
+                sys.exit(1)
+            self.tokens.update(parts)
+
+    def character(self, data):
+        """Record the stripped text seen under a value element, if non-empty."""
+        text = data.strip()
+        if self.__elem == f"{NS_RNG}:value" and text:
+            self.tokens.add(text)
+
+    def parse(self):
+        """Parse the schema, then replace ``tokens`` with a sorted list."""
+        p = xml.parsers.expat.ParserCreate(encoding="utf-8", namespace_separator=":")
+        p.buffer_text = True  # merge text pieces expat may otherwise report separately
+        p.StartElementHandler = self.start_element
+        p.CharacterDataHandler = self.character
+        p.Parse(self.__strm, True)
+
+        self.tokens = sorted(self.tokens)
+
+
+class NSParser:
+    """Gather the ``xmlns:*`` prefix declarations found on grammar elements."""
+
+    def __init__(self, strm):
+        self.__strm = strm
+        self.__elem = None
+        self.ns_values = {}  # prefix -> namespace value; a sorted list after parse()
+
+    def start_element(self, name, attrs):
+        """Store every ``xmlns:<prefix>`` attribute of a grammar element."""
+        self.__elem = name
+        if not name.endswith("grammar"):
+            return
+
+        for attr_name, attr_value in attrs.items():
+            pieces = attr_name.split(':')
+            if len(pieces) >= 2 and pieces[0] == "xmlns":
+                self.ns_values[pieces[1]] = attr_value
+
+    def parse(self):
+        """Parse the input, then turn ``ns_values`` into a prefix-sorted list."""
+        parser = xml.parsers.expat.ParserCreate(encoding="utf-8")
+        parser.StartElementHandler = self.start_element
+        parser.Parse(self.__strm, 1)
+
+        # The relaxNG namespace is dropped since only the schema document uses it.
+        kept = [
+            (prefix, uri)
+            for prefix, uri in self.ns_values.items()
+            if uri != "http://relaxng.org/ns/structure/1.0"
+        ]
+        self.ns_values = sorted(kept, key=lambda pair: pair[0])
+
+
+def gen_namespace_tokens(filepath, ns_values):
+    """Write the C++ namespace constant files for the ODF namespaces.
+
+    Two files are created: ``<filepath>_hpp.inl`` with the extern
+    declarations and ``<filepath>_cpp.inl`` with the definitions plus the
+    nullptr-terminated ``NS_odf_all`` array.
+
+    Args:
+        filepath: path prefix (str or pathlib.Path) of the two output files.
+        ns_values: iterable of (prefix, namespace value) pairs; it is
+            traversed more than once.
+    """
+
+    ns_values = list(ns_values)  # allow one-shot iterables
+
+    # header (.hpp)
+    # ``with`` closes the file even when a write raises.
+    with open(f"{filepath}_hpp.inl", "w") as outfile:
+        outfile.write("namespace orcus {\n\n")
+        for key, _ in ns_values:
+            outfile.write(f"extern const xmlns_id_t NS_odf_{key};\n")
+        outfile.write("\nextern const xmlns_id_t* NS_odf_all;\n")
+        outfile.write("\n}\n\n")
+
+    # source (.cpp)
+    with open(f"{filepath}_cpp.inl", "w") as outfile:
+        outfile.write("namespace orcus {\n\n")
+        for key, value in ns_values:
+            outfile.write(f"const xmlns_id_t NS_odf_{key} = \"{value}\";\n")
+
+        outfile.write("\nnamespace {\n\n")
+        outfile.write("const xmlns_id_t odf_ns[] = {\n")
+        for key, _ in ns_values:
+            outfile.write(f"    NS_odf_{key},\n")
+        outfile.write("    nullptr\n")
+        outfile.write("};\n\n")
+        outfile.write("} // anonymous\n\n")
+
+        outfile.write("const xmlns_id_t* NS_odf_all = odf_ns;\n\n")
+
+        outfile.write("}\n\n")
+
+
+def main():
+    """Parse an ODF RNG schema and write the requested output files.
+
+    Every output is optional and produced only when its option is given.
+    Exits with status 1 when the schema path is not an existing regular file.
+    """
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument(
+        "--ns-file-prefix", type=str,
+        help="file name prefix for optional namespace constant files")
+    arg_parser.add_argument(
+        "--summary-output", type=Path,
+        help="optional output file to write collected token data summary")
+    arg_parser.add_argument(
+        "--token-constants", type=Path,
+        help="path to C++ output file where token constants are to be written to")
+    arg_parser.add_argument(
+        "--token-names", type=Path,
+        help="path to C++ output file where token names are to be written to")
+    arg_parser.add_argument(
+        "odf_schema", metavar="ODF-SCHEMA", type=Path, help="path to RNG ODF schema file")
+    args = arg_parser.parse_args()
+
+    if not args.odf_schema.is_file():
+        print(f"{args.odf_schema} is not a valid file.", file=sys.stderr)
+        sys.exit(1)
+
+    # Both parsers are created with encoding="utf-8"; decode the file to match.
+    schema_content = args.odf_schema.read_text(encoding="utf-8")
+    token_parser = TokenParser(schema_content)
+    token_parser.parse()
+    tokens = token_parser.tokens
+
+    ns_parser = NSParser(schema_content)
+    ns_parser.parse()
+    ns_values = ns_parser.ns_values
+
+    if args.summary_output:
+        summary_lines = ["list of tokens:"]
+        summary_lines.extend(f"- \"{token}\"" for token in tokens)
+        summary_lines.append("list of namespaces:")
+        summary_lines.extend(f"- {ns}: \"{value}\"" for ns, value in ns_values)
+        # Tokens come from the UTF-8 schema, so write the summary as UTF-8 too.
+        args.summary_output.write_text("\n".join(summary_lines), encoding="utf-8")
+
+    if args.token_constants:
+        with open(args.token_constants, "w") as f:
+            token_util.gen_token_constants(f, tokens)
+
+    if args.token_names:
+        with open(args.token_names, "w") as f:
+            token_util.gen_token_names(f, tokens)
+
+    if args.ns_file_prefix is not None:
+        gen_namespace_tokens(args.ns_file_prefix, ns_values)
+
+
+if __name__ == '__main__':  # run main() only when executed as a script
+    main()