10 files changed, 694 insertions, 0 deletions
diff --git a/src/python/orcus/__init__.py b/src/python/orcus/__init__.py
new file mode 100644
index 0000000..4914d95
--- /dev/null
+++ b/src/python/orcus/__init__.py
@@ -0,0 +1,111 @@
+#######################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+try:
+    from _orcus import *
+    from _orcus import __version__
+except ModuleNotFoundError:
+    # We do this to enable sphinx to generate documentation without having to
+    # build the C++ part.
+    pass
+
+from enum import Enum
+
+
+class FormatType(Enum):
+    """Collection of file format types currently used in orcus."""
+    # Only ODS, XLSX, XLS_XML, GNUMERIC and CSV are mapped to a loader by get_document_loader_module().
+    UNKNOWN    = 0
+    ODS        = 1
+    XLSX       = 2
+    GNUMERIC   = 3
+    XLS_XML    = 4
+    CSV        = 5
+    YAML       = 6
+    JSON       = 7
+    XML        = 8
+    PARQUET    = 9
+
+
+class CellType(Enum):
+    """Collection of cell types stored in spreadsheet."""
+    # Integer values are assigned explicitly, numbered consecutively from 0.
+    UNKNOWN = 0
+    EMPTY   = 1
+    BOOLEAN = 2
+    NUMERIC = 3
+    STRING  = 4
+    STRING_WITH_ERROR = 5
+    FORMULA = 6
+    FORMULA_WITH_ERROR = 7
+
+
+class FormulaTokenType(Enum):
+    """Collection of formula token types."""
+    # Integer values are assigned explicitly, numbered consecutively from 0.
+    UNKNOWN   = 0
+    REFERENCE = 1
+    VALUE     = 2
+    NAME      = 3
+    FUNCTION  = 4
+    OPERATOR  = 5
+    ERROR     = 6
+
+
+class FormulaTokenOp(Enum):
+    """Collection of formula token opcodes."""
+    # Integer values are assigned explicitly, numbered consecutively from 0.
+    UNKNOWN          = 0
+    SINGLE_REF       = 1
+    RANGE_REF        = 2
+    TABLE_REF        = 3
+    NAMED_EXPRESSION = 4
+    STRING           = 5
+    VALUE            = 6
+    FUNCTION         = 7
+    PLUS             = 8
+    MINUS            = 9
+    DIVIDE           = 10
+    MULTIPLY         = 11
+    EXPONENT         = 12
+    CONCAT           = 13
+    EQUAL            = 14
+    NOT_EQUAL        = 15
+    LESS             = 16
+    GREATER          = 17
+    LESS_EQUAL       = 18
+    GREATER_EQUAL    = 19
+    OPEN             = 20
+    CLOSE            = 21
+    SEP              = 22
+    ERROR            = 23
+
+
+def get_document_loader_module(format_type):
+    """Return the loader module that handles the given format type.
+
+    Args:
+        format_type (orcus.FormatType):
+            Format type whose document loader module is requested.
+
+    Returns:
+        The loader submodule, or None when the format type has no loader.
+    """
+    if format_type == FormatType.ODS:
+        from . import ods as loader
+    elif format_type == FormatType.XLSX:
+        from . import xlsx as loader
+    elif format_type == FormatType.XLS_XML:
+        from . import xls_xml as loader
+    elif format_type == FormatType.GNUMERIC:
+        from . import gnumeric as loader
+    elif format_type == FormatType.CSV:
+        from . import csv as loader
+    else:
+        loader = None  # no loader module exists for this format type
+    return loader
diff --git a/src/python/orcus/csv.py b/src/python/orcus/csv.py
new file mode 100644
index 0000000..bad095f
--- /dev/null
+++ b/src/python/orcus/csv.py
@@ -0,0 +1,10 @@
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+from _orcus import _csv_read as read
+
diff --git a/src/python/orcus/gnumeric.py b/src/python/orcus/gnumeric.py
new file mode 100644
index 0000000..d7f0f23
--- /dev/null
+++ b/src/python/orcus/gnumeric.py
@@ -0,0 +1,10 @@
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+from _orcus import _gnumeric_read as read
+
diff --git a/src/python/orcus/json.py b/src/python/orcus/json.py
new file mode 100644
index 0000000..d351766
--- /dev/null
+++ b/src/python/orcus/json.py
@@ -0,0 +1,10 @@
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+from _orcus_json import *
+
diff --git a/src/python/orcus/ods.py b/src/python/orcus/ods.py
new file mode 100644
index 0000000..8bcf2b9
--- /dev/null
+++ b/src/python/orcus/ods.py
@@ -0,0 +1,10 @@
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+from _orcus import _ods_read as read
+
diff --git a/src/python/orcus/tools/__init__.py b/src/python/orcus/tools/__init__.py
new file mode 100644
index 0000000..413bd0a
--- /dev/null
+++ b/src/python/orcus/tools/__init__.py
@@ -0,0 +1,9 @@
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+
diff --git a/src/python/orcus/tools/bugzilla.py b/src/python/orcus/tools/bugzilla.py
new file mode 100644
index 0000000..f23b4d5
--- /dev/null
+++ b/src/python/orcus/tools/bugzilla.py
@@ -0,0 +1,219 @@
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+import argparse
+import requests
+import json
+import os
+import base64
+import traceback
+import concurrent.futures as cf
+from pathlib import Path
+from urllib.parse import urlparse
+
+
+class BugzillaAccess:
+    """Encapsulates access to a bugzilla server by using its REST API.
+
+    Args:
+        bzurl (str): URL to the bugzilla server.
+        cache_dir (:obj:`pathlib.Path`): path to the cache directory.
+    """
+
+    def __init__(self, bzurl, cache_dir):
+        self._bzurl = bzurl
+        self._cache_dir = cache_dir
+        os.makedirs(self._cache_dir, exist_ok=True)
+
+    def _get_cache_content(self, cache_file, func_fetch):
+        """Return the text of cache_file, calling func_fetch to create it when absent."""
+        if os.path.isfile(cache_file):
+            with open(cache_file, 'r', encoding="utf-8") as f:
+                return f.read()
+        s = func_fetch()
+        # Server text may contain any Unicode; do not depend on the locale encoding.
+        with open(cache_file, 'w', encoding="utf-8") as f:
+            f.write(s)
+        return s
+
+    def get_bug_ids(self, bz_params):
+        """Get all bug ID's for specified bugzilla query parameters.
+
+        Args:
+            bz_params (dict):
+                dictionary containing all search parameters. Each search term
+                must form a single key-value pair.
+
+        Returns (:obj:`list`):
+            list of the bug IDs found in the (possibly cached) query result.
+        """
+
+        def _fetch():
+            r = requests.get(
+                f"{self._bzurl}/rest/bug",
+                params=bz_params
+            )
+
+            if r.status_code != 200:
+                raise RuntimeError(f"failed to query bug ids from the TDF bugzilla! (status:{r.status_code})")
+            return r.text
+
+        # The cache file name spells out the query as key-value-key-value...,
+        # with spaces and slashes in the values replaced by '-'.
+        escape_chars = " /"
+        buf = []
+        for key, value in bz_params.items():
+            v = str(value)
+            for c in escape_chars:
+                v = v.replace(c, '-')
+            buf.extend((key, v))
+
+        cache_name = '-'.join(buf) + ".json"
+        cache_file = self._cache_dir / cache_name
+        s = self._get_cache_content(cache_file, _fetch)
+
+        content = json.loads(s)
+        bugs = content.get("bugs")
+        if not bugs:
+            return []
+
+        # Bugs without an "id" key, or with a falsy id, are left out.
+        return [bug["id"] for bug in bugs if bug.get("id")]
+
+    def get_attachments(self, bug_id):
+        """Fetch all attachments for specified bug.
+
+        Returns a list of dicts with the keys "content_type", "filename" and
+        "data" (base64-decoded bytes); attachments with empty data are skipped.
+        """
+        def _fetch():
+            r = requests.get(f"{self._bzurl}/rest/bug/{bug_id}/attachment")
+            if r.status_code != 200:
+                raise RuntimeError(
+                    f"failed to fetch the attachments for bug {bug_id}! (status:{r.status_code})")
+            return r.text
+
+        cache_file = self._cache_dir / f"attachments-{bug_id}.json"
+        content = json.loads(self._get_cache_content(cache_file, _fetch))
+        attachments = []
+        for d in content["bugs"][str(bug_id)]:
+            if not d["data"]:
+                continue
+            attachments.append({
+                "content_type": d["content_type"],
+                "filename": d["file_name"],
+                "data": base64.b64decode(d["data"]),
+            })
+        return attachments
+
+
+def parse_query_params(queries):
+    """Parse ``key=value`` terms into a dict, removing one pair of matching quotes around a value."""
+    bz_params = dict()
+    for query in queries:
+        k, v = query.split('=', 1)  # only the first '=' separates key and value
+        quoted = v[:1] in ('"', "'")
+        if quoted and (len(v) < 2 or v[0] != v[-1]):
+            raise argparse.ArgumentError(None, f"mis-matched quotes in {query}")  # (argument, message)
+        bz_params[k] = v[1:-1] if quoted else v
+    return bz_params
+
+
+def _create_argparser():
+    """Build the command-line parser of the bugzilla attachment downloader."""
+    ap = argparse.ArgumentParser(description="""This command allows you to download attachments from a
+bugzilla server that supports REST API.""")
+    ap.add_argument(
+        "--outdir", "-o", required=True,
+        help="""output directory for downloaded files. Downloaded files are
+grouped by their respective bug ID's.""")
+    ap.add_argument(
+        "--limit", default=50, type=int,
+        help="number of bugs to include in a single set of search results.")
+    ap.add_argument(
+        "--offset", default=0, type=int,
+        help="number of bugs to skip in the search results.")
+    ap.add_argument(
+        "--cont", action="store_true",
+        help="""when specified, the search continues after the initial batch
+is returned, by retrieving the next batch of results until the entire search
+results are returned. The number specified by the ``--limit`` option is used
+as the batch size.""")
+    ap.add_argument(
+        "--worker", default=8, type=int,
+        help="number of worker threads to use for parallel downloads of files.")
+    ap.add_argument(
+        "--cache-dir", default=Path(".bugzilla"), type=Path,
+        help="""directory to keep downloaded bugzilla search results. The
+command will not send the query request to the remote server when the results
+are cached. You may want to delete the cache directory after you are finished.""")
+    ap.add_argument(
+        "--url", required=True,
+        help="""base URL for bugzilla service. It must begin with the
+``http(s)://`` prefix.""")
+    ap.add_argument(
+        "query", nargs='*',
+        help="""One or more query term to use to limit your search. Each query
+term must be in the form key=value. You need to quote the value string when the
+value string contains whitespace character i.e. key="value with space".""")
+    return ap
+
+
+def main():
+    """Download the attachments of every bug matched by the command-line query."""
+    args = _create_argparser().parse_args()
+    bz_params = parse_query_params(args.query)
+    for k, v in bz_params.items():
+        print(f"{k}: {v}")
+
+    bz_params["limit"] = args.limit
+    bz_params["offset"] = args.offset
+
+    url = urlparse(args.url)
+    cache_dir = Path(args.cache_dir) / url.netloc
+    bz = BugzillaAccess(args.url, cache_dir)
+
+    def _run(bug_id, index, totals):
+        """Top-level function for each worker thread."""
+        index_s = str(index+1).rjust(len(str(totals)))
+        print(f"({index_s}/{totals}) fetching attachments for bug {bug_id} ...", flush=True)
+
+        try:
+            bug_dir = Path(args.outdir) / url.netloc / str(bug_id)
+            for attachment in bz.get_attachments(bug_id):
+                # The file name is chosen by the server: keep only its last path
+                # component so that it cannot point outside of bug_dir.
+                name = os.path.basename(attachment["filename"].replace("\\", "/"))
+                if name in ("", ".", ".."):
+                    print(f"skipping attachment with unusable file name {attachment['filename']!r}")
+                    continue
+                os.makedirs(bug_dir, exist_ok=True)
+                with open(bug_dir / name, "wb") as f:
+                    f.write(attachment["data"])
+        except Exception as e:
+            traceback.print_exc()
+            print(e)
+
+    iter_count = 0
+    while True:
+        bug_ids = bz.get_bug_ids(bz_params)
+        if not bug_ids:
+            return
+        print(f"-- iteration {iter_count+1}", flush=True)
+        with cf.ThreadPoolExecutor(max_workers=args.worker) as executor:
+            for i, bug_id in enumerate(bug_ids):
+                executor.submit(_run, bug_id, i, len(bug_ids))
+        # Leaving the with-block waits until all submitted downloads are done.
+        if not args.cont:
+            return
+        bz_params["offset"] += bz_params["limit"]
+        iter_count += 1
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/python/orcus/tools/file_processor.py b/src/python/orcus/tools/file_processor.py
new file mode 100644
index 0000000..472fea3
--- /dev/null
+++ b/src/python/orcus/tools/file_processor.py
@@ -0,0 +1,295 @@
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+import argparse
+import os
+import os.path
+import sys
+import string
+import pathlib
+import enum
+import re
+import multiprocessing as mp
+import importlib.util
+
+import orcus
+
+
+class _Config:
+    ext_good = "orcus-pf.good"  # suffix of the marker file touched when a load succeeds
+    ext_bad = "orcus-pf.bad"  # suffix of the marker file touched when a load fails
+    ext_out = "orcus-pf.out"  # suffix of the per-file output text file
+    prefix_skip = ".orcus-pf.skip."  # file names containing this text are treated as special
+
+
+config = _Config()
+
+
+def is_special_file(filename):
+    """Tell whether filename contains the skip prefix or ends with a result-file suffix."""
+    if config.prefix_skip in filename:
+        return True
+    return filename.endswith((config.ext_out, config.ext_good, config.ext_bad))
+
+
+def skips_by_rule(filename, skip_rules):
+    """Return True when any of the skip rules matches somewhere in filename.
+
+    Each rule must provide a ``search(str)`` method, e.g. a compiled regex."""
+    return any(rule.search(filename) for rule in skip_rules)
+
+
+def sanitize_string(s):
+    """Replace non-printable characters with \\x[value].
+
+    Characters found in ``string.printable`` are kept unchanged; every other
+    character becomes ``\\x`` followed by its code point written with at least
+    two uppercase hexadecimal digits.
+    """
+    def _escape(c):
+        return c if c in string.printable else f"\\x{ord(c):02X}"
+
+    return "".join(_escape(c) for c in s)
+
+
+class LoadStatus(enum.Enum):
+    SUCCESS = 0  # load_doc: the loader returned without raising
+    FAILURE = 1  # load_doc: looking up or running the loader raised an exception
+    SKIPPED = 2  # load_doc: format detection raised, or no loader exists for the format
+
+
+def load_doc(bytes):
+    """Detect the format of a raw document and load it with the matching loader.
+
+    Returns a ``(doc, status, messages)`` tuple: the loaded document or None,
+    a LoadStatus value, and a list of message lines describing the outcome.
+    """
+    messages = []
+
+    try:
+        format_type = orcus.detect_format(bytes)
+    except Exception as e:
+        # A detection error marks the input as skipped, not as a failure.
+        messages.append(str(e))
+        return None, LoadStatus.SKIPPED, messages
+
+    messages.append(f"* format type: {format_type}")
+    messages.append(f"* size: {len(bytes)} bytes")
+
+    # Any exception raised from here on is reported as a load failure.
+    try:
+        loader = orcus.get_document_loader_module(format_type)
+        if loader is None:
+            messages.append(f"unhandled format type: {format_type}")
+            return None, LoadStatus.SKIPPED, messages
+
+        doc = loader.read(bytes, error_policy="skip")
+    except Exception as e:
+        messages.append(f"{e.__class__.__name__}: {e}")
+        return None, LoadStatus.FAILURE, messages
+
+    return doc, LoadStatus.SUCCESS, messages
+
+
+def print_results(inpath):
+    """Print the saved output file of inpath, indented, between two blank lines."""
+    with open(f"{inpath}.{config.ext_out}", "r") as f:
+        print()
+        for line in f:
+            print(f"  {line.strip()}")
+        print()
+
+
+def remove_result_files(rootdir):
+    """Delete every file below rootdir that is_special_file() recognizes."""
+    for root, _dirs, files in os.walk(rootdir):
+        for filename in files:
+            if is_special_file(filename):
+                os.remove(os.path.join(root, filename))
+
+
+def show_result_stats(rootdir):
+    """Print how many input files below rootdir are good, bad, skipped or unprocessed."""
+    counts = {"good": 0, "bad": 0, "skipped": 0, "unprocessed": 0}
+    markers = (("good", config.ext_good), ("bad", config.ext_bad),
+               ("skipped", config.ext_out))
+    for root, _dirs, files in os.walk(rootdir):
+        for filename in files:
+            if is_special_file(filename):
+                continue
+
+            # The first marker file that exists decides the category.
+            inpath = os.path.join(root, filename)
+            for category, ext in markers:
+                if os.path.isfile(f"{inpath}.{ext}"):
+                    counts[category] += 1
+                    break
+            else:
+                counts["unprocessed"] += 1
+
+    print("* result counts")
+    for cat in ("good", "bad", "skipped", "unprocessed"):
+        print(f"  * {cat}: {counts[cat]}")
+
+    # Ratios only take the good and bad files into account.
+    total = counts["good"] + counts["bad"]
+    if total:
+        print("* ratios")
+        print(f"  * good: {counts['good']/total*100:.1f}%")
+        print(f"  * bad: {counts['bad']/total*100:.1f}%")
+
+
+def show_results(rootdir, good, bad):
+    """Print the path and saved output of processed files found below rootdir.
+
+    Files with a good marker are shown only when ``good`` is true and files
+    with a bad marker only when ``bad`` is true; other files are not shown.
+    """
+    for root, _dirs, files in os.walk(rootdir):
+        for filename in files:
+            if is_special_file(filename):
+                continue
+
+            inpath = os.path.join(root, filename)
+            show_good = good and os.path.isfile(f"{inpath}.{config.ext_good}")
+            show_bad = bad and os.path.isfile(f"{inpath}.{config.ext_bad}")
+            if show_good or show_bad:
+                print(sanitize_string(inpath), flush=True)
+                print_results(inpath)
+
+
+def load_module_from_filepath(filepath):
+    """Import the Python file at filepath as a module named after the file ('-' becomes '_')."""
+    mod_name = os.path.splitext(os.path.basename(filepath))[0].replace('-', '_')
+    spec = importlib.util.spec_from_file_location(mod_name, filepath)
+    # spec is None when the file suffix is not one Python can import (e.g. not ".py").
+    if not os.path.isfile(filepath) or spec is None or spec.loader is None:
+        raise RuntimeError(f"{filepath} is not a valid file.")
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
+
+
+def process_filepath(i, inpath, outpath, processor_path):
+    """Load one input file, save its output and marker file, and return the terminal text.
+
+    The optional module at processor_path must define process_document(inpath, doc)
+    returning an iterable of output lines; it is called only when a document was loaded.
+    """
+    term_buf = [f"{i} {sanitize_string(inpath)}"]  # terminal output buffer
+    good_filepath = f"{inpath}.{config.ext_good}"
+    bad_filepath = f"{inpath}.{config.ext_bad}"
+    if os.path.isfile(good_filepath) or os.path.isfile(bad_filepath):
+        term_buf.append("already processed. skipping...")
+        return "\n".join(term_buf)
+
+    # Import the processor module only for files that actually get processed.
+    mod = load_module_from_filepath(processor_path) if processor_path else None
+    with open(inpath, 'rb') as f:
+        content = f.read()
+
+    doc, status, buf = load_doc(content)  # buf: non-terminal output buffer
+    if doc and mod:
+        buf.extend(mod.process_document(inpath, doc))
+
+    with open(outpath, "w") as f:
+        f.write("\n".join(buf))
+    term_buf.extend(buf)
+
+    # A skipped file gets no marker; only its output file records the attempt.
+    if status == LoadStatus.SUCCESS:
+        pathlib.Path(good_filepath).touch()
+    elif status == LoadStatus.FAILURE:
+        pathlib.Path(bad_filepath).touch()
+
+    return "\n".join(term_buf)
+
+
+def _create_argparser():
+    """Build the command-line parser of the file processor."""
+    ap = argparse.ArgumentParser(description="""This script allows you to process a collection of spreadsheet documents.""")
+    ap.add_argument(
+        "--skip-file", type=argparse.FileType("r"),
+        help="Optional text file containing a set of regular expressions (one per line). Files that match one of these rules will be skipped.")
+    ap.add_argument("--processes", default=1, type=int, help="Number of worker processes to use.")
+    ap.add_argument("-p", "--processor", help="Python module file containing callback functions.")
+    ap.add_argument(
+        "--remove-results", action="store_true",
+        help="Remove all cached results files from the directory tree.")
+    ap.add_argument(
+        "--results", action="store_true",
+        help="Display the results of the processed files.")
+    ap.add_argument(
+        "--good", action="store_true",
+        help="Display the results of the successfully processed files.")
+    ap.add_argument(
+        "--bad", action="store_true",
+        help="Display the results of the unsuccessfully processed files.")
+    ap.add_argument(
+        "--stats", action="store_true",
+        help="Display statistics of the results.  Use it with --results.")
+    ap.add_argument(
+        "rootdir", metavar="ROOT-DIR",
+        help="Root directory below which to recursively find and process test files.")
+    return ap
+
+
+def main():
+    """Entry point of the file processor command.
+
+    Depending on the options, removes the result files, shows stored results
+    or statistics, or loads every eligible file below the root directory.
+    """
+    args = _create_argparser().parse_args()
+
+    if args.remove_results:
+        remove_result_files(args.rootdir)
+        return
+
+    if args.results:
+        if args.stats:
+            show_result_stats(args.rootdir)
+        else:
+            show_results(args.rootdir, args.good, args.bad)
+        return
+
+    # One compiled regular expression per non-blank line of the skip file.
+    skip_rules = []
+    if args.skip_file:
+        stripped = (line.strip() for line in args.skip_file.readlines())
+        skip_rules = [re.compile(line) for line in stripped if line]
+
+    # build a list of files to process.
+    filepaths = []
+    for root, _dirs, files in os.walk(args.rootdir):
+        for filename in files:
+            if is_special_file(filename):
+                continue
+
+            inpath = os.path.join(root, filename)
+            outpath = f"{inpath}.{config.ext_out}"
+            if skips_by_rule(inpath, skip_rules):
+                # An output file without a good/bad marker records the skip.
+                pathlib.Path(outpath).touch()
+                continue
+
+            filepaths.append((inpath, outpath))
+
+    with mp.Pool(processes=args.processes) as pool:
+        # Submit everything first, then print the outputs in submission order.
+        futures = [
+            pool.apply_async(process_filepath, (i, inpath, outpath, args.processor))
+            for i, (inpath, outpath) in enumerate(filepaths)
+        ]
+
+        for future in futures:
+            print(future.get())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/python/orcus/xls_xml.py b/src/python/orcus/xls_xml.py
new file mode 100644
index 0000000..671d39b
--- /dev/null
+++ b/src/python/orcus/xls_xml.py
@@ -0,0 +1,10 @@
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+from _orcus import _xls_xml_read as read
+
diff --git a/src/python/orcus/xlsx.py b/src/python/orcus/xlsx.py
new file mode 100644
index 0000000..fd24ee1
--- /dev/null
+++ b/src/python/orcus/xlsx.py
@@ -0,0 +1,10 @@
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+from _orcus import _xlsx_read as read
+