diff options
Diffstat (limited to 'src/python/orcus')
-rw-r--r-- | src/python/orcus/__init__.py | 111 | ||||
-rw-r--r-- | src/python/orcus/csv.py | 10 | ||||
-rw-r--r-- | src/python/orcus/gnumeric.py | 10 | ||||
-rw-r--r-- | src/python/orcus/json.py | 10 | ||||
-rw-r--r-- | src/python/orcus/ods.py | 10 | ||||
-rw-r--r-- | src/python/orcus/tools/__init__.py | 9 | ||||
-rw-r--r-- | src/python/orcus/tools/bugzilla.py | 219 | ||||
-rw-r--r-- | src/python/orcus/tools/file_processor.py | 295 | ||||
-rw-r--r-- | src/python/orcus/xls_xml.py | 10 | ||||
-rw-r--r-- | src/python/orcus/xlsx.py | 10 |
10 files changed, 694 insertions, 0 deletions
# src/python/orcus/__init__.py
#######################################################################
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
########################################################################

try:
    from _orcus import *
    from _orcus import __version__
except ModuleNotFoundError:
    # The compiled extension is allowed to be absent so that sphinx can
    # generate the documentation without the C++ part having been built.
    pass

from enum import Enum


class FormatType(Enum):
    """File format types currently used in orcus."""

    UNKNOWN = 0
    ODS = 1
    XLSX = 2
    GNUMERIC = 3
    XLS_XML = 4
    CSV = 5
    YAML = 6
    JSON = 7
    XML = 8
    PARQUET = 9


class CellType(Enum):
    """Types of cells stored in a spreadsheet."""

    UNKNOWN = 0
    EMPTY = 1
    BOOLEAN = 2
    NUMERIC = 3
    STRING = 4
    STRING_WITH_ERROR = 5
    FORMULA = 6
    FORMULA_WITH_ERROR = 7


class FormulaTokenType(Enum):
    """Types of formula tokens."""

    UNKNOWN = 0
    REFERENCE = 1
    VALUE = 2
    NAME = 3
    FUNCTION = 4
    OPERATOR = 5
    ERROR = 6


class FormulaTokenOp(Enum):
    """Opcodes of formula tokens."""

    UNKNOWN = 0
    SINGLE_REF = 1
    RANGE_REF = 2
    TABLE_REF = 3
    NAMED_EXPRESSION = 4
    STRING = 5
    VALUE = 6
    FUNCTION = 7
    PLUS = 8
    MINUS = 9
    DIVIDE = 10
    MULTIPLY = 11
    EXPONENT = 12
    CONCAT = 13
    EQUAL = 14
    NOT_EQUAL = 15
    LESS = 16
    GREATER = 17
    LESS_EQUAL = 18
    GREATER_EQUAL = 19
    OPEN = 20
    CLOSE = 21
    SEP = 22
    ERROR = 23


def get_document_loader_module(format_type):
    """Return the document loader module for the given format type.

    The loader sub-module is imported only when its format is requested.

    Args:
        format_type (orcus.FormatType):
            Format type for which to load a document loader.

    Returns:
        The loader module for ODS, XLSX, XLS_XML, GNUMERIC or CSV;
        ``None`` for every other value.
    """
    if format_type == FormatType.ODS:
        from . import ods
        return ods

    if format_type == FormatType.XLSX:
        from . import xlsx
        return xlsx

    if format_type == FormatType.XLS_XML:
        from . import xls_xml
        return xls_xml

    if format_type == FormatType.GNUMERIC:
        from . import gnumeric
        return gnumeric

    if format_type == FormatType.CSV:
        from . import csv
        return csv

    return None


# ----------------------------------------------------------------------
# The same diff also adds these single-statement modules (each preceded by
# the MPL 2.0 license header):
#
#   src/python/orcus/csv.py
#       from _orcus import _csv_read as read
#
#   src/python/orcus/gnumeric.py
#       from _orcus import _gnumeric_read as read
#
# src/python/orcus/json.py follows; its license header starts here and
# continues on the next line of the diff:
########################################################################
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0.
If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +######################################################################## + +from _orcus_json import * + diff --git a/src/python/orcus/ods.py b/src/python/orcus/ods.py new file mode 100644 index 0000000..8bcf2b9 --- /dev/null +++ b/src/python/orcus/ods.py @@ -0,0 +1,10 @@ +######################################################################## +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +######################################################################## + +from _orcus import _ods_read as read + diff --git a/src/python/orcus/tools/__init__.py b/src/python/orcus/tools/__init__.py new file mode 100644 index 0000000..413bd0a --- /dev/null +++ b/src/python/orcus/tools/__init__.py @@ -0,0 +1,9 @@ +######################################################################## +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +######################################################################## + + diff --git a/src/python/orcus/tools/bugzilla.py b/src/python/orcus/tools/bugzilla.py new file mode 100644 index 0000000..f23b4d5 --- /dev/null +++ b/src/python/orcus/tools/bugzilla.py @@ -0,0 +1,219 @@ +######################################################################## +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
#
########################################################################

import argparse
import base64
import concurrent.futures as cf
import json
import os
import traceback
from pathlib import Path
from urllib.parse import urlparse

import requests


class BugzillaAccess:
    """Encapsulates access to a bugzilla server by using its REST API.

    Every response body is stored as a file in the cache directory; when the
    cache file for a request already exists, its content is returned and no
    request is sent.

    Args:
        bzurl (str): URL to the bugzilla server.
        cache_dir (:obj:`pathlib.Path`): path to the cache directory.  It is
            created when it does not exist.
    """

    def __init__(self, bzurl, cache_dir):
        self._bzurl = bzurl
        self._cache_dir = cache_dir
        os.makedirs(self._cache_dir, exist_ok=True)

    def _get_cache_content(self, cache_file, func_fetch):
        """Return the text cached in ``cache_file``, fetching it on a miss.

        Args:
            cache_file: path of the cache file.
            func_fetch: callable taking no arguments that returns the text to
                cache; only called when ``cache_file`` does not exist.

        Returns:
            str: the cached (or freshly fetched) text.
        """
        # The encoding is pinned so that server text containing non-ASCII
        # characters can be written regardless of the locale encoding.
        if os.path.isfile(cache_file):
            with open(cache_file, 'r', encoding="utf-8") as f:
                return f.read()

        s = func_fetch()
        with open(cache_file, 'w', encoding="utf-8") as f:
            f.write(s)

        return s

    def get_bug_ids(self, bz_params):
        """Get all bug ID's for specified bugzilla query parameters.

        Args:
            bz_params (dict):
                dictionary containing all search parameters.  Each search term
                must form a single key-value pair.

        Returns:
            list: the ``id`` value of every bug in the response, in response
            order.  Bugs without an ``id`` (or with a falsy one) are dropped.

        Raises:
            RuntimeError: when the server responds with a status other than
                200.
        """

        def _fetch():
            r = requests.get(
                f"{self._bzurl}/rest/bug",
                params=bz_params
            )

            if r.status_code != 200:
                raise RuntimeError(f"failed to query bug ids from the TDF bugzilla! (status:{r.status_code})")
            return r.text

        # The cache file name is derived from the query itself:
        # key-value-key-value....json, with ' ' and '/' in values turned
        # into '-'.
        escape = str.maketrans({" ": "-", "/": "-"})
        buf = []
        for key, value in bz_params.items():
            buf.append(key)
            buf.append(str(value).translate(escape))

        cache_file = self._cache_dir / ('-'.join(buf) + ".json")
        content = json.loads(self._get_cache_content(cache_file, _fetch))

        bugs = content.get("bugs")
        if not bugs:
            return []

        return [bug_id for bug_id in (bug.get("id") for bug in bugs) if bug_id]

    def get_attachments(self, bug_id):
        """Fetch all attachments for specified bug.

        Args:
            bug_id: ID of the bug whose attachments to fetch.

        Returns:
            list of dict: one dictionary per attachment with non-empty data,
            holding the keys ``content_type``, ``filename`` and ``data`` (the
            base64-decoded content as bytes).

        Raises:
            RuntimeError: when the server responds with a status other than
                200.
        """

        def _fetch():
            r = requests.get(f"{self._bzurl}/rest/bug/{bug_id}/attachment")
            if r.status_code != 200:
                raise RuntimeError(
                    f"failed to fetch the attachments for bug {bug_id}! (status:{r.status_code})")
            return r.text

        cache_file = self._cache_dir / f"attachments-{bug_id}.json"
        content = json.loads(self._get_cache_content(cache_file, _fetch))

        attachments = []
        for d in content["bugs"][str(bug_id)]:
            data = d["data"]
            if not data:
                continue
            attachments.append({
                "content_type": d["content_type"],
                "filename": d["file_name"],
                "data": base64.b64decode(data),
            })
        return attachments


def parse_query_params(queries):
    """Convert ``key=value`` query terms into a parameter dictionary.

    Only the first ``=`` separates the key from the value, so values may
    contain ``=`` themselves.  A value that starts with a single or double
    quote must end with the same quote; the surrounding quotes are removed.

    Args:
        queries: iterable of ``key=value`` strings.

    Returns:
        dict: mapping of each key to its (unquoted) value.  A later term
        overwrites an earlier one with the same key.

    Raises:
        ValueError: when a term contains no ``=``.
        argparse.ArgumentError: when the quotes around a value do not match.
    """
    bz_params = {}
    for query in queries:
        k, sep, v = query.partition('=')
        if not sep:
            raise ValueError(f"query term is not in the form key=value: {query}")

        if v and v[0] in ('"', "'"):
            # A lone quote character is its own first and last character, so
            # the length has to be checked as well.
            if len(v) < 2 or v[0] != v[-1]:
                # ArgumentError takes (argument, message); there is no
                # argparse action to associate with the error here.
                raise argparse.ArgumentError(None, f"mis-matched quotes in {query}")
            v = v[1:-1]
        bz_params[k] = v
    return bz_params


def _create_argparser():
    """Build the command-line parser for the attachment downloader."""
    parser = argparse.ArgumentParser(
        description="""This command allows you to download attachments from a
bugzilla server that supports REST API.""")
    parser.add_argument(
        "--outdir", "-o", type=str, required=True,
        help="""output directory for downloaded files. Downloaded files are
grouped by their respective bug ID's.""")
    parser.add_argument(
        "--limit", type=int, default=50,
        help="number of bugs to include in a single set of search results.")
    parser.add_argument(
        "--offset", type=int, default=0,
        help="number of bugs to skip in the search results.")
    parser.add_argument(
        "--cont", action="store_true", default=False,
        help="""when specified, the search continues after the initial batch
is returned, by retrieving the next batch of results until the entire search
results are returned. The number specified by the ``--limit`` option is used
as the batch size.""")
    parser.add_argument(
        "--worker", type=int, default=8,
        help="number of worker threads to use for parallel downloads of files.")
    parser.add_argument(
        "--cache-dir", type=Path, default=Path(".bugzilla"),
        help="""directory to keep downloaded bugzilla search results. The
command will not send the query request to the remote server when the results
are cached. You may want to delete the cache directory after you are finished.""")
    parser.add_argument(
        "--url", type=str, required=True,
        help="""base URL for bugzilla service. It must begin with the
``http(s)://`` prefix.""")
    parser.add_argument(
        "query", type=str, nargs='*',
        help="""One or more query term to use to limit your search. Each query
term must be in the form key=value. You need to quote the value string when the
value string contains whitespace character i.e. key="value with space".""")
    return parser


def _safe_attachment_name(filename, index):
    """Reduce a server-supplied attachment name to a plain file name.

    The name comes from the remote server, so directory components are
    dropped to keep the file inside the bug's output directory.  When nothing
    usable is left, ``attachment-<index>`` is used instead.
    """
    name = Path(filename).name
    if name in ("", ".", ".."):
        return f"attachment-{index}"
    return name


def main():
    """Download the attachments of all bugs matching the given query terms."""
    parser = _create_argparser()
    args = parser.parse_args()

    try:
        bz_params = parse_query_params(args.query)
    except (argparse.ArgumentError, ValueError) as e:
        # Report malformed query terms as a usage error instead of a
        # traceback.
        parser.error(str(e))

    for k, v in bz_params.items():
        print(f"{k}: {v}")

    bz_params["limit"] = args.limit
    bz_params["offset"] = args.offset

    url = urlparse(args.url)
    cache_dir = Path(args.cache_dir) / url.netloc
    bz = BugzillaAccess(args.url, cache_dir)

    def _run(bug_id, index, totals):
        """Top-level function for each worker thread."""
        width = len(str(totals))
        print(f"({index + 1:>{width}}/{totals}) fetching attachments for bug {bug_id} ...", flush=True)

        try:
            bug_dir = Path(args.outdir) / url.netloc / str(bug_id)
            for pos, attachment in enumerate(bz.get_attachments(bug_id)):
                name = _safe_attachment_name(attachment["filename"], pos)
                # Created lazily so that bugs without attachments leave no
                # empty directory behind.
                bug_dir.mkdir(parents=True, exist_ok=True)
                with open(bug_dir / name, "wb") as f:
                    f.write(attachment["data"])
        except Exception as e:
            # Worker boundary: report the failure and let the other downloads
            # continue.
            traceback.print_exc()
            print(e)

    iter_count = 0
    while True:
        bug_ids = bz.get_bug_ids(bz_params)
        if not bug_ids:
            return
        print(f"-- iteration {iter_count+1}", flush=True)
        # Leaving the with-block waits for all submitted downloads.
        with cf.ThreadPoolExecutor(max_workers=args.worker) as executor:
            for i, bug_id in enumerate(bug_ids):
                executor.submit(_run, bug_id, i, len(bug_ids))

        if not args.cont:
            return

        bz_params["offset"] += bz_params["limit"]
        iter_count += 1


if __name__ == "__main__":
    main()


# ----------------------------------------------------------------------
# src/python/orcus/tools/file_processor.py follows in the same diff; its
# license header starts here and continues on the next line of the diff:
########################################################################
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
########################################################################

import argparse
import os
import os.path
import sys
import string
import pathlib
import enum
import re
import multiprocessing as mp
import importlib.util

import orcus


class _Config:
    # Suffixes of the marker/result files written next to each input file.
    ext_good = "orcus-pf.good"
    ext_bad = "orcus-pf.bad"
    ext_out = "orcus-pf.out"
    # Files whose name contains this text are never processed.
    prefix_skip = ".orcus-pf.skip."


config = _Config()


def is_special_file(filename):
    """Return True if ``filename`` is a result/marker file or a skip file."""
    if config.prefix_skip in filename:
        return True

    return filename.endswith((config.ext_out, config.ext_good, config.ext_bad))


def skips_by_rule(filename, skip_rules):
    """Return True if any compiled pattern in ``skip_rules`` matches ``filename``."""
    return any(rule.search(filename) for rule in skip_rules)


def sanitize_string(s):
    """Replace characters not in ``string.printable`` with \\x[value]."""
    return "".join(
        c if c in string.printable else f"\\x{ord(c):02X}"
        for c in s
    )


class LoadStatus(enum.Enum):
    SUCCESS = 0
    FAILURE = 1
    SKIPPED = 2


def load_doc(bytes):
    """Detect the format of a byte stream and load it as a document.

    Args:
        bytes: content of the file to load.  (The name shadows the builtin;
            it is kept because it is part of the function's signature.)

    Returns:
        tuple: ``(doc, status, buf)`` where ``doc`` is the loaded document or
        ``None``, ``status`` is a :class:`LoadStatus` and ``buf`` is a list of
        output lines.  The status is ``SKIPPED`` when format detection raises
        or no loader module exists for the format, ``FAILURE`` when loading
        raises, and ``SUCCESS`` otherwise.
    """
    buf = []

    try:
        format_type = orcus.detect_format(bytes)
    except Exception as e:
        buf.append(str(e))
        return None, LoadStatus.SKIPPED, buf

    buf.append(f"* format type: {format_type}")
    buf.append(f"* size: {len(bytes)} bytes")

    try:
        loader = orcus.get_document_loader_module(format_type)
        if loader is None:
            buf.append(f"unhandled format type: {format_type}")
            return None, LoadStatus.SKIPPED, buf

        doc = loader.read(bytes, error_policy="skip")
        return doc, LoadStatus.SUCCESS, buf

    except Exception as e:
        buf.append(f"{e.__class__.__name__}: {e}")
        return None, LoadStatus.FAILURE, buf


def print_results(inpath):
    """Print the stored output of ``inpath``, indented and framed by blank lines."""
    outpath = f"{inpath}.{config.ext_out}"
    with open(outpath, "r") as f:
        print()
        for line in f:
            print(f" {line.strip()}")
        print()


def remove_result_files(rootdir):
    """Delete every result/marker/skip file found below ``rootdir``."""
    for root, _dirs, files in os.walk(rootdir):
        for filename in files:
            if is_special_file(filename):
                os.remove(os.path.join(root, filename))


def show_result_stats(rootdir):
    """Print how many input files below ``rootdir`` fall into each result category.

    A file counts as good or bad when the respective marker file exists, as
    skipped when only its output file exists, and as unprocessed otherwise.
    The ratios are relative to the number of good plus bad files.
    """
    counts = dict(good=0, bad=0, skipped=0, unprocessed=0)
    for root, _dirs, files in os.walk(rootdir):
        for filename in files:
            if is_special_file(filename):
                continue

            inpath = os.path.join(root, filename)
            if os.path.isfile(f"{inpath}.{config.ext_good}"):
                counts["good"] += 1
            elif os.path.isfile(f"{inpath}.{config.ext_bad}"):
                counts["bad"] += 1
            elif os.path.isfile(f"{inpath}.{config.ext_out}"):
                counts["skipped"] += 1
            else:
                counts["unprocessed"] += 1

    print("* result counts")
    for cat in ("good", "bad", "skipped", "unprocessed"):
        print(f" * {cat}: {counts[cat]}")

    total = counts["good"] + counts["bad"]
    if total:
        print("* ratios")
        print(f" * good: {counts['good']/total*100:.1f}%")
        print(f" * bad: {counts['bad']/total*100:.1f}%")


def show_results(rootdir, good, bad):
    """Print the stored output of processed files below ``rootdir``.

    Args:
        rootdir: root directory to walk.
        good: when true, include files that have a "good" marker.
        bad: when true, include files that have a "bad" marker.
    """
    for root, _dirs, files in os.walk(rootdir):
        for filename in files:
            if is_special_file(filename):
                continue

            inpath = os.path.join(root, filename)
            is_good = os.path.isfile(f"{inpath}.{config.ext_good}")
            is_bad = os.path.isfile(f"{inpath}.{config.ext_bad}")

            if (is_good and good) or (is_bad and bad):
                print(sanitize_string(inpath), flush=True)
                print_results(inpath)


def load_module_from_filepath(filepath):
    """Import the Python file at ``filepath`` as a module and return it.

    The module name is the file's base name without extension, with ``-``
    replaced by ``_``.

    Raises:
        RuntimeError: when ``filepath`` is not an existing file, or when no
            import spec/loader can be created for it.
    """
    if not os.path.isfile(filepath):
        raise RuntimeError(f"{filepath} is not a valid file.")

    mod_name = os.path.splitext(os.path.basename(filepath))[0]
    mod_name = mod_name.replace('-', '_')
    spec = importlib.util.spec_from_file_location(mod_name, filepath)
    # spec_from_file_location() returns None when it cannot determine a
    # loader for the file; fail with a clear message instead of an
    # AttributeError further down.
    if spec is None or spec.loader is None:
        raise RuntimeError(f"{filepath} cannot be loaded as a python module.")

    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod


def process_filepath(i, inpath, outpath, processor_path):
    """Load one input file, store its output and mark it good or bad.

    Args:
        i: index of the file, used as the prefix of the terminal output.
        inpath: path of the file to load.
        outpath: path of the file that receives the output lines.
        processor_path: optional path of a Python module file whose
            ``process_document(inpath, doc)`` is called on a loaded document
            and whose returned lines are appended to the output.

    Returns:
        str: text to show on the terminal for this file.
    """
    term_buf = [f"{i} {sanitize_string(inpath)}"]  # terminal output buffer

    good_filepath = f"{inpath}.{config.ext_good}"
    bad_filepath = f"{inpath}.{config.ext_bad}"

    if os.path.isfile(good_filepath) or os.path.isfile(bad_filepath):
        term_buf.append("already processed. skipping...")
        return "\n".join(term_buf)

    # Loaded only once we know the file needs processing.
    mod = load_module_from_filepath(processor_path) if processor_path else None

    with open(inpath, 'rb') as f:
        content = f.read()

    doc, status, output = load_doc(content)
    buf = list(output)  # non-terminal output buffer
    if doc and mod:
        buf.extend(mod.process_document(inpath, doc))

    with open(outpath, "w") as f:
        f.write("\n".join(buf))

    term_buf.extend(buf)

    # A skipped file gets no marker; only its output file exists.
    if status == LoadStatus.SUCCESS:
        pathlib.Path(good_filepath).touch()
    elif status == LoadStatus.FAILURE:
        pathlib.Path(bad_filepath).touch()

    return "\n".join(term_buf)


def _create_argparser():
    """Build the command-line parser for the file processor."""
    parser = argparse.ArgumentParser(
        description="""This script allows you to process a collection of spreadsheet documents.""")
    parser.add_argument(
        "--skip-file", type=argparse.FileType("r"),
        help="Optional text file containing a set of regular expressions (one per line). Files that match one of these rules will be skipped.")
    parser.add_argument("--processes", type=int, default=1, help="Number of worker processes to use.")
    parser.add_argument("-p", "--processor", type=str, help="Python module file containing callback functions.")
    parser.add_argument(
        "--remove-results", action="store_true", default=False,
        help="Remove all cached results files from the directory tree.")
    parser.add_argument(
        "--results", action="store_true", default=False,
        help="Display the results of the processed files.")
    parser.add_argument(
        "--good", action="store_true", default=False,
        help="Display the results of the successfully processed files.")
    parser.add_argument(
        "--bad", action="store_true", default=False,
        help="Display the results of the unsuccessfully processed files.")
    parser.add_argument(
        "--stats", action="store_true", default=False,
        help="Display statistics of the results. Use it with --results.")
    parser.add_argument(
        "rootdir", metavar="ROOT-DIR",
        help="Root directory below which to recursively find and process test files.")
    return parser


def main():
    """Entry point: remove results, show results, or process the files."""
    parser = _create_argparser()
    args = parser.parse_args()

    if args.remove_results:
        remove_result_files(args.rootdir)
        return

    if args.results:
        if args.stats:
            show_result_stats(args.rootdir)
            return

        show_results(args.rootdir, args.good, args.bad)
        return

    skip_rules = []

    if args.skip_file:
        # argparse opened the file; close it once the rules are read.
        with args.skip_file as f:
            for line in f:
                line = line.strip()
                if line:
                    skip_rules.append(re.compile(line))

    # build a list of files to process.
    filepaths = []
    for root, _dirs, files in os.walk(args.rootdir):
        for filename in files:
            if is_special_file(filename):
                continue

            inpath = os.path.join(root, filename)
            outpath = f"{inpath}.{config.ext_out}"
            if skips_by_rule(inpath, skip_rules):
                # An output file without a marker records the file as skipped.
                pathlib.Path(outpath).touch()
                continue

            filepaths.append((inpath, outpath))

    with mp.Pool(processes=args.processes) as pool:
        futures = [
            pool.apply_async(process_filepath, (i, inpath, outpath, args.processor))
            for i, (inpath, outpath) in enumerate(filepaths)
        ]

        # Results are printed in submission order.
        for future in futures:
            print(future.get())


if __name__ == "__main__":
    main()


# ----------------------------------------------------------------------
# The same diff also adds these single-statement modules (each preceded by
# the MPL 2.0 license header):
#
#   src/python/orcus/xls_xml.py
#       from _orcus import _xls_xml_read as read
#
#   src/python/orcus/xlsx.py
#       from _orcus import _xlsx_read as read