#!/usr/bin/env python3
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
Runs the static rooting analysis
"""

import argparse
import os
import subprocess
import sys

from subprocess import Popen

# shlex.quote is the modern location of this helper; pipes.quote is the
# legacy fallback for older interpreters.
try:
    from shlex import quote
except ImportError:
    from pipes import quote


def execfile(thefile, globals):
    """Execute the Python file `thefile` using `globals` as its global
    namespace (a stand-in for Python 2's removed execfile() builtin)."""
    exec(compile(open(thefile).read(), filename=thefile, mode="exec"), globals)


# Label a string as an output.
class Output(str):
    pass


# Label a string as a pattern for multiple inputs.
class MultiInput(str):
    pass


# Construct a new environment by merging in some settings needed for running the individual scripts.
def env(config):
    """Return a copy of os.environ with PATH extended to include the sixgill
    binaries, plus the XDB and SOURCE settings the analysis scripts read."""
    # Add config['sixgill_bin'] to $PATH if not already there.
    path = os.environ["PATH"].split(":")
    if dir := config.get("sixgill_bin"):
        if dir not in path:
            path.insert(0, dir)
    return dict(
        os.environ,
        PATH=":".join(path),
        XDB=f"{config['sixgill_bin']}/xdb.so",
        SOURCE=config["source"],
    )


def fill(command, config):
    """Instantiate a command template: str.format() each fragment against
    `config`, preserving the Output label on substituted outputs and expanding
    each MultiInput fragment into config['jobs'] numbered input names.

    Raises Exception if any substitution key is missing."""
    filled = []
    for s in command:
        try:
            rep = s.format(**config)
        except KeyError:
            print("Substitution failed: %s" % s)
            filled = None
            break
        if isinstance(s, Output):
            # Keep the Output label on the substituted string so callers can
            # still identify output fragments.
            filled.append(Output(rep))
        elif isinstance(s, MultiInput):
            # Expand to one input filename per parallel chunk.
            N = int(config["jobs"])
            for i in range(1, N + 1):
                filled.append(rep.format(i=i, n=N))
        else:
            filled.append(rep)
    if filled is None:
        raise Exception("substitution failure")
    return tuple(filled)


def print_command(job, config, env=None):
    # Display a command to run that has roughly the same effect as what was
    # actually run. The actual command uses temporary files that get renamed at
    # the end, and runs some commands in parallel chunks. The printed command
    # will substitute in the actual output and run in a single chunk, so that
    # it is easier to cut & paste and add a --function flag for debugging.
    cfg = dict(config, n=1, i=1, jobs=1)
    cmd = job_command_with_final_output_names(job)
    cmd = fill(cmd, cfg)
    cmd = [quote(s) for s in cmd]
    if outfile := job.get("redirect-output"):
        cmd.extend([">", quote(outfile.format(**cfg))])
    # Abbreviate the home directory to keep the printed command short.
    if HOME := os.environ.get("HOME"):
        cmd = [s.replace(HOME, "~") for s in cmd]

    if env:
        # Try to keep the command as short as possible by only displaying
        # modified environment variable settings.
        e = os.environ
        changed = {key: value for key, value in env.items() if value != e.get(key)}
        if changed:
            settings = []
            for key, value in changed.items():
                if key in e and e[key] in value:
                    # Display modifications as V=prefix${V}suffix when
                    # possible. This can make a huge difference for $PATH.
                    start = value.index(e[key])
                    end = start + len(e[key])
                    setting = '%s="%s${%s}%s"' % (key, value[:start], key, value[end:])
                else:
                    setting = '%s="%s"' % (key, value)
                if HOME:
                    setting = setting.replace(HOME, "$HOME")
                settings.append(setting)
            cmd = settings + cmd

    print(" " + " ".join(cmd))


# Table of analysis steps. Per-step schema (all keys optional except
# "command"):
#   command         - argv template; each fragment is str.format()ed against
#                     the config by fill(). Output marks a file the command
#                     creates; MultiInput expands to one input per chunk.
#   outputs         - final filenames for the Output fragments, in order.
#   redirect-output - filename that receives the command's stdout.
#   multi-output    - if true, run config['jobs'] parallel chunks, with {i}
#                     substituted as the chunk number and {n} as the count.
JOBS = {
    "list-dbs": {"command": ["ls", "-l"]},
    "rawcalls": {
        "command": [
            "{js}",
            "{analysis_scriptdir}/computeCallgraph.js",
            "{typeInfo}",
            Output("{rawcalls}"),
            "{i}",
            "{n}",
        ],
        "multi-output": True,
        "outputs": ["rawcalls.{i}.of.{n}"],
    },
    "gcFunctions": {
        "command": [
            "{js}",
            "{analysis_scriptdir}/computeGCFunctions.js",
            MultiInput("{rawcalls}"),
            "--outputs",
            Output("{callgraph}"),
            Output("{gcFunctions}"),
            Output("{gcFunctions_list}"),
            Output("{limitedFunctions_list}"),
        ],
        "outputs": [
            "callgraph.txt",
            "gcFunctions.txt",
            "gcFunctions.lst",
            "limitedFunctions.lst",
        ],
    },
    "gcTypes": {
        "command": [
            "{js}",
            "{analysis_scriptdir}/computeGCTypes.js",
            Output("{gcTypes}"),
            Output("{typeInfo}"),
        ],
        "outputs": ["gcTypes.txt", "typeInfo.txt"],
    },
    "allFunctions": {
        "command": ["{sixgill_bin}/xdbkeys", "src_body.xdb"],
        "redirect-output": "allFunctions.txt",
    },
    "hazards": {
        "command": [
            "{js}",
            "{analysis_scriptdir}/analyzeRoots.js",
            "{gcFunctions_list}",
            "{limitedFunctions_list}",
            "{gcTypes}",
            "{typeInfo}",
            "{i}",
            "{n}",
            "tmp.{i}.of.{n}",
        ],
        "multi-output": True,
        "redirect-output": "rootingHazards.{i}.of.{n}",
    },
    "gather-hazards": {
        "command": [
            "{js}",
            "{analysis_scriptdir}/mergeJSON.js",
            MultiInput("{hazards}"),
            Output("{all_hazards}"),
        ],
        "outputs": ["rootingHazards.json"],
    },
    "explain": {
        "command": [
            sys.executable,
            "{analysis_scriptdir}/explain.py",
            "{all_hazards}",
            "{gcFunctions}",
            Output("{explained_hazards}"),
            Output("{unnecessary}"),
            Output("{refs}"),
            Output("{html}"),
        ],
        "outputs": ["hazards.txt", "unnecessary.txt", "refs.txt", "hazards.html"],
    },
    "heapwrites": {
        "command": ["{js}", "{analysis_scriptdir}/analyzeHeapWrites.js"],
        "redirect-output": "heapWriteHazards.txt",
    },
}


# Generator of (i, j, item) tuples corresponding to outputs:
# - i is just the index of the yielded tuple (a la enumerate())
# - j is the index of the item in the command list
# - item is command[j]
def out_indexes(command):
    i = 0
    for (j, fragment) in enumerate(command):
        if isinstance(fragment, Output):
            yield (i, j, fragment)
            i += 1


def job_command_with_final_output_names(job):
    """Return the job's command list with each Output placeholder replaced by
    its final (post-rename) output filename."""
    outfiles = job.get("outputs", [])
    command = list(job["command"])
    for (i, j, name) in out_indexes(job["command"]):
        command[j] = outfiles[i]
    return command


def run_job(name, config):
    """Run the JOBS entry `name`: spawn one process (or config['jobs']
    parallel chunks for a multi-output job), wait for them all to finish, and
    rename each temporary output file to its final name.

    Raises Exception if any child exited with a nonzero status."""
    job = JOBS[name]
    outs = job.get("outputs") or job.get("redirect-output")
    print("Running " + name + " to generate " + str(outs))
    if "function" in job:
        # NOTE(review): no JOBS entry visible in this file defines
        # "function"; this appears to be a hook for externally defined jobs.
        job["function"](config, job["redirect-output"])
        return

    N = int(config["jobs"]) if job.get("multi-output") else 1
    config["n"] = N
    jobs = {}
    for i in range(1, N + 1):
        config["i"] = i
        cmd = fill(job["command"], config)
        info = spawn_command(cmd, job, name, config)
        # Index the children by pid so os.wait() results can be matched up.
        jobs[info["proc"].pid] = info

    if config["verbose"] > 0:
        print_command(job, config, env=env(config))

    final_status = 0
    while jobs:
        pid, status = os.wait()
        # Remember the first nonzero wait status, but keep reaping children.
        final_status = final_status or status
        info = jobs[pid]
        del jobs[pid]
        if "redirect" in info:
            info["redirect"].close()

        # Rename the temporary files to their final names.
        for (temp, final) in info["rename_map"].items():
            try:
                if config["verbose"] > 1:
                    print("Renaming %s -> %s" % (temp, final))
                os.rename(temp, final)
            except OSError:
                print("Error renaming %s -> %s" % (temp, final))
                raise

    if final_status != 0:
        raise Exception("job {} returned status {}".format(name, final_status))


def spawn_command(cmdspec, job, name, config):
    """Start one chunk of a job. All outputs (including redirected stdout)
    are written under temporary names; returns a dict holding the Popen
    object, the temp->final rename map, and the stdout redirect file handle
    (if any)."""
    rename_map = {}
    if "redirect-output" in job:
        # Stdout goes to a temp file that run_job() renames on success.
        stdout_filename = "{}.tmp{}".format(name, config.get("i", ""))
        final_outfile = job["redirect-output"].format(**config)
        rename_map[stdout_filename] = final_outfile
        command = cmdspec
    else:
        outfiles = fill(job["outputs"], config)
        stdout_filename = None

        # Replace the Outputs with temporary filenames, and record a mapping
        # from those temp names to their actual final names that will be used
        # if the command succeeds.
        command = list(cmdspec)
        for (i, j, raw_name) in out_indexes(cmdspec):
            [name] = fill([raw_name], config)
            command[j] = "{}.tmp{}".format(name, config.get("i", ""))
            rename_map[command[j]] = outfiles[i]

    # Flush our own stdout so buffered text doesn't interleave with the
    # child's output.
    sys.stdout.flush()
    info = {"rename_map": rename_map}
    if stdout_filename:
        info["redirect"] = open(stdout_filename, "w")
        info["proc"] = Popen(command, stdout=info["redirect"], env=env(config))
    else:
        info["proc"] = Popen(command, env=env(config))

    if config["verbose"] > 1:
        print("Spawned process {}".format(info["proc"].pid))

    return info


# Default to conservatively assuming 4GB/job.
def max_parallel_jobs(job_size=4 * 2 ** 30):
    """Return the max number of parallel jobs we can run without overfilling
    memory, assuming heavyweight jobs."""
    # `nproc --ignore=1` leaves one core free for the rest of the system.
    # NOTE(review): relies on the `nproc` binary and the SC_PHYS_PAGES
    # sysconf key, so this is effectively Linux-only.
    from_cores = int(subprocess.check_output(["nproc", "--ignore=1"]).strip())
    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
    from_mem = round(mem_bytes / job_size)
    return min(from_cores, from_mem)


config = {"analysis_scriptdir": os.path.dirname(__file__)}

# Optional defaults.py files (next to this script, then in the cwd) may
# pre-populate the config before command-line arguments are applied.
defaults = [
    "%s/defaults.py" % config["analysis_scriptdir"],
    "%s/defaults.py" % os.getcwd(),
]

parser = argparse.ArgumentParser(
    description="Statically analyze build tree for rooting hazards."
)
parser.add_argument(
    "step", metavar="STEP", type=str, nargs="?", help="run only step STEP"
)
parser.add_argument(
    "--source", metavar="SOURCE", type=str, nargs="?", help="source code to analyze"
)
parser.add_argument(
    "--js",
    metavar="JSSHELL",
    type=str,
    nargs="?",
    help="full path to ctypes-capable JS shell",
)
parser.add_argument(
    "--first",
    metavar="STEP",
    type=str,
    nargs="?",
    help="execute all jobs starting with STEP",
)
parser.add_argument(
    "--last", metavar="STEP", type=str, nargs="?", help="stop at step STEP"
)
parser.add_argument(
    "--jobs",
    "-j",
    default=None,
    metavar="JOBS",
    type=int,
    help="number of simultaneous analyzeRoots.js jobs",
)
parser.add_argument(
    "--list", const=True, nargs="?", type=bool, help="display available steps"
)
parser.add_argument(
    "--expect-file",
    type=str,
    nargs="?",
    help="deprecated option, temporarily still present for backwards "
    "compatibility",
)
parser.add_argument(
    "--verbose",
    "-v",
    action="count",
    default=1,
    help="Display cut & paste commands to run individual steps (give twice for more output)",
)
parser.add_argument("--quiet", "-q", action="count", default=0, help="Suppress output")

args = parser.parse_args()
# Each --quiet cancels one --verbose; clamp at zero.
args.verbose = max(0, args.verbose - args.quiet)

for default in defaults:
    try:
        execfile(default, config)
        if args.verbose > 1:
            print("Loaded %s" % default)
    except Exception:
        # Best-effort: a missing defaults.py is fine. NOTE(review): this
        # also silently swallows errors raised *inside* a defaults.py.
        pass

# execfile() used config as the globals for running the defaults.py script,
# and will have set a __builtins__ key as a side effect.
del config["__builtins__"]

data = config.copy()

# Command-line arguments override values loaded from defaults.py.
for k, v in vars(args).items():
    if v is not None:
        data[k] = v

if args.jobs is not None:
    data["jobs"] = args.jobs
if not data.get("jobs"):
    data["jobs"] = max_parallel_jobs()

# Environment variables take precedence for the source directory.
if "GECKO_PATH" in os.environ:
    data["source"] = os.environ["GECKO_PATH"]
if "SOURCE" in os.environ:
    data["source"] = os.environ["SOURCE"]

# Default order in which the analysis steps run.
steps = [
    "gcTypes",
    "rawcalls",
    "gcFunctions",
    "allFunctions",
    "hazards",
    "gather-hazards",
    "explain",
    "heapwrites",
]

if args.list:
    # Print each step with its outputs ("*" marks multi-output steps).
    for step in steps:
        job = JOBS[step]
        outfiles = job.get("outputs") or job.get("redirect-output")
        if outfiles:
            print(
                "%s\n ->%s %s"
                % (step, "*" if job.get("multi-output") else "", outfiles)
            )
        else:
            print(step)
    sys.exit(0)

# Register every step's output filenames as substitution keys, so later
# steps' command templates can refer to earlier steps' outputs as inputs.
for step in steps:
    job = JOBS[step]
    if "redirect-output" in job:
        data[step] = job["redirect-output"]
    elif "outputs" in job and "command" in job:
        outfiles = job["outputs"]
        num_outputs = 0
        for (i, j, name) in out_indexes(job["command"]):
            # Trim the {curly brackets} off of the output keys.
            data[name[1:-1]] = outfiles[i]
            num_outputs += 1
        assert (
            len(outfiles) == num_outputs
        ), 'step "%s": mismatched number of output files (%d) and params (%d)' % (
            step,
            num_outputs,
            len(outfiles),
        )  # NOQA: E501

if args.step:
    if args.first or args.last:
        raise Exception(
            "--first and --last cannot be used when a step argument is given"
        )
    steps = [args.step]
else:
    # Narrow the step list to the requested [--first, --last] range.
    if args.first:
        steps = steps[steps.index(args.first) :]
    if args.last:
        steps = steps[: steps.index(args.last) + 1]

for step in steps:
    run_job(step, data)