diff options
Diffstat (limited to 'tools/github-sync/converter.py')
-rwxr-xr-x | tools/github-sync/converter.py | 481 |
1 files changed, 0 insertions, 481 deletions
diff --git a/tools/github-sync/converter.py b/tools/github-sync/converter.py deleted file mode 100755 index 104229e299..0000000000 --- a/tools/github-sync/converter.py +++ /dev/null @@ -1,481 +0,0 @@ -#!/usr/bin/env python3 - -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. - -import os -import re -import subprocess -import sys - -import hglib -import pygit2 - -DEBUG = False - - -def eprint(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - - -def debugprint(*args, **kwargs): - if DEBUG: - eprint(*args, **kwargs) - - -class HgCommit: - def __init__(self, parent1, parent2): - self.parents = [] - if parent1 == NULL_PARENT_REV: - raise Exception( - "Encountered a hg changeset with no parents! We don't handle this...." - ) - self.parents.append(parent1) - if parent2 != NULL_PARENT_REV: - self.parents.append(parent2) - self.touches_sync_code = False - self.children = [] - - def add_child(self, rev): - self.children.append(rev) - - -class GitCommit: - def __init__(self, hg_rev, commit_obj): - self.hg_rev = hg_rev - self.commit_obj = commit_obj - - -def load_git_repository(): - commit_map = dict() - # First, scan the tags for "mozilla-xxx" that keep track of manually synchronized changes - sync_tags = filter( - lambda ref: ref.startswith("refs/tags/mozilla-"), - list(downstream_git_repo.references), - ) - for desc in sync_tags: - commit = downstream_git_repo.lookup_reference(desc).peel() - # cut out the revision hash from the output - hg_rev = desc[18:] - commit_map[hg_rev] = GitCommit(hg_rev, commit) - debugprint("Loaded pre-existing tag hg %s -> git %s" % (hg_rev, commit.oid)) - - # Next, scan the commits for a specific message format - re_commitmsg = re.compile( - r"^\[(ghsync|wrupdater)\] From https://hg.mozilla.org/mozilla-central/rev/([0-9a-fA-F]+)$", - re.MULTILINE, - ) - for commit in downstream_git_repo.walk(downstream_git_repo.head.target): - m = re_commitmsg.search(commit.message) - if not m: - continue - hg_rev = m.group(2) - commit_map[hg_rev] = GitCommit(hg_rev, commit) - debugprint("Loaded pre-existing commit hg %s -> git %s" % (hg_rev, commit.oid)) - return commit_map - - -def timeof(git_commit): - return git_commit.commit_obj.commit_time + git_commit.commit_obj.commit_time_offset - - -def find_newest_commit(commit_map): - newest_hg_rev = None - newest_commit_time = None - - for hg_rev, git_commit in commit_map.items(): - if newest_hg_rev is None or timeof(git_commit) > newest_commit_time: - newest_hg_rev = hg_rev - newest_commit_time = timeof(git_commit) - - return newest_hg_rev - - -def get_single_rev(revset): - output = subprocess.check_output( - ["hg", "log", "-r", revset, "--template", "{node}"] - ) - output = str(output, "ascii") - return output - - -def get_multiple_revs(revset, template): - output = subprocess.check_output( - ["hg", "log", "-r", revset, "--template", template + "\\n"] - ) - for line in output.splitlines(): - yield str(line, "ascii") - - -def get_base_hg_rev(commit_map): - base_hg_rev = find_newest_commit(commit_map) - eprint("Using %s as base hg revision" % base_hg_rev) - return base_hg_rev - - -def load_hg_commits(commits, query): - for cset in get_multiple_revs(query, "{node} {p1node} {p2node}"): - tokens = cset.split() - commits[tokens[0]] = HgCommit(tokens[1], tokens[2]) - return commits - - -def get_real_base_hg_rev(hg_data, commit_map): - # Some of the HG commits we want to port to github may have landed on codelines - # that branched off central prior to base_hg_rev. So when we create the git - # equivalents, they will have parents that are not the HEAD of the git repo, - # but instead will be descendants of older commits in the git repo. In order - # to do this correctly, we need to find the hg-equivalents of all of those - # possible git parents. So first we identify all the "tail" hg revisions in - # our hg_data set (think "tail" as in opposite of "head" which is the tipmost - # commit). The "tail" hg revisions are the ones for which we don't have their - # ancestors in hg_data. - tails = [] - for rev, cset in hg_data.items(): - for parent in cset.parents: - if parent not in hg_data: - tails.append(rev) - eprint("Found hg tail revisions %s" % tails) - # Then we find their common ancestor, which will be some ancestor of base_hg_rev - # from which those codelines. - if len(tails) == 0: - common_ancestor = get_single_rev(".") - else: - common_ancestor = get_single_rev("ancestor(" + ",".join(tails) + ")") - eprint("Found common ancestor of tail revisions: %s" % common_ancestor) - - # And then we find the newest git commit whose hg-equivalent is an ancestor of - # that common ancestor, to make sure we are starting from a known hg/git - # commit pair. - for git_commit in sorted(commit_map.values(), key=timeof, reverse=True): - new_base = get_single_rev( - "ancestor(" + common_ancestor + "," + git_commit.hg_rev + ")" - ) - if new_base == common_ancestor: - eprint( - "Pre-existing git commit %s from hg rev %s is descendant of common ancestor; %s" - % ( - git_commit.commit_obj.id, - git_commit.hg_rev, - "walking back further...", - ) - ) - continue - if new_base != git_commit.hg_rev: - eprint( - "Pre-existing git commit %s from hg rev %s is on sibling branch" - " of common ancestor; %s" - % ( - git_commit.commit_obj.id, - git_commit.hg_rev, - "walking back further...", - ) - ) - continue - eprint( - "Pre-existing git commit %s from hg rev %s is sufficiently old; stopping walk" - % (git_commit.commit_obj.id, git_commit.hg_rev) - ) - common_ancestor = new_base - break - - return common_ancestor - - -# Now we prune out all the uninteresting changesets from hg_commits. The -# uninteresting ones are ones that don't touch the target code, are not merges, -# and are not referenced by mozilla tags in the git repo. -# We do this by rewriting the parents to the "interesting" ancestor. -def prune_boring(rev): - while rev in hg_commits: - parent_pruned = False - for i in range(len(hg_commits[rev].parents)): - parent_rev = hg_commits[rev].parents[i] - if parent_rev not in hg_commits: - continue - if hg_commits[parent_rev].touches_sync_code: - continue - if len(hg_commits[parent_rev].parents) > 1: - continue - if parent_rev in hg_to_git_commit_map: - continue - - # If we get here, then `parent_rev` is a boring revision and we can - # prune it. Connect `rev` to its grandparent, and prune the parent - grandparent_rev = hg_commits[parent_rev].parents[0] - hg_commits[rev].parents[i] = grandparent_rev - # eprint("Pruned %s as boring parent of %s, using %s now" % - # (parent_rev, rev, grandparent_rev)) - parent_pruned = True - - if parent_pruned: - # If we pruned a parent, process `rev` again as we might want to - # prune more parents - continue - - # Collapse identical parents, because if the parents are identical - # we don't need to keep multiple copies of them. - hg_commits[rev].parents = list(dict.fromkeys(hg_commits[rev].parents)) - - # If we get here, all of `rev`s parents are interesting, so we can't - # prune them. Move up to the parent rev and start processing that, or - # if we have multiple parents then recurse on those nodes. - if len(hg_commits[rev].parents) == 1: - rev = hg_commits[rev].parents[0] - continue - - for parent_rev in hg_commits[rev].parents: - prune_boring(parent_rev) - return - - -class FakeCommit: - def __init__(self, oid): - self.oid = oid - - -def fake_commit(hg_rev, parent1, parent2): - if parent1 is None: - eprint("ERROR: Trying to build on None") - exit(1) - oid = "githash_%s" % hash(parent1) - eprint("Fake-built %s" % oid) - return FakeCommit(oid) - - -def build_tree(builder, treedata): - for name, value in treedata.items(): - if isinstance(value, dict): - subbuilder = downstream_git_repo.TreeBuilder() - build_tree(subbuilder, value) - builder.insert(name, subbuilder.write(), pygit2.GIT_FILEMODE_TREE) - else: - (filemode, contents) = value - blob_oid = downstream_git_repo.create_blob(contents) - builder.insert(name, blob_oid, filemode) - - -def author_to_signature(author): - pieces = author.strip().split("<") - if len(pieces) != 2 or pieces[1][-1] != ">": - # We could probably handle this better - return pygit2.Signature(author, "") - name = pieces[0].strip() - email = pieces[1][:-1].strip() - return pygit2.Signature(name, email) - - -def real_commit(hg_rev, parent1, parent2): - filetree = dict() - manifest = mozilla_hg_repo.manifest(rev=hg_rev) - for nodeid, permission, executable, symlink, filename in manifest: - if not filename.startswith(relative_path.encode("utf-8")): - continue - if symlink: - filemode = pygit2.GIT_FILEMODE_LINK - elif executable: - filemode = pygit2.GIT_FILEMODE_BLOB_EXECUTABLE - else: - filemode = pygit2.GIT_FILEMODE_BLOB - filecontent = mozilla_hg_repo.cat([filename], rev=hg_rev) - subtree = filetree - for component in filename.split(b"/")[2:-1]: - subtree = subtree.setdefault(component.decode("latin-1"), dict()) - filename = filename.split(b"/")[-1] - subtree[filename.decode("latin-1")] = (filemode, filecontent) - - builder = downstream_git_repo.TreeBuilder() - build_tree(builder, filetree) - tree_oid = builder.write() - - parent1_obj = downstream_git_repo.get(parent1) - if parent1_obj.tree_id == tree_oid: - eprint("Early-exit; tree matched that of parent git commit %s" % parent1) - return parent1_obj - - if parent2 is not None: - parent2_obj = downstream_git_repo.get(parent2) - if parent2_obj.tree_id == tree_oid: - eprint("Early-exit; tree matched that of parent git commit %s" % parent2) - return parent2_obj - - hg_rev_obj = mozilla_hg_repo.log(revrange=hg_rev, limit=1)[0] - commit_author = hg_rev_obj[4].decode("latin-1") - commit_message = hg_rev_obj[5].decode("latin-1") - commit_message += ( - "\n\n[ghsync] From https://hg.mozilla.org/mozilla-central/rev/%s" % hg_rev - + "\n" - ) - - parents = [parent1] - if parent2 is not None: - parents.append(parent2) - commit_oid = downstream_git_repo.create_commit( - None, - author_to_signature(commit_author), - author_to_signature(commit_author), - commit_message, - tree_oid, - parents, - ) - eprint("Built git commit %s" % commit_oid) - return downstream_git_repo.get(commit_oid) - - -def try_commit(hg_rev, parent1, parent2=None): - if False: - return fake_commit(hg_rev, parent1, parent2) - else: - return real_commit(hg_rev, parent1, parent2) - - -def build_git_commits(rev): - debugprint("build_git_commit(%s)..." % rev) - if rev in hg_to_git_commit_map: - debugprint(" maps to %s" % hg_to_git_commit_map[rev].commit_obj.oid) - return hg_to_git_commit_map[rev].commit_obj.oid - - if rev not in hg_commits: - debugprint(" not in hg_commits") - return None - - if len(hg_commits[rev].parents) == 1: - git_parent = build_git_commits(hg_commits[rev].parents[0]) - if not hg_commits[rev].touches_sync_code: - eprint( - "WARNING: Found rev %s that is non-merge and not related to the target" - % rev - ) - return git_parent - eprint("Building git equivalent for %s on top of %s" % (rev, git_parent)) - commit_obj = try_commit(rev, git_parent) - hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj) - debugprint(" built %s as %s" % (rev, commit_obj.oid)) - return commit_obj.oid - - git_parent_1 = build_git_commits(hg_commits[rev].parents[0]) - git_parent_2 = build_git_commits(hg_commits[rev].parents[1]) - if git_parent_1 is None or git_parent_2 is None or git_parent_1 == git_parent_2: - git_parent = git_parent_1 if git_parent_2 is None else git_parent_2 - if not hg_commits[rev].touches_sync_code: - debugprint( - " %s is merge with no parents or doesn't touch WR, returning %s" - % (rev, git_parent) - ) - return git_parent - - eprint( - "WARNING: Found merge rev %s whose parents have identical target code" - ", but modifies the target" % rev - ) - eprint("Building git equivalent for %s on top of %s" % (rev, git_parent)) - commit_obj = try_commit(rev, git_parent) - hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj) - debugprint(" built %s as %s" % (rev, commit_obj.oid)) - return commit_obj.oid - - # An actual merge - eprint( - "Building git equivalent for %s on top of %s, %s" - % (rev, git_parent_1, git_parent_2) - ) - commit_obj = try_commit(rev, git_parent_1, git_parent_2) - hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj) - debugprint(" built %s as %s" % (rev, commit_obj.oid)) - return commit_obj.oid - - -def pretty_print(rev, cset): - desc = " %s" % rev - desc += " parents: %s" % cset.parents - if rev in hg_to_git_commit_map: - desc += " git: %s" % hg_to_git_commit_map[rev].commit_obj.oid - if rev == hg_tip: - desc += " (tip)" - return desc - - -if len(sys.argv) < 3: - eprint("Usage: %s <local-checkout-path> <repo-relative-path>" % sys.argv[0]) - eprint("Current dir must be the mozilla hg repo") - exit(1) - -local_checkout_path = sys.argv[1] -relative_path = sys.argv[2] -mozilla_hg_path = os.getcwd() -NULL_PARENT_REV = "0000000000000000000000000000000000000000" - -downstream_git_repo = pygit2.Repository(pygit2.discover_repository(local_checkout_path)) -mozilla_hg_repo = hglib.open(mozilla_hg_path) -hg_to_git_commit_map = load_git_repository() -base_hg_rev = get_base_hg_rev(hg_to_git_commit_map) -if base_hg_rev is None: - eprint("Found no sync commits or 'mozilla-xxx' tags") - exit(1) - -hg_commits = load_hg_commits(dict(), "only(.," + base_hg_rev + ")") -eprint("Initial set has %s changesets" % len(hg_commits)) -base_hg_rev = get_real_base_hg_rev(hg_commits, hg_to_git_commit_map) -eprint("Using hg rev %s as common ancestor of all interesting changesets" % base_hg_rev) - -# Refresh hg_commits with our wider dataset -hg_tip = get_single_rev(".") -wider_range = "%s::%s" % (base_hg_rev, hg_tip) -hg_commits = load_hg_commits(hg_commits, wider_range) -eprint("Updated set has %s changesets" % len(hg_commits)) - -if DEBUG: - eprint("Graph of descendants of %s" % base_hg_rev) - output = subprocess.check_output( - [ - "hg", - "log", - "--graph", - "-r", - "descendants(" + base_hg_rev + ")", - "--template", - "{node} {desc|firstline}\\n", - ] - ) - for line in output.splitlines(): - eprint(line.decode("utf-8", "ignore")) - -# Also flag any changes that touch the project -query = "(" + wider_range + ') & file("glob:' + relative_path + '/**")' -for cset in get_multiple_revs(query, "{node}"): - debugprint("Changeset %s modifies %s" % (cset, relative_path)) - hg_commits[cset].touches_sync_code = True -eprint( - "Identified %s changesets that touch the target code" - % sum([1 if v.touches_sync_code else 0 for (k, v) in hg_commits.items()]) -) - -prune_boring(hg_tip) - -# hg_tip itself might be boring -if not hg_commits[hg_tip].touches_sync_code and len(hg_commits[hg_tip].parents) == 1: - new_tip = hg_commits[hg_tip].parents[0] - eprint("Pruned tip %s as boring, using %s now" % (hg_tip, new_tip)) - hg_tip = new_tip - -eprint("--- Interesting changesets ---") -for rev, cset in hg_commits.items(): - if cset.touches_sync_code or len(cset.parents) > 1 or rev in hg_to_git_commit_map: - eprint(pretty_print(rev, cset)) -if DEBUG: - eprint("--- Other changesets (not really interesting) ---") - for rev, cset in hg_commits.items(): - if not ( - cset.touches_sync_code - or len(cset.parents) > 1 - or rev in hg_to_git_commit_map - ): - eprint(pretty_print(rev, cset)) - -git_tip = build_git_commits(hg_tip) -if git_tip is None: - eprint("No new changesets generated, exiting.") -else: - downstream_git_repo.create_reference("refs/heads/github-sync", git_tip, force=True) - eprint("Updated github-sync branch to %s, done!" % git_tip) |