summaryrefslogtreecommitdiffstats
path: root/tools/github-sync/converter.py
blob: 104229e2997475b69efc98724b302876b43b3445 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
#!/usr/bin/env python3

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import os
import re
import subprocess
import sys

import hglib
import pygit2

DEBUG = False


def eprint(*args, **kwargs):
    """Print to standard error; all arguments are forwarded to ``print``."""
    print(*args, **kwargs, file=sys.stderr)


def debugprint(*args, **kwargs):
    """Forward to ``eprint``, but only when the module-level DEBUG flag is set."""
    if not DEBUG:
        return
    eprint(*args, **kwargs)


class HgCommit:
    """Graph node for one hg changeset.

    Attributes are plain and public:
      parents           -- list of one or two parent revision hashes
      touches_sync_code -- flag, initially False; set by the caller
      children          -- list of child revisions added via add_child()
    """

    def __init__(self, parent1, parent2):
        # A changeset whose first parent is the null revision is a root
        # changeset, which this tool does not support.
        if parent1 == NULL_PARENT_REV:
            raise Exception(
                "Encountered a hg changeset with no parents! We don't handle this...."
            )
        # The null revision in the second slot just means "not a merge", so
        # only real parents are kept.
        self.parents = [
            parent for parent in (parent1, parent2) if parent != NULL_PARENT_REV
        ]
        self.touches_sync_code = False
        self.children = []

    def add_child(self, rev):
        """Record *rev* as a child of this changeset."""
        self.children.append(rev)


class GitCommit:
    """Pairs an hg revision hash with the git commit object it maps to."""

    def __init__(self, hg_rev, commit_obj):
        self.hg_rev, self.commit_obj = hg_rev, commit_obj


def load_git_repository():
    """Build the map of hg revisions already present in the downstream git repo.

    Two sources are consulted, in this order (a later entry overwrites an
    earlier one for the same hg revision):

    1. Tags named ``mozilla-<hg rev>``, which keep track of manually
       synchronized changes.
    2. Commits reachable from HEAD whose message contains a line of the form
       ``[ghsync] From https://hg.mozilla.org/mozilla-central/rev/<hg rev>``
       (``[wrupdater]`` is accepted as well).

    Returns:
        dict mapping hg revision string -> GitCommit.
    """
    commit_map = dict()

    # First, scan the tags for "mozilla-xxx" that keep track of manually
    # synchronized changes.
    tag_prefix = "refs/tags/mozilla-"
    sync_tags = [
        ref for ref in list(downstream_git_repo.references) if ref.startswith(tag_prefix)
    ]
    for desc in sync_tags:
        commit = downstream_git_repo.lookup_reference(desc).peel()
        # Everything after the prefix is the hg revision hash. Deriving the
        # offset from the prefix keeps the two from drifting apart.
        hg_rev = desc[len(tag_prefix) :]
        commit_map[hg_rev] = GitCommit(hg_rev, commit)
        # `.id` (not the older `.oid` alias) matches what the rest of this
        # file uses for pygit2 objects; note the argument is evaluated even
        # when DEBUG is off.
        debugprint("Loaded pre-existing tag hg %s -> git %s" % (hg_rev, commit.id))

    # Next, scan the commits for a specific message format
    re_commitmsg = re.compile(
        r"^\[(ghsync|wrupdater)\] From https://hg.mozilla.org/mozilla-central/rev/([0-9a-fA-F]+)$",
        re.MULTILINE,
    )
    for commit in downstream_git_repo.walk(downstream_git_repo.head.target):
        m = re_commitmsg.search(commit.message)
        if not m:
            continue
        hg_rev = m.group(2)
        commit_map[hg_rev] = GitCommit(hg_rev, commit)
        debugprint("Loaded pre-existing commit hg %s -> git %s" % (hg_rev, commit.id))
    return commit_map


def timeof(git_commit):
    """Return the commit instant of *git_commit* as a Unix timestamp.

    Used as a sort/comparison key to decide which known git commit is newest.

    ``commit_time`` is already an absolute (UTC-based) timestamp in seconds.
    The committer's ``commit_time_offset`` is a timezone offset expressed in
    minutes, so it must not be added: doing so mixes units and can reorder
    commits made in different timezones.
    """
    return git_commit.commit_obj.commit_time


def find_newest_commit(commit_map):
    """Return the hg revision whose git commit has the largest ``timeof`` value.

    Returns None for an empty map. On ties the entry that comes first in the
    map's iteration order wins.
    """
    if not commit_map:
        return None
    return max(commit_map, key=lambda hg_rev: timeof(commit_map[hg_rev]))


def get_single_rev(revset):
    """Run ``hg log`` on *revset* and return the ``{node}`` output as a str.

    The template has no separator, so *revset* is expected to resolve to a
    single changeset. Raises subprocess.CalledProcessError if hg fails.
    """
    command = ["hg", "log", "-r", revset, "--template", "{node}"]
    return subprocess.check_output(command).decode("ascii")


def get_multiple_revs(revset, template):
    """Yield one ASCII-decoded output line per changeset in *revset*.

    *template* is an hg log template; a backslash-n escape is appended to it
    so that each changeset ends up on its own line (presumably expanded by hg
    itself). This is a generator: hg is not invoked until iteration starts.
    """
    raw = subprocess.check_output(
        ["hg", "log", "-r", revset, "--template", template + "\\n"]
    )
    yield from (entry.decode("ascii") for entry in raw.splitlines())


def get_base_hg_rev(commit_map):
    """Pick the newest known hg revision from *commit_map*, log it and return it.

    The result is None when *commit_map* is empty.
    """
    newest = find_newest_commit(commit_map)
    eprint("Using %s as base hg revision" % newest)
    return newest


def load_hg_commits(commits, query):
    """Add an HgCommit to *commits* for every changeset matched by *query*.

    *commits* is updated in place (existing entries for the same node are
    replaced) and also returned.
    """
    template = "{node} {p1node} {p2node}"
    for line in get_multiple_revs(query, template):
        fields = line.split()
        node, first_parent, second_parent = fields[0], fields[1], fields[2]
        commits[node] = HgCommit(first_parent, second_parent)
    return commits


def get_real_base_hg_rev(hg_data, commit_map):
    """Find the hg revision from which the hg -> git conversion must start.

    Args:
        hg_data: dict of hg revision -> HgCommit for the changesets to port.
        commit_map: dict of hg revision -> GitCommit for revisions that
            already have a git equivalent.

    Returns:
        An hg revision hash. This is the hg revision of the newest known git
        commit that is a strict ancestor of the common ancestor of all "tail"
        revisions, or that common ancestor itself when no such git commit
        exists.
    """
    # Some of the HG commits we want to port to github may have landed on codelines
    # that branched off central prior to base_hg_rev. So when we create the git
    # equivalents, they will have parents that are not the HEAD of the git repo,
    # but instead will be descendants of older commits in the git repo. In order
    # to do this correctly, we need to find the hg-equivalents of all of those
    # possible git parents. So first we identify all the "tail" hg revisions in
    # our hg_data set (think "tail" as in opposite of "head" which is the tipmost
    # commit). The "tail" hg revisions are the ones for which we don't have their
    # ancestors in hg_data.
    tails = []
    for rev, cset in hg_data.items():
        # any() records each tail once, even when both parents are missing.
        if any(parent not in hg_data for parent in cset.parents):
            tails.append(rev)
    eprint("Found hg tail revisions %s" % tails)
    # Then we find their common ancestor, which will be some ancestor of
    # base_hg_rev from which those codelines branched off.
    if not tails:
        common_ancestor = get_single_rev(".")
    else:
        common_ancestor = get_single_rev("ancestor(" + ",".join(tails) + ")")
    eprint("Found common ancestor of tail revisions: %s" % common_ancestor)

    # And then we find the newest git commit whose hg-equivalent is an ancestor of
    # that common ancestor, to make sure we are starting from a known hg/git
    # commit pair.
    for git_commit in sorted(commit_map.values(), key=timeof, reverse=True):
        new_base = get_single_rev(
            "ancestor(" + common_ancestor + "," + git_commit.hg_rev + ")"
        )
        if new_base == common_ancestor:
            # The git commit's hg rev is the common ancestor or one of its
            # descendants, i.e. not old enough.
            eprint(
                "Pre-existing git commit %s from hg rev %s is descendant of common ancestor; %s"
                % (
                    git_commit.commit_obj.id,
                    git_commit.hg_rev,
                    "walking back further...",
                )
            )
            continue
        if new_base != git_commit.hg_rev:
            # Neither revision is an ancestor of the other.
            eprint(
                "Pre-existing git commit %s from hg rev %s is on sibling branch"
                " of common ancestor; %s"
                % (
                    git_commit.commit_obj.id,
                    git_commit.hg_rev,
                    "walking back further...",
                )
            )
            continue
        eprint(
            "Pre-existing git commit %s from hg rev %s is sufficiently old; stopping walk"
            % (git_commit.commit_obj.id, git_commit.hg_rev)
        )
        common_ancestor = new_base
        break

    return common_ancestor


# Now we prune out all the uninteresting changesets from hg_commits. The
# uninteresting ones are ones that don't touch the target code, are not merges,
# and are not referenced by mozilla tags in the git repo.
# We do this by rewriting the parents to the "interesting" ancestor.
def prune_boring(rev):
    """Rewrite parent links starting at *rev* so boring ancestors are skipped.

    A parent is boring when it is in hg_commits, does not touch the synced
    code, is not a merge, and has no entry in hg_to_git_commit_map. Each such
    parent is replaced in place by its own (single) parent. Linear history is
    walked iteratively; merges recurse into each parent.
    """
    while rev in hg_commits:
        cset = hg_commits[rev]
        pruned_any = False
        for index, parent_rev in enumerate(cset.parents):
            parent = hg_commits.get(parent_rev)
            keep_parent = (
                parent is None
                or parent.touches_sync_code
                or len(parent.parents) > 1
                or parent_rev in hg_to_git_commit_map
            )
            if keep_parent:
                continue

            # `parent_rev` is a boring revision: connect `rev` straight to
            # its grandparent instead.
            cset.parents[index] = parent.parents[0]
            pruned_any = True

        if pruned_any:
            # The replacement parents may be boring too, so look at `rev`
            # again before moving on.
            continue

        # Identical parents carry no extra information; keep one copy of
        # each, preserving order.
        cset.parents = list(dict.fromkeys(cset.parents))

        # Every remaining parent is interesting. Follow a single parent
        # iteratively, or recurse into each parent of a merge.
        if len(cset.parents) == 1:
            rev = cset.parents[0]
            continue

        for parent_rev in cset.parents:
            prune_boring(parent_rev)
        return


class FakeCommit:
    """Placeholder commit object exposing only an ``oid`` attribute."""

    def __init__(self, oid):
        # The single attribute set on the placeholder.
        self.oid = oid


def fake_commit(hg_rev, parent1, parent2):
    """Return a FakeCommit instead of writing anything to the git repository.

    The fake id is derived from ``hash(parent1)`` only; *hg_rev* and *parent2*
    are accepted so the signature matches ``real_commit`` but are not used.
    Terminates the process with exit status 1 when *parent1* is None.
    """
    if parent1 is None:
        eprint("ERROR: Trying to build on None")
        # sys.exit rather than the bare exit() helper, which is only injected
        # by the site module and is not guaranteed to exist.
        sys.exit(1)
    oid = "githash_%s" % hash(parent1)
    eprint("Fake-built %s" % oid)
    return FakeCommit(oid)


def build_tree(builder, treedata):
    """Populate *builder* from the nested dict *treedata*.

    A dict value is a subdirectory and is written recursively through its own
    TreeBuilder; any other value is a ``(filemode, contents)`` pair that is
    stored as a blob.
    """
    for entry_name, entry in treedata.items():
        if not isinstance(entry, dict):
            mode, data = entry
            builder.insert(entry_name, downstream_git_repo.create_blob(data), mode)
            continue
        child_builder = downstream_git_repo.TreeBuilder()
        build_tree(child_builder, entry)
        builder.insert(entry_name, child_builder.write(), pygit2.GIT_FILEMODE_TREE)


def author_to_signature(author):
    """Convert an hg author string into a ``pygit2.Signature``.

    An author of the form ``Name <email>`` is split into its name and email
    parts. Any other shape (no ``<``, more than one ``<``, or no closing
    ``>`` at the end) is used unchanged as the name, with an empty email.
    """
    pieces = author.strip().split("<")
    # endswith() instead of pieces[1][-1]: indexing raises IndexError when
    # nothing follows the "<" (e.g. "Name <").
    if len(pieces) != 2 or not pieces[1].endswith(">"):
        # We could probably handle this better
        return pygit2.Signature(author, "")
    name = pieces[0].strip()
    email = pieces[1][:-1].strip()
    return pygit2.Signature(name, email)


def real_commit(hg_rev, parent1, parent2):
    """Create the git commit that mirrors hg revision *hg_rev*.

    The git tree is rebuilt from every file in the hg manifest that lives
    under ``relative_path``, with that directory prefix stripped so the synced
    directory becomes the root of the git tree.

    Args:
        hg_rev: hg changeset to convert.
        parent1: id of the first git parent commit.
        parent2: id of the second git parent commit, or None for a non-merge.

    Returns:
        The git commit object for the revision. When the rebuilt tree is
        identical to the tree of one of the parents, no commit is created and
        that parent's commit object is returned instead.
    """
    # Match on whole path components, so that e.g. "gfx/wr" does not also
    # pick up "gfx/wr_old/...", and strip exactly as many leading components
    # as relative_path has rather than assuming a two-level directory.
    prefix_parts = [
        part for part in relative_path.encode("utf-8").split(b"/") if part
    ]
    depth = len(prefix_parts)

    filetree = dict()
    manifest = mozilla_hg_repo.manifest(rev=hg_rev)
    for nodeid, permission, executable, symlink, filename in manifest:
        path_parts = filename.split(b"/")
        if path_parts[:depth] != prefix_parts:
            continue
        if symlink:
            filemode = pygit2.GIT_FILEMODE_LINK
        elif executable:
            filemode = pygit2.GIT_FILEMODE_BLOB_EXECUTABLE
        else:
            filemode = pygit2.GIT_FILEMODE_BLOB
        filecontent = mozilla_hg_repo.cat([filename], rev=hg_rev)
        # Walk (creating as needed) the nested dicts for the directories
        # between the synced root and the file itself.
        subtree = filetree
        for component in path_parts[depth:-1]:
            subtree = subtree.setdefault(component.decode("latin-1"), dict())
        subtree[path_parts[-1].decode("latin-1")] = (filemode, filecontent)

    builder = downstream_git_repo.TreeBuilder()
    build_tree(builder, filetree)
    tree_oid = builder.write()

    # If the synced tree is unchanged relative to a parent, reuse that parent
    # instead of creating an empty commit.
    parent1_obj = downstream_git_repo.get(parent1)
    if parent1_obj.tree_id == tree_oid:
        eprint("Early-exit; tree matched that of parent git commit %s" % parent1)
        return parent1_obj

    if parent2 is not None:
        parent2_obj = downstream_git_repo.get(parent2)
        if parent2_obj.tree_id == tree_oid:
            eprint("Early-exit; tree matched that of parent git commit %s" % parent2)
            return parent2_obj

    hg_rev_obj = mozilla_hg_repo.log(revrange=hg_rev, limit=1)[0]
    commit_author = hg_rev_obj[4].decode("latin-1")
    commit_message = hg_rev_obj[5].decode("latin-1")
    # This trailer is what load_git_repository() searches for on later runs
    # to recover the hg -> git mapping.
    commit_message += (
        "\n\n[ghsync] From https://hg.mozilla.org/mozilla-central/rev/%s" % hg_rev
        + "\n"
    )

    parents = [parent1]
    if parent2 is not None:
        parents.append(parent2)
    # Reference name None: the commit is created without updating any ref.
    # The hg author is used for both the author and committer signatures.
    commit_oid = downstream_git_repo.create_commit(
        None,
        author_to_signature(commit_author),
        author_to_signature(commit_author),
        commit_message,
        tree_oid,
        parents,
    )
    eprint("Built git commit %s" % commit_oid)
    return downstream_git_repo.get(commit_oid)


def try_commit(hg_rev, parent1, parent2=None):
    """Build the git commit for *hg_rev* on top of the given parent(s).

    Dispatches to ``real_commit``; the ``fake_commit`` alternative is only
    reachable by editing the hard-coded switch below.
    """
    use_fake = False
    build = fake_commit if use_fake else real_commit
    return build(hg_rev, parent1, parent2)


def build_git_commits(rev):
    """Recursively build git commits for hg revision *rev* and its ancestors.

    Parents are built first (depth-first), then *rev* itself. Every commit
    that gets built is recorded in hg_to_git_commit_map, so a revision is
    converted at most once.

    Returns:
        The oid of the git commit that represents *rev*. This is the oid of
        a parent's commit when *rev* itself is skipped, or None when *rev* is
        in neither hg_to_git_commit_map nor hg_commits.
    """
    debugprint("build_git_commit(%s)..." % rev)
    # Already converted (or pre-existing in git): nothing to do.
    if rev in hg_to_git_commit_map:
        debugprint("  maps to %s" % hg_to_git_commit_map[rev].commit_obj.oid)
        return hg_to_git_commit_map[rev].commit_obj.oid

    # Outside the loaded hg graph: there is no git equivalent to offer.
    if rev not in hg_commits:
        debugprint("  not in hg_commits")
        return None

    # Non-merge changeset.
    if len(hg_commits[rev].parents) == 1:
        git_parent = build_git_commits(hg_commits[rev].parents[0])
        if not hg_commits[rev].touches_sync_code:
            # Nothing to commit for this revision; its git equivalent is
            # whatever its parent maps to.
            eprint(
                "WARNING: Found rev %s that is non-merge and not related to the target"
                % rev
            )
            return git_parent
        eprint("Building git equivalent for %s on top of %s" % (rev, git_parent))
        commit_obj = try_commit(rev, git_parent)
        hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj)
        debugprint("  built %s as %s" % (rev, commit_obj.oid))
        return commit_obj.oid

    # Merge changeset: convert both sides first.
    git_parent_1 = build_git_commits(hg_commits[rev].parents[0])
    git_parent_2 = build_git_commits(hg_commits[rev].parents[1])
    # If a side has no git equivalent, or both sides map to the same git
    # commit, there is nothing to merge on the git side; treat the revision
    # like a single-parent one.
    if git_parent_1 is None or git_parent_2 is None or git_parent_1 == git_parent_2:
        # Uses git_parent_2 unless it is None (in which case git_parent_1,
        # which may itself be None).
        git_parent = git_parent_1 if git_parent_2 is None else git_parent_2
        if not hg_commits[rev].touches_sync_code:
            debugprint(
                "  %s is merge with no parents or doesn't touch WR, returning %s"
                % (rev, git_parent)
            )
            return git_parent

        eprint(
            "WARNING: Found merge rev %s whose parents have identical target code"
            ", but modifies the target" % rev
        )
        eprint("Building git equivalent for %s on top of %s" % (rev, git_parent))
        commit_obj = try_commit(rev, git_parent)
        hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj)
        debugprint("  built %s as %s" % (rev, commit_obj.oid))
        return commit_obj.oid

    # An actual merge
    eprint(
        "Building git equivalent for %s on top of %s, %s"
        % (rev, git_parent_1, git_parent_2)
    )
    commit_obj = try_commit(rev, git_parent_1, git_parent_2)
    hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj)
    debugprint("  built %s as %s" % (rev, commit_obj.oid))
    return commit_obj.oid


def pretty_print(rev, cset):
    """Return a one-line description of *rev* for the log output.

    The line lists the revision and its parents, plus the mapped git commit
    (if one is known) and a "(tip)" marker when *rev* is hg_tip.
    """
    pieces = ["  %s" % rev, " parents: %s" % cset.parents]
    if rev in hg_to_git_commit_map:
        pieces.append(" git: %s" % hg_to_git_commit_map[rev].commit_obj.oid)
    if rev == hg_tip:
        pieces.append(" (tip)")
    return "".join(pieces)


# --- Script body -------------------------------------------------------------
# Everything below runs at module level and defines the globals
# (relative_path, NULL_PARENT_REV, downstream_git_repo, mozilla_hg_repo,
# hg_to_git_commit_map, hg_commits, hg_tip) that the functions above read.
if len(sys.argv) < 3:
    eprint("Usage: %s <local-checkout-path> <repo-relative-path>" % sys.argv[0])
    eprint("Current dir must be the mozilla hg repo")
    # sys.exit rather than the site-provided exit() helper, which is not
    # guaranteed to be available.
    sys.exit(1)

local_checkout_path = sys.argv[1]
relative_path = sys.argv[2]
mozilla_hg_path = os.getcwd()
NULL_PARENT_REV = "0000000000000000000000000000000000000000"

downstream_git_repo = pygit2.Repository(pygit2.discover_repository(local_checkout_path))
mozilla_hg_repo = hglib.open(mozilla_hg_path)
hg_to_git_commit_map = load_git_repository()
base_hg_rev = get_base_hg_rev(hg_to_git_commit_map)
if base_hg_rev is None:
    eprint("Found no sync commits or 'mozilla-xxx' tags")
    sys.exit(1)

# Start with the changesets that are ancestors of the working directory
# parent but not of the newest already-synced revision.
hg_commits = load_hg_commits(dict(), "only(.," + base_hg_rev + ")")
eprint("Initial set has %s changesets" % len(hg_commits))
base_hg_rev = get_real_base_hg_rev(hg_commits, hg_to_git_commit_map)
eprint("Using hg rev %s as common ancestor of all interesting changesets" % base_hg_rev)

# Refresh hg_commits with our wider dataset
hg_tip = get_single_rev(".")
wider_range = "%s::%s" % (base_hg_rev, hg_tip)
hg_commits = load_hg_commits(hg_commits, wider_range)
eprint("Updated set has %s changesets" % len(hg_commits))

if DEBUG:
    eprint("Graph of descendants of %s" % base_hg_rev)
    output = subprocess.check_output(
        [
            "hg",
            "log",
            "--graph",
            "-r",
            "descendants(" + base_hg_rev + ")",
            "--template",
            "{node} {desc|firstline}\\n",
        ]
    )
    for line in output.splitlines():
        eprint(line.decode("utf-8", "ignore"))

# Also flag any changes that touch the project
query = "(" + wider_range + ') & file("glob:' + relative_path + '/**")'
for cset in get_multiple_revs(query, "{node}"):
    debugprint("Changeset %s modifies %s" % (cset, relative_path))
    hg_commits[cset].touches_sync_code = True
eprint(
    "Identified %s changesets that touch the target code"
    % sum(1 for commit in hg_commits.values() if commit.touches_sync_code)
)

prune_boring(hg_tip)

# hg_tip itself might be boring
if not hg_commits[hg_tip].touches_sync_code and len(hg_commits[hg_tip].parents) == 1:
    new_tip = hg_commits[hg_tip].parents[0]
    eprint("Pruned tip %s as boring, using %s now" % (hg_tip, new_tip))
    hg_tip = new_tip

eprint("--- Interesting changesets ---")
for rev, cset in hg_commits.items():
    if cset.touches_sync_code or len(cset.parents) > 1 or rev in hg_to_git_commit_map:
        eprint(pretty_print(rev, cset))
if DEBUG:
    eprint("--- Other changesets (not really interesting) ---")
    for rev, cset in hg_commits.items():
        if not (
            cset.touches_sync_code
            or len(cset.parents) > 1
            or rev in hg_to_git_commit_map
        ):
            eprint(pretty_print(rev, cset))

# Convert everything reachable from the (possibly pruned) tip and point the
# github-sync branch at the result.
git_tip = build_git_commits(hg_tip)
if git_tip is None:
    eprint("No new changesets generated, exiting.")
else:
    downstream_git_repo.create_reference("refs/heads/github-sync", git_tip, force=True)
    eprint("Updated github-sync branch to %s, done!" % git_tip)