1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
|
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import sys
import hashlib
import functools
from mozbuild.preprocessor import Preprocessor
from mozbuild.util import DefinesAction
from mozpack.packager.unpack import UnpackFinder
from mozpack.files import DeflatedFile
from collections import OrderedDict
from io import StringIO
import argparse
import buildconfig
"""
Find files duplicated in a given packaged directory, independently of its
package format.
"""
def normalize_osx_path(p):
    """
    Drop the three leading components of an OSX app bundle path.

    A path is only rewritten when its first component ends in ".app" and at
    least one component follows the first three; any other path is returned
    unchanged.

    >>> normalize_osx_path('Nightly.app/foo/bar/baz')
    'baz'
    """
    # Splitting at most three times leaves everything after the third "/"
    # in a single trailing piece.
    pieces = p.split("/", 3)
    if len(pieces) < 4 or not pieces[0].endswith(".app"):
        return p
    return pieces[3]
def is_l10n_file(path):
    """
    Tell whether a path is treated as a localization file.

    A path qualifies when it begins with "localization/" or contains a
    "/locale/" or "/localization/" component anywhere inside it.
    """
    if path.startswith("localization/"):
        return True
    return any(marker in path for marker in ("/locale/", "/localization/"))
def normalize_path(p):
    """
    Return the normalized form of a package path.

    At present the only normalization applied is the OSX app bundle one
    performed by normalize_osx_path().
    """
    normalized = normalize_osx_path(p)
    return normalized
def find_dupes(source, allowed_dupes, bail=True):
    """
    Print a report of files with identical content found under ``source``.

    Every file yielded by ``UnpackFinder(source)`` is hashed with SHA-1 and
    grouped by digest. Each group holding more than one path is printed,
    followed by a summary of the bytes taken by the extra copies.

    Duplicated paths that are neither localization files nor, once
    normalized, present in ``allowed_dupes`` are listed as not allowed. When
    ``bail`` is true that listing is an error and the process exits with
    status 1; otherwise it is only a warning.
    """
    chunk_size = 1024 * 10
    allowed_dupes = set(allowed_dupes)

    # digest -> (uncompressed size, compressed size, [paths]), kept in the
    # order each distinct content was first encountered.
    by_digest = OrderedDict()
    for path, entry in UnpackFinder(source):
        hasher = hashlib.sha1()
        content_size = 0
        read_chunk = entry.open().read
        while True:
            block = read_chunk(chunk_size)
            if block == b"":
                break
            hasher.update(block)
            content_size += len(block)
        digest = hasher.digest()
        if digest not in by_digest:
            # Sizes are recorded from the first file seen with this content.
            if isinstance(entry, DeflatedFile):
                compressed = entry.file.compressed_size
            else:
                compressed = content_size
            by_digest[digest] = (content_size, compressed, [])
        by_digest[digest][2].append(path)

    total = 0
    total_compressed = 0
    num_dupes = 0
    unexpected_dupes = []
    # Report groups by increasing compressed size; sorted() is stable, so
    # groups of equal size stay in first-seen order.
    groups = sorted(by_digest.values(), key=lambda group: group[1])
    for size, compressed, paths in groups:
        extra_copies = len(paths) - 1
        if extra_copies < 1:
            continue

        if compressed != size:
            compressed_note = " (%d compressed)" % compressed
        else:
            compressed_note = ""
        # The count is only spelled out when there is more than one extra copy.
        if extra_copies > 1:
            times_note = " (%d times)" % extra_copies
        else:
            times_note = ""
        print("Duplicates {} bytes{}{}:".format(size, compressed_note, times_note))
        print("".join(" %s\n" % p for p in paths))

        total += extra_copies * size
        total_compressed += extra_copies * compressed
        num_dupes += 1
        unexpected_dupes.extend(
            p
            for p in paths
            if not (is_l10n_file(p) or normalize_path(p) in allowed_dupes)
        )

    if num_dupes:
        if total_compressed != total:
            size_note = "%d compressed" % total_compressed
        else:
            size_note = "uncompressed"
        print(
            "WARNING: Found {} duplicated files taking {} bytes ({})".format(
                num_dupes, total, size_note
            )
        )

    if not unexpected_dupes:
        return

    errortype = "ERROR" if bail else "WARNING"
    print("{}: The following duplicated files are not allowed:".format(errortype))
    print("\n".join(unexpected_dupes))
    if bail:
        sys.exit(1)
def main():
    """
    Command-line entry point.

    Parses the command line, runs each exceptions file given with --file
    through the preprocessor to build the list of allowed duplicates, then
    calls find_dupes() on the requested directory.
    """
    parser = argparse.ArgumentParser(description="Find duplicate files in directory.")
    parser.add_argument(
        "--warning",
        "-w",
        action="store_true",
        help="Only warn about duplicates, do not exit with an error",
    )
    parser.add_argument(
        "--file",
        "-f",
        action="append",
        dest="dupes_files",
        default=[],
        help="Add exceptions to the duplicate list from this file",
    )
    parser.add_argument("-D", action=DefinesAction)
    parser.add_argument("-U", action="append", default=[])
    parser.add_argument("directory", help="The directory to check for duplicates in")
    args = parser.parse_args()

    allowed_dupes = []
    for dupes_file in args.dupes_files:
        # A fresh preprocessor per file: start from the build's defines,
        # layer any -D values on top, then drop every -U name.
        preprocessor = Preprocessor()
        preprocessor.context.update(buildconfig.defines["ALLDEFINES"])
        if args.D:
            preprocessor.context.update(args.D)
        for name in args.U:
            if name in preprocessor.context:
                del preprocessor.context[name]
        preprocessor.out = StringIO()
        preprocessor.do_filter("substitution")
        preprocessor.do_include(dupes_file)

        # Keep only the text before any "#" on each line, without trailing
        # whitespace.
        for line in preprocessor.out.getvalue().splitlines():
            entry = line.partition("#")[0]
            allowed_dupes.append(entry.rstrip())

    find_dupes(args.directory, allowed_dupes=allowed_dupes, bail=not args.warning)
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
|