1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
|
#!/usr/bin/env python
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Wraps ml.exe or ml64.exe and postprocesses the output to be deterministic.
Sets timestamp in .obj file to 0, hence incompatible with link.exe /incremental.
Use by prefixing the ml(64).exe invocation with this script:
python ml.py ml.exe [args...]"""
import array
import collections
import struct
import subprocess
import sys
class Struct(object):
    """A thin wrapper around the struct module that returns a namedtuple.

    Every layout is prefixed with '<', i.e. little-endian using the struct
    module's standard sizes without alignment padding.

    Attributes:
      fmt: Complete struct-module format string, including the '<' prefix.
      type: namedtuple class that unpack_from() instantiates.
    """

    def __init__(self, name, *args):
        """Pass the name of the return type, and then an interleaved list of
        format strings as used by the struct module and of field names.

        Args:
          name: Name given to the generated namedtuple class.
          *args: format1, field1, format2, field2, ...

        Raises:
          ValueError: If a format string is left without a field name, i.e. an
              odd number of interleaved arguments was passed.
        """
        # An odd count would silently produce a namedtuple with fewer fields
        # than the format has values and only fail later, at unpack time.
        if len(args) % 2 != 0:
            raise ValueError(
                '%s: expected interleaved (format, field name) pairs, got %d '
                'arguments' % (name, len(args)))
        self.fmt = '<' + ''.join(args[0::2])
        self.type = collections.namedtuple(name, args[1::2])
        # Compile the format once instead of handing the string to the struct
        # module on every pack/unpack/size call.
        self._struct = struct.Struct(self.fmt)

    def pack_into(self, buffer, offset, data):
        """Writes the fields of |data| into writable |buffer| at |offset|.

        |data| is any iterable yielding one value per format field, e.g. a
        namedtuple previously returned by unpack_from().
        """
        return self._struct.pack_into(buffer, offset, *data)

    def unpack_from(self, buffer, offset=0):
        """Reads one record from |buffer| at |offset| as a self.type instance."""
        return self.type(*self._struct.unpack_from(buffer, offset))

    def size(self):
        """Returns the number of bytes one packed record occupies."""
        return self._struct.size
def Subtract(nt, **kwargs):
    """Subtract(nt, f=2) returns a new namedtuple with 2 subtracted from nt.f

    Each keyword names a field of |nt| and gives the amount to subtract from
    it. Fields that are not named are carried over unchanged, and |nt| itself
    is not modified.
    """
    reduced = {}
    for field, amount in kwargs.items():
        reduced[field] = getattr(nt, field) - amount
    return nt._replace(**reduced)
def MakeDeterministic(objdata):
    """Strips nondeterministic content from an object file written by ml(64).exe.

    Works on a writable copy of |objdata| and
      1. sets the COFF header timestamp to 0,
      2. removes the .debug$S section (header and raw data), which contains an
         unwanted absolute path,
      3. removes the .debug$S symbol table entry together with its one aux
         entry, and
      4. adjusts the section numbers, file offsets and relocation symbol
         indices that are affected by those removals.

    Args:
      objdata: Contents of the .obj file; main() passes the bytes read from the
          file opened in binary mode.

    Returns:
      The rewritten file contents, produced by array.tostring() on Python 2
      and array.tobytes() otherwise.

    Raises:
      AssertionError: If the input violates one of the layout assumptions
          listed below. The checks are plain `assert` statements.
    """
    # Takes data produced by ml(64).exe (without any special flags) and
    # 1. Sets the timestamp to 0
    # 2. Strips the .debug$S section (which contains an unwanted absolute path)
    # This makes several assumptions about ml's output:
    # - Section data is in the same order as the corresponding section headers:
    #   section headers preceding the .debug$S section header have their data
    #   preceding the .debug$S section data; likewise for section headers
    #   following the .debug$S section.
    # - The .debug$S section contains only the absolute path to the obj file and
    #   nothing else, in particular there's only a single entry in the symbol
    #   table referring to the .debug$S section.
    # - There are no COFF line number entries.
    # - There's no IMAGE_SYM_CLASS_CLR_TOKEN symbol.
    # These seem to hold in practice; if they stop holding this script needs to
    # become smarter.
    objdata = array.array('b', objdata)  # Writable, e.g. via struct.pack_into.

    # Read coff header.
    COFFHEADER = Struct('COFFHEADER',
                        'H', 'Machine',
                        'H', 'NumberOfSections',
                        'I', 'TimeDateStamp',
                        'I', 'PointerToSymbolTable',
                        'I', 'NumberOfSymbols',
                        'H', 'SizeOfOptionalHeader',
                        'H', 'Characteristics')
    coff_header = COFFHEADER.unpack_from(objdata)
    assert coff_header.SizeOfOptionalHeader == 0  # Only set for binaries.

    # Read section headers following coff header.
    SECTIONHEADER = Struct('SECTIONHEADER',
                           '8s', 'Name',
                           'I', 'VirtualSize',
                           'I', 'VirtualAddress',
                           'I', 'SizeOfRawData',
                           'I', 'PointerToRawData',
                           'I', 'PointerToRelocations',
                           'I', 'PointerToLineNumbers',
                           'H', 'NumberOfRelocations',
                           'H', 'NumberOfLineNumbers',
                           'I', 'Characteristics')
    section_headers = []
    debug_section_index = -1  # 0-based index of .debug$S; -1 until found.
    for i in range(0, coff_header.NumberOfSections):
        section_header = SECTIONHEADER.unpack_from(
            objdata, offset=COFFHEADER.size() + i * SECTIONHEADER.size())
        assert not section_header[0].startswith(b'/')  # Support short names only.
        section_headers.append(section_header)

        if section_header.Name == b'.debug$S':
            # There must be exactly one .debug$S section.
            assert debug_section_index == -1
            debug_section_index = i
    assert debug_section_index != -1

    # File offset right after the section header table.
    data_start = COFFHEADER.size() + len(section_headers) * SECTIONHEADER.size()

    # Verify the .debug$S section looks like we expect.
    assert section_headers[debug_section_index].Name == b'.debug$S'
    assert section_headers[debug_section_index].VirtualSize == 0
    assert section_headers[debug_section_index].VirtualAddress == 0
    debug_size = section_headers[debug_section_index].SizeOfRawData
    debug_offset = section_headers[debug_section_index].PointerToRawData
    assert section_headers[debug_section_index].PointerToRelocations == 0
    assert section_headers[debug_section_index].PointerToLineNumbers == 0
    assert section_headers[debug_section_index].NumberOfRelocations == 0
    assert section_headers[debug_section_index].NumberOfLineNumbers == 0

    # Make sure sections in front of .debug$S have their data preceding it.
    for header in section_headers[:debug_section_index]:
        assert header.PointerToRawData < debug_offset
        assert header.PointerToRelocations < debug_offset
        assert header.PointerToLineNumbers < debug_offset

    # Make sure sections after .debug$S have their data following it.
    for header in section_headers[debug_section_index + 1:]:
        # Later sections may only have raw data, located after the .debug$S
        # data; they must not have relocations or line numbers.
        assert header.PointerToRawData > debug_offset
        assert header.PointerToRelocations == 0
        assert header.PointerToLineNumbers == 0

    # Make sure the first non-empty section's data starts right after the section
    # headers.
    for section_header in section_headers:
        if section_header.PointerToRawData == 0:
            assert section_header.PointerToRelocations == 0
            assert section_header.PointerToLineNumbers == 0
            continue
        assert section_header.PointerToRawData == data_start
        break

    # Make sure the symbol table (and hence, string table) appear after the last
    # section:
    assert (coff_header.PointerToSymbolTable >=
            section_headers[-1].PointerToRawData + section_headers[-1].SizeOfRawData)

    # The symbol table contains a symbol for the no-longer-present .debug$S
    # section. If we leave it there, lld-link will complain:
    #
    #    lld-link: error: .debug$S should not refer to non-existent section 5
    #
    # so we need to remove that symbol table entry as well. This shifts symbol
    # entries around and we need to update symbol table indices in:
    # - relocations
    # - line number records (never present)
    # - one aux symbol entry (IMAGE_SYM_CLASS_CLR_TOKEN; not present in ml output)
    SYM = Struct('SYM',
                 '8s', 'Name',
                 'I', 'Value',
                 'h', 'SectionNumber',  # Note: Signed!
                 'H', 'Type',
                 'B', 'StorageClass',
                 'B', 'NumberOfAuxSymbols')
    i = 0
    debug_sym = -1  # Symbol table index of the .debug$S symbol; -1 until found.
    while i < coff_header.NumberOfSymbols:
        sym_offset = coff_header.PointerToSymbolTable + i * SYM.size()
        sym = SYM.unpack_from(objdata, sym_offset)

        # 107 is IMAGE_SYM_CLASS_CLR_TOKEN, which has aux entry "CLR Token
        # Definition", which contains a symbol index. Check it's never present.
        assert sym.StorageClass != 107

        # Note: sym.SectionNumber is 1-based, debug_section_index is 0-based.
        if sym.SectionNumber - 1 == debug_section_index:
            assert debug_sym == -1, 'more than one .debug$S symbol found'
            debug_sym = i
            # Make sure the .debug$S symbol looks like we expect.
            # In particular, it should have exactly one aux symbol.
            assert sym.Name == b'.debug$S'
            assert sym.Value == 0
            assert sym.Type == 0
            assert sym.StorageClass == 3
            assert sym.NumberOfAuxSymbols == 1
        elif sym.SectionNumber > debug_section_index:
            # The symbol lives in a section after .debug$S (the .debug$S case
            # itself was handled above), so its section number moves down by
            # one once the .debug$S section header is gone.
            sym = Subtract(sym, SectionNumber=1)
            SYM.pack_into(objdata, sym_offset, sym)
        # Aux entries occupy symbol table slots too; step over them.
        i += 1 + sym.NumberOfAuxSymbols
    assert debug_sym != -1, '.debug$S symbol not found'

    # Note: Usually the .debug$S section is the last, but for files saying
    # `includelib foo.lib`, like safe_terminate_process.asm in 32-bit builds,
    # this isn't true: .drectve is after .debug$S.

    # Update symbol table indices in relocations.
    # There are a few processor types that have one or two relocation types
    # where SymbolTableIndex has a different meaning, but not for x86.
    REL = Struct('REL',
                 'I', 'VirtualAddress',
                 'I', 'SymbolTableIndex',
                 'H', 'Type')
    # Only sections before .debug$S are visited: the sections after it were
    # asserted above to have PointerToRelocations == 0.
    for header in section_headers[0:debug_section_index]:
        for j in range(0, header.NumberOfRelocations):
            rel_offset = header.PointerToRelocations + j * REL.size()
            rel = REL.unpack_from(objdata, rel_offset)
            assert rel.SymbolTableIndex != debug_sym
            if rel.SymbolTableIndex > debug_sym:
                # Two slots disappear: the .debug$S symbol and its aux entry.
                rel = Subtract(rel, SymbolTableIndex=2)
                REL.pack_into(objdata, rel_offset, rel)

    # Update symbol table indices in line numbers -- just check they don't exist.
    for header in section_headers:
        assert header.NumberOfLineNumbers == 0

    # Now that all indices are updated, remove the symbol table entry referring to
    # .debug$S and its aux entry.
    # This deletion happens first, while coff_header.PointerToSymbolTable still
    # describes the unmodified layout; the symbol table was asserted above to
    # start at or after the end of the last section's raw data.
    del objdata[coff_header.PointerToSymbolTable + debug_sym * SYM.size():
                coff_header.PointerToSymbolTable + (debug_sym + 2) * SYM.size()]

    # Now we know that it's safe to write out the input data, with just the
    # timestamp overwritten to 0, the .debug$S section header cut out (and the
    # offsets stored in all other section headers decremented by the size of
    # that one section header; for sections after .debug$S additionally by the
    # size of the .debug$S data), and the .debug$S section's data cut out. The
    # symbol table offset needs to be reduced by one section header and the
    # size of the missing section.
    # (The COFF spec only requires on-disk sections to be aligned in image files,
    # for obj files it's not required. If that wasn't the case, deleting slices
    # of data would not generally be safe.)

    # Update section offsets and remove .debug$S section data.
    for i in range(0, debug_section_index):
        header = section_headers[i]
        if header.SizeOfRawData:
            header = Subtract(header, PointerToRawData=SECTIONHEADER.size())
        if header.NumberOfRelocations:
            header = Subtract(header, PointerToRelocations=SECTIONHEADER.size())
        if header.NumberOfLineNumbers:
            header = Subtract(header, PointerToLineNumbers=SECTIONHEADER.size())
        SECTIONHEADER.pack_into(
            objdata, COFFHEADER.size() + i * SECTIONHEADER.size(), header)
    for i in range(debug_section_index + 1, len(section_headers)):
        header = section_headers[i]
        shift = SECTIONHEADER.size() + debug_size
        if header.SizeOfRawData:
            header = Subtract(header, PointerToRawData=shift)
        if header.NumberOfRelocations:
            header = Subtract(header, PointerToRelocations=shift)
        if header.NumberOfLineNumbers:
            header = Subtract(header, PointerToLineNumbers=shift)
        # The header is written back at its original slot; the slot itself
        # moves when the .debug$S header is deleted further down.
        SECTIONHEADER.pack_into(
            objdata, COFFHEADER.size() + i * SECTIONHEADER.size(), header)

    del objdata[debug_offset:debug_offset + debug_size]

    # Finally, remove .debug$S section header and update coff header.
    coff_header = coff_header._replace(TimeDateStamp=0)
    coff_header = Subtract(coff_header,
                           NumberOfSections=1,
                           PointerToSymbolTable=SECTIONHEADER.size() + debug_size,
                           NumberOfSymbols=2)
    COFFHEADER.pack_into(objdata, 0, coff_header)

    del objdata[
        COFFHEADER.size() + debug_section_index * SECTIONHEADER.size():
        COFFHEADER.size() + (debug_section_index + 1) * SECTIONHEADER.size()]

    # All done!
    if sys.version_info.major == 2:
        return objdata.tostring()
    else:
        return objdata.tobytes()
def main():
    """Runs the wrapped assembler, then rewrites its object file in place.

    Everything after the script name on the command line is executed as the
    assembler command. If that command fails, its exit code is returned and
    nothing else happens. Otherwise the object file named by the last `/Fo`
    argument is read, passed through MakeDeterministic() and written back.

    Returns:
      The assembler's non-zero exit code on failure, otherwise None.
    """
    assembler_cmd = sys.argv[1:]
    exit_code = subprocess.call(assembler_cmd)
    if exit_code:
        return exit_code

    # Collect the path suffix of every /Fo<path> argument; the last one wins.
    outputs = [arg[len('/Fo'):] for arg in assembler_cmd
               if arg.startswith('/Fo')]
    objfile = outputs[-1] if outputs else None
    assert objfile, 'failed to find ml output'

    with open(objfile, 'rb') as obj_in:
        original = obj_in.read()
    rewritten = MakeDeterministic(original)
    with open(objfile, 'wb') as obj_out:
        obj_out.write(rewritten)
# Only run the wrapper when executed as a script; main()'s return value is
# handed to sys.exit() and thereby becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())
|