diff options
Diffstat (limited to '')
-rw-r--r-- | testing/talos/talos/xtalos/etlparser.py | 640 |
1 files changed, 640 insertions, 0 deletions
diff --git a/testing/talos/talos/xtalos/etlparser.py b/testing/talos/talos/xtalos/etlparser.py new file mode 100644 index 0000000000..10fa5c64d3 --- /dev/null +++ b/testing/talos/talos/xtalos/etlparser.py @@ -0,0 +1,640 @@ +#!/usr/bin/env python + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +import csv +import json +import os +import re +import shutil +import subprocess +import sys + +from talos.xtalos import xtalos + +EVENTNAME_INDEX = 0 +PROCESS_INDEX = 2 +THREAD_ID_INDEX = 3 +PARENT_PID_INDEX = 3 +DISKBYTES_COL = "Size" +FNAME_COL = "FileName" +IMAGEFUNC_COL = "Image!Function" +EVENTGUID_COL = "EventGuid" +ACTIVITY_ID_COL = "etw:ActivityId" +NUMBYTES_COL = "NumBytes" +BYTESSENT_COL = "BytesSent" + +CEVT_WINDOWS_RESTORED = "{917b96b1-ecad-4dab-a760-8d49027748ae}" +CEVT_XPCOM_SHUTDOWN = "{26d1e091-0ae7-4f49-a554-4214445c505c}" +NAME_SUBSTITUTIONS = [ + # Careful with your regex! + # Substitution happens after combinations like \t \s \n ... are replaced + # with their real representations. So, prepend them with extra backslash. + # Read more: https://docs.python.org/2.7/library/re.html#re.sub + (re.compile(r"{\w{8}-\w{4}-\w{4}-\w{4}-\w{12}}"), "{uuid}"), + (re.compile(r"talos\\tests\\tp5n\\.*"), r"talos\\tests\{tp5n_files}"), + (re.compile(r"nvidia corporation\\3d vision\\.*"), "{nvidia_3d_vision}"), + (re.compile(r"cltbld\.t-w732-ix-\d+\.\d+"), "{cltbld}"), + ( + re.compile(r"venv\\lib\\site-packages\\pip\\_vendor\\.*"), + r"venv\\lib\\site-packages\\{pip_vendor}", + ), +] +stages = ["startup", "normal", "shutdown"] +net_events = { + "TcpDataTransferReceive": "recv", + "UdpEndpointReceiveMessages": "recv", + "TcpDataTransferSend": "send", + "UdpEndpointSendMessages": "send", +} +gThreads = {} +gConnectionIDs = {} +gHeaders = {} +gBrowserPID = None + + +def uploadFile(filename): + mud = os.environ.get("MOZ_UPLOAD_DIR", None) + if mud: + print("uploading raw file %s via blobber" % filename) + mud_filename = os.path.join(mud, filename) + shutil.copyfile(filename, "%s.log" % mud_filename) + + +def filterOutHeader(data): + # -1 means we have not yet found the header + # 0 means we are in the header + # 1+ means that we are past the header + state = -1 + done = False + while not done: + try: + row = next(data) + except StopIteration: + done = True + break + except UnicodeDecodeError: + continue + except csv.Error: + continue + + if not len(row): + continue + + if state < 0: + # Keep looking for the header (denoted by "BeginHeader"). + if row[0] == "BeginHeader": + state = 0 + continue + + if state == 0: + # Eventually, we'll find the end (denoted by "EndHeader"). + if row[0] == "EndHeader": + state = 1 + continue + + gHeaders[row[EVENTNAME_INDEX]] = row + continue + + if state >= 1: + state = state + 1 + + # The line after "EndHeader" is also not useful, so we want to strip + # that in addition to the header. + if state > 2: + yield row + + +def getIndex(eventName, *colNames): + eventHeader = gHeaders[eventName] + + for colName in colNames: + try: + return eventHeader.index(colName) + except ValueError: + pass + + return None + + +def readFile(filename): + data = csv.reader( + open(filename, "r", encoding="cp1252"), + delimiter=",", + quotechar='"', + skipinitialspace=True, + ) + data = filterOutHeader(data) + return data + + +def fileSummary(row, stage, retVal): + event = row[EVENTNAME_INDEX] + + # TODO: do we care about the other events? + if event not in ("FileIoRead", "FileIoWrite"): + return + fname_index = getIndex(event, FNAME_COL) + + # We only care about events that have a file name. + if fname_index is None: + return + + # Some data rows are missing the filename? + if len(row) <= fname_index: + return + + thread_extra = "" + if gThreads[row[THREAD_ID_INDEX]] == "main": + thread_extra = " (main)" + key_tuple = ( + row[fname_index], + "%s%s" % (row[THREAD_ID_INDEX], thread_extra), + stages[stage], + ) + total_tuple = ( + row[fname_index], + "%s%s" % (row[THREAD_ID_INDEX], thread_extra), + "all", + ) + + if key_tuple not in retVal: + retVal[key_tuple] = { + "DiskReadBytes": 0, + "DiskReadCount": 0, + "DiskWriteBytes": 0, + "DiskWriteCount": 0, + } + retVal[total_tuple] = retVal[key_tuple] + + if event == "FileIoRead": + retVal[key_tuple]["DiskReadCount"] += 1 + retVal[total_tuple]["DiskReadCount"] += 1 + idx = getIndex(event, DISKBYTES_COL) + retVal[key_tuple]["DiskReadBytes"] += int(row[idx], 16) + retVal[total_tuple]["DiskReadBytes"] += int(row[idx], 16) + elif event == "FileIoWrite": + retVal[key_tuple]["DiskWriteCount"] += 1 + retVal[total_tuple]["DiskWriteCount"] += 1 + idx = getIndex(event, DISKBYTES_COL) + retVal[key_tuple]["DiskWriteBytes"] += int(row[idx], 16) + retVal[total_tuple]["DiskWriteBytes"] += int(row[idx], 16) + + +def etl2csv(xperf_path, etl_filename, debug=False): + """ + Convert etl_filename to etl_filename.csv (temp file) which is the .csv + representation of the .etl file Etlparser will read this .csv and parse + the information we care about into the final output. This is done to keep + things simple and to preserve resources on talos machines + (large files == high memory + cpu) + """ + + xperf_cmd = [ + xperf_path, + "-merge", + "%s.user" % etl_filename, + "%s.kernel" % etl_filename, + etl_filename, + ] + if debug: + print("executing '%s'" % subprocess.list2cmdline(xperf_cmd)) + subprocess.call(xperf_cmd) + + csv_filename = "%s.csv" % etl_filename + xperf_cmd = [xperf_path, "-i", etl_filename, "-o", csv_filename] + if debug: + print("executing '%s'" % subprocess.list2cmdline(xperf_cmd)) + subprocess.call(xperf_cmd) + return csv_filename + + +def trackProcess(row, firstFirefoxPID): + global gBrowserPID + if gBrowserPID: + return + + # Without the launcher, the initial Firefox process *is* the browser + # process. OTOH, with the launcher process enabled, the browser is actually + # the first child process of the first Firefox process. + parentPID = int(row[PARENT_PID_INDEX]) + if parentPID == firstFirefoxPID: + proc = row[PROCESS_INDEX] + gBrowserPID = int(re.search("^.* \(\s*(\d+)\)$", proc).group(1)) + + +def getBrowserPID(): + global gBrowserPID + return gBrowserPID + + +def trackThread(row, browserPID): + event, proc, tid = row[EVENTNAME_INDEX], row[PROCESS_INDEX], row[THREAD_ID_INDEX] + if event in ["T-DCStart", "T-Start"]: + procName, procID = re.search("^(.*) \(\s*(\d+)\)$", proc).group(1, 2) + if procID == str(browserPID): + imgIdx = getIndex(event, IMAGEFUNC_COL) + img = re.match("([^!]+)!", row[imgIdx]).group(1) + if img == procName: + gThreads[tid] = "main" + else: + gThreads[tid] = "nonmain" + elif event in ["T-DCEnd", "T-End"] and tid in gThreads: + del gThreads[tid] + + +def trackThreadFileIO(row, io, stage): + event, tid = row[EVENTNAME_INDEX], row[THREAD_ID_INDEX] + opType = {"FileIoWrite": "write", "FileIoRead": "read"}[event] + th, stg = gThreads[tid], stages[stage] + sizeIdx = getIndex(event, DISKBYTES_COL) + bytes = int(row[sizeIdx], 16) + io[(th, stg, "file_%s_ops" % opType)] = ( + io.get((th, stg, "file_%s_ops" % opType), 0) + 1 + ) + io[(th, stg, "file_%s_bytes" % opType)] = ( + io.get((th, stg, "file_%s_bytes" % opType), 0) + bytes + ) + io[(th, stg, "file_io_bytes")] = io.get((th, stg, "file_io_bytes"), 0) + bytes + + +def trackThreadNetIO(row, io, stage): + event, tid = row[EVENTNAME_INDEX], row[THREAD_ID_INDEX] + connIdIdx = getIndex(event, ACTIVITY_ID_COL) + connID = row[connIdIdx] + if connID not in gConnectionIDs: + gConnectionIDs[connID] = tid + origThread = gConnectionIDs[connID] + if origThread in gThreads: + match = re.match("[\w-]+\/([\w-]+)?", event) + if not match: + raise xtalos.XTalosError( + "Could not find a regular expression match for event: {}".format(event) + ) + netEvt = match.group(1) + + if netEvt in net_events: + opType = net_events[netEvt] + th, stg = gThreads[origThread], stages[stage] + # On newer versions of Windows, some net I/O events have switched to + # using BYTESSENT_COL, so we try both + lenIdx = getIndex(event, NUMBYTES_COL, BYTESSENT_COL) + bytes = int(row[lenIdx]) + io[(th, stg, "net_%s_bytes" % opType)] = ( + io.get((th, stg, "net_%s_bytes" % opType), 0) + bytes + ) + io[(th, stg, "net_io_bytes")] = io.get((th, stg, "net_io_bytes"), 0) + bytes + + +def updateStage(row, stage): + guidIdx = getIndex(row[EVENTNAME_INDEX], EVENTGUID_COL) + if row[guidIdx] == CEVT_WINDOWS_RESTORED and stage == 0: + stage = 1 + elif row[guidIdx] == CEVT_XPCOM_SHUTDOWN and stage == 1: + stage = 2 + return stage + + +def loadAllowlist(filename): + if not filename: + return + if not os.path.exists(filename): + print("Warning: xperf allowlist %s was not found" % filename) + return + lines = open(filename).readlines() + # Expand paths + lines = [os.path.expandvars(elem.strip()) for elem in lines] + files = set() + dirs = set() + recur = set() + for line in lines: + if line.startswith("#"): + continue + elif line.endswith("\\*\\*"): + recur.add(line[:-4]) + elif line.endswith("\\*"): + dirs.add(line[:-2]) + else: + files.add(line) + return (files, dirs, recur) + + +def checkAllowlist(filename, allowlist): + if not allowlist: + return False + if filename in allowlist[0]: + return True + if os.path.dirname(filename) in allowlist[1]: + return True + head = filename + while len(head) > 3: # Length 3 implies root directory, e.g. C:\ + head, tail = os.path.split(head) + if head in allowlist[2]: + return True + return False + + +def etlparser( + xperf_path, + etl_filename, + processID, + approot=None, + configFile=None, + outputFile=None, + allowlist_file=None, + error_filename=None, + all_stages=False, + all_threads=False, + debug=False, +): + global NAME_SUBSTITUTIONS + + # setup output file + if outputFile: + outFile = open(outputFile, "w") + else: + outFile = sys.stdout + + files = {} + io = {} + stage = 0 + + csvname = etl2csv(xperf_path, etl_filename, debug=debug) + for row in readFile(csvname): + event = row[EVENTNAME_INDEX] + if event == "P-Start": + trackProcess(row, processID) + elif event in ["T-DCStart", "T-Start", "T-DCEnd", "T-End"]: + trackThread(row, getBrowserPID()) + elif ( + event in ["FileIoRead", "FileIoWrite"] and row[THREAD_ID_INDEX] in gThreads + ): + fileSummary(row, stage, files) + trackThreadFileIO(row, io, stage) + elif event.endswith("Event/Classic") and row[THREAD_ID_INDEX] in gThreads: + stage = updateStage(row, stage) + elif event.startswith("Microsoft-Windows-TCPIP"): + trackThreadNetIO(row, io, stage) + + if debug: + uploadFile(csvname) + + output = "thread, stage, counter, value\n" + for cntr in sorted(io.keys()): + output += "%s, %s\n" % (", ".join(cntr), str(io[cntr])) + if outputFile: + fname = "%s_thread_stats%s" % os.path.splitext(outputFile) + with open(fname, "w") as f: + f.write(output) + + if debug: + uploadFile(fname) + else: + print(output) + + allowlist = loadAllowlist(allowlist_file) + + header = "filename, tid, stage, readcount, readbytes, writecount," " writebytes" + outFile.write(header + "\n") + + # Filter out stages, threads, and allowlisted files that we're not + # interested in + filekeys = [ + x + for x in files.keys() + if (all_stages or x[2] == stages[0]) + and (all_threads or x[1].endswith("(main)")) + and (all_stages and x[2] != stages[0] or not checkAllowlist(x[0], allowlist)) + ] + if debug: + # in debug, we want stages = [startup+normal] and all threads, not just (main) + # we will use this data to upload fileIO info to blobber only for debug mode + outputData = [ + x + for x in files.keys() + if (all_stages or x[2] in [stages[0], stages[1]]) + and ( + all_stages + and x[2] not in [stages[0], stages[1]] + or not checkAllowlist(x[0], allowlist) + ) + ] + else: + outputData = filekeys + + # output data + for row in outputData: + output = "%s, %s, %s, %s, %s, %s, %s\n" % ( + row[0], + row[1], + row[2], + files[row]["DiskReadCount"], + files[row]["DiskReadBytes"], + files[row]["DiskWriteCount"], + files[row]["DiskWriteBytes"], + ) + outFile.write(output) + + if outputFile: + # close the file handle + outFile.close() + if debug: + uploadFile(outputFile) + + # We still like to have the outputfile to record the raw data, now + # filter out acceptable files/ranges + allowlist_path = None + wl_temp = {} + dirname = os.path.dirname(__file__) + if os.path.exists(os.path.join(dirname, "xperf_allowlist.json")): + allowlist_path = os.path.join(dirname, "xperf_allowlist.json") + elif os.path.exists(os.path.join(dirname, "xtalos")) and os.path.exists( + os.path.join(dirname, "xtalos", "xperf_allowlist.json") + ): + allowlist_path = os.path.join(dirname, "xtalos", "xperf_allowlist.json") + + wl_temp = {} + if allowlist_path: + with open(allowlist_path, "r") as fHandle: + wl_temp = json.load(fHandle) + + # Approot is the full path where the application is located at + # We depend on it for dependentlibs.list to ignore files required for + # normal startup. + if approot: + if os.path.exists("%s\\dependentlibs.list" % approot): + with open("%s\\dependentlibs.list" % approot, "r") as fhandle: + libs = fhandle.readlines() + + for lib in libs: + wl_temp["{firefox}\\%s" % lib.strip()] = {"ignore": True} + + # Windows isn't case sensitive, this protects us against mismatched + # systems. + wl = {} + for item in wl_temp: + wl[item.lower()] = wl_temp[item] + + errors = [] + for row in filekeys: + filename = original_filename = row[0] + filename = filename.lower() + # take care of 'program files (x86)' matching 'program files' + filename = filename.replace(" (x86)", "") + + paths = ["profile", "firefox", "desktop", "talos"] + for path in paths: + pathname = "%s\\" % path + parts = filename.split(pathname) + if len(parts) >= 2: + filename = "{%s}\\%s" % (path, pathname.join(parts[1:])) + + parts = filename.split("\\installtime") + if len(parts) >= 2: + filename = "%s\\{time}" % parts[0] + + # NOTE: this is Prefetch or prefetch, not case sensitive operating + # system + parts = filename.split("refetch") + if len(parts) >= 2: + filename = "%srefetch\\{prefetch}.pf" % parts[0] + + for pattern, substitution in NAME_SUBSTITUTIONS: + filename = re.sub(pattern, substitution, filename) + + if filename in wl: + if "ignore" in wl[filename] and wl[filename]["ignore"]: + continue + + # too noisy + # if wl[filename]['minbytes'] > (files[row]['DiskReadBytes'] +\ + # files[row]['DiskWriteBytes']): + # print "%s: read %s bytes, less than expected minimum: %s" + # % (filename, (files[row]['DiskReadBytes'] + + # files[row]['DiskWriteBytes']), + # wl[filename]['minbytes']) + + # don't report in first round + # elif wl[filename]['maxbytes'] < (files[row]['DiskReadBytes'] +\ + # files[row]['DiskWriteBytes']): + # errors.append("%s: read %s bytes, more than expected maximum:" + # " %s" + # % (filename, (files[row]['DiskReadBytes'] + + # files[row]['DiskWriteBytes']), + # wl[filename]['maxbytes'])) + + # too noisy + # elif wl[filename]['mincount'] > (files[row]['DiskReadCount'] +\ + # files[row]['DiskWriteCount']): + # print "%s: %s accesses, less than expected minimum: %s" + # % (filename, (files[row]['DiskReadCount'] + + # files[row]['DiskWriteCount']), + # wl[filename]['mincount']) + + # don't report in first round + # elif wl[filename]['maxcount'] < (files[row]['DiskReadCount'] +\ + # files[row]['DiskWriteCount']): + # errors.append("%s: %s accesses, more than expected maximum:" + # " %s" + # % (filename, (files[row]['DiskReadCount'] + + # files[row]['DiskWriteCount']), + # wl[filename]['maxcount'])) + else: + errors.append( + "File '%s' (normalized from '%s') was accessed and we were not expecting" + " it. DiskReadCount: %s, DiskWriteCount: %s," + " DiskReadBytes: %s, DiskWriteBytes: %s" + % ( + filename, + original_filename, + files[row]["DiskReadCount"], + files[row]["DiskWriteCount"], + files[row]["DiskReadBytes"], + files[row]["DiskWriteBytes"], + ) + ) + + if errors: + # output specific errors to be picked up by tbpl parser + for error in errors: + # NOTE: the ' :' is intentional, without the space before the :, + # some parser will translate this + print("TEST-UNEXPECTED-FAIL : xperf: %s" % error) + + # We detect if browser_failures.txt exists to exit and turn the job + # orange + if error_filename: + with open(error_filename, "w") as errorFile: + errorFile.write("\n".join(errors)) + + if debug: + uploadFile(etl_filename) + + +def etlparser_from_config(config_file, **kwargs): + """start from a YAML config file""" + + # option defaults + args = { + "xperf_path": "xperf.exe", + "etl_filename": "test.etl", + "outputFile": "etl_output.csv", + "processID": None, + "approot": None, + "allowlist_file": None, + "error_filename": None, + "all_stages": False, + "all_threads": False, + } + args.update(kwargs) + + # override from YAML config file + args = xtalos.options_from_config(args, config_file) + + # ensure process ID is given + if not args.get("processID"): + raise xtalos.XTalosError("No process ID option given") + + # ensure path to xperf given + if not os.path.exists(args["xperf_path"]): + raise xtalos.XTalosError( + "ERROR: xperf_path '%s' does not exist" % args["xperf_path"] + ) + + # update args with config file + args["configFile"] = config_file + + # call etlparser + etlparser(**args) + + +def main(args=sys.argv[1:]): + + # parse command line arguments + parser = xtalos.XtalosOptions() + args = parser.parse_args(args) + args = parser.verifyOptions(args) + if args is None: + parser.error("Unable to verify arguments") + if not args.processID: + parser.error("No process ID argument given") + + # call API + etlparser( + args.xperf_path, + args.etl_filename, + args.processID, + args.approot, + args.configFile, + args.outputFile, + args.allowlist_file, + args.error_filename, + args.all_stages, + args.all_threads, + debug=args.debug_level >= xtalos.DEBUG_INFO, + ) + + +if __name__ == "__main__": + main() |