diff options
Diffstat (limited to 'dom/quota/scripts/stackanalysis.py')
-rw-r--r-- | dom/quota/scripts/stackanalysis.py | 396 |
1 file changed, 396 insertions, 0 deletions
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


def sanitize(rows):
    """Drop consecutive duplicate telemetry events.

    Telemetry sometimes records identical events twice.  Two adjacent rows
    with the same (client_id, session_id, seq) triple are considered
    duplicates and only the first is kept.  Relies on rows being ordered so
    that duplicates are adjacent.

    Fix over the previous version: the sentinel for "no previous row" is
    None instead of the string "unset", so a first row whose fields happen
    to equal "unset" can no longer be dropped by accident.
    """
    newrows = []
    prev_key = None  # (client_id, session_id, seq) of the last kept row
    for row in rows:
        key = (row["client_id"], row["session_id"], row["seq"])
        if key != prev_key:
            newrows.append(row)
            prev_key = key
    return newrows


def extractBuildIDs(rows):
    """Return a dict mapping each distinct build_id to its occurrence count."""
    buildids = {}
    for row in rows:
        build_id = row["build_id"]
        # dict.get folds the membership test and the increment into one
        # expression; also avoids shadowing the builtin `id`.
        buildids[build_id] = buildids.get(build_id, 0) + 1
    return buildids


# Given a set of build ids and rows, enrich each row by an hg link.
# Relies on the result of utils.fetchBuildRevisions in buildids.
def constructHGLinks(buildids, rows):
    """Enrich each row with an hg source link in row["location"].

    buildids maps build_id -> hg revision URL prefix (the result of
    utils.fetchBuildRevisions).  For unknown build ids the raw build id is
    used as prefix so the row still gets a (non-clickable) location.
    """
    for row in rows:
        build_id = row["build_id"]
        # Both branches of the old code were identical except for the
        # prefix; dict.get expresses the fallback directly.
        prefix = buildids.get(build_id, build_id)
        row["location"] = "{}/{}#l{}".format(
            prefix, row["source_file"], row["source_line"]
        )


# Module-level state shared by the topmost-frame detection below.
topmost_stackframes = set()  # set of (location, result) pairs
delta_frames = {}  # "loc2:res2-loc1:res1" -> accumulated timing info


def isTopmostFrame(frame):
    """Return True if frame's (location, result) is a known topmost frame."""
    return (frame["location"], frame["result"]) in topmost_stackframes


def addTopmostFrame(frame):
    """Register frame's (location, result) pair as a topmost frame.

    NOTE(review): only the *first* row carrying a new pair gets the
    "topmost" flag; later rows with an already-known pair are recognized
    via isTopmostFrame() but remain unflagged.  Preserved as-is because
    checkAverageFrameTimeDeltas keys off the flag, not the set.
    """
    key = (frame["location"], frame["result"])
    if key not in topmost_stackframes:
        # print("Found new topmost frame {}.".format(frame))
        topmost_stackframes.add(key)
        frame["topmost"] = True


def addFrameDelta(frame1, frame2):
    """Accumulate the timestamp delta between two consecutive frames.

    Only frames from the same client and session are related.  Deltas are
    keyed by the location/result transition so checkAverageFrameTimeDeltas
    can compute an average per transition.  Non-integer or non-increasing
    timestamps are counted as zero samples.
    """
    if frame1["client_id"] != frame2["client_id"]:
        return
    if frame1["session_id"] != frame2["session_id"]:
        return

    fkey = "{}:{}-{}:{}".format(
        frame2["location"], frame2["result"], frame1["location"], frame1["result"]
    )
    # setdefault replaces the explicit membership test of the old code.
    fdelta = delta_frames.setdefault(
        fkey,
        {"delta_sum": 0, "delta_cnt": 0, "prev_row": frame1, "candidate": frame2},
    )

    etv1 = frame1["event_timestamp"]
    etv2 = frame2["event_timestamp"]
    if isinstance(etv1, int) and isinstance(etv2, int) and etv2 > etv1:
        fdelta["delta_sum"] += etv2 - etv1
        fdelta["delta_cnt"] += 1


# There can be outliers in terms of time distance between two stack frames
# that belong to the same propagation stack. In order to not increase the
# risk that one outlier breaks thousands of stacks, we check for the average
# time distance.
def checkAverageFrameTimeDeltas(rows, max_delta):
    """Promote transition targets whose average time delta exceeds max_delta."""
    prev_row = None
    for row in rows:
        # Topmost frames start a new stack and incomplete sessions are
        # ignored entirely, so neither contributes a delta.
        if "topmost" in row or not row["session_complete"]:
            prev_row = None
            continue
        if prev_row:
            addFrameDelta(prev_row, row)
        prev_row = row

    # Iterate the values directly; the old code shadowed the builtin `sum`
    # and re-indexed the dict for every access.
    for fdelta in delta_frames.values():
        total = fdelta["delta_sum"]
        cnt = fdelta["delta_cnt"]
        if cnt > 0 and (total / cnt) > max_delta:
            addTopmostFrame(fdelta["candidate"])


# A topmost frame is considered to initiate a new raw stack. We collect all
# candidates before we actually apply them. This implies, that we should run
# this function on a "large enough" sample of rows to be more accurate.
# As a side effect, we mark all rows that are part of a "complete" session
# (a session, that started within our data scope).
def collectTopmostFrames(rows):
    prev_cid = "unset"
    prev_sid = "unset"
    prev_tid = "unset"
    prev_ctx = "unset"
    prev_sev = "ERROR"
    session_complete = False
    after_severity_downgrade = False
    for row in rows:
        cid = row["client_id"]
        sid = row["session_id"]
        tid = row["seq"] >> 32  # upper 32 bits encode the thread id
        ctx = row["context"]
        seq = row["seq"] & 0x00000000FFFFFFFF  # lower 32 bits: sequence no.
        sev = row["severity"]

        # If we have a new session, ensure it is complete from start,
        # otherwise we will ignore it entirely.
        if cid != prev_cid or sid != prev_sid or tid != prev_tid:
            session_complete = seq == 1
        row["session_complete"] = session_complete
        if session_complete:
            # If we change client, session, thread or context, we can be
            # sure to have a new topmost frame.
            if (
                seq == 1
                or cid != prev_cid
                or sid != prev_sid
                or tid != prev_tid
                or ctx != prev_ctx
            ):
                addTopmostFrame(row)
                after_severity_downgrade = False
            # We do not expect a non-error to be ever upgraded to an error
            elif sev == "ERROR" and prev_sev != "ERROR":
                addTopmostFrame(row)
                after_severity_downgrade = False
            # If we just had a severity downgrade, we assume that we wanted
            # to break the error propagation after this point and split, too
            elif after_severity_downgrade:
                addTopmostFrame(row)
                after_severity_downgrade = False
            elif prev_sev == "ERROR" and sev != "ERROR":
                after_severity_downgrade = True

        prev_cid = cid
        prev_sid = sid
        prev_tid = tid
        prev_ctx = ctx
        prev_sev = sev

    # Should be ms. We've seen quite some runtime between stackframes in the
    # wild. We might want to consider to make this configurable. In general
    # we prefer local context over letting slip through some topmost frame
    # unrecognized, assuming that fixing the issues one by one they will
    # uncover them successively. This is achieved by a rather high delta value.
    max_avg_delta = 200
    checkAverageFrameTimeDeltas(rows, max_avg_delta)


def getFrameKey(frame):
    """Return the per-frame component of a stack hash key."""
    return "{}.{}|".format(frame["location"], frame["result"])


def getStackKey(stack):
    """Return a hash identifying the frame sequence of the given stack."""
    return hash("".join(getFrameKey(frame) for frame in stack["frames"]))


# A "raw stack" is a list of frames, that:
# - share the same build_id (implicitly through location)
# - share the same client_id
# - share the same session_id
# - has a growing sequence number
# - stops at the first downgrade of severity from ERROR to else
# - XXX: contains each location at most once (no recursion)
# - appears to be in a reasonably short timeframe
# Calculates also a hash key to identify identical stacks
def collectRawStacks(rows):
    collectTopmostFrames(rows)
    raw_stacks = []
    # Placeholder stack; discarded via `first` before it can be appended.
    stack = {
        "stack_id": "unset",
        "client_id": "unset",
        "session_id": "unset",
        "submit_timeabs": "unset",
        "frames": [{"location": "unset"}],
    }
    stack_id = 1
    first = True
    for row in rows:
        if isTopmostFrame(row):
            # NOTE(review): the stack in progress is only flushed when the
            # *next* topmost frame arrives, so the trailing stack of the
            # input is never appended.  Preserved as-is — presumably
            # acceptable for large samples; confirm with callers.
            if not first:
                stack["stack_key"] = getStackKey(stack)
                raw_stacks.append(stack)
                stack_id += 1
            stack = {
                "stack_id": stack_id,
                "client_id": row["client_id"],
                "session_id": row["session_id"],
                "submit_timeabs": row["submit_timeabs"],
                "context": row["context"],
                "frames": [],
            }

        stack["frames"].append(
            {
                "location": row["location"],
                "source_file": row["source_file"],
                "source_line": row["source_line"],
                "seq": row["seq"],
                "severity": row["severity"],
                "result": row["result"],
            }
        )
        first = False

    return raw_stacks


# Merge all stacks that have the same hash key and count occurrences.
# Relies on the ordering per client_id/session_id for correct counting.
def mergeEqualStacks(raw_stacks):
    """Merge stacks with identical stack_key and count occurrences.

    Counts hits, distinct clients and distinct sessions per merged stack.
    Relies on raw_stacks being ordered per client_id/session_id so that a
    change of id reliably indicates a new client/session.  Returns the
    merged stacks sorted by descending hit count.
    """
    merged_stacks = {}
    last_client_id = "none"
    last_session_id = "none"
    for stack in raw_stacks:
        stack_key = stack["stack_key"]
        if stack_key in merged_stacks:
            merged_stack = merged_stacks[stack_key]
            if stack["client_id"] != last_client_id:
                last_client_id = stack["client_id"]
                merged_stack["client_count"] += 1
            if stack["session_id"] != last_session_id:
                last_session_id = stack["session_id"]
                merged_stack["session_count"] += 1
            merged_stack["hit_count"] += 1
        else:
            # First occurrence: the stack itself becomes the merged record.
            merged_stack = stack
            merged_stack["client_count"] = 1
            last_client_id = merged_stack["client_id"]
            merged_stack["session_count"] = 1
            last_session_id = merged_stack["session_id"]
            merged_stack["hit_count"] = 1
            merged_stacks[stack_key] = merged_stack

    merged_list = list(merged_stacks.values())
    merged_list.sort(key=lambda x: x["hit_count"], reverse=True)
    return merged_list


# Split the list of stacks into:
# - aborted (has at least one frame with NS_ERROR_ABORT)
# - info/warning (has at least one frame with that severity)
# - error (has only error frames)
def filterStacksForPropagation(
    all_stacks, error_stacks, warn_stacks, info_stacks, abort_stacks
):
    """Append each stack of all_stacks to exactly one of the given buckets."""
    for stack in all_stacks:
        frames = stack["frames"]
        # any() short-circuits; the old code materialized three filtered
        # lists only to test their length.
        if any(f["result"] == "NS_ERROR_ABORT" for f in frames):
            abort_stacks.append(stack)
        elif any(f["severity"] == "INFO" for f in frames):
            info_stacks.append(stack)
        elif any(f["severity"] == "WARNING" for f in frames):
            warn_stacks.append(stack)
        else:
            error_stacks.append(stack)


# Bugzilla comment markup
def printStacks(stacks):
    """Render stacks as a Bugzilla-markup table and return it as a string."""
    row_format = "{} | {} | {} | {} | {}\n"
    out = row_format.format("Clients", "Sessions", "Hits", "Anchor (Context)", "Stack")
    out += row_format.format("-------", "--------", "----", "----------------", "-----")
    for stack in stacks:
        # join() replaces the manual first-element bookkeeping of the old
        # code; output bytes are identical.
        framestr = " <- ".join(
            "[{}#{}:{}]({})".format(
                frame["source_file"],
                frame["source_line"],
                frame["result"],
                frame["location"],
            )
            for frame in stack["frames"]
        )
        out += row_format.format(
            stack["client_count"],
            stack["session_count"],
            stack["hit_count"],
            "{} ({})".format(stack["frames"][0]["anchor"], stack["context"]),
            framestr,
        )

    return out


def groupStacksForAnchors(stacks):
    """Group stacks by the anchor (function name) of their first frame."""
    anchors = {}
    for stack in stacks:
        anchor_name = stack["frames"][0]["anchor"]
        if anchor_name in anchors:
            anchors[anchor_name]["stacks"].append(stack)
        else:
            anchors[anchor_name] = {"anchor": anchor_name, "stacks": [stack]}
    return anchors


# Disabled Bugzilla integration, kept for reference (needs `requests` and an
# API key before it can be re-enabled).
"""
def getSummaryForAnchor(anchor):
    return "[QM_TRY] Errors in function {}".format(anchor)


def searchBugForAnchor(bugzilla_key, anchor):
    summary = getSummaryForAnchor(anchor)
    bug_url = "https://bugzilla.mozilla.org/rest/bug?" \
        "summary={}&api_key={}".format(summary, bugzilla_key)
    return requests.get(url=bug_url).json()["bugs"]


def createBugForAnchor(bugzilla_key, anchor):
    summary = getSummaryForAnchor(anchor)
    bug_url = "https://bugzilla.mozilla.org/rest/bug?" \
        "Bugzilla_api_key={}".format(bugzilla_key)
    body = {
        "product" : "Core",
        "component" : "Storage: Quota Manager",
        "version" : "unspecified",
        "summary" : summary,
        "description" : "This bug collects errors reported by QM_TRY"
            "macros for function {}.".format(anchor),
    }
    resp = requests.post(url=bug_url, json=body)
    if resp.status_code != 200:
        print(resp)
        return 0
    id = resp.json()["id"]
    print("Added new bug {}:".format(id))
    return id


def ensureBugForAnchor(bugzilla_key, anchor):
    buglist = searchBugForAnchor(bugzilla_key, anchor)
    if (len(buglist) > 0):
        id = buglist[0]["id"]
        print("Found existing bug {}:".format(id))
        return id
    return createBugForAnchor(bugzilla_key, anchor)


def addCommentForAnchor(bugzilla_key, anchor, stacks):
    id = ensureBugForAnchor(bugzilla_key, anchor)
    if (id <= 0):
        print("Unable to create a bug for {}.".format(anchor))
        return
    comment = printStacks(stacks)
    print("")
    print("Add comment to bug {}:".format(id))
    print(comment)


def addCommentsForStacks(bugzilla_key, stacks):
    anchors = groupStacksForAnchors(stacks)
    for anchor in anchors:
        addCommentForAnchor(bugzilla_key, anchors[anchor]["anchor"], anchors[anchor]["stacks"])
"""