author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000
commit    | 26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree      | f435a8308119effd964b339f76abb83a57c29483 /testing/mozbase/mozsystemmonitor
parent    | Initial commit. (diff)
download  | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz, firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1. (tag: upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'testing/mozbase/mozsystemmonitor')
7 files changed, 1454 insertions, 0 deletions
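
For orientation before the full diff: the new mozsystemmonitor.resourcemonitor.SystemResourceMonitor class added below is used as a start/stop recorder with optional point events and named phases. A minimal usage sketch, condensed from the class docstring in resourcemonitor.py (the poll interval, phase name, and do_long_running_job() are illustrative placeholders):

    from mozsystemmonitor.resourcemonitor import SystemResourceMonitor

    monitor = SystemResourceMonitor(poll_interval=0.5)  # sample twice per second
    monitor.start()

    monitor.record_event("work_started")      # point-in-time event
    with monitor.phase("long_job"):           # named phase with a duration
        do_long_running_job()                 # placeholder for the real work

    monitor.stop()                            # data only becomes available after stop()

    # Aggregate CPU usage per core over the whole run, then per phase.
    print(monitor.aggregate_cpu_percent())
    for sample in monitor.phase_usage("long_job"):
        print(sample.cpu_percent)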
diff --git a/testing/mozbase/mozsystemmonitor/README.rst b/testing/mozbase/mozsystemmonitor/README.rst new file mode 100644 index 0000000000..c8dad37671 --- /dev/null +++ b/testing/mozbase/mozsystemmonitor/README.rst @@ -0,0 +1,12 @@ +================ +mozsystemmonitor +================ + +mozsystemmonitor contains modules for monitoring a running system. + +SystemResourceMonitor +===================== + +mozsystemmonitor.resourcemonitor.SystemResourceMonitor is class used to +measure system resource usage. It is useful to get a handle on what an +overall system is doing. diff --git a/testing/mozbase/mozsystemmonitor/mozsystemmonitor/__init__.py b/testing/mozbase/mozsystemmonitor/mozsystemmonitor/__init__.py new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/testing/mozbase/mozsystemmonitor/mozsystemmonitor/__init__.py diff --git a/testing/mozbase/mozsystemmonitor/mozsystemmonitor/resourcemonitor.py b/testing/mozbase/mozsystemmonitor/mozsystemmonitor/resourcemonitor.py new file mode 100644 index 0000000000..978d7d6911 --- /dev/null +++ b/testing/mozbase/mozsystemmonitor/mozsystemmonitor/resourcemonitor.py @@ -0,0 +1,1220 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +import multiprocessing +import sys +import time +import warnings +from collections import OrderedDict, namedtuple +from contextlib import contextmanager + + +class PsutilStub(object): + def __init__(self): + self.sswap = namedtuple( + "sswap", ["total", "used", "free", "percent", "sin", "sout"] + ) + self.sdiskio = namedtuple( + "sdiskio", + [ + "read_count", + "write_count", + "read_bytes", + "write_bytes", + "read_time", + "write_time", + ], + ) + self.pcputimes = namedtuple("pcputimes", ["user", "system"]) + self.svmem = namedtuple( + "svmem", + [ + "total", + "available", + "percent", + "used", + "free", + "active", + "inactive", + "buffers", + "cached", + ], + ) + + def cpu_count(self, logical=True): + return 0 + + def cpu_percent(self, a, b): + return [0] + + def cpu_times(self, percpu): + if percpu: + return [self.pcputimes(0, 0)] + else: + return self.pcputimes(0, 0) + + def disk_io_counters(self): + return self.sdiskio(0, 0, 0, 0, 0, 0) + + def swap_memory(self): + return self.sswap(0, 0, 0, 0, 0, 0) + + def virtual_memory(self): + return self.svmem(0, 0, 0, 0, 0, 0, 0, 0, 0) + + +# psutil will raise NotImplementedError if the platform is not supported. +try: + import psutil + + have_psutil = True +except Exception: + try: + # The PsutilStub should get us time intervals, at least + psutil = PsutilStub() + except Exception: + psutil = None + + have_psutil = False + + +def get_disk_io_counters(): + try: + io_counters = psutil.disk_io_counters() + + if io_counters is None: + return PsutilStub().disk_io_counters() + except RuntimeError: + io_counters = PsutilStub().disk_io_counters() + + return io_counters + + +def _poll(pipe, poll_interval=0.1): + """Wrap multiprocessing.Pipe.poll to hide POLLERR and POLLIN + exceptions. + + multiprocessing.Pipe is not actually a pipe on at least Linux. + That has an effect on the expected outcome of reading from it when + the other end of the pipe dies, leading to possibly hanging on revc() + below. + """ + try: + return pipe.poll(poll_interval) + except Exception: + # Poll might throw an exception even though there's still + # data to read. 
That happens when the underlying system call + # returns both POLLERR and POLLIN, but python doesn't tell us + # about it. So assume there is something to read, and we'll + # get an exception when trying to read the data. + return True + + +def _collect(pipe, poll_interval): + """Collects system metrics. + + This is the main function for the background process. It collects + data then forwards it on a pipe until told to stop. + """ + + data = [] + + try: + # Establish initial values. + + io_last = get_disk_io_counters() + swap_last = psutil.swap_memory() + psutil.cpu_percent(None, True) + cpu_last = psutil.cpu_times(True) + last_time = time.monotonic() + + sin_index = swap_last._fields.index("sin") + sout_index = swap_last._fields.index("sout") + + sleep_interval = poll_interval + + while not _poll(pipe, poll_interval=sleep_interval): + io = get_disk_io_counters() + virt_mem = psutil.virtual_memory() + swap_mem = psutil.swap_memory() + cpu_percent = psutil.cpu_percent(None, True) + cpu_times = psutil.cpu_times(True) + # Take the timestamp as soon as possible after getting cpu_times + # to reduce the likelihood of our process being interrupted between + # the two instructions. Having a delayed timestamp would cause the + # next sample to report more than 100% CPU time. + measured_end_time = time.monotonic() + + # TODO Does this wrap? At 32 bits? At 64 bits? + # TODO Consider patching "delta" API to upstream. + io_diff = [v - io_last[i] for i, v in enumerate(io)] + io_last = io + + cpu_diff = [] + for core, values in enumerate(cpu_times): + cpu_diff.append([v - cpu_last[core][i] for i, v in enumerate(values)]) + + cpu_last = cpu_times + + swap_entry = list(swap_mem) + swap_entry[sin_index] = swap_mem.sin - swap_last.sin + swap_entry[sout_index] = swap_mem.sout - swap_last.sout + swap_last = swap_mem + + data.append( + ( + last_time, + measured_end_time, + io_diff, + cpu_diff, + cpu_percent, + list(virt_mem), + swap_entry, + ) + ) + + collection_overhead = time.monotonic() - last_time - sleep_interval + last_time = measured_end_time + sleep_interval = max(poll_interval / 2, poll_interval - collection_overhead) + + except Exception as e: + warnings.warn("_collect failed: %s" % e) + + finally: + for entry in data: + pipe.send(entry) + + pipe.send(("done", None, None, None, None, None, None)) + pipe.close() + + sys.exit(0) + + +SystemResourceUsage = namedtuple( + "SystemResourceUsage", + ["start", "end", "cpu_times", "cpu_percent", "io", "virt", "swap"], +) + + +class SystemResourceMonitor(object): + """Measures system resources. + + Each instance measures system resources from the time it is started + until it is finished. It does this on a separate process so it doesn't + impact execution of the main Python process. + + Each instance is a one-shot instance. It cannot be used to record multiple + durations. + + Aside from basic data gathering, the class supports basic analysis + capabilities. You can query for data between ranges. You can also tell it + when certain events occur and later grab data relevant to those events or + plot those events on a timeline. + + The resource monitor works by periodically polling the state of the + system. By default, it polls every second. This can be adjusted depending + on the required granularity of the data and considerations for probe + overhead. It tries to probe at the interval specified. However, variations + should be expected. Fast and well-behaving systems should experience + variations in the 1ms range. 
Larger variations may exist if the system is + under heavy load or depending on how accurate socket polling is on your + system. + + In its current implementation, data is not available until collection has + stopped. This may change in future iterations. + + Usage + ===== + + monitor = SystemResourceMonitor() + monitor.start() + + # Record that a single event in time just occurred. + foo.do_stuff() + monitor.record_event('foo_did_stuff') + + # Record that we're about to perform a possibly long-running event. + with monitor.phase('long_job'): + foo.do_long_running_job() + + # Stop recording. Currently we need to stop before data is available. + monitor.stop() + + # Obtain the raw data for the entire probed range. + print('CPU Usage:') + for core in monitor.aggregate_cpu(): + print(core) + + # We can also request data corresponding to a specific phase. + for data in monitor.phase_usage('long_job'): + print(data.cpu_percent) + """ + + # The interprocess communication is complicated enough to warrant + # explanation. To work around the Python GIL, we launch a separate + # background process whose only job is to collect metrics. If we performed + # collection in the main process, the polling interval would be + # inconsistent if a long-running function were on the stack. Since the + # child process is independent of the instantiating process, data + # collection should be evenly spaced. + # + # As the child process collects data, it buffers it locally. When + # collection stops, it flushes all that data to a pipe to be read by + # the parent process. + + instance = None + + def __init__(self, poll_interval=1.0, metadata={}): + """Instantiate a system resource monitor instance. + + The instance is configured with a poll interval. This is the interval + between samples, in float seconds. + """ + self.start_time = None + self.end_time = None + + self.events = [] + self.markers = [] + self.phases = OrderedDict() + + self._active_phases = {} + self._active_markers = {} + + self._running = False + self._stopped = False + self._process = None + + if psutil is None: + return + + # This try..except should not be needed! However, some tools (like + # |mach build|) attempt to load psutil before properly creating a + # virtualenv by building psutil. As a result, python/psutil may be in + # sys.path and its .py files may pick up the psutil C extension from + # the system install. If the versions don't match, we typically see + # failures invoking one of these functions. + try: + cpu_percent = psutil.cpu_percent(0.0, True) + cpu_times = psutil.cpu_times(False) + io = get_disk_io_counters() + virt = psutil.virtual_memory() + swap = psutil.swap_memory() + except Exception as e: + warnings.warn("psutil failed to run: %s" % e) + return + + self._cpu_cores = len(cpu_percent) + self._cpu_times_type = type(cpu_times) + self._cpu_times_len = len(cpu_times) + self._io_type = type(io) + self._io_len = len(io) + self._virt_type = type(virt) + self._virt_len = len(virt) + self._swap_type = type(swap) + self._swap_len = len(swap) + self.start_timestamp = time.time() + + self._pipe, child_pipe = multiprocessing.Pipe(True) + + self._process = multiprocessing.Process( + target=_collect, args=(child_pipe, poll_interval) + ) + self.poll_interval = poll_interval + self.metadata = metadata + + def __del__(self): + if self._running: + self._pipe.send(("terminate",)) + self._process.join() + + # Methods to control monitoring. + + def start(self): + """Start measuring system-wide CPU resource utilization. 
+ + You should only call this once per instance. + """ + if not self._process: + return + + self._process.start() + self._running = True + self.start_time = time.monotonic() + SystemResourceMonitor.instance = self + + def stop(self): + """Stop measuring system-wide CPU resource utilization. + + You should call this if and only if you have called start(). You should + always pair a stop() with a start(). + + Currently, data is not available until you call stop(). + """ + if not self._process: + self._stopped = True + return + + self.stop_time = time.monotonic() + assert not self._stopped + + try: + self._pipe.send(("terminate",)) + except Exception: + pass + self._stopped = True + + self.measurements = [] + + # The child process will send each data sample over the pipe + # as a separate data structure. When it has finished sending + # samples, it sends a special "done" message to indicate it + # is finished. + + while _poll(self._pipe, poll_interval=0.1): + try: + ( + start_time, + end_time, + io_diff, + cpu_diff, + cpu_percent, + virt_mem, + swap_mem, + ) = self._pipe.recv() + except Exception as e: + warnings.warn("failed to receive data: %s" % e) + # Assume we can't recover + break + + # There should be nothing after the "done" message so + # terminate. + if start_time == "done": + break + + try: + io = self._io_type(*io_diff) + virt = self._virt_type(*virt_mem) + swap = self._swap_type(*swap_mem) + cpu_times = [self._cpu_times_type(*v) for v in cpu_diff] + + self.measurements.append( + SystemResourceUsage( + start_time, end_time, cpu_times, cpu_percent, io, virt, swap + ) + ) + except Exception: + # We also can't recover, but output the data that caused the exception + warnings.warn( + "failed to read the received data: %s" + % str( + ( + start_time, + end_time, + io_diff, + cpu_diff, + cpu_percent, + virt_mem, + swap_mem, + ) + ) + ) + + break + + # We establish a timeout so we don't hang forever if the child + # process has crashed. + if self._running: + self._process.join(10) + if self._process.is_alive(): + self._process.terminate() + self._process.join(10) + + self._running = False + SystemResourceUsage.instance = None + self.end_time = time.monotonic() + + # Methods to record events alongside the monitored data. + + @staticmethod + def record_event(name): + """Record an event as occuring now. + + Events are actions that occur at a specific point in time. If you are + looking for an action that has a duration, see the phase API below. + """ + if SystemResourceMonitor.instance: + SystemResourceMonitor.instance.events.append((time.monotonic(), name)) + + @staticmethod + def record_marker(name, start, end, text): + """Record a marker with a duration and an optional text + + Markers are typically used to record when a single command happened. + For actions with a longer duration that justifies tracking resource use + see the phase API below. 
+ """ + if SystemResourceMonitor.instance: + SystemResourceMonitor.instance.markers.append((name, start, end, text)) + + @staticmethod + def begin_marker(name, text, disambiguator=None): + if SystemResourceMonitor.instance: + id = name + ":" + text + if disambiguator: + id += ":" + disambiguator + SystemResourceMonitor.instance._active_markers[id] = time.monotonic() + + @staticmethod + def end_marker(name, text, disambiguator=None): + if not SystemResourceMonitor.instance: + return + end = time.monotonic() + id = name + ":" + text + if disambiguator: + id += ":" + disambiguator + if not id in SystemResourceMonitor.instance._active_markers: + return + start = SystemResourceMonitor.instance._active_markers.pop(id) + SystemResourceMonitor.instance.record_marker(name, start, end, text) + + @contextmanager + def phase(self, name): + """Context manager for recording an active phase.""" + self.begin_phase(name) + yield + self.finish_phase(name) + + def begin_phase(self, name): + """Record the start of a phase. + + Phases are actions that have a duration. Multiple phases can be active + simultaneously. Phases can be closed in any order. + + Keep in mind that if phases occur in parallel, it will become difficult + to isolate resource utilization specific to individual phases. + """ + assert name not in self._active_phases + + self._active_phases[name] = time.monotonic() + + def finish_phase(self, name): + """Record the end of a phase.""" + + assert name in self._active_phases + + phase = (self._active_phases[name], time.monotonic()) + self.phases[name] = phase + del self._active_phases[name] + + return phase[1] - phase[0] + + # Methods to query data. + + def range_usage(self, start=None, end=None): + """Obtain the usage data falling within the given time range. + + This is a generator of SystemResourceUsage. + + If no time range bounds are given, all data is returned. + """ + if not self._stopped or self.start_time is None: + return + + if start is None: + start = self.start_time + + if end is None: + end = self.end_time + + for entry in self.measurements: + if entry.start < start: + continue + + if entry.end > end: + break + + yield entry + + def phase_usage(self, phase): + """Obtain usage data for a specific phase. + + This is a generator of SystemResourceUsage. + """ + time_start, time_end = self.phases[phase] + + return self.range_usage(time_start, time_end) + + def between_events_usage(self, start_event, end_event): + """Obtain usage data between two point events. + + This is a generator of SystemResourceUsage. + """ + start_time = None + end_time = None + + for t, name in self.events: + if name == start_event: + start_time = t + elif name == end_event: + end_time = t + + if start_time is None: + raise Exception("Could not find start event: %s" % start_event) + + if end_time is None: + raise Exception("Could not find end event: %s" % end_event) + + return self.range_usage(start_time, end_time) + + def aggregate_cpu_percent(self, start=None, end=None, phase=None, per_cpu=True): + """Obtain the aggregate CPU percent usage for a range. + + Returns a list of floats representing average CPU usage percentage per + core if per_cpu is True (the default). If per_cpu is False, return a + single percentage value. + + By default this will return data for the entire instrumented interval. + If phase is defined, data for a named phase will be returned. If start + and end are defined, these times will be fed into range_usage(). 
+ """ + cpu = [[] for i in range(0, self._cpu_cores)] + + if phase: + data = self.phase_usage(phase) + else: + data = self.range_usage(start, end) + + for usage in data: + for i, v in enumerate(usage.cpu_percent): + cpu[i].append(v) + + samples = len(cpu[0]) + + if not samples: + return 0 + + if per_cpu: + # pylint --py3k W1619 + return [sum(x) / samples for x in cpu] + + cores = [sum(x) for x in cpu] + + # pylint --py3k W1619 + return sum(cores) / len(cpu) / samples + + def aggregate_cpu_times(self, start=None, end=None, phase=None, per_cpu=True): + """Obtain the aggregate CPU times for a range. + + If per_cpu is True (the default), this returns a list of named tuples. + Each tuple is as if it were returned by psutil.cpu_times(). If per_cpu + is False, this returns a single named tuple of the aforementioned type. + """ + empty = [0 for i in range(0, self._cpu_times_len)] + cpu = [list(empty) for i in range(0, self._cpu_cores)] + + if phase: + data = self.phase_usage(phase) + else: + data = self.range_usage(start, end) + + for usage in data: + for i, core_values in enumerate(usage.cpu_times): + for j, v in enumerate(core_values): + cpu[i][j] += v + + if per_cpu: + return [self._cpu_times_type(*v) for v in cpu] + + sums = list(empty) + for core in cpu: + for i, v in enumerate(core): + sums[i] += v + + return self._cpu_times_type(*sums) + + def aggregate_io(self, start=None, end=None, phase=None): + """Obtain aggregate I/O counters for a range. + + Returns an iostat named tuple from psutil. + """ + + io = [0 for i in range(self._io_len)] + + if phase: + data = self.phase_usage(phase) + else: + data = self.range_usage(start, end) + + for usage in data: + for i, v in enumerate(usage.io): + io[i] += v + + return self._io_type(*io) + + def min_memory_available(self, start=None, end=None, phase=None): + """Return the minimum observed available memory number from a range. + + Returns long bytes of memory available. + + See psutil for notes on how this is calculated. + """ + if phase: + data = self.phase_usage(phase) + else: + data = self.range_usage(start, end) + + values = [] + + for usage in data: + values.append(usage.virt.available) + + return min(values) + + def max_memory_percent(self, start=None, end=None, phase=None): + """Returns the maximum percentage of system memory used. + + Returns a float percentage. 1.00 would mean all system memory was in + use at one point. + """ + if phase: + data = self.phase_usage(phase) + else: + data = self.range_usage(start, end) + + values = [] + + for usage in data: + values.append(usage.virt.percent) + + return max(values) + + def as_dict(self): + """Convert the recorded data to a dict, suitable for serialization. + + The returned dict has the following keys: + + version - Integer version number being rendered. Currently 2. + cpu_times_fields - A list of the names of the CPU times fields. + io_fields - A list of the names of the I/O fields. + virt_fields - A list of the names of the virtual memory fields. + swap_fields - A list of the names of the swap memory fields. + samples - A list of dicts containing low-level measurements. + events - A list of lists representing point events. The inner list + has 2 elements, the float wall time of the event and the string + event name. + phases - A list of dicts describing phases. Each phase looks a lot + like an entry from samples (see below). Some phases may not have + data recorded against them, so some keys may be None. + overall - A dict representing overall resource usage. This resembles + a sample entry. 
+ system - Contains additional information about the system including + number of processors and amount of memory. + + Each entry in the sample list is a dict with the following keys: + + start - Float wall time this measurement began on. + end - Float wall time this measurement ended on. + io - List of numerics for I/O values. + virt - List of numerics for virtual memory values. + swap - List of numerics for swap memory values. + cpu_percent - List of floats representing CPU percent on each core. + cpu_times - List of lists. Main list is each core. Inner lists are + lists of floats representing CPU times on that core. + cpu_percent_mean - Float of mean CPU percent across all cores. + cpu_times_sum - List of floats representing the sum of CPU times + across all cores. + cpu_times_total - Float representing the sum of all CPU times across + all cores. This is useful for calculating the percent in each CPU + time. + """ + + o = dict( + version=2, + cpu_times_fields=list(self._cpu_times_type._fields), + io_fields=list(self._io_type._fields), + virt_fields=list(self._virt_type._fields), + swap_fields=list(self._swap_type._fields), + samples=[], + phases=[], + system={}, + ) + + def populate_derived(e): + if e["cpu_percent_cores"]: + # pylint --py3k W1619 + e["cpu_percent_mean"] = sum(e["cpu_percent_cores"]) / len( + e["cpu_percent_cores"] + ) + else: + e["cpu_percent_mean"] = None + + if e["cpu_times"]: + e["cpu_times_sum"] = [0.0] * self._cpu_times_len + for i in range(0, self._cpu_times_len): + e["cpu_times_sum"][i] = sum(core[i] for core in e["cpu_times"]) + + e["cpu_times_total"] = sum(e["cpu_times_sum"]) + + def phase_entry(name, start, end): + e = dict( + name=name, + start=start, + end=end, + duration=end - start, + cpu_percent_cores=self.aggregate_cpu_percent(phase=name), + cpu_times=[list(c) for c in self.aggregate_cpu_times(phase=name)], + io=list(self.aggregate_io(phase=name)), + ) + populate_derived(e) + return e + + for m in self.measurements: + e = dict( + start=m.start, + end=m.end, + io=list(m.io), + virt=list(m.virt), + swap=list(m.swap), + cpu_percent_cores=list(m.cpu_percent), + cpu_times=list(list(cpu) for cpu in m.cpu_times), + ) + + populate_derived(e) + o["samples"].append(e) + + if o["samples"]: + o["start"] = o["samples"][0]["start"] + o["end"] = o["samples"][-1]["end"] + o["duration"] = o["end"] - o["start"] + o["overall"] = phase_entry(None, o["start"], o["end"]) + else: + o["start"] = None + o["end"] = None + o["duration"] = None + o["overall"] = None + + o["events"] = [list(ev) for ev in self.events] + + for phase, v in self.phases.items(): + o["phases"].append(phase_entry(phase, v[0], v[1])) + + if have_psutil: + o["system"].update( + dict( + cpu_logical_count=psutil.cpu_count(logical=True), + cpu_physical_count=psutil.cpu_count(logical=False), + swap_total=psutil.swap_memory()[0], + vmem_total=psutil.virtual_memory()[0], + ) + ) + + return o + + def as_profile(self): + profile_time = time.monotonic() + start_time = self.start_time + profile = { + "meta": { + "processType": 0, + "product": "mach", + "stackwalk": 0, + "version": 27, + "preprocessedProfileVersion": 47, + "symbolicationNotSupported": True, + "interval": self.poll_interval * 1000, + "startTime": self.start_timestamp * 1000, + "profilingStartTime": 0, + "logicalCPUs": psutil.cpu_count(logical=True), + "physicalCPUs": psutil.cpu_count(logical=False), + "mainMemory": psutil.virtual_memory()[0], + "markerSchema": [ + { + "name": "Phase", + "tooltipLabel": "{marker.data.phase}", + "tableLabel": 
"{marker.name} — {marker.data.phase} — CPU time: {marker.data.cpuTime} ({marker.data.cpuPercent})", + "chartLabel": "{marker.data.phase}", + "display": [ + "marker-chart", + "marker-table", + "timeline-overview", + ], + "data": [ + { + "key": "cpuTime", + "label": "CPU Time", + "format": "duration", + }, + { + "key": "cpuPercent", + "label": "CPU Percent", + "format": "string", + }, + ], + }, + { + "name": "Text", + "tooltipLabel": "{marker.name}", + "tableLabel": "{marker.name} — {marker.data.text}", + "chartLabel": "{marker.data.text}", + "display": ["marker-chart", "marker-table"], + "data": [ + { + "key": "text", + "label": "Description", + "format": "string", + "searchable": True, + } + ], + }, + { + "name": "Mem", + "tooltipLabel": "{marker.name}", + "display": [], + "data": [ + {"key": "used", "label": "Memory Used", "format": "bytes"}, + { + "key": "cached", + "label": "Memory cached", + "format": "bytes", + }, + { + "key": "buffers", + "label": "Memory buffers", + "format": "bytes", + }, + ], + "graphs": [ + {"key": "used", "color": "orange", "type": "line-filled"} + ], + }, + { + "name": "IO", + "tooltipLabel": "{marker.name}", + "display": [], + "data": [ + { + "key": "write_bytes", + "label": "Written", + "format": "bytes", + }, + { + "key": "write_count", + "label": "Write count", + "format": "integer", + }, + {"key": "read_bytes", "label": "Read", "format": "bytes"}, + { + "key": "read_count", + "label": "Read count", + "format": "integer", + }, + ], + "graphs": [ + {"key": "read_bytes", "color": "green", "type": "bar"}, + {"key": "write_bytes", "color": "red", "type": "bar"}, + ], + }, + ], + "usesOnlyOneStackType": True, + }, + "libs": [], + "threads": [ + { + "processType": "default", + "processName": "mach", + "processStartupTime": 0, + "processShutdownTime": None, + "registerTime": 0, + "unregisterTime": None, + "pausedRanges": [], + "showMarkersInTimeline": True, + "name": "", + "isMainThread": False, + "pid": "0", + "tid": 0, + "samples": { + "weightType": "samples", + "weight": None, + "stack": [], + "time": [], + "length": 0, + }, + "stringArray": ["(root)"], + "markers": { + "data": [], + "name": [], + "startTime": [], + "endTime": [], + "phase": [], + "category": [], + "length": 0, + }, + "stackTable": { + "frame": [0], + "prefix": [None], + "category": [0], + "subcategory": [0], + "length": 1, + }, + "frameTable": { + "address": [-1], + "inlineDepth": [0], + "category": [None], + "subcategory": [0], + "func": [0], + "nativeSymbol": [None], + "innerWindowID": [0], + "implementation": [None], + "line": [None], + "column": [None], + "length": 1, + }, + "funcTable": { + "isJS": [False], + "relevantForJS": [False], + "name": [0], + "resource": [-1], + "fileName": [None], + "lineNumber": [None], + "columnNumber": [None], + "length": 1, + }, + "resourceTable": { + "lib": [], + "name": [], + "host": [], + "type": [], + "length": 0, + }, + "nativeSymbols": { + "libIndex": [], + "address": [], + "name": [], + "functionSize": [], + "length": 0, + }, + } + ], + "counters": [], + } + + firstThread = profile["threads"][0] + markers = firstThread["markers"] + for key in self.metadata: + profile["meta"][key] = self.metadata[key] + + def get_string_index(string): + stringArray = firstThread["stringArray"] + try: + return stringArray.index(string) + except ValueError: + stringArray.append(string) + return len(stringArray) - 1 + + def add_marker(name_index, start, end, data, precision=None): + # The precision argument allows setting how many digits after the + # decimal point are 
desired. + # For resource use samples where we sample with a timer, an integer + # number of ms is good enough. + # For short duration markers, the profiler front-end may show up to + # 3 digits after the decimal point (ie. µs precision). + markers["startTime"].append(round((start - start_time) * 1000, precision)) + if end is None: + markers["endTime"].append(None) + # 0 = Instant marker + markers["phase"].append(0) + else: + markers["endTime"].append(round((end - start_time) * 1000, precision)) + # 1 = marker with start and end times, 2 = start but no end. + markers["phase"].append(1) + markers["category"].append(0) + markers["name"].append(name_index) + markers["data"].append(data) + markers["length"] = markers["length"] + 1 + + def format_percent(value): + return str(round(value, 1)) + "%" + + samples = firstThread["samples"] + samples["stack"].append(0) + samples["time"].append(0) + + cpu_string_index = get_string_index("CPU Use") + memory_string_index = get_string_index("Memory") + io_string_index = get_string_index("IO") + valid_cpu_fields = set() + for m in self.measurements: + # Ignore samples that are much too short. + if m.end - m.start < self.poll_interval / 10: + continue + + # Sample times + samples["stack"].append(0) + samples["time"].append(round((m.end - start_time) * 1000)) + + # CPU + markerData = { + "type": "CPU", + "cpuPercent": format_percent( + sum(list(m.cpu_percent)) / len(m.cpu_percent) + ), + } + + # due to inconsistencies in the sampling rate, sometimes the + # cpu_times add up to more than 100%, causing annoying + # spikes in the CPU use charts. Avoid them by dividing the + # values by the total if it is above 1. + total = 0 + for field in ["nice", "user", "system", "iowait", "softirq", "idle"]: + if hasattr(m.cpu_times[0], field): + total += sum(getattr(core, field) for core in m.cpu_times) / ( + m.end - m.start + ) + divisor = total if total > 1 else 1 + + total = 0 + for field in ["nice", "user", "system", "iowait", "softirq"]: + if hasattr(m.cpu_times[0], field): + total += ( + sum(getattr(core, field) for core in m.cpu_times) + / (m.end - m.start) + / divisor + ) + if total > 0: + valid_cpu_fields.add(field) + markerData[field] = round(total, 3) + for field in ["nice", "user", "system", "iowait", "idle"]: + if hasattr(m.cpu_times[0], field): + markerData[field + "_pct"] = format_percent( + 100 + * sum(getattr(core, field) for core in m.cpu_times) + / (m.end - m.start) + / len(m.cpu_times) + ) + add_marker(cpu_string_index, m.start, m.end, markerData) + + # Memory + markerData = {"type": "Mem", "used": m.virt.used} + if hasattr(m.virt, "cached"): + markerData["cached"] = m.virt.cached + if hasattr(m.virt, "buffers"): + markerData["buffers"] = m.virt.buffers + add_marker(memory_string_index, m.start, m.end, markerData) + + # IO + markerData = { + "type": "IO", + "read_count": m.io.read_count, + "read_bytes": m.io.read_bytes, + "write_count": m.io.write_count, + "write_bytes": m.io.write_bytes, + } + add_marker(io_string_index, m.start, m.end, markerData) + samples["length"] = len(samples["stack"]) + + # The marker schema for CPU markers should only contain graph + # definitions for fields we actually have, or the profiler front-end + # will detect missing data and skip drawing the track entirely. 
+ cpuSchema = { + "name": "CPU", + "tooltipLabel": "{marker.name}", + "display": [], + "data": [{"key": "cpuPercent", "label": "CPU Percent", "format": "string"}], + "graphs": [], + } + cpuData = cpuSchema["data"] + for field, label in { + "user": "User %", + "iowait": "IO Wait %", + "system": "System %", + "nice": "Nice %", + "idle": "Idle %", + }.items(): + if field in valid_cpu_fields or field == "idle": + cpuData.append( + {"key": field + "_pct", "label": label, "format": "string"} + ) + cpuGraphs = cpuSchema["graphs"] + for field, color in { + "softirq": "orange", + "iowait": "red", + "system": "grey", + "user": "yellow", + "nice": "blue", + }.items(): + if field in valid_cpu_fields: + cpuGraphs.append({"key": field, "color": color, "type": "bar"}) + profile["meta"]["markerSchema"].insert(0, cpuSchema) + + # Create markers for phases + phase_string_index = get_string_index("Phase") + for phase, v in self.phases.items(): + markerData = {"type": "Phase", "phase": phase} + + cpu_percent_cores = self.aggregate_cpu_percent(phase=phase) + if cpu_percent_cores: + markerData["cpuPercent"] = format_percent( + sum(cpu_percent_cores) / len(cpu_percent_cores) + ) + + cpu_times = [list(c) for c in self.aggregate_cpu_times(phase=phase)] + cpu_times_sum = [0.0] * self._cpu_times_len + for i in range(0, self._cpu_times_len): + cpu_times_sum[i] = sum(core[i] for core in cpu_times) + total_cpu_time_ms = sum(cpu_times_sum) * 1000 + if total_cpu_time_ms > 0: + markerData["cpuTime"] = total_cpu_time_ms + + add_marker(phase_string_index, v[0], v[1], markerData, 3) + + # Add generic markers + for name, start, end, text in self.markers: + markerData = {"type": "Text"} + if text: + markerData["text"] = text + add_marker(get_string_index(name), start, end, markerData, 3) + if self.events: + event_string_index = get_string_index("Event") + for event_time, text in self.events: + if text: + add_marker( + event_string_index, + event_time, + None, + {"type": "Text", "text": text}, + 3, + ) + + # We may have spent some time generating this profile, and there might + # also have been some time elapsed between stopping the resource + # monitor, and the profile being created. These are hidden costs that + # we should account for as best as possible, and the best we can do + # is to make the profile contain information about this cost somehow. + # We extend the profile end time up to now rather than self.end_time, + # and add a phase covering that period of time. + now = time.monotonic() + profile["meta"]["profilingEndTime"] = round( + (now - self.start_time) * 1000 + 0.0005, 3 + ) + markerData = { + "type": "Phase", + "phase": "teardown", + } + add_marker(phase_string_index, self.stop_time, now, markerData, 3) + teardown_string_index = get_string_index("resourcemonitor") + markerData = { + "type": "Text", + "text": "stop", + } + add_marker(teardown_string_index, self.stop_time, self.end_time, markerData, 3) + markerData = { + "type": "Text", + "text": "as_profile", + } + add_marker(teardown_string_index, profile_time, now, markerData, 3) + + # Unfortunately, whatever the caller does with the profile (e.g. json) + # or after that (hopefully, exit) is not going to be counted, but we + # assume it's fast enough. 
+ return profile diff --git a/testing/mozbase/mozsystemmonitor/setup.cfg b/testing/mozbase/mozsystemmonitor/setup.cfg new file mode 100644 index 0000000000..3c6e79cf31 --- /dev/null +++ b/testing/mozbase/mozsystemmonitor/setup.cfg @@ -0,0 +1,2 @@ +[bdist_wheel] +universal=1 diff --git a/testing/mozbase/mozsystemmonitor/setup.py b/testing/mozbase/mozsystemmonitor/setup.py new file mode 100644 index 0000000000..7b575ef4bd --- /dev/null +++ b/testing/mozbase/mozsystemmonitor/setup.py @@ -0,0 +1,33 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import os + +from setuptools import setup + +PACKAGE_VERSION = "1.0.1" + +try: + pwd = os.path.dirname(os.path.abspath(__file__)) + description = open(os.path.join(pwd, "README.rst")).read() +except Exception: + description = "" + +setup( + name="mozsystemmonitor", + description="Monitor system resource usage.", + long_description="see https://firefox-source-docs.mozilla.org/mozbase/index.html", + classifiers=[ + "Programming Language :: Python :: 3", + ], + # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers + license="MPL 2.0", + keywords="mozilla", + author="Mozilla Automation and Tools Team", + author_email="tools@lists.mozilla.org", + url="https://wiki.mozilla.org/Auto-tools/Projects/Mozbase", + packages=["mozsystemmonitor"], + version=PACKAGE_VERSION, + install_requires=["psutil >= 3.1.1"], +) diff --git a/testing/mozbase/mozsystemmonitor/tests/manifest.toml b/testing/mozbase/mozsystemmonitor/tests/manifest.toml new file mode 100644 index 0000000000..babb20c1b0 --- /dev/null +++ b/testing/mozbase/mozsystemmonitor/tests/manifest.toml @@ -0,0 +1,4 @@ +[DEFAULT] +subsuite = "mozbase" + +["test_resource_monitor.py"] diff --git a/testing/mozbase/mozsystemmonitor/tests/test_resource_monitor.py b/testing/mozbase/mozsystemmonitor/tests/test_resource_monitor.py new file mode 100644 index 0000000000..6de99152b9 --- /dev/null +++ b/testing/mozbase/mozsystemmonitor/tests/test_resource_monitor.py @@ -0,0 +1,183 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import multiprocessing +import time +import unittest + +import mozunit +from six import integer_types + +try: + import psutil +except ImportError: + psutil = None + +from mozsystemmonitor.resourcemonitor import SystemResourceMonitor, SystemResourceUsage + + +@unittest.skipIf(psutil is None, "Resource monitor requires psutil.") +class TestResourceMonitor(unittest.TestCase): + def test_basic(self): + monitor = SystemResourceMonitor(poll_interval=0.5) + + monitor.start() + time.sleep(3) + + monitor.stop() + + data = list(monitor.range_usage()) + self.assertGreater(len(data), 3) + + self.assertIsInstance(data[0], SystemResourceUsage) + + def test_empty(self): + monitor = SystemResourceMonitor(poll_interval=2.0) + monitor.start() + monitor.stop() + + data = list(monitor.range_usage()) + self.assertEqual(len(data), 0) + + def test_phases(self): + monitor = SystemResourceMonitor(poll_interval=0.25) + + monitor.start() + time.sleep(1) + + with monitor.phase("phase1"): + time.sleep(1) + + with monitor.phase("phase2"): + time.sleep(1) + + monitor.stop() + + self.assertEqual(len(monitor.phases), 2) + self.assertEqual(["phase2", "phase1"], list(monitor.phases.keys())) + + all = list(monitor.range_usage()) + data1 = list(monitor.phase_usage("phase1")) + data2 = list(monitor.phase_usage("phase2")) + + self.assertGreater(len(all), len(data1)) + self.assertGreater(len(data1), len(data2)) + + # This could fail if time.monotonic() takes more than 0.1s. It really + # shouldn't. + self.assertAlmostEqual(data1[-1].end, data2[-1].end, delta=0.25) + + def test_no_data(self): + monitor = SystemResourceMonitor() + + data = list(monitor.range_usage()) + self.assertEqual(len(data), 0) + + def test_events(self): + monitor = SystemResourceMonitor(poll_interval=0.25) + + monitor.start() + time.sleep(0.5) + + t0 = time.monotonic() + monitor.record_event("t0") + time.sleep(2) + + monitor.record_event("t1") + time.sleep(0.5) + monitor.stop() + + events = monitor.events + self.assertEqual(len(events), 2) + + event = events[0] + + self.assertEqual(event[1], "t0") + self.assertAlmostEqual(event[0], t0, delta=0.25) + + data = list(monitor.between_events_usage("t0", "t1")) + self.assertGreater(len(data), 0) + + def test_aggregate_cpu(self): + monitor = SystemResourceMonitor(poll_interval=0.25) + + monitor.start() + time.sleep(1) + monitor.stop() + + values = monitor.aggregate_cpu_percent() + self.assertIsInstance(values, list) + self.assertEqual(len(values), multiprocessing.cpu_count()) + for v in values: + self.assertIsInstance(v, float) + + value = monitor.aggregate_cpu_percent(per_cpu=False) + self.assertIsInstance(value, float) + + values = monitor.aggregate_cpu_times() + self.assertIsInstance(values, list) + self.assertGreater(len(values), 0) + self.assertTrue(hasattr(values[0], "user")) + + t = type(values[0]) + + value = monitor.aggregate_cpu_times(per_cpu=False) + self.assertIsInstance(value, t) + + def test_aggregate_io(self): + monitor = SystemResourceMonitor(poll_interval=0.25) + + # There's really no easy way to ensure I/O occurs. For all we know + # reads and writes will all be serviced by the page cache. 
+ monitor.start() + time.sleep(1.0) + monitor.stop() + + values = monitor.aggregate_io() + self.assertTrue(hasattr(values, "read_count")) + + def test_memory(self): + monitor = SystemResourceMonitor(poll_interval=0.25) + + monitor.start() + time.sleep(1.0) + monitor.stop() + + v = monitor.min_memory_available() + self.assertIsInstance(v, integer_types) + + v = monitor.max_memory_percent() + self.assertIsInstance(v, float) + + def test_as_dict(self): + monitor = SystemResourceMonitor(poll_interval=0.25) + + monitor.start() + time.sleep(0.1) + monitor.begin_phase("phase1") + monitor.record_event("foo") + time.sleep(0.1) + monitor.begin_phase("phase2") + monitor.record_event("bar") + time.sleep(0.2) + monitor.finish_phase("phase1") + time.sleep(0.2) + monitor.finish_phase("phase2") + time.sleep(0.4) + monitor.stop() + + d = monitor.as_dict() + + self.assertEqual(d["version"], 2) + self.assertEqual(len(d["events"]), 2) + self.assertEqual(len(d["phases"]), 2) + self.assertIn("system", d) + self.assertIsInstance(d["system"], dict) + self.assertIsInstance(d["overall"], dict) + self.assertIn("duration", d["overall"]) + self.assertIn("cpu_times", d["overall"]) + + +if __name__ == "__main__": + mozunit.main() |
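
Once a monitor has been stopped, the two reporting methods added in this commit can be serialized directly: as_dict() produces the version-2 summary documented in its docstring, and as_profile() produces a Gecko-profiler-format document that appears intended for the Firefox Profiler front-end. A minimal sketch of writing both to disk (file names and gzip compression are illustrative choices, and `monitor` is assumed to be an already started and stopped SystemResourceMonitor, as in the usage sketch near the top):

    import gzip
    import json

    # Plain JSON summary (version-2 dict with samples, phases, events, overall).
    with open("resource-usage.json", "w") as fh:
        json.dump(monitor.as_dict(), fh)

    # Profiler-format document; gzip keeps the marker-heavy output small.
    with gzip.open("profile.json.gz", "wt") as fh:
        json.dump(monitor.as_profile(), fh)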