diff options
Diffstat (limited to '')
-rw-r--r-- | tools/kvm/kvm_stat/Makefile | 42 | ||||
-rwxr-xr-x | tools/kvm/kvm_stat/kvm_stat | 1888 | ||||
-rw-r--r-- | tools/kvm/kvm_stat/kvm_stat.service | 17 | ||||
-rw-r--r-- | tools/kvm/kvm_stat/kvm_stat.txt | 124 |
4 files changed, 2071 insertions, 0 deletions
diff --git a/tools/kvm/kvm_stat/Makefile b/tools/kvm/kvm_stat/Makefile new file mode 100644 index 0000000000..c3e36c60d4 --- /dev/null +++ b/tools/kvm/kvm_stat/Makefile @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0 +include ../../scripts/Makefile.include +include ../../scripts/utilities.mak +BINDIR=usr/bin +MANDIR=usr/share/man +MAN1DIR=$(MANDIR)/man1 + +MAN1=kvm_stat.1 + +A2X=a2x +a2x_path := $(call get-executable,$(A2X)) + +all: man + +ifneq ($(findstring $(MAKEFLAGS),s),s) + ifneq ($(V),1) + QUIET_A2X = @echo ' A2X '$@; + endif +endif + +%.1: %.txt +ifeq ($(a2x_path),) + $(error "You need to install asciidoc for man pages") +else + $(QUIET_A2X)$(A2X) --doctype manpage --format manpage $< +endif + +clean: + rm -f $(MAN1) + +man: $(MAN1) + +install-man: man + install -d -m 755 $(INSTALL_ROOT)/$(MAN1DIR) + install -m 644 kvm_stat.1 $(INSTALL_ROOT)/$(MAN1DIR) + +install-tools: + install -d -m 755 $(INSTALL_ROOT)/$(BINDIR) + install -m 755 -p "kvm_stat" "$(INSTALL_ROOT)/$(BINDIR)/$(TARGET)" + +install: install-tools install-man +.PHONY: all clean man install-tools install-man install diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat new file mode 100755 index 0000000000..15bf00e79e --- /dev/null +++ b/tools/kvm/kvm_stat/kvm_stat @@ -0,0 +1,1888 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# top-like utility for displaying kvm statistics +# +# Copyright 2006-2008 Qumranet Technologies +# Copyright 2008-2011 Red Hat, Inc. +# +# Authors: +# Avi Kivity <avi@redhat.com> +# +"""The kvm_stat module outputs statistics about running KVM VMs + +Three different ways of output formatting are available: +- as a top-like text ui +- in a key -> value format +- in an all keys, all values format + +The data is sampled from the KVM's debugfs entries and its perf events. +""" +from __future__ import print_function + +import curses +import sys +import locale +import os +import time +import argparse +import ctypes +import fcntl +import resource +import struct +import re +import subprocess +import signal +from collections import defaultdict, namedtuple +from functools import reduce +from datetime import datetime + +VMX_EXIT_REASONS = { + 'EXCEPTION_NMI': 0, + 'EXTERNAL_INTERRUPT': 1, + 'TRIPLE_FAULT': 2, + 'INIT_SIGNAL': 3, + 'SIPI_SIGNAL': 4, + 'INTERRUPT_WINDOW': 7, + 'NMI_WINDOW': 8, + 'TASK_SWITCH': 9, + 'CPUID': 10, + 'HLT': 12, + 'INVD': 13, + 'INVLPG': 14, + 'RDPMC': 15, + 'RDTSC': 16, + 'VMCALL': 18, + 'VMCLEAR': 19, + 'VMLAUNCH': 20, + 'VMPTRLD': 21, + 'VMPTRST': 22, + 'VMREAD': 23, + 'VMRESUME': 24, + 'VMWRITE': 25, + 'VMOFF': 26, + 'VMON': 27, + 'CR_ACCESS': 28, + 'DR_ACCESS': 29, + 'IO_INSTRUCTION': 30, + 'MSR_READ': 31, + 'MSR_WRITE': 32, + 'INVALID_STATE': 33, + 'MSR_LOAD_FAIL': 34, + 'MWAIT_INSTRUCTION': 36, + 'MONITOR_TRAP_FLAG': 37, + 'MONITOR_INSTRUCTION': 39, + 'PAUSE_INSTRUCTION': 40, + 'MCE_DURING_VMENTRY': 41, + 'TPR_BELOW_THRESHOLD': 43, + 'APIC_ACCESS': 44, + 'EOI_INDUCED': 45, + 'GDTR_IDTR': 46, + 'LDTR_TR': 47, + 'EPT_VIOLATION': 48, + 'EPT_MISCONFIG': 49, + 'INVEPT': 50, + 'RDTSCP': 51, + 'PREEMPTION_TIMER': 52, + 'INVVPID': 53, + 'WBINVD': 54, + 'XSETBV': 55, + 'APIC_WRITE': 56, + 'RDRAND': 57, + 'INVPCID': 58, + 'VMFUNC': 59, + 'ENCLS': 60, + 'RDSEED': 61, + 'PML_FULL': 62, + 'XSAVES': 63, + 'XRSTORS': 64, + 'UMWAIT': 67, + 'TPAUSE': 68, + 'BUS_LOCK': 74, + 'NOTIFY': 75, +} + +SVM_EXIT_REASONS = { + 'READ_CR0': 0x000, + 'READ_CR2': 0x002, + 'READ_CR3': 0x003, + 'READ_CR4': 0x004, + 'READ_CR8': 0x008, + 'WRITE_CR0': 0x010, + 'WRITE_CR2': 0x012, + 'WRITE_CR3': 0x013, + 'WRITE_CR4': 0x014, + 'WRITE_CR8': 0x018, + 'READ_DR0': 0x020, + 'READ_DR1': 0x021, + 'READ_DR2': 0x022, + 'READ_DR3': 0x023, + 'READ_DR4': 0x024, + 'READ_DR5': 0x025, + 'READ_DR6': 0x026, + 'READ_DR7': 0x027, + 'WRITE_DR0': 0x030, + 'WRITE_DR1': 0x031, + 'WRITE_DR2': 0x032, + 'WRITE_DR3': 0x033, + 'WRITE_DR4': 0x034, + 'WRITE_DR5': 0x035, + 'WRITE_DR6': 0x036, + 'WRITE_DR7': 0x037, + 'EXCP_BASE': 0x040, + 'LAST_EXCP': 0x05f, + 'INTR': 0x060, + 'NMI': 0x061, + 'SMI': 0x062, + 'INIT': 0x063, + 'VINTR': 0x064, + 'CR0_SEL_WRITE': 0x065, + 'IDTR_READ': 0x066, + 'GDTR_READ': 0x067, + 'LDTR_READ': 0x068, + 'TR_READ': 0x069, + 'IDTR_WRITE': 0x06a, + 'GDTR_WRITE': 0x06b, + 'LDTR_WRITE': 0x06c, + 'TR_WRITE': 0x06d, + 'RDTSC': 0x06e, + 'RDPMC': 0x06f, + 'PUSHF': 0x070, + 'POPF': 0x071, + 'CPUID': 0x072, + 'RSM': 0x073, + 'IRET': 0x074, + 'SWINT': 0x075, + 'INVD': 0x076, + 'PAUSE': 0x077, + 'HLT': 0x078, + 'INVLPG': 0x079, + 'INVLPGA': 0x07a, + 'IOIO': 0x07b, + 'MSR': 0x07c, + 'TASK_SWITCH': 0x07d, + 'FERR_FREEZE': 0x07e, + 'SHUTDOWN': 0x07f, + 'VMRUN': 0x080, + 'VMMCALL': 0x081, + 'VMLOAD': 0x082, + 'VMSAVE': 0x083, + 'STGI': 0x084, + 'CLGI': 0x085, + 'SKINIT': 0x086, + 'RDTSCP': 0x087, + 'ICEBP': 0x088, + 'WBINVD': 0x089, + 'MONITOR': 0x08a, + 'MWAIT': 0x08b, + 'MWAIT_COND': 0x08c, + 'XSETBV': 0x08d, + 'RDPRU': 0x08e, + 'EFER_WRITE_TRAP': 0x08f, + 'CR0_WRITE_TRAP': 0x090, + 'CR1_WRITE_TRAP': 0x091, + 'CR2_WRITE_TRAP': 0x092, + 'CR3_WRITE_TRAP': 0x093, + 'CR4_WRITE_TRAP': 0x094, + 'CR5_WRITE_TRAP': 0x095, + 'CR6_WRITE_TRAP': 0x096, + 'CR7_WRITE_TRAP': 0x097, + 'CR8_WRITE_TRAP': 0x098, + 'CR9_WRITE_TRAP': 0x099, + 'CR10_WRITE_TRAP': 0x09a, + 'CR11_WRITE_TRAP': 0x09b, + 'CR12_WRITE_TRAP': 0x09c, + 'CR13_WRITE_TRAP': 0x09d, + 'CR14_WRITE_TRAP': 0x09e, + 'CR15_WRITE_TRAP': 0x09f, + 'INVPCID': 0x0a2, + 'NPF': 0x400, + 'AVIC_INCOMPLETE_IPI': 0x401, + 'AVIC_UNACCELERATED_ACCESS': 0x402, + 'VMGEXIT': 0x403, +} + +# EC definition of HSR (from arch/arm64/include/asm/esr.h) +AARCH64_EXIT_REASONS = { + 'UNKNOWN': 0x00, + 'WFx': 0x01, + 'CP15_32': 0x03, + 'CP15_64': 0x04, + 'CP14_MR': 0x05, + 'CP14_LS': 0x06, + 'FP_ASIMD': 0x07, + 'CP10_ID': 0x08, + 'PAC': 0x09, + 'CP14_64': 0x0C, + 'BTI': 0x0D, + 'ILL': 0x0E, + 'SVC32': 0x11, + 'HVC32': 0x12, + 'SMC32': 0x13, + 'SVC64': 0x15, + 'HVC64': 0x16, + 'SMC64': 0x17, + 'SYS64': 0x18, + 'SVE': 0x19, + 'ERET': 0x1A, + 'FPAC': 0x1C, + 'SME': 0x1D, + 'IMP_DEF': 0x1F, + 'IABT_LOW': 0x20, + 'IABT_CUR': 0x21, + 'PC_ALIGN': 0x22, + 'DABT_LOW': 0x24, + 'DABT_CUR': 0x25, + 'SP_ALIGN': 0x26, + 'FP_EXC32': 0x28, + 'FP_EXC64': 0x2C, + 'SERROR': 0x2F, + 'BREAKPT_LOW': 0x30, + 'BREAKPT_CUR': 0x31, + 'SOFTSTP_LOW': 0x32, + 'SOFTSTP_CUR': 0x33, + 'WATCHPT_LOW': 0x34, + 'WATCHPT_CUR': 0x35, + 'BKPT32': 0x38, + 'VECTOR32': 0x3A, + 'BRK64': 0x3C, +} + +# From include/uapi/linux/kvm.h, KVM_EXIT_xxx +USERSPACE_EXIT_REASONS = { + 'UNKNOWN': 0, + 'EXCEPTION': 1, + 'IO': 2, + 'HYPERCALL': 3, + 'DEBUG': 4, + 'HLT': 5, + 'MMIO': 6, + 'IRQ_WINDOW_OPEN': 7, + 'SHUTDOWN': 8, + 'FAIL_ENTRY': 9, + 'INTR': 10, + 'SET_TPR': 11, + 'TPR_ACCESS': 12, + 'S390_SIEIC': 13, + 'S390_RESET': 14, + 'DCR': 15, + 'NMI': 16, + 'INTERNAL_ERROR': 17, + 'OSI': 18, + 'PAPR_HCALL': 19, + 'S390_UCONTROL': 20, + 'WATCHDOG': 21, + 'S390_TSCH': 22, + 'EPR': 23, + 'SYSTEM_EVENT': 24, + 'S390_STSI': 25, + 'IOAPIC_EOI': 26, + 'HYPERV': 27, + 'ARM_NISV': 28, + 'X86_RDMSR': 29, + 'X86_WRMSR': 30, + 'DIRTY_RING_FULL': 31, + 'AP_RESET_HOLD': 32, + 'X86_BUS_LOCK': 33, + 'XEN': 34, + 'RISCV_SBI': 35, + 'RISCV_CSR': 36, + 'NOTIFY': 37, +} + +IOCTL_NUMBERS = { + 'SET_FILTER': 0x40082406, + 'ENABLE': 0x00002400, + 'DISABLE': 0x00002401, + 'RESET': 0x00002403, +} + +signal_received = False + +ENCODING = locale.getpreferredencoding(False) +TRACE_FILTER = re.compile(r'^[^\(]*$') + + +class Arch(object): + """Encapsulates global architecture specific data. + + Contains the performance event open syscall and ioctl numbers, as + well as the VM exit reasons for the architecture it runs on. + + """ + @staticmethod + def get_arch(): + machine = os.uname()[4] + + if machine.startswith('ppc'): + return ArchPPC() + elif machine.startswith('aarch64'): + return ArchA64() + elif machine.startswith('s390'): + return ArchS390() + else: + # X86_64 + for line in open('/proc/cpuinfo'): + if not line.startswith('flags'): + continue + + flags = line.split() + if 'vmx' in flags: + return ArchX86(VMX_EXIT_REASONS) + if 'svm' in flags: + return ArchX86(SVM_EXIT_REASONS) + return + + def tracepoint_is_child(self, field): + if (TRACE_FILTER.match(field)): + return None + return field.split('(', 1)[0] + + +class ArchX86(Arch): + def __init__(self, exit_reasons): + self.sc_perf_evt_open = 298 + self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reason_field = 'exit_reason' + self.exit_reasons = exit_reasons + + def debugfs_is_child(self, field): + """ Returns name of parent if 'field' is a child, None otherwise """ + return None + + +class ArchPPC(Arch): + def __init__(self): + self.sc_perf_evt_open = 319 + self.ioctl_numbers = IOCTL_NUMBERS + self.ioctl_numbers['ENABLE'] = 0x20002400 + self.ioctl_numbers['DISABLE'] = 0x20002401 + self.ioctl_numbers['RESET'] = 0x20002403 + + # PPC comes in 32 and 64 bit and some generated ioctl + # numbers depend on the wordsize. + char_ptr_size = ctypes.sizeof(ctypes.c_char_p) + self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 + self.exit_reason_field = 'exit_nr' + self.exit_reasons = {} + + def debugfs_is_child(self, field): + """ Returns name of parent if 'field' is a child, None otherwise """ + return None + + +class ArchA64(Arch): + def __init__(self): + self.sc_perf_evt_open = 241 + self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reason_field = 'esr_ec' + self.exit_reasons = AARCH64_EXIT_REASONS + + def debugfs_is_child(self, field): + """ Returns name of parent if 'field' is a child, None otherwise """ + return None + + +class ArchS390(Arch): + def __init__(self): + self.sc_perf_evt_open = 331 + self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reason_field = None + self.exit_reasons = None + + def debugfs_is_child(self, field): + """ Returns name of parent if 'field' is a child, None otherwise """ + if field.startswith('instruction_'): + return 'exit_instruction' + + +ARCH = Arch.get_arch() + + +class perf_event_attr(ctypes.Structure): + """Struct that holds the necessary data to set up a trace event. + + For an extensive explanation see perf_event_open(2) and + include/uapi/linux/perf_event.h, struct perf_event_attr + + All fields that are not initialized in the constructor are 0. + + """ + _fields_ = [('type', ctypes.c_uint32), + ('size', ctypes.c_uint32), + ('config', ctypes.c_uint64), + ('sample_freq', ctypes.c_uint64), + ('sample_type', ctypes.c_uint64), + ('read_format', ctypes.c_uint64), + ('flags', ctypes.c_uint64), + ('wakeup_events', ctypes.c_uint32), + ('bp_type', ctypes.c_uint32), + ('bp_addr', ctypes.c_uint64), + ('bp_len', ctypes.c_uint64), + ] + + def __init__(self): + super(self.__class__, self).__init__() + self.type = PERF_TYPE_TRACEPOINT + self.size = ctypes.sizeof(self) + self.read_format = PERF_FORMAT_GROUP + + +PERF_TYPE_TRACEPOINT = 2 +PERF_FORMAT_GROUP = 1 << 3 + + +class Group(object): + """Represents a perf event group.""" + + def __init__(self): + self.events = [] + + def add_event(self, event): + self.events.append(event) + + def read(self): + """Returns a dict with 'event name: value' for all events in the + group. + + Values are read by reading from the file descriptor of the + event that is the group leader. See perf_event_open(2) for + details. + + Read format for the used event configuration is: + struct read_format { + u64 nr; /* The number of events */ + struct { + u64 value; /* The value of the event */ + } values[nr]; + }; + + """ + length = 8 * (1 + len(self.events)) + read_format = 'xxxxxxxx' + 'Q' * len(self.events) + return dict(zip([event.name for event in self.events], + struct.unpack(read_format, + os.read(self.events[0].fd, length)))) + + +class Event(object): + """Represents a performance event and manages its life cycle.""" + def __init__(self, name, group, trace_cpu, trace_pid, trace_point, + trace_filter, trace_set='kvm'): + self.libc = ctypes.CDLL('libc.so.6', use_errno=True) + self.syscall = self.libc.syscall + self.name = name + self.fd = None + self._setup_event(group, trace_cpu, trace_pid, trace_point, + trace_filter, trace_set) + + def __del__(self): + """Closes the event's file descriptor. + + As no python file object was created for the file descriptor, + python will not reference count the descriptor and will not + close it itself automatically, so we do it. + + """ + if self.fd: + os.close(self.fd) + + def _perf_event_open(self, attr, pid, cpu, group_fd, flags): + """Wrapper for the sys_perf_evt_open() syscall. + + Used to set up performance events, returns a file descriptor or -1 + on error. + + Attributes are: + - syscall number + - struct perf_event_attr * + - pid or -1 to monitor all pids + - cpu number or -1 to monitor all cpus + - The file descriptor of the group leader or -1 to create a group. + - flags + + """ + return self.syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr), + ctypes.c_int(pid), ctypes.c_int(cpu), + ctypes.c_int(group_fd), ctypes.c_long(flags)) + + def _setup_event_attribute(self, trace_set, trace_point): + """Returns an initialized ctype perf_event_attr struct.""" + + id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set, + trace_point, 'id') + + event_attr = perf_event_attr() + event_attr.config = int(open(id_path).read()) + return event_attr + + def _setup_event(self, group, trace_cpu, trace_pid, trace_point, + trace_filter, trace_set): + """Sets up the perf event in Linux. + + Issues the syscall to register the event in the kernel and + then sets the optional filter. + + """ + + event_attr = self._setup_event_attribute(trace_set, trace_point) + + # First event will be group leader. + group_leader = -1 + + # All others have to pass the leader's descriptor instead. + if group.events: + group_leader = group.events[0].fd + + fd = self._perf_event_open(event_attr, trace_pid, + trace_cpu, group_leader, 0) + if fd == -1: + err = ctypes.get_errno() + raise OSError(err, os.strerror(err), + 'while calling sys_perf_event_open().') + + if trace_filter: + fcntl.ioctl(fd, ARCH.ioctl_numbers['SET_FILTER'], + trace_filter) + + self.fd = fd + + def enable(self): + """Enables the trace event in the kernel. + + Enabling the group leader makes reading counters from it and the + events under it possible. + + """ + fcntl.ioctl(self.fd, ARCH.ioctl_numbers['ENABLE'], 0) + + def disable(self): + """Disables the trace event in the kernel. + + Disabling the group leader makes reading all counters under it + impossible. + + """ + fcntl.ioctl(self.fd, ARCH.ioctl_numbers['DISABLE'], 0) + + def reset(self): + """Resets the count of the trace event in the kernel.""" + fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0) + + +class Provider(object): + """Encapsulates functionalities used by all providers.""" + def __init__(self, pid): + self.child_events = False + self.pid = pid + + @staticmethod + def is_field_wanted(fields_filter, field): + """Indicate whether field is valid according to fields_filter.""" + if not fields_filter: + return True + return re.match(fields_filter, field) is not None + + @staticmethod + def walkdir(path): + """Returns os.walk() data for specified directory. + + As it is only a wrapper it returns the same 3-tuple of (dirpath, + dirnames, filenames). + """ + return next(os.walk(path)) + + +class TracepointProvider(Provider): + """Data provider for the stats class. + + Manages the events/groups from which it acquires its data. + + """ + def __init__(self, pid, fields_filter): + self.group_leaders = [] + self.filters = self._get_filters() + self.update_fields(fields_filter) + super(TracepointProvider, self).__init__(pid) + + @staticmethod + def _get_filters(): + """Returns a dict of trace events, their filter ids and + the values that can be filtered. + + Trace events can be filtered for special values by setting a + filter string via an ioctl. The string normally has the format + identifier==value. For each filter a new event will be created, to + be able to distinguish the events. + + """ + filters = {} + filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS) + if ARCH.exit_reason_field and ARCH.exit_reasons: + filters['kvm_exit'] = (ARCH.exit_reason_field, ARCH.exit_reasons) + return filters + + def _get_available_fields(self): + """Returns a list of available events of format 'event name(filter + name)'. + + All available events have directories under + /sys/kernel/tracing/events/ which export information + about the specific event. Therefore, listing the dirs gives us + a list of all available events. + + Some events like the vm exit reasons can be filtered for + specific values. To take account for that, the routine below + creates special fields with the following format: + event name(filter name) + + """ + path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm') + fields = self.walkdir(path)[1] + extra = [] + for field in fields: + if field in self.filters: + filter_name_, filter_dicts = self.filters[field] + for name in filter_dicts: + extra.append(field + '(' + name + ')') + fields += extra + return fields + + def update_fields(self, fields_filter): + """Refresh fields, applying fields_filter""" + self.fields = [field for field in self._get_available_fields() + if self.is_field_wanted(fields_filter, field)] + # add parents for child fields - otherwise we won't see any output! + for field in self._fields: + parent = ARCH.tracepoint_is_child(field) + if (parent and parent not in self._fields): + self.fields.append(parent) + + @staticmethod + def _get_online_cpus(): + """Returns a list of cpu id integers.""" + def parse_int_list(list_string): + """Returns an int list from a string of comma separated integers and + integer ranges.""" + integers = [] + members = list_string.split(',') + + for member in members: + if '-' not in member: + integers.append(int(member)) + else: + int_range = member.split('-') + integers.extend(range(int(int_range[0]), + int(int_range[1]) + 1)) + + return integers + + with open('/sys/devices/system/cpu/online') as cpu_list: + cpu_string = cpu_list.readline() + return parse_int_list(cpu_string) + + def _setup_traces(self): + """Creates all event and group objects needed to be able to retrieve + data.""" + fields = self._get_available_fields() + if self._pid > 0: + # Fetch list of all threads of the monitored pid, as qemu + # starts a thread for each vcpu. + path = os.path.join('/proc', str(self._pid), 'task') + groupids = self.walkdir(path)[1] + else: + groupids = self._get_online_cpus() + + # The constant is needed as a buffer for python libs, std + # streams and other files that the script opens. + newlim = len(groupids) * len(fields) + 50 + try: + softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE) + + if hardlim < newlim: + # Now we need CAP_SYS_RESOURCE, to increase the hard limit. + resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, newlim)) + else: + # Raising the soft limit is sufficient. + resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, hardlim)) + + except ValueError: + sys.exit("NOFILE rlimit could not be raised to {0}".format(newlim)) + + for groupid in groupids: + group = Group() + for name in fields: + tracepoint = name + tracefilter = None + match = re.match(r'(.*)\((.*)\)', name) + if match: + tracepoint, sub = match.groups() + tracefilter = ('%s==%d\0' % + (self.filters[tracepoint][0], + self.filters[tracepoint][1][sub])) + + # From perf_event_open(2): + # pid > 0 and cpu == -1 + # This measures the specified process/thread on any CPU. + # + # pid == -1 and cpu >= 0 + # This measures all processes/threads on the specified CPU. + trace_cpu = groupid if self._pid == 0 else -1 + trace_pid = int(groupid) if self._pid != 0 else -1 + + group.add_event(Event(name=name, + group=group, + trace_cpu=trace_cpu, + trace_pid=trace_pid, + trace_point=tracepoint, + trace_filter=tracefilter)) + + self.group_leaders.append(group) + + @property + def fields(self): + return self._fields + + @fields.setter + def fields(self, fields): + """Enables/disables the (un)wanted events""" + self._fields = fields + for group in self.group_leaders: + for index, event in enumerate(group.events): + if event.name in fields: + event.reset() + event.enable() + else: + # Do not disable the group leader. + # It would disable all of its events. + if index != 0: + event.disable() + + @property + def pid(self): + return self._pid + + @pid.setter + def pid(self, pid): + """Changes the monitored pid by setting new traces.""" + self._pid = pid + # The garbage collector will get rid of all Event/Group + # objects and open files after removing the references. + self.group_leaders = [] + self._setup_traces() + self.fields = self._fields + + def read(self, by_guest=0): + """Returns 'event name: current value' for all enabled events.""" + ret = defaultdict(int) + for group in self.group_leaders: + for name, val in group.read().items(): + if name not in self._fields: + continue + parent = ARCH.tracepoint_is_child(name) + if parent: + name += ' ' + parent + ret[name] += val + return ret + + def reset(self): + """Reset all field counters""" + for group in self.group_leaders: + for event in group.events: + event.reset() + + +class DebugfsProvider(Provider): + """Provides data from the files that KVM creates in the kvm debugfs + folder.""" + def __init__(self, pid, fields_filter, include_past): + self.update_fields(fields_filter) + self._baseline = {} + self.do_read = True + self.paths = [] + super(DebugfsProvider, self).__init__(pid) + if include_past: + self._restore() + + def _get_available_fields(self): + """"Returns a list of available fields. + + The fields are all available KVM debugfs files + + """ + exempt_list = ['halt_poll_fail_ns', 'halt_poll_success_ns', 'halt_wait_ns'] + fields = [field for field in self.walkdir(PATH_DEBUGFS_KVM)[2] + if field not in exempt_list] + + return fields + + def update_fields(self, fields_filter): + """Refresh fields, applying fields_filter""" + self._fields = [field for field in self._get_available_fields() + if self.is_field_wanted(fields_filter, field)] + # add parents for child fields - otherwise we won't see any output! + for field in self._fields: + parent = ARCH.debugfs_is_child(field) + if (parent and parent not in self._fields): + self.fields.append(parent) + + @property + def fields(self): + return self._fields + + @fields.setter + def fields(self, fields): + self._fields = fields + self.reset() + + @property + def pid(self): + return self._pid + + @pid.setter + def pid(self, pid): + self._pid = pid + if pid != 0: + vms = self.walkdir(PATH_DEBUGFS_KVM)[1] + if len(vms) == 0: + self.do_read = False + + self.paths = list(filter(lambda x: "{}-".format(pid) in x, vms)) + + else: + self.paths = [] + self.do_read = True + + def _verify_paths(self): + """Remove invalid paths""" + for path in self.paths: + if not os.path.exists(os.path.join(PATH_DEBUGFS_KVM, path)): + self.paths.remove(path) + continue + + def read(self, reset=0, by_guest=0): + """Returns a dict with format:'file name / field -> current value'. + + Parameter 'reset': + 0 plain read + 1 reset field counts to 0 + 2 restore the original field counts + + """ + results = {} + + # If no debugfs filtering support is available, then don't read. + if not self.do_read: + return results + self._verify_paths() + + paths = self.paths + if self._pid == 0: + paths = [] + for entry in os.walk(PATH_DEBUGFS_KVM): + for dir in entry[1]: + paths.append(dir) + for path in paths: + for field in self._fields: + value = self._read_field(field, path) + key = path + field + if reset == 1: + self._baseline[key] = value + if reset == 2: + self._baseline[key] = 0 + if self._baseline.get(key, -1) == -1: + self._baseline[key] = value + parent = ARCH.debugfs_is_child(field) + if parent: + field = field + ' ' + parent + else: + if by_guest: + field = key.split('-')[0] # set 'field' to 'pid' + increment = value - self._baseline.get(key, 0) + if field in results: + results[field] += increment + else: + results[field] = increment + + return results + + def _read_field(self, field, path): + """Returns the value of a single field from a specific VM.""" + try: + return int(open(os.path.join(PATH_DEBUGFS_KVM, + path, + field)) + .read()) + except IOError: + return 0 + + def reset(self): + """Reset field counters""" + self._baseline = {} + self.read(1) + + def _restore(self): + """Reset field counters""" + self._baseline = {} + self.read(2) + + +EventStat = namedtuple('EventStat', ['value', 'delta']) + + +class Stats(object): + """Manages the data providers and the data they provide. + + It is used to set filters on the provider's data and collect all + provider data. + + """ + def __init__(self, options): + self.providers = self._get_providers(options) + self._pid_filter = options.pid + self._fields_filter = options.fields + self.values = {} + self._child_events = False + + def _get_providers(self, options): + """Returns a list of data providers depending on the passed options.""" + providers = [] + + if options.debugfs: + providers.append(DebugfsProvider(options.pid, options.fields, + options.debugfs_include_past)) + if options.tracepoints or not providers: + providers.append(TracepointProvider(options.pid, options.fields)) + + return providers + + def _update_provider_filters(self): + """Propagates fields filters to providers.""" + # As we reset the counters when updating the fields we can + # also clear the cache of old values. + self.values = {} + for provider in self.providers: + provider.update_fields(self._fields_filter) + + def reset(self): + self.values = {} + for provider in self.providers: + provider.reset() + + @property + def fields_filter(self): + return self._fields_filter + + @fields_filter.setter + def fields_filter(self, fields_filter): + if fields_filter != self._fields_filter: + self._fields_filter = fields_filter + self._update_provider_filters() + + @property + def pid_filter(self): + return self._pid_filter + + @pid_filter.setter + def pid_filter(self, pid): + if pid != self._pid_filter: + self._pid_filter = pid + self.values = {} + for provider in self.providers: + provider.pid = self._pid_filter + + @property + def child_events(self): + return self._child_events + + @child_events.setter + def child_events(self, val): + self._child_events = val + for provider in self.providers: + provider.child_events = val + + def get(self, by_guest=0): + """Returns a dict with field -> (value, delta to last value) of all + provider data. + Key formats: + * plain: 'key' is event name + * child-parent: 'key' is in format '<child> <parent>' + * pid: 'key' is the pid of the guest, and the record contains the + aggregated event data + These formats are generated by the providers, and handled in class TUI. + """ + for provider in self.providers: + new = provider.read(by_guest=by_guest) + for key in new: + oldval = self.values.get(key, EventStat(0, 0)).value + newval = new.get(key, 0) + newdelta = newval - oldval + self.values[key] = EventStat(newval, newdelta) + return self.values + + def toggle_display_guests(self, to_pid): + """Toggle between collection of stats by individual event and by + guest pid + + Events reported by DebugfsProvider change when switching to/from + reading by guest values. Hence we have to remove the excess event + names from self.values. + + """ + if any(isinstance(ins, TracepointProvider) for ins in self.providers): + return 1 + if to_pid: + for provider in self.providers: + if isinstance(provider, DebugfsProvider): + for key in provider.fields: + if key in self.values.keys(): + del self.values[key] + else: + oldvals = self.values.copy() + for key in oldvals: + if key.isdigit(): + del self.values[key] + # Update oldval (see get()) + self.get(to_pid) + return 0 + + +DELAY_DEFAULT = 3.0 +MAX_GUEST_NAME_LEN = 48 +MAX_REGEX_LEN = 44 +SORT_DEFAULT = 0 +MIN_DELAY = 0.1 +MAX_DELAY = 25.5 + + +class Tui(object): + """Instruments curses to draw a nice text ui.""" + def __init__(self, stats, opts): + self.stats = stats + self.screen = None + self._delay_initial = 0.25 + self._delay_regular = opts.set_delay + self._sorting = SORT_DEFAULT + self._display_guests = 0 + + def __enter__(self): + """Initialises curses for later use. Based on curses.wrapper + implementation from the Python standard library.""" + self.screen = curses.initscr() + curses.noecho() + curses.cbreak() + + # The try/catch works around a minor bit of + # over-conscientiousness in the curses module, the error + # return from C start_color() is ignorable. + try: + curses.start_color() + except curses.error: + pass + + # Hide cursor in extra statement as some monochrome terminals + # might support hiding but not colors. + try: + curses.curs_set(0) + except curses.error: + pass + + curses.use_default_colors() + return self + + def __exit__(self, *exception): + """Resets the terminal to its normal state. Based on curses.wrapper + implementation from the Python standard library.""" + if self.screen: + self.screen.keypad(0) + curses.echo() + curses.nocbreak() + curses.endwin() + + @staticmethod + def get_all_gnames(): + """Returns a list of (pid, gname) tuples of all running guests""" + res = [] + try: + child = subprocess.Popen(['ps', '-A', '--format', 'pid,args'], + stdout=subprocess.PIPE) + except: + raise Exception + for line in child.stdout: + line = line.decode(ENCODING).lstrip().split(' ', 1) + # perform a sanity check before calling the more expensive + # function to possibly extract the guest name + if ' -name ' in line[1]: + res.append((line[0], Tui.get_gname_from_pid(line[0]))) + child.stdout.close() + + return res + + def _print_all_gnames(self, row): + """Print a list of all running guests along with their pids.""" + self.screen.addstr(row, 2, '%8s %-60s' % + ('Pid', 'Guest Name (fuzzy list, might be ' + 'inaccurate!)'), + curses.A_UNDERLINE) + row += 1 + try: + for line in self.get_all_gnames(): + self.screen.addstr(row, 2, '%8s %-60s' % (line[0], line[1])) + row += 1 + if row >= self.screen.getmaxyx()[0]: + break + except Exception: + self.screen.addstr(row + 1, 2, 'Not available') + + @staticmethod + def get_pid_from_gname(gname): + """Fuzzy function to convert guest name to QEMU process pid. + + Returns a list of potential pids, can be empty if no match found. + Throws an exception on processing errors. + + """ + pids = [] + for line in Tui.get_all_gnames(): + if gname == line[1]: + pids.append(int(line[0])) + + return pids + + @staticmethod + def get_gname_from_pid(pid): + """Returns the guest name for a QEMU process pid. + + Extracts the guest name from the QEMU comma line by processing the + '-name' option. Will also handle names specified out of sequence. + + """ + name = '' + try: + line = open('/proc/{}/cmdline' + .format(pid), 'r').read().split('\0') + parms = line[line.index('-name') + 1].split(',') + while '' in parms: + # commas are escaped (i.e. ',,'), hence e.g. 'foo,bar' results + # in # ['foo', '', 'bar'], which we revert here + idx = parms.index('') + parms[idx - 1] += ',' + parms[idx + 1] + del parms[idx:idx+2] + # the '-name' switch allows for two ways to specify the guest name, + # where the plain name overrides the name specified via 'guest=' + for arg in parms: + if '=' not in arg: + name = arg + break + if arg[:6] == 'guest=': + name = arg[6:] + except (ValueError, IOError, IndexError): + pass + + return name + + def _update_pid(self, pid): + """Propagates pid selection to stats object.""" + self.screen.addstr(4, 1, 'Updating pid filter...') + self.screen.refresh() + self.stats.pid_filter = pid + + def _refresh_header(self, pid=None): + """Refreshes the header.""" + if pid is None: + pid = self.stats.pid_filter + self.screen.erase() + gname = self.get_gname_from_pid(pid) + self._gname = gname + if gname: + gname = ('({})'.format(gname[:MAX_GUEST_NAME_LEN] + '...' + if len(gname) > MAX_GUEST_NAME_LEN + else gname)) + if pid > 0: + self._headline = 'kvm statistics - pid {0} {1}'.format(pid, gname) + else: + self._headline = 'kvm statistics - summary' + self.screen.addstr(0, 0, self._headline, curses.A_BOLD) + if self.stats.fields_filter: + regex = self.stats.fields_filter + if len(regex) > MAX_REGEX_LEN: + regex = regex[:MAX_REGEX_LEN] + '...' + self.screen.addstr(1, 17, 'regex filter: {0}'.format(regex)) + if self._display_guests: + col_name = 'Guest Name' + else: + col_name = 'Event' + self.screen.addstr(2, 1, '%-40s %10s%7s %8s' % + (col_name, 'Total', '%Total', 'CurAvg/s'), + curses.A_STANDOUT) + self.screen.addstr(4, 1, 'Collecting data...') + self.screen.refresh() + + def _refresh_body(self, sleeptime): + def insert_child(sorted_items, child, values, parent): + num = len(sorted_items) + for i in range(0, num): + # only add child if parent is present + if parent.startswith(sorted_items[i][0]): + sorted_items.insert(i + 1, (' ' + child, values)) + + def get_sorted_events(self, stats): + """ separate parent and child events """ + if self._sorting == SORT_DEFAULT: + def sortkey(pair): + # sort by (delta value, overall value) + v = pair[1] + return (v.delta, v.value) + else: + def sortkey(pair): + # sort by overall value + v = pair[1] + return v.value + + childs = [] + sorted_items = [] + # we can't rule out child events to appear prior to parents even + # when sorted - separate out all children first, and add in later + for key, values in sorted(stats.items(), key=sortkey, + reverse=True): + if values == (0, 0): + continue + if key.find(' ') != -1: + if not self.stats.child_events: + continue + childs.insert(0, (key, values)) + else: + sorted_items.append((key, values)) + if self.stats.child_events: + for key, values in childs: + (child, parent) = key.split(' ') + insert_child(sorted_items, child, values, parent) + + return sorted_items + + if not self._is_running_guest(self.stats.pid_filter): + if self._gname: + try: # ...to identify the guest by name in case it's back + pids = self.get_pid_from_gname(self._gname) + if len(pids) == 1: + self._refresh_header(pids[0]) + self._update_pid(pids[0]) + return + except: + pass + self._display_guest_dead() + # leave final data on screen + return + row = 3 + self.screen.move(row, 0) + self.screen.clrtobot() + stats = self.stats.get(self._display_guests) + total = 0. + ctotal = 0. + for key, values in stats.items(): + if self._display_guests: + if self.get_gname_from_pid(key): + total += values.value + continue + if not key.find(' ') != -1: + total += values.value + else: + ctotal += values.value + if total == 0.: + # we don't have any fields, or all non-child events are filtered + total = ctotal + + # print events + tavg = 0 + tcur = 0 + guest_removed = False + for key, values in get_sorted_events(self, stats): + if row >= self.screen.getmaxyx()[0] - 1 or values == (0, 0): + break + if self._display_guests: + key = self.get_gname_from_pid(key) + if not key: + continue + cur = int(round(values.delta / sleeptime)) if values.delta else 0 + if cur < 0: + guest_removed = True + continue + if key[0] != ' ': + if values.delta: + tcur += values.delta + ptotal = values.value + ltotal = total + else: + ltotal = ptotal + self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' % (key, + values.value, + values.value * 100 / float(ltotal), cur)) + row += 1 + if row == 3: + if guest_removed: + self.screen.addstr(4, 1, 'Guest removed, updating...') + else: + self.screen.addstr(4, 1, 'No matching events reported yet') + if row > 4: + tavg = int(round(tcur / sleeptime)) if tcur > 0 else '' + self.screen.addstr(row, 1, '%-40s %10d %8s' % + ('Total', total, tavg), curses.A_BOLD) + self.screen.refresh() + + def _display_guest_dead(self): + marker = ' Guest is DEAD ' + y = min(len(self._headline), 80 - len(marker)) + self.screen.addstr(0, y, marker, curses.A_BLINK | curses.A_STANDOUT) + + def _show_msg(self, text): + """Display message centered text and exit on key press""" + hint = 'Press any key to continue' + curses.cbreak() + self.screen.erase() + (x, term_width) = self.screen.getmaxyx() + row = 2 + for line in text: + start = (term_width - len(line)) // 2 + self.screen.addstr(row, start, line) + row += 1 + self.screen.addstr(row + 1, (term_width - len(hint)) // 2, hint, + curses.A_STANDOUT) + self.screen.getkey() + + def _show_help_interactive(self): + """Display help with list of interactive commands""" + msg = (' b toggle events by guests (debugfs only, honors' + ' filters)', + ' c clear filter', + ' f filter by regular expression', + ' g filter by guest name/PID', + ' h display interactive commands reference', + ' o toggle sorting order (Total vs CurAvg/s)', + ' p filter by guest name/PID', + ' q quit', + ' r reset stats', + ' s set delay between refreshs (value range: ' + '%s-%s secs)' % (MIN_DELAY, MAX_DELAY), + ' x toggle reporting of stats for individual child trace' + ' events', + 'Any other key refreshes statistics immediately') + curses.cbreak() + self.screen.erase() + self.screen.addstr(0, 0, "Interactive commands reference", + curses.A_BOLD) + self.screen.addstr(2, 0, "Press any key to exit", curses.A_STANDOUT) + row = 4 + for line in msg: + self.screen.addstr(row, 0, line) + row += 1 + self.screen.getkey() + self._refresh_header() + + def _show_filter_selection(self): + """Draws filter selection mask. + + Asks for a valid regex and sets the fields filter accordingly. + + """ + msg = '' + while True: + self.screen.erase() + self.screen.addstr(0, 0, + "Show statistics for events matching a regex.", + curses.A_BOLD) + self.screen.addstr(2, 0, + "Current regex: {0}" + .format(self.stats.fields_filter)) + self.screen.addstr(5, 0, msg) + self.screen.addstr(3, 0, "New regex: ") + curses.echo() + regex = self.screen.getstr().decode(ENCODING) + curses.noecho() + if len(regex) == 0: + self.stats.fields_filter = '' + self._refresh_header() + return + try: + re.compile(regex) + self.stats.fields_filter = regex + self._refresh_header() + return + except re.error: + msg = '"' + regex + '": Not a valid regular expression' + continue + + def _show_set_update_interval(self): + """Draws update interval selection mask.""" + msg = '' + while True: + self.screen.erase() + self.screen.addstr(0, 0, 'Set update interval (defaults to %.1fs).' + % DELAY_DEFAULT, curses.A_BOLD) + self.screen.addstr(4, 0, msg) + self.screen.addstr(2, 0, 'Change delay from %.1fs to ' % + self._delay_regular) + curses.echo() + val = self.screen.getstr().decode(ENCODING) + curses.noecho() + + try: + if len(val) > 0: + delay = float(val) + err = is_delay_valid(delay) + if err is not None: + msg = err + continue + else: + delay = DELAY_DEFAULT + self._delay_regular = delay + break + + except ValueError: + msg = '"' + str(val) + '": Invalid value' + self._refresh_header() + + def _is_running_guest(self, pid): + """Check if pid is still a running process.""" + if not pid: + return True + return os.path.isdir(os.path.join('/proc/', str(pid))) + + def _show_vm_selection_by_guest(self): + """Draws guest selection mask. + + Asks for a guest name or pid until a valid guest name or '' is entered. + + """ + msg = '' + while True: + self.screen.erase() + self.screen.addstr(0, 0, + 'Show statistics for specific guest or pid.', + curses.A_BOLD) + self.screen.addstr(1, 0, + 'This might limit the shown data to the trace ' + 'statistics.') + self.screen.addstr(5, 0, msg) + self._print_all_gnames(7) + curses.echo() + curses.curs_set(1) + self.screen.addstr(3, 0, "Guest or pid [ENTER exits]: ") + guest = self.screen.getstr().decode(ENCODING) + curses.noecho() + + pid = 0 + if not guest or guest == '0': + break + if guest.isdigit(): + if not self._is_running_guest(guest): + msg = '"' + guest + '": Not a running process' + continue + pid = int(guest) + break + pids = [] + try: + pids = self.get_pid_from_gname(guest) + except: + msg = '"' + guest + '": Internal error while searching, ' \ + 'use pid filter instead' + continue + if len(pids) == 0: + msg = '"' + guest + '": Not an active guest' + continue + if len(pids) > 1: + msg = '"' + guest + '": Multiple matches found, use pid ' \ + 'filter instead' + continue + pid = pids[0] + break + curses.curs_set(0) + self._refresh_header(pid) + self._update_pid(pid) + + def show_stats(self): + """Refreshes the screen and processes user input.""" + sleeptime = self._delay_initial + self._refresh_header() + start = 0.0 # result based on init value never appears on screen + while True: + self._refresh_body(time.time() - start) + curses.halfdelay(int(sleeptime * 10)) + start = time.time() + sleeptime = self._delay_regular + try: + char = self.screen.getkey() + if char == 'b': + self._display_guests = not self._display_guests + if self.stats.toggle_display_guests(self._display_guests): + self._show_msg(['Command not available with ' + 'tracepoints enabled', 'Restart with ' + 'debugfs only (see option \'-d\') and ' + 'try again!']) + self._display_guests = not self._display_guests + self._refresh_header() + if char == 'c': + self.stats.fields_filter = '' + self._refresh_header(0) + self._update_pid(0) + if char == 'f': + curses.curs_set(1) + self._show_filter_selection() + curses.curs_set(0) + sleeptime = self._delay_initial + if char == 'g' or char == 'p': + self._show_vm_selection_by_guest() + sleeptime = self._delay_initial + if char == 'h': + self._show_help_interactive() + if char == 'o': + self._sorting = not self._sorting + if char == 'q': + break + if char == 'r': + self.stats.reset() + if char == 's': + curses.curs_set(1) + self._show_set_update_interval() + curses.curs_set(0) + sleeptime = self._delay_initial + if char == 'x': + self.stats.child_events = not self.stats.child_events + except KeyboardInterrupt: + break + except curses.error: + continue + + +def batch(stats): + """Prints statistics in a key, value format.""" + try: + s = stats.get() + time.sleep(1) + s = stats.get() + for key, values in sorted(s.items()): + print('%-42s%10d%10d' % (key.split(' ')[0], values.value, + values.delta)) + except KeyboardInterrupt: + pass + + +class StdFormat(object): + def __init__(self, keys): + self._banner = '' + for key in keys: + self._banner += key.split(' ')[0] + ' ' + + def get_banner(self): + return self._banner + + def get_statline(self, keys, s): + res = '' + for key in keys: + res += ' %9d' % s[key].delta + return res + + +class CSVFormat(object): + def __init__(self, keys): + self._banner = 'timestamp' + self._banner += reduce(lambda res, key: "{},{!s}".format(res, + key.split(' ')[0]), keys, '') + + def get_banner(self): + return self._banner + + def get_statline(self, keys, s): + return reduce(lambda res, key: "{},{!s}".format(res, s[key].delta), + keys, '') + + +def log(stats, opts, frmt, keys): + """Prints statistics as reiterating key block, multiple value blocks.""" + global signal_received + line = 0 + banner_repeat = 20 + f = None + + def do_banner(opts): + nonlocal f + if opts.log_to_file: + if not f: + try: + f = open(opts.log_to_file, 'a') + except (IOError, OSError): + sys.exit("Error: Could not open file: %s" % + opts.log_to_file) + if isinstance(frmt, CSVFormat) and f.tell() != 0: + return + print(frmt.get_banner(), file=f or sys.stdout) + + def do_statline(opts, values): + statline = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + \ + frmt.get_statline(keys, values) + print(statline, file=f or sys.stdout) + + do_banner(opts) + banner_printed = True + while True: + try: + time.sleep(opts.set_delay) + if signal_received: + banner_printed = True + line = 0 + f.close() + do_banner(opts) + signal_received = False + if (line % banner_repeat == 0 and not banner_printed and + not (opts.log_to_file and isinstance(frmt, CSVFormat))): + do_banner(opts) + banner_printed = True + values = stats.get() + if (not opts.skip_zero_records or + any(values[k].delta != 0 for k in keys)): + do_statline(opts, values) + line += 1 + banner_printed = False + except KeyboardInterrupt: + break + + if opts.log_to_file: + f.close() + + +def handle_signal(sig, frame): + global signal_received + + signal_received = True + + return + + +def is_delay_valid(delay): + """Verify delay is in valid value range.""" + msg = None + if delay < MIN_DELAY: + msg = '"' + str(delay) + '": Delay must be >=%s' % MIN_DELAY + if delay > MAX_DELAY: + msg = '"' + str(delay) + '": Delay must be <=%s' % MAX_DELAY + return msg + + +def get_options(): + """Returns processed program arguments.""" + description_text = """ +This script displays various statistics about VMs running under KVM. +The statistics are gathered from the KVM debugfs entries and / or the +currently available perf traces. + +The monitoring takes additional cpu cycles and might affect the VM's +performance. + +Requirements: +- Access to: + %s + %s/events/* + /proc/pid/task +- /proc/sys/kernel/perf_event_paranoid < 1 if user has no + CAP_SYS_ADMIN and perf events are used. +- CAP_SYS_RESOURCE if the hard limit is not high enough to allow + the large number of files that are possibly opened. + +Interactive Commands: + b toggle events by guests (debugfs only, honors filters) + c clear filter + f filter by regular expression + g filter by guest name + h display interactive commands reference + o toggle sorting order (Total vs CurAvg/s) + p filter by PID + q quit + r reset stats + s set update interval (value range: 0.1-25.5 secs) + x toggle reporting of stats for individual child trace events +Press any other key to refresh statistics immediately. +""" % (PATH_DEBUGFS_KVM, PATH_DEBUGFS_TRACING) + + class Guest_to_pid(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + try: + pids = Tui.get_pid_from_gname(values) + except: + sys.exit('Error while searching for guest "{}". Use "-p" to ' + 'specify a pid instead?'.format(values)) + if len(pids) == 0: + sys.exit('Error: No guest by the name "{}" found' + .format(values)) + if len(pids) > 1: + sys.exit('Error: Multiple processes found (pids: {}). Use "-p"' + ' to specify the desired pid' + .format(" ".join(map(str, pids)))) + namespace.pid = pids[0] + + argparser = argparse.ArgumentParser(description=description_text, + formatter_class=argparse + .RawTextHelpFormatter) + argparser.add_argument('-1', '--once', '--batch', + action='store_true', + default=False, + help='run in batch mode for one second', + ) + argparser.add_argument('-c', '--csv', + action='store_true', + default=False, + help='log in csv format - requires option -l/-L', + ) + argparser.add_argument('-d', '--debugfs', + action='store_true', + default=False, + help='retrieve statistics from debugfs', + ) + argparser.add_argument('-f', '--fields', + default='', + help='''fields to display (regex) +"-f help" for a list of available events''', + ) + argparser.add_argument('-g', '--guest', + type=str, + help='restrict statistics to guest by name', + action=Guest_to_pid, + ) + argparser.add_argument('-i', '--debugfs-include-past', + action='store_true', + default=False, + help='include all available data on past events for' + ' debugfs', + ) + argparser.add_argument('-l', '--log', + action='store_true', + default=False, + help='run in logging mode (like vmstat)', + ) + argparser.add_argument('-L', '--log-to-file', + type=str, + metavar='FILE', + help="like '--log', but logging to a file" + ) + argparser.add_argument('-p', '--pid', + type=int, + default=0, + help='restrict statistics to pid', + ) + argparser.add_argument('-s', '--set-delay', + type=float, + default=DELAY_DEFAULT, + metavar='DELAY', + help='set delay between refreshs (value range: ' + '%s-%s secs)' % (MIN_DELAY, MAX_DELAY), + ) + argparser.add_argument('-t', '--tracepoints', + action='store_true', + default=False, + help='retrieve statistics from tracepoints', + ) + argparser.add_argument('-z', '--skip-zero-records', + action='store_true', + default=False, + help='omit records with all zeros in logging mode', + ) + options = argparser.parse_args() + if options.csv and not (options.log or options.log_to_file): + sys.exit('Error: Option -c/--csv requires -l/--log') + if options.skip_zero_records and not (options.log or options.log_to_file): + sys.exit('Error: Option -z/--skip-zero-records requires -l/-L') + try: + # verify that we were passed a valid regex up front + re.compile(options.fields) + except re.error: + sys.exit('Error: "' + options.fields + '" is not a valid regular ' + 'expression') + + return options + + +def check_access(options): + """Exits if the current user can't access all needed directories.""" + if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints or + not options.debugfs): + sys.stderr.write("Please enable CONFIG_TRACING in your kernel " + "when using the option -t (default).\n" + "If it is enabled, make {0} readable by the " + "current user.\n" + .format(PATH_DEBUGFS_TRACING)) + if options.tracepoints: + sys.exit(1) + + sys.stderr.write("Falling back to debugfs statistics!\n") + options.debugfs = True + time.sleep(5) + + return options + + +def assign_globals(): + global PATH_DEBUGFS_KVM + global PATH_DEBUGFS_TRACING + + debugfs = '' + for line in open('/proc/mounts'): + if line.split(' ')[2] == 'debugfs': + debugfs = line.split(' ')[1] + break + if debugfs == '': + sys.stderr.write("Please make sure that CONFIG_DEBUG_FS is enabled in " + "your kernel, mounted and\nreadable by the current " + "user:\n" + "('mount -t debugfs debugfs /sys/kernel/debug')\n") + sys.exit(1) + + PATH_DEBUGFS_KVM = os.path.join(debugfs, 'kvm') + PATH_DEBUGFS_TRACING = os.path.join(debugfs, 'tracing') + + if not os.path.exists(PATH_DEBUGFS_KVM): + sys.stderr.write("Please make sure that CONFIG_KVM is enabled in " + "your kernel and that the modules are loaded.\n") + sys.exit(1) + + +def main(): + assign_globals() + options = get_options() + options = check_access(options) + + if (options.pid > 0 and + not os.path.isdir(os.path.join('/proc/', + str(options.pid)))): + sys.stderr.write('Did you use a (unsupported) tid instead of a pid?\n') + sys.exit('Specified pid does not exist.') + + err = is_delay_valid(options.set_delay) + if err is not None: + sys.exit('Error: ' + err) + + stats = Stats(options) + + if options.fields == 'help': + stats.fields_filter = None + event_list = [] + for key in stats.get().keys(): + event_list.append(key.split('(', 1)[0]) + sys.stdout.write(' ' + '\n '.join(sorted(set(event_list))) + '\n') + sys.exit(0) + + if options.log or options.log_to_file: + if options.log_to_file: + signal.signal(signal.SIGHUP, handle_signal) + keys = sorted(stats.get().keys()) + if options.csv: + frmt = CSVFormat(keys) + else: + frmt = StdFormat(keys) + log(stats, options, frmt, keys) + elif not options.once: + with Tui(stats, options) as tui: + tui.show_stats() + else: + batch(stats) + + +if __name__ == "__main__": + main() diff --git a/tools/kvm/kvm_stat/kvm_stat.service b/tools/kvm/kvm_stat/kvm_stat.service new file mode 100644 index 0000000000..8f13b843d5 --- /dev/null +++ b/tools/kvm/kvm_stat/kvm_stat.service @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only + +[Unit] +Description=Service that logs KVM kernel module trace events +Before=qemu-kvm.service + +[Service] +Type=simple +ExecStart=/usr/bin/kvm_stat -dtcz -s 10 -L /var/log/kvm_stat.csv +ExecReload=/bin/kill -HUP $MAINPID +Restart=always +RestartSec=60s +SyslogIdentifier=kvm_stat +SyslogLevel=debug + +[Install] +WantedBy=multi-user.target diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt new file mode 100644 index 0000000000..3a9f2037bd --- /dev/null +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -0,0 +1,124 @@ +kvm_stat(1) +=========== + +NAME +---- +kvm_stat - Report KVM kernel module event counters + +SYNOPSIS +-------- +[verse] +'kvm_stat' [OPTION]... + +DESCRIPTION +----------- +kvm_stat prints counts of KVM kernel module trace events. These events signify +state transitions such as guest mode entry and exit. + +This tool is useful for observing guest behavior from the host perspective. +Often conclusions about performance or buggy behavior can be drawn from the +output. +While running in regular mode, use any of the keys listed in section +'Interactive Commands' below. +Use batch and logging modes for scripting purposes. + +The set of KVM kernel module trace events may be specific to the kernel version +or architecture. It is best to check the KVM kernel module source code for the +meaning of events. + +INTERACTIVE COMMANDS +-------------------- +[horizontal] +*b*:: toggle events by guests (debugfs only, honors filters) + +*c*:: clear filter + +*f*:: filter by regular expression + :: *Note*: Child events pull in their parents, and parents' stats summarize + all child events, not just the filtered ones + +*g*:: filter by guest name/PID + +*h*:: display interactive commands reference + +*o*:: toggle sorting order (Total vs CurAvg/s) + +*p*:: filter by guest name/PID + +*q*:: quit + +*r*:: reset stats + +*s*:: set delay between refreshs + +*x*:: toggle reporting of stats for child trace events + :: *Note*: The stats for the parents summarize the respective child trace + events + +Press any other key to refresh statistics immediately. + +OPTIONS +------- +-1:: +--once:: +--batch:: + run in batch mode for one second + +-c:: +--csv:: + log in csv format. Requires option -l/--log or -L/--log-to-file. + When used with option -L/--log-to-file, the header is only ever + written to start of file to preserve the format. + +-d:: +--debugfs:: + retrieve statistics from debugfs + +-f<fields>:: +--fields=<fields>:: + fields to display (regex), "-f help" for a list of available events + +-g<guest>:: +--guest=<guest_name>:: + limit statistics to one virtual machine (guest name) + +-h:: +--help:: + show help message + +-i:: +--debugfs-include-past:: + include all available data on past events for debugfs + +-l:: +--log:: + run in logging mode (like vmstat) + + +-L<file>:: +--log-to-file=<file>:: + like -l/--log, but logging to a file. Appends to existing files. + +-p<pid>:: +--pid=<pid>:: + limit statistics to one virtual machine (pid) + +-s:: +--set-delay:: + set delay between refreshs (value range: 0.1-25.5 secs) + +-t:: +--tracepoints:: + retrieve statistics from tracepoints + +-z:: +--skip-zero-records:: + omit records with all zeros in logging mode + +SEE ALSO +-------- +'perf'(1), 'trace-cmd'(1) + +AUTHOR +------ +Stefan Hajnoczi <stefanha@redhat.com> |