diff options
Diffstat (limited to 'Documentation/libtracefs-sql.txt')
-rw-r--r-- | Documentation/libtracefs-sql.txt | 628 |
1 files changed, 628 insertions, 0 deletions
diff --git a/Documentation/libtracefs-sql.txt b/Documentation/libtracefs-sql.txt new file mode 100644 index 0000000..6d606db --- /dev/null +++ b/Documentation/libtracefs-sql.txt @@ -0,0 +1,628 @@ +libtracefs(3) +============= + +NAME +---- +tracefs_sql - Create a synthetic event via an SQL statement + +SYNOPSIS +-------- +[verse] +-- +*#include <tracefs.h>* + +struct tracefs_synth pass:[*]*tracefs_sql*(struct tep_handle pass:[*]_tep_, const char pass:[*]_name_, + const char pass:[*]_sql_buffer_, char pass:[**]_err_); +-- + +DESCRIPTION +----------- +Synthetic events are dynamically created events that attach two existing events +together via one or more matching fields between the two events. It can be used +to find the latency between the events, or to simply pass fields of the first event +on to the second event to display as one event. + +The Linux kernel interface to create synthetic events is complex, and there needs +to be a better way to create synthetic events that is easy and can be understood +via existing technology. + +If you think of each event as a table, where the fields are the column of the table +and each instance of the event as a row, you can understand how SQL can be used +to attach two events together and form another event (table). Utilizing the +SQL *SELECT* *FROM* *JOIN* *ON* [ *WHERE* ] syntax, a synthetic event can easily +be created from two different events. + +For simple SQL queries to make a histogram instead of a synthetic event, see +HISTOGRAMS below. + +*tracefs_sql*() takes in a _tep_ handler (See _tep_local_events_(3)) that is used to +verify the events within the _sql_buffer_ expression. The _name_ is the name of the +synthetic event to create. If _err_ points to an address of a string, it will be filled +with a detailed message on any type of parsing error, including fields that do not belong +to an event, or if the events or fields are not properly compared. + +The example program below is a fully functional parser where it will create a synthetic +event from a SQL syntax passed in via the command line or a file. + +The SQL format is as follows: + +*SELECT* <fields> *FROM* <start-event> *JOIN* <end-event> *ON* <matching-fields> *WHERE* <filter> + +Note, although the examples show the SQL commands in uppercase, they are not required to +be so. That is, you can use "SELECT" or "select" or "sElEct". + +For example: +[source,c] +-- +SELECT syscalls.sys_enter_read.fd, syscalls.sys_exit_read.ret FROM syscalls.sys_enter_read + JOIN syscalls.sys_exit_read + ON syscalls.sys_enter_read.common_pid = syscalls.sys_exit_write.common_pid +-- + +Will create a synthetic event that with the fields: + + u64 fd; s64 ret; + +Because the function takes a _tep_ handle, and usually all event names are unique, you can +leave off the system (group) name of the event, and *tracefs_sql*() will discover the +system for you. + +That is, the above statement would work with: + +[source,c] +-- +SELECT sys_enter_read.fd, sys_exit_read.ret FROM sys_enter_read JOIN sys_exit_read + ON sys_enter_read.common_pid = sys_exit_write.common_pid +-- + +The *AS* keyword can be used to name the fields as well as to give an alias to the +events, such that the above can be simplified even more as: + +[source,c] +-- +SELECT start.fd, end.ret FROM sys_enter_read AS start JOIN sys_exit_read AS end ON start.common_pid = end.common_pid +-- + +The above aliases _sys_enter_read_ as *start* and _sys_exit_read_ as *end* and uses +those aliases to reference the event throughout the statement. + +Using the *AS* keyword in the selection portion of the SQL statement will define what +those fields will be called in the synthetic event. + +[source,c] +-- +SELECT start.fd AS filed, end.ret AS return FROM sys_enter_read AS start JOIN sys_exit_read AS end + ON start.common_pid = end.common_pid +-- + +The above labels the _fd_ of _start_ as *filed* and the _ret_ of _end_ as *return* where +the synthetic event that is created will now have the fields: + + u64 filed; s64 return; + +The fields can also be calculated with results passed to the synthetic event: + +[source,c] +-- +select start.truesize, end.len, (start.truesize - end.len) as diff from napi_gro_receive_entry as start + JOIN netif_receive_skb as end ON start.skbaddr = end.skbaddr +-- + +Which would show the *truesize* of the _napi_gro_receive_entry_ event, the actual +_len_ of the content, shown by the _netif_receive_skb_, and the delta between +the two and expressed by the field *diff*. + +The code also supports recording the timestamps at either event, and performing calculations +on them. For wakeup latency, you have: + +[source,c] +-- +select start.pid, (end.TIMESTAMP_USECS - start.TIMESTAMP_USECS) as lat from sched_waking as start + JOIN sched_switch as end ON start.pid = end.next_pid +-- + +The above will create a synthetic event that records the _pid_ of the task being woken up, +and the time difference between the _sched_waking_ event and the _sched_switch_ event. +The *TIMESTAMP_USECS* will truncate the time down to microseconds as the timestamp usually +recorded in the tracing buffer has nanosecond resolution. If you do not want that +truncation, use *TIMESTAMP* instead of *TIMESTAMP_USECS*. + +Finally, the *WHERE* clause can be added, that will let you add filters on either or both events. + +[source,c] +-- +select start.pid, (end.TIMESTAMP_USECS - start.TIMESTAMP_USECS) as lat from sched_waking as start + JOIN sched_switch as end ON start.pid = end.next_pid + WHERE start.prio < 100 && (!(end.prev_pid < 1 || end.prev_prio > 100) || end.prev_pid == 0) +-- + +*NOTE* + +Although both events can be used together in the *WHERE* clause, they must not be mixed outside +the top most "&&" statements. You can not OR (||) the events together, where a filter of one +event is OR'd to a filter of the other event. This does not make sense, as the synthetic event +requires both events to take place to be recorded. If one is filtered out, then the synthetic +event does not execute. + +[source,c] +-- +select start.pid, (end.TIMESTAMP_USECS - start.TIMESTAMP_USECS) as lat from sched_waking as start + JOIN sched_switch as end ON start.pid = end.next_pid + WHERE start.prio < 100 && end.prev_prio < 100 +-- + +The above is valid. + +Where as the below is not. + +[source,c] +-- +select start.pid, (end.TIMESTAMP_USECS - start.TIMESTAMP_USECS) as lat from sched_waking as start + JOIN sched_switch as end ON start.pid = end.next_pid + WHERE start.prio < 100 || end.prev_prio < 100 +-- + + +KEYWORDS AS EVENT FIELDS +------------------------ + +In some cases, an event may have a keyword. For example, regcache_drop_region has "from" +as a field and the following will not work + +[source,c] +-- + select from from regcache_drop_region +-- + +In such cases, add a backslash to the conflicting field, and this will tell the parser +that the "from" is a field and not a keyword: + +[source,c] +-- + select \from from regcache_drop_region +-- + +HISTOGRAMS +---------- + +Simple SQL statements without the *JOIN* *ON* may also be used, which will create a histogram +instead. When doing this, the struct tracefs_hist descriptor can be retrieved from the +returned synthetic event descriptor via the *tracefs_synth_get_start_hist*(3). + +In order to utilize the histogram types (see xxx) the CAST command of SQL can be used. + +That is: + +[source,c] +-- + select CAST(common_pid AS comm), CAST(id AS syscall) FROM sys_enter +-- + +Which produces: + +[source,c] +-- + # echo 'hist:keys=common_pid.execname,id.syscall' > events/raw_syscalls/sys_enter/trigger + + # cat events/raw_syscalls/sys_enter/hist + +{ common_pid: bash [ 18248], id: sys_setpgid [109] } hitcount: 1 +{ common_pid: sendmail [ 1812], id: sys_read [ 0] } hitcount: 1 +{ common_pid: bash [ 18247], id: sys_getpid [ 39] } hitcount: 1 +{ common_pid: bash [ 18247], id: sys_dup2 [ 33] } hitcount: 1 +{ common_pid: gmain [ 13684], id: sys_inotify_add_watch [254] } hitcount: 1 +{ common_pid: cat [ 18247], id: sys_access [ 21] } hitcount: 1 +{ common_pid: bash [ 18248], id: sys_getpid [ 39] } hitcount: 1 +{ common_pid: cat [ 18247], id: sys_fadvise64 [221] } hitcount: 1 +{ common_pid: sendmail [ 1812], id: sys_openat [257] } hitcount: 1 +{ common_pid: less [ 18248], id: sys_munmap [ 11] } hitcount: 1 +{ common_pid: sendmail [ 1812], id: sys_close [ 3] } hitcount: 1 +{ common_pid: gmain [ 1534], id: sys_poll [ 7] } hitcount: 1 +{ common_pid: bash [ 18247], id: sys_execve [ 59] } hitcount: 1 +-- + +Note, string fields may not be cast. + +The possible types to cast to are: + +*HEX* - convert the value to use hex and not decimal + +*SYM* - convert a pointer to symbolic (kallsyms values) + +*SYM-OFFSET* - convert a pointer to symbolic and include the offset. + +*SYSCALL* - convert the number to the mapped system call name + +*EXECNAME* or *COMM* - can only be used with the common_pid field. Will show the task +name of the process. + +*LOG* or *LOG2* - bucket the key values in a log 2 values (1, 2, 3-4, 5-8, 9-16, 17-32, ...) + +The above fields are not case sensitive, and "LOG2" works as good as "log". + +A special CAST to _COUNTER_ or __COUNTER__ will make the field a value and not +a key. For example: + +[source,c] +-- + SELECT common_pid, CAST(bytes_req AS _COUNTER_) FROM kmalloc +-- + +Which will create + +[source,c] +-- + echo 'hist:keys=common_pid:vals=bytes_req' > events/kmem/kmalloc/trigger + + cat events/kmem/kmalloc/hist + +{ common_pid: 1812 } hitcount: 1 bytes_req: 32 +{ common_pid: 9111 } hitcount: 2 bytes_req: 272 +{ common_pid: 1768 } hitcount: 3 bytes_req: 1112 +{ common_pid: 0 } hitcount: 4 bytes_req: 512 +{ common_pid: 18297 } hitcount: 11 bytes_req: 2004 +-- + +RETURN VALUE +------------ +Returns 0 on success and -1 on failure. On failure, if _err_ is defined, it will be +allocated to hold a detailed description of what went wrong if it the error was caused +by a parsing error, or that an event, field does not exist or is not compatible with +what it was combined with. + +CREATE A TOOL +------------- + +The below example is a functional program that can be used to parse SQL commands into +synthetic events. + +[source, c] +-- + man tracefs_sql | sed -ne '/^EXAMPLE/,/FILES/ { /EXAMPLE/d ; /FILES/d ; p}' > sqlhist.c + gcc -o sqlhist sqlhist.c `pkg-config --cflags --libs libtracefs` +-- + +Then you can run the above examples: + +[source, c] +-- + sudo ./sqlhist 'select start.pid, (end.TIMESTAMP_USECS - start.TIMESTAMP_USECS) as lat from sched_waking as start + JOIN sched_switch as end ON start.pid = end.next_pid + WHERE start.prio < 100 || end.prev_prio < 100' +-- +EXAMPLE +------- +[source,c] +-- +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <tracefs.h> + +static void usage(char **argv) +{ + fprintf(stderr, "usage: %s [-ed][-n name][-s][-S fields][-m var][-c var][-T][-t dir][-f file | sql-command-line]\n" + " -n name - name of synthetic event 'Anonymous' if left off\n" + " -t dir - use dir instead of /sys/kernel/tracing\n" + " -e - execute the commands to create the synthetic event\n" + " -m - trigger the action when var is a new max.\n" + " -c - trigger the action when var changes.\n" + " -s - used with -m or -c to do a snapshot of the tracing buffer\n" + " -S - used with -m or -c to save fields of the end event (comma deliminated)\n" + " -T - used with -m or -c to do both a snapshot and a trace\n" + " -f file - read sql lines from file otherwise from the command line\n" + " if file is '-' then read from standard input.\n", + argv[0]); + exit(-1); +} + +enum action { + ACTION_DEFAULT = 0, + ACTION_SNAPSHOT = (1 << 0), + ACTION_TRACE = (1 << 1), + ACTION_SAVE = (1 << 2), + ACTION_MAX = (1 << 3), + ACTION_CHANGE = (1 << 4), +}; + +#define ACTIONS ((ACTION_MAX - 1)) + +static int do_sql(const char *instance_name, + const char *buffer, const char *name, const char *var, + const char *trace_dir, bool execute, int action, + char **save_fields) +{ + struct tracefs_synth *synth; + struct tep_handle *tep; + struct trace_seq seq; + enum tracefs_synth_handler handler; + char *err; + int ret; + + if ((action & ACTIONS) && !var) { + fprintf(stderr, "Error: -s, -S and -T not supported without -m or -c"); + exit(-1); + } + + if (!name) + name = "Anonymous"; + + trace_seq_init(&seq); + tep = tracefs_local_events(trace_dir); + if (!tep) { + if (!trace_dir) + trace_dir = "tracefs directory"; + perror(trace_dir); + exit(-1); + } + + synth = tracefs_sql(tep, name, buffer, &err); + if (!synth) { + perror("Failed creating synthetic event!"); + if (err) + fprintf(stderr, "%s", err); + free(err); + exit(-1); + } + + if (tracefs_synth_complete(synth)) { + if (var) { + if (action & ACTION_MAX) + handler = TRACEFS_SYNTH_HANDLE_MAX; + else + handler = TRACEFS_SYNTH_HANDLE_CHANGE; + + if (action & ACTION_SAVE) { + ret = tracefs_synth_save(synth, handler, var, save_fields); + if (ret < 0) { + err = "adding save"; + goto failed_action; + } + } + if (action & ACTION_TRACE) { + /* + * By doing the trace before snapshot, it will be included + * in the snapshot. + */ + ret = tracefs_synth_trace(synth, handler, var); + if (ret < 0) { + err = "adding trace"; + goto failed_action; + } + } + if (action & ACTION_SNAPSHOT) { + ret = tracefs_synth_snapshot(synth, handler, var); + if (ret < 0) { + err = "adding snapshot"; + failed_action: + perror(err); + if (errno == ENODEV) + fprintf(stderr, "ERROR: '%s' is not a variable\n", + var); + exit(-1); + } + } + } + tracefs_synth_echo_cmd(&seq, synth); + if (execute) { + ret = tracefs_synth_create(synth); + if (ret < 0) { + fprintf(stderr, "%s\n", tracefs_error_last(NULL)); + exit(-1); + } + } + } else { + struct tracefs_instance *instance = NULL; + struct tracefs_hist *hist; + + hist = tracefs_synth_get_start_hist(synth); + if (!hist) { + perror("get_start_hist"); + exit(-1); + } + if (instance_name) { + if (execute) + instance = tracefs_instance_create(instance_name); + else + instance = tracefs_instance_alloc(trace_dir, + instance_name); + if (!instance) { + perror("Failed to create instance"); + exit(-1); + } + } + tracefs_hist_echo_cmd(&seq, instance, hist, 0); + if (execute) { + ret = tracefs_hist_start(instance, hist); + if (ret < 0) { + fprintf(stderr, "%s\n", tracefs_error_last(instance)); + exit(-1); + } + } + } + + tracefs_synth_free(synth); + + trace_seq_do_printf(&seq); + trace_seq_destroy(&seq); + return 0; +} + +int main (int argc, char **argv) +{ + char *trace_dir = NULL; + char *buffer = NULL; + char buf[BUFSIZ]; + int buffer_size = 0; + const char *file = NULL; + const char *instance = NULL; + bool execute = false; + char **save_fields = NULL; + const char *name; + const char *var; + int action = 0; + char *tok; + FILE *fp; + size_t r; + int c; + int i; + + for (;;) { + c = getopt(argc, argv, "ht:f:en:m:c:sS:TB:"); + if (c == -1) + break; + + switch(c) { + case 'h': + usage(argv); + case 't': + trace_dir = optarg; + break; + case 'f': + file = optarg; + break; + case 'e': + execute = true; + break; + case 'm': + action |= ACTION_MAX; + var = optarg; + break; + case 'c': + action |= ACTION_CHANGE; + var = optarg; + break; + case 's': + action |= ACTION_SNAPSHOT; + break; + case 'S': + action |= ACTION_SAVE; + tok = strtok(optarg, ","); + while (tok) { + save_fields = tracefs_list_add(save_fields, tok); + tok = strtok(NULL, ","); + } + if (!save_fields) { + perror(optarg); + exit(-1); + } + break; + case 'T': + action |= ACTION_TRACE | ACTION_SNAPSHOT; + break; + case 'B': + instance = optarg; + break; + case 'n': + name = optarg; + break; + } + } + + if ((action & (ACTION_MAX|ACTION_CHANGE)) == (ACTION_MAX|ACTION_CHANGE)) { + fprintf(stderr, "Can not use both -m and -c together\n"); + exit(-1); + } + if (file) { + if (!strcmp(file, "-")) + fp = stdin; + else + fp = fopen(file, "r"); + if (!fp) { + perror(file); + exit(-1); + } + while ((r = fread(buf, 1, BUFSIZ, fp)) > 0) { + buffer = realloc(buffer, buffer_size + r + 1); + strncpy(buffer + buffer_size, buf, r); + buffer_size += r; + } + fclose(fp); + if (buffer_size) + buffer[buffer_size] = '\0'; + } else if (argc == optind) { + usage(argv); + } else { + for (i = optind; i < argc; i++) { + r = strlen(argv[i]); + buffer = realloc(buffer, buffer_size + r + 2); + if (i != optind) + buffer[buffer_size++] = ' '; + strcpy(buffer + buffer_size, argv[i]); + buffer_size += r; + } + } + + do_sql(instance, buffer, name, var, trace_dir, execute, action, save_fields); + free(buffer); + + return 0; +} +-- + +FILES +----- +[verse] +-- +*tracefs.h* + Header file to include in order to have access to the library APIs. +*-ltracefs* + Linker switch to add when building a program that uses the library. +-- + +SEE ALSO +-------- +*sqlhist*(1), +*libtracefs*(3), +*libtraceevent*(3), +*trace-cmd*(1), +*tracefs_synth_init*(3), +*tracefs_synth_add_match_field*(3), +*tracefs_synth_add_compare_field*(3), +*tracefs_synth_add_start_field*(3), +*tracefs_synth_add_end_field*(3), +*tracefs_synth_append_start_filter*(3), +*tracefs_synth_append_end_filter*(3), +*tracefs_synth_create*(3), +*tracefs_synth_destroy*(3), +*tracefs_synth_free*(3), +*tracefs_synth_echo_cmd*(3), +*tracefs_hist_alloc*(3), +*tracefs_hist_alloc_2d*(3), +*tracefs_hist_alloc_nd*(3), +*tracefs_hist_free*(3), +*tracefs_hist_add_key*(3), +*tracefs_hist_add_value*(3), +*tracefs_hist_add_name*(3), +*tracefs_hist_start*(3), +*tracefs_hist_destory*(3), +*tracefs_hist_add_sort_key*(3), +*tracefs_hist_sort_key_direction*(3) + +AUTHOR +------ +[verse] +-- +*Steven Rostedt* <rostedt@goodmis.org> +*Tzvetomir Stoyanov* <tz.stoyanov@gmail.com> +*sameeruddin shaik* <sameeruddin.shaik8@gmail.com> +-- +REPORTING BUGS +-------------- +Report bugs to <linux-trace-devel@vger.kernel.org> + +LICENSE +------- +libtracefs is Free Software licensed under the GNU LGPL 2.1 + +RESOURCES +--------- +https://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git/ + +COPYING +------- +Copyright \(C) 2020 VMware, Inc. Free use of this software is granted under +the terms of the GNU Public License (GPL). |