summaryrefslogtreecommitdiffstats
path: root/Documentation/libtracefs-sql.txt
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/libtracefs-sql.txt')
-rw-r--r--Documentation/libtracefs-sql.txt628
1 files changed, 628 insertions, 0 deletions
diff --git a/Documentation/libtracefs-sql.txt b/Documentation/libtracefs-sql.txt
new file mode 100644
index 0000000..6d606db
--- /dev/null
+++ b/Documentation/libtracefs-sql.txt
@@ -0,0 +1,628 @@
+libtracefs(3)
+=============
+
+NAME
+----
+tracefs_sql - Create a synthetic event via an SQL statement
+
+SYNOPSIS
+--------
+[verse]
+--
+*#include <tracefs.h>*
+
+struct tracefs_synth pass:[*]*tracefs_sql*(struct tep_handle pass:[*]_tep_, const char pass:[*]_name_,
+ const char pass:[*]_sql_buffer_, char pass:[**]_err_);
+--
+
+DESCRIPTION
+-----------
+Synthetic events are dynamically created events that attach two existing events
+together via one or more matching fields between the two events. It can be used
+to find the latency between the events, or to simply pass fields of the first event
+on to the second event to display as one event.
+
+The Linux kernel interface to create synthetic events is complex, and there needs
+to be a better way to create synthetic events that is easy and can be understood
+via existing technology.
+
+If you think of each event as a table, where the fields are the column of the table
+and each instance of the event as a row, you can understand how SQL can be used
+to attach two events together and form another event (table). Utilizing the
+SQL *SELECT* *FROM* *JOIN* *ON* [ *WHERE* ] syntax, a synthetic event can easily
+be created from two different events.
+
+For simple SQL queries to make a histogram instead of a synthetic event, see
+HISTOGRAMS below.
+
+*tracefs_sql*() takes in a _tep_ handler (See _tep_local_events_(3)) that is used to
+verify the events within the _sql_buffer_ expression. The _name_ is the name of the
+synthetic event to create. If _err_ points to an address of a string, it will be filled
+with a detailed message on any type of parsing error, including fields that do not belong
+to an event, or if the events or fields are not properly compared.
+
+The example program below is a fully functional parser where it will create a synthetic
+event from a SQL syntax passed in via the command line or a file.
+
+The SQL format is as follows:
+
+*SELECT* <fields> *FROM* <start-event> *JOIN* <end-event> *ON* <matching-fields> *WHERE* <filter>
+
+Note, although the examples show the SQL commands in uppercase, they are not required to
+be so. That is, you can use "SELECT" or "select" or "sElEct".
+
+For example:
+[source,c]
+--
+SELECT syscalls.sys_enter_read.fd, syscalls.sys_exit_read.ret FROM syscalls.sys_enter_read
+ JOIN syscalls.sys_exit_read
+ ON syscalls.sys_enter_read.common_pid = syscalls.sys_exit_write.common_pid
+--
+
+Will create a synthetic event that with the fields:
+
+ u64 fd; s64 ret;
+
+Because the function takes a _tep_ handle, and usually all event names are unique, you can
+leave off the system (group) name of the event, and *tracefs_sql*() will discover the
+system for you.
+
+That is, the above statement would work with:
+
+[source,c]
+--
+SELECT sys_enter_read.fd, sys_exit_read.ret FROM sys_enter_read JOIN sys_exit_read
+ ON sys_enter_read.common_pid = sys_exit_write.common_pid
+--
+
+The *AS* keyword can be used to name the fields as well as to give an alias to the
+events, such that the above can be simplified even more as:
+
+[source,c]
+--
+SELECT start.fd, end.ret FROM sys_enter_read AS start JOIN sys_exit_read AS end ON start.common_pid = end.common_pid
+--
+
+The above aliases _sys_enter_read_ as *start* and _sys_exit_read_ as *end* and uses
+those aliases to reference the event throughout the statement.
+
+Using the *AS* keyword in the selection portion of the SQL statement will define what
+those fields will be called in the synthetic event.
+
+[source,c]
+--
+SELECT start.fd AS filed, end.ret AS return FROM sys_enter_read AS start JOIN sys_exit_read AS end
+ ON start.common_pid = end.common_pid
+--
+
+The above labels the _fd_ of _start_ as *filed* and the _ret_ of _end_ as *return* where
+the synthetic event that is created will now have the fields:
+
+ u64 filed; s64 return;
+
+The fields can also be calculated with results passed to the synthetic event:
+
+[source,c]
+--
+select start.truesize, end.len, (start.truesize - end.len) as diff from napi_gro_receive_entry as start
+ JOIN netif_receive_skb as end ON start.skbaddr = end.skbaddr
+--
+
+Which would show the *truesize* of the _napi_gro_receive_entry_ event, the actual
+_len_ of the content, shown by the _netif_receive_skb_, and the delta between
+the two and expressed by the field *diff*.
+
+The code also supports recording the timestamps at either event, and performing calculations
+on them. For wakeup latency, you have:
+
+[source,c]
+--
+select start.pid, (end.TIMESTAMP_USECS - start.TIMESTAMP_USECS) as lat from sched_waking as start
+ JOIN sched_switch as end ON start.pid = end.next_pid
+--
+
+The above will create a synthetic event that records the _pid_ of the task being woken up,
+and the time difference between the _sched_waking_ event and the _sched_switch_ event.
+The *TIMESTAMP_USECS* will truncate the time down to microseconds as the timestamp usually
+recorded in the tracing buffer has nanosecond resolution. If you do not want that
+truncation, use *TIMESTAMP* instead of *TIMESTAMP_USECS*.
+
+Finally, the *WHERE* clause can be added, that will let you add filters on either or both events.
+
+[source,c]
+--
+select start.pid, (end.TIMESTAMP_USECS - start.TIMESTAMP_USECS) as lat from sched_waking as start
+ JOIN sched_switch as end ON start.pid = end.next_pid
+ WHERE start.prio < 100 && (!(end.prev_pid < 1 || end.prev_prio > 100) || end.prev_pid == 0)
+--
+
+*NOTE*
+
+Although both events can be used together in the *WHERE* clause, they must not be mixed outside
+the top most "&&" statements. You can not OR (||) the events together, where a filter of one
+event is OR'd to a filter of the other event. This does not make sense, as the synthetic event
+requires both events to take place to be recorded. If one is filtered out, then the synthetic
+event does not execute.
+
+[source,c]
+--
+select start.pid, (end.TIMESTAMP_USECS - start.TIMESTAMP_USECS) as lat from sched_waking as start
+ JOIN sched_switch as end ON start.pid = end.next_pid
+ WHERE start.prio < 100 && end.prev_prio < 100
+--
+
+The above is valid.
+
+Where as the below is not.
+
+[source,c]
+--
+select start.pid, (end.TIMESTAMP_USECS - start.TIMESTAMP_USECS) as lat from sched_waking as start
+ JOIN sched_switch as end ON start.pid = end.next_pid
+ WHERE start.prio < 100 || end.prev_prio < 100
+--
+
+
+KEYWORDS AS EVENT FIELDS
+------------------------
+
+In some cases, an event may have a keyword. For example, regcache_drop_region has "from"
+as a field and the following will not work
+
+[source,c]
+--
+ select from from regcache_drop_region
+--
+
+In such cases, add a backslash to the conflicting field, and this will tell the parser
+that the "from" is a field and not a keyword:
+
+[source,c]
+--
+ select \from from regcache_drop_region
+--
+
+HISTOGRAMS
+----------
+
+Simple SQL statements without the *JOIN* *ON* may also be used, which will create a histogram
+instead. When doing this, the struct tracefs_hist descriptor can be retrieved from the
+returned synthetic event descriptor via the *tracefs_synth_get_start_hist*(3).
+
+In order to utilize the histogram types (see xxx) the CAST command of SQL can be used.
+
+That is:
+
+[source,c]
+--
+ select CAST(common_pid AS comm), CAST(id AS syscall) FROM sys_enter
+--
+
+Which produces:
+
+[source,c]
+--
+ # echo 'hist:keys=common_pid.execname,id.syscall' > events/raw_syscalls/sys_enter/trigger
+
+ # cat events/raw_syscalls/sys_enter/hist
+
+{ common_pid: bash [ 18248], id: sys_setpgid [109] } hitcount: 1
+{ common_pid: sendmail [ 1812], id: sys_read [ 0] } hitcount: 1
+{ common_pid: bash [ 18247], id: sys_getpid [ 39] } hitcount: 1
+{ common_pid: bash [ 18247], id: sys_dup2 [ 33] } hitcount: 1
+{ common_pid: gmain [ 13684], id: sys_inotify_add_watch [254] } hitcount: 1
+{ common_pid: cat [ 18247], id: sys_access [ 21] } hitcount: 1
+{ common_pid: bash [ 18248], id: sys_getpid [ 39] } hitcount: 1
+{ common_pid: cat [ 18247], id: sys_fadvise64 [221] } hitcount: 1
+{ common_pid: sendmail [ 1812], id: sys_openat [257] } hitcount: 1
+{ common_pid: less [ 18248], id: sys_munmap [ 11] } hitcount: 1
+{ common_pid: sendmail [ 1812], id: sys_close [ 3] } hitcount: 1
+{ common_pid: gmain [ 1534], id: sys_poll [ 7] } hitcount: 1
+{ common_pid: bash [ 18247], id: sys_execve [ 59] } hitcount: 1
+--
+
+Note, string fields may not be cast.
+
+The possible types to cast to are:
+
+*HEX* - convert the value to use hex and not decimal
+
+*SYM* - convert a pointer to symbolic (kallsyms values)
+
+*SYM-OFFSET* - convert a pointer to symbolic and include the offset.
+
+*SYSCALL* - convert the number to the mapped system call name
+
+*EXECNAME* or *COMM* - can only be used with the common_pid field. Will show the task
+name of the process.
+
+*LOG* or *LOG2* - bucket the key values in a log 2 values (1, 2, 3-4, 5-8, 9-16, 17-32, ...)
+
+The above fields are not case sensitive, and "LOG2" works as good as "log".
+
+A special CAST to _COUNTER_ or __COUNTER__ will make the field a value and not
+a key. For example:
+
+[source,c]
+--
+ SELECT common_pid, CAST(bytes_req AS _COUNTER_) FROM kmalloc
+--
+
+Which will create
+
+[source,c]
+--
+ echo 'hist:keys=common_pid:vals=bytes_req' > events/kmem/kmalloc/trigger
+
+ cat events/kmem/kmalloc/hist
+
+{ common_pid: 1812 } hitcount: 1 bytes_req: 32
+{ common_pid: 9111 } hitcount: 2 bytes_req: 272
+{ common_pid: 1768 } hitcount: 3 bytes_req: 1112
+{ common_pid: 0 } hitcount: 4 bytes_req: 512
+{ common_pid: 18297 } hitcount: 11 bytes_req: 2004
+--
+
+RETURN VALUE
+------------
+Returns 0 on success and -1 on failure. On failure, if _err_ is defined, it will be
+allocated to hold a detailed description of what went wrong if it the error was caused
+by a parsing error, or that an event, field does not exist or is not compatible with
+what it was combined with.
+
+CREATE A TOOL
+-------------
+
+The below example is a functional program that can be used to parse SQL commands into
+synthetic events.
+
+[source, c]
+--
+ man tracefs_sql | sed -ne '/^EXAMPLE/,/FILES/ { /EXAMPLE/d ; /FILES/d ; p}' > sqlhist.c
+ gcc -o sqlhist sqlhist.c `pkg-config --cflags --libs libtracefs`
+--
+
+Then you can run the above examples:
+
+[source, c]
+--
+ sudo ./sqlhist 'select start.pid, (end.TIMESTAMP_USECS - start.TIMESTAMP_USECS) as lat from sched_waking as start
+ JOIN sched_switch as end ON start.pid = end.next_pid
+ WHERE start.prio < 100 || end.prev_prio < 100'
+--
+EXAMPLE
+-------
+[source,c]
+--
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <tracefs.h>
+
+static void usage(char **argv)
+{
+ fprintf(stderr, "usage: %s [-ed][-n name][-s][-S fields][-m var][-c var][-T][-t dir][-f file | sql-command-line]\n"
+ " -n name - name of synthetic event 'Anonymous' if left off\n"
+ " -t dir - use dir instead of /sys/kernel/tracing\n"
+ " -e - execute the commands to create the synthetic event\n"
+ " -m - trigger the action when var is a new max.\n"
+ " -c - trigger the action when var changes.\n"
+ " -s - used with -m or -c to do a snapshot of the tracing buffer\n"
+ " -S - used with -m or -c to save fields of the end event (comma deliminated)\n"
+ " -T - used with -m or -c to do both a snapshot and a trace\n"
+ " -f file - read sql lines from file otherwise from the command line\n"
+ " if file is '-' then read from standard input.\n",
+ argv[0]);
+ exit(-1);
+}
+
+enum action {
+ ACTION_DEFAULT = 0,
+ ACTION_SNAPSHOT = (1 << 0),
+ ACTION_TRACE = (1 << 1),
+ ACTION_SAVE = (1 << 2),
+ ACTION_MAX = (1 << 3),
+ ACTION_CHANGE = (1 << 4),
+};
+
+#define ACTIONS ((ACTION_MAX - 1))
+
+static int do_sql(const char *instance_name,
+ const char *buffer, const char *name, const char *var,
+ const char *trace_dir, bool execute, int action,
+ char **save_fields)
+{
+ struct tracefs_synth *synth;
+ struct tep_handle *tep;
+ struct trace_seq seq;
+ enum tracefs_synth_handler handler;
+ char *err;
+ int ret;
+
+ if ((action & ACTIONS) && !var) {
+ fprintf(stderr, "Error: -s, -S and -T not supported without -m or -c");
+ exit(-1);
+ }
+
+ if (!name)
+ name = "Anonymous";
+
+ trace_seq_init(&seq);
+ tep = tracefs_local_events(trace_dir);
+ if (!tep) {
+ if (!trace_dir)
+ trace_dir = "tracefs directory";
+ perror(trace_dir);
+ exit(-1);
+ }
+
+ synth = tracefs_sql(tep, name, buffer, &err);
+ if (!synth) {
+ perror("Failed creating synthetic event!");
+ if (err)
+ fprintf(stderr, "%s", err);
+ free(err);
+ exit(-1);
+ }
+
+ if (tracefs_synth_complete(synth)) {
+ if (var) {
+ if (action & ACTION_MAX)
+ handler = TRACEFS_SYNTH_HANDLE_MAX;
+ else
+ handler = TRACEFS_SYNTH_HANDLE_CHANGE;
+
+ if (action & ACTION_SAVE) {
+ ret = tracefs_synth_save(synth, handler, var, save_fields);
+ if (ret < 0) {
+ err = "adding save";
+ goto failed_action;
+ }
+ }
+ if (action & ACTION_TRACE) {
+ /*
+ * By doing the trace before snapshot, it will be included
+ * in the snapshot.
+ */
+ ret = tracefs_synth_trace(synth, handler, var);
+ if (ret < 0) {
+ err = "adding trace";
+ goto failed_action;
+ }
+ }
+ if (action & ACTION_SNAPSHOT) {
+ ret = tracefs_synth_snapshot(synth, handler, var);
+ if (ret < 0) {
+ err = "adding snapshot";
+ failed_action:
+ perror(err);
+ if (errno == ENODEV)
+ fprintf(stderr, "ERROR: '%s' is not a variable\n",
+ var);
+ exit(-1);
+ }
+ }
+ }
+ tracefs_synth_echo_cmd(&seq, synth);
+ if (execute) {
+ ret = tracefs_synth_create(synth);
+ if (ret < 0) {
+ fprintf(stderr, "%s\n", tracefs_error_last(NULL));
+ exit(-1);
+ }
+ }
+ } else {
+ struct tracefs_instance *instance = NULL;
+ struct tracefs_hist *hist;
+
+ hist = tracefs_synth_get_start_hist(synth);
+ if (!hist) {
+ perror("get_start_hist");
+ exit(-1);
+ }
+ if (instance_name) {
+ if (execute)
+ instance = tracefs_instance_create(instance_name);
+ else
+ instance = tracefs_instance_alloc(trace_dir,
+ instance_name);
+ if (!instance) {
+ perror("Failed to create instance");
+ exit(-1);
+ }
+ }
+ tracefs_hist_echo_cmd(&seq, instance, hist, 0);
+ if (execute) {
+ ret = tracefs_hist_start(instance, hist);
+ if (ret < 0) {
+ fprintf(stderr, "%s\n", tracefs_error_last(instance));
+ exit(-1);
+ }
+ }
+ }
+
+ tracefs_synth_free(synth);
+
+ trace_seq_do_printf(&seq);
+ trace_seq_destroy(&seq);
+ return 0;
+}
+
+int main (int argc, char **argv)
+{
+ char *trace_dir = NULL;
+ char *buffer = NULL;
+ char buf[BUFSIZ];
+ int buffer_size = 0;
+ const char *file = NULL;
+ const char *instance = NULL;
+ bool execute = false;
+ char **save_fields = NULL;
+ const char *name;
+ const char *var;
+ int action = 0;
+ char *tok;
+ FILE *fp;
+ size_t r;
+ int c;
+ int i;
+
+ for (;;) {
+ c = getopt(argc, argv, "ht:f:en:m:c:sS:TB:");
+ if (c == -1)
+ break;
+
+ switch(c) {
+ case 'h':
+ usage(argv);
+ case 't':
+ trace_dir = optarg;
+ break;
+ case 'f':
+ file = optarg;
+ break;
+ case 'e':
+ execute = true;
+ break;
+ case 'm':
+ action |= ACTION_MAX;
+ var = optarg;
+ break;
+ case 'c':
+ action |= ACTION_CHANGE;
+ var = optarg;
+ break;
+ case 's':
+ action |= ACTION_SNAPSHOT;
+ break;
+ case 'S':
+ action |= ACTION_SAVE;
+ tok = strtok(optarg, ",");
+ while (tok) {
+ save_fields = tracefs_list_add(save_fields, tok);
+ tok = strtok(NULL, ",");
+ }
+ if (!save_fields) {
+ perror(optarg);
+ exit(-1);
+ }
+ break;
+ case 'T':
+ action |= ACTION_TRACE | ACTION_SNAPSHOT;
+ break;
+ case 'B':
+ instance = optarg;
+ break;
+ case 'n':
+ name = optarg;
+ break;
+ }
+ }
+
+ if ((action & (ACTION_MAX|ACTION_CHANGE)) == (ACTION_MAX|ACTION_CHANGE)) {
+ fprintf(stderr, "Can not use both -m and -c together\n");
+ exit(-1);
+ }
+ if (file) {
+ if (!strcmp(file, "-"))
+ fp = stdin;
+ else
+ fp = fopen(file, "r");
+ if (!fp) {
+ perror(file);
+ exit(-1);
+ }
+ while ((r = fread(buf, 1, BUFSIZ, fp)) > 0) {
+ buffer = realloc(buffer, buffer_size + r + 1);
+ strncpy(buffer + buffer_size, buf, r);
+ buffer_size += r;
+ }
+ fclose(fp);
+ if (buffer_size)
+ buffer[buffer_size] = '\0';
+ } else if (argc == optind) {
+ usage(argv);
+ } else {
+ for (i = optind; i < argc; i++) {
+ r = strlen(argv[i]);
+ buffer = realloc(buffer, buffer_size + r + 2);
+ if (i != optind)
+ buffer[buffer_size++] = ' ';
+ strcpy(buffer + buffer_size, argv[i]);
+ buffer_size += r;
+ }
+ }
+
+ do_sql(instance, buffer, name, var, trace_dir, execute, action, save_fields);
+ free(buffer);
+
+ return 0;
+}
+--
+
+FILES
+-----
+[verse]
+--
+*tracefs.h*
+ Header file to include in order to have access to the library APIs.
+*-ltracefs*
+ Linker switch to add when building a program that uses the library.
+--
+
+SEE ALSO
+--------
+*sqlhist*(1),
+*libtracefs*(3),
+*libtraceevent*(3),
+*trace-cmd*(1),
+*tracefs_synth_init*(3),
+*tracefs_synth_add_match_field*(3),
+*tracefs_synth_add_compare_field*(3),
+*tracefs_synth_add_start_field*(3),
+*tracefs_synth_add_end_field*(3),
+*tracefs_synth_append_start_filter*(3),
+*tracefs_synth_append_end_filter*(3),
+*tracefs_synth_create*(3),
+*tracefs_synth_destroy*(3),
+*tracefs_synth_free*(3),
+*tracefs_synth_echo_cmd*(3),
+*tracefs_hist_alloc*(3),
+*tracefs_hist_alloc_2d*(3),
+*tracefs_hist_alloc_nd*(3),
+*tracefs_hist_free*(3),
+*tracefs_hist_add_key*(3),
+*tracefs_hist_add_value*(3),
+*tracefs_hist_add_name*(3),
+*tracefs_hist_start*(3),
+*tracefs_hist_destory*(3),
+*tracefs_hist_add_sort_key*(3),
+*tracefs_hist_sort_key_direction*(3)
+
+AUTHOR
+------
+[verse]
+--
+*Steven Rostedt* <rostedt@goodmis.org>
+*Tzvetomir Stoyanov* <tz.stoyanov@gmail.com>
+*sameeruddin shaik* <sameeruddin.shaik8@gmail.com>
+--
+REPORTING BUGS
+--------------
+Report bugs to <linux-trace-devel@vger.kernel.org>
+
+LICENSE
+-------
+libtracefs is Free Software licensed under the GNU LGPL 2.1
+
+RESOURCES
+---------
+https://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git/
+
+COPYING
+-------
+Copyright \(C) 2020 VMware, Inc. Free use of this software is granted under
+the terms of the GNU Public License (GPL).