Diffstat (limited to 'src/bin/pgbench/pgbench.c')
-rw-r--r-- | src/bin/pgbench/pgbench.c | 7854
1 file changed, 7854 insertions, 0 deletions
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c new file mode 100644 index 0000000..e0cd754 --- /dev/null +++ b/src/bin/pgbench/pgbench.c @@ -0,0 +1,7854 @@ +/* + * pgbench.c + * + * A simple benchmark program for PostgreSQL + * Originally written by Tatsuo Ishii and enhanced by many contributors. + * + * src/bin/pgbench/pgbench.c + * Copyright (c) 2000-2022, PostgreSQL Global Development Group + * ALL RIGHTS RESERVED; + * + * Permission to use, copy, modify, and distribute this software and its + * documentation for any purpose, without fee, and without a written agreement + * is hereby granted, provided that the above copyright notice and this + * paragraph and the following two paragraphs appear in all copies. + * + * IN NO EVENT SHALL THE AUTHOR OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING + * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS + * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * THE AUTHOR AND DISTRIBUTORS SPECIFICALLY DISCLAIMS ANY WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + * + */ + +#ifdef WIN32 +#define FD_SETSIZE 1024 /* must set before winsock2.h is included */ +#endif + +#include "postgres_fe.h" + +#include <ctype.h> +#include <float.h> +#include <limits.h> +#include <math.h> +#include <signal.h> +#include <time.h> +#include <sys/time.h> +#ifdef HAVE_SYS_RESOURCE_H +#include <sys/resource.h> /* for getrlimit */ +#endif + +/* For testing, PGBENCH_USE_SELECT can be defined to force use of that code */ +#if defined(HAVE_PPOLL) && !defined(PGBENCH_USE_SELECT) +#define POLL_USING_PPOLL +#ifdef HAVE_POLL_H +#include <poll.h> +#endif +#else /* no ppoll(), so use select() */ +#define POLL_USING_SELECT +#ifdef HAVE_SYS_SELECT_H +#include <sys/select.h> +#endif +#endif + +#include "common/int.h" +#include "common/logging.h" +#include "common/pg_prng.h" +#include "common/string.h" +#include "common/username.h" +#include "fe_utils/cancel.h" +#include "fe_utils/conditional.h" +#include "fe_utils/option_utils.h" +#include "fe_utils/string_utils.h" +#include "getopt_long.h" +#include "libpq-fe.h" +#include "pgbench.h" +#include "port/pg_bitutils.h" +#include "portability/instr_time.h" + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +#define ERRCODE_T_R_SERIALIZATION_FAILURE "40001" +#define ERRCODE_T_R_DEADLOCK_DETECTED "40P01" +#define ERRCODE_UNDEFINED_TABLE "42P01" + +/* + * Hashing constants + */ +#define FNV_PRIME UINT64CONST(0x100000001b3) +#define FNV_OFFSET_BASIS UINT64CONST(0xcbf29ce484222325) +#define MM2_MUL UINT64CONST(0xc6a4a7935bd1e995) +#define MM2_MUL_TIMES_8 UINT64CONST(0x35253c9ade8f4ca8) +#define MM2_ROT 47 + +/* + * Multi-platform socket set implementations + */ + +#ifdef POLL_USING_PPOLL +#define SOCKET_WAIT_METHOD "ppoll" + +typedef struct socket_set +{ + int maxfds; /* allocated length of pollfds[] array */ + int curfds; /* number currently in use */ + struct pollfd pollfds[FLEXIBLE_ARRAY_MEMBER]; +} socket_set; + +#endif /* POLL_USING_PPOLL */ + +#ifdef POLL_USING_SELECT +#define SOCKET_WAIT_METHOD "select" + +typedef struct socket_set +{ + int maxfd; /* largest FD currently set in fds */ + 
fd_set fds; +} socket_set; + +#endif /* POLL_USING_SELECT */ + +/* + * Multi-platform thread implementations + */ + +#ifdef WIN32 +/* Use Windows threads */ +#include <windows.h> +#define GETERRNO() (_dosmaperr(GetLastError()), errno) +#define THREAD_T HANDLE +#define THREAD_FUNC_RETURN_TYPE unsigned +#define THREAD_FUNC_RETURN return 0 +#define THREAD_FUNC_CC __stdcall +#define THREAD_CREATE(handle, function, arg) \ + ((*(handle) = (HANDLE) _beginthreadex(NULL, 0, (function), (arg), 0, NULL)) == 0 ? errno : 0) +#define THREAD_JOIN(handle) \ + (WaitForSingleObject(handle, INFINITE) != WAIT_OBJECT_0 ? \ + GETERRNO() : CloseHandle(handle) ? 0 : GETERRNO()) +#define THREAD_BARRIER_T SYNCHRONIZATION_BARRIER +#define THREAD_BARRIER_INIT(barrier, n) \ + (InitializeSynchronizationBarrier((barrier), (n), 0) ? 0 : GETERRNO()) +#define THREAD_BARRIER_WAIT(barrier) \ + EnterSynchronizationBarrier((barrier), \ + SYNCHRONIZATION_BARRIER_FLAGS_BLOCK_ONLY) +#define THREAD_BARRIER_DESTROY(barrier) +#elif defined(ENABLE_THREAD_SAFETY) +/* Use POSIX threads */ +#include "port/pg_pthread.h" +#define THREAD_T pthread_t +#define THREAD_FUNC_RETURN_TYPE void * +#define THREAD_FUNC_RETURN return NULL +#define THREAD_FUNC_CC +#define THREAD_CREATE(handle, function, arg) \ + pthread_create((handle), NULL, (function), (arg)) +#define THREAD_JOIN(handle) \ + pthread_join((handle), NULL) +#define THREAD_BARRIER_T pthread_barrier_t +#define THREAD_BARRIER_INIT(barrier, n) \ + pthread_barrier_init((barrier), NULL, (n)) +#define THREAD_BARRIER_WAIT(barrier) pthread_barrier_wait((barrier)) +#define THREAD_BARRIER_DESTROY(barrier) pthread_barrier_destroy((barrier)) +#else +/* No threads implementation, use none (-j 1) */ +#define THREAD_T void * +#define THREAD_FUNC_RETURN_TYPE void * +#define THREAD_FUNC_RETURN return NULL +#define THREAD_FUNC_CC +#define THREAD_BARRIER_T int +#define THREAD_BARRIER_INIT(barrier, n) (*(barrier) = 0) +#define THREAD_BARRIER_WAIT(barrier) +#define THREAD_BARRIER_DESTROY(barrier) +#endif + + +/******************************************************************** + * some configurable parameters */ + +#define DEFAULT_INIT_STEPS "dtgvp" /* default -I setting */ +#define ALL_INIT_STEPS "dtgGvpf" /* all possible steps */ + +#define LOG_STEP_SECONDS 5 /* seconds between log messages */ +#define DEFAULT_NXACTS 10 /* default nxacts */ + +#define MIN_GAUSSIAN_PARAM 2.0 /* minimum parameter for gauss */ + +#define MIN_ZIPFIAN_PARAM 1.001 /* minimum parameter for zipfian */ +#define MAX_ZIPFIAN_PARAM 1000.0 /* maximum parameter for zipfian */ + +int nxacts = 0; /* number of transactions per client */ +int duration = 0; /* duration in seconds */ +int64 end_time = 0; /* when to stop in micro seconds, under -T */ + +/* + * scaling factor. for example, scale = 10 will make 1000000 tuples in + * pgbench_accounts table. + */ +int scale = 1; + +/* + * fillfactor. for example, fillfactor = 90 will use only 90 percent + * space during inserts and leave 10 percent free. + */ +int fillfactor = 100; + +/* + * use unlogged tables? + */ +bool unlogged_tables = false; + +/* + * log sampling rate (1.0 = log everything, 0.0 = option not given) + */ +double sample_rate = 0.0; + +/* + * When threads are throttled to a given rate limit, this is the target delay + * to reach that rate in usec. 0 is the default and means no throttling. + */ +double throttle_delay = 0; + +/* + * Transactions which take longer than this limit (in usec) are counted as + * late, and reported as such, although they are completed anyway. 
When + * throttling is enabled, execution time slots that are more than this late + * are skipped altogether, and counted separately. + */ +int64 latency_limit = 0; + +/* + * tablespace selection + */ +char *tablespace = NULL; +char *index_tablespace = NULL; + +/* + * Number of "pgbench_accounts" partitions. 0 is the default and means no + * partitioning. + */ +static int partitions = 0; + +/* partitioning strategy for "pgbench_accounts" */ +typedef enum +{ + PART_NONE, /* no partitioning */ + PART_RANGE, /* range partitioning */ + PART_HASH /* hash partitioning */ +} partition_method_t; + +static partition_method_t partition_method = PART_NONE; +static const char *PARTITION_METHOD[] = {"none", "range", "hash"}; + +/* random seed used to initialize base_random_sequence */ +int64 random_seed = -1; + +/* + * end of configurable parameters + *********************************************************************/ + +#define nbranches 1 /* Makes little sense to change this. Change + * -s instead */ +#define ntellers 10 +#define naccounts 100000 + +/* + * The scale factor at/beyond which 32bit integers are incapable of storing + * 64bit values. + * + * Although the actual threshold is 21474, we use 20000 because it is easier to + * document and remember, and isn't that far away from the real threshold. + */ +#define SCALE_32BIT_THRESHOLD 20000 + +bool use_log; /* log transaction latencies to a file */ +bool use_quiet; /* quiet logging onto stderr */ +int agg_interval; /* log aggregates instead of individual + * transactions */ +bool per_script_stats = false; /* whether to collect stats per script */ +int progress = 0; /* thread progress report every this seconds */ +bool progress_timestamp = false; /* progress report with Unix time */ +int nclients = 1; /* number of clients */ +int nthreads = 1; /* number of threads */ +bool is_connect; /* establish connection for each transaction */ +bool report_per_command = false; /* report per-command latencies, + * retries after errors and failures + * (errors without retrying) */ +int main_pid; /* main process id used in log filename */ + +/* + * There are different types of restrictions for deciding that the current + * transaction with a serialization/deadlock error can no longer be retried and + * should be reported as failed: + * - max_tries (--max-tries) can be used to limit the number of tries; + * - latency_limit (-L) can be used to limit the total time of tries; + * - duration (-T) can be used to limit the total benchmark time. + * + * They can be combined together, and you need to use at least one of them to + * retry the transactions with serialization/deadlock errors. If none of them is + * used, the default value of max_tries is 1 and such transactions will not be + * retried. + */ + +/* + * We cannot retry a transaction after the serialization/deadlock error if its + * number of tries reaches this maximum; if its value is zero, it is not used. + */ +uint32 max_tries = 1; + +bool failures_detailed = false; /* whether to group failures in + * reports or logs by basic types */ + +const char *pghost = NULL; +const char *pgport = NULL; +const char *username = NULL; +const char *dbName = NULL; +char *logfile_prefix = NULL; +const char *progname; + +#define WSEP '@' /* weight separator */ + +volatile bool timer_exceeded = false; /* flag from signal handler */ + +/* + * We don't want to allocate variables one by one; for efficiency, add a + * constant margin each time it overflows. + */ +#define VARIABLES_ALLOC_MARGIN 8 + +/* + * Variable definitions. 
+ * + * If a variable only has a string value, "svalue" is that value, and value is + * "not set". If the value is known, "value" contains the value (in any + * variant). + * + * In this case "svalue" contains the string equivalent of the value, if we've + * had occasion to compute that, or NULL if we haven't. + */ +typedef struct +{ + char *name; /* variable's name */ + char *svalue; /* its value in string form, if known */ + PgBenchValue value; /* actual variable's value */ +} Variable; + +/* + * Data structure for client variables. + */ +typedef struct +{ + Variable *vars; /* array of variable definitions */ + int nvars; /* number of variables */ + + /* + * The maximum number of variables that we can currently store in 'vars' + * without having to reallocate more space. We must always have max_vars + * >= nvars. + */ + int max_vars; + + bool vars_sorted; /* are variables sorted by name? */ +} Variables; + +#define MAX_SCRIPTS 128 /* max number of SQL scripts allowed */ +#define SHELL_COMMAND_SIZE 256 /* maximum size allowed for shell command */ + +/* + * Simple data structure to keep stats about something. + * + * XXX probably the first value should be kept and used as an offset for + * better numerical stability... + */ +typedef struct SimpleStats +{ + int64 count; /* how many values were encountered */ + double min; /* the minimum seen */ + double max; /* the maximum seen */ + double sum; /* sum of values */ + double sum2; /* sum of squared values */ +} SimpleStats; + +/* + * The instr_time type is expensive when dealing with time arithmetic. Define + * a type to hold microseconds instead. Type int64 is good enough for about + * 584500 years. + */ +typedef int64 pg_time_usec_t; + +/* + * Data structure to hold various statistics: per-thread and per-script stats + * are maintained and merged together. + */ +typedef struct StatsData +{ + pg_time_usec_t start_time; /* interval start time, for aggregates */ + + /*---------- + * Transactions are counted depending on their execution and outcome. + * First a transaction may have started or not: skipped transactions occur + * under --rate and --latency-limit when the client is too late to execute + * them. Secondly, a started transaction may ultimately succeed or fail, + * possibly after some retries when --max-tries is not one. Thus + * + * the number of all transactions = + * 'skipped' (it was too late to execute them) + + * 'cnt' (the number of successful transactions) + + * 'failed' (the number of failed transactions). + * + * A successful transaction can have several unsuccessful tries before a + * successful run. Thus + * + * 'cnt' (the number of successful transactions) = + * successfully retried transactions (they got a serialization or a + * deadlock error(s), but were + * successfully retried from the very + * beginning) + + * directly successful transactions (they were successfully completed on + * the first try). + * + * A failed transaction is defined as unsuccessfully retried transactions. + * It can be one of two types: + * + * failed (the number of failed transactions) = + * 'serialization_failures' (they got a serialization error and were not + * successfully retried) + + * 'deadlock_failures' (they got a deadlock error and were not + * successfully retried). + * + * If the transaction was retried after a serialization or a deadlock + * error this does not guarantee that this retry was successful. 
Thus + * + * 'retries' (number of retries) = + * number of retries in all retried transactions = + * number of retries in (successfully retried transactions + + * failed transactions); + * + * 'retried' (number of all retried transactions) = + * successfully retried transactions + + * failed transactions. + *---------- + */ + int64 cnt; /* number of successful transactions, not + * including 'skipped' */ + int64 skipped; /* number of transactions skipped under --rate + * and --latency-limit */ + int64 retries; /* number of retries after a serialization or + * a deadlock error in all the transactions */ + int64 retried; /* number of all transactions that were + * retried after a serialization or a deadlock + * error (perhaps the last try was + * unsuccessful) */ + int64 serialization_failures; /* number of transactions that were + * not successfully retried after a + * serialization error */ + int64 deadlock_failures; /* number of transactions that were not + * successfully retried after a deadlock + * error */ + SimpleStats latency; + SimpleStats lag; +} StatsData; + +/* + * For displaying Unix epoch timestamps, as some time functions may have + * another reference. + */ +pg_time_usec_t epoch_shift; + +/* + * Error status for errors during script execution. + */ +typedef enum EStatus +{ + ESTATUS_NO_ERROR = 0, + ESTATUS_META_COMMAND_ERROR, + + /* SQL errors */ + ESTATUS_SERIALIZATION_ERROR, + ESTATUS_DEADLOCK_ERROR, + ESTATUS_OTHER_SQL_ERROR +} EStatus; + +/* + * Transaction status at the end of a command. + */ +typedef enum TStatus +{ + TSTATUS_IDLE, + TSTATUS_IN_BLOCK, + TSTATUS_CONN_ERROR, + TSTATUS_OTHER_ERROR +} TStatus; + +/* Various random sequences are initialized from this one. */ +static pg_prng_state base_random_sequence; + +/* Synchronization barrier for start and connection */ +static THREAD_BARRIER_T barrier; + +/* + * Connection state machine states. + */ +typedef enum +{ + /* + * The client must first choose a script to execute. Once chosen, it can + * either be throttled (state CSTATE_PREPARE_THROTTLE under --rate), start + * right away (state CSTATE_START_TX) or not start at all if the timer was + * exceeded (state CSTATE_FINISHED). + */ + CSTATE_CHOOSE_SCRIPT, + + /* + * CSTATE_START_TX performs start-of-transaction processing. Establishes + * a new connection for the transaction in --connect mode, records the + * transaction start time, and proceed to the first command. + * + * Note: once a script is started, it will either error or run till its + * end, where it may be interrupted. It is not interrupted while running, + * so pgbench --time is to be understood as tx are allowed to start in + * that time, and will finish when their work is completed. + */ + CSTATE_START_TX, + + /* + * In CSTATE_PREPARE_THROTTLE state, we calculate when to begin the next + * transaction, and advance to CSTATE_THROTTLE. CSTATE_THROTTLE state + * sleeps until that moment, then advances to CSTATE_START_TX, or + * CSTATE_FINISHED if the next transaction would start beyond the end of + * the run. + */ + CSTATE_PREPARE_THROTTLE, + CSTATE_THROTTLE, + + /* + * We loop through these states, to process each command in the script: + * + * CSTATE_START_COMMAND starts the execution of a command. On a SQL + * command, the command is sent to the server, and we move to + * CSTATE_WAIT_RESULT state unless in pipeline mode. On a \sleep + * meta-command, the timer is set, and we enter the CSTATE_SLEEP state to + * wait for it to expire. Other meta-commands are executed immediately. 
If + * the command about to start is actually beyond the end of the script, + * advance to CSTATE_END_TX. + * + * CSTATE_WAIT_RESULT waits until we get a result set back from the server + * for the current command. + * + * CSTATE_SLEEP waits until the end of \sleep. + * + * CSTATE_END_COMMAND records the end-of-command timestamp, increments the + * command counter, and loops back to CSTATE_START_COMMAND state. + * + * CSTATE_SKIP_COMMAND is used by conditional branches which are not + * executed. It quickly skip commands that do not need any evaluation. + * This state can move forward several commands, till there is something + * to do or the end of the script. + */ + CSTATE_START_COMMAND, + CSTATE_WAIT_RESULT, + CSTATE_SLEEP, + CSTATE_END_COMMAND, + CSTATE_SKIP_COMMAND, + + /* + * States for failed commands. + * + * If the SQL/meta command fails, in CSTATE_ERROR clean up after an error: + * (1) clear the conditional stack; (2) if we have an unterminated + * (possibly failed) transaction block, send the rollback command to the + * server and wait for the result in CSTATE_WAIT_ROLLBACK_RESULT. If + * something goes wrong with rolling back, go to CSTATE_ABORTED. + * + * But if everything is ok we are ready for future transactions: if this + * is a serialization or deadlock error and we can re-execute the + * transaction from the very beginning, go to CSTATE_RETRY; otherwise go + * to CSTATE_FAILURE. + * + * In CSTATE_RETRY report an error, set the same parameters for the + * transaction execution as in the previous tries and process the first + * transaction command in CSTATE_START_COMMAND. + * + * In CSTATE_FAILURE report a failure, set the parameters for the + * transaction execution as they were before the first run of this + * transaction (except for a random state) and go to CSTATE_END_TX to + * complete this transaction. + */ + CSTATE_ERROR, + CSTATE_WAIT_ROLLBACK_RESULT, + CSTATE_RETRY, + CSTATE_FAILURE, + + /* + * CSTATE_END_TX performs end-of-transaction processing. It calculates + * latency, and logs the transaction. In --connect mode, it closes the + * current connection. + * + * Then either starts over in CSTATE_CHOOSE_SCRIPT, or enters + * CSTATE_FINISHED if we have no more work to do. + */ + CSTATE_END_TX, + + /* + * Final states. CSTATE_ABORTED means that the script execution was + * aborted because a command failed, CSTATE_FINISHED means success. + */ + CSTATE_ABORTED, + CSTATE_FINISHED +} ConnectionStateEnum; + +/* + * Connection state. + */ +typedef struct +{ + PGconn *con; /* connection handle to DB */ + int id; /* client No. */ + ConnectionStateEnum state; /* state machine's current state. */ + ConditionalStack cstack; /* enclosing conditionals state */ + + /* + * Separate randomness for each client. This is used for random functions + * PGBENCH_RANDOM_* during the execution of the script. 
+ */ + pg_prng_state cs_func_rs; + + int use_file; /* index in sql_script for this client */ + int command; /* command number in script */ + + /* client variables */ + Variables variables; + + /* various times about current transaction in microseconds */ + pg_time_usec_t txn_scheduled; /* scheduled start time of transaction */ + pg_time_usec_t sleep_until; /* scheduled start time of next cmd */ + pg_time_usec_t txn_begin; /* used for measuring schedule lag times */ + pg_time_usec_t stmt_begin; /* used for measuring statement latencies */ + + /* whether client prepared each command of each script */ + bool **prepared; + + /* + * For processing failures and repeating transactions with serialization + * or deadlock errors: + */ + EStatus estatus; /* the error status of the current transaction + * execution; this is ESTATUS_NO_ERROR if + * there were no errors */ + pg_prng_state random_state; /* random state */ + uint32 tries; /* how many times have we already tried the + * current transaction? */ + + /* per client collected stats */ + int64 cnt; /* client transaction count, for -t; skipped + * and failed transactions are also counted + * here */ +} CState; + +/* + * Thread state + */ +typedef struct +{ + int tid; /* thread id */ + THREAD_T thread; /* thread handle */ + CState *state; /* array of CState */ + int nstate; /* length of state[] */ + + /* + * Separate randomness for each thread. Each thread option uses its own + * random state to make all of them independent of each other and + * therefore deterministic at the thread level. + */ + pg_prng_state ts_choose_rs; /* random state for selecting a script */ + pg_prng_state ts_throttle_rs; /* random state for transaction throttling */ + pg_prng_state ts_sample_rs; /* random state for log sampling */ + + int64 throttle_trigger; /* previous/next throttling (us) */ + FILE *logfile; /* where to log, or NULL */ + + /* per thread collected stats in microseconds */ + pg_time_usec_t create_time; /* thread creation time */ + pg_time_usec_t started_time; /* thread is running */ + pg_time_usec_t bench_start; /* thread is benchmarking */ + pg_time_usec_t conn_duration; /* cumulated connection and disconnection + * delays */ + + StatsData stats; + int64 latency_late; /* count executed but late transactions */ +} TState; + +/* + * queries read from files + */ +#define SQL_COMMAND 1 +#define META_COMMAND 2 + +/* + * max number of backslash command arguments or SQL variables, + * including the command or SQL statement itself + */ +#define MAX_ARGS 256 + +typedef enum MetaCommand +{ + META_NONE, /* not a known meta-command */ + META_SET, /* \set */ + META_SETSHELL, /* \setshell */ + META_SHELL, /* \shell */ + META_SLEEP, /* \sleep */ + META_GSET, /* \gset */ + META_ASET, /* \aset */ + META_IF, /* \if */ + META_ELIF, /* \elif */ + META_ELSE, /* \else */ + META_ENDIF, /* \endif */ + META_STARTPIPELINE, /* \startpipeline */ + META_ENDPIPELINE /* \endpipeline */ +} MetaCommand; + +typedef enum QueryMode +{ + QUERY_SIMPLE, /* simple query */ + QUERY_EXTENDED, /* extended query */ + QUERY_PREPARED, /* extended query with prepared statements */ + NUM_QUERYMODE +} QueryMode; + +static QueryMode querymode = QUERY_SIMPLE; +static const char *QUERYMODE[] = {"simple", "extended", "prepared"}; + +/* + * struct Command represents one command in a script. + * + * lines The raw, possibly multi-line command text. Variable substitution + * not applied. + * first_line A short, single-line extract of 'lines', for error reporting. 
+ * type SQL_COMMAND or META_COMMAND + * meta The type of meta-command, with META_NONE/GSET/ASET if command + * is SQL. + * argc Number of arguments of the command, 0 if not yet processed. + * argv Command arguments, the first of which is the command or SQL + * string itself. For SQL commands, after post-processing + * argv[0] is the same as 'lines' with variables substituted. + * prepname The name that this command is prepared under, in prepare mode + * varprefix SQL commands terminated with \gset or \aset have this set + * to a non NULL value. If nonempty, it's used to prefix the + * variable name that receives the value. + * aset do gset on all possible queries of a combined query (\;). + * expr Parsed expression, if needed. + * stats Time spent in this command. + * retries Number of retries after a serialization or deadlock error in the + * current command. + * failures Number of errors in the current command that were not retried. + */ +typedef struct Command +{ + PQExpBufferData lines; + char *first_line; + int type; + MetaCommand meta; + int argc; + char *argv[MAX_ARGS]; + char *prepname; + char *varprefix; + PgBenchExpr *expr; + SimpleStats stats; + int64 retries; + int64 failures; +} Command; + +typedef struct ParsedScript +{ + const char *desc; /* script descriptor (eg, file name) */ + int weight; /* selection weight */ + Command **commands; /* NULL-terminated array of Commands */ + StatsData stats; /* total time spent in script */ +} ParsedScript; + +static ParsedScript sql_script[MAX_SCRIPTS]; /* SQL script files */ +static int num_scripts; /* number of scripts in sql_script[] */ +static int64 total_weight = 0; + +static bool verbose_errors = false; /* print verbose messages of all errors */ + +/* Builtin test scripts */ +typedef struct BuiltinScript +{ + const char *name; /* very short name for -b ... 
*/ + const char *desc; /* short description */ + const char *script; /* actual pgbench script */ +} BuiltinScript; + +static const BuiltinScript builtin_script[] = +{ + { + "tpcb-like", + "<builtin: TPC-B (sort of)>", + "\\set aid random(1, " CppAsString2(naccounts) " * :scale)\n" + "\\set bid random(1, " CppAsString2(nbranches) " * :scale)\n" + "\\set tid random(1, " CppAsString2(ntellers) " * :scale)\n" + "\\set delta random(-5000, 5000)\n" + "BEGIN;\n" + "UPDATE pgbench_accounts SET abalance = abalance + :delta WHERE aid = :aid;\n" + "SELECT abalance FROM pgbench_accounts WHERE aid = :aid;\n" + "UPDATE pgbench_tellers SET tbalance = tbalance + :delta WHERE tid = :tid;\n" + "UPDATE pgbench_branches SET bbalance = bbalance + :delta WHERE bid = :bid;\n" + "INSERT INTO pgbench_history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, CURRENT_TIMESTAMP);\n" + "END;\n" + }, + { + "simple-update", + "<builtin: simple update>", + "\\set aid random(1, " CppAsString2(naccounts) " * :scale)\n" + "\\set bid random(1, " CppAsString2(nbranches) " * :scale)\n" + "\\set tid random(1, " CppAsString2(ntellers) " * :scale)\n" + "\\set delta random(-5000, 5000)\n" + "BEGIN;\n" + "UPDATE pgbench_accounts SET abalance = abalance + :delta WHERE aid = :aid;\n" + "SELECT abalance FROM pgbench_accounts WHERE aid = :aid;\n" + "INSERT INTO pgbench_history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, CURRENT_TIMESTAMP);\n" + "END;\n" + }, + { + "select-only", + "<builtin: select only>", + "\\set aid random(1, " CppAsString2(naccounts) " * :scale)\n" + "SELECT abalance FROM pgbench_accounts WHERE aid = :aid;\n" + } +}; + + +/* Function prototypes */ +static void setNullValue(PgBenchValue *pv); +static void setBoolValue(PgBenchValue *pv, bool bval); +static void setIntValue(PgBenchValue *pv, int64 ival); +static void setDoubleValue(PgBenchValue *pv, double dval); +static bool evaluateExpr(CState *st, PgBenchExpr *expr, + PgBenchValue *retval); +static ConnectionStateEnum executeMetaCommand(CState *st, pg_time_usec_t *now); +static void doLog(TState *thread, CState *st, + StatsData *agg, bool skipped, double latency, double lag); +static void processXactStats(TState *thread, CState *st, pg_time_usec_t *now, + bool skipped, StatsData *agg); +static void addScript(const ParsedScript *script); +static THREAD_FUNC_RETURN_TYPE THREAD_FUNC_CC threadRun(void *arg); +static void finishCon(CState *st); +static void setalarm(int seconds); +static socket_set *alloc_socket_set(int count); +static void free_socket_set(socket_set *sa); +static void clear_socket_set(socket_set *sa); +static void add_socket_to_set(socket_set *sa, int fd, int idx); +static int wait_on_socket_set(socket_set *sa, int64 usecs); +static bool socket_has_input(socket_set *sa, int fd, int idx); + + +/* callback functions for our flex lexer */ +static const PsqlScanCallbacks pgbench_callbacks = { + NULL, /* don't need get_variable functionality */ +}; + +static inline pg_time_usec_t +pg_time_now(void) +{ + instr_time now; + + INSTR_TIME_SET_CURRENT(now); + + return (pg_time_usec_t) INSTR_TIME_GET_MICROSEC(now); +} + +static inline void +pg_time_now_lazy(pg_time_usec_t *now) +{ + if ((*now) == 0) + (*now) = pg_time_now(); +} + +#define PG_TIME_GET_DOUBLE(t) (0.000001 * (t)) + +static void +usage(void) +{ + printf("%s is a benchmarking tool for PostgreSQL.\n\n" + "Usage:\n" + " %s [OPTION]... 
[DBNAME]\n" + "\nInitialization options:\n" + " -i, --initialize invokes initialization mode\n" + " -I, --init-steps=[" ALL_INIT_STEPS "]+ (default \"" DEFAULT_INIT_STEPS "\")\n" + " run selected initialization steps\n" + " -F, --fillfactor=NUM set fill factor\n" + " -n, --no-vacuum do not run VACUUM during initialization\n" + " -q, --quiet quiet logging (one message each 5 seconds)\n" + " -s, --scale=NUM scaling factor\n" + " --foreign-keys create foreign key constraints between tables\n" + " --index-tablespace=TABLESPACE\n" + " create indexes in the specified tablespace\n" + " --partition-method=(range|hash)\n" + " partition pgbench_accounts with this method (default: range)\n" + " --partitions=NUM partition pgbench_accounts into NUM parts (default: 0)\n" + " --tablespace=TABLESPACE create tables in the specified tablespace\n" + " --unlogged-tables create tables as unlogged tables\n" + "\nOptions to select what to run:\n" + " -b, --builtin=NAME[@W] add builtin script NAME weighted at W (default: 1)\n" + " (use \"-b list\" to list available scripts)\n" + " -f, --file=FILENAME[@W] add script FILENAME weighted at W (default: 1)\n" + " -N, --skip-some-updates skip updates of pgbench_tellers and pgbench_branches\n" + " (same as \"-b simple-update\")\n" + " -S, --select-only perform SELECT-only transactions\n" + " (same as \"-b select-only\")\n" + "\nBenchmarking options:\n" + " -c, --client=NUM number of concurrent database clients (default: 1)\n" + " -C, --connect establish new connection for each transaction\n" + " -D, --define=VARNAME=VALUE\n" + " define variable for use by custom script\n" + " -j, --jobs=NUM number of threads (default: 1)\n" + " -l, --log write transaction times to log file\n" + " -L, --latency-limit=NUM count transactions lasting more than NUM ms as late\n" + " -M, --protocol=simple|extended|prepared\n" + " protocol for submitting queries (default: simple)\n" + " -n, --no-vacuum do not run VACUUM before tests\n" + " -P, --progress=NUM show thread progress report every NUM seconds\n" + " -r, --report-per-command report latencies, failures, and retries per command\n" + " -R, --rate=NUM target rate in transactions per second\n" + " -s, --scale=NUM report this scale factor in output\n" + " -t, --transactions=NUM number of transactions each client runs (default: 10)\n" + " -T, --time=NUM duration of benchmark test in seconds\n" + " -v, --vacuum-all vacuum all four standard tables before tests\n" + " --aggregate-interval=NUM aggregate data over NUM seconds\n" + " --failures-detailed report the failures grouped by basic types\n" + " --log-prefix=PREFIX prefix for transaction time log file\n" + " (default: \"pgbench_log\")\n" + " --max-tries=NUM max number of tries to run transaction (default: 1)\n" + " --progress-timestamp use Unix epoch timestamps for progress\n" + " --random-seed=SEED set random seed (\"time\", \"rand\", integer)\n" + " --sampling-rate=NUM fraction of transactions to log (e.g., 0.01 for 1%%)\n" + " --show-script=NAME show builtin script code, then exit\n" + " --verbose-errors print messages of all errors\n" + "\nCommon options:\n" + " -d, --debug print debugging output\n" + " -h, --host=HOSTNAME database server host or socket directory\n" + " -p, --port=PORT database server port number\n" + " -U, --username=USERNAME connect as specified database user\n" + " -V, --version output version information, then exit\n" + " -?, --help show this help, then exit\n" + "\n" + "Report bugs to <%s>.\n" + "%s home page: <%s>\n", + progname, progname, PACKAGE_BUGREPORT, 
PACKAGE_NAME, PACKAGE_URL); +} + +/* return whether str matches "^\s*[-+]?[0-9]+$" */ +static bool +is_an_int(const char *str) +{ + const char *ptr = str; + + /* skip leading spaces; cast is consistent with strtoint64 */ + while (*ptr && isspace((unsigned char) *ptr)) + ptr++; + + /* skip sign */ + if (*ptr == '+' || *ptr == '-') + ptr++; + + /* at least one digit */ + if (*ptr && !isdigit((unsigned char) *ptr)) + return false; + + /* eat all digits */ + while (*ptr && isdigit((unsigned char) *ptr)) + ptr++; + + /* must have reached end of string */ + return *ptr == '\0'; +} + + +/* + * strtoint64 -- convert a string to 64-bit integer + * + * This function is a slightly modified version of pg_strtoint64() from + * src/backend/utils/adt/numutils.c. + * + * The function returns whether the conversion worked, and if so + * "*result" is set to the result. + * + * If not errorOK, an error message is also printed out on errors. + */ +bool +strtoint64(const char *str, bool errorOK, int64 *result) +{ + const char *ptr = str; + int64 tmp = 0; + bool neg = false; + + /* + * Do our own scan, rather than relying on sscanf which might be broken + * for long long. + * + * As INT64_MIN can't be stored as a positive 64 bit integer, accumulate + * value as a negative number. + */ + + /* skip leading spaces */ + while (*ptr && isspace((unsigned char) *ptr)) + ptr++; + + /* handle sign */ + if (*ptr == '-') + { + ptr++; + neg = true; + } + else if (*ptr == '+') + ptr++; + + /* require at least one digit */ + if (unlikely(!isdigit((unsigned char) *ptr))) + goto invalid_syntax; + + /* process digits */ + while (*ptr && isdigit((unsigned char) *ptr)) + { + int8 digit = (*ptr++ - '0'); + + if (unlikely(pg_mul_s64_overflow(tmp, 10, &tmp)) || + unlikely(pg_sub_s64_overflow(tmp, digit, &tmp))) + goto out_of_range; + } + + /* allow trailing whitespace, but not other trailing chars */ + while (*ptr != '\0' && isspace((unsigned char) *ptr)) + ptr++; + + if (unlikely(*ptr != '\0')) + goto invalid_syntax; + + if (!neg) + { + if (unlikely(tmp == PG_INT64_MIN)) + goto out_of_range; + tmp = -tmp; + } + + *result = tmp; + return true; + +out_of_range: + if (!errorOK) + pg_log_error("value \"%s\" is out of range for type bigint", str); + return false; + +invalid_syntax: + if (!errorOK) + pg_log_error("invalid input syntax for type bigint: \"%s\"", str); + return false; +} + +/* convert string to double, detecting overflows/underflows */ +bool +strtodouble(const char *str, bool errorOK, double *dv) +{ + char *end; + + errno = 0; + *dv = strtod(str, &end); + + if (unlikely(errno != 0)) + { + if (!errorOK) + pg_log_error("value \"%s\" is out of range for type double", str); + return false; + } + + if (unlikely(end == str || *end != '\0')) + { + if (!errorOK) + pg_log_error("invalid input syntax for type double: \"%s\"", str); + return false; + } + return true; +} + +/* + * Initialize a prng state struct. + * + * We derive the seed from base_random_sequence, which must be set up already. + */ +static void +initRandomState(pg_prng_state *state) +{ + pg_prng_seed(state, pg_prng_uint64(&base_random_sequence)); +} + + +/* + * random number generator: uniform distribution from min to max inclusive. + * + * Although the limits are expressed as int64, you can't generate the full + * int64 range in one call, because the difference of the limits mustn't + * overflow int64. This is not checked. 
+ */ +static int64 +getrand(pg_prng_state *state, int64 min, int64 max) +{ + return min + (int64) pg_prng_uint64_range(state, 0, max - min); +} + +/* + * random number generator: exponential distribution from min to max inclusive. + * the parameter is so that the density of probability for the last cut-off max + * value is exp(-parameter). + */ +static int64 +getExponentialRand(pg_prng_state *state, int64 min, int64 max, + double parameter) +{ + double cut, + uniform, + rand; + + /* abort if wrong parameter, but must really be checked beforehand */ + Assert(parameter > 0.0); + cut = exp(-parameter); + /* pg_prng_double value in [0, 1), uniform in (0, 1] */ + uniform = 1.0 - pg_prng_double(state); + + /* + * inner expression in (cut, 1] (if parameter > 0), rand in [0, 1) + */ + Assert((1.0 - cut) != 0.0); + rand = -log(cut + (1.0 - cut) * uniform) / parameter; + /* return int64 random number within between min and max */ + return min + (int64) ((max - min + 1) * rand); +} + +/* random number generator: gaussian distribution from min to max inclusive */ +static int64 +getGaussianRand(pg_prng_state *state, int64 min, int64 max, + double parameter) +{ + double stdev; + double rand; + + /* abort if parameter is too low, but must really be checked beforehand */ + Assert(parameter >= MIN_GAUSSIAN_PARAM); + + /* + * Get user specified random number from this loop, with -parameter < + * stdev <= parameter + * + * This loop is executed until the number is in the expected range. + * + * As the minimum parameter is 2.0, the probability of looping is low: + * sqrt(-2 ln(r)) <= 2 => r >= e^{-2} ~ 0.135, then when taking the + * average sinus multiplier as 2/pi, we have a 8.6% looping probability in + * the worst case. For a parameter value of 5.0, the looping probability + * is about e^{-5} * 2 / pi ~ 0.43%. + */ + do + { + /* + * pg_prng_double generates [0, 1), but for the basic version of the + * Box-Muller transform the two uniformly distributed random numbers + * are expected to be in (0, 1] (see + * https://en.wikipedia.org/wiki/Box-Muller_transform) + */ + double rand1 = 1.0 - pg_prng_double(state); + double rand2 = 1.0 - pg_prng_double(state); + + /* Box-Muller basic form transform */ + double var_sqrt = sqrt(-2.0 * log(rand1)); + + stdev = var_sqrt * sin(2.0 * M_PI * rand2); + + /* + * we may try with cos, but there may be a bias induced if the + * previous value fails the test. To be on the safe side, let us try + * over. + */ + } + while (stdev < -parameter || stdev >= parameter); + + /* stdev is in [-parameter, parameter), normalization to [0,1) */ + rand = (stdev + parameter) / (parameter * 2.0); + + /* return int64 random number within between min and max */ + return min + (int64) ((max - min + 1) * rand); +} + +/* + * random number generator: generate a value, such that the series of values + * will approximate a Poisson distribution centered on the given value. + * + * Individual results are rounded to integers, though the center value need + * not be one. + */ +static int64 +getPoissonRand(pg_prng_state *state, double center) +{ + /* + * Use inverse transform sampling to generate a value > 0, such that the + * expected (i.e. average) value is the given argument. + */ + double uniform; + + /* pg_prng_double value in [0, 1), uniform in (0, 1] */ + uniform = 1.0 - pg_prng_double(state); + + return (int64) (-log(uniform) * center + 0.5); +} + +/* + * Computing zipfian using rejection method, based on + * "Non-Uniform Random Variate Generation", + * Luc Devroye, p. 550-551, Springer 1986. 
+ * + * This works for s > 1.0, but may perform badly for s very close to 1.0. + */ +static int64 +computeIterativeZipfian(pg_prng_state *state, int64 n, double s) +{ + double b = pow(2.0, s - 1.0); + double x, + t, + u, + v; + + /* Ensure n is sane */ + if (n <= 1) + return 1; + + while (true) + { + /* random variates */ + u = pg_prng_double(state); + v = pg_prng_double(state); + + x = floor(pow(u, -1.0 / (s - 1.0))); + + t = pow(1.0 + 1.0 / x, s - 1.0); + /* reject if too large or out of bound */ + if (v * x * (t - 1.0) / (b - 1.0) <= t / b && x <= n) + break; + } + return (int64) x; +} + +/* random number generator: zipfian distribution from min to max inclusive */ +static int64 +getZipfianRand(pg_prng_state *state, int64 min, int64 max, double s) +{ + int64 n = max - min + 1; + + /* abort if parameter is invalid */ + Assert(MIN_ZIPFIAN_PARAM <= s && s <= MAX_ZIPFIAN_PARAM); + + return min - 1 + computeIterativeZipfian(state, n, s); +} + +/* + * FNV-1a hash function + */ +static int64 +getHashFnv1a(int64 val, uint64 seed) +{ + int64 result; + int i; + + result = FNV_OFFSET_BASIS ^ seed; + for (i = 0; i < 8; ++i) + { + int32 octet = val & 0xff; + + val = val >> 8; + result = result ^ octet; + result = result * FNV_PRIME; + } + + return result; +} + +/* + * Murmur2 hash function + * + * Based on original work of Austin Appleby + * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp + */ +static int64 +getHashMurmur2(int64 val, uint64 seed) +{ + uint64 result = seed ^ MM2_MUL_TIMES_8; /* sizeof(int64) */ + uint64 k = (uint64) val; + + k *= MM2_MUL; + k ^= k >> MM2_ROT; + k *= MM2_MUL; + + result ^= k; + result *= MM2_MUL; + + result ^= result >> MM2_ROT; + result *= MM2_MUL; + result ^= result >> MM2_ROT; + + return (int64) result; +} + +/* + * Pseudorandom permutation function + * + * For small sizes, this generates each of the (size!) possible permutations + * of integers in the range [0, size) with roughly equal probability. Once + * the size is larger than 20, the number of possible permutations exceeds the + * number of distinct states of the internal pseudorandom number generator, + * and so not all possible permutations can be generated, but the permutations + * chosen should continue to give the appearance of being random. + * + * THIS FUNCTION IS NOT CRYPTOGRAPHICALLY SECURE. + * DO NOT USE FOR SUCH PURPOSE. + */ +static int64 +permute(const int64 val, const int64 isize, const int64 seed) +{ + /* using a high-end PRNG is probably overkill */ + pg_prng_state state; + uint64 size; + uint64 v; + int masklen; + uint64 mask; + int i; + + if (isize < 2) + return 0; /* nothing to permute */ + + /* Initialize prng state using the seed */ + pg_prng_seed(&state, (uint64) seed); + + /* Computations are performed on unsigned values */ + size = (uint64) isize; + v = (uint64) val % size; + + /* Mask to work modulo largest power of 2 less than or equal to size */ + masklen = pg_leftmost_one_pos64(size); + mask = (((uint64) 1) << masklen) - 1; + + /* + * Permute the input value by applying several rounds of pseudorandom + * bijective transformations. The intention here is to distribute each + * input uniformly randomly across the range, and separate adjacent inputs + * approximately uniformly randomly from each other, leading to a fairly + * random overall choice of permutation. + * + * To separate adjacent inputs, we multiply by a random number modulo + * (mask + 1), which is a power of 2. For this to be a bijection, the + * multiplier must be odd. 
Since this is known to lead to less randomness + * in the lower bits, we also apply a rotation that shifts the topmost bit + * into the least significant bit. In the special cases where size <= 3, + * mask = 1 and each of these operations is actually a no-op, so we also + * XOR the value with a different random number to inject additional + * randomness. Since the size is generally not a power of 2, we apply + * this bijection on overlapping upper and lower halves of the input. + * + * To distribute the inputs uniformly across the range, we then also apply + * a random offset modulo the full range. + * + * Taken together, these operations resemble a modified linear + * congruential generator, as is commonly used in pseudorandom number + * generators. The number of rounds is fairly arbitrary, but six has been + * found empirically to give a fairly good tradeoff between performance + * and uniform randomness. For small sizes it selects each of the (size!) + * possible permutations with roughly equal probability. For larger + * sizes, not all permutations can be generated, but the intended random + * spread is still produced. + */ + for (i = 0; i < 6; i++) + { + uint64 m, + r, + t; + + /* Random multiply (by an odd number), XOR and rotate of lower half */ + m = (pg_prng_uint64(&state) & mask) | 1; + r = pg_prng_uint64(&state) & mask; + if (v <= mask) + { + v = ((v * m) ^ r) & mask; + v = ((v << 1) & mask) | (v >> (masklen - 1)); + } + + /* Random multiply (by an odd number), XOR and rotate of upper half */ + m = (pg_prng_uint64(&state) & mask) | 1; + r = pg_prng_uint64(&state) & mask; + t = size - 1 - v; + if (t <= mask) + { + t = ((t * m) ^ r) & mask; + t = ((t << 1) & mask) | (t >> (masklen - 1)); + v = size - 1 - t; + } + + /* Random offset */ + r = pg_prng_uint64_range(&state, 0, size - 1); + v = (v + r) % size; + } + + return (int64) v; +} + +/* + * Initialize the given SimpleStats struct to all zeroes + */ +static void +initSimpleStats(SimpleStats *ss) +{ + memset(ss, 0, sizeof(SimpleStats)); +} + +/* + * Accumulate one value into a SimpleStats struct. + */ +static void +addToSimpleStats(SimpleStats *ss, double val) +{ + if (ss->count == 0 || val < ss->min) + ss->min = val; + if (ss->count == 0 || val > ss->max) + ss->max = val; + ss->count++; + ss->sum += val; + ss->sum2 += val * val; +} + +/* + * Merge two SimpleStats objects + */ +static void +mergeSimpleStats(SimpleStats *acc, SimpleStats *ss) +{ + if (acc->count == 0 || ss->min < acc->min) + acc->min = ss->min; + if (acc->count == 0 || ss->max > acc->max) + acc->max = ss->max; + acc->count += ss->count; + acc->sum += ss->sum; + acc->sum2 += ss->sum2; +} + +/* + * Initialize a StatsData struct to mostly zeroes, with its start time set to + * the given value. + */ +static void +initStats(StatsData *sd, pg_time_usec_t start) +{ + sd->start_time = start; + sd->cnt = 0; + sd->skipped = 0; + sd->retries = 0; + sd->retried = 0; + sd->serialization_failures = 0; + sd->deadlock_failures = 0; + initSimpleStats(&sd->latency); + initSimpleStats(&sd->lag); +} + +/* + * Accumulate one additional item into the given stats object. + */ +static void +accumStats(StatsData *stats, bool skipped, double lat, double lag, + EStatus estatus, int64 tries) +{ + /* Record the skipped transaction */ + if (skipped) + { + /* no latency to record on skipped transactions */ + stats->skipped++; + return; + } + + /* + * Record the number of retries regardless of whether the transaction was + * successful or failed. 
+ */ + if (tries > 1) + { + stats->retries += (tries - 1); + stats->retried++; + } + + switch (estatus) + { + /* Record the successful transaction */ + case ESTATUS_NO_ERROR: + stats->cnt++; + + addToSimpleStats(&stats->latency, lat); + + /* and possibly the same for schedule lag */ + if (throttle_delay) + addToSimpleStats(&stats->lag, lag); + break; + + /* Record the failed transaction */ + case ESTATUS_SERIALIZATION_ERROR: + stats->serialization_failures++; + break; + case ESTATUS_DEADLOCK_ERROR: + stats->deadlock_failures++; + break; + default: + /* internal error which should never occur */ + pg_fatal("unexpected error status: %d", estatus); + } +} + +/* call PQexec() and exit() on failure */ +static void +executeStatement(PGconn *con, const char *sql) +{ + PGresult *res; + + res = PQexec(con, sql); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + pg_log_error("query failed: %s", PQerrorMessage(con)); + pg_log_error_detail("Query was: %s", sql); + exit(1); + } + PQclear(res); +} + +/* call PQexec() and complain, but without exiting, on failure */ +static void +tryExecuteStatement(PGconn *con, const char *sql) +{ + PGresult *res; + + res = PQexec(con, sql); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + pg_log_error("%s", PQerrorMessage(con)); + pg_log_error_detail("(ignoring this error and continuing anyway)"); + } + PQclear(res); +} + +/* set up a connection to the backend */ +static PGconn * +doConnect(void) +{ + PGconn *conn; + bool new_pass; + static char *password = NULL; + + /* + * Start the connection. Loop until we have a password if requested by + * backend. + */ + do + { +#define PARAMS_ARRAY_SIZE 7 + + const char *keywords[PARAMS_ARRAY_SIZE]; + const char *values[PARAMS_ARRAY_SIZE]; + + keywords[0] = "host"; + values[0] = pghost; + keywords[1] = "port"; + values[1] = pgport; + keywords[2] = "user"; + values[2] = username; + keywords[3] = "password"; + values[3] = password; + keywords[4] = "dbname"; + values[4] = dbName; + keywords[5] = "fallback_application_name"; + values[5] = progname; + keywords[6] = NULL; + values[6] = NULL; + + new_pass = false; + + conn = PQconnectdbParams(keywords, values, true); + + if (!conn) + { + pg_log_error("connection to database \"%s\" failed", dbName); + return NULL; + } + + if (PQstatus(conn) == CONNECTION_BAD && + PQconnectionNeedsPassword(conn) && + !password) + { + PQfinish(conn); + password = simple_prompt("Password: ", false); + new_pass = true; + } + } while (new_pass); + + /* check to see that the backend connection was successfully made */ + if (PQstatus(conn) == CONNECTION_BAD) + { + pg_log_error("%s", PQerrorMessage(conn)); + PQfinish(conn); + return NULL; + } + + return conn; +} + +/* qsort comparator for Variable array */ +static int +compareVariableNames(const void *v1, const void *v2) +{ + return strcmp(((const Variable *) v1)->name, + ((const Variable *) v2)->name); +} + +/* Locate a variable by name; returns NULL if unknown */ +static Variable * +lookupVariable(Variables *variables, char *name) +{ + Variable key; + + /* On some versions of Solaris, bsearch of zero items dumps core */ + if (variables->nvars <= 0) + return NULL; + + /* Sort if we have to */ + if (!variables->vars_sorted) + { + qsort((void *) variables->vars, variables->nvars, sizeof(Variable), + compareVariableNames); + variables->vars_sorted = true; + } + + /* Now we can search */ + key.name = name; + return (Variable *) bsearch((void *) &key, + (void *) variables->vars, + variables->nvars, + sizeof(Variable), + compareVariableNames); +} + +/* Get 
the value of a variable, in string form; returns NULL if unknown */ +static char * +getVariable(Variables *variables, char *name) +{ + Variable *var; + char stringform[64]; + + var = lookupVariable(variables, name); + if (var == NULL) + return NULL; /* not found */ + + if (var->svalue) + return var->svalue; /* we have it in string form */ + + /* We need to produce a string equivalent of the value */ + Assert(var->value.type != PGBT_NO_VALUE); + if (var->value.type == PGBT_NULL) + snprintf(stringform, sizeof(stringform), "NULL"); + else if (var->value.type == PGBT_BOOLEAN) + snprintf(stringform, sizeof(stringform), + "%s", var->value.u.bval ? "true" : "false"); + else if (var->value.type == PGBT_INT) + snprintf(stringform, sizeof(stringform), + INT64_FORMAT, var->value.u.ival); + else if (var->value.type == PGBT_DOUBLE) + snprintf(stringform, sizeof(stringform), + "%.*g", DBL_DIG, var->value.u.dval); + else /* internal error, unexpected type */ + Assert(0); + var->svalue = pg_strdup(stringform); + return var->svalue; +} + +/* Try to convert variable to a value; return false on failure */ +static bool +makeVariableValue(Variable *var) +{ + size_t slen; + + if (var->value.type != PGBT_NO_VALUE) + return true; /* no work */ + + slen = strlen(var->svalue); + + if (slen == 0) + /* what should it do on ""? */ + return false; + + if (pg_strcasecmp(var->svalue, "null") == 0) + { + setNullValue(&var->value); + } + + /* + * accept prefixes such as y, ye, n, no... but not for "o". 0/1 are + * recognized later as an int, which is converted to bool if needed. + */ + else if (pg_strncasecmp(var->svalue, "true", slen) == 0 || + pg_strncasecmp(var->svalue, "yes", slen) == 0 || + pg_strcasecmp(var->svalue, "on") == 0) + { + setBoolValue(&var->value, true); + } + else if (pg_strncasecmp(var->svalue, "false", slen) == 0 || + pg_strncasecmp(var->svalue, "no", slen) == 0 || + pg_strcasecmp(var->svalue, "off") == 0 || + pg_strcasecmp(var->svalue, "of") == 0) + { + setBoolValue(&var->value, false); + } + else if (is_an_int(var->svalue)) + { + /* if it looks like an int, it must be an int without overflow */ + int64 iv; + + if (!strtoint64(var->svalue, false, &iv)) + return false; + + setIntValue(&var->value, iv); + } + else /* type should be double */ + { + double dv; + + if (!strtodouble(var->svalue, true, &dv)) + { + pg_log_error("malformed variable \"%s\" value: \"%s\"", + var->name, var->svalue); + return false; + } + setDoubleValue(&var->value, dv); + } + return true; +} + +/* + * Check whether a variable's name is allowed. + * + * We allow any non-ASCII character, as well as ASCII letters, digits, and + * underscore. + * + * Keep this in sync with the definitions of variable name characters in + * "src/fe_utils/psqlscan.l", "src/bin/psql/psqlscanslash.l" and + * "src/bin/pgbench/exprscan.l". Also see parseVariable(), below. + * + * Note: this static function is copied from "src/bin/psql/variables.c" + * but changed to disallow variable names starting with a digit. 
+ */ +static bool +valid_variable_name(const char *name) +{ + const unsigned char *ptr = (const unsigned char *) name; + + /* Mustn't be zero-length */ + if (*ptr == '\0') + return false; + + /* must not start with [0-9] */ + if (IS_HIGHBIT_SET(*ptr) || + strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" + "_", *ptr) != NULL) + ptr++; + else + return false; + + /* remaining characters can include [0-9] */ + while (*ptr) + { + if (IS_HIGHBIT_SET(*ptr) || + strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" + "_0123456789", *ptr) != NULL) + ptr++; + else + return false; + } + + return true; +} + +/* + * Make sure there is enough space for 'needed' more variable in the variables + * array. + */ +static void +enlargeVariables(Variables *variables, int needed) +{ + /* total number of variables required now */ + needed += variables->nvars; + + if (variables->max_vars < needed) + { + variables->max_vars = needed + VARIABLES_ALLOC_MARGIN; + variables->vars = (Variable *) + pg_realloc(variables->vars, variables->max_vars * sizeof(Variable)); + } +} + +/* + * Lookup a variable by name, creating it if need be. + * Caller is expected to assign a value to the variable. + * Returns NULL on failure (bad name). + */ +static Variable * +lookupCreateVariable(Variables *variables, const char *context, char *name) +{ + Variable *var; + + var = lookupVariable(variables, name); + if (var == NULL) + { + /* + * Check for the name only when declaring a new variable to avoid + * overhead. + */ + if (!valid_variable_name(name)) + { + pg_log_error("%s: invalid variable name: \"%s\"", context, name); + return NULL; + } + + /* Create variable at the end of the array */ + enlargeVariables(variables, 1); + + var = &(variables->vars[variables->nvars]); + + var->name = pg_strdup(name); + var->svalue = NULL; + /* caller is expected to initialize remaining fields */ + + variables->nvars++; + /* we don't re-sort the array till we have to */ + variables->vars_sorted = false; + } + + return var; +} + +/* Assign a string value to a variable, creating it if need be */ +/* Returns false on failure (bad name) */ +static bool +putVariable(Variables *variables, const char *context, char *name, + const char *value) +{ + Variable *var; + char *val; + + var = lookupCreateVariable(variables, context, name); + if (!var) + return false; + + /* dup then free, in case value is pointing at this variable */ + val = pg_strdup(value); + + if (var->svalue) + free(var->svalue); + var->svalue = val; + var->value.type = PGBT_NO_VALUE; + + return true; +} + +/* Assign a value to a variable, creating it if need be */ +/* Returns false on failure (bad name) */ +static bool +putVariableValue(Variables *variables, const char *context, char *name, + const PgBenchValue *value) +{ + Variable *var; + + var = lookupCreateVariable(variables, context, name); + if (!var) + return false; + + if (var->svalue) + free(var->svalue); + var->svalue = NULL; + var->value = *value; + + return true; +} + +/* Assign an integer value to a variable, creating it if need be */ +/* Returns false on failure (bad name) */ +static bool +putVariableInt(Variables *variables, const char *context, char *name, + int64 value) +{ + PgBenchValue val; + + setIntValue(&val, value); + return putVariableValue(variables, context, name, &val); +} + +/* + * Parse a possible variable reference (:varname). + * + * "sql" points at a colon. 
If what follows it looks like a valid + * variable name, return a malloc'd string containing the variable name, + * and set *eaten to the number of characters consumed (including the colon). + * Otherwise, return NULL. + */ +static char * +parseVariable(const char *sql, int *eaten) +{ + int i = 1; /* starting at 1 skips the colon */ + char *name; + + /* keep this logic in sync with valid_variable_name() */ + if (IS_HIGHBIT_SET(sql[i]) || + strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" + "_", sql[i]) != NULL) + i++; + else + return NULL; + + while (IS_HIGHBIT_SET(sql[i]) || + strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" + "_0123456789", sql[i]) != NULL) + i++; + + name = pg_malloc(i); + memcpy(name, &sql[1], i - 1); + name[i - 1] = '\0'; + + *eaten = i; + return name; +} + +static char * +replaceVariable(char **sql, char *param, int len, char *value) +{ + int valueln = strlen(value); + + if (valueln > len) + { + size_t offset = param - *sql; + + *sql = pg_realloc(*sql, strlen(*sql) - len + valueln + 1); + param = *sql + offset; + } + + if (valueln != len) + memmove(param + valueln, param + len, strlen(param + len) + 1); + memcpy(param, value, valueln); + + return param + valueln; +} + +static char * +assignVariables(Variables *variables, char *sql) +{ + char *p, + *name, + *val; + + p = sql; + while ((p = strchr(p, ':')) != NULL) + { + int eaten; + + name = parseVariable(p, &eaten); + if (name == NULL) + { + while (*p == ':') + { + p++; + } + continue; + } + + val = getVariable(variables, name); + free(name); + if (val == NULL) + { + p++; + continue; + } + + p = replaceVariable(&sql, p, eaten, val); + } + + return sql; +} + +static void +getQueryParams(Variables *variables, const Command *command, + const char **params) +{ + int i; + + for (i = 0; i < command->argc - 1; i++) + params[i] = getVariable(variables, command->argv[i + 1]); +} + +static char * +valueTypeName(PgBenchValue *pval) +{ + if (pval->type == PGBT_NO_VALUE) + return "none"; + else if (pval->type == PGBT_NULL) + return "null"; + else if (pval->type == PGBT_INT) + return "int"; + else if (pval->type == PGBT_DOUBLE) + return "double"; + else if (pval->type == PGBT_BOOLEAN) + return "boolean"; + else + { + /* internal error, should never get there */ + Assert(false); + return NULL; + } +} + +/* get a value as a boolean, or tell if there is a problem */ +static bool +coerceToBool(PgBenchValue *pval, bool *bval) +{ + if (pval->type == PGBT_BOOLEAN) + { + *bval = pval->u.bval; + return true; + } + else /* NULL, INT or DOUBLE */ + { + pg_log_error("cannot coerce %s to boolean", valueTypeName(pval)); + *bval = false; /* suppress uninitialized-variable warnings */ + return false; + } +} + +/* + * Return true or false from an expression for conditional purposes. + * Non zero numerical values are true, zero and NULL are false. 
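+ *
+ * For example, in a script "\if 1", "\if 0.5" and "\if true" take the
+ * branch, while "\if 0", "\if 0.0", "\if false" and "\if null" do not.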
+ */ +static bool +valueTruth(PgBenchValue *pval) +{ + switch (pval->type) + { + case PGBT_NULL: + return false; + case PGBT_BOOLEAN: + return pval->u.bval; + case PGBT_INT: + return pval->u.ival != 0; + case PGBT_DOUBLE: + return pval->u.dval != 0.0; + default: + /* internal error, unexpected type */ + Assert(0); + return false; + } +} + +/* get a value as an int, tell if there is a problem */ +static bool +coerceToInt(PgBenchValue *pval, int64 *ival) +{ + if (pval->type == PGBT_INT) + { + *ival = pval->u.ival; + return true; + } + else if (pval->type == PGBT_DOUBLE) + { + double dval = rint(pval->u.dval); + + if (isnan(dval) || !FLOAT8_FITS_IN_INT64(dval)) + { + pg_log_error("double to int overflow for %f", dval); + return false; + } + *ival = (int64) dval; + return true; + } + else /* BOOLEAN or NULL */ + { + pg_log_error("cannot coerce %s to int", valueTypeName(pval)); + return false; + } +} + +/* get a value as a double, or tell if there is a problem */ +static bool +coerceToDouble(PgBenchValue *pval, double *dval) +{ + if (pval->type == PGBT_DOUBLE) + { + *dval = pval->u.dval; + return true; + } + else if (pval->type == PGBT_INT) + { + *dval = (double) pval->u.ival; + return true; + } + else /* BOOLEAN or NULL */ + { + pg_log_error("cannot coerce %s to double", valueTypeName(pval)); + return false; + } +} + +/* assign a null value */ +static void +setNullValue(PgBenchValue *pv) +{ + pv->type = PGBT_NULL; + pv->u.ival = 0; +} + +/* assign a boolean value */ +static void +setBoolValue(PgBenchValue *pv, bool bval) +{ + pv->type = PGBT_BOOLEAN; + pv->u.bval = bval; +} + +/* assign an integer value */ +static void +setIntValue(PgBenchValue *pv, int64 ival) +{ + pv->type = PGBT_INT; + pv->u.ival = ival; +} + +/* assign a double value */ +static void +setDoubleValue(PgBenchValue *pv, double dval) +{ + pv->type = PGBT_DOUBLE; + pv->u.dval = dval; +} + +static bool +isLazyFunc(PgBenchFunction func) +{ + return func == PGBENCH_AND || func == PGBENCH_OR || func == PGBENCH_CASE; +} + +/* lazy evaluation of some functions */ +static bool +evalLazyFunc(CState *st, + PgBenchFunction func, PgBenchExprLink *args, PgBenchValue *retval) +{ + PgBenchValue a1, + a2; + bool ba1, + ba2; + + Assert(isLazyFunc(func) && args != NULL && args->next != NULL); + + /* args points to first condition */ + if (!evaluateExpr(st, args->expr, &a1)) + return false; + + /* second condition for AND/OR and corresponding branch for CASE */ + args = args->next; + + switch (func) + { + case PGBENCH_AND: + if (a1.type == PGBT_NULL) + { + setNullValue(retval); + return true; + } + + if (!coerceToBool(&a1, &ba1)) + return false; + + if (!ba1) + { + setBoolValue(retval, false); + return true; + } + + if (!evaluateExpr(st, args->expr, &a2)) + return false; + + if (a2.type == PGBT_NULL) + { + setNullValue(retval); + return true; + } + else if (!coerceToBool(&a2, &ba2)) + return false; + else + { + setBoolValue(retval, ba2); + return true; + } + + return true; + + case PGBENCH_OR: + + if (a1.type == PGBT_NULL) + { + setNullValue(retval); + return true; + } + + if (!coerceToBool(&a1, &ba1)) + return false; + + if (ba1) + { + setBoolValue(retval, true); + return true; + } + + if (!evaluateExpr(st, args->expr, &a2)) + return false; + + if (a2.type == PGBT_NULL) + { + setNullValue(retval); + return true; + } + else if (!coerceToBool(&a2, &ba2)) + return false; + else + { + setBoolValue(retval, ba2); + return true; + } + + case PGBENCH_CASE: + /* when true, execute branch */ + if (valueTruth(&a1)) + return evaluateExpr(st, args->expr, 
retval); + + /* now args contains next condition or final else expression */ + args = args->next; + + /* final else case? */ + if (args->next == NULL) + return evaluateExpr(st, args->expr, retval); + + /* no, another when, proceed */ + return evalLazyFunc(st, PGBENCH_CASE, args, retval); + + default: + /* internal error, cannot get here */ + Assert(0); + break; + } + return false; +} + +/* maximum number of function arguments */ +#define MAX_FARGS 16 + +/* + * Recursive evaluation of standard functions, + * which do not require lazy evaluation. + */ +static bool +evalStandardFunc(CState *st, + PgBenchFunction func, PgBenchExprLink *args, + PgBenchValue *retval) +{ + /* evaluate all function arguments */ + int nargs = 0; + PgBenchValue vargs[MAX_FARGS]; + PgBenchExprLink *l = args; + bool has_null = false; + + for (nargs = 0; nargs < MAX_FARGS && l != NULL; nargs++, l = l->next) + { + if (!evaluateExpr(st, l->expr, &vargs[nargs])) + return false; + has_null |= vargs[nargs].type == PGBT_NULL; + } + + if (l != NULL) + { + pg_log_error("too many function arguments, maximum is %d", MAX_FARGS); + return false; + } + + /* NULL arguments */ + if (has_null && func != PGBENCH_IS && func != PGBENCH_DEBUG) + { + setNullValue(retval); + return true; + } + + /* then evaluate function */ + switch (func) + { + /* overloaded operators */ + case PGBENCH_ADD: + case PGBENCH_SUB: + case PGBENCH_MUL: + case PGBENCH_DIV: + case PGBENCH_MOD: + case PGBENCH_EQ: + case PGBENCH_NE: + case PGBENCH_LE: + case PGBENCH_LT: + { + PgBenchValue *lval = &vargs[0], + *rval = &vargs[1]; + + Assert(nargs == 2); + + /* overloaded type management, double if some double */ + if ((lval->type == PGBT_DOUBLE || + rval->type == PGBT_DOUBLE) && func != PGBENCH_MOD) + { + double ld, + rd; + + if (!coerceToDouble(lval, &ld) || + !coerceToDouble(rval, &rd)) + return false; + + switch (func) + { + case PGBENCH_ADD: + setDoubleValue(retval, ld + rd); + return true; + + case PGBENCH_SUB: + setDoubleValue(retval, ld - rd); + return true; + + case PGBENCH_MUL: + setDoubleValue(retval, ld * rd); + return true; + + case PGBENCH_DIV: + setDoubleValue(retval, ld / rd); + return true; + + case PGBENCH_EQ: + setBoolValue(retval, ld == rd); + return true; + + case PGBENCH_NE: + setBoolValue(retval, ld != rd); + return true; + + case PGBENCH_LE: + setBoolValue(retval, ld <= rd); + return true; + + case PGBENCH_LT: + setBoolValue(retval, ld < rd); + return true; + + default: + /* cannot get here */ + Assert(0); + } + } + else /* we have integer operands, or % */ + { + int64 li, + ri, + res; + + if (!coerceToInt(lval, &li) || + !coerceToInt(rval, &ri)) + return false; + + switch (func) + { + case PGBENCH_ADD: + if (pg_add_s64_overflow(li, ri, &res)) + { + pg_log_error("bigint add out of range"); + return false; + } + setIntValue(retval, res); + return true; + + case PGBENCH_SUB: + if (pg_sub_s64_overflow(li, ri, &res)) + { + pg_log_error("bigint sub out of range"); + return false; + } + setIntValue(retval, res); + return true; + + case PGBENCH_MUL: + if (pg_mul_s64_overflow(li, ri, &res)) + { + pg_log_error("bigint mul out of range"); + return false; + } + setIntValue(retval, res); + return true; + + case PGBENCH_EQ: + setBoolValue(retval, li == ri); + return true; + + case PGBENCH_NE: + setBoolValue(retval, li != ri); + return true; + + case PGBENCH_LE: + setBoolValue(retval, li <= ri); + return true; + + case PGBENCH_LT: + setBoolValue(retval, li < ri); + return true; + + case PGBENCH_DIV: + case PGBENCH_MOD: + if (ri == 0) + { + 
pg_log_error("division by zero"); + return false; + } + /* special handling of -1 divisor */ + if (ri == -1) + { + if (func == PGBENCH_DIV) + { + /* overflow check (needed for INT64_MIN) */ + if (li == PG_INT64_MIN) + { + pg_log_error("bigint div out of range"); + return false; + } + else + setIntValue(retval, -li); + } + else + setIntValue(retval, 0); + return true; + } + /* else divisor is not -1 */ + if (func == PGBENCH_DIV) + setIntValue(retval, li / ri); + else /* func == PGBENCH_MOD */ + setIntValue(retval, li % ri); + + return true; + + default: + /* cannot get here */ + Assert(0); + } + } + + Assert(0); + return false; /* NOTREACHED */ + } + + /* integer bitwise operators */ + case PGBENCH_BITAND: + case PGBENCH_BITOR: + case PGBENCH_BITXOR: + case PGBENCH_LSHIFT: + case PGBENCH_RSHIFT: + { + int64 li, + ri; + + if (!coerceToInt(&vargs[0], &li) || !coerceToInt(&vargs[1], &ri)) + return false; + + if (func == PGBENCH_BITAND) + setIntValue(retval, li & ri); + else if (func == PGBENCH_BITOR) + setIntValue(retval, li | ri); + else if (func == PGBENCH_BITXOR) + setIntValue(retval, li ^ ri); + else if (func == PGBENCH_LSHIFT) + setIntValue(retval, li << ri); + else if (func == PGBENCH_RSHIFT) + setIntValue(retval, li >> ri); + else /* cannot get here */ + Assert(0); + + return true; + } + + /* logical operators */ + case PGBENCH_NOT: + { + bool b; + + if (!coerceToBool(&vargs[0], &b)) + return false; + + setBoolValue(retval, !b); + return true; + } + + /* no arguments */ + case PGBENCH_PI: + setDoubleValue(retval, M_PI); + return true; + + /* 1 overloaded argument */ + case PGBENCH_ABS: + { + PgBenchValue *varg = &vargs[0]; + + Assert(nargs == 1); + + if (varg->type == PGBT_INT) + { + int64 i = varg->u.ival; + + setIntValue(retval, i < 0 ? -i : i); + } + else + { + double d = varg->u.dval; + + Assert(varg->type == PGBT_DOUBLE); + setDoubleValue(retval, d < 0.0 ? -d : d); + } + + return true; + } + + case PGBENCH_DEBUG: + { + PgBenchValue *varg = &vargs[0]; + + Assert(nargs == 1); + + fprintf(stderr, "debug(script=%d,command=%d): ", + st->use_file, st->command + 1); + + if (varg->type == PGBT_NULL) + fprintf(stderr, "null\n"); + else if (varg->type == PGBT_BOOLEAN) + fprintf(stderr, "boolean %s\n", varg->u.bval ? 
"true" : "false"); + else if (varg->type == PGBT_INT) + fprintf(stderr, "int " INT64_FORMAT "\n", varg->u.ival); + else if (varg->type == PGBT_DOUBLE) + fprintf(stderr, "double %.*g\n", DBL_DIG, varg->u.dval); + else /* internal error, unexpected type */ + Assert(0); + + *retval = *varg; + + return true; + } + + /* 1 double argument */ + case PGBENCH_DOUBLE: + case PGBENCH_SQRT: + case PGBENCH_LN: + case PGBENCH_EXP: + { + double dval; + + Assert(nargs == 1); + + if (!coerceToDouble(&vargs[0], &dval)) + return false; + + if (func == PGBENCH_SQRT) + dval = sqrt(dval); + else if (func == PGBENCH_LN) + dval = log(dval); + else if (func == PGBENCH_EXP) + dval = exp(dval); + /* else is cast: do nothing */ + + setDoubleValue(retval, dval); + return true; + } + + /* 1 int argument */ + case PGBENCH_INT: + { + int64 ival; + + Assert(nargs == 1); + + if (!coerceToInt(&vargs[0], &ival)) + return false; + + setIntValue(retval, ival); + return true; + } + + /* variable number of arguments */ + case PGBENCH_LEAST: + case PGBENCH_GREATEST: + { + bool havedouble; + int i; + + Assert(nargs >= 1); + + /* need double result if any input is double */ + havedouble = false; + for (i = 0; i < nargs; i++) + { + if (vargs[i].type == PGBT_DOUBLE) + { + havedouble = true; + break; + } + } + if (havedouble) + { + double extremum; + + if (!coerceToDouble(&vargs[0], &extremum)) + return false; + for (i = 1; i < nargs; i++) + { + double dval; + + if (!coerceToDouble(&vargs[i], &dval)) + return false; + if (func == PGBENCH_LEAST) + extremum = Min(extremum, dval); + else + extremum = Max(extremum, dval); + } + setDoubleValue(retval, extremum); + } + else + { + int64 extremum; + + if (!coerceToInt(&vargs[0], &extremum)) + return false; + for (i = 1; i < nargs; i++) + { + int64 ival; + + if (!coerceToInt(&vargs[i], &ival)) + return false; + if (func == PGBENCH_LEAST) + extremum = Min(extremum, ival); + else + extremum = Max(extremum, ival); + } + setIntValue(retval, extremum); + } + return true; + } + + /* random functions */ + case PGBENCH_RANDOM: + case PGBENCH_RANDOM_EXPONENTIAL: + case PGBENCH_RANDOM_GAUSSIAN: + case PGBENCH_RANDOM_ZIPFIAN: + { + int64 imin, + imax, + delta; + + Assert(nargs >= 2); + + if (!coerceToInt(&vargs[0], &imin) || + !coerceToInt(&vargs[1], &imax)) + return false; + + /* check random range */ + if (unlikely(imin > imax)) + { + pg_log_error("empty range given to random"); + return false; + } + else if (unlikely(pg_sub_s64_overflow(imax, imin, &delta) || + pg_add_s64_overflow(delta, 1, &delta))) + { + /* prevent int overflows in random functions */ + pg_log_error("random range is too large"); + return false; + } + + if (func == PGBENCH_RANDOM) + { + Assert(nargs == 2); + setIntValue(retval, getrand(&st->cs_func_rs, imin, imax)); + } + else /* gaussian & exponential */ + { + double param; + + Assert(nargs == 3); + + if (!coerceToDouble(&vargs[2], ¶m)) + return false; + + if (func == PGBENCH_RANDOM_GAUSSIAN) + { + if (param < MIN_GAUSSIAN_PARAM) + { + pg_log_error("gaussian parameter must be at least %f (not %f)", + MIN_GAUSSIAN_PARAM, param); + return false; + } + + setIntValue(retval, + getGaussianRand(&st->cs_func_rs, + imin, imax, param)); + } + else if (func == PGBENCH_RANDOM_ZIPFIAN) + { + if (param < MIN_ZIPFIAN_PARAM || param > MAX_ZIPFIAN_PARAM) + { + pg_log_error("zipfian parameter must be in range [%.3f, %.0f] (not %f)", + MIN_ZIPFIAN_PARAM, MAX_ZIPFIAN_PARAM, param); + return false; + } + + setIntValue(retval, + getZipfianRand(&st->cs_func_rs, imin, imax, param)); + } + else /* 
exponential */ + { + if (param <= 0.0) + { + pg_log_error("exponential parameter must be greater than zero (not %f)", + param); + return false; + } + + setIntValue(retval, + getExponentialRand(&st->cs_func_rs, + imin, imax, param)); + } + } + + return true; + } + + case PGBENCH_POW: + { + PgBenchValue *lval = &vargs[0]; + PgBenchValue *rval = &vargs[1]; + double ld, + rd; + + Assert(nargs == 2); + + if (!coerceToDouble(lval, &ld) || + !coerceToDouble(rval, &rd)) + return false; + + setDoubleValue(retval, pow(ld, rd)); + + return true; + } + + case PGBENCH_IS: + { + Assert(nargs == 2); + + /* + * note: this simple implementation is more permissive than + * SQL + */ + setBoolValue(retval, + vargs[0].type == vargs[1].type && + vargs[0].u.bval == vargs[1].u.bval); + return true; + } + + /* hashing */ + case PGBENCH_HASH_FNV1A: + case PGBENCH_HASH_MURMUR2: + { + int64 val, + seed; + + Assert(nargs == 2); + + if (!coerceToInt(&vargs[0], &val) || + !coerceToInt(&vargs[1], &seed)) + return false; + + if (func == PGBENCH_HASH_MURMUR2) + setIntValue(retval, getHashMurmur2(val, seed)); + else if (func == PGBENCH_HASH_FNV1A) + setIntValue(retval, getHashFnv1a(val, seed)); + else + /* cannot get here */ + Assert(0); + + return true; + } + + case PGBENCH_PERMUTE: + { + int64 val, + size, + seed; + + Assert(nargs == 3); + + if (!coerceToInt(&vargs[0], &val) || + !coerceToInt(&vargs[1], &size) || + !coerceToInt(&vargs[2], &seed)) + return false; + + if (size <= 0) + { + pg_log_error("permute size parameter must be greater than zero"); + return false; + } + + setIntValue(retval, permute(val, size, seed)); + return true; + } + + default: + /* cannot get here */ + Assert(0); + /* dead code to avoid a compiler warning */ + return false; + } +} + +/* evaluate some function */ +static bool +evalFunc(CState *st, + PgBenchFunction func, PgBenchExprLink *args, PgBenchValue *retval) +{ + if (isLazyFunc(func)) + return evalLazyFunc(st, func, args, retval); + else + return evalStandardFunc(st, func, args, retval); +} + +/* + * Recursive evaluation of an expression in a pgbench script + * using the current state of variables. + * Returns whether the evaluation was ok, + * the value itself is returned through the retval pointer. 
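+ *
+ * For example, for a script line such as
+ *   \set aid random(1, 100000 * :scale)
+ * the integer literals arrive here as ENODE_CONSTANT, the :scale reference
+ * is resolved through lookupVariable()/makeVariableValue(), and both the
+ * multiplication and random() are dispatched through evalFunc().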
+ */ +static bool +evaluateExpr(CState *st, PgBenchExpr *expr, PgBenchValue *retval) +{ + switch (expr->etype) + { + case ENODE_CONSTANT: + { + *retval = expr->u.constant; + return true; + } + + case ENODE_VARIABLE: + { + Variable *var; + + if ((var = lookupVariable(&st->variables, expr->u.variable.varname)) == NULL) + { + pg_log_error("undefined variable \"%s\"", expr->u.variable.varname); + return false; + } + + if (!makeVariableValue(var)) + return false; + + *retval = var->value; + return true; + } + + case ENODE_FUNCTION: + return evalFunc(st, + expr->u.function.function, + expr->u.function.args, + retval); + + default: + /* internal error which should never occur */ + pg_fatal("unexpected enode type in evaluation: %d", expr->etype); + } +} + +/* + * Convert command name to meta-command enum identifier + */ +static MetaCommand +getMetaCommand(const char *cmd) +{ + MetaCommand mc; + + if (cmd == NULL) + mc = META_NONE; + else if (pg_strcasecmp(cmd, "set") == 0) + mc = META_SET; + else if (pg_strcasecmp(cmd, "setshell") == 0) + mc = META_SETSHELL; + else if (pg_strcasecmp(cmd, "shell") == 0) + mc = META_SHELL; + else if (pg_strcasecmp(cmd, "sleep") == 0) + mc = META_SLEEP; + else if (pg_strcasecmp(cmd, "if") == 0) + mc = META_IF; + else if (pg_strcasecmp(cmd, "elif") == 0) + mc = META_ELIF; + else if (pg_strcasecmp(cmd, "else") == 0) + mc = META_ELSE; + else if (pg_strcasecmp(cmd, "endif") == 0) + mc = META_ENDIF; + else if (pg_strcasecmp(cmd, "gset") == 0) + mc = META_GSET; + else if (pg_strcasecmp(cmd, "aset") == 0) + mc = META_ASET; + else if (pg_strcasecmp(cmd, "startpipeline") == 0) + mc = META_STARTPIPELINE; + else if (pg_strcasecmp(cmd, "endpipeline") == 0) + mc = META_ENDPIPELINE; + else + mc = META_NONE; + return mc; +} + +/* + * Run a shell command. The result is assigned to the variable if not NULL. + * Return true if succeeded, or false on error. + */ +static bool +runShellCommand(Variables *variables, char *variable, char **argv, int argc) +{ + char command[SHELL_COMMAND_SIZE]; + int i, + len = 0; + FILE *fp; + char res[64]; + char *endptr; + int retval; + + /*---------- + * Join arguments with whitespace separators. Arguments starting with + * exactly one colon are treated as variables: + * name - append a string "name" + * :var - append a variable named 'var' + * ::name - append a string ":name" + *---------- + */ + for (i = 0; i < argc; i++) + { + char *arg; + int arglen; + + if (argv[i][0] != ':') + { + arg = argv[i]; /* a string literal */ + } + else if (argv[i][1] == ':') + { + arg = argv[i] + 1; /* a string literal starting with colons */ + } + else if ((arg = getVariable(variables, argv[i] + 1)) == NULL) + { + pg_log_error("%s: undefined variable \"%s\"", argv[0], argv[i]); + return false; + } + + arglen = strlen(arg); + if (len + arglen + (i > 0 ? 1 : 0) >= SHELL_COMMAND_SIZE - 1) + { + pg_log_error("%s: shell command is too long", argv[0]); + return false; + } + + if (i > 0) + command[len++] = ' '; + memcpy(command + len, arg, arglen); + len += arglen; + } + + command[len] = '\0'; + + /* Fast path for non-assignment case */ + if (variable == NULL) + { + if (system(command)) + { + if (!timer_exceeded) + pg_log_error("%s: could not launch shell command", argv[0]); + return false; + } + return true; + } + + /* Execute the command with pipe and read the standard output. 
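+ * For example, a (hypothetical) "\setshell uid id -u" stores the command's
+ * first output line in :uid; that line must parse as an integer, otherwise
+ * the meta-command fails.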
*/ + if ((fp = popen(command, "r")) == NULL) + { + pg_log_error("%s: could not launch shell command", argv[0]); + return false; + } + if (fgets(res, sizeof(res), fp) == NULL) + { + if (!timer_exceeded) + pg_log_error("%s: could not read result of shell command", argv[0]); + (void) pclose(fp); + return false; + } + if (pclose(fp) < 0) + { + pg_log_error("%s: could not close shell command", argv[0]); + return false; + } + + /* Check whether the result is an integer and assign it to the variable */ + retval = (int) strtol(res, &endptr, 10); + while (*endptr != '\0' && isspace((unsigned char) *endptr)) + endptr++; + if (*res == '\0' || *endptr != '\0') + { + pg_log_error("%s: shell command must return an integer (not \"%s\")", argv[0], res); + return false; + } + if (!putVariableInt(variables, "setshell", variable, retval)) + return false; + + pg_log_debug("%s: shell parameter name: \"%s\", value: \"%s\"", argv[0], argv[1], res); + + return true; +} + +/* + * Report the abortion of the client when processing SQL commands. + */ +static void +commandFailed(CState *st, const char *cmd, const char *message) +{ + pg_log_error("client %d aborted in command %d (%s) of script %d; %s", + st->id, st->command, cmd, st->use_file, message); +} + +/* + * Report the error in the command while the script is executing. + */ +static void +commandError(CState *st, const char *message) +{ + Assert(sql_script[st->use_file].commands[st->command]->type == SQL_COMMAND); + pg_log_info("client %d got an error in command %d (SQL) of script %d; %s", + st->id, st->command, st->use_file, message); +} + +/* return a script number with a weighted choice. */ +static int +chooseScript(TState *thread) +{ + int i = 0; + int64 w; + + if (num_scripts == 1) + return 0; + + w = getrand(&thread->ts_choose_rs, 0, total_weight - 1); + do + { + w -= sql_script[i++].weight; + } while (w >= 0); + + return i - 1; +} + +/* + * Allocate space for CState->prepared: we need one boolean for each command + * of each script. + */ +static void +allocCStatePrepared(CState *st) +{ + Assert(st->prepared == NULL); + + st->prepared = pg_malloc(sizeof(bool *) * num_scripts); + for (int i = 0; i < num_scripts; i++) + { + ParsedScript *script = &sql_script[i]; + int numcmds; + + for (numcmds = 0; script->commands[numcmds] != NULL; numcmds++) + ; + st->prepared[i] = pg_malloc0(sizeof(bool) * numcmds); + } +} + +/* + * Prepare the SQL command from st->use_file at command_num. + */ +static void +prepareCommand(CState *st, int command_num) +{ + Command *command = sql_script[st->use_file].commands[command_num]; + + /* No prepare for non-SQL commands */ + if (command->type != SQL_COMMAND) + return; + + if (!st->prepared) + allocCStatePrepared(st); + + if (!st->prepared[st->use_file][command_num]) + { + PGresult *res; + + pg_log_debug("client %d preparing %s", st->id, command->prepname); + res = PQprepare(st->con, command->prepname, + command->argv[0], command->argc - 1, NULL); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + pg_log_error("%s", PQerrorMessage(st->con)); + PQclear(res); + st->prepared[st->use_file][command_num] = true; + } +} + +/* + * Prepare all the commands in the script that come after the \startpipeline + * that's at position st->command, and the first \endpipeline we find. + * + * This sets the ->prepared flag for each relevant command as well as the + * \startpipeline itself, but doesn't move the st->command counter. 
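+ *
+ * For example, under -M prepared, a script fragment such as
+ *   \startpipeline
+ *   SELECT :x;
+ *   SELECT :y;
+ *   \endpipeline
+ * has both SELECTs prepared here, before pipeline mode is entered.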
+ */ +static void +prepareCommandsInPipeline(CState *st) +{ + int j; + Command **commands = sql_script[st->use_file].commands; + + Assert(commands[st->command]->type == META_COMMAND && + commands[st->command]->meta == META_STARTPIPELINE); + + if (!st->prepared) + allocCStatePrepared(st); + + /* + * We set the 'prepared' flag on the \startpipeline itself to flag that we + * don't need to do this next time without calling prepareCommand(), even + * though we don't actually prepare this command. + */ + if (st->prepared[st->use_file][st->command]) + return; + + for (j = st->command + 1; commands[j] != NULL; j++) + { + if (commands[j]->type == META_COMMAND && + commands[j]->meta == META_ENDPIPELINE) + break; + + prepareCommand(st, j); + } + + st->prepared[st->use_file][st->command] = true; +} + +/* Send a SQL command, using the chosen querymode */ +static bool +sendCommand(CState *st, Command *command) +{ + int r; + + if (querymode == QUERY_SIMPLE) + { + char *sql; + + sql = pg_strdup(command->argv[0]); + sql = assignVariables(&st->variables, sql); + + pg_log_debug("client %d sending %s", st->id, sql); + r = PQsendQuery(st->con, sql); + free(sql); + } + else if (querymode == QUERY_EXTENDED) + { + const char *sql = command->argv[0]; + const char *params[MAX_ARGS]; + + getQueryParams(&st->variables, command, params); + + pg_log_debug("client %d sending %s", st->id, sql); + r = PQsendQueryParams(st->con, sql, command->argc - 1, + NULL, params, NULL, NULL, 0); + } + else if (querymode == QUERY_PREPARED) + { + const char *params[MAX_ARGS]; + + prepareCommand(st, st->command); + getQueryParams(&st->variables, command, params); + + pg_log_debug("client %d sending %s", st->id, command->prepname); + r = PQsendQueryPrepared(st->con, command->prepname, command->argc - 1, + params, NULL, NULL, 0); + } + else /* unknown sql mode */ + r = 0; + + if (r == 0) + { + pg_log_debug("client %d could not send %s", st->id, command->argv[0]); + return false; + } + else + return true; +} + +/* + * Get the error status from the error code. + */ +static EStatus +getSQLErrorStatus(const char *sqlState) +{ + if (sqlState != NULL) + { + if (strcmp(sqlState, ERRCODE_T_R_SERIALIZATION_FAILURE) == 0) + return ESTATUS_SERIALIZATION_ERROR; + else if (strcmp(sqlState, ERRCODE_T_R_DEADLOCK_DETECTED) == 0) + return ESTATUS_DEADLOCK_ERROR; + } + + return ESTATUS_OTHER_SQL_ERROR; +} + +/* + * Returns true if this type of error can be retried. + */ +static bool +canRetryError(EStatus estatus) +{ + return (estatus == ESTATUS_SERIALIZATION_ERROR || + estatus == ESTATUS_DEADLOCK_ERROR); +} + +/* + * Process query response from the backend. + * + * If varprefix is not NULL, it's the variable name prefix where to store + * the results of the *last* command (META_GSET) or *all* commands + * (META_ASET). + * + * Returns true if everything is A-OK, false if any error occurs. + */ +static bool +readCommandResponse(CState *st, MetaCommand meta, char *varprefix) +{ + PGresult *res; + PGresult *next_res; + int qrynum = 0; + + /* + * varprefix should be set only with \gset or \aset, and \endpipeline and + * SQL commands do not need it. 
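+ *
+ * For example, "SELECT 1 AS one \gset p_" arrives here with meta == META_GSET
+ * and varprefix "p_", and stores "1" into the variable p_one, while a plain
+ * SQL command arrives with META_NONE and a NULL varprefix.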
+ */ + Assert((meta == META_NONE && varprefix == NULL) || + ((meta == META_ENDPIPELINE) && varprefix == NULL) || + ((meta == META_GSET || meta == META_ASET) && varprefix != NULL)); + + res = PQgetResult(st->con); + + while (res != NULL) + { + bool is_last; + + /* peek at the next result to know whether the current is last */ + next_res = PQgetResult(st->con); + is_last = (next_res == NULL); + + switch (PQresultStatus(res)) + { + case PGRES_COMMAND_OK: /* non-SELECT commands */ + case PGRES_EMPTY_QUERY: /* may be used for testing no-op overhead */ + if (is_last && meta == META_GSET) + { + pg_log_error("client %d script %d command %d query %d: expected one row, got %d", + st->id, st->use_file, st->command, qrynum, 0); + st->estatus = ESTATUS_META_COMMAND_ERROR; + goto error; + } + break; + + case PGRES_TUPLES_OK: + if ((is_last && meta == META_GSET) || meta == META_ASET) + { + int ntuples = PQntuples(res); + + if (meta == META_GSET && ntuples != 1) + { + /* under \gset, report the error */ + pg_log_error("client %d script %d command %d query %d: expected one row, got %d", + st->id, st->use_file, st->command, qrynum, PQntuples(res)); + st->estatus = ESTATUS_META_COMMAND_ERROR; + goto error; + } + else if (meta == META_ASET && ntuples <= 0) + { + /* coldly skip empty result under \aset */ + break; + } + + /* store results into variables */ + for (int fld = 0; fld < PQnfields(res); fld++) + { + char *varname = PQfname(res, fld); + + /* allocate varname only if necessary, freed below */ + if (*varprefix != '\0') + varname = psprintf("%s%s", varprefix, varname); + + /* store last row result as a string */ + if (!putVariable(&st->variables, meta == META_ASET ? "aset" : "gset", varname, + PQgetvalue(res, ntuples - 1, fld))) + { + /* internal error */ + pg_log_error("client %d script %d command %d query %d: error storing into variable %s", + st->id, st->use_file, st->command, qrynum, varname); + st->estatus = ESTATUS_META_COMMAND_ERROR; + goto error; + } + + if (*varprefix != '\0') + pg_free(varname); + } + } + /* otherwise the result is simply thrown away by PQclear below */ + break; + + case PGRES_PIPELINE_SYNC: + pg_log_debug("client %d pipeline ending", st->id); + if (PQexitPipelineMode(st->con) != 1) + pg_log_error("client %d failed to exit pipeline mode: %s", st->id, + PQerrorMessage(st->con)); + break; + + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + st->estatus = getSQLErrorStatus(PQresultErrorField(res, + PG_DIAG_SQLSTATE)); + if (canRetryError(st->estatus)) + { + if (verbose_errors) + commandError(st, PQerrorMessage(st->con)); + goto error; + } + /* fall through */ + + default: + /* anything else is unexpected */ + pg_log_error("client %d script %d aborted in command %d query %d: %s", + st->id, st->use_file, st->command, qrynum, + PQerrorMessage(st->con)); + goto error; + } + + PQclear(res); + qrynum++; + res = next_res; + } + + if (qrynum == 0) + { + pg_log_error("client %d command %d: no results", st->id, st->command); + return false; + } + + return true; + +error: + PQclear(res); + PQclear(next_res); + do + { + res = PQgetResult(st->con); + PQclear(res); + } while (res); + + return false; +} + +/* + * Parse the argument to a \sleep command, and return the requested amount + * of delay, in microseconds. Returns true on success, false on error. 
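+ *
+ * For example, "\sleep 10 ms" yields 10000, "\sleep 2 s" and "\sleep 2" both
+ * yield 2000000, and "\sleep :think_time us" takes the microsecond count
+ * from the (hypothetical) variable :think_time.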
+ */ +static bool +evaluateSleep(Variables *variables, int argc, char **argv, int *usecs) +{ + char *var; + int usec; + + if (*argv[1] == ':') + { + if ((var = getVariable(variables, argv[1] + 1)) == NULL) + { + pg_log_error("%s: undefined variable \"%s\"", argv[0], argv[1] + 1); + return false; + } + + usec = atoi(var); + + /* Raise an error if the value of a variable is not a number */ + if (usec == 0 && !isdigit((unsigned char) *var)) + { + pg_log_error("%s: invalid sleep time \"%s\" for variable \"%s\"", + argv[0], var, argv[1] + 1); + return false; + } + } + else + usec = atoi(argv[1]); + + if (argc > 2) + { + if (pg_strcasecmp(argv[2], "ms") == 0) + usec *= 1000; + else if (pg_strcasecmp(argv[2], "s") == 0) + usec *= 1000000; + } + else + usec *= 1000000; + + *usecs = usec; + return true; +} + + +/* + * Returns true if the error can be retried. + */ +static bool +doRetry(CState *st, pg_time_usec_t *now) +{ + Assert(st->estatus != ESTATUS_NO_ERROR); + + /* We can only retry serialization or deadlock errors. */ + if (!canRetryError(st->estatus)) + return false; + + /* + * We must have at least one option to limit the retrying of transactions + * that got an error. + */ + Assert(max_tries || latency_limit || duration > 0); + + /* + * We cannot retry the error if we have reached the maximum number of + * tries. + */ + if (max_tries && st->tries >= max_tries) + return false; + + /* + * We cannot retry the error if we spent too much time on this + * transaction. + */ + if (latency_limit) + { + pg_time_now_lazy(now); + if (*now - st->txn_scheduled > latency_limit) + return false; + } + + /* + * We cannot retry the error if the benchmark duration is over. + */ + if (timer_exceeded) + return false; + + /* OK */ + return true; +} + +/* + * Read results and discard it until a sync point. + */ +static int +discardUntilSync(CState *st) +{ + /* send a sync */ + if (!PQpipelineSync(st->con)) + { + pg_log_error("client %d aborted: failed to send a pipeline sync", + st->id); + return 0; + } + + /* receive PGRES_PIPELINE_SYNC and null following it */ + for (;;) + { + PGresult *res = PQgetResult(st->con); + + if (PQresultStatus(res) == PGRES_PIPELINE_SYNC) + { + PQclear(res); + res = PQgetResult(st->con); + Assert(res == NULL); + break; + } + PQclear(res); + } + + /* exit pipeline */ + if (PQexitPipelineMode(st->con) != 1) + { + pg_log_error("client %d aborted: failed to exit pipeline mode for rolling back the failed transaction", + st->id); + return 0; + } + return 1; +} + +/* + * Get the transaction status at the end of a command especially for + * checking if we are in a (failed) transaction block. + */ +static TStatus +getTransactionStatus(PGconn *con) +{ + PGTransactionStatusType tx_status; + + tx_status = PQtransactionStatus(con); + switch (tx_status) + { + case PQTRANS_IDLE: + return TSTATUS_IDLE; + case PQTRANS_INTRANS: + case PQTRANS_INERROR: + return TSTATUS_IN_BLOCK; + case PQTRANS_UNKNOWN: + /* PQTRANS_UNKNOWN is expected given a broken connection */ + if (PQstatus(con) == CONNECTION_BAD) + return TSTATUS_CONN_ERROR; + /* fall through */ + case PQTRANS_ACTIVE: + default: + + /* + * We cannot find out whether we are in a transaction block or + * not. Internal error which should never occur. 
+ */ + pg_log_error("unexpected transaction status %d", tx_status); + return TSTATUS_OTHER_ERROR; + } + + /* not reached */ + Assert(false); + return TSTATUS_OTHER_ERROR; +} + +/* + * Print verbose messages of an error + */ +static void +printVerboseErrorMessages(CState *st, pg_time_usec_t *now, bool is_retry) +{ + static PQExpBuffer buf = NULL; + + if (buf == NULL) + buf = createPQExpBuffer(); + else + resetPQExpBuffer(buf); + + printfPQExpBuffer(buf, "client %d ", st->id); + appendPQExpBuffer(buf, "%s", + (is_retry ? + "repeats the transaction after the error" : + "ends the failed transaction")); + appendPQExpBuffer(buf, " (try %u", st->tries); + + /* Print max_tries if it is not unlimitted. */ + if (max_tries) + appendPQExpBuffer(buf, "/%u", max_tries); + + /* + * If the latency limit is used, print a percentage of the current + * transaction latency from the latency limit. + */ + if (latency_limit) + { + pg_time_now_lazy(now); + appendPQExpBuffer(buf, ", %.3f%% of the maximum time of tries was used", + (100.0 * (*now - st->txn_scheduled) / latency_limit)); + } + appendPQExpBuffer(buf, ")\n"); + + pg_log_info("%s", buf->data); +} + +/* + * Advance the state machine of a connection. + */ +static void +advanceConnectionState(TState *thread, CState *st, StatsData *agg) +{ + + /* + * gettimeofday() isn't free, so we get the current timestamp lazily the + * first time it's needed, and reuse the same value throughout this + * function after that. This also ensures that e.g. the calculated + * latency reported in the log file and in the totals are the same. Zero + * means "not set yet". Reset "now" when we execute shell commands or + * expressions, which might take a non-negligible amount of time, though. + */ + pg_time_usec_t now = 0; + + /* + * Loop in the state machine, until we have to wait for a result from the + * server or have to sleep for throttling or \sleep. + * + * Note: In the switch-statement below, 'break' will loop back here, + * meaning "continue in the state machine". Return is used to return to + * the caller, giving the thread the opportunity to advance another + * client. + */ + for (;;) + { + Command *command; + + switch (st->state) + { + /* Select transaction (script) to run. */ + case CSTATE_CHOOSE_SCRIPT: + st->use_file = chooseScript(thread); + Assert(conditional_stack_empty(st->cstack)); + + /* reset transaction variables to default values */ + st->estatus = ESTATUS_NO_ERROR; + st->tries = 1; + + pg_log_debug("client %d executing script \"%s\"", + st->id, sql_script[st->use_file].desc); + + /* + * If time is over, we're done; otherwise, get ready to start + * a new transaction, or to get throttled if that's requested. + */ + st->state = timer_exceeded ? CSTATE_FINISHED : + throttle_delay > 0 ? CSTATE_PREPARE_THROTTLE : CSTATE_START_TX; + break; + + /* Start new transaction (script) */ + case CSTATE_START_TX: + pg_time_now_lazy(&now); + + /* establish connection if needed, i.e. under --connect */ + if (st->con == NULL) + { + pg_time_usec_t start = now; + + if ((st->con = doConnect()) == NULL) + { + /* + * as the bench is already running, we do not abort + * the process + */ + pg_log_error("client %d aborted while establishing connection", st->id); + st->state = CSTATE_ABORTED; + break; + } + + /* reset now after connection */ + now = pg_time_now(); + + thread->conn_duration += now - start; + + /* Reset session-local state */ + pg_free(st->prepared); + st->prepared = NULL; + } + + /* + * It is the first try to run this transaction. 
Remember the + * random state: maybe it will get an error and we will need + * to run it again. + */ + st->random_state = st->cs_func_rs; + + /* record transaction start time */ + st->txn_begin = now; + + /* + * When not throttling, this is also the transaction's + * scheduled start time. + */ + if (!throttle_delay) + st->txn_scheduled = now; + + /* Begin with the first command */ + st->state = CSTATE_START_COMMAND; + st->command = 0; + break; + + /* + * Handle throttling once per transaction by sleeping. + */ + case CSTATE_PREPARE_THROTTLE: + + /* + * Generate a delay such that the series of delays will + * approximate a Poisson distribution centered on the + * throttle_delay time. + * + * If transactions are too slow or a given wait is shorter + * than a transaction, the next transaction will start right + * away. + */ + Assert(throttle_delay > 0); + + thread->throttle_trigger += + getPoissonRand(&thread->ts_throttle_rs, throttle_delay); + st->txn_scheduled = thread->throttle_trigger; + + /* + * If --latency-limit is used, and this slot is already late + * so that the transaction will miss the latency limit even if + * it completed immediately, skip this time slot and loop to + * reschedule. + */ + if (latency_limit) + { + pg_time_now_lazy(&now); + + if (thread->throttle_trigger < now - latency_limit) + { + processXactStats(thread, st, &now, true, agg); + + /* + * Finish client if -T or -t was exceeded. + * + * Stop counting skipped transactions under -T as soon + * as the timer is exceeded. Because otherwise it can + * take a very long time to count all of them + * especially when quite a lot of them happen with + * unrealistically high rate setting in -R, which + * would prevent pgbench from ending immediately. + * Because of this behavior, note that there is no + * guarantee that all skipped transactions are counted + * under -T though there is under -t. This is OK in + * practice because it's very unlikely to happen with + * realistic setting. + */ + if (timer_exceeded || (nxacts > 0 && st->cnt >= nxacts)) + st->state = CSTATE_FINISHED; + + /* Go back to top of loop with CSTATE_PREPARE_THROTTLE */ + break; + } + } + + /* + * stop client if next transaction is beyond pgbench end of + * execution; otherwise, throttle it. + */ + st->state = end_time > 0 && st->txn_scheduled > end_time ? + CSTATE_FINISHED : CSTATE_THROTTLE; + break; + + /* + * Wait until it's time to start next transaction. + */ + case CSTATE_THROTTLE: + pg_time_now_lazy(&now); + + if (now < st->txn_scheduled) + return; /* still sleeping, nothing to do here */ + + /* done sleeping, but don't start transaction if we're done */ + st->state = timer_exceeded ? 
CSTATE_FINISHED : CSTATE_START_TX; + break; + + /* + * Send a command to server (or execute a meta-command) + */ + case CSTATE_START_COMMAND: + command = sql_script[st->use_file].commands[st->command]; + + /* Transition to script end processing if done */ + if (command == NULL) + { + st->state = CSTATE_END_TX; + break; + } + + /* record begin time of next command, and initiate it */ + if (report_per_command) + { + pg_time_now_lazy(&now); + st->stmt_begin = now; + } + + /* Execute the command */ + if (command->type == SQL_COMMAND) + { + /* disallow \aset and \gset in pipeline mode */ + if (PQpipelineStatus(st->con) != PQ_PIPELINE_OFF) + { + if (command->meta == META_GSET) + { + commandFailed(st, "gset", "\\gset is not allowed in pipeline mode"); + st->state = CSTATE_ABORTED; + break; + } + else if (command->meta == META_ASET) + { + commandFailed(st, "aset", "\\aset is not allowed in pipeline mode"); + st->state = CSTATE_ABORTED; + break; + } + } + + if (!sendCommand(st, command)) + { + commandFailed(st, "SQL", "SQL command send failed"); + st->state = CSTATE_ABORTED; + } + else + { + /* Wait for results, unless in pipeline mode */ + if (PQpipelineStatus(st->con) == PQ_PIPELINE_OFF) + st->state = CSTATE_WAIT_RESULT; + else + st->state = CSTATE_END_COMMAND; + } + } + else if (command->type == META_COMMAND) + { + /*----- + * Possible state changes when executing meta commands: + * - on errors CSTATE_ABORTED + * - on sleep CSTATE_SLEEP + * - else CSTATE_END_COMMAND + */ + st->state = executeMetaCommand(st, &now); + if (st->state == CSTATE_ABORTED) + st->estatus = ESTATUS_META_COMMAND_ERROR; + } + + /* + * We're now waiting for an SQL command to complete, or + * finished processing a metacommand, or need to sleep, or + * something bad happened. + */ + Assert(st->state == CSTATE_WAIT_RESULT || + st->state == CSTATE_END_COMMAND || + st->state == CSTATE_SLEEP || + st->state == CSTATE_ABORTED); + break; + + /* + * non executed conditional branch + */ + case CSTATE_SKIP_COMMAND: + Assert(!conditional_active(st->cstack)); + /* quickly skip commands until something to do... 
*/ + while (true) + { + Command *command; + + command = sql_script[st->use_file].commands[st->command]; + + /* cannot reach end of script in that state */ + Assert(command != NULL); + + /* + * if this is conditional related, update conditional + * state + */ + if (command->type == META_COMMAND && + (command->meta == META_IF || + command->meta == META_ELIF || + command->meta == META_ELSE || + command->meta == META_ENDIF)) + { + switch (conditional_stack_peek(st->cstack)) + { + case IFSTATE_FALSE: + if (command->meta == META_IF || + command->meta == META_ELIF) + { + /* we must evaluate the condition */ + st->state = CSTATE_START_COMMAND; + } + else if (command->meta == META_ELSE) + { + /* we must execute next command */ + conditional_stack_poke(st->cstack, + IFSTATE_ELSE_TRUE); + st->state = CSTATE_START_COMMAND; + st->command++; + } + else if (command->meta == META_ENDIF) + { + Assert(!conditional_stack_empty(st->cstack)); + conditional_stack_pop(st->cstack); + if (conditional_active(st->cstack)) + st->state = CSTATE_START_COMMAND; + + /* + * else state remains in + * CSTATE_SKIP_COMMAND + */ + st->command++; + } + break; + + case IFSTATE_IGNORED: + case IFSTATE_ELSE_FALSE: + if (command->meta == META_IF) + conditional_stack_push(st->cstack, + IFSTATE_IGNORED); + else if (command->meta == META_ENDIF) + { + Assert(!conditional_stack_empty(st->cstack)); + conditional_stack_pop(st->cstack); + if (conditional_active(st->cstack)) + st->state = CSTATE_START_COMMAND; + } + /* could detect "else" & "elif" after "else" */ + st->command++; + break; + + case IFSTATE_NONE: + case IFSTATE_TRUE: + case IFSTATE_ELSE_TRUE: + default: + + /* + * inconsistent if inactive, unreachable dead + * code + */ + Assert(false); + } + } + else + { + /* skip and consider next */ + st->command++; + } + + if (st->state != CSTATE_SKIP_COMMAND) + /* out of quick skip command loop */ + break; + } + break; + + /* + * Wait for the current SQL command to complete + */ + case CSTATE_WAIT_RESULT: + pg_log_debug("client %d receiving", st->id); + + /* + * Only check for new network data if we processed all data + * fetched prior. Otherwise we end up doing a syscall for each + * individual pipelined query, which has a measurable + * performance impact. + */ + if (PQisBusy(st->con) && !PQconsumeInput(st->con)) + { + /* there's something wrong */ + commandFailed(st, "SQL", "perhaps the backend died while processing"); + st->state = CSTATE_ABORTED; + break; + } + if (PQisBusy(st->con)) + return; /* don't have the whole result yet */ + + /* store or discard the query results */ + if (readCommandResponse(st, + sql_script[st->use_file].commands[st->command]->meta, + sql_script[st->use_file].commands[st->command]->varprefix)) + { + /* + * outside of pipeline mode: stop reading results. + * pipeline mode: continue reading results until an + * end-of-pipeline response. + */ + if (PQpipelineStatus(st->con) != PQ_PIPELINE_ON) + st->state = CSTATE_END_COMMAND; + } + else if (canRetryError(st->estatus)) + st->state = CSTATE_ERROR; + else + st->state = CSTATE_ABORTED; + break; + + /* + * Wait until sleep is done. This state is entered after a + * \sleep metacommand. The behavior is similar to + * CSTATE_THROTTLE, but proceeds to CSTATE_START_COMMAND + * instead of CSTATE_START_TX. + */ + case CSTATE_SLEEP: + pg_time_now_lazy(&now); + if (now < st->sleep_until) + return; /* still sleeping, nothing to do here */ + /* Else done sleeping. 
*/ + st->state = CSTATE_END_COMMAND; + break; + + /* + * End of command: record stats and proceed to next command. + */ + case CSTATE_END_COMMAND: + + /* + * command completed: accumulate per-command execution times + * in thread-local data structure, if per-command latencies + * are requested. + */ + if (report_per_command) + { + Command *command; + + pg_time_now_lazy(&now); + + command = sql_script[st->use_file].commands[st->command]; + /* XXX could use a mutex here, but we choose not to */ + addToSimpleStats(&command->stats, + PG_TIME_GET_DOUBLE(now - st->stmt_begin)); + } + + /* Go ahead with next command, to be executed or skipped */ + st->command++; + st->state = conditional_active(st->cstack) ? + CSTATE_START_COMMAND : CSTATE_SKIP_COMMAND; + break; + + /* + * Clean up after an error. + */ + case CSTATE_ERROR: + { + TStatus tstatus; + + Assert(st->estatus != ESTATUS_NO_ERROR); + + /* Clear the conditional stack */ + conditional_stack_reset(st->cstack); + + /* Read and discard until a sync point in pipeline mode */ + if (PQpipelineStatus(st->con) != PQ_PIPELINE_OFF) + { + if (!discardUntilSync(st)) + { + st->state = CSTATE_ABORTED; + break; + } + } + + /* + * Check if we have a (failed) transaction block or not, + * and roll it back if any. + */ + tstatus = getTransactionStatus(st->con); + if (tstatus == TSTATUS_IN_BLOCK) + { + /* Try to rollback a (failed) transaction block. */ + if (!PQsendQuery(st->con, "ROLLBACK")) + { + pg_log_error("client %d aborted: failed to send sql command for rolling back the failed transaction", + st->id); + st->state = CSTATE_ABORTED; + } + else + st->state = CSTATE_WAIT_ROLLBACK_RESULT; + } + else if (tstatus == TSTATUS_IDLE) + { + /* + * If time is over, we're done; otherwise, check if we + * can retry the error. + */ + st->state = timer_exceeded ? CSTATE_FINISHED : + doRetry(st, &now) ? CSTATE_RETRY : CSTATE_FAILURE; + } + else + { + if (tstatus == TSTATUS_CONN_ERROR) + pg_log_error("perhaps the backend died while processing"); + + pg_log_error("client %d aborted while receiving the transaction status", st->id); + st->state = CSTATE_ABORTED; + } + break; + } + + /* + * Wait for the rollback command to complete + */ + case CSTATE_WAIT_ROLLBACK_RESULT: + { + PGresult *res; + + pg_log_debug("client %d receiving", st->id); + if (!PQconsumeInput(st->con)) + { + pg_log_error("client %d aborted while rolling back the transaction after an error; perhaps the backend died while processing", + st->id); + st->state = CSTATE_ABORTED; + break; + } + if (PQisBusy(st->con)) + return; /* don't have the whole result yet */ + + /* + * Read and discard the query result; + */ + res = PQgetResult(st->con); + switch (PQresultStatus(res)) + { + case PGRES_COMMAND_OK: + /* OK */ + PQclear(res); + /* null must be returned */ + res = PQgetResult(st->con); + Assert(res == NULL); + + /* + * If time is over, we're done; otherwise, check + * if we can retry the error. + */ + st->state = timer_exceeded ? CSTATE_FINISHED : + doRetry(st, &now) ? CSTATE_RETRY : CSTATE_FAILURE; + break; + default: + pg_log_error("client %d aborted while rolling back the transaction after an error; %s", + st->id, PQerrorMessage(st->con)); + PQclear(res); + st->state = CSTATE_ABORTED; + break; + } + break; + } + + /* + * Retry the transaction after an error. + */ + case CSTATE_RETRY: + command = sql_script[st->use_file].commands[st->command]; + + /* + * Inform that the transaction will be retried after the + * error. 
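+ * With --verbose-errors this produces a message roughly of the form
+ * "client 0 repeats the transaction after the error (try 2/10)"; see
+ * printVerboseErrorMessages() above.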
+ */ + if (verbose_errors) + printVerboseErrorMessages(st, &now, true); + + /* Count tries and retries */ + st->tries++; + command->retries++; + + /* + * Reset the random state as they were at the beginning of the + * transaction. + */ + st->cs_func_rs = st->random_state; + + /* Process the first transaction command. */ + st->command = 0; + st->estatus = ESTATUS_NO_ERROR; + st->state = CSTATE_START_COMMAND; + break; + + /* + * Record a failed transaction. + */ + case CSTATE_FAILURE: + command = sql_script[st->use_file].commands[st->command]; + + /* Accumulate the failure. */ + command->failures++; + + /* + * Inform that the failed transaction will not be retried. + */ + if (verbose_errors) + printVerboseErrorMessages(st, &now, false); + + /* End the failed transaction. */ + st->state = CSTATE_END_TX; + break; + + /* + * End of transaction (end of script, really). + */ + case CSTATE_END_TX: + { + TStatus tstatus; + + /* transaction finished: calculate latency and do log */ + processXactStats(thread, st, &now, false, agg); + + /* + * missing \endif... cannot happen if CheckConditional was + * okay + */ + Assert(conditional_stack_empty(st->cstack)); + + /* + * We must complete all the transaction blocks that were + * started in this script. + */ + tstatus = getTransactionStatus(st->con); + if (tstatus == TSTATUS_IN_BLOCK) + { + pg_log_error("client %d aborted: end of script reached without completing the last transaction", + st->id); + st->state = CSTATE_ABORTED; + break; + } + else if (tstatus != TSTATUS_IDLE) + { + if (tstatus == TSTATUS_CONN_ERROR) + pg_log_error("perhaps the backend died while processing"); + + pg_log_error("client %d aborted while receiving the transaction status", st->id); + st->state = CSTATE_ABORTED; + break; + } + + if (is_connect) + { + pg_time_usec_t start = now; + + pg_time_now_lazy(&start); + finishCon(st); + now = pg_time_now(); + thread->conn_duration += now - start; + } + + if ((st->cnt >= nxacts && duration <= 0) || timer_exceeded) + { + /* script completed */ + st->state = CSTATE_FINISHED; + break; + } + + /* next transaction (script) */ + st->state = CSTATE_CHOOSE_SCRIPT; + + /* + * Ensure that we always return on this point, so as to + * avoid an infinite loop if the script only contains meta + * commands. + */ + return; + } + + /* + * Final states. Close the connection if it's still open. + */ + case CSTATE_ABORTED: + case CSTATE_FINISHED: + + /* + * Don't measure the disconnection delays here even if in + * CSTATE_FINISHED and -C/--connect option is specified. + * Because in this case all the connections that this thread + * established are closed at the end of transactions and the + * disconnection delays should have already been measured at + * that moment. + * + * In CSTATE_ABORTED state, the measurement is no longer + * necessary because we cannot report complete results anyways + * in this case. + */ + finishCon(st); + return; + } + } +} + +/* + * Subroutine for advanceConnectionState -- initiate or execute the current + * meta command, and return the next state to set. + * + * *now is updated to the current time, unless the command is expected to + * take no time to execute. 
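+ *
+ * For example, \sleep returns CSTATE_SLEEP, \endpipeline returns
+ * CSTATE_WAIT_RESULT so that pending results are collected, a failed
+ * meta-command returns CSTATE_ABORTED, and everything else falls through
+ * to CSTATE_END_COMMAND.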
+ */ +static ConnectionStateEnum +executeMetaCommand(CState *st, pg_time_usec_t *now) +{ + Command *command = sql_script[st->use_file].commands[st->command]; + int argc; + char **argv; + + Assert(command != NULL && command->type == META_COMMAND); + + argc = command->argc; + argv = command->argv; + + if (unlikely(__pg_log_level <= PG_LOG_DEBUG)) + { + PQExpBufferData buf; + + initPQExpBuffer(&buf); + + printfPQExpBuffer(&buf, "client %d executing \\%s", st->id, argv[0]); + for (int i = 1; i < argc; i++) + appendPQExpBuffer(&buf, " %s", argv[i]); + + pg_log_debug("%s", buf.data); + + termPQExpBuffer(&buf); + } + + if (command->meta == META_SLEEP) + { + int usec; + + /* + * A \sleep doesn't execute anything, we just get the delay from the + * argument, and enter the CSTATE_SLEEP state. (The per-command + * latency will be recorded in CSTATE_SLEEP state, not here, after the + * delay has elapsed.) + */ + if (!evaluateSleep(&st->variables, argc, argv, &usec)) + { + commandFailed(st, "sleep", "execution of meta-command failed"); + return CSTATE_ABORTED; + } + + pg_time_now_lazy(now); + st->sleep_until = (*now) + usec; + return CSTATE_SLEEP; + } + else if (command->meta == META_SET) + { + PgBenchExpr *expr = command->expr; + PgBenchValue result; + + if (!evaluateExpr(st, expr, &result)) + { + commandFailed(st, argv[0], "evaluation of meta-command failed"); + return CSTATE_ABORTED; + } + + if (!putVariableValue(&st->variables, argv[0], argv[1], &result)) + { + commandFailed(st, "set", "assignment of meta-command failed"); + return CSTATE_ABORTED; + } + } + else if (command->meta == META_IF) + { + /* backslash commands with an expression to evaluate */ + PgBenchExpr *expr = command->expr; + PgBenchValue result; + bool cond; + + if (!evaluateExpr(st, expr, &result)) + { + commandFailed(st, argv[0], "evaluation of meta-command failed"); + return CSTATE_ABORTED; + } + + cond = valueTruth(&result); + conditional_stack_push(st->cstack, cond ? IFSTATE_TRUE : IFSTATE_FALSE); + } + else if (command->meta == META_ELIF) + { + /* backslash commands with an expression to evaluate */ + PgBenchExpr *expr = command->expr; + PgBenchValue result; + bool cond; + + if (conditional_stack_peek(st->cstack) == IFSTATE_TRUE) + { + /* elif after executed block, skip eval and wait for endif. */ + conditional_stack_poke(st->cstack, IFSTATE_IGNORED); + return CSTATE_END_COMMAND; + } + + if (!evaluateExpr(st, expr, &result)) + { + commandFailed(st, argv[0], "evaluation of meta-command failed"); + return CSTATE_ABORTED; + } + + cond = valueTruth(&result); + Assert(conditional_stack_peek(st->cstack) == IFSTATE_FALSE); + conditional_stack_poke(st->cstack, cond ? 
IFSTATE_TRUE : IFSTATE_FALSE); + } + else if (command->meta == META_ELSE) + { + switch (conditional_stack_peek(st->cstack)) + { + case IFSTATE_TRUE: + conditional_stack_poke(st->cstack, IFSTATE_ELSE_FALSE); + break; + case IFSTATE_FALSE: /* inconsistent if active */ + case IFSTATE_IGNORED: /* inconsistent if active */ + case IFSTATE_NONE: /* else without if */ + case IFSTATE_ELSE_TRUE: /* else after else */ + case IFSTATE_ELSE_FALSE: /* else after else */ + default: + /* dead code if conditional check is ok */ + Assert(false); + } + } + else if (command->meta == META_ENDIF) + { + Assert(!conditional_stack_empty(st->cstack)); + conditional_stack_pop(st->cstack); + } + else if (command->meta == META_SETSHELL) + { + if (!runShellCommand(&st->variables, argv[1], argv + 2, argc - 2)) + { + commandFailed(st, "setshell", "execution of meta-command failed"); + return CSTATE_ABORTED; + } + } + else if (command->meta == META_SHELL) + { + if (!runShellCommand(&st->variables, NULL, argv + 1, argc - 1)) + { + commandFailed(st, "shell", "execution of meta-command failed"); + return CSTATE_ABORTED; + } + } + else if (command->meta == META_STARTPIPELINE) + { + /* + * In pipeline mode, we use a workflow based on libpq pipeline + * functions. + */ + if (querymode == QUERY_SIMPLE) + { + commandFailed(st, "startpipeline", "cannot use pipeline mode with the simple query protocol"); + return CSTATE_ABORTED; + } + + /* + * If we're in prepared-query mode, we need to prepare all the + * commands that are inside the pipeline before we actually start the + * pipeline itself. This solves the problem that running BEGIN + * ISOLATION LEVEL SERIALIZABLE in a pipeline would fail due to a + * snapshot having been acquired by the prepare within the pipeline. + */ + if (querymode == QUERY_PREPARED) + prepareCommandsInPipeline(st); + + if (PQpipelineStatus(st->con) != PQ_PIPELINE_OFF) + { + commandFailed(st, "startpipeline", "already in pipeline mode"); + return CSTATE_ABORTED; + } + if (PQenterPipelineMode(st->con) == 0) + { + commandFailed(st, "startpipeline", "failed to enter pipeline mode"); + return CSTATE_ABORTED; + } + } + else if (command->meta == META_ENDPIPELINE) + { + if (PQpipelineStatus(st->con) != PQ_PIPELINE_ON) + { + commandFailed(st, "endpipeline", "not in pipeline mode"); + return CSTATE_ABORTED; + } + if (!PQpipelineSync(st->con)) + { + commandFailed(st, "endpipeline", "failed to send a pipeline sync"); + return CSTATE_ABORTED; + } + /* Now wait for the PGRES_PIPELINE_SYNC and exit pipeline mode there */ + /* collect pending results before getting out of pipeline mode */ + return CSTATE_WAIT_RESULT; + } + + /* + * executing the expression or shell command might have taken a + * non-negligible amount of time, so reset 'now' + */ + *now = 0; + + return CSTATE_END_COMMAND; +} + +/* + * Return the number fo failed transactions. + */ +static int64 +getFailures(const StatsData *stats) +{ + return (stats->serialization_failures + + stats->deadlock_failures); +} + +/* + * Return a string constant representing the result of a transaction + * that is not successfully processed. 
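+ *
+ * For example, a skipped transaction is reported as "skipped"; with
+ * failures_detailed (--failures-detailed) a serialization failure is
+ * reported as "serialization" and a deadlock as "deadlock", otherwise
+ * both are reported as "failed".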
+ */ +static const char * +getResultString(bool skipped, EStatus estatus) +{ + if (skipped) + return "skipped"; + else if (failures_detailed) + { + switch (estatus) + { + case ESTATUS_SERIALIZATION_ERROR: + return "serialization"; + case ESTATUS_DEADLOCK_ERROR: + return "deadlock"; + default: + /* internal error which should never occur */ + pg_fatal("unexpected error status: %d", estatus); + } + } + else + return "failed"; +} + +/* + * Print log entry after completing one transaction. + * + * We print Unix-epoch timestamps in the log, so that entries can be + * correlated against other logs. + * + * XXX We could obtain the time from the caller and just shift it here, to + * avoid the cost of an extra call to pg_time_now(). + */ +static void +doLog(TState *thread, CState *st, + StatsData *agg, bool skipped, double latency, double lag) +{ + FILE *logfile = thread->logfile; + pg_time_usec_t now = pg_time_now() + epoch_shift; + + Assert(use_log); + + /* + * Skip the log entry if sampling is enabled and this row doesn't belong + * to the random sample. + */ + if (sample_rate != 0.0 && + pg_prng_double(&thread->ts_sample_rs) > sample_rate) + return; + + /* should we aggregate the results or not? */ + if (agg_interval > 0) + { + pg_time_usec_t next; + + /* + * Loop until we reach the interval of the current moment, and print + * any empty intervals in between (this may happen with very low tps, + * e.g. --rate=0.1). + */ + + while ((next = agg->start_time + agg_interval * INT64CONST(1000000)) <= now) + { + double lag_sum = 0.0; + double lag_sum2 = 0.0; + double lag_min = 0.0; + double lag_max = 0.0; + int64 skipped = 0; + int64 serialization_failures = 0; + int64 deadlock_failures = 0; + int64 retried = 0; + int64 retries = 0; + + /* print aggregated report to logfile */ + fprintf(logfile, INT64_FORMAT " " INT64_FORMAT " %.0f %.0f %.0f %.0f", + agg->start_time / 1000000, /* seconds since Unix epoch */ + agg->cnt, + agg->latency.sum, + agg->latency.sum2, + agg->latency.min, + agg->latency.max); + + if (throttle_delay) + { + lag_sum = agg->lag.sum; + lag_sum2 = agg->lag.sum2; + lag_min = agg->lag.min; + lag_max = agg->lag.max; + } + fprintf(logfile, " %.0f %.0f %.0f %.0f", + lag_sum, + lag_sum2, + lag_min, + lag_max); + + if (latency_limit) + skipped = agg->skipped; + fprintf(logfile, " " INT64_FORMAT, skipped); + + if (max_tries != 1) + { + retried = agg->retried; + retries = agg->retries; + } + fprintf(logfile, " " INT64_FORMAT " " INT64_FORMAT, retried, retries); + + if (failures_detailed) + { + serialization_failures = agg->serialization_failures; + deadlock_failures = agg->deadlock_failures; + } + fprintf(logfile, " " INT64_FORMAT " " INT64_FORMAT, + serialization_failures, + deadlock_failures); + + fputc('\n', logfile); + + /* reset data and move to next interval */ + initStats(agg, next); + } + + /* accumulate the current transaction */ + accumStats(agg, skipped, latency, lag, st->estatus, st->tries); + } + else + { + /* no, print raw transactions */ + if (!skipped && st->estatus == ESTATUS_NO_ERROR) + fprintf(logfile, "%d " INT64_FORMAT " %.0f %d " INT64_FORMAT " " + INT64_FORMAT, + st->id, st->cnt, latency, st->use_file, + now / 1000000, now % 1000000); + else + fprintf(logfile, "%d " INT64_FORMAT " %s %d " INT64_FORMAT " " + INT64_FORMAT, + st->id, st->cnt, getResultString(skipped, st->estatus), + st->use_file, now / 1000000, now % 1000000); + + if (throttle_delay) + fprintf(logfile, " %.0f", lag); + if (max_tries != 1) + fprintf(logfile, " %u", st->tries - 1); + fputc('\n', logfile); + 
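+
+		/*
+		 * A raw (non-aggregated) log line thus looks like, with illustrative
+		 * values:
+		 *   0 1234 2238 0 1668446807 433849
+		 * i.e. client id, transaction count, latency in microseconds (or a
+		 * result string such as "failed"), script number, and the Unix-epoch
+		 * timestamp split into seconds and microseconds, optionally followed
+		 * by the schedule lag and the number of retries.
+		 */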
}
+}
+
+/*
+ * Accumulate and report statistics at end of a transaction.
+ *
+ * (This is also called when a transaction is late and thus skipped.
+ * Note that even skipped and failed transactions are counted in the CState
+ * "cnt" field.)
+ */
+static void
+processXactStats(TState *thread, CState *st, pg_time_usec_t *now,
+				 bool skipped, StatsData *agg)
+{
+	double		latency = 0.0,
+				lag = 0.0;
+	bool		detailed = progress || throttle_delay || latency_limit ||
+		use_log || per_script_stats;
+
+	if (detailed && !skipped && st->estatus == ESTATUS_NO_ERROR)
+	{
+		pg_time_now_lazy(now);
+
+		/* compute latency & lag */
+		latency = (*now) - st->txn_scheduled;
+		lag = st->txn_begin - st->txn_scheduled;
+	}
+
+	/* keep detailed thread stats */
+	accumStats(&thread->stats, skipped, latency, lag, st->estatus, st->tries);
+
+	/* count transactions over the latency limit, if needed */
+	if (latency_limit && latency > latency_limit)
+		thread->latency_late++;
+
+	/* client stat is just counting */
+	st->cnt++;
+
+	if (use_log)
+		doLog(thread, st, agg, skipped, latency, lag);
+
+	/* XXX could use a mutex here, but we choose not to */
+	if (per_script_stats)
+		accumStats(&sql_script[st->use_file].stats, skipped, latency, lag,
+				   st->estatus, st->tries);
+}
+
+
+/* discard connections */
+static void
+disconnect_all(CState *state, int length)
+{
+	int			i;
+
+	for (i = 0; i < length; i++)
+		finishCon(&state[i]);
+}
+
+/*
+ * Remove old pgbench tables, if any exist
+ */
+static void
+initDropTables(PGconn *con)
+{
+	fprintf(stderr, "dropping old tables...\n");
+
+	/*
+	 * We drop all the tables in one command, so that whether there are
+	 * foreign key dependencies or not doesn't matter.
+	 */
+	executeStatement(con, "drop table if exists "
+					 "pgbench_accounts, "
+					 "pgbench_branches, "
+					 "pgbench_history, "
+					 "pgbench_tellers");
+}
+
+/*
+ * Create "pgbench_accounts" partitions if needed.
+ *
+ * This is the largest table of the default pgbench TPC-B-like schema, and
+ * its size is known in advance, so it is the one we choose to partition.
+ */
+static void
+createPartitions(PGconn *con)
+{
+	PQExpBufferData query;
+
+	/* we must have at least one partition to create */
+	Assert(partitions > 0);
+
+	fprintf(stderr, "creating %d partitions...\n", partitions);
+
+	initPQExpBuffer(&query);
+
+	for (int p = 1; p <= partitions; p++)
+	{
+		if (partition_method == PART_RANGE)
+		{
+			int64		part_size = (naccounts * (int64) scale + partitions - 1) / partitions;
+
+			printfPQExpBuffer(&query,
+							  "create%s table pgbench_accounts_%d\n"
+							  " partition of pgbench_accounts\n"
+							  " for values from (",
+							  unlogged_tables ? " unlogged" : "", p);
+
+			/*
+			 * For RANGE, we use open-ended partitions at the beginning and
+			 * end to allow any valid value for the primary key.  Although the
+			 * actual minimum and maximum values could be derived from the
+			 * scale, open-ended bounds are more general and perform better.
+			 */
+			if (p == 1)
+				appendPQExpBufferStr(&query, "minvalue");
+			else
+				appendPQExpBuffer(&query, INT64_FORMAT, (p - 1) * part_size + 1);
+
+			appendPQExpBufferStr(&query, ") to (");
+
+			if (p < partitions)
+				appendPQExpBuffer(&query, INT64_FORMAT, p * part_size + 1);
+			else
+				appendPQExpBufferStr(&query, "maxvalue");
+
+			appendPQExpBufferChar(&query, ')');
+		}
+		else if (partition_method == PART_HASH)
+			printfPQExpBuffer(&query,
+							  "create%s table pgbench_accounts_%d\n"
+							  " partition of pgbench_accounts\n"
+							  " for values with (modulus %d, remainder %d)",
+							  unlogged_tables ?
" unlogged" : "", p, + partitions, p - 1); + else /* cannot get there */ + Assert(0); + + /* + * Per ddlinfo in initCreateTables, fillfactor is needed on table + * pgbench_accounts. + */ + appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); + + executeStatement(con, query.data); + } + + termPQExpBuffer(&query); +} + +/* + * Create pgbench's standard tables + */ +static void +initCreateTables(PGconn *con) +{ + /* + * Note: TPC-B requires at least 100 bytes per row, and the "filler" + * fields in these table declarations were intended to comply with that. + * The pgbench_accounts table complies with that because the "filler" + * column is set to blank-padded empty string. But for all other tables + * the columns default to NULL and so don't actually take any space. We + * could fix that by giving them non-null default values. However, that + * would completely break comparability of pgbench results with prior + * versions. Since pgbench has never pretended to be fully TPC-B compliant + * anyway, we stick with the historical behavior. + */ + struct ddlinfo + { + const char *table; /* table name */ + const char *smcols; /* column decls if accountIDs are 32 bits */ + const char *bigcols; /* column decls if accountIDs are 64 bits */ + int declare_fillfactor; + }; + static const struct ddlinfo DDLs[] = { + { + "pgbench_history", + "tid int,bid int,aid int,delta int,mtime timestamp,filler char(22)", + "tid int,bid int,aid bigint,delta int,mtime timestamp,filler char(22)", + 0 + }, + { + "pgbench_tellers", + "tid int not null,bid int,tbalance int,filler char(84)", + "tid int not null,bid int,tbalance int,filler char(84)", + 1 + }, + { + "pgbench_accounts", + "aid int not null,bid int,abalance int,filler char(84)", + "aid bigint not null,bid int,abalance int,filler char(84)", + 1 + }, + { + "pgbench_branches", + "bid int not null,bbalance int,filler char(88)", + "bid int not null,bbalance int,filler char(88)", + 1 + } + }; + int i; + PQExpBufferData query; + + fprintf(stderr, "creating tables...\n"); + + initPQExpBuffer(&query); + + for (i = 0; i < lengthof(DDLs); i++) + { + const struct ddlinfo *ddl = &DDLs[i]; + + /* Construct new create table statement. */ + printfPQExpBuffer(&query, "create%s table %s(%s)", + unlogged_tables ? " unlogged" : "", + ddl->table, + (scale >= SCALE_32BIT_THRESHOLD) ? 
ddl->bigcols : ddl->smcols); + + /* Partition pgbench_accounts table */ + if (partition_method != PART_NONE && strcmp(ddl->table, "pgbench_accounts") == 0) + appendPQExpBuffer(&query, + " partition by %s (aid)", PARTITION_METHOD[partition_method]); + else if (ddl->declare_fillfactor) + { + /* fillfactor is only expected on actual tables */ + appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); + } + + if (tablespace != NULL) + { + char *escape_tablespace; + + escape_tablespace = PQescapeIdentifier(con, tablespace, strlen(tablespace)); + appendPQExpBuffer(&query, " tablespace %s", escape_tablespace); + PQfreemem(escape_tablespace); + } + + executeStatement(con, query.data); + } + + termPQExpBuffer(&query); + + if (partition_method != PART_NONE) + createPartitions(con); +} + +/* + * Truncate away any old data, in one command in case there are foreign keys + */ +static void +initTruncateTables(PGconn *con) +{ + executeStatement(con, "truncate table " + "pgbench_accounts, " + "pgbench_branches, " + "pgbench_history, " + "pgbench_tellers"); +} + +/* + * Fill the standard tables with some data generated and sent from the client + */ +static void +initGenerateDataClientSide(PGconn *con) +{ + PQExpBufferData sql; + PGresult *res; + int i; + int64 k; + char *copy_statement; + + /* used to track elapsed time and estimate of the remaining time */ + pg_time_usec_t start; + int log_interval = 1; + + /* Stay on the same line if reporting to a terminal */ + char eol = isatty(fileno(stderr)) ? '\r' : '\n'; + + fprintf(stderr, "generating data (client-side)...\n"); + + /* + * we do all of this in one transaction to enable the backend's + * data-loading optimizations + */ + executeStatement(con, "begin"); + + /* truncate away any old data */ + initTruncateTables(con); + + initPQExpBuffer(&sql); + + /* + * fill branches, tellers, accounts in that order in case foreign keys + * already exist + */ + for (i = 0; i < nbranches * scale; i++) + { + /* "filler" column defaults to NULL */ + printfPQExpBuffer(&sql, + "insert into pgbench_branches(bid,bbalance) values(%d,0)", + i + 1); + executeStatement(con, sql.data); + } + + for (i = 0; i < ntellers * scale; i++) + { + /* "filler" column defaults to NULL */ + printfPQExpBuffer(&sql, + "insert into pgbench_tellers(tid,bid,tbalance) values (%d,%d,0)", + i + 1, i / ntellers + 1); + executeStatement(con, sql.data); + } + + /* + * accounts is big enough to be worth using COPY and tracking runtime + */ + + /* use COPY with FREEZE on v14 and later without partitioning */ + if (partitions == 0 && PQserverVersion(con) >= 140000) + copy_statement = "copy pgbench_accounts from stdin with (freeze on)"; + else + copy_statement = "copy pgbench_accounts from stdin"; + + res = PQexec(con, copy_statement); + + if (PQresultStatus(res) != PGRES_COPY_IN) + pg_fatal("unexpected copy in result: %s", PQerrorMessage(con)); + PQclear(res); + + start = pg_time_now(); + + for (k = 0; k < (int64) naccounts * scale; k++) + { + int64 j = k + 1; + + /* "filler" column defaults to blank padded empty string */ + printfPQExpBuffer(&sql, + INT64_FORMAT "\t" INT64_FORMAT "\t%d\t\n", + j, k / naccounts + 1, 0); + if (PQputline(con, sql.data)) + pg_fatal("PQputline failed"); + + if (CancelRequested) + break; + + /* + * If we want to stick with the original logging, print a message each + * 100k inserted rows. 
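+ * (Under --quiet, the else branch below instead prints a progress message
+ * roughly every LOG_STEP_SECONDS seconds.)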
+ */ + if ((!use_quiet) && (j % 100000 == 0)) + { + double elapsed_sec = PG_TIME_GET_DOUBLE(pg_time_now() - start); + double remaining_sec = ((double) scale * naccounts - j) * elapsed_sec / j; + + fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT " tuples (%d%%) done (elapsed %.2f s, remaining %.2f s)%c", + j, (int64) naccounts * scale, + (int) (((int64) j * 100) / (naccounts * (int64) scale)), + elapsed_sec, remaining_sec, eol); + } + /* let's not call the timing for each row, but only each 100 rows */ + else if (use_quiet && (j % 100 == 0)) + { + double elapsed_sec = PG_TIME_GET_DOUBLE(pg_time_now() - start); + double remaining_sec = ((double) scale * naccounts - j) * elapsed_sec / j; + + /* have we reached the next interval (or end)? */ + if ((j == scale * naccounts) || (elapsed_sec >= log_interval * LOG_STEP_SECONDS)) + { + fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT " tuples (%d%%) done (elapsed %.2f s, remaining %.2f s)%c", + j, (int64) naccounts * scale, + (int) (((int64) j * 100) / (naccounts * (int64) scale)), elapsed_sec, remaining_sec, eol); + + /* skip to the next interval */ + log_interval = (int) ceil(elapsed_sec / LOG_STEP_SECONDS); + } + } + } + + if (eol != '\n') + fputc('\n', stderr); /* Need to move to next line */ + + if (PQputline(con, "\\.\n")) + pg_fatal("very last PQputline failed"); + if (PQendcopy(con)) + pg_fatal("PQendcopy failed"); + + termPQExpBuffer(&sql); + + executeStatement(con, "commit"); +} + +/* + * Fill the standard tables with some data generated on the server + * + * As already the case with the client-side data generation, the filler + * column defaults to NULL in pgbench_branches and pgbench_tellers, + * and is a blank-padded string in pgbench_accounts. + */ +static void +initGenerateDataServerSide(PGconn *con) +{ + PQExpBufferData sql; + + fprintf(stderr, "generating data (server-side)...\n"); + + /* + * we do all of this in one transaction to enable the backend's + * data-loading optimizations + */ + executeStatement(con, "begin"); + + /* truncate away any old data */ + initTruncateTables(con); + + initPQExpBuffer(&sql); + + printfPQExpBuffer(&sql, + "insert into pgbench_branches(bid,bbalance) " + "select bid, 0 " + "from generate_series(1, %d) as bid", nbranches * scale); + executeStatement(con, sql.data); + + printfPQExpBuffer(&sql, + "insert into pgbench_tellers(tid,bid,tbalance) " + "select tid, (tid - 1) / %d + 1, 0 " + "from generate_series(1, %d) as tid", ntellers, ntellers * scale); + executeStatement(con, sql.data); + + printfPQExpBuffer(&sql, + "insert into pgbench_accounts(aid,bid,abalance,filler) " + "select aid, (aid - 1) / %d + 1, 0, '' " + "from generate_series(1, " INT64_FORMAT ") as aid", + naccounts, (int64) naccounts * scale); + executeStatement(con, sql.data); + + termPQExpBuffer(&sql); + + executeStatement(con, "commit"); +} + +/* + * Invoke vacuum on the standard tables + */ +static void +initVacuum(PGconn *con) +{ + fprintf(stderr, "vacuuming...\n"); + executeStatement(con, "vacuum analyze pgbench_branches"); + executeStatement(con, "vacuum analyze pgbench_tellers"); + executeStatement(con, "vacuum analyze pgbench_accounts"); + executeStatement(con, "vacuum analyze pgbench_history"); +} + +/* + * Create primary keys on the standard tables + */ +static void +initCreatePKeys(PGconn *con) +{ + static const char *const DDLINDEXes[] = { + "alter table pgbench_branches add primary key (bid)", + "alter table pgbench_tellers add primary key (tid)", + "alter table pgbench_accounts add primary key (aid)" + }; + int i; + 
PQExpBufferData query; + + fprintf(stderr, "creating primary keys...\n"); + initPQExpBuffer(&query); + + for (i = 0; i < lengthof(DDLINDEXes); i++) + { + resetPQExpBuffer(&query); + appendPQExpBufferStr(&query, DDLINDEXes[i]); + + if (index_tablespace != NULL) + { + char *escape_tablespace; + + escape_tablespace = PQescapeIdentifier(con, index_tablespace, + strlen(index_tablespace)); + appendPQExpBuffer(&query, " using index tablespace %s", escape_tablespace); + PQfreemem(escape_tablespace); + } + + executeStatement(con, query.data); + } + + termPQExpBuffer(&query); +} + +/* + * Create foreign key constraints between the standard tables + */ +static void +initCreateFKeys(PGconn *con) +{ + static const char *const DDLKEYs[] = { + "alter table pgbench_tellers add constraint pgbench_tellers_bid_fkey foreign key (bid) references pgbench_branches", + "alter table pgbench_accounts add constraint pgbench_accounts_bid_fkey foreign key (bid) references pgbench_branches", + "alter table pgbench_history add constraint pgbench_history_bid_fkey foreign key (bid) references pgbench_branches", + "alter table pgbench_history add constraint pgbench_history_tid_fkey foreign key (tid) references pgbench_tellers", + "alter table pgbench_history add constraint pgbench_history_aid_fkey foreign key (aid) references pgbench_accounts" + }; + int i; + + fprintf(stderr, "creating foreign keys...\n"); + for (i = 0; i < lengthof(DDLKEYs); i++) + { + executeStatement(con, DDLKEYs[i]); + } +} + +/* + * Validate an initialization-steps string + * + * (We could just leave it to runInitSteps() to fail if there are wrong + * characters, but since initialization can take awhile, it seems friendlier + * to check during option parsing.) + */ +static void +checkInitSteps(const char *initialize_steps) +{ + if (initialize_steps[0] == '\0') + pg_fatal("no initialization steps specified"); + + for (const char *step = initialize_steps; *step != '\0'; step++) + { + if (strchr(ALL_INIT_STEPS " ", *step) == NULL) + { + pg_log_error("unrecognized initialization step \"%c\"", *step); + pg_log_error_detail("Allowed step characters are: \"" ALL_INIT_STEPS "\"."); + exit(1); + } + } +} + +/* + * Invoke each initialization step in the given string + */ +static void +runInitSteps(const char *initialize_steps) +{ + PQExpBufferData stats; + PGconn *con; + const char *step; + double run_time = 0.0; + bool first = true; + + initPQExpBuffer(&stats); + + if ((con = doConnect()) == NULL) + pg_fatal("could not create connection for initialization"); + + setup_cancel_handler(NULL); + SetCancelConn(con); + + for (step = initialize_steps; *step != '\0'; step++) + { + char *op = NULL; + pg_time_usec_t start = pg_time_now(); + + switch (*step) + { + case 'd': + op = "drop tables"; + initDropTables(con); + break; + case 't': + op = "create tables"; + initCreateTables(con); + break; + case 'g': + op = "client-side generate"; + initGenerateDataClientSide(con); + break; + case 'G': + op = "server-side generate"; + initGenerateDataServerSide(con); + break; + case 'v': + op = "vacuum"; + initVacuum(con); + break; + case 'p': + op = "primary keys"; + initCreatePKeys(con); + break; + case 'f': + op = "foreign keys"; + initCreateFKeys(con); + break; + case ' ': + break; /* ignore */ + default: + pg_log_error("unrecognized initialization step \"%c\"", *step); + PQfinish(con); + exit(1); + } + + if (op != NULL) + { + double elapsed_sec = PG_TIME_GET_DOUBLE(pg_time_now() - start); + + if (!first) + appendPQExpBufferStr(&stats, ", "); + else + first = false; + + 
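+
+			/*
+			 * Accumulate a comma-separated summary such as
+			 * "drop tables 0.12 s, create tables 0.05 s" (times illustrative)
+			 * for the final "done in ..." message printed below.
+			 */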
appendPQExpBuffer(&stats, "%s %.2f s", op, elapsed_sec); + + run_time += elapsed_sec; + } + } + + fprintf(stderr, "done in %.2f s (%s).\n", run_time, stats.data); + ResetCancelConn(); + PQfinish(con); + termPQExpBuffer(&stats); +} + +/* + * Extract pgbench table information into global variables scale, + * partition_method and partitions. + */ +static void +GetTableInfo(PGconn *con, bool scale_given) +{ + PGresult *res; + + /* + * get the scaling factor that should be same as count(*) from + * pgbench_branches if this is not a custom query + */ + res = PQexec(con, "select count(*) from pgbench_branches"); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + char *sqlState = PQresultErrorField(res, PG_DIAG_SQLSTATE); + + pg_log_error("could not count number of branches: %s", PQerrorMessage(con)); + + if (sqlState && strcmp(sqlState, ERRCODE_UNDEFINED_TABLE) == 0) + pg_log_error_hint("Perhaps you need to do initialization (\"pgbench -i\") in database \"%s\".", + PQdb(con)); + + exit(1); + } + scale = atoi(PQgetvalue(res, 0, 0)); + if (scale < 0) + pg_fatal("invalid count(*) from pgbench_branches: \"%s\"", + PQgetvalue(res, 0, 0)); + PQclear(res); + + /* warn if we override user-given -s switch */ + if (scale_given) + pg_log_warning("scale option ignored, using count from pgbench_branches table (%d)", + scale); + + /* + * Get the partition information for the first "pgbench_accounts" table + * found in search_path. + * + * The result is empty if no "pgbench_accounts" is found. + * + * Otherwise, it always returns one row even if the table is not + * partitioned (in which case the partition strategy is NULL). + * + * The number of partitions can be 0 even for partitioned tables, if no + * partition is attached. + * + * We assume no partitioning on any failure, so as to avoid failing on an + * old version without "pg_partitioned_table". + */ + res = PQexec(con, + "select o.n, p.partstrat, pg_catalog.count(i.inhparent) " + "from pg_catalog.pg_class as c " + "join pg_catalog.pg_namespace as n on (n.oid = c.relnamespace) " + "cross join lateral (select pg_catalog.array_position(pg_catalog.current_schemas(true), n.nspname)) as o(n) " + "left join pg_catalog.pg_partitioned_table as p on (p.partrelid = c.oid) " + "left join pg_catalog.pg_inherits as i on (c.oid = i.inhparent) " + "where c.relname = 'pgbench_accounts' and o.n is not null " + "group by 1, 2 " + "order by 1 asc " + "limit 1"); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + /* probably an older version, coldly assume no partitioning */ + partition_method = PART_NONE; + partitions = 0; + } + else if (PQntuples(res) == 0) + { + /* + * This case is unlikely as pgbench already found "pgbench_branches" + * above to compute the scale. 
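+ * (It could still happen if, say, pgbench_accounts was dropped after
+ * initialization while pgbench_branches was left in place.)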
+ */
+		pg_log_error("no pgbench_accounts table found in search_path");
+		pg_log_error_hint("Perhaps you need to do initialization (\"pgbench -i\") in database \"%s\".", PQdb(con));
+		exit(1);
+	}
+	else						/* PQntuples(res) == 1 */
+	{
+		/* normal case, extract partition information */
+		if (PQgetisnull(res, 0, 1))
+			partition_method = PART_NONE;
+		else
+		{
+			char	   *ps = PQgetvalue(res, 0, 1);
+
+			/* column must be there */
+			Assert(ps != NULL);
+
+			if (strcmp(ps, "r") == 0)
+				partition_method = PART_RANGE;
+			else if (strcmp(ps, "h") == 0)
+				partition_method = PART_HASH;
+			else
+			{
+				/* possibly a newer version with new partition method */
+				pg_fatal("unexpected partition method: \"%s\"", ps);
+			}
+		}
+
+		partitions = atoi(PQgetvalue(res, 0, 2));
+	}
+
+	PQclear(res);
+}
+
+/*
+ * Replace :param with $n throughout the command's SQL text, which
+ * is a modifiable string in cmd->lines.
+ */
+static bool
+parseQuery(Command *cmd)
+{
+	char	   *sql,
+			   *p;
+
+	cmd->argc = 1;
+
+	p = sql = pg_strdup(cmd->lines.data);
+	while ((p = strchr(p, ':')) != NULL)
+	{
+		char		var[13];
+		char	   *name;
+		int			eaten;
+
+		name = parseVariable(p, &eaten);
+		if (name == NULL)
+		{
+			while (*p == ':')
+			{
+				p++;
+			}
+			continue;
+		}
+
+		/*
+		 * cmd->argv[0] is the SQL statement itself, so the max number of
+		 * arguments is one less than MAX_ARGS
+		 */
+		if (cmd->argc >= MAX_ARGS)
+		{
+			pg_log_error("statement has too many arguments (maximum is %d): %s",
+						 MAX_ARGS - 1, cmd->lines.data);
+			pg_free(name);
+			return false;
+		}
+
+		sprintf(var, "$%d", cmd->argc);
+		p = replaceVariable(&sql, p, eaten, var);
+
+		cmd->argv[cmd->argc] = name;
+		cmd->argc++;
+	}
+
+	Assert(cmd->argv[0] == NULL);
+	cmd->argv[0] = sql;
+	return true;
+}
+
+/*
+ * syntax error while parsing a script (in practice, while parsing a
+ * backslash command, because we don't detect syntax errors in SQL)
+ *
+ * source: source of script (filename or builtin-script ID)
+ * lineno: line number within script (count from 1)
+ * line: whole line of backslash command, if available
+ * command: backslash command name, if available
+ * msg: the actual error message
+ * more: optional extra message
+ * column: zero-based column number, or -1 if unknown
+ */
+void
+syntax_error(const char *source, int lineno,
+			 const char *line, const char *command,
+			 const char *msg, const char *more, int column)
+{
+	PQExpBufferData buf;
+
+	initPQExpBuffer(&buf);
+
+	printfPQExpBuffer(&buf, "%s:%d: %s", source, lineno, msg);
+	if (more != NULL)
+		appendPQExpBuffer(&buf, " (%s)", more);
+	if (column >= 0 && line == NULL)
+		appendPQExpBuffer(&buf, " at column %d", column + 1);
+	if (command != NULL)
+		appendPQExpBuffer(&buf, " in command \"%s\"", command);
+
+	pg_log_error("%s", buf.data);
+
+	termPQExpBuffer(&buf);
+
+	if (line != NULL)
+	{
+		fprintf(stderr, "%s\n", line);
+		if (column >= 0)
+			fprintf(stderr, "%*c error found here\n", column + 1, '^');
+	}
+
+	exit(1);
+}
+
+/*
+ * Return a pointer to the start of the SQL command, after skipping over
+ * whitespace and "--" comments.
+ * If the end of the string is reached, return NULL.
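+ * Note that only leading whitespace and "--" line comments are skipped
+ * here; C-style block comments are not treated specially.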
+ */ +static char * +skip_sql_comments(char *sql_command) +{ + char *p = sql_command; + + /* Skip any leading whitespace, as well as "--" style comments */ + for (;;) + { + if (isspace((unsigned char) *p)) + p++; + else if (strncmp(p, "--", 2) == 0) + { + p = strchr(p, '\n'); + if (p == NULL) + return NULL; + p++; + } + else + break; + } + + /* NULL if there's nothing but whitespace and comments */ + if (*p == '\0') + return NULL; + + return p; +} + +/* + * Parse a SQL command; return a Command struct, or NULL if it's a comment + * + * On entry, psqlscan.l has collected the command into "buf", so we don't + * really need to do much here except check for comments and set up a Command + * struct. + */ +static Command * +create_sql_command(PQExpBuffer buf, const char *source) +{ + Command *my_command; + char *p = skip_sql_comments(buf->data); + + if (p == NULL) + return NULL; + + /* Allocate and initialize Command structure */ + my_command = (Command *) pg_malloc(sizeof(Command)); + initPQExpBuffer(&my_command->lines); + appendPQExpBufferStr(&my_command->lines, p); + my_command->first_line = NULL; /* this is set later */ + my_command->type = SQL_COMMAND; + my_command->meta = META_NONE; + my_command->argc = 0; + my_command->retries = 0; + my_command->failures = 0; + memset(my_command->argv, 0, sizeof(my_command->argv)); + my_command->varprefix = NULL; /* allocated later, if needed */ + my_command->expr = NULL; + initSimpleStats(&my_command->stats); + my_command->prepname = NULL; /* set later, if needed */ + + return my_command; +} + +/* Free a Command structure and associated data */ +static void +free_command(Command *command) +{ + termPQExpBuffer(&command->lines); + if (command->first_line) + pg_free(command->first_line); + for (int i = 0; i < command->argc; i++) + pg_free(command->argv[i]); + if (command->varprefix) + pg_free(command->varprefix); + + /* + * It should also free expr recursively, but this is currently not needed + * as only gset commands (which do not have an expression) are freed. + */ + pg_free(command); +} + +/* + * Once an SQL command is fully parsed, possibly by accumulating several + * parts, complete other fields of the Command structure. + */ +static void +postprocess_sql_command(Command *my_command) +{ + char buffer[128]; + static int prepnum = 0; + + Assert(my_command->type == SQL_COMMAND); + + /* Save the first line for error display. */ + strlcpy(buffer, my_command->lines.data, sizeof(buffer)); + buffer[strcspn(buffer, "\n\r")] = '\0'; + my_command->first_line = pg_strdup(buffer); + + /* Parse query and generate prepared statement name, if necessary */ + switch (querymode) + { + case QUERY_SIMPLE: + my_command->argv[0] = my_command->lines.data; + my_command->argc++; + break; + case QUERY_PREPARED: + my_command->prepname = psprintf("P_%d", prepnum++); + /* fall through */ + case QUERY_EXTENDED: + if (!parseQuery(my_command)) + exit(1); + break; + default: + exit(1); + } +} + +/* + * Parse a backslash command; return a Command struct, or NULL if comment + * + * At call, we have scanned only the initial backslash. 
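+ * For example, given "\set aid random(1, 100000 * :scale)" we have consumed
+ * only the backslash; below we collect the word "set", then the variable
+ * name "aid", and hand the remainder to the expression parser.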
+ */ +static Command * +process_backslash_command(PsqlScanState sstate, const char *source) +{ + Command *my_command; + PQExpBufferData word_buf; + int word_offset; + int offsets[MAX_ARGS]; /* offsets of argument words */ + int start_offset; + int lineno; + int j; + + initPQExpBuffer(&word_buf); + + /* Remember location of the backslash */ + start_offset = expr_scanner_offset(sstate) - 1; + lineno = expr_scanner_get_lineno(sstate, start_offset); + + /* Collect first word of command */ + if (!expr_lex_one_word(sstate, &word_buf, &word_offset)) + { + termPQExpBuffer(&word_buf); + return NULL; + } + + /* Allocate and initialize Command structure */ + my_command = (Command *) pg_malloc0(sizeof(Command)); + my_command->type = META_COMMAND; + my_command->argc = 0; + initSimpleStats(&my_command->stats); + + /* Save first word (command name) */ + j = 0; + offsets[j] = word_offset; + my_command->argv[j++] = pg_strdup(word_buf.data); + my_command->argc++; + + /* ... and convert it to enum form */ + my_command->meta = getMetaCommand(my_command->argv[0]); + + if (my_command->meta == META_SET || + my_command->meta == META_IF || + my_command->meta == META_ELIF) + { + yyscan_t yyscanner; + + /* For \set, collect var name */ + if (my_command->meta == META_SET) + { + if (!expr_lex_one_word(sstate, &word_buf, &word_offset)) + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "missing argument", NULL, -1); + + offsets[j] = word_offset; + my_command->argv[j++] = pg_strdup(word_buf.data); + my_command->argc++; + } + + /* then for all parse the expression */ + yyscanner = expr_scanner_init(sstate, source, lineno, start_offset, + my_command->argv[0]); + + if (expr_yyparse(yyscanner) != 0) + { + /* dead code: exit done from syntax_error called by yyerror */ + exit(1); + } + + my_command->expr = expr_parse_result; + + /* Save line, trimming any trailing newline */ + my_command->first_line = + expr_scanner_get_substring(sstate, + start_offset, + expr_scanner_offset(sstate), + true); + + expr_scanner_finish(yyscanner); + + termPQExpBuffer(&word_buf); + + return my_command; + } + + /* For all other commands, collect remaining words. */ + while (expr_lex_one_word(sstate, &word_buf, &word_offset)) + { + /* + * my_command->argv[0] is the command itself, so the max number of + * arguments is one less than MAX_ARGS + */ + if (j >= MAX_ARGS) + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "too many arguments", NULL, -1); + + offsets[j] = word_offset; + my_command->argv[j++] = pg_strdup(word_buf.data); + my_command->argc++; + } + + /* Save line, trimming any trailing newline */ + my_command->first_line = + expr_scanner_get_substring(sstate, + start_offset, + expr_scanner_offset(sstate), + true); + + if (my_command->meta == META_SLEEP) + { + if (my_command->argc < 2) + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "missing argument", NULL, -1); + + if (my_command->argc > 3) + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "too many arguments", NULL, + offsets[3] - start_offset); + + /* + * Split argument into number and unit to allow "sleep 1ms" etc. We + * don't have to terminate the number argument with null because it + * will be parsed with atoi, which ignores trailing non-digit + * characters. 
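+ * For example, "\sleep 10ms" arrives here as the single word "10ms" and is
+ * split into the number "10" and the unit "ms".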
+ */ + if (my_command->argv[1][0] != ':') + { + char *c = my_command->argv[1]; + bool have_digit = false; + + /* Skip sign */ + if (*c == '+' || *c == '-') + c++; + + /* Require at least one digit */ + if (*c && isdigit((unsigned char) *c)) + have_digit = true; + + /* Eat all digits */ + while (*c && isdigit((unsigned char) *c)) + c++; + + if (*c) + { + if (my_command->argc == 2 && have_digit) + { + my_command->argv[2] = c; + offsets[2] = offsets[1] + (c - my_command->argv[1]); + my_command->argc = 3; + } + else + { + /* + * Raise an error if argument starts with non-digit + * character (after sign). + */ + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "invalid sleep time, must be an integer", + my_command->argv[1], offsets[1] - start_offset); + } + } + } + + if (my_command->argc == 3) + { + if (pg_strcasecmp(my_command->argv[2], "us") != 0 && + pg_strcasecmp(my_command->argv[2], "ms") != 0 && + pg_strcasecmp(my_command->argv[2], "s") != 0) + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "unrecognized time unit, must be us, ms or s", + my_command->argv[2], offsets[2] - start_offset); + } + } + else if (my_command->meta == META_SETSHELL) + { + if (my_command->argc < 3) + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "missing argument", NULL, -1); + } + else if (my_command->meta == META_SHELL) + { + if (my_command->argc < 2) + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "missing command", NULL, -1); + } + else if (my_command->meta == META_ELSE || my_command->meta == META_ENDIF || + my_command->meta == META_STARTPIPELINE || + my_command->meta == META_ENDPIPELINE) + { + if (my_command->argc != 1) + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "unexpected argument", NULL, -1); + } + else if (my_command->meta == META_GSET || my_command->meta == META_ASET) + { + if (my_command->argc > 2) + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "too many arguments", NULL, -1); + } + else + { + /* my_command->meta == META_NONE */ + syntax_error(source, lineno, my_command->first_line, my_command->argv[0], + "invalid command", NULL, -1); + } + + termPQExpBuffer(&word_buf); + + return my_command; +} + +static void +ConditionError(const char *desc, int cmdn, const char *msg) +{ + pg_fatal("condition error in script \"%s\" command %d: %s", + desc, cmdn, msg); +} + +/* + * Partial evaluation of conditionals before recording and running the script. 
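+ *
+ * This catches structural errors such as an \elif or \else without a
+ * matching \if, or an \if never closed by \endif, before the benchmark
+ * starts running.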
+ */ +static void +CheckConditional(const ParsedScript *ps) +{ + /* statically check conditional structure */ + ConditionalStack cs = conditional_stack_create(); + int i; + + for (i = 0; ps->commands[i] != NULL; i++) + { + Command *cmd = ps->commands[i]; + + if (cmd->type == META_COMMAND) + { + switch (cmd->meta) + { + case META_IF: + conditional_stack_push(cs, IFSTATE_FALSE); + break; + case META_ELIF: + if (conditional_stack_empty(cs)) + ConditionError(ps->desc, i + 1, "\\elif without matching \\if"); + if (conditional_stack_peek(cs) == IFSTATE_ELSE_FALSE) + ConditionError(ps->desc, i + 1, "\\elif after \\else"); + break; + case META_ELSE: + if (conditional_stack_empty(cs)) + ConditionError(ps->desc, i + 1, "\\else without matching \\if"); + if (conditional_stack_peek(cs) == IFSTATE_ELSE_FALSE) + ConditionError(ps->desc, i + 1, "\\else after \\else"); + conditional_stack_poke(cs, IFSTATE_ELSE_FALSE); + break; + case META_ENDIF: + if (!conditional_stack_pop(cs)) + ConditionError(ps->desc, i + 1, "\\endif without matching \\if"); + break; + default: + /* ignore anything else... */ + break; + } + } + } + if (!conditional_stack_empty(cs)) + ConditionError(ps->desc, i + 1, "\\if without matching \\endif"); + conditional_stack_destroy(cs); +} + +/* + * Parse a script (either the contents of a file, or a built-in script) + * and add it to the list of scripts. + */ +static void +ParseScript(const char *script, const char *desc, int weight) +{ + ParsedScript ps; + PsqlScanState sstate; + PQExpBufferData line_buf; + int alloc_num; + int index; + int lineno; + int start_offset; + +#define COMMANDS_ALLOC_NUM 128 + alloc_num = COMMANDS_ALLOC_NUM; + + /* Initialize all fields of ps */ + ps.desc = desc; + ps.weight = weight; + ps.commands = (Command **) pg_malloc(sizeof(Command *) * alloc_num); + initStats(&ps.stats, 0); + + /* Prepare to parse script */ + sstate = psql_scan_create(&pgbench_callbacks); + + /* + * Ideally, we'd scan scripts using the encoding and stdstrings settings + * we get from a DB connection. However, without major rearrangement of + * pgbench's argument parsing, we can't have a DB connection at the time + * we parse scripts. Using SQL_ASCII (encoding 0) should work well enough + * with any backend-safe encoding, though conceivably we could be fooled + * if a script file uses a client-only encoding. We also assume that + * stdstrings should be true, which is a bit riskier. + */ + psql_scan_setup(sstate, script, strlen(script), 0, true); + start_offset = expr_scanner_offset(sstate) - 1; + + initPQExpBuffer(&line_buf); + + index = 0; + + for (;;) + { + PsqlScanResult sr; + promptStatus_t prompt; + Command *command = NULL; + + resetPQExpBuffer(&line_buf); + lineno = expr_scanner_get_lineno(sstate, start_offset); + + sr = psql_scan(sstate, &line_buf, &prompt); + + /* If we collected a new SQL command, process that */ + command = create_sql_command(&line_buf, desc); + + /* store new command */ + if (command) + ps.commands[index++] = command; + + /* If we reached a backslash, process that */ + if (sr == PSCAN_BACKSLASH) + { + command = process_backslash_command(sstate, desc); + + if (command) + { + /* + * If this is gset or aset, merge into the preceding command. + * (We don't use a command slot in this case). 
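+ *
+ * For example, "SELECT 2 AS two, 3 AS three \gset p_" attaches the \gset to
+ * the preceding SELECT, so its results are later stored in the variables
+ * "p_two" and "p_three" instead of the command occupying a slot of its own.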
+ */ + if (command->meta == META_GSET || command->meta == META_ASET) + { + Command *cmd; + + if (index == 0) + syntax_error(desc, lineno, NULL, NULL, + "\\gset must follow an SQL command", + NULL, -1); + + cmd = ps.commands[index - 1]; + + if (cmd->type != SQL_COMMAND || + cmd->varprefix != NULL) + syntax_error(desc, lineno, NULL, NULL, + "\\gset must follow an SQL command", + cmd->first_line, -1); + + /* get variable prefix */ + if (command->argc <= 1 || command->argv[1][0] == '\0') + cmd->varprefix = pg_strdup(""); + else + cmd->varprefix = pg_strdup(command->argv[1]); + + /* update the sql command meta */ + cmd->meta = command->meta; + + /* cleanup unused command */ + free_command(command); + + continue; + } + + /* Attach any other backslash command as a new command */ + ps.commands[index++] = command; + } + } + + /* + * Since we used a command slot, allocate more if needed. Note we + * always allocate one more in order to accommodate the NULL + * terminator below. + */ + if (index >= alloc_num) + { + alloc_num += COMMANDS_ALLOC_NUM; + ps.commands = (Command **) + pg_realloc(ps.commands, sizeof(Command *) * alloc_num); + } + + /* Done if we reached EOF */ + if (sr == PSCAN_INCOMPLETE || sr == PSCAN_EOL) + break; + } + + ps.commands[index] = NULL; + + addScript(&ps); + + termPQExpBuffer(&line_buf); + psql_scan_finish(sstate); + psql_scan_destroy(sstate); +} + +/* + * Read the entire contents of file fd, and return it in a malloc'd buffer. + * + * The buffer will typically be larger than necessary, but we don't care + * in this program, because we'll free it as soon as we've parsed the script. + */ +static char * +read_file_contents(FILE *fd) +{ + char *buf; + size_t buflen = BUFSIZ; + size_t used = 0; + + buf = (char *) pg_malloc(buflen); + + for (;;) + { + size_t nread; + + nread = fread(buf + used, 1, BUFSIZ, fd); + used += nread; + /* If fread() read less than requested, must be EOF or error */ + if (nread < BUFSIZ) + break; + /* Enlarge buf so we can read some more */ + buflen += BUFSIZ; + buf = (char *) pg_realloc(buf, buflen); + } + /* There is surely room for a terminator */ + buf[used] = '\0'; + + return buf; +} + +/* + * Given a file name, read it and add its script to the list. + * "-" means to read stdin. + * NB: filename must be storage that won't disappear. + */ +static void +process_file(const char *filename, int weight) +{ + FILE *fd; + char *buf; + + /* Slurp the file contents into "buf" */ + if (strcmp(filename, "-") == 0) + fd = stdin; + else if ((fd = fopen(filename, "r")) == NULL) + pg_fatal("could not open file \"%s\": %m", filename); + + buf = read_file_contents(fd); + + if (ferror(fd)) + pg_fatal("could not read file \"%s\": %m", filename); + + if (fd != stdin) + fclose(fd); + + ParseScript(buf, filename, weight); + + free(buf); +} + +/* Parse the given builtin script and add it to the list. 
*/ +static void +process_builtin(const BuiltinScript *bi, int weight) +{ + ParseScript(bi->script, bi->desc, weight); +} + +/* show available builtin scripts */ +static void +listAvailableScripts(void) +{ + int i; + + fprintf(stderr, "Available builtin scripts:\n"); + for (i = 0; i < lengthof(builtin_script); i++) + fprintf(stderr, " %13s: %s\n", builtin_script[i].name, builtin_script[i].desc); + fprintf(stderr, "\n"); +} + +/* return builtin script "name" if unambiguous, fails if not found */ +static const BuiltinScript * +findBuiltin(const char *name) +{ + int i, + found = 0, + len = strlen(name); + const BuiltinScript *result = NULL; + + for (i = 0; i < lengthof(builtin_script); i++) + { + if (strncmp(builtin_script[i].name, name, len) == 0) + { + result = &builtin_script[i]; + found++; + } + } + + /* ok, unambiguous result */ + if (found == 1) + return result; + + /* error cases */ + if (found == 0) + pg_log_error("no builtin script found for name \"%s\"", name); + else /* found > 1 */ + pg_log_error("ambiguous builtin name: %d builtin scripts found for prefix \"%s\"", found, name); + + listAvailableScripts(); + exit(1); +} + +/* + * Determine the weight specification from a script option (-b, -f), if any, + * and return it as an integer (1 is returned if there's no weight). The + * script name is returned in *script as a malloc'd string. + */ +static int +parseScriptWeight(const char *option, char **script) +{ + char *sep; + int weight; + + if ((sep = strrchr(option, WSEP))) + { + int namelen = sep - option; + long wtmp; + char *badp; + + /* generate the script name */ + *script = pg_malloc(namelen + 1); + strncpy(*script, option, namelen); + (*script)[namelen] = '\0'; + + /* process digits of the weight spec */ + errno = 0; + wtmp = strtol(sep + 1, &badp, 10); + if (errno != 0 || badp == sep + 1 || *badp != '\0') + pg_fatal("invalid weight specification: %s", sep); + if (wtmp > INT_MAX || wtmp < 0) + pg_fatal("weight specification out of range (0 .. %d): %lld", + INT_MAX, (long long) wtmp); + weight = wtmp; + } + else + { + *script = pg_strdup(option); + weight = 1; + } + + return weight; +} + +/* append a script to the list of scripts to process */ +static void +addScript(const ParsedScript *script) +{ + if (script->commands == NULL || script->commands[0] == NULL) + pg_fatal("empty command list for script \"%s\"", script->desc); + + if (num_scripts >= MAX_SCRIPTS) + pg_fatal("at most %d SQL scripts are allowed", MAX_SCRIPTS); + + CheckConditional(script); + + sql_script[num_scripts] = *script; + num_scripts++; +} + +/* + * Print progress report. + * + * On entry, *last and *last_report contain the statistics and time of last + * progress report. On exit, they are updated with the new stats. + */ +static void +printProgressReport(TState *threads, int64 test_start, pg_time_usec_t now, + StatsData *last, int64 *last_report) +{ + /* generate and show report */ + pg_time_usec_t run = now - *last_report; + int64 cnt, + failures, + retried; + double tps, + total_run, + latency, + sqlat, + lag, + stdev; + char tbuf[315]; + StatsData cur; + + /* + * Add up the statistics of all threads. + * + * XXX: No locking. There is no guarantee that we get an atomic snapshot + * of the transaction count and latencies, so these figures can well be + * off by a small amount. The progress report's purpose is to give a + * quick overview of how the test is going, so that shouldn't matter too + * much. 
(If a read from a 64-bit integer is not atomic, you might get a + * "torn" read and completely bogus latencies though!) + */ + initStats(&cur, 0); + for (int i = 0; i < nthreads; i++) + { + mergeSimpleStats(&cur.latency, &threads[i].stats.latency); + mergeSimpleStats(&cur.lag, &threads[i].stats.lag); + cur.cnt += threads[i].stats.cnt; + cur.skipped += threads[i].stats.skipped; + cur.retries += threads[i].stats.retries; + cur.retried += threads[i].stats.retried; + cur.serialization_failures += + threads[i].stats.serialization_failures; + cur.deadlock_failures += threads[i].stats.deadlock_failures; + } + + /* we count only actually executed transactions */ + cnt = cur.cnt - last->cnt; + total_run = (now - test_start) / 1000000.0; + tps = 1000000.0 * cnt / run; + if (cnt > 0) + { + latency = 0.001 * (cur.latency.sum - last->latency.sum) / cnt; + sqlat = 1.0 * (cur.latency.sum2 - last->latency.sum2) / cnt; + stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency); + lag = 0.001 * (cur.lag.sum - last->lag.sum) / cnt; + } + else + { + latency = sqlat = stdev = lag = 0; + } + failures = getFailures(&cur) - getFailures(last); + retried = cur.retried - last->retried; + + if (progress_timestamp) + { + snprintf(tbuf, sizeof(tbuf), "%.3f s", + PG_TIME_GET_DOUBLE(now + epoch_shift)); + } + else + { + /* round seconds are expected, but the thread may be late */ + snprintf(tbuf, sizeof(tbuf), "%.1f s", total_run); + } + + fprintf(stderr, + "progress: %s, %.1f tps, lat %.3f ms stddev %.3f, " INT64_FORMAT " failed", + tbuf, tps, latency, stdev, failures); + + if (throttle_delay) + { + fprintf(stderr, ", lag %.3f ms", lag); + if (latency_limit) + fprintf(stderr, ", " INT64_FORMAT " skipped", + cur.skipped - last->skipped); + } + + /* it can be non-zero only if max_tries is not equal to one */ + if (max_tries != 1) + fprintf(stderr, + ", " INT64_FORMAT " retried, " INT64_FORMAT " retries", + retried, cur.retries - last->retries); + fprintf(stderr, "\n"); + + *last = cur; + *last_report = now; +} + +static void +printSimpleStats(const char *prefix, SimpleStats *ss) +{ + if (ss->count > 0) + { + double latency = ss->sum / ss->count; + double stddev = sqrt(ss->sum2 / ss->count - latency * latency); + + printf("%s average = %.3f ms\n", prefix, 0.001 * latency); + printf("%s stddev = %.3f ms\n", prefix, 0.001 * stddev); + } +} + +/* print version banner */ +static void +printVersion(PGconn *con) +{ + int server_ver = PQserverVersion(con); + int client_ver = PG_VERSION_NUM; + + if (server_ver != client_ver) + { + const char *server_version; + char sverbuf[32]; + + /* Try to get full text form, might include "devel" etc */ + server_version = PQparameterStatus(con, "server_version"); + /* Otherwise fall back on server_ver */ + if (!server_version) + { + formatPGVersionNumber(server_ver, true, + sverbuf, sizeof(sverbuf)); + server_version = sverbuf; + } + + printf(_("%s (%s, server %s)\n"), + "pgbench", PG_VERSION, server_version); + } + /* For version match, only print pgbench version */ + else + printf("%s (%s)\n", "pgbench", PG_VERSION); + fflush(stdout); +} + +/* print out results */ +static void +printResults(StatsData *total, + pg_time_usec_t total_duration, /* benchmarking time */ + pg_time_usec_t conn_total_duration, /* is_connect */ + pg_time_usec_t conn_elapsed_duration, /* !is_connect */ + int64 latency_late) +{ + /* tps is about actually executed transactions during benchmarking */ + int64 failures = getFailures(total); + int64 total_cnt = total->cnt + total->skipped + failures; + double 
bench_duration = PG_TIME_GET_DOUBLE(total_duration); + double tps = total->cnt / bench_duration; + + /* Report test parameters. */ + printf("transaction type: %s\n", + num_scripts == 1 ? sql_script[0].desc : "multiple scripts"); + printf("scaling factor: %d\n", scale); + /* only print partitioning information if some partitioning was detected */ + if (partition_method != PART_NONE) + printf("partition method: %s\npartitions: %d\n", + PARTITION_METHOD[partition_method], partitions); + printf("query mode: %s\n", QUERYMODE[querymode]); + printf("number of clients: %d\n", nclients); + printf("number of threads: %d\n", nthreads); + + if (max_tries) + printf("maximum number of tries: %u\n", max_tries); + + if (duration <= 0) + { + printf("number of transactions per client: %d\n", nxacts); + printf("number of transactions actually processed: " INT64_FORMAT "/%d\n", + total->cnt, nxacts * nclients); + } + else + { + printf("duration: %d s\n", duration); + printf("number of transactions actually processed: " INT64_FORMAT "\n", + total->cnt); + } + + printf("number of failed transactions: " INT64_FORMAT " (%.3f%%)\n", + failures, 100.0 * failures / total_cnt); + + if (failures_detailed) + { + printf("number of serialization failures: " INT64_FORMAT " (%.3f%%)\n", + total->serialization_failures, + 100.0 * total->serialization_failures / total_cnt); + printf("number of deadlock failures: " INT64_FORMAT " (%.3f%%)\n", + total->deadlock_failures, + 100.0 * total->deadlock_failures / total_cnt); + } + + /* it can be non-zero only if max_tries is not equal to one */ + if (max_tries != 1) + { + printf("number of transactions retried: " INT64_FORMAT " (%.3f%%)\n", + total->retried, 100.0 * total->retried / total_cnt); + printf("total number of retries: " INT64_FORMAT "\n", total->retries); + } + + /* Remaining stats are nonsensical if we failed to execute any xacts */ + if (total->cnt + total->skipped <= 0) + return; + + if (throttle_delay && latency_limit) + printf("number of transactions skipped: " INT64_FORMAT " (%.3f%%)\n", + total->skipped, 100.0 * total->skipped / total_cnt); + + if (latency_limit) + printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT "/" INT64_FORMAT " (%.3f%%)\n", + latency_limit / 1000.0, latency_late, total->cnt, + (total->cnt > 0) ? 100.0 * latency_late / total->cnt : 0.0); + + if (throttle_delay || progress || latency_limit) + printSimpleStats("latency", &total->latency); + else + { + /* no measurement, show average latency computed from run time */ + printf("latency average = %.3f ms%s\n", + 0.001 * total_duration * nclients / total_cnt, + failures > 0 ? " (including failures)" : ""); + } + + if (throttle_delay) + { + /* + * Report average transaction lag under rate limit throttling. This + * is the delay between scheduled and actual start times for the + * transaction. The measured lag may be caused by thread/client load, + * the database load, or the Poisson throttling process. + */ + printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n", + 0.001 * total->lag.sum / total->cnt, 0.001 * total->lag.max); + } + + /* + * Under -C/--connect, each transaction incurs a significant connection + * cost, it would not make much sense to ignore it in tps, and it would + * not be tps anyway. + * + * Otherwise connections are made just once at the beginning of the run + * and should not impact performance but for very short run, so they are + * (right)fully ignored in tps. 
+ */ + if (is_connect) + { + printf("average connection time = %.3f ms\n", 0.001 * conn_total_duration / (total->cnt + failures)); + printf("tps = %f (including reconnection times)\n", tps); + } + else + { + printf("initial connection time = %.3f ms\n", 0.001 * conn_elapsed_duration); + printf("tps = %f (without initial connection time)\n", tps); + } + + /* Report per-script/command statistics */ + if (per_script_stats || report_per_command) + { + int i; + + for (i = 0; i < num_scripts; i++) + { + if (per_script_stats) + { + StatsData *sstats = &sql_script[i].stats; + int64 script_failures = getFailures(sstats); + int64 script_total_cnt = + sstats->cnt + sstats->skipped + script_failures; + + printf("SQL script %d: %s\n" + " - weight: %d (targets %.1f%% of total)\n" + " - " INT64_FORMAT " transactions (%.1f%% of total, tps = %f)\n", + i + 1, sql_script[i].desc, + sql_script[i].weight, + 100.0 * sql_script[i].weight / total_weight, + sstats->cnt, + 100.0 * sstats->cnt / total->cnt, + sstats->cnt / bench_duration); + + printf(" - number of failed transactions: " INT64_FORMAT " (%.3f%%)\n", + script_failures, + 100.0 * script_failures / script_total_cnt); + + if (failures_detailed) + { + printf(" - number of serialization failures: " INT64_FORMAT " (%.3f%%)\n", + sstats->serialization_failures, + (100.0 * sstats->serialization_failures / + script_total_cnt)); + printf(" - number of deadlock failures: " INT64_FORMAT " (%.3f%%)\n", + sstats->deadlock_failures, + (100.0 * sstats->deadlock_failures / + script_total_cnt)); + } + + /* it can be non-zero only if max_tries is not equal to one */ + if (max_tries != 1) + { + printf(" - number of transactions retried: " INT64_FORMAT " (%.3f%%)\n", + sstats->retried, + 100.0 * sstats->retried / script_total_cnt); + printf(" - total number of retries: " INT64_FORMAT "\n", + sstats->retries); + } + + if (throttle_delay && latency_limit && script_total_cnt > 0) + printf(" - number of transactions skipped: " INT64_FORMAT " (%.3f%%)\n", + sstats->skipped, + 100.0 * sstats->skipped / script_total_cnt); + + printSimpleStats(" - latency", &sstats->latency); + } + + /* + * Report per-command statistics: latencies, retries after errors, + * failures (errors without retrying). + */ + if (report_per_command) + { + Command **commands; + + printf("%sstatement latencies in milliseconds%s:\n", + per_script_stats ? " - " : "", + (max_tries == 1 ? + " and failures" : + ", failures and retries")); + + for (commands = sql_script[i].commands; + *commands != NULL; + commands++) + { + SimpleStats *cstats = &(*commands)->stats; + + if (max_tries == 1) + printf(" %11.3f %10" INT64_MODIFIER "d %s\n", + (cstats->count > 0) ? + 1000.0 * cstats->sum / cstats->count : 0.0, + (*commands)->failures, + (*commands)->first_line); + else + printf(" %11.3f %10" INT64_MODIFIER "d %10" INT64_MODIFIER "d %s\n", + (cstats->count > 0) ? + 1000.0 * cstats->sum / cstats->count : 0.0, + (*commands)->failures, + (*commands)->retries, + (*commands)->first_line); + } + } + } + } +} + +/* + * Set up a random seed according to seed parameter (NULL means default), + * and initialize base_random_sequence for use in initializing other sequences. 
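+ *
+ * The seed may be given as "time" (the default), "rand", or an unsigned
+ * integer, e.g. --random-seed=12345 or PGBENCH_RANDOM_SEED=12345.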
+ */ +static bool +set_random_seed(const char *seed) +{ + uint64 iseed; + + if (seed == NULL || strcmp(seed, "time") == 0) + { + /* rely on current time */ + iseed = pg_time_now(); + } + else if (strcmp(seed, "rand") == 0) + { + /* use some "strong" random source */ + if (!pg_strong_random(&iseed, sizeof(iseed))) + { + pg_log_error("could not generate random seed"); + return false; + } + } + else + { + /* parse unsigned-int seed value */ + unsigned long ulseed; + char garbage; + + /* Don't try to use UINT64_FORMAT here; it might not work for sscanf */ + if (sscanf(seed, "%lu%c", &ulseed, &garbage) != 1) + { + pg_log_error("unrecognized random seed option \"%s\"", seed); + pg_log_error_detail("Expecting an unsigned integer, \"time\" or \"rand\"."); + return false; + } + iseed = (uint64) ulseed; + } + + if (seed != NULL) + pg_log_info("setting random seed to %llu", (unsigned long long) iseed); + + random_seed = iseed; + + /* Initialize base_random_sequence using seed */ + pg_prng_seed(&base_random_sequence, (uint64) iseed); + + return true; +} + +int +main(int argc, char **argv) +{ + static struct option long_options[] = { + /* systematic long/short named options */ + {"builtin", required_argument, NULL, 'b'}, + {"client", required_argument, NULL, 'c'}, + {"connect", no_argument, NULL, 'C'}, + {"debug", no_argument, NULL, 'd'}, + {"define", required_argument, NULL, 'D'}, + {"file", required_argument, NULL, 'f'}, + {"fillfactor", required_argument, NULL, 'F'}, + {"host", required_argument, NULL, 'h'}, + {"initialize", no_argument, NULL, 'i'}, + {"init-steps", required_argument, NULL, 'I'}, + {"jobs", required_argument, NULL, 'j'}, + {"log", no_argument, NULL, 'l'}, + {"latency-limit", required_argument, NULL, 'L'}, + {"no-vacuum", no_argument, NULL, 'n'}, + {"port", required_argument, NULL, 'p'}, + {"progress", required_argument, NULL, 'P'}, + {"protocol", required_argument, NULL, 'M'}, + {"quiet", no_argument, NULL, 'q'}, + {"report-per-command", no_argument, NULL, 'r'}, + {"rate", required_argument, NULL, 'R'}, + {"scale", required_argument, NULL, 's'}, + {"select-only", no_argument, NULL, 'S'}, + {"skip-some-updates", no_argument, NULL, 'N'}, + {"time", required_argument, NULL, 'T'}, + {"transactions", required_argument, NULL, 't'}, + {"username", required_argument, NULL, 'U'}, + {"vacuum-all", no_argument, NULL, 'v'}, + /* long-named only options */ + {"unlogged-tables", no_argument, NULL, 1}, + {"tablespace", required_argument, NULL, 2}, + {"index-tablespace", required_argument, NULL, 3}, + {"sampling-rate", required_argument, NULL, 4}, + {"aggregate-interval", required_argument, NULL, 5}, + {"progress-timestamp", no_argument, NULL, 6}, + {"log-prefix", required_argument, NULL, 7}, + {"foreign-keys", no_argument, NULL, 8}, + {"random-seed", required_argument, NULL, 9}, + {"show-script", required_argument, NULL, 10}, + {"partitions", required_argument, NULL, 11}, + {"partition-method", required_argument, NULL, 12}, + {"failures-detailed", no_argument, NULL, 13}, + {"max-tries", required_argument, NULL, 14}, + {"verbose-errors", no_argument, NULL, 15}, + {NULL, 0, NULL, 0} + }; + + int c; + bool is_init_mode = false; /* initialize mode? */ + char *initialize_steps = NULL; + bool foreign_keys = false; + bool is_no_vacuum = false; + bool do_vacuum_accounts = false; /* vacuum accounts table? 
*/ + int optindex; + bool scale_given = false; + + bool benchmarking_option_set = false; + bool initialization_option_set = false; + bool internal_script_used = false; + + CState *state; /* status of clients */ + TState *threads; /* array of thread */ + + pg_time_usec_t + start_time, /* start up time */ + bench_start = 0, /* first recorded benchmarking time */ + conn_total_duration; /* cumulated connection time in + * threads */ + int64 latency_late = 0; + StatsData stats; + int weight; + + int i; + int nclients_dealt; + +#ifdef HAVE_GETRLIMIT + struct rlimit rlim; +#endif + + PGconn *con; + char *env; + + int exit_code = 0; + struct timeval tv; + + /* + * Record difference between Unix time and instr_time time. We'll use + * this for logging and aggregation. + */ + gettimeofday(&tv, NULL); + epoch_shift = tv.tv_sec * INT64CONST(1000000) + tv.tv_usec - pg_time_now(); + + pg_logging_init(argv[0]); + progname = get_progname(argv[0]); + + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + usage(); + exit(0); + } + if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) + { + puts("pgbench (PostgreSQL) " PG_VERSION); + exit(0); + } + } + + state = (CState *) pg_malloc0(sizeof(CState)); + + /* set random seed early, because it may be used while parsing scripts. */ + if (!set_random_seed(getenv("PGBENCH_RANDOM_SEED"))) + pg_fatal("error while setting random seed from PGBENCH_RANDOM_SEED environment variable"); + + while ((c = getopt_long(argc, argv, "iI:h:nvp:dqb:SNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1) + { + char *script; + + switch (c) + { + case 'i': + is_init_mode = true; + break; + case 'I': + if (initialize_steps) + pg_free(initialize_steps); + initialize_steps = pg_strdup(optarg); + checkInitSteps(initialize_steps); + initialization_option_set = true; + break; + case 'h': + pghost = pg_strdup(optarg); + break; + case 'n': + is_no_vacuum = true; + break; + case 'v': + benchmarking_option_set = true; + do_vacuum_accounts = true; + break; + case 'p': + pgport = pg_strdup(optarg); + break; + case 'd': + pg_logging_increase_verbosity(); + break; + case 'c': + benchmarking_option_set = true; + if (!option_parse_int(optarg, "-c/--clients", 1, INT_MAX, + &nclients)) + { + exit(1); + } +#ifdef HAVE_GETRLIMIT +#ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */ + if (getrlimit(RLIMIT_NOFILE, &rlim) == -1) +#else /* but BSD doesn't ... 
*/ + if (getrlimit(RLIMIT_OFILE, &rlim) == -1) +#endif /* RLIMIT_NOFILE */ + pg_fatal("getrlimit failed: %m"); + if (rlim.rlim_cur < nclients + 3) + { + pg_log_error("need at least %d open files, but system limit is %ld", + nclients + 3, (long) rlim.rlim_cur); + pg_log_error_hint("Reduce number of clients, or use limit/ulimit to increase the system limit."); + exit(1); + } +#endif /* HAVE_GETRLIMIT */ + break; + case 'j': /* jobs */ + benchmarking_option_set = true; + if (!option_parse_int(optarg, "-j/--jobs", 1, INT_MAX, + &nthreads)) + { + exit(1); + } +#ifndef ENABLE_THREAD_SAFETY + if (nthreads != 1) + pg_fatal("threads are not supported on this platform; use -j1"); +#endif /* !ENABLE_THREAD_SAFETY */ + break; + case 'C': + benchmarking_option_set = true; + is_connect = true; + break; + case 'r': + benchmarking_option_set = true; + report_per_command = true; + break; + case 's': + scale_given = true; + if (!option_parse_int(optarg, "-s/--scale", 1, INT_MAX, + &scale)) + exit(1); + break; + case 't': + benchmarking_option_set = true; + if (!option_parse_int(optarg, "-t/--transactions", 1, INT_MAX, + &nxacts)) + exit(1); + break; + case 'T': + benchmarking_option_set = true; + if (!option_parse_int(optarg, "-T/--time", 1, INT_MAX, + &duration)) + exit(1); + break; + case 'U': + username = pg_strdup(optarg); + break; + case 'l': + benchmarking_option_set = true; + use_log = true; + break; + case 'q': + initialization_option_set = true; + use_quiet = true; + break; + case 'b': + if (strcmp(optarg, "list") == 0) + { + listAvailableScripts(); + exit(0); + } + weight = parseScriptWeight(optarg, &script); + process_builtin(findBuiltin(script), weight); + benchmarking_option_set = true; + internal_script_used = true; + break; + case 'S': + process_builtin(findBuiltin("select-only"), 1); + benchmarking_option_set = true; + internal_script_used = true; + break; + case 'N': + process_builtin(findBuiltin("simple-update"), 1); + benchmarking_option_set = true; + internal_script_used = true; + break; + case 'f': + weight = parseScriptWeight(optarg, &script); + process_file(script, weight); + benchmarking_option_set = true; + break; + case 'D': + { + char *p; + + benchmarking_option_set = true; + + if ((p = strchr(optarg, '=')) == NULL || p == optarg || *(p + 1) == '\0') + pg_fatal("invalid variable definition: \"%s\"", optarg); + + *p++ = '\0'; + if (!putVariable(&state[0].variables, "option", optarg, p)) + exit(1); + } + break; + case 'F': + initialization_option_set = true; + if (!option_parse_int(optarg, "-F/--fillfactor", 10, 100, + &fillfactor)) + exit(1); + break; + case 'M': + benchmarking_option_set = true; + for (querymode = 0; querymode < NUM_QUERYMODE; querymode++) + if (strcmp(optarg, QUERYMODE[querymode]) == 0) + break; + if (querymode >= NUM_QUERYMODE) + pg_fatal("invalid query mode (-M): \"%s\"", optarg); + break; + case 'P': + benchmarking_option_set = true; + if (!option_parse_int(optarg, "-P/--progress", 1, INT_MAX, + &progress)) + exit(1); + break; + case 'R': + { + /* get a double from the beginning of option value */ + double throttle_value = atof(optarg); + + benchmarking_option_set = true; + + if (throttle_value <= 0.0) + pg_fatal("invalid rate limit: \"%s\"", optarg); + /* Invert rate limit into per-transaction delay in usec */ + throttle_delay = 1000000.0 / throttle_value; + } + break; + case 'L': + { + double limit_ms = atof(optarg); + + if (limit_ms <= 0.0) + pg_fatal("invalid latency limit: \"%s\"", optarg); + benchmarking_option_set = true; + latency_limit = (int64) 
(limit_ms * 1000); + } + break; + case 1: /* unlogged-tables */ + initialization_option_set = true; + unlogged_tables = true; + break; + case 2: /* tablespace */ + initialization_option_set = true; + tablespace = pg_strdup(optarg); + break; + case 3: /* index-tablespace */ + initialization_option_set = true; + index_tablespace = pg_strdup(optarg); + break; + case 4: /* sampling-rate */ + benchmarking_option_set = true; + sample_rate = atof(optarg); + if (sample_rate <= 0.0 || sample_rate > 1.0) + pg_fatal("invalid sampling rate: \"%s\"", optarg); + break; + case 5: /* aggregate-interval */ + benchmarking_option_set = true; + if (!option_parse_int(optarg, "--aggregate-interval", 1, INT_MAX, + &agg_interval)) + exit(1); + break; + case 6: /* progress-timestamp */ + progress_timestamp = true; + benchmarking_option_set = true; + break; + case 7: /* log-prefix */ + benchmarking_option_set = true; + logfile_prefix = pg_strdup(optarg); + break; + case 8: /* foreign-keys */ + initialization_option_set = true; + foreign_keys = true; + break; + case 9: /* random-seed */ + benchmarking_option_set = true; + if (!set_random_seed(optarg)) + pg_fatal("error while setting random seed from --random-seed option"); + break; + case 10: /* list */ + { + const BuiltinScript *s = findBuiltin(optarg); + + fprintf(stderr, "-- %s: %s\n%s\n", s->name, s->desc, s->script); + exit(0); + } + break; + case 11: /* partitions */ + initialization_option_set = true; + if (!option_parse_int(optarg, "--partitions", 0, INT_MAX, + &partitions)) + exit(1); + break; + case 12: /* partition-method */ + initialization_option_set = true; + if (pg_strcasecmp(optarg, "range") == 0) + partition_method = PART_RANGE; + else if (pg_strcasecmp(optarg, "hash") == 0) + partition_method = PART_HASH; + else + pg_fatal("invalid partition method, expecting \"range\" or \"hash\", got: \"%s\"", + optarg); + break; + case 13: /* failures-detailed */ + benchmarking_option_set = true; + failures_detailed = true; + break; + case 14: /* max-tries */ + { + int32 max_tries_arg = atoi(optarg); + + if (max_tries_arg < 0) + pg_fatal("invalid number of maximum tries: \"%s\"", optarg); + + benchmarking_option_set = true; + max_tries = (uint32) max_tries_arg; + } + break; + case 15: /* verbose-errors */ + benchmarking_option_set = true; + verbose_errors = true; + break; + default: + /* getopt_long already emitted a complaint */ + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + } + + /* set default script if none */ + if (num_scripts == 0 && !is_init_mode) + { + process_builtin(findBuiltin("tpcb-like"), 1); + benchmarking_option_set = true; + internal_script_used = true; + } + + /* complete SQL command initialization and compute total weight */ + for (i = 0; i < num_scripts; i++) + { + Command **commands = sql_script[i].commands; + + for (int j = 0; commands[j] != NULL; j++) + if (commands[j]->type == SQL_COMMAND) + postprocess_sql_command(commands[j]); + + /* cannot overflow: weight is 32b, total_weight 64b */ + total_weight += sql_script[i].weight; + } + + if (total_weight == 0 && !is_init_mode) + pg_fatal("total script weight must not be zero"); + + /* show per script stats if several scripts are used */ + if (num_scripts > 1) + per_script_stats = true; + + /* + * Don't need more threads than there are clients. (This is not merely an + * optimization; throttle_delay is calculated incorrectly below if some + * threads have no clients assigned to them.) 
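+ *
+ * For example, -R 100 yields a per-transaction delay of 10000 usec; after
+ * it is multiplied by nthreads below, each thread schedules transactions
+ * at roughly rate/nthreads tps, which only adds up to the requested rate
+ * if every thread actually has at least one client to run.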
+ */ + if (nthreads > nclients) + nthreads = nclients; + + /* + * Convert throttle_delay to a per-thread delay time. Note that this + * might be a fractional number of usec, but that's OK, since it's just + * the center of a Poisson distribution of delays. + */ + throttle_delay *= nthreads; + + if (argc > optind) + dbName = argv[optind++]; + else + { + if ((env = getenv("PGDATABASE")) != NULL && *env != '\0') + dbName = env; + else if ((env = getenv("PGUSER")) != NULL && *env != '\0') + dbName = env; + else + dbName = get_user_name_or_exit(progname); + } + + if (optind < argc) + { + pg_log_error("too many command-line arguments (first is \"%s\")", + argv[optind]); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + + if (is_init_mode) + { + if (benchmarking_option_set) + pg_fatal("some of the specified options cannot be used in initialization (-i) mode"); + + if (partitions == 0 && partition_method != PART_NONE) + pg_fatal("--partition-method requires greater than zero --partitions"); + + /* set default method */ + if (partitions > 0 && partition_method == PART_NONE) + partition_method = PART_RANGE; + + if (initialize_steps == NULL) + initialize_steps = pg_strdup(DEFAULT_INIT_STEPS); + + if (is_no_vacuum) + { + /* Remove any vacuum step in initialize_steps */ + char *p; + + while ((p = strchr(initialize_steps, 'v')) != NULL) + *p = ' '; + } + + if (foreign_keys) + { + /* Add 'f' to end of initialize_steps, if not already there */ + if (strchr(initialize_steps, 'f') == NULL) + { + initialize_steps = (char *) + pg_realloc(initialize_steps, + strlen(initialize_steps) + 2); + strcat(initialize_steps, "f"); + } + } + + runInitSteps(initialize_steps); + exit(0); + } + else + { + if (initialization_option_set) + pg_fatal("some of the specified options cannot be used in benchmarking mode"); + } + + if (nxacts > 0 && duration > 0) + pg_fatal("specify either a number of transactions (-t) or a duration (-T), not both"); + + /* Use DEFAULT_NXACTS if neither nxacts nor duration is specified. */ + if (nxacts <= 0 && duration <= 0) + nxacts = DEFAULT_NXACTS; + + /* --sampling-rate may be used only with -l */ + if (sample_rate > 0.0 && !use_log) + pg_fatal("log sampling (--sampling-rate) is allowed only when logging transactions (-l)"); + + /* --sampling-rate may not be used with --aggregate-interval */ + if (sample_rate > 0.0 && agg_interval > 0) + pg_fatal("log sampling (--sampling-rate) and aggregation (--aggregate-interval) cannot be used at the same time"); + + if (agg_interval > 0 && !use_log) + pg_fatal("log aggregation is allowed only when actually logging transactions"); + + if (!use_log && logfile_prefix) + pg_fatal("log file prefix (--log-prefix) is allowed only when logging transactions (-l)"); + + if (duration > 0 && agg_interval > duration) + pg_fatal("number of seconds for aggregation (%d) must not be higher than test duration (%d)", agg_interval, duration); + + if (duration > 0 && agg_interval > 0 && duration % agg_interval != 0) + pg_fatal("duration (%d) must be a multiple of aggregation interval (%d)", duration, agg_interval); + + if (progress_timestamp && progress == 0) + pg_fatal("--progress-timestamp is allowed only under --progress"); + + if (!max_tries) + { + if (!latency_limit && duration <= 0) + pg_fatal("an unlimited number of transaction tries can only be used with --latency-limit or a duration (-T)"); + } + + /* + * save main process id in the global variable because process id will be + * changed after fork. 
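+ *
+ * main_pid is also used to build the per-thread transaction log file
+ * names ("<prefix>.<main_pid>" for thread 0, "<prefix>.<main_pid>.<tid>"
+ * for the others) in threadRun().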
+ */ + main_pid = (int) getpid(); + + if (nclients > 1) + { + state = (CState *) pg_realloc(state, sizeof(CState) * nclients); + memset(state + 1, 0, sizeof(CState) * (nclients - 1)); + + /* copy any -D switch values to all clients */ + for (i = 1; i < nclients; i++) + { + int j; + + state[i].id = i; + for (j = 0; j < state[0].variables.nvars; j++) + { + Variable *var = &state[0].variables.vars[j]; + + if (var->value.type != PGBT_NO_VALUE) + { + if (!putVariableValue(&state[i].variables, "startup", + var->name, &var->value)) + exit(1); + } + else + { + if (!putVariable(&state[i].variables, "startup", + var->name, var->svalue)) + exit(1); + } + } + } + } + + /* other CState initializations */ + for (i = 0; i < nclients; i++) + { + state[i].cstack = conditional_stack_create(); + initRandomState(&state[i].cs_func_rs); + } + + /* opening connection... */ + con = doConnect(); + if (con == NULL) + pg_fatal("could not create connection for setup"); + + /* report pgbench and server versions */ + printVersion(con); + + pg_log_debug("pghost: %s pgport: %s nclients: %d %s: %d dbName: %s", + PQhost(con), PQport(con), nclients, + duration <= 0 ? "nxacts" : "duration", + duration <= 0 ? nxacts : duration, PQdb(con)); + + if (internal_script_used) + GetTableInfo(con, scale_given); + + /* + * :scale variables normally get -s or database scale, but don't override + * an explicit -D switch + */ + if (lookupVariable(&state[0].variables, "scale") == NULL) + { + for (i = 0; i < nclients; i++) + { + if (!putVariableInt(&state[i].variables, "startup", "scale", scale)) + exit(1); + } + } + + /* + * Define a :client_id variable that is unique per connection. But don't + * override an explicit -D switch. + */ + if (lookupVariable(&state[0].variables, "client_id") == NULL) + { + for (i = 0; i < nclients; i++) + if (!putVariableInt(&state[i].variables, "startup", "client_id", i)) + exit(1); + } + + /* set default seed for hash functions */ + if (lookupVariable(&state[0].variables, "default_seed") == NULL) + { + uint64 seed = pg_prng_uint64(&base_random_sequence); + + for (i = 0; i < nclients; i++) + if (!putVariableInt(&state[i].variables, "startup", "default_seed", + (int64) seed)) + exit(1); + } + + /* set random seed unless overwritten */ + if (lookupVariable(&state[0].variables, "random_seed") == NULL) + { + for (i = 0; i < nclients; i++) + if (!putVariableInt(&state[i].variables, "startup", "random_seed", + random_seed)) + exit(1); + } + + if (!is_no_vacuum) + { + fprintf(stderr, "starting vacuum..."); + tryExecuteStatement(con, "vacuum pgbench_branches"); + tryExecuteStatement(con, "vacuum pgbench_tellers"); + tryExecuteStatement(con, "truncate pgbench_history"); + fprintf(stderr, "end.\n"); + + if (do_vacuum_accounts) + { + fprintf(stderr, "starting vacuum pgbench_accounts..."); + tryExecuteStatement(con, "vacuum analyze pgbench_accounts"); + fprintf(stderr, "end.\n"); + } + } + PQfinish(con); + + /* set up thread data structures */ + threads = (TState *) pg_malloc(sizeof(TState) * nthreads); + nclients_dealt = 0; + + for (i = 0; i < nthreads; i++) + { + TState *thread = &threads[i]; + + thread->tid = i; + thread->state = &state[nclients_dealt]; + thread->nstate = + (nclients - nclients_dealt + nthreads - i - 1) / (nthreads - i); + initRandomState(&thread->ts_choose_rs); + initRandomState(&thread->ts_throttle_rs); + initRandomState(&thread->ts_sample_rs); + thread->logfile = NULL; /* filled in later */ + thread->latency_late = 0; + initStats(&thread->stats, 0); + + nclients_dealt += thread->nstate; + } + + /* 
all clients must be assigned to a thread */ + Assert(nclients_dealt == nclients); + + /* get start up time for the whole computation */ + start_time = pg_time_now(); + + /* set alarm if duration is specified. */ + if (duration > 0) + setalarm(duration); + + errno = THREAD_BARRIER_INIT(&barrier, nthreads); + if (errno != 0) + pg_fatal("could not initialize barrier: %m"); + +#ifdef ENABLE_THREAD_SAFETY + /* start all threads but thread 0 which is executed directly later */ + for (i = 1; i < nthreads; i++) + { + TState *thread = &threads[i]; + + thread->create_time = pg_time_now(); + errno = THREAD_CREATE(&thread->thread, threadRun, thread); + + if (errno != 0) + pg_fatal("could not create thread: %m"); + } +#else + Assert(nthreads == 1); +#endif /* ENABLE_THREAD_SAFETY */ + + /* compute when to stop */ + threads[0].create_time = pg_time_now(); + if (duration > 0) + end_time = threads[0].create_time + (int64) 1000000 * duration; + + /* run thread 0 directly */ + (void) threadRun(&threads[0]); + + /* wait for other threads and accumulate results */ + initStats(&stats, 0); + conn_total_duration = 0; + + for (i = 0; i < nthreads; i++) + { + TState *thread = &threads[i]; + +#ifdef ENABLE_THREAD_SAFETY + if (i > 0) + THREAD_JOIN(thread->thread); +#endif /* ENABLE_THREAD_SAFETY */ + + for (int j = 0; j < thread->nstate; j++) + if (thread->state[j].state != CSTATE_FINISHED) + exit_code = 2; + + /* aggregate thread level stats */ + mergeSimpleStats(&stats.latency, &thread->stats.latency); + mergeSimpleStats(&stats.lag, &thread->stats.lag); + stats.cnt += thread->stats.cnt; + stats.skipped += thread->stats.skipped; + stats.retries += thread->stats.retries; + stats.retried += thread->stats.retried; + stats.serialization_failures += thread->stats.serialization_failures; + stats.deadlock_failures += thread->stats.deadlock_failures; + latency_late += thread->latency_late; + conn_total_duration += thread->conn_duration; + + /* first recorded benchmarking start time */ + if (bench_start == 0 || thread->bench_start < bench_start) + bench_start = thread->bench_start; + } + + /* + * All connections should be already closed in threadRun(), so this + * disconnect_all() will be a no-op, but clean up the connections just to + * be sure. We don't need to measure the disconnection delays here. + */ + disconnect_all(state, nclients); + + /* + * Beware that performance of short benchmarks with many threads and + * possibly long transactions can be deceptive because threads do not + * start and finish at the exact same time. The total duration computed + * here encompasses all transactions so that tps shown is somehow slightly + * underestimated. + */ + printResults(&stats, pg_time_now() - bench_start, conn_total_duration, + bench_start - start_time, latency_late); + + THREAD_BARRIER_DESTROY(&barrier); + + if (exit_code != 0) + pg_log_error("Run was aborted; the above results are incomplete."); + + return exit_code; +} + +static THREAD_FUNC_RETURN_TYPE THREAD_FUNC_CC +threadRun(void *arg) +{ + TState *thread = (TState *) arg; + CState *state = thread->state; + pg_time_usec_t start; + int nstate = thread->nstate; + int remains = nstate; /* number of remaining clients */ + socket_set *sockets = alloc_socket_set(nstate); + int64 thread_start, + last_report, + next_report; + StatsData last, + aggs; + + /* open log file if requested */ + if (use_log) + { + char logpath[MAXPGPATH]; + char *prefix = logfile_prefix ? 
logfile_prefix : "pgbench_log"; + + if (thread->tid == 0) + snprintf(logpath, sizeof(logpath), "%s.%d", prefix, main_pid); + else + snprintf(logpath, sizeof(logpath), "%s.%d.%d", prefix, main_pid, thread->tid); + + thread->logfile = fopen(logpath, "w"); + + if (thread->logfile == NULL) + pg_fatal("could not open logfile \"%s\": %m", logpath); + } + + /* explicitly initialize the state machines */ + for (int i = 0; i < nstate; i++) + state[i].state = CSTATE_CHOOSE_SCRIPT; + + /* READY */ + THREAD_BARRIER_WAIT(&barrier); + + thread_start = pg_time_now(); + thread->started_time = thread_start; + thread->conn_duration = 0; + last_report = thread_start; + next_report = last_report + (int64) 1000000 * progress; + + /* STEADY */ + if (!is_connect) + { + /* make connections to the database before starting */ + for (int i = 0; i < nstate; i++) + { + if ((state[i].con = doConnect()) == NULL) + { + /* coldly abort on initial connection failure */ + pg_fatal("could not create connection for client %d", + state[i].id); + } + } + } + + /* GO */ + THREAD_BARRIER_WAIT(&barrier); + + start = pg_time_now(); + thread->bench_start = start; + thread->throttle_trigger = start; + + /* + * The log format currently has Unix epoch timestamps with whole numbers + * of seconds. Round the first aggregate's start time down to the nearest + * Unix epoch second (the very first aggregate might really have started a + * fraction of a second later, but later aggregates are measured from the + * whole number time that is actually logged). + */ + initStats(&aggs, (start + epoch_shift) / 1000000 * 1000000); + last = aggs; + + /* loop till all clients have terminated */ + while (remains > 0) + { + int nsocks; /* number of sockets to be waited for */ + pg_time_usec_t min_usec; + pg_time_usec_t now = 0; /* set this only if needed */ + + /* + * identify which client sockets should be checked for input, and + * compute the nearest time (if any) at which we need to wake up. + */ + clear_socket_set(sockets); + nsocks = 0; + min_usec = PG_INT64_MAX; + for (int i = 0; i < nstate; i++) + { + CState *st = &state[i]; + + if (st->state == CSTATE_SLEEP || st->state == CSTATE_THROTTLE) + { + /* a nap from the script, or under throttling */ + pg_time_usec_t this_usec; + + /* get current time if needed */ + pg_time_now_lazy(&now); + + /* min_usec should be the minimum delay across all clients */ + this_usec = (st->state == CSTATE_SLEEP ? + st->sleep_until : st->txn_scheduled) - now; + if (min_usec > this_usec) + min_usec = this_usec; + } + else if (st->state == CSTATE_WAIT_RESULT || + st->state == CSTATE_WAIT_ROLLBACK_RESULT) + { + /* + * waiting for result from server - nothing to do unless the + * socket is readable + */ + int sock = PQsocket(st->con); + + if (sock < 0) + { + pg_log_error("invalid socket: %s", PQerrorMessage(st->con)); + goto done; + } + + add_socket_to_set(sockets, sock, nsocks++); + } + else if (st->state != CSTATE_ABORTED && + st->state != CSTATE_FINISHED) + { + /* + * This client thread is ready to do something, so we don't + * want to wait. No need to examine additional clients. + */ + min_usec = 0; + break; + } + } + + /* also wake up to print the next progress report on time */ + if (progress && min_usec > 0 && thread->tid == 0) + { + pg_time_now_lazy(&now); + + if (now >= next_report) + min_usec = 0; + else if ((next_report - now) < min_usec) + min_usec = next_report - now; + } + + /* + * If no clients are ready to execute actions, sleep until we receive + * data on some client socket or the timeout (if any) elapses. 
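+ *
+ * Here min_usec == 0 means that at least one client can proceed right
+ * away, so we must not sleep at all, while min_usec == PG_INT64_MAX means
+ * that no client asked for a timed wake-up, so we can wait on the sockets
+ * without any timeout.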
+ */ + if (min_usec > 0) + { + int rc = 0; + + if (min_usec != PG_INT64_MAX) + { + if (nsocks > 0) + { + rc = wait_on_socket_set(sockets, min_usec); + } + else /* nothing active, simple sleep */ + { + pg_usleep(min_usec); + } + } + else /* no explicit delay, wait without timeout */ + { + rc = wait_on_socket_set(sockets, 0); + } + + if (rc < 0) + { + if (errno == EINTR) + { + /* On EINTR, go back to top of loop */ + continue; + } + /* must be something wrong */ + pg_log_error("%s() failed: %m", SOCKET_WAIT_METHOD); + goto done; + } + } + else + { + /* min_usec <= 0, i.e. something needs to be executed now */ + + /* If we didn't wait, don't try to read any data */ + clear_socket_set(sockets); + } + + /* ok, advance the state machine of each connection */ + nsocks = 0; + for (int i = 0; i < nstate; i++) + { + CState *st = &state[i]; + + if (st->state == CSTATE_WAIT_RESULT || + st->state == CSTATE_WAIT_ROLLBACK_RESULT) + { + /* don't call advanceConnectionState unless data is available */ + int sock = PQsocket(st->con); + + if (sock < 0) + { + pg_log_error("invalid socket: %s", PQerrorMessage(st->con)); + goto done; + } + + if (!socket_has_input(sockets, sock, nsocks++)) + continue; + } + else if (st->state == CSTATE_FINISHED || + st->state == CSTATE_ABORTED) + { + /* this client is done, no need to consider it anymore */ + continue; + } + + advanceConnectionState(thread, st, &aggs); + + /* + * If advanceConnectionState changed client to finished state, + * that's one fewer client that remains. + */ + if (st->state == CSTATE_FINISHED || st->state == CSTATE_ABORTED) + remains--; + } + + /* progress report is made by thread 0 for all threads */ + if (progress && thread->tid == 0) + { + pg_time_usec_t now = pg_time_now(); + + if (now >= next_report) + { + /* + * Horrible hack: this relies on the thread pointer we are + * passed to be equivalent to threads[0], that is the first + * entry of the threads array. That is why this MUST be done + * by thread 0 and not any other. + */ + printProgressReport(thread, thread_start, now, + &last, &last_report); + + /* + * Ensure that the next report is in the future, in case + * pgbench/postgres got stuck somewhere. + */ + do + { + next_report += (int64) 1000000 * progress; + } while (now >= next_report); + } + } + } + +done: + disconnect_all(state, nstate); + + if (thread->logfile) + { + if (agg_interval > 0) + { + /* log aggregated but not yet reported transactions */ + doLog(thread, state, &aggs, false, 0, 0); + } + fclose(thread->logfile); + thread->logfile = NULL; + } + free_socket_set(sockets); + THREAD_FUNC_RETURN; +} + +static void +finishCon(CState *st) +{ + if (st->con != NULL) + { + PQfinish(st->con); + st->con = NULL; + } +} + +/* + * Support for duration option: set timer_exceeded after so many seconds. + */ + +#ifndef WIN32 + +static void +handle_sig_alarm(SIGNAL_ARGS) +{ + timer_exceeded = true; +} + +static void +setalarm(int seconds) +{ + pqsignal(SIGALRM, handle_sig_alarm); + alarm(seconds); +} + +#else /* WIN32 */ + +static VOID CALLBACK +win32_timer_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired) +{ + timer_exceeded = true; +} + +static void +setalarm(int seconds) +{ + HANDLE queue; + HANDLE timer; + + /* This function will be called at most once, so we can cheat a bit. 
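+ * (In particular, the timer queue and timer handles are never deleted;
+ * the process exits soon after the alarm fires, so leaking them is
+ * presumed harmless.)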
*/ + queue = CreateTimerQueue(); + if (seconds > ((DWORD) -1) / 1000 || + !CreateTimerQueueTimer(&timer, queue, + win32_timer_callback, NULL, seconds * 1000, 0, + WT_EXECUTEINTIMERTHREAD | WT_EXECUTEONLYONCE)) + pg_fatal("failed to set timer"); +} + +#endif /* WIN32 */ + + +/* + * These functions provide an abstraction layer that hides the syscall + * we use to wait for input on a set of sockets. + * + * Currently there are two implementations, based on ppoll(2) and select(2). + * ppoll() is preferred where available due to its typically higher ceiling + * on the number of usable sockets. We do not use the more-widely-available + * poll(2) because it only offers millisecond timeout resolution, which could + * be problematic with high --rate settings. + * + * Function APIs: + * + * alloc_socket_set: allocate an empty socket set with room for up to + * "count" sockets. + * + * free_socket_set: deallocate a socket set. + * + * clear_socket_set: reset a socket set to empty. + * + * add_socket_to_set: add socket with indicated FD to slot "idx" in the + * socket set. Slots must be filled in order, starting with 0. + * + * wait_on_socket_set: wait for input on any socket in set, or for timeout + * to expire. timeout is measured in microseconds; 0 means wait forever. + * Returns result code of underlying syscall (>=0 if OK, else see errno). + * + * socket_has_input: after waiting, call this to see if given socket has + * input. fd and idx parameters should match some previous call to + * add_socket_to_set. + * + * Note that wait_on_socket_set destructively modifies the state of the + * socket set. After checking for input, caller must apply clear_socket_set + * and add_socket_to_set again before waiting again. + */ + +#ifdef POLL_USING_PPOLL + +static socket_set * +alloc_socket_set(int count) +{ + socket_set *sa; + + sa = (socket_set *) pg_malloc0(offsetof(socket_set, pollfds) + + sizeof(struct pollfd) * count); + sa->maxfds = count; + sa->curfds = 0; + return sa; +} + +static void +free_socket_set(socket_set *sa) +{ + pg_free(sa); +} + +static void +clear_socket_set(socket_set *sa) +{ + sa->curfds = 0; +} + +static void +add_socket_to_set(socket_set *sa, int fd, int idx) +{ + Assert(idx < sa->maxfds && idx == sa->curfds); + sa->pollfds[idx].fd = fd; + sa->pollfds[idx].events = POLLIN; + sa->pollfds[idx].revents = 0; + sa->curfds++; +} + +static int +wait_on_socket_set(socket_set *sa, int64 usecs) +{ + if (usecs > 0) + { + struct timespec timeout; + + timeout.tv_sec = usecs / 1000000; + timeout.tv_nsec = (usecs % 1000000) * 1000; + return ppoll(sa->pollfds, sa->curfds, &timeout, NULL); + } + else + { + return ppoll(sa->pollfds, sa->curfds, NULL, NULL); + } +} + +static bool +socket_has_input(socket_set *sa, int fd, int idx) +{ + /* + * In some cases, threadRun will apply clear_socket_set and then try to + * apply socket_has_input anyway with arguments that it used before that, + * or might've used before that except that it exited its setup loop + * early. Hence, if the socket set is empty, silently return false + * regardless of the parameters. If it's not empty, we can Assert that + * the parameters match a previous call. 
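+ *
+ * For reference, the calling pattern in threadRun() is roughly:
+ *
+ *     clear_socket_set(sockets);
+ *     add_socket_to_set(sockets, sock, nsocks++);   (once per waiting client)
+ *     wait_on_socket_set(sockets, min_usec);
+ *     socket_has_input(sockets, sock, nsocks++);    (once per waiting client)
+ *
+ * with the clear-without-refill case arising when no wait was needed.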
+ */ + if (sa->curfds == 0) + return false; + + Assert(idx < sa->curfds && sa->pollfds[idx].fd == fd); + return (sa->pollfds[idx].revents & POLLIN) != 0; +} + +#endif /* POLL_USING_PPOLL */ + +#ifdef POLL_USING_SELECT + +static socket_set * +alloc_socket_set(int count) +{ + return (socket_set *) pg_malloc0(sizeof(socket_set)); +} + +static void +free_socket_set(socket_set *sa) +{ + pg_free(sa); +} + +static void +clear_socket_set(socket_set *sa) +{ + FD_ZERO(&sa->fds); + sa->maxfd = -1; +} + +static void +add_socket_to_set(socket_set *sa, int fd, int idx) +{ + if (fd < 0 || fd >= FD_SETSIZE) + { + /* + * Doing a hard exit here is a bit grotty, but it doesn't seem worth + * complicating the API to make it less grotty. + */ + pg_fatal("too many client connections for select()"); + } + FD_SET(fd, &sa->fds); + if (fd > sa->maxfd) + sa->maxfd = fd; +} + +static int +wait_on_socket_set(socket_set *sa, int64 usecs) +{ + if (usecs > 0) + { + struct timeval timeout; + + timeout.tv_sec = usecs / 1000000; + timeout.tv_usec = usecs % 1000000; + return select(sa->maxfd + 1, &sa->fds, NULL, NULL, &timeout); + } + else + { + return select(sa->maxfd + 1, &sa->fds, NULL, NULL, NULL); + } +} + +static bool +socket_has_input(socket_set *sa, int fd, int idx) +{ + return (FD_ISSET(fd, &sa->fds) != 0); +} + +#endif /* POLL_USING_SELECT */ |