/* * pgbench.c * * A simple benchmark program for PostgreSQL * Originally written by Tatsuo Ishii and enhanced by many contributors. * * src/bin/pgbench/pgbench.c * Copyright (c) 2000-2021, PostgreSQL Global Development Group * ALL RIGHTS RESERVED; * * Permission to use, copy, modify, and distribute this software and its * documentation for any purpose, without fee, and without a written agreement * is hereby granted, provided that the above copyright notice and this * paragraph and the following two paragraphs appear in all copies. * * IN NO EVENT SHALL THE AUTHOR OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * THE AUTHOR AND DISTRIBUTORS SPECIFICALLY DISCLAIMS ANY WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
* */ #ifdef WIN32 #define FD_SETSIZE 1024 /* must set before winsock2.h is included */ #endif #include "postgres_fe.h" #include #include #include #include #include #include #include #ifdef HAVE_SYS_RESOURCE_H #include /* for getrlimit */ #endif /* For testing, PGBENCH_USE_SELECT can be defined to force use of that code */ #if defined(HAVE_PPOLL) && !defined(PGBENCH_USE_SELECT) #define POLL_USING_PPOLL #ifdef HAVE_POLL_H #include #endif #else /* no ppoll(), so use select() */ #define POLL_USING_SELECT #ifdef HAVE_SYS_SELECT_H #include #endif #endif #include "common/int.h" #include "common/logging.h" #include "common/string.h" #include "common/username.h" #include "fe_utils/cancel.h" #include "fe_utils/conditional.h" #include "fe_utils/string_utils.h" #include "getopt_long.h" #include "libpq-fe.h" #include "pgbench.h" #include "port/pg_bitutils.h" #include "portability/instr_time.h" #ifndef M_PI #define M_PI 3.14159265358979323846 #endif #define ERRCODE_UNDEFINED_TABLE "42P01" /* * Hashing constants */ #define FNV_PRIME UINT64CONST(0x100000001b3) #define FNV_OFFSET_BASIS UINT64CONST(0xcbf29ce484222325) #define MM2_MUL UINT64CONST(0xc6a4a7935bd1e995) #define MM2_MUL_TIMES_8 UINT64CONST(0x35253c9ade8f4ca8) #define MM2_ROT 47 /* * Multi-platform socket set implementations */ #ifdef POLL_USING_PPOLL #define SOCKET_WAIT_METHOD "ppoll" typedef struct socket_set { int maxfds; /* allocated length of pollfds[] array */ int curfds; /* number currently in use */ struct pollfd pollfds[FLEXIBLE_ARRAY_MEMBER]; } socket_set; #endif /* POLL_USING_PPOLL */ #ifdef POLL_USING_SELECT #define SOCKET_WAIT_METHOD "select" typedef struct socket_set { int maxfd; /* largest FD currently set in fds */ fd_set fds; } socket_set; #endif /* POLL_USING_SELECT */ /* * Multi-platform thread implementations */ #ifdef WIN32 /* Use Windows threads */ #include #define GETERRNO() (_dosmaperr(GetLastError()), errno) #define THREAD_T HANDLE #define THREAD_FUNC_RETURN_TYPE unsigned #define THREAD_FUNC_RETURN 
return 0 #define THREAD_FUNC_CC __stdcall #define THREAD_CREATE(handle, function, arg) \ ((*(handle) = (HANDLE) _beginthreadex(NULL, 0, (function), (arg), 0, NULL)) == 0 ? errno : 0) #define THREAD_JOIN(handle) \ (WaitForSingleObject(handle, INFINITE) != WAIT_OBJECT_0 ? \ GETERRNO() : CloseHandle(handle) ? 0 : GETERRNO()) #define THREAD_BARRIER_T SYNCHRONIZATION_BARRIER #define THREAD_BARRIER_INIT(barrier, n) \ (InitializeSynchronizationBarrier((barrier), (n), 0) ? 0 : GETERRNO()) #define THREAD_BARRIER_WAIT(barrier) \ EnterSynchronizationBarrier((barrier), \ SYNCHRONIZATION_BARRIER_FLAGS_BLOCK_ONLY) #define THREAD_BARRIER_DESTROY(barrier) #elif defined(ENABLE_THREAD_SAFETY) /* Use POSIX threads */ #include "port/pg_pthread.h" #define THREAD_T pthread_t #define THREAD_FUNC_RETURN_TYPE void * #define THREAD_FUNC_RETURN return NULL #define THREAD_FUNC_CC #define THREAD_CREATE(handle, function, arg) \ pthread_create((handle), NULL, (function), (arg)) #define THREAD_JOIN(handle) \ pthread_join((handle), NULL) #define THREAD_BARRIER_T pthread_barrier_t #define THREAD_BARRIER_INIT(barrier, n) \ pthread_barrier_init((barrier), NULL, (n)) #define THREAD_BARRIER_WAIT(barrier) pthread_barrier_wait((barrier)) #define THREAD_BARRIER_DESTROY(barrier) pthread_barrier_destroy((barrier)) #else /* No threads implementation, use none (-j 1) */ #define THREAD_T void * #define THREAD_FUNC_RETURN_TYPE void * #define THREAD_FUNC_RETURN return NULL #define THREAD_FUNC_CC #define THREAD_BARRIER_T int #define THREAD_BARRIER_INIT(barrier, n) (*(barrier) = 0) #define THREAD_BARRIER_WAIT(barrier) #define THREAD_BARRIER_DESTROY(barrier) #endif /******************************************************************** * some configurable parameters */ #define DEFAULT_INIT_STEPS "dtgvp" /* default -I setting */ #define ALL_INIT_STEPS "dtgGvpf" /* all possible steps */ #define LOG_STEP_SECONDS 5 /* seconds between log messages */ #define DEFAULT_NXACTS 10 /* default nxacts */ #define 
MIN_GAUSSIAN_PARAM 2.0 /* minimum parameter for gauss */ #define MIN_ZIPFIAN_PARAM 1.001 /* minimum parameter for zipfian */ #define MAX_ZIPFIAN_PARAM 1000.0 /* maximum parameter for zipfian */ int nxacts = 0; /* number of transactions per client */ int duration = 0; /* duration in seconds */ int64 end_time = 0; /* when to stop in micro seconds, under -T */ /* * scaling factor. for example, scale = 10 will make 1000000 tuples in * pgbench_accounts table. */ int scale = 1; /* * fillfactor. for example, fillfactor = 90 will use only 90 percent * space during inserts and leave 10 percent free. */ int fillfactor = 100; /* * use unlogged tables? */ bool unlogged_tables = false; /* * log sampling rate (1.0 = log everything, 0.0 = option not given) */ double sample_rate = 0.0; /* * When threads are throttled to a given rate limit, this is the target delay * to reach that rate in usec. 0 is the default and means no throttling. */ double throttle_delay = 0; /* * Transactions which take longer than this limit (in usec) are counted as * late, and reported as such, although they are completed anyway. When * throttling is enabled, execution time slots that are more than this late * are skipped altogether, and counted separately. */ int64 latency_limit = 0; /* * tablespace selection */ char *tablespace = NULL; char *index_tablespace = NULL; /* * Number of "pgbench_accounts" partitions. 0 is the default and means no * partitioning. 
*/
static int	partitions = 0;

/* partitioning strategy for "pgbench_accounts" */
typedef enum
{
	PART_NONE,					/* no partitioning */
	PART_RANGE,					/* range partitioning */
	PART_HASH					/* hash partitioning */
} partition_method_t;

static partition_method_t partition_method = PART_NONE;

/* display names, listed in the same order as partition_method_t */
static const char *PARTITION_METHOD[] = {"none", "range", "hash"};

/* random seed used to initialize base_random_sequence */
int64		random_seed = -1;

/*
 * end of configurable parameters
 *********************************************************************/

#define nbranches 1				/* Makes little sense to change this.  Change
								 * -s instead */
#define ntellers 10
#define naccounts 100000

/*
 * The scale factor at/beyond which 32bit integers are incapable of storing
 * 64bit values.
 *
 * Although the actual threshold is 21474, we use 20000 because it is easier to
 * document and remember, and isn't that far away from the real threshold.
 */
#define SCALE_32BIT_THRESHOLD 20000

bool		use_log;			/* log transaction latencies to a file */
bool		use_quiet;			/* quiet logging onto stderr */
int			agg_interval;		/* log aggregates instead of individual
								 * transactions */
bool		per_script_stats = false;	/* whether to collect stats per script */
int			progress = 0;		/* thread progress report interval, in
								 * seconds */
bool		progress_timestamp = false; /* progress report with Unix time */
int			nclients = 1;		/* number of clients */
int			nthreads = 1;		/* number of threads */
bool		is_connect;			/* establish connection for each transaction */
bool		report_per_command; /* report per-command latencies */
int			main_pid;			/* main process id used in log filename */

/* connection parameters; NULL means "not specified" */
const char *pghost = NULL;
const char *pgport = NULL;
const char *username = NULL;
const char *dbName = NULL;
char	   *logfile_prefix = NULL;
const char *progname;

#define WSEP '@'				/* weight separator */

volatile bool timer_exceeded = false;	/* flag from signal handler */

/*
 * Variable definitions.
 *
 * If a variable only has a string value, "svalue" is that value, and value is
 * "not set".
If the value is known, "value" contains the value (in any
 * variant).
 *
 * In this case "svalue" contains the string equivalent of the value, if we've
 * had occasion to compute that, or NULL if we haven't.
 */
typedef struct
{
	char	   *name;			/* variable's name */
	char	   *svalue;			/* its value in string form, if known */
	PgBenchValue value;			/* actual variable's value */
} Variable;

#define MAX_SCRIPTS 128			/* max number of SQL scripts allowed */
#define SHELL_COMMAND_SIZE 256	/* maximum size allowed for shell command */

/*
 * Simple data structure to keep stats about something.
 *
 * XXX probably the first value should be kept and used as an offset for
 * better numerical stability...
 */
typedef struct SimpleStats
{
	int64		count;			/* how many values were encountered */
	double		min;			/* the minimum seen */
	double		max;			/* the maximum seen */
	double		sum;			/* sum of values */
	double		sum2;			/* sum of squared values */
} SimpleStats;

/*
 * The instr_time type is expensive when dealing with time arithmetic.  Define
 * a type to hold microseconds instead.  A signed 64-bit count of microseconds
 * covers about 292,000 years (2^63 us), which is plenty.
 */
typedef int64 pg_time_usec_t;

/*
 * Data structure to hold various statistics: per-thread and per-script stats
 * are maintained and merged together.
 */
typedef struct StatsData
{
	pg_time_usec_t start_time;	/* interval start time, for aggregates */
	int64		cnt;			/* number of transactions, including skipped */
	int64		skipped;		/* number of transactions skipped under --rate
								 * and --latency-limit */
	SimpleStats latency;
	SimpleStats lag;
} StatsData;

/*
 * For displaying Unix epoch timestamps, as some time functions may have
 * another reference.
 */
pg_time_usec_t epoch_shift;

/*
 * Struct to keep random state.
 *
 * The three 16-bit words are passed as the state argument to pg_erand48()
 * and pg_jrand48().
 */
typedef struct RandomState
{
	unsigned short xseed[3];
} RandomState;

/* Various random sequences are initialized from this one.
*/
static RandomState base_random_sequence;

/* Synchronization barrier for start and connection */
static THREAD_BARRIER_T barrier;

/*
 * Connection state machine states.
 */
typedef enum
{
	/*
	 * The client must first choose a script to execute.  Once chosen, it can
	 * either be throttled (state CSTATE_PREPARE_THROTTLE under --rate), start
	 * right away (state CSTATE_START_TX) or not start at all if the timer was
	 * exceeded (state CSTATE_FINISHED).
	 */
	CSTATE_CHOOSE_SCRIPT,

	/*
	 * CSTATE_START_TX performs start-of-transaction processing.  Establishes
	 * a new connection for the transaction in --connect mode, records the
	 * transaction start time, and proceeds to the first command.
	 *
	 * Note: once a script is started, it will either error or run till its
	 * end, where it may be interrupted. It is not interrupted while running,
	 * so pgbench --time is to be understood as tx are allowed to start in
	 * that time, and will finish when their work is completed.
	 */
	CSTATE_START_TX,

	/*
	 * In CSTATE_PREPARE_THROTTLE state, we calculate when to begin the next
	 * transaction, and advance to CSTATE_THROTTLE.  CSTATE_THROTTLE state
	 * sleeps until that moment, then advances to CSTATE_START_TX, or
	 * CSTATE_FINISHED if the next transaction would start beyond the end of
	 * the run.
	 */
	CSTATE_PREPARE_THROTTLE,
	CSTATE_THROTTLE,

	/*
	 * We loop through these states, to process each command in the script:
	 *
	 * CSTATE_START_COMMAND starts the execution of a command.  On a SQL
	 * command, the command is sent to the server, and we move to
	 * CSTATE_WAIT_RESULT state unless in pipeline mode. On a \sleep
	 * meta-command, the timer is set, and we enter the CSTATE_SLEEP state to
	 * wait for it to expire. Other meta-commands are executed immediately. If
	 * the command about to start is actually beyond the end of the script,
	 * advance to CSTATE_END_TX.
	 *
	 * CSTATE_WAIT_RESULT waits until we get a result set back from the server
	 * for the current command.
	 *
	 * CSTATE_SLEEP waits until the end of \sleep.
	 *
	 * CSTATE_END_COMMAND records the end-of-command timestamp, increments the
	 * command counter, and loops back to CSTATE_START_COMMAND state.
	 *
	 * CSTATE_SKIP_COMMAND is used by conditional branches which are not
	 * executed. It quickly skips commands that do not need any evaluation.
	 * This state can move forward several commands, till there is something
	 * to do or the end of the script.
	 */
	CSTATE_START_COMMAND,
	CSTATE_WAIT_RESULT,
	CSTATE_SLEEP,
	CSTATE_END_COMMAND,
	CSTATE_SKIP_COMMAND,

	/*
	 * CSTATE_END_TX performs end-of-transaction processing.  It calculates
	 * latency, and logs the transaction.  In --connect mode, it closes the
	 * current connection.
	 *
	 * Then either starts over in CSTATE_CHOOSE_SCRIPT, or enters
	 * CSTATE_FINISHED if we have no more work to do.
	 */
	CSTATE_END_TX,

	/*
	 * Final states.  CSTATE_ABORTED means that the script execution was
	 * aborted because a command failed, CSTATE_FINISHED means success.
	 */
	CSTATE_ABORTED,
	CSTATE_FINISHED
} ConnectionStateEnum;

/*
 * Connection state.
 */
typedef struct
{
	PGconn	   *con;			/* connection handle to DB */
	int			id;				/* client No. */
	ConnectionStateEnum state;	/* state machine's current state. */
	ConditionalStack cstack;	/* enclosing conditionals state */

	/*
	 * Separate randomness for each client. This is used for random functions
	 * PGBENCH_RANDOM_* during the execution of the script.
	 */
	RandomState cs_func_rs;

	int			use_file;		/* index in sql_script for this client */
	int			command;		/* command number in script */

	/* client variables */
	Variable   *variables;		/* array of variable definitions */
	int			nvariables;		/* number of variables */
	bool		vars_sorted;	/* are variables sorted by name? */

	/* various times about current transaction in microseconds */
	pg_time_usec_t txn_scheduled;	/* scheduled start time of transaction */
	pg_time_usec_t sleep_until; /* scheduled start time of next cmd */
	pg_time_usec_t txn_begin;	/* used for measuring schedule lag times */
	pg_time_usec_t stmt_begin;	/* used for measuring statement latencies */

	bool		prepared[MAX_SCRIPTS];	/* whether client prepared the script */

	/* per client collected stats */
	int64		cnt;			/* client transaction count, for -t */
} CState;

/*
 * Thread state
 */
typedef struct
{
	int			tid;			/* thread id */
	THREAD_T	thread;			/* thread handle */
	CState	   *state;			/* array of CState */
	int			nstate;			/* length of state[] */

	/*
	 * Separate randomness for each thread. Each thread option uses its own
	 * random state to make all of them independent of each other and
	 * therefore deterministic at the thread level.
	 */
	RandomState ts_choose_rs;	/* random state for selecting a script */
	RandomState ts_throttle_rs; /* random state for transaction throttling */
	RandomState ts_sample_rs;	/* random state for log sampling */

	int64		throttle_trigger;	/* previous/next throttling (us) */
	FILE	   *logfile;		/* where to log, or NULL */

	/* per thread collected stats in microseconds */
	pg_time_usec_t create_time; /* thread creation time */
	pg_time_usec_t started_time;	/* thread is running */
	pg_time_usec_t bench_start; /* thread is benchmarking */
	pg_time_usec_t conn_duration;	/* cumulated connection and disconnection
									 * delays */

	StatsData	stats;
	int64		latency_late;	/* count executed but late transactions */
} TState;

/*
 * queries read from files
 */
#define SQL_COMMAND 1
#define META_COMMAND 2

/*
 * max number of backslash command arguments or SQL variables,
 * including the command or SQL statement itself
 */
#define MAX_ARGS 256

typedef enum MetaCommand
{
	META_NONE,					/* not a known meta-command */
	META_SET,					/* \set */
	META_SETSHELL,				/* \setshell */
	META_SHELL,					/* \shell */
	META_SLEEP,					/* \sleep */
	META_GSET,					/* \gset */
	META_ASET,					/* \aset */
	META_IF,					/* \if */
	META_ELIF,					/* \elif */
	META_ELSE,					/* \else */
	META_ENDIF,					/* \endif */
	META_STARTPIPELINE,			/* \startpipeline */
	META_ENDPIPELINE			/* \endpipeline */
} MetaCommand;

typedef enum QueryMode
{
	QUERY_SIMPLE,				/* simple query */
	QUERY_EXTENDED,				/* extended query */
	QUERY_PREPARED,				/* extended query with prepared statements */
	NUM_QUERYMODE
} QueryMode;

static QueryMode querymode = QUERY_SIMPLE;

/* display names, listed in the same order as the QueryMode values */
static const char *QUERYMODE[] = {"simple", "extended", "prepared"};

/*
 * struct Command represents one command in a script.
 *
 * lines		The raw, possibly multi-line command text.  Variable substitution
 *				not applied.
 * first_line	A short, single-line extract of 'lines', for error reporting.
 * type			SQL_COMMAND or META_COMMAND
 * meta			The type of meta-command, with META_NONE/GSET/ASET if command
 *				is SQL.
 * argc			Number of arguments of the command, 0 if not yet processed.
 * argv			Command arguments, the first of which is the command or SQL
 *				string itself.  For SQL commands, after post-processing
 *				argv[0] is the same as 'lines' with variables substituted.
 * varprefix	SQL commands terminated with \gset or \aset have this set
 *				to a non NULL value.  If nonempty, it's used to prefix the
 *				variable name that receives the value.
 * aset			do gset on all possible queries of a combined query (\;).
 *				(The struct has no member of this name; \aset is recorded
 *				through 'meta' as described above.)
 * expr			Parsed expression, if needed.
 * stats		Time spent in this command.
*/ typedef struct Command { PQExpBufferData lines; char *first_line; int type; MetaCommand meta; int argc; char *argv[MAX_ARGS]; char *varprefix; PgBenchExpr *expr; SimpleStats stats; } Command; typedef struct ParsedScript { const char *desc; /* script descriptor (eg, file name) */ int weight; /* selection weight */ Command **commands; /* NULL-terminated array of Commands */ StatsData stats; /* total time spent in script */ } ParsedScript; static ParsedScript sql_script[MAX_SCRIPTS]; /* SQL script files */ static int num_scripts; /* number of scripts in sql_script[] */ static int64 total_weight = 0; /* Builtin test scripts */ typedef struct BuiltinScript { const char *name; /* very short name for -b ... */ const char *desc; /* short description */ const char *script; /* actual pgbench script */ } BuiltinScript; static const BuiltinScript builtin_script[] = { { "tpcb-like", "", "\\set aid random(1, " CppAsString2(naccounts) " * :scale)\n" "\\set bid random(1, " CppAsString2(nbranches) " * :scale)\n" "\\set tid random(1, " CppAsString2(ntellers) " * :scale)\n" "\\set delta random(-5000, 5000)\n" "BEGIN;\n" "UPDATE pgbench_accounts SET abalance = abalance + :delta WHERE aid = :aid;\n" "SELECT abalance FROM pgbench_accounts WHERE aid = :aid;\n" "UPDATE pgbench_tellers SET tbalance = tbalance + :delta WHERE tid = :tid;\n" "UPDATE pgbench_branches SET bbalance = bbalance + :delta WHERE bid = :bid;\n" "INSERT INTO pgbench_history (tid, bid, aid, delta, mtime) VALUES (:tid, :bid, :aid, :delta, CURRENT_TIMESTAMP);\n" "END;\n" }, { "simple-update", "", "\\set aid random(1, " CppAsString2(naccounts) " * :scale)\n" "\\set bid random(1, " CppAsString2(nbranches) " * :scale)\n" "\\set tid random(1, " CppAsString2(ntellers) " * :scale)\n" "\\set delta random(-5000, 5000)\n" "BEGIN;\n" "UPDATE pgbench_accounts SET abalance = abalance + :delta WHERE aid = :aid;\n" "SELECT abalance FROM pgbench_accounts WHERE aid = :aid;\n" "INSERT INTO pgbench_history (tid, bid, aid, delta, mtime) 
VALUES (:tid, :bid, :aid, :delta, CURRENT_TIMESTAMP);\n" "END;\n" }, { "select-only", "", "\\set aid random(1, " CppAsString2(naccounts) " * :scale)\n" "SELECT abalance FROM pgbench_accounts WHERE aid = :aid;\n" } }; /* Function prototypes */ static void setNullValue(PgBenchValue *pv); static void setBoolValue(PgBenchValue *pv, bool bval); static void setIntValue(PgBenchValue *pv, int64 ival); static void setDoubleValue(PgBenchValue *pv, double dval); static bool evaluateExpr(CState *st, PgBenchExpr *expr, PgBenchValue *retval); static ConnectionStateEnum executeMetaCommand(CState *st, pg_time_usec_t *now); static void doLog(TState *thread, CState *st, StatsData *agg, bool skipped, double latency, double lag); static void processXactStats(TState *thread, CState *st, pg_time_usec_t *now, bool skipped, StatsData *agg); static void addScript(ParsedScript script); static THREAD_FUNC_RETURN_TYPE THREAD_FUNC_CC threadRun(void *arg); static void finishCon(CState *st); static void setalarm(int seconds); static socket_set *alloc_socket_set(int count); static void free_socket_set(socket_set *sa); static void clear_socket_set(socket_set *sa); static void add_socket_to_set(socket_set *sa, int fd, int idx); static int wait_on_socket_set(socket_set *sa, int64 usecs); static bool socket_has_input(socket_set *sa, int fd, int idx); /* callback functions for our flex lexer */ static const PsqlScanCallbacks pgbench_callbacks = { NULL, /* don't need get_variable functionality */ }; static inline pg_time_usec_t pg_time_now(void) { instr_time now; INSTR_TIME_SET_CURRENT(now); return (pg_time_usec_t) INSTR_TIME_GET_MICROSEC(now); } static inline void pg_time_now_lazy(pg_time_usec_t *now) { if ((*now) == 0) (*now) = pg_time_now(); } #define PG_TIME_GET_DOUBLE(t) (0.000001 * (t)) static void usage(void) { printf("%s is a benchmarking tool for PostgreSQL.\n\n" "Usage:\n" " %s [OPTION]... 
[DBNAME]\n" "\nInitialization options:\n" " -i, --initialize invokes initialization mode\n" " -I, --init-steps=[" ALL_INIT_STEPS "]+ (default \"" DEFAULT_INIT_STEPS "\")\n" " run selected initialization steps\n" " -F, --fillfactor=NUM set fill factor\n" " -n, --no-vacuum do not run VACUUM during initialization\n" " -q, --quiet quiet logging (one message each 5 seconds)\n" " -s, --scale=NUM scaling factor\n" " --foreign-keys create foreign key constraints between tables\n" " --index-tablespace=TABLESPACE\n" " create indexes in the specified tablespace\n" " --partition-method=(range|hash)\n" " partition pgbench_accounts with this method (default: range)\n" " --partitions=NUM partition pgbench_accounts into NUM parts (default: 0)\n" " --tablespace=TABLESPACE create tables in the specified tablespace\n" " --unlogged-tables create tables as unlogged tables\n" "\nOptions to select what to run:\n" " -b, --builtin=NAME[@W] add builtin script NAME weighted at W (default: 1)\n" " (use \"-b list\" to list available scripts)\n" " -f, --file=FILENAME[@W] add script FILENAME weighted at W (default: 1)\n" " -N, --skip-some-updates skip updates of pgbench_tellers and pgbench_branches\n" " (same as \"-b simple-update\")\n" " -S, --select-only perform SELECT-only transactions\n" " (same as \"-b select-only\")\n" "\nBenchmarking options:\n" " -c, --client=NUM number of concurrent database clients (default: 1)\n" " -C, --connect establish new connection for each transaction\n" " -D, --define=VARNAME=VALUE\n" " define variable for use by custom script\n" " -j, --jobs=NUM number of threads (default: 1)\n" " -l, --log write transaction times to log file\n" " -L, --latency-limit=NUM count transactions lasting more than NUM ms as late\n" " -M, --protocol=simple|extended|prepared\n" " protocol for submitting queries (default: simple)\n" " -n, --no-vacuum do not run VACUUM before tests\n" " -P, --progress=NUM show thread progress report every NUM seconds\n" " -r, --report-latencies report 
average latency per command\n" " -R, --rate=NUM target rate in transactions per second\n" " -s, --scale=NUM report this scale factor in output\n" " -t, --transactions=NUM number of transactions each client runs (default: 10)\n" " -T, --time=NUM duration of benchmark test in seconds\n" " -v, --vacuum-all vacuum all four standard tables before tests\n" " --aggregate-interval=NUM aggregate data over NUM seconds\n" " --log-prefix=PREFIX prefix for transaction time log file\n" " (default: \"pgbench_log\")\n" " --progress-timestamp use Unix epoch timestamps for progress\n" " --random-seed=SEED set random seed (\"time\", \"rand\", integer)\n" " --sampling-rate=NUM fraction of transactions to log (e.g., 0.01 for 1%%)\n" " --show-script=NAME show builtin script code, then exit\n" "\nCommon options:\n" " -d, --debug print debugging output\n" " -h, --host=HOSTNAME database server host or socket directory\n" " -p, --port=PORT database server port number\n" " -U, --username=USERNAME connect as specified database user\n" " -V, --version output version information, then exit\n" " -?, --help show this help, then exit\n" "\n" "Report bugs to <%s>.\n" "%s home page: <%s>\n", progname, progname, PACKAGE_BUGREPORT, PACKAGE_NAME, PACKAGE_URL); } /* return whether str matches "^\s*[-+]?[0-9]+$" */ static bool is_an_int(const char *str) { const char *ptr = str; /* skip leading spaces; cast is consistent with strtoint64 */ while (*ptr && isspace((unsigned char) *ptr)) ptr++; /* skip sign */ if (*ptr == '+' || *ptr == '-') ptr++; /* at least one digit */ if (*ptr && !isdigit((unsigned char) *ptr)) return false; /* eat all digits */ while (*ptr && isdigit((unsigned char) *ptr)) ptr++; /* must have reached end of string */ return *ptr == '\0'; } /* * strtoint64 -- convert a string to 64-bit integer * * This function is a slightly modified version of scanint8() from * src/backend/utils/adt/int8.c. 
* * The function returns whether the conversion worked, and if so * "*result" is set to the result. * * If not errorOK, an error message is also printed out on errors. */ bool strtoint64(const char *str, bool errorOK, int64 *result) { const char *ptr = str; int64 tmp = 0; bool neg = false; /* * Do our own scan, rather than relying on sscanf which might be broken * for long long. * * As INT64_MIN can't be stored as a positive 64 bit integer, accumulate * value as a negative number. */ /* skip leading spaces */ while (*ptr && isspace((unsigned char) *ptr)) ptr++; /* handle sign */ if (*ptr == '-') { ptr++; neg = true; } else if (*ptr == '+') ptr++; /* require at least one digit */ if (unlikely(!isdigit((unsigned char) *ptr))) goto invalid_syntax; /* process digits */ while (*ptr && isdigit((unsigned char) *ptr)) { int8 digit = (*ptr++ - '0'); if (unlikely(pg_mul_s64_overflow(tmp, 10, &tmp)) || unlikely(pg_sub_s64_overflow(tmp, digit, &tmp))) goto out_of_range; } /* allow trailing whitespace, but not other trailing chars */ while (*ptr != '\0' && isspace((unsigned char) *ptr)) ptr++; if (unlikely(*ptr != '\0')) goto invalid_syntax; if (!neg) { if (unlikely(tmp == PG_INT64_MIN)) goto out_of_range; tmp = -tmp; } *result = tmp; return true; out_of_range: if (!errorOK) pg_log_error("value \"%s\" is out of range for type bigint", str); return false; invalid_syntax: if (!errorOK) pg_log_error("invalid input syntax for type bigint: \"%s\"", str); return false; } /* convert string to double, detecting overflows/underflows */ bool strtodouble(const char *str, bool errorOK, double *dv) { char *end; errno = 0; *dv = strtod(str, &end); if (unlikely(errno != 0)) { if (!errorOK) pg_log_error("value \"%s\" is out of range for type double", str); return false; } if (unlikely(end == str || *end != '\0')) { if (!errorOK) pg_log_error("invalid input syntax for type double: \"%s\"", str); return false; } return true; } /* * Initialize a random state struct. 
* * We derive the seed from base_random_sequence, which must be set up already. */ static void initRandomState(RandomState *random_state) { random_state->xseed[0] = (unsigned short) (pg_jrand48(base_random_sequence.xseed) & 0xFFFF); random_state->xseed[1] = (unsigned short) (pg_jrand48(base_random_sequence.xseed) & 0xFFFF); random_state->xseed[2] = (unsigned short) (pg_jrand48(base_random_sequence.xseed) & 0xFFFF); } /* * Random number generator: uniform distribution from min to max inclusive. * * Although the limits are expressed as int64, you can't generate the full * int64 range in one call, because the difference of the limits mustn't * overflow int64. In practice it's unwise to ask for more than an int32 * range, because of the limited precision of pg_erand48(). */ static int64 getrand(RandomState *random_state, int64 min, int64 max) { /* * Odd coding is so that min and max have approximately the same chance of * being selected as do numbers between them. * * pg_erand48() is thread-safe and concurrent, which is why we use it * rather than random(), which in glibc is non-reentrant, and therefore * protected by a mutex, and therefore a bottleneck on machines with many * CPUs. */ return min + (int64) ((max - min + 1) * pg_erand48(random_state->xseed)); } /* * random number generator: exponential distribution from min to max inclusive. * the parameter is so that the density of probability for the last cut-off max * value is exp(-parameter). 
*/ static int64 getExponentialRand(RandomState *random_state, int64 min, int64 max, double parameter) { double cut, uniform, rand; /* abort if wrong parameter, but must really be checked beforehand */ Assert(parameter > 0.0); cut = exp(-parameter); /* erand in [0, 1), uniform in (0, 1] */ uniform = 1.0 - pg_erand48(random_state->xseed); /* * inner expression in (cut, 1] (if parameter > 0), rand in [0, 1) */ Assert((1.0 - cut) != 0.0); rand = -log(cut + (1.0 - cut) * uniform) / parameter; /* return int64 random number within between min and max */ return min + (int64) ((max - min + 1) * rand); } /* random number generator: gaussian distribution from min to max inclusive */ static int64 getGaussianRand(RandomState *random_state, int64 min, int64 max, double parameter) { double stdev; double rand; /* abort if parameter is too low, but must really be checked beforehand */ Assert(parameter >= MIN_GAUSSIAN_PARAM); /* * Get user specified random number from this loop, with -parameter < * stdev <= parameter * * This loop is executed until the number is in the expected range. * * As the minimum parameter is 2.0, the probability of looping is low: * sqrt(-2 ln(r)) <= 2 => r >= e^{-2} ~ 0.135, then when taking the * average sinus multiplier as 2/pi, we have a 8.6% looping probability in * the worst case. For a parameter value of 5.0, the looping probability * is about e^{-5} * 2 / pi ~ 0.43%. */ do { /* * pg_erand48 generates [0,1), but for the basic version of the * Box-Muller transform the two uniformly distributed random numbers * are expected in (0, 1] (see * https://en.wikipedia.org/wiki/Box-Muller_transform) */ double rand1 = 1.0 - pg_erand48(random_state->xseed); double rand2 = 1.0 - pg_erand48(random_state->xseed); /* Box-Muller basic form transform */ double var_sqrt = sqrt(-2.0 * log(rand1)); stdev = var_sqrt * sin(2.0 * M_PI * rand2); /* * we may try with cos, but there may be a bias induced if the * previous value fails the test. 
To be on the safe side, let us try * over. */ } while (stdev < -parameter || stdev >= parameter); /* stdev is in [-parameter, parameter), normalization to [0,1) */ rand = (stdev + parameter) / (parameter * 2.0); /* return int64 random number within between min and max */ return min + (int64) ((max - min + 1) * rand); } /* * random number generator: generate a value, such that the series of values * will approximate a Poisson distribution centered on the given value. * * Individual results are rounded to integers, though the center value need * not be one. */ static int64 getPoissonRand(RandomState *random_state, double center) { /* * Use inverse transform sampling to generate a value > 0, such that the * expected (i.e. average) value is the given argument. */ double uniform; /* erand in [0, 1), uniform in (0, 1] */ uniform = 1.0 - pg_erand48(random_state->xseed); return (int64) (-log(uniform) * center + 0.5); } /* * Computing zipfian using rejection method, based on * "Non-Uniform Random Variate Generation", * Luc Devroye, p. 550-551, Springer 1986. * * This works for s > 1.0, but may perform badly for s very close to 1.0. 
*/ static int64 computeIterativeZipfian(RandomState *random_state, int64 n, double s) { double b = pow(2.0, s - 1.0); double x, t, u, v; /* Ensure n is sane */ if (n <= 1) return 1; while (true) { /* random variates */ u = pg_erand48(random_state->xseed); v = pg_erand48(random_state->xseed); x = floor(pow(u, -1.0 / (s - 1.0))); t = pow(1.0 + 1.0 / x, s - 1.0); /* reject if too large or out of bound */ if (v * x * (t - 1.0) / (b - 1.0) <= t / b && x <= n) break; } return (int64) x; } /* random number generator: zipfian distribution from min to max inclusive */ static int64 getZipfianRand(RandomState *random_state, int64 min, int64 max, double s) { int64 n = max - min + 1; /* abort if parameter is invalid */ Assert(MIN_ZIPFIAN_PARAM <= s && s <= MAX_ZIPFIAN_PARAM); return min - 1 + computeIterativeZipfian(random_state, n, s); } /* * FNV-1a hash function */ static int64 getHashFnv1a(int64 val, uint64 seed) { int64 result; int i; result = FNV_OFFSET_BASIS ^ seed; for (i = 0; i < 8; ++i) { int32 octet = val & 0xff; val = val >> 8; result = result ^ octet; result = result * FNV_PRIME; } return result; } /* * Murmur2 hash function * * Based on original work of Austin Appleby * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp */ static int64 getHashMurmur2(int64 val, uint64 seed) { uint64 result = seed ^ MM2_MUL_TIMES_8; /* sizeof(int64) */ uint64 k = (uint64) val; k *= MM2_MUL; k ^= k >> MM2_ROT; k *= MM2_MUL; result ^= k; result *= MM2_MUL; result ^= result >> MM2_ROT; result *= MM2_MUL; result ^= result >> MM2_ROT; return (int64) result; } /* * Pseudorandom permutation function * * For small sizes, this generates each of the (size!) possible permutations * of integers in the range [0, size) with roughly equal probability. 
Once
 * the size is larger than 20, the number of possible permutations exceeds the
 * number of distinct states of the internal pseudorandom number generators,
 * and so not all possible permutations can be generated, but the permutations
 * chosen should continue to give the appearance of being random.
 *
 * "val" is the value to permute, "isize" the size of the range, and "seed"
 * the 64-bit value from which both internal random states are derived.  val
 * is reduced modulo isize (as unsigned values) before being permuted, and the
 * result lies in [0, isize).  For isize < 2 the result is always 0.
 *
 * THIS FUNCTION IS NOT CRYPTOGRAPHICALLY SECURE.
 * DO NOT USE FOR SUCH PURPOSE.
 */
static int64
permute(const int64 val, const int64 isize, const int64 seed)
{
	RandomState random_state1;
	RandomState random_state2;
	uint64		size;
	uint64		v;
	int			masklen;
	uint64		mask;
	int			i;

	if (isize < 2)
		return 0;				/* nothing to permute */

	/*
	 * Initialize a pair of random states using the seed.  Each xseed element
	 * receives one 16-bit slice of the seed; the second state takes the top
	 * slice and reuses the two lowest slices in different positions.
	 */
	random_state1.xseed[0] = seed & 0xFFFF;
	random_state1.xseed[1] = (seed >> 16) & 0xFFFF;
	random_state1.xseed[2] = (seed >> 32) & 0xFFFF;

	random_state2.xseed[0] = (((uint64) seed) >> 48) & 0xFFFF;
	random_state2.xseed[1] = seed & 0xFFFF;
	random_state2.xseed[2] = (seed >> 16) & 0xFFFF;

	/* Computations are performed on unsigned values */
	size = (uint64) isize;
	v = (uint64) val % size;

	/* Mask to work modulo largest power of 2 less than or equal to size */
	masklen = pg_leftmost_one_pos64(size);
	mask = (((uint64) 1) << masklen) - 1;

	/*
	 * Permute the input value by applying several rounds of pseudorandom
	 * bijective transformations.  The intention here is to distribute each
	 * input uniformly randomly across the range, and separate adjacent inputs
	 * approximately uniformly randomly from each other, leading to a fairly
	 * random overall choice of permutation.
	 *
	 * To separate adjacent inputs, we multiply by a random number modulo
	 * (mask + 1), which is a power of 2.  For this to be a bijection, the
	 * multiplier must be odd.  Since this is known to lead to less randomness
	 * in the lower bits, we also apply a rotation that shifts the topmost bit
	 * into the least significant bit.  In the special cases where size <= 3,
	 * mask = 1 and each of these operations is actually a no-op, so we also
	 * XOR the value with a different random number to inject additional
	 * randomness.  Since the size is generally not a power of 2, we apply
	 * this bijection on overlapping upper and lower halves of the input.
	 *
	 * To distribute the inputs uniformly across the range, we then also apply
	 * a random offset modulo the full range.
	 *
	 * Taken together, these operations resemble a modified linear
	 * congruential generator, as is commonly used in pseudorandom number
	 * generators.  The number of rounds is fairly arbitrary, but six has been
	 * found empirically to give a fairly good tradeoff between performance
	 * and uniform randomness.  For small sizes it selects each of the (size!)
	 * possible permutations with roughly equal probability.  For larger
	 * sizes, not all permutations can be generated, but the intended random
	 * spread is still produced.
	 *
	 * Note that the random numbers of each round are drawn outside the "if"
	 * tests, so both random states advance identically whatever the current
	 * value of v is.
	 */
	for (i = 0; i < 6; i++)
	{
		uint64		m,
					r,
					t;

		/* Random multiply (by an odd number), XOR and rotate of lower half */
		m = (uint64) getrand(&random_state1, 0, mask) | 1;
		r = (uint64) getrand(&random_state2, 0, mask);
		if (v <= mask)
		{
			v = ((v * m) ^ r) & mask;
			/* rotate left by one bit within the low masklen bits */
			v = ((v << 1) & mask) | (v >> (masklen - 1));
		}

		/* Random multiply (by an odd number), XOR and rotate of upper half */
		m = (uint64) getrand(&random_state1, 0, mask) | 1;
		r = (uint64) getrand(&random_state2, 0, mask);
		/* mirror v so that the top of the range maps onto [0, mask] */
		t = size - 1 - v;
		if (t <= mask)
		{
			t = ((t * m) ^ r) & mask;
			t = ((t << 1) & mask) | (t >> (masklen - 1));
			/* undo the mirroring */
			v = size - 1 - t;
		}

		/* Random offset */
		r = (uint64) getrand(&random_state2, 0, size - 1);
		v = (v + r) % size;
	}

	return (int64) v;
}

/*
 * Initialize the given SimpleStats struct to all zeroes
 *
 * The whole struct is cleared with memset(), so every field starts out as
 * zero bytes.
 */
static void
initSimpleStats(SimpleStats *ss)
{
	memset(ss, 0, sizeof(SimpleStats));
}

/*
 * Accumulate one value into a SimpleStats struct.
*/ static void addToSimpleStats(SimpleStats *ss, double val) { if (ss->count == 0 || val < ss->min) ss->min = val; if (ss->count == 0 || val > ss->max) ss->max = val; ss->count++; ss->sum += val; ss->sum2 += val * val; } /* * Merge two SimpleStats objects */ static void mergeSimpleStats(SimpleStats *acc, SimpleStats *ss) { if (acc->count == 0 || ss->min < acc->min) acc->min = ss->min; if (acc->count == 0 || ss->max > acc->max) acc->max = ss->max; acc->count += ss->count; acc->sum += ss->sum; acc->sum2 += ss->sum2; } /* * Initialize a StatsData struct to mostly zeroes, with its start time set to * the given value. */ static void initStats(StatsData *sd, pg_time_usec_t start) { sd->start_time = start; sd->cnt = 0; sd->skipped = 0; initSimpleStats(&sd->latency); initSimpleStats(&sd->lag); } /* * Accumulate one additional item into the given stats object. */ static void accumStats(StatsData *stats, bool skipped, double lat, double lag) { stats->cnt++; if (skipped) { /* no latency to record on skipped transactions */ stats->skipped++; } else { addToSimpleStats(&stats->latency, lat); /* and possibly the same for schedule lag */ if (throttle_delay) addToSimpleStats(&stats->lag, lag); } } /* call PQexec() and exit() on failure */ static void executeStatement(PGconn *con, const char *sql) { PGresult *res; res = PQexec(con, sql); if (PQresultStatus(res) != PGRES_COMMAND_OK) { pg_log_fatal("query failed: %s", PQerrorMessage(con)); pg_log_info("query was: %s", sql); exit(1); } PQclear(res); } /* call PQexec() and complain, but without exiting, on failure */ static void tryExecuteStatement(PGconn *con, const char *sql) { PGresult *res; res = PQexec(con, sql); if (PQresultStatus(res) != PGRES_COMMAND_OK) { pg_log_error("%s", PQerrorMessage(con)); pg_log_info("(ignoring this error and continuing anyway)"); } PQclear(res); } /* set up a connection to the backend */ static PGconn * doConnect(void) { PGconn *conn; bool new_pass; static char *password = NULL; /* * Start the 
connection. Loop until we have a password if requested by * backend. */ do { #define PARAMS_ARRAY_SIZE 7 const char *keywords[PARAMS_ARRAY_SIZE]; const char *values[PARAMS_ARRAY_SIZE]; keywords[0] = "host"; values[0] = pghost; keywords[1] = "port"; values[1] = pgport; keywords[2] = "user"; values[2] = username; keywords[3] = "password"; values[3] = password; keywords[4] = "dbname"; values[4] = dbName; keywords[5] = "fallback_application_name"; values[5] = progname; keywords[6] = NULL; values[6] = NULL; new_pass = false; conn = PQconnectdbParams(keywords, values, true); if (!conn) { pg_log_error("connection to database \"%s\" failed", dbName); return NULL; } if (PQstatus(conn) == CONNECTION_BAD && PQconnectionNeedsPassword(conn) && !password) { PQfinish(conn); password = simple_prompt("Password: ", false); new_pass = true; } } while (new_pass); /* check to see that the backend connection was successfully made */ if (PQstatus(conn) == CONNECTION_BAD) { pg_log_error("%s", PQerrorMessage(conn)); PQfinish(conn); return NULL; } return conn; } /* qsort comparator for Variable array */ static int compareVariableNames(const void *v1, const void *v2) { return strcmp(((const Variable *) v1)->name, ((const Variable *) v2)->name); } /* Locate a variable by name; returns NULL if unknown */ static Variable * lookupVariable(CState *st, char *name) { Variable key; /* On some versions of Solaris, bsearch of zero items dumps core */ if (st->nvariables <= 0) return NULL; /* Sort if we have to */ if (!st->vars_sorted) { qsort((void *) st->variables, st->nvariables, sizeof(Variable), compareVariableNames); st->vars_sorted = true; } /* Now we can search */ key.name = name; return (Variable *) bsearch((void *) &key, (void *) st->variables, st->nvariables, sizeof(Variable), compareVariableNames); } /* Get the value of a variable, in string form; returns NULL if unknown */ static char * getVariable(CState *st, char *name) { Variable *var; char stringform[64]; var = lookupVariable(st, name); 
if (var == NULL) return NULL; /* not found */ if (var->svalue) return var->svalue; /* we have it in string form */ /* We need to produce a string equivalent of the value */ Assert(var->value.type != PGBT_NO_VALUE); if (var->value.type == PGBT_NULL) snprintf(stringform, sizeof(stringform), "NULL"); else if (var->value.type == PGBT_BOOLEAN) snprintf(stringform, sizeof(stringform), "%s", var->value.u.bval ? "true" : "false"); else if (var->value.type == PGBT_INT) snprintf(stringform, sizeof(stringform), INT64_FORMAT, var->value.u.ival); else if (var->value.type == PGBT_DOUBLE) snprintf(stringform, sizeof(stringform), "%.*g", DBL_DIG, var->value.u.dval); else /* internal error, unexpected type */ Assert(0); var->svalue = pg_strdup(stringform); return var->svalue; } /* Try to convert variable to a value; return false on failure */ static bool makeVariableValue(Variable *var) { size_t slen; if (var->value.type != PGBT_NO_VALUE) return true; /* no work */ slen = strlen(var->svalue); if (slen == 0) /* what should it do on ""? */ return false; if (pg_strcasecmp(var->svalue, "null") == 0) { setNullValue(&var->value); } /* * accept prefixes such as y, ye, n, no... but not for "o". 0/1 are * recognized later as an int, which is converted to bool if needed. 
*/ else if (pg_strncasecmp(var->svalue, "true", slen) == 0 || pg_strncasecmp(var->svalue, "yes", slen) == 0 || pg_strcasecmp(var->svalue, "on") == 0) { setBoolValue(&var->value, true); } else if (pg_strncasecmp(var->svalue, "false", slen) == 0 || pg_strncasecmp(var->svalue, "no", slen) == 0 || pg_strcasecmp(var->svalue, "off") == 0 || pg_strcasecmp(var->svalue, "of") == 0) { setBoolValue(&var->value, false); } else if (is_an_int(var->svalue)) { /* if it looks like an int, it must be an int without overflow */ int64 iv; if (!strtoint64(var->svalue, false, &iv)) return false; setIntValue(&var->value, iv); } else /* type should be double */ { double dv; if (!strtodouble(var->svalue, true, &dv)) { pg_log_error("malformed variable \"%s\" value: \"%s\"", var->name, var->svalue); return false; } setDoubleValue(&var->value, dv); } return true; } /* * Check whether a variable's name is allowed. * * We allow any non-ASCII character, as well as ASCII letters, digits, and * underscore. * * Keep this in sync with the definitions of variable name characters in * "src/fe_utils/psqlscan.l", "src/bin/psql/psqlscanslash.l" and * "src/bin/pgbench/exprscan.l". Also see parseVariable(), below. * * Note: this static function is copied from "src/bin/psql/variables.c" * but changed to disallow variable names starting with a digit. */ static bool valid_variable_name(const char *name) { const unsigned char *ptr = (const unsigned char *) name; /* Mustn't be zero-length */ if (*ptr == '\0') return false; /* must not start with [0-9] */ if (IS_HIGHBIT_SET(*ptr) || strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "_", *ptr) != NULL) ptr++; else return false; /* remaining characters can include [0-9] */ while (*ptr) { if (IS_HIGHBIT_SET(*ptr) || strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "_0123456789", *ptr) != NULL) ptr++; else return false; } return true; } /* * Lookup a variable by name, creating it if need be. 
* Caller is expected to assign a value to the variable. * Returns NULL on failure (bad name). */ static Variable * lookupCreateVariable(CState *st, const char *context, char *name) { Variable *var; var = lookupVariable(st, name); if (var == NULL) { Variable *newvars; /* * Check for the name only when declaring a new variable to avoid * overhead. */ if (!valid_variable_name(name)) { pg_log_error("%s: invalid variable name: \"%s\"", context, name); return NULL; } /* Create variable at the end of the array */ if (st->variables) newvars = (Variable *) pg_realloc(st->variables, (st->nvariables + 1) * sizeof(Variable)); else newvars = (Variable *) pg_malloc(sizeof(Variable)); st->variables = newvars; var = &newvars[st->nvariables]; var->name = pg_strdup(name); var->svalue = NULL; /* caller is expected to initialize remaining fields */ st->nvariables++; /* we don't re-sort the array till we have to */ st->vars_sorted = false; } return var; } /* Assign a string value to a variable, creating it if need be */ /* Returns false on failure (bad name) */ static bool putVariable(CState *st, const char *context, char *name, const char *value) { Variable *var; char *val; var = lookupCreateVariable(st, context, name); if (!var) return false; /* dup then free, in case value is pointing at this variable */ val = pg_strdup(value); if (var->svalue) free(var->svalue); var->svalue = val; var->value.type = PGBT_NO_VALUE; return true; } /* Assign a value to a variable, creating it if need be */ /* Returns false on failure (bad name) */ static bool putVariableValue(CState *st, const char *context, char *name, const PgBenchValue *value) { Variable *var; var = lookupCreateVariable(st, context, name); if (!var) return false; if (var->svalue) free(var->svalue); var->svalue = NULL; var->value = *value; return true; } /* Assign an integer value to a variable, creating it if need be */ /* Returns false on failure (bad name) */ static bool putVariableInt(CState *st, const char *context, char *name, 
int64 value) { PgBenchValue val; setIntValue(&val, value); return putVariableValue(st, context, name, &val); } /* * Parse a possible variable reference (:varname). * * "sql" points at a colon. If what follows it looks like a valid * variable name, return a malloc'd string containing the variable name, * and set *eaten to the number of characters consumed (including the colon). * Otherwise, return NULL. */ static char * parseVariable(const char *sql, int *eaten) { int i = 1; /* starting at 1 skips the colon */ char *name; /* keep this logic in sync with valid_variable_name() */ if (IS_HIGHBIT_SET(sql[i]) || strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "_", sql[i]) != NULL) i++; else return NULL; while (IS_HIGHBIT_SET(sql[i]) || strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "_0123456789", sql[i]) != NULL) i++; name = pg_malloc(i); memcpy(name, &sql[1], i - 1); name[i - 1] = '\0'; *eaten = i; return name; } static char * replaceVariable(char **sql, char *param, int len, char *value) { int valueln = strlen(value); if (valueln > len) { size_t offset = param - *sql; *sql = pg_realloc(*sql, strlen(*sql) - len + valueln + 1); param = *sql + offset; } if (valueln != len) memmove(param + valueln, param + len, strlen(param + len) + 1); memcpy(param, value, valueln); return param + valueln; } static char * assignVariables(CState *st, char *sql) { char *p, *name, *val; p = sql; while ((p = strchr(p, ':')) != NULL) { int eaten; name = parseVariable(p, &eaten); if (name == NULL) { while (*p == ':') { p++; } continue; } val = getVariable(st, name); free(name); if (val == NULL) { p++; continue; } p = replaceVariable(&sql, p, eaten, val); } return sql; } static void getQueryParams(CState *st, const Command *command, const char **params) { int i; for (i = 0; i < command->argc - 1; i++) params[i] = getVariable(st, command->argv[i + 1]); } static char * valueTypeName(PgBenchValue *pval) { if (pval->type == PGBT_NO_VALUE) return "none"; else if 
(pval->type == PGBT_NULL) return "null"; else if (pval->type == PGBT_INT) return "int"; else if (pval->type == PGBT_DOUBLE) return "double"; else if (pval->type == PGBT_BOOLEAN) return "boolean"; else { /* internal error, should never get there */ Assert(false); return NULL; } } /* get a value as a boolean, or tell if there is a problem */ static bool coerceToBool(PgBenchValue *pval, bool *bval) { if (pval->type == PGBT_BOOLEAN) { *bval = pval->u.bval; return true; } else /* NULL, INT or DOUBLE */ { pg_log_error("cannot coerce %s to boolean", valueTypeName(pval)); *bval = false; /* suppress uninitialized-variable warnings */ return false; } } /* * Return true or false from an expression for conditional purposes. * Non zero numerical values are true, zero and NULL are false. */ static bool valueTruth(PgBenchValue *pval) { switch (pval->type) { case PGBT_NULL: return false; case PGBT_BOOLEAN: return pval->u.bval; case PGBT_INT: return pval->u.ival != 0; case PGBT_DOUBLE: return pval->u.dval != 0.0; default: /* internal error, unexpected type */ Assert(0); return false; } } /* get a value as an int, tell if there is a problem */ static bool coerceToInt(PgBenchValue *pval, int64 *ival) { if (pval->type == PGBT_INT) { *ival = pval->u.ival; return true; } else if (pval->type == PGBT_DOUBLE) { double dval = rint(pval->u.dval); if (isnan(dval) || !FLOAT8_FITS_IN_INT64(dval)) { pg_log_error("double to int overflow for %f", dval); return false; } *ival = (int64) dval; return true; } else /* BOOLEAN or NULL */ { pg_log_error("cannot coerce %s to int", valueTypeName(pval)); return false; } } /* get a value as a double, or tell if there is a problem */ static bool coerceToDouble(PgBenchValue *pval, double *dval) { if (pval->type == PGBT_DOUBLE) { *dval = pval->u.dval; return true; } else if (pval->type == PGBT_INT) { *dval = (double) pval->u.ival; return true; } else /* BOOLEAN or NULL */ { pg_log_error("cannot coerce %s to double", valueTypeName(pval)); return false; } } /* 
assign a null value */ static void setNullValue(PgBenchValue *pv) { pv->type = PGBT_NULL; pv->u.ival = 0; } /* assign a boolean value */ static void setBoolValue(PgBenchValue *pv, bool bval) { pv->type = PGBT_BOOLEAN; pv->u.bval = bval; } /* assign an integer value */ static void setIntValue(PgBenchValue *pv, int64 ival) { pv->type = PGBT_INT; pv->u.ival = ival; } /* assign a double value */ static void setDoubleValue(PgBenchValue *pv, double dval) { pv->type = PGBT_DOUBLE; pv->u.dval = dval; } static bool isLazyFunc(PgBenchFunction func) { return func == PGBENCH_AND || func == PGBENCH_OR || func == PGBENCH_CASE; } /* lazy evaluation of some functions */ static bool evalLazyFunc(CState *st, PgBenchFunction func, PgBenchExprLink *args, PgBenchValue *retval) { PgBenchValue a1, a2; bool ba1, ba2; Assert(isLazyFunc(func) && args != NULL && args->next != NULL); /* args points to first condition */ if (!evaluateExpr(st, args->expr, &a1)) return false; /* second condition for AND/OR and corresponding branch for CASE */ args = args->next; switch (func) { case PGBENCH_AND: if (a1.type == PGBT_NULL) { setNullValue(retval); return true; } if (!coerceToBool(&a1, &ba1)) return false; if (!ba1) { setBoolValue(retval, false); return true; } if (!evaluateExpr(st, args->expr, &a2)) return false; if (a2.type == PGBT_NULL) { setNullValue(retval); return true; } else if (!coerceToBool(&a2, &ba2)) return false; else { setBoolValue(retval, ba2); return true; } return true; case PGBENCH_OR: if (a1.type == PGBT_NULL) { setNullValue(retval); return true; } if (!coerceToBool(&a1, &ba1)) return false; if (ba1) { setBoolValue(retval, true); return true; } if (!evaluateExpr(st, args->expr, &a2)) return false; if (a2.type == PGBT_NULL) { setNullValue(retval); return true; } else if (!coerceToBool(&a2, &ba2)) return false; else { setBoolValue(retval, ba2); return true; } case PGBENCH_CASE: /* when true, execute branch */ if (valueTruth(&a1)) return evaluateExpr(st, args->expr, retval); /* now 
args contains next condition or final else expression */ args = args->next; /* final else case? */ if (args->next == NULL) return evaluateExpr(st, args->expr, retval); /* no, another when, proceed */ return evalLazyFunc(st, PGBENCH_CASE, args, retval); default: /* internal error, cannot get here */ Assert(0); break; } return false; } /* maximum number of function arguments */ #define MAX_FARGS 16 /* * Recursive evaluation of standard functions, * which do not require lazy evaluation. */ static bool evalStandardFunc(CState *st, PgBenchFunction func, PgBenchExprLink *args, PgBenchValue *retval) { /* evaluate all function arguments */ int nargs = 0; PgBenchValue vargs[MAX_FARGS]; PgBenchExprLink *l = args; bool has_null = false; for (nargs = 0; nargs < MAX_FARGS && l != NULL; nargs++, l = l->next) { if (!evaluateExpr(st, l->expr, &vargs[nargs])) return false; has_null |= vargs[nargs].type == PGBT_NULL; } if (l != NULL) { pg_log_error("too many function arguments, maximum is %d", MAX_FARGS); return false; } /* NULL arguments */ if (has_null && func != PGBENCH_IS && func != PGBENCH_DEBUG) { setNullValue(retval); return true; } /* then evaluate function */ switch (func) { /* overloaded operators */ case PGBENCH_ADD: case PGBENCH_SUB: case PGBENCH_MUL: case PGBENCH_DIV: case PGBENCH_MOD: case PGBENCH_EQ: case PGBENCH_NE: case PGBENCH_LE: case PGBENCH_LT: { PgBenchValue *lval = &vargs[0], *rval = &vargs[1]; Assert(nargs == 2); /* overloaded type management, double if some double */ if ((lval->type == PGBT_DOUBLE || rval->type == PGBT_DOUBLE) && func != PGBENCH_MOD) { double ld, rd; if (!coerceToDouble(lval, &ld) || !coerceToDouble(rval, &rd)) return false; switch (func) { case PGBENCH_ADD: setDoubleValue(retval, ld + rd); return true; case PGBENCH_SUB: setDoubleValue(retval, ld - rd); return true; case PGBENCH_MUL: setDoubleValue(retval, ld * rd); return true; case PGBENCH_DIV: setDoubleValue(retval, ld / rd); return true; case PGBENCH_EQ: setBoolValue(retval, ld == rd); 
return true; case PGBENCH_NE: setBoolValue(retval, ld != rd); return true; case PGBENCH_LE: setBoolValue(retval, ld <= rd); return true; case PGBENCH_LT: setBoolValue(retval, ld < rd); return true; default: /* cannot get here */ Assert(0); } } else /* we have integer operands, or % */ { int64 li, ri, res; if (!coerceToInt(lval, &li) || !coerceToInt(rval, &ri)) return false; switch (func) { case PGBENCH_ADD: if (pg_add_s64_overflow(li, ri, &res)) { pg_log_error("bigint add out of range"); return false; } setIntValue(retval, res); return true; case PGBENCH_SUB: if (pg_sub_s64_overflow(li, ri, &res)) { pg_log_error("bigint sub out of range"); return false; } setIntValue(retval, res); return true; case PGBENCH_MUL: if (pg_mul_s64_overflow(li, ri, &res)) { pg_log_error("bigint mul out of range"); return false; } setIntValue(retval, res); return true; case PGBENCH_EQ: setBoolValue(retval, li == ri); return true; case PGBENCH_NE: setBoolValue(retval, li != ri); return true; case PGBENCH_LE: setBoolValue(retval, li <= ri); return true; case PGBENCH_LT: setBoolValue(retval, li < ri); return true; case PGBENCH_DIV: case PGBENCH_MOD: if (ri == 0) { pg_log_error("division by zero"); return false; } /* special handling of -1 divisor */ if (ri == -1) { if (func == PGBENCH_DIV) { /* overflow check (needed for INT64_MIN) */ if (li == PG_INT64_MIN) { pg_log_error("bigint div out of range"); return false; } else setIntValue(retval, -li); } else setIntValue(retval, 0); return true; } /* else divisor is not -1 */ if (func == PGBENCH_DIV) setIntValue(retval, li / ri); else /* func == PGBENCH_MOD */ setIntValue(retval, li % ri); return true; default: /* cannot get here */ Assert(0); } } Assert(0); return false; /* NOTREACHED */ } /* integer bitwise operators */ case PGBENCH_BITAND: case PGBENCH_BITOR: case PGBENCH_BITXOR: case PGBENCH_LSHIFT: case PGBENCH_RSHIFT: { int64 li, ri; if (!coerceToInt(&vargs[0], &li) || !coerceToInt(&vargs[1], &ri)) return false; if (func == PGBENCH_BITAND) 
setIntValue(retval, li & ri); else if (func == PGBENCH_BITOR) setIntValue(retval, li | ri); else if (func == PGBENCH_BITXOR) setIntValue(retval, li ^ ri); else if (func == PGBENCH_LSHIFT) setIntValue(retval, li << ri); else if (func == PGBENCH_RSHIFT) setIntValue(retval, li >> ri); else /* cannot get here */ Assert(0); return true; } /* logical operators */ case PGBENCH_NOT: { bool b; if (!coerceToBool(&vargs[0], &b)) return false; setBoolValue(retval, !b); return true; } /* no arguments */ case PGBENCH_PI: setDoubleValue(retval, M_PI); return true; /* 1 overloaded argument */ case PGBENCH_ABS: { PgBenchValue *varg = &vargs[0]; Assert(nargs == 1); if (varg->type == PGBT_INT) { int64 i = varg->u.ival; setIntValue(retval, i < 0 ? -i : i); } else { double d = varg->u.dval; Assert(varg->type == PGBT_DOUBLE); setDoubleValue(retval, d < 0.0 ? -d : d); } return true; } case PGBENCH_DEBUG: { PgBenchValue *varg = &vargs[0]; Assert(nargs == 1); fprintf(stderr, "debug(script=%d,command=%d): ", st->use_file, st->command + 1); if (varg->type == PGBT_NULL) fprintf(stderr, "null\n"); else if (varg->type == PGBT_BOOLEAN) fprintf(stderr, "boolean %s\n", varg->u.bval ? 
"true" : "false"); else if (varg->type == PGBT_INT) fprintf(stderr, "int " INT64_FORMAT "\n", varg->u.ival); else if (varg->type == PGBT_DOUBLE) fprintf(stderr, "double %.*g\n", DBL_DIG, varg->u.dval); else /* internal error, unexpected type */ Assert(0); *retval = *varg; return true; } /* 1 double argument */ case PGBENCH_DOUBLE: case PGBENCH_SQRT: case PGBENCH_LN: case PGBENCH_EXP: { double dval; Assert(nargs == 1); if (!coerceToDouble(&vargs[0], &dval)) return false; if (func == PGBENCH_SQRT) dval = sqrt(dval); else if (func == PGBENCH_LN) dval = log(dval); else if (func == PGBENCH_EXP) dval = exp(dval); /* else is cast: do nothing */ setDoubleValue(retval, dval); return true; } /* 1 int argument */ case PGBENCH_INT: { int64 ival; Assert(nargs == 1); if (!coerceToInt(&vargs[0], &ival)) return false; setIntValue(retval, ival); return true; } /* variable number of arguments */ case PGBENCH_LEAST: case PGBENCH_GREATEST: { bool havedouble; int i; Assert(nargs >= 1); /* need double result if any input is double */ havedouble = false; for (i = 0; i < nargs; i++) { if (vargs[i].type == PGBT_DOUBLE) { havedouble = true; break; } } if (havedouble) { double extremum; if (!coerceToDouble(&vargs[0], &extremum)) return false; for (i = 1; i < nargs; i++) { double dval; if (!coerceToDouble(&vargs[i], &dval)) return false; if (func == PGBENCH_LEAST) extremum = Min(extremum, dval); else extremum = Max(extremum, dval); } setDoubleValue(retval, extremum); } else { int64 extremum; if (!coerceToInt(&vargs[0], &extremum)) return false; for (i = 1; i < nargs; i++) { int64 ival; if (!coerceToInt(&vargs[i], &ival)) return false; if (func == PGBENCH_LEAST) extremum = Min(extremum, ival); else extremum = Max(extremum, ival); } setIntValue(retval, extremum); } return true; } /* random functions */ case PGBENCH_RANDOM: case PGBENCH_RANDOM_EXPONENTIAL: case PGBENCH_RANDOM_GAUSSIAN: case PGBENCH_RANDOM_ZIPFIAN: { int64 imin, imax, delta; Assert(nargs >= 2); if (!coerceToInt(&vargs[0], &imin) 
|| !coerceToInt(&vargs[1], &imax)) return false; /* check random range */ if (unlikely(imin > imax)) { pg_log_error("empty range given to random"); return false; } else if (unlikely(pg_sub_s64_overflow(imax, imin, &delta) || pg_add_s64_overflow(delta, 1, &delta))) { /* prevent int overflows in random functions */ pg_log_error("random range is too large"); return false; } if (func == PGBENCH_RANDOM) { Assert(nargs == 2); setIntValue(retval, getrand(&st->cs_func_rs, imin, imax)); } else /* gaussian & exponential */ { double param; Assert(nargs == 3); if (!coerceToDouble(&vargs[2], ¶m)) return false; if (func == PGBENCH_RANDOM_GAUSSIAN) { if (param < MIN_GAUSSIAN_PARAM) { pg_log_error("gaussian parameter must be at least %f (not %f)", MIN_GAUSSIAN_PARAM, param); return false; } setIntValue(retval, getGaussianRand(&st->cs_func_rs, imin, imax, param)); } else if (func == PGBENCH_RANDOM_ZIPFIAN) { if (param < MIN_ZIPFIAN_PARAM || param > MAX_ZIPFIAN_PARAM) { pg_log_error("zipfian parameter must be in range [%.3f, %.0f] (not %f)", MIN_ZIPFIAN_PARAM, MAX_ZIPFIAN_PARAM, param); return false; } setIntValue(retval, getZipfianRand(&st->cs_func_rs, imin, imax, param)); } else /* exponential */ { if (param <= 0.0) { pg_log_error("exponential parameter must be greater than zero (not %f)", param); return false; } setIntValue(retval, getExponentialRand(&st->cs_func_rs, imin, imax, param)); } } return true; } case PGBENCH_POW: { PgBenchValue *lval = &vargs[0]; PgBenchValue *rval = &vargs[1]; double ld, rd; Assert(nargs == 2); if (!coerceToDouble(lval, &ld) || !coerceToDouble(rval, &rd)) return false; setDoubleValue(retval, pow(ld, rd)); return true; } case PGBENCH_IS: { Assert(nargs == 2); /* * note: this simple implementation is more permissive than * SQL */ setBoolValue(retval, vargs[0].type == vargs[1].type && vargs[0].u.bval == vargs[1].u.bval); return true; } /* hashing */ case PGBENCH_HASH_FNV1A: case PGBENCH_HASH_MURMUR2: { int64 val, seed; Assert(nargs == 2); if 
(!coerceToInt(&vargs[0], &val) || !coerceToInt(&vargs[1], &seed)) return false; if (func == PGBENCH_HASH_MURMUR2) setIntValue(retval, getHashMurmur2(val, seed)); else if (func == PGBENCH_HASH_FNV1A) setIntValue(retval, getHashFnv1a(val, seed)); else /* cannot get here */ Assert(0); return true; } case PGBENCH_PERMUTE: { int64 val, size, seed; Assert(nargs == 3); if (!coerceToInt(&vargs[0], &val) || !coerceToInt(&vargs[1], &size) || !coerceToInt(&vargs[2], &seed)) return false; if (size <= 0) { pg_log_error("permute size parameter must be greater than zero"); return false; } setIntValue(retval, permute(val, size, seed)); return true; } default: /* cannot get here */ Assert(0); /* dead code to avoid a compiler warning */ return false; } } /* evaluate some function */ static bool evalFunc(CState *st, PgBenchFunction func, PgBenchExprLink *args, PgBenchValue *retval) { if (isLazyFunc(func)) return evalLazyFunc(st, func, args, retval); else return evalStandardFunc(st, func, args, retval); } /* * Recursive evaluation of an expression in a pgbench script * using the current state of variables. * Returns whether the evaluation was ok, * the value itself is returned through the retval pointer. 
*/ static bool evaluateExpr(CState *st, PgBenchExpr *expr, PgBenchValue *retval) { switch (expr->etype) { case ENODE_CONSTANT: { *retval = expr->u.constant; return true; } case ENODE_VARIABLE: { Variable *var; if ((var = lookupVariable(st, expr->u.variable.varname)) == NULL) { pg_log_error("undefined variable \"%s\"", expr->u.variable.varname); return false; } if (!makeVariableValue(var)) return false; *retval = var->value; return true; } case ENODE_FUNCTION: return evalFunc(st, expr->u.function.function, expr->u.function.args, retval); default: /* internal error which should never occur */ pg_log_fatal("unexpected enode type in evaluation: %d", expr->etype); exit(1); } } /* * Convert command name to meta-command enum identifier */ static MetaCommand getMetaCommand(const char *cmd) { MetaCommand mc; if (cmd == NULL) mc = META_NONE; else if (pg_strcasecmp(cmd, "set") == 0) mc = META_SET; else if (pg_strcasecmp(cmd, "setshell") == 0) mc = META_SETSHELL; else if (pg_strcasecmp(cmd, "shell") == 0) mc = META_SHELL; else if (pg_strcasecmp(cmd, "sleep") == 0) mc = META_SLEEP; else if (pg_strcasecmp(cmd, "if") == 0) mc = META_IF; else if (pg_strcasecmp(cmd, "elif") == 0) mc = META_ELIF; else if (pg_strcasecmp(cmd, "else") == 0) mc = META_ELSE; else if (pg_strcasecmp(cmd, "endif") == 0) mc = META_ENDIF; else if (pg_strcasecmp(cmd, "gset") == 0) mc = META_GSET; else if (pg_strcasecmp(cmd, "aset") == 0) mc = META_ASET; else if (pg_strcasecmp(cmd, "startpipeline") == 0) mc = META_STARTPIPELINE; else if (pg_strcasecmp(cmd, "endpipeline") == 0) mc = META_ENDPIPELINE; else mc = META_NONE; return mc; } /* * Run a shell command. The result is assigned to the variable if not NULL. * Return true if succeeded, or false on error. */ static bool runShellCommand(CState *st, char *variable, char **argv, int argc) { char command[SHELL_COMMAND_SIZE]; int i, len = 0; FILE *fp; char res[64]; char *endptr; int retval; /*---------- * Join arguments with whitespace separators. 
Arguments starting with * exactly one colon are treated as variables: * name - append a string "name" * :var - append a variable named 'var' * ::name - append a string ":name" *---------- */ for (i = 0; i < argc; i++) { char *arg; int arglen; if (argv[i][0] != ':') { arg = argv[i]; /* a string literal */ } else if (argv[i][1] == ':') { arg = argv[i] + 1; /* a string literal starting with colons */ } else if ((arg = getVariable(st, argv[i] + 1)) == NULL) { pg_log_error("%s: undefined variable \"%s\"", argv[0], argv[i]); return false; } arglen = strlen(arg); if (len + arglen + (i > 0 ? 1 : 0) >= SHELL_COMMAND_SIZE - 1) { pg_log_error("%s: shell command is too long", argv[0]); return false; } if (i > 0) command[len++] = ' '; memcpy(command + len, arg, arglen); len += arglen; } command[len] = '\0'; /* Fast path for non-assignment case */ if (variable == NULL) { if (system(command)) { if (!timer_exceeded) pg_log_error("%s: could not launch shell command", argv[0]); return false; } return true; } /* Execute the command with pipe and read the standard output. 
*/ if ((fp = popen(command, "r")) == NULL) { pg_log_error("%s: could not launch shell command", argv[0]); return false; } if (fgets(res, sizeof(res), fp) == NULL) { if (!timer_exceeded) pg_log_error("%s: could not read result of shell command", argv[0]); (void) pclose(fp); return false; } if (pclose(fp) < 0) { pg_log_error("%s: could not close shell command", argv[0]); return false; } /* Check whether the result is an integer and assign it to the variable */ retval = (int) strtol(res, &endptr, 10); while (*endptr != '\0' && isspace((unsigned char) *endptr)) endptr++; if (*res == '\0' || *endptr != '\0') { pg_log_error("%s: shell command must return an integer (not \"%s\")", argv[0], res); return false; } if (!putVariableInt(st, "setshell", variable, retval)) return false; pg_log_debug("%s: shell parameter name: \"%s\", value: \"%s\"", argv[0], argv[1], res); return true; } #define MAX_PREPARE_NAME 32 static void preparedStatementName(char *buffer, int file, int state) { sprintf(buffer, "P%d_%d", file, state); } static void commandFailed(CState *st, const char *cmd, const char *message) { pg_log_error("client %d aborted in command %d (%s) of script %d; %s", st->id, st->command, cmd, st->use_file, message); } /* return a script number with a weighted choice. 
*/ static int chooseScript(TState *thread) { int i = 0; int64 w; if (num_scripts == 1) return 0; w = getrand(&thread->ts_choose_rs, 0, total_weight - 1); do { w -= sql_script[i++].weight; } while (w >= 0); return i - 1; } /* Send a SQL command, using the chosen querymode */ static bool sendCommand(CState *st, Command *command) { int r; if (querymode == QUERY_SIMPLE) { char *sql; sql = pg_strdup(command->argv[0]); sql = assignVariables(st, sql); pg_log_debug("client %d sending %s", st->id, sql); r = PQsendQuery(st->con, sql); free(sql); } else if (querymode == QUERY_EXTENDED) { const char *sql = command->argv[0]; const char *params[MAX_ARGS]; getQueryParams(st, command, params); pg_log_debug("client %d sending %s", st->id, sql); r = PQsendQueryParams(st->con, sql, command->argc - 1, NULL, params, NULL, NULL, 0); } else if (querymode == QUERY_PREPARED) { char name[MAX_PREPARE_NAME]; const char *params[MAX_ARGS]; if (!st->prepared[st->use_file]) { int j; Command **commands = sql_script[st->use_file].commands; for (j = 0; commands[j] != NULL; j++) { PGresult *res; char name[MAX_PREPARE_NAME]; if (commands[j]->type != SQL_COMMAND) continue; preparedStatementName(name, st->use_file, j); if (PQpipelineStatus(st->con) == PQ_PIPELINE_OFF) { res = PQprepare(st->con, name, commands[j]->argv[0], commands[j]->argc - 1, NULL); if (PQresultStatus(res) != PGRES_COMMAND_OK) pg_log_error("%s", PQerrorMessage(st->con)); PQclear(res); } else { /* * In pipeline mode, we use asynchronous functions. If a * server-side error occurs, it will be processed later * among the other results. 
*/ if (!PQsendPrepare(st->con, name, commands[j]->argv[0], commands[j]->argc - 1, NULL)) pg_log_error("%s", PQerrorMessage(st->con)); } } st->prepared[st->use_file] = true; } getQueryParams(st, command, params); preparedStatementName(name, st->use_file, st->command); pg_log_debug("client %d sending %s", st->id, name); r = PQsendQueryPrepared(st->con, name, command->argc - 1, params, NULL, NULL, 0); } else /* unknown sql mode */ r = 0; if (r == 0) { pg_log_debug("client %d could not send %s", st->id, command->argv[0]); return false; } else return true; } /* * Process query response from the backend. * * If varprefix is not NULL, it's the variable name prefix where to store * the results of the *last* command (META_GSET) or *all* commands * (META_ASET). * * Returns true if everything is A-OK, false if any error occurs. */ static bool readCommandResponse(CState *st, MetaCommand meta, char *varprefix) { PGresult *res; PGresult *next_res; int qrynum = 0; /* * varprefix should be set only with \gset or \aset, and \endpipeline and * SQL commands do not need it. 
*/ Assert((meta == META_NONE && varprefix == NULL) || ((meta == META_ENDPIPELINE) && varprefix == NULL) || ((meta == META_GSET || meta == META_ASET) && varprefix != NULL)); res = PQgetResult(st->con); while (res != NULL) { bool is_last; /* peek at the next result to know whether the current is last */ next_res = PQgetResult(st->con); is_last = (next_res == NULL); switch (PQresultStatus(res)) { case PGRES_COMMAND_OK: /* non-SELECT commands */ case PGRES_EMPTY_QUERY: /* may be used for testing no-op overhead */ if (is_last && meta == META_GSET) { pg_log_error("client %d script %d command %d query %d: expected one row, got %d", st->id, st->use_file, st->command, qrynum, 0); goto error; } break; case PGRES_TUPLES_OK: if ((is_last && meta == META_GSET) || meta == META_ASET) { int ntuples = PQntuples(res); if (meta == META_GSET && ntuples != 1) { /* under \gset, report the error */ pg_log_error("client %d script %d command %d query %d: expected one row, got %d", st->id, st->use_file, st->command, qrynum, PQntuples(res)); goto error; } else if (meta == META_ASET && ntuples <= 0) { /* coldly skip empty result under \aset */ break; } /* store results into variables */ for (int fld = 0; fld < PQnfields(res); fld++) { char *varname = PQfname(res, fld); /* allocate varname only if necessary, freed below */ if (*varprefix != '\0') varname = psprintf("%s%s", varprefix, varname); /* store last row result as a string */ if (!putVariable(st, meta == META_ASET ? 
"aset" : "gset", varname, PQgetvalue(res, ntuples - 1, fld))) { /* internal error */ pg_log_error("client %d script %d command %d query %d: error storing into variable %s", st->id, st->use_file, st->command, qrynum, varname); goto error; } if (*varprefix != '\0') pg_free(varname); } } /* otherwise the result is simply thrown away by PQclear below */ break; case PGRES_PIPELINE_SYNC: pg_log_debug("client %d pipeline ending", st->id); if (PQexitPipelineMode(st->con) != 1) pg_log_error("client %d failed to exit pipeline mode: %s", st->id, PQerrorMessage(st->con)); break; default: /* anything else is unexpected */ pg_log_error("client %d script %d aborted in command %d query %d: %s", st->id, st->use_file, st->command, qrynum, PQerrorMessage(st->con)); goto error; } PQclear(res); qrynum++; res = next_res; } if (qrynum == 0) { pg_log_error("client %d command %d: no results", st->id, st->command); return false; } return true; error: PQclear(res); PQclear(next_res); do { res = PQgetResult(st->con); PQclear(res); } while (res); return false; } /* * Parse the argument to a \sleep command, and return the requested amount * of delay, in microseconds. Returns true on success, false on error. */ static bool evaluateSleep(CState *st, int argc, char **argv, int *usecs) { char *var; int usec; if (*argv[1] == ':') { if ((var = getVariable(st, argv[1] + 1)) == NULL) { pg_log_error("%s: undefined variable \"%s\"", argv[0], argv[1] + 1); return false; } usec = atoi(var); /* Raise an error if the value of a variable is not a number */ if (usec == 0 && !isdigit((unsigned char) *var)) { pg_log_error("%s: invalid sleep time \"%s\" for variable \"%s\"", argv[0], var, argv[1] + 1); return false; } } else usec = atoi(argv[1]); if (argc > 2) { if (pg_strcasecmp(argv[2], "ms") == 0) usec *= 1000; else if (pg_strcasecmp(argv[2], "s") == 0) usec *= 1000000; } else usec *= 1000000; *usecs = usec; return true; } /* * Advance the state machine of a connection. 
*/
/*
 * "thread" is handed to chooseScript() and processXactStats(), and carries
 * the throttle trigger and connection-time accumulator updated below; "agg"
 * is only passed through to processXactStats().
 */
static void
advanceConnectionState(TState *thread, CState *st, StatsData *agg)
{
	/*
	 * gettimeofday() isn't free, so we get the current timestamp lazily the
	 * first time it's needed, and reuse the same value throughout this
	 * function after that.  This also ensures that e.g. the calculated
	 * latency reported in the log file and in the totals are the same. Zero
	 * means "not set yet".  Reset "now" when we execute shell commands or
	 * expressions, which might take a non-negligible amount of time, though.
	 */
	pg_time_usec_t now = 0;

	/*
	 * Loop in the state machine, until we have to wait for a result from the
	 * server or have to sleep for throttling or \sleep.
	 *
	 * Note: In the switch-statement below, 'break' will loop back here,
	 * meaning "continue in the state machine".  Return is used to return to
	 * the caller, giving the thread the opportunity to advance another
	 * client.
	 */
	for (;;)
	{
		Command    *command;

		switch (st->state)
		{
				/* Select transaction (script) to run.  */
			case CSTATE_CHOOSE_SCRIPT:
				st->use_file = chooseScript(thread);
				Assert(conditional_stack_empty(st->cstack));

				pg_log_debug("client %d executing script \"%s\"",
							 st->id, sql_script[st->use_file].desc);

				/*
				 * If time is over, we're done; otherwise, get ready to start
				 * a new transaction, or to get throttled if that's requested.
				 */
				st->state = timer_exceeded ? CSTATE_FINISHED :
					throttle_delay > 0 ? CSTATE_PREPARE_THROTTLE : CSTATE_START_TX;
				break;

				/* Start new transaction (script) */
			case CSTATE_START_TX:
				pg_time_now_lazy(&now);

				/* establish connection if needed, i.e. under --connect */
				if (st->con == NULL)
				{
					pg_time_usec_t start = now;

					if ((st->con = doConnect()) == NULL)
					{
						pg_log_error("client %d aborted while establishing connection", st->id);
						st->state = CSTATE_ABORTED;
						break;
					}

					/* reset now after connection */
					now = pg_time_now();

					thread->conn_duration += now - start;

					/* Reset session-local state */
					memset(st->prepared, 0, sizeof(st->prepared));
				}

				/* record transaction start time */
				st->txn_begin = now;

				/*
				 * When not throttling, this is also the transaction's
				 * scheduled start time.
				 */
				if (!throttle_delay)
					st->txn_scheduled = now;

				/* Begin with the first command */
				st->state = CSTATE_START_COMMAND;
				st->command = 0;
				break;

				/*
				 * Handle throttling once per transaction by sleeping.
				 */
			case CSTATE_PREPARE_THROTTLE:

				/*
				 * Generate a delay such that the series of delays will
				 * approximate a Poisson distribution centered on the
				 * throttle_delay time.
				 *
				 * If transactions are too slow or a given wait is shorter
				 * than a transaction, the next transaction will start right
				 * away.
				 */
				Assert(throttle_delay > 0);

				thread->throttle_trigger +=
					getPoissonRand(&thread->ts_throttle_rs, throttle_delay);
				st->txn_scheduled = thread->throttle_trigger;

				/*
				 * If --latency-limit is used, and this slot is already late
				 * so that the transaction will miss the latency limit even if
				 * it completed immediately, skip this time slot and loop to
				 * reschedule.
				 */
				if (latency_limit)
				{
					pg_time_now_lazy(&now);

					if (thread->throttle_trigger < now - latency_limit)
					{
						/* account for the slot as a skipped transaction */
						processXactStats(thread, st, &now, true, agg);

						/*
						 * Finish client if -T or -t was exceeded.
						 *
						 * Stop counting skipped transactions under -T as soon
						 * as the timer is exceeded. Because otherwise it can
						 * take a very long time to count all of them
						 * especially when quite a lot of them happen with
						 * unrealistically high rate setting in -R, which
						 * would prevent pgbench from ending immediately.
						 * Because of this behavior, note that there is no
						 * guarantee that all skipped transactions are counted
						 * under -T though there is under -t. This is OK in
						 * practice because it's very unlikely to happen with
						 * realistic setting.
						 */
						if (timer_exceeded || (nxacts > 0 && st->cnt >= nxacts))
							st->state = CSTATE_FINISHED;

						/* Go back to top of loop with CSTATE_PREPARE_THROTTLE */
						break;
					}
				}

				/*
				 * stop client if next transaction is beyond pgbench end of
				 * execution; otherwise, throttle it.
				 */
				st->state = end_time > 0 && st->txn_scheduled > end_time ?
					CSTATE_FINISHED : CSTATE_THROTTLE;
				break;

				/*
				 * Wait until it's time to start next transaction.
				 */
			case CSTATE_THROTTLE:
				pg_time_now_lazy(&now);

				if (now < st->txn_scheduled)
					return;		/* still sleeping, nothing to do here */

				/* done sleeping, but don't start transaction if we're done */
				st->state = timer_exceeded ? CSTATE_FINISHED : CSTATE_START_TX;
				break;

				/*
				 * Send a command to server (or execute a meta-command)
				 */
			case CSTATE_START_COMMAND:
				command = sql_script[st->use_file].commands[st->command];

				/* Transition to script end processing if done */
				if (command == NULL)
				{
					st->state = CSTATE_END_TX;
					break;
				}

				/* record begin time of next command, and initiate it */
				if (report_per_command)
				{
					pg_time_now_lazy(&now);
					st->stmt_begin = now;
				}

				/* Execute the command */
				if (command->type == SQL_COMMAND)
				{
					/* disallow \aset and \gset in pipeline mode */
					if (PQpipelineStatus(st->con) != PQ_PIPELINE_OFF)
					{
						if (command->meta == META_GSET)
						{
							commandFailed(st, "gset", "\\gset is not allowed in pipeline mode");
							st->state = CSTATE_ABORTED;
							break;
						}
						else if (command->meta == META_ASET)
						{
							commandFailed(st, "aset", "\\aset is not allowed in pipeline mode");
							st->state = CSTATE_ABORTED;
							break;
						}
					}

					if (!sendCommand(st, command))
					{
						commandFailed(st, "SQL", "SQL command send failed");
						st->state = CSTATE_ABORTED;
					}
					else
					{
						/* Wait for results, unless in pipeline mode */
						if (PQpipelineStatus(st->con) == PQ_PIPELINE_OFF)
							st->state = CSTATE_WAIT_RESULT;
						else
							st->state = CSTATE_END_COMMAND;
					}
				}
				else if (command->type == META_COMMAND)
				{
					/*-----
					 * Possible state changes when executing meta commands:
					 * - on errors CSTATE_ABORTED
					 * - on sleep CSTATE_SLEEP
					 * - else CSTATE_END_COMMAND
					 */
					st->state = executeMetaCommand(st, &now);
				}

				/*
				 * We're now waiting for an SQL command to complete, or
				 * finished processing a metacommand, or need to sleep, or
				 * something bad happened.
				 */
				Assert(st->state == CSTATE_WAIT_RESULT ||
					   st->state == CSTATE_END_COMMAND ||
					   st->state == CSTATE_SLEEP ||
					   st->state == CSTATE_ABORTED);
				break;

				/*
				 * non executed conditional branch
				 */
			case CSTATE_SKIP_COMMAND:
				Assert(!conditional_active(st->cstack));
				/* quickly skip commands until something to do... */
				while (true)
				{
					Command    *command;

					command = sql_script[st->use_file].commands[st->command];

					/* cannot reach end of script in that state */
					Assert(command != NULL);

					/*
					 * if this is conditional related, update conditional
					 * state
					 */
					if (command->type == META_COMMAND &&
						(command->meta == META_IF ||
						 command->meta == META_ELIF ||
						 command->meta == META_ELSE ||
						 command->meta == META_ENDIF))
					{
						switch (conditional_stack_peek(st->cstack))
						{
							case IFSTATE_FALSE:
								if (command->meta == META_IF ||
									command->meta == META_ELIF)
								{
									/* we must evaluate the condition */
									st->state = CSTATE_START_COMMAND;
								}
								else if (command->meta == META_ELSE)
								{
									/* we must execute next command */
									conditional_stack_poke(st->cstack,
														   IFSTATE_ELSE_TRUE);
									st->state = CSTATE_START_COMMAND;
									st->command++;
								}
								else if (command->meta == META_ENDIF)
								{
									Assert(!conditional_stack_empty(st->cstack));
									conditional_stack_pop(st->cstack);
									if (conditional_active(st->cstack))
										st->state = CSTATE_START_COMMAND;

									/*
									 * else state remains in
									 * CSTATE_SKIP_COMMAND
									 */
									st->command++;
								}
								break;

							case IFSTATE_IGNORED:
							case IFSTATE_ELSE_FALSE:
								if (command->meta == META_IF)
									conditional_stack_push(st->cstack,
														   IFSTATE_IGNORED);
								else if (command->meta == META_ENDIF)
								{
									Assert(!conditional_stack_empty(st->cstack));
									conditional_stack_pop(st->cstack);
									if (conditional_active(st->cstack))
										st->state = CSTATE_START_COMMAND;
								}
								/* could detect "else" & "elif" after "else" */
								st->command++;
								break;

							case IFSTATE_NONE:
							case IFSTATE_TRUE:
							case IFSTATE_ELSE_TRUE:
							default:

								/*
								 * inconsistent if inactive, unreachable dead
								 * code
								 */
								Assert(false);
						}
					}
					else
					{
						/* skip and consider next */
						st->command++;
					}

					if (st->state != CSTATE_SKIP_COMMAND)
						/* out of quick skip command loop */
						break;
				}
				break;

				/*
				 * Wait for the current SQL command to complete
				 */
			case CSTATE_WAIT_RESULT:
				pg_log_debug("client %d receiving", st->id);

				/*
				 * Only check for new network data if we processed all data
				 * fetched prior. Otherwise we end up doing a syscall for each
				 * individual pipelined query, which has a measurable
				 * performance impact.
				 */
				if (PQisBusy(st->con) && !PQconsumeInput(st->con))
				{
					/* there's something wrong */
					commandFailed(st, "SQL", "perhaps the backend died while processing");
					st->state = CSTATE_ABORTED;
					break;
				}
				if (PQisBusy(st->con))
					return;		/* don't have the whole result yet */

				/* store or discard the query results */
				if (readCommandResponse(st,
										sql_script[st->use_file].commands[st->command]->meta,
										sql_script[st->use_file].commands[st->command]->varprefix))
				{
					/*
					 * outside of pipeline mode: stop reading results.
					 * pipeline mode: continue reading results until an
					 * end-of-pipeline response.
					 */
					if (PQpipelineStatus(st->con) != PQ_PIPELINE_ON)
						st->state = CSTATE_END_COMMAND;
				}
				else
					st->state = CSTATE_ABORTED;
				break;

				/*
				 * Wait until sleep is done. This state is entered after a
				 * \sleep metacommand. The behavior is similar to
				 * CSTATE_THROTTLE, but proceeds to CSTATE_START_COMMAND
				 * instead of CSTATE_START_TX.
				 */
			case CSTATE_SLEEP:
				pg_time_now_lazy(&now);
				if (now < st->sleep_until)
					return;		/* still sleeping, nothing to do here */
				/* Else done sleeping. */
				st->state = CSTATE_END_COMMAND;
				break;

				/*
				 * End of command: record stats and proceed to next command.
				 */
			case CSTATE_END_COMMAND:

				/*
				 * command completed: accumulate per-command execution times
				 * in thread-local data structure, if per-command latencies
				 * are requested.
				 */
				if (report_per_command)
				{
					Command    *command;

					pg_time_now_lazy(&now);

					command = sql_script[st->use_file].commands[st->command];
					/* XXX could use a mutex here, but we choose not to */
					addToSimpleStats(&command->stats,
									 PG_TIME_GET_DOUBLE(now - st->stmt_begin));
				}

				/* Go ahead with next command, to be executed or skipped */
				st->command++;
				st->state = conditional_active(st->cstack) ?
					CSTATE_START_COMMAND : CSTATE_SKIP_COMMAND;
				break;

				/*
				 * End of transaction (end of script, really).
				 */
			case CSTATE_END_TX:

				/* transaction finished: calculate latency and do log */
				processXactStats(thread, st, &now, false, agg);

				/*
				 * missing \endif... cannot happen if CheckConditional was
				 * okay
				 */
				Assert(conditional_stack_empty(st->cstack));

				if (is_connect)
				{
					/*
					 * Time the disconnection: "start" is "now", filled in
					 * lazily if "now" was still unset.
					 */
					pg_time_usec_t start = now;

					pg_time_now_lazy(&start);
					finishCon(st);
					now = pg_time_now();
					thread->conn_duration += now - start;
				}

				if ((st->cnt >= nxacts && duration <= 0) || timer_exceeded)
				{
					/* script completed */
					st->state = CSTATE_FINISHED;
					break;
				}

				/* next transaction (script) */
				st->state = CSTATE_CHOOSE_SCRIPT;

				/*
				 * Ensure that we always return on this point, so as to avoid
				 * an infinite loop if the script only contains meta commands.
				 */
				return;

				/*
				 * Final states.  Close the connection if it's still open.
				 */
			case CSTATE_ABORTED:
			case CSTATE_FINISHED:

				/*
				 * Don't measure the disconnection delays here even if in
				 * CSTATE_FINISHED and -C/--connect option is specified.
				 * Because in this case all the connections that this thread
				 * established are closed at the end of transactions and the
				 * disconnection delays should have already been measured at
				 * that moment.
				 *
				 * In CSTATE_ABORTED state, the measurement is no longer
				 * necessary because we cannot report complete results anyways
				 * in this case.
				 */
				finishCon(st);
				return;
		}
	}
}

/*
 * Subroutine for advanceConnectionState -- initiate or execute the current
 * meta command, and return the next state to set.
 *
 * *now is updated to the current time, unless the command is expected to
 * take no time to execute.
 *
 * Returns CSTATE_ABORTED on failure, CSTATE_SLEEP for \sleep,
 * CSTATE_WAIT_RESULT for \endpipeline, and CSTATE_END_COMMAND otherwise.
 */
static ConnectionStateEnum
executeMetaCommand(CState *st, pg_time_usec_t *now)
{
	Command    *command = sql_script[st->use_file].commands[st->command];
	int			argc;
	char	  **argv;

	Assert(command != NULL && command->type == META_COMMAND);

	argc = command->argc;
	argv = command->argv;

	/* build the debug trace only when debug output is enabled */
	if (unlikely(__pg_log_level <= PG_LOG_DEBUG))
	{
		PQExpBufferData buf;

		initPQExpBuffer(&buf);

		printfPQExpBuffer(&buf, "client %d executing \\%s", st->id, argv[0]);
		for (int i = 1; i < argc; i++)
			appendPQExpBuffer(&buf, " %s", argv[i]);

		pg_log_debug("%s", buf.data);

		termPQExpBuffer(&buf);
	}

	if (command->meta == META_SLEEP)
	{
		int			usec;

		/*
		 * A \sleep doesn't execute anything, we just get the delay from the
		 * argument, and enter the CSTATE_SLEEP state.  (The per-command
		 * latency will be recorded in CSTATE_SLEEP state, not here, after the
		 * delay has elapsed.)
		 */
		if (!evaluateSleep(st, argc, argv, &usec))
		{
			commandFailed(st, "sleep", "execution of meta-command failed");
			return CSTATE_ABORTED;
		}

		pg_time_now_lazy(now);
		st->sleep_until = (*now) + usec;
		return CSTATE_SLEEP;
	}
	else if (command->meta == META_SET)
	{
		PgBenchExpr *expr = command->expr;
		PgBenchValue result;

		if (!evaluateExpr(st, expr, &result))
		{
			commandFailed(st, argv[0], "evaluation of meta-command failed");
			return CSTATE_ABORTED;
		}

		if (!putVariableValue(st, argv[0], argv[1], &result))
		{
			commandFailed(st, "set", "assignment of meta-command failed");
			return CSTATE_ABORTED;
		}
	}
	else if (command->meta == META_IF)
	{
		/* backslash commands with an expression to evaluate */
		PgBenchExpr *expr = command->expr;
		PgBenchValue result;
		bool		cond;

		if (!evaluateExpr(st, expr, &result))
		{
			commandFailed(st, argv[0], "evaluation of meta-command failed");
			return CSTATE_ABORTED;
		}

		cond = valueTruth(&result);
		conditional_stack_push(st->cstack, cond ? IFSTATE_TRUE : IFSTATE_FALSE);
	}
	else if (command->meta == META_ELIF)
	{
		/* backslash commands with an expression to evaluate */
		PgBenchExpr *expr = command->expr;
		PgBenchValue result;
		bool		cond;

		if (conditional_stack_peek(st->cstack) == IFSTATE_TRUE)
		{
			/* elif after executed block, skip eval and wait for endif. */
			conditional_stack_poke(st->cstack, IFSTATE_IGNORED);
			return CSTATE_END_COMMAND;
		}

		if (!evaluateExpr(st, expr, &result))
		{
			commandFailed(st, argv[0], "evaluation of meta-command failed");
			return CSTATE_ABORTED;
		}

		cond = valueTruth(&result);
		Assert(conditional_stack_peek(st->cstack) == IFSTATE_FALSE);
		conditional_stack_poke(st->cstack, cond ? IFSTATE_TRUE : IFSTATE_FALSE);
	}
	else if (command->meta == META_ELSE)
	{
		switch (conditional_stack_peek(st->cstack))
		{
			case IFSTATE_TRUE:
				conditional_stack_poke(st->cstack, IFSTATE_ELSE_FALSE);
				break;
			case IFSTATE_FALSE: /* inconsistent if active */
			case IFSTATE_IGNORED:	/* inconsistent if active */
			case IFSTATE_NONE:	/* else without if */
			case IFSTATE_ELSE_TRUE: /* else after else */
			case IFSTATE_ELSE_FALSE:	/* else after else */
			default:
				/* dead code if conditional check is ok */
				Assert(false);
		}
	}
	else if (command->meta == META_ENDIF)
	{
		Assert(!conditional_stack_empty(st->cstack));
		conditional_stack_pop(st->cstack);
	}
	else if (command->meta == META_SETSHELL)
	{
		/* argv[1] is the target variable, the shell command starts at argv[2] */
		if (!runShellCommand(st, argv[1], argv + 2, argc - 2))
		{
			commandFailed(st, "setshell", "execution of meta-command failed");
			return CSTATE_ABORTED;
		}
	}
	else if (command->meta == META_SHELL)
	{
		/* no target variable; the shell command starts at argv[1] */
		if (!runShellCommand(st, NULL, argv + 1, argc - 1))
		{
			commandFailed(st, "shell", "execution of meta-command failed");
			return CSTATE_ABORTED;
		}
	}
	else if (command->meta == META_STARTPIPELINE)
	{
		/*
		 * In pipeline mode, we use a workflow based on libpq pipeline
		 * functions.
		 */
		if (querymode == QUERY_SIMPLE)
		{
			commandFailed(st, "startpipeline", "cannot use pipeline mode with the simple query protocol");
			return CSTATE_ABORTED;
		}

		if (PQpipelineStatus(st->con) != PQ_PIPELINE_OFF)
		{
			commandFailed(st, "startpipeline", "already in pipeline mode");
			return CSTATE_ABORTED;
		}
		if (PQenterPipelineMode(st->con) == 0)
		{
			commandFailed(st, "startpipeline", "failed to enter pipeline mode");
			return CSTATE_ABORTED;
		}
	}
	else if (command->meta == META_ENDPIPELINE)
	{
		if (PQpipelineStatus(st->con) != PQ_PIPELINE_ON)
		{
			commandFailed(st, "endpipeline", "not in pipeline mode");
			return CSTATE_ABORTED;
		}
		if (!PQpipelineSync(st->con))
		{
			commandFailed(st, "endpipeline", "failed to send a pipeline sync");
			return CSTATE_ABORTED;
		}
		/* Now wait for the PGRES_PIPELINE_SYNC and exit pipeline mode there */
		/* collect pending results before getting out of pipeline mode */
		return CSTATE_WAIT_RESULT;
	}

	/*
	 * executing the expression or shell command might have taken a
	 * non-negligible amount of time, so reset 'now'
	 */
	*now = 0;

	return CSTATE_END_COMMAND;
}

/*
 * Print log entry after completing one transaction.
 *
 * We print Unix-epoch timestamps in the log, so that entries can be
 * correlated against other logs.
 *
 * XXX We could obtain the time from the caller and just shift it here, to
 * avoid the cost of an extra call to pg_time_now().
 *
 * When agg_interval is positive the transaction is folded into "agg" and
 * one line is written per elapsed interval; otherwise one line is written
 * per transaction.
 */
static void
doLog(TState *thread, CState *st,
	  StatsData *agg, bool skipped, double latency, double lag)
{
	FILE	   *logfile = thread->logfile;
	pg_time_usec_t now = pg_time_now() + epoch_shift;

	Assert(use_log);

	/*
	 * Skip the log entry if sampling is enabled and this row doesn't belong
	 * to the random sample.
	 */
	if (sample_rate != 0.0 &&
		pg_erand48(thread->ts_sample_rs.xseed) > sample_rate)
		return;

	/* should we aggregate the results or not? */
	if (agg_interval > 0)
	{
		pg_time_usec_t next;

		/*
		 * Loop until we reach the interval of the current moment, and print
		 * any empty intervals in between (this may happen with very low tps,
		 * e.g. --rate=0.1).
		 */

		while ((next = agg->start_time + agg_interval * INT64CONST(1000000)) <= now)
		{
			/* print aggregated report to logfile */
			fprintf(logfile, INT64_FORMAT " " INT64_FORMAT " %.0f %.0f %.0f %.0f",
					agg->start_time / 1000000,	/* seconds since Unix epoch */
					agg->cnt,
					agg->latency.sum,
					agg->latency.sum2,
					agg->latency.min,
					agg->latency.max);
			if (throttle_delay)
			{
				fprintf(logfile, " %.0f %.0f %.0f %.0f",
						agg->lag.sum,
						agg->lag.sum2,
						agg->lag.min,
						agg->lag.max);
				if (latency_limit)
					fprintf(logfile, " " INT64_FORMAT, agg->skipped);
			}
			fputc('\n', logfile);

			/* reset data and move to next interval */
			initStats(agg, next);
		}

		/* accumulate the current transaction */
		accumStats(agg, skipped, latency, lag);
	}
	else
	{
		/* no, print raw transactions */
		if (skipped)
			fprintf(logfile, "%d " INT64_FORMAT " skipped %d " INT64_FORMAT " "
					INT64_FORMAT,
					st->id, st->cnt, st->use_file, now / 1000000, now % 1000000);
		else
			fprintf(logfile, "%d " INT64_FORMAT " %.0f %d " INT64_FORMAT " "
					INT64_FORMAT,
					st->id, st->cnt, latency, st->use_file,
					now / 1000000, now % 1000000);
		if (throttle_delay)
			fprintf(logfile, " %.0f", lag);
		fputc('\n', logfile);
	}
}

/*
 * Accumulate and report statistics at end of a transaction.
 *
 * (This is also called when a transaction is late and thus skipped.
 * Note that even skipped transactions are counted in the "cnt" fields.)
*/
static void
processXactStats(TState *thread, CState *st, pg_time_usec_t *now,
				 bool skipped, StatsData *agg)
{
	/* full per-thread stats are kept if any of these settings is in use */
	const bool	keep_thread_stats = progress || throttle_delay || latency_limit;

	/* latency and lag are only worth computing if somebody consumes them */
	const bool	need_timing = keep_thread_stats || use_log || per_script_stats;

	double		latency = 0.0;
	double		lag = 0.0;

	/* a skipped transaction never ran, so it keeps zero latency and lag */
	if (!skipped && need_timing)
	{
		pg_time_now_lazy(now);

		/* compute latency & lag */
		latency = *now - st->txn_scheduled;
		lag = st->txn_begin - st->txn_scheduled;
	}

	if (!keep_thread_stats)
	{
		/* no detailed stats, just count */
		thread->stats.cnt++;
	}
	else
	{
		/* keep detailed thread stats */
		accumStats(&thread->stats, skipped, latency, lag);

		/* count transactions over the latency limit, if needed */
		if (latency_limit && latency > latency_limit)
			thread->latency_late++;
	}

	/* client stat is just counting */
	st->cnt++;

	if (use_log)
	{
		doLog(thread, st, agg, skipped, latency, lag);
	}

	/* XXX could use a mutex here, but we choose not to */
	if (per_script_stats)
	{
		accumStats(&sql_script[st->use_file].stats, skipped, latency, lag);
	}
}

/* discard connections: call finishCon() on each of the "length" clients */
static void
disconnect_all(CState *state, int length)
{
	for (int idx = 0; idx < length; idx++)
		finishCon(state + idx);
}

/*
 * Remove old pgbench tables, if any exist
 */
static void
initDropTables(PGconn *con)
{
	/*
	 * We drop all the tables in one command, so that whether there are
	 * foreign key dependencies or not doesn't matter.
	 */
	static const char *const drop_sql =
		"drop table if exists "
		"pgbench_accounts, "
		"pgbench_branches, "
		"pgbench_history, "
		"pgbench_tellers";

	fprintf(stderr, "dropping old tables...\n");

	executeStatement(con, drop_sql);
}

/*
 * Create "pgbench_accounts" partitions if needed.
 *
 * This is the larger table of pgbench default tpc-b like schema
 * with a known size, so we choose to partition it.
*/ static void createPartitions(PGconn *con) { PQExpBufferData query; /* we must have to create some partitions */ Assert(partitions > 0); fprintf(stderr, "creating %d partitions...\n", partitions); initPQExpBuffer(&query); for (int p = 1; p <= partitions; p++) { if (partition_method == PART_RANGE) { int64 part_size = (naccounts * (int64) scale + partitions - 1) / partitions; printfPQExpBuffer(&query, "create%s table pgbench_accounts_%d\n" " partition of pgbench_accounts\n" " for values from (", unlogged_tables ? " unlogged" : "", p); /* * For RANGE, we use open-ended partitions at the beginning and * end to allow any valid value for the primary key. Although the * actual minimum and maximum values can be derived from the * scale, it is more generic and the performance is better. */ if (p == 1) appendPQExpBufferStr(&query, "minvalue"); else appendPQExpBuffer(&query, INT64_FORMAT, (p - 1) * part_size + 1); appendPQExpBufferStr(&query, ") to ("); if (p < partitions) appendPQExpBuffer(&query, INT64_FORMAT, p * part_size + 1); else appendPQExpBufferStr(&query, "maxvalue"); appendPQExpBufferChar(&query, ')'); } else if (partition_method == PART_HASH) printfPQExpBuffer(&query, "create%s table pgbench_accounts_%d\n" " partition of pgbench_accounts\n" " for values with (modulus %d, remainder %d)", unlogged_tables ? " unlogged" : "", p, partitions, p - 1); else /* cannot get there */ Assert(0); /* * Per ddlinfo in initCreateTables, fillfactor is needed on table * pgbench_accounts. */ appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); executeStatement(con, query.data); } termPQExpBuffer(&query); } /* * Create pgbench's standard tables */ static void initCreateTables(PGconn *con) { /* * Note: TPC-B requires at least 100 bytes per row, and the "filler" * fields in these table declarations were intended to comply with that. * The pgbench_accounts table complies with that because the "filler" * column is set to blank-padded empty string. 
But for all other tables * the columns default to NULL and so don't actually take any space. We * could fix that by giving them non-null default values. However, that * would completely break comparability of pgbench results with prior * versions. Since pgbench has never pretended to be fully TPC-B compliant * anyway, we stick with the historical behavior. */ struct ddlinfo { const char *table; /* table name */ const char *smcols; /* column decls if accountIDs are 32 bits */ const char *bigcols; /* column decls if accountIDs are 64 bits */ int declare_fillfactor; }; static const struct ddlinfo DDLs[] = { { "pgbench_history", "tid int,bid int,aid int,delta int,mtime timestamp,filler char(22)", "tid int,bid int,aid bigint,delta int,mtime timestamp,filler char(22)", 0 }, { "pgbench_tellers", "tid int not null,bid int,tbalance int,filler char(84)", "tid int not null,bid int,tbalance int,filler char(84)", 1 }, { "pgbench_accounts", "aid int not null,bid int,abalance int,filler char(84)", "aid bigint not null,bid int,abalance int,filler char(84)", 1 }, { "pgbench_branches", "bid int not null,bbalance int,filler char(88)", "bid int not null,bbalance int,filler char(88)", 1 } }; int i; PQExpBufferData query; fprintf(stderr, "creating tables...\n"); initPQExpBuffer(&query); for (i = 0; i < lengthof(DDLs); i++) { const struct ddlinfo *ddl = &DDLs[i]; /* Construct new create table statement. */ printfPQExpBuffer(&query, "create%s table %s(%s)", unlogged_tables ? " unlogged" : "", ddl->table, (scale >= SCALE_32BIT_THRESHOLD) ? 
ddl->bigcols : ddl->smcols); /* Partition pgbench_accounts table */ if (partition_method != PART_NONE && strcmp(ddl->table, "pgbench_accounts") == 0) appendPQExpBuffer(&query, " partition by %s (aid)", PARTITION_METHOD[partition_method]); else if (ddl->declare_fillfactor) { /* fillfactor is only expected on actual tables */ appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); } if (tablespace != NULL) { char *escape_tablespace; escape_tablespace = PQescapeIdentifier(con, tablespace, strlen(tablespace)); appendPQExpBuffer(&query, " tablespace %s", escape_tablespace); PQfreemem(escape_tablespace); } executeStatement(con, query.data); } termPQExpBuffer(&query); if (partition_method != PART_NONE) createPartitions(con); } /* * Truncate away any old data, in one command in case there are foreign keys */ static void initTruncateTables(PGconn *con) { executeStatement(con, "truncate table " "pgbench_accounts, " "pgbench_branches, " "pgbench_history, " "pgbench_tellers"); } /* * Fill the standard tables with some data generated and sent from the client */ static void initGenerateDataClientSide(PGconn *con) { PQExpBufferData sql; PGresult *res; int i; int64 k; /* used to track elapsed time and estimate of the remaining time */ pg_time_usec_t start; int log_interval = 1; /* Stay on the same line if reporting to a terminal */ char eol = isatty(fileno(stderr)) ? 
'\r' : '\n'; fprintf(stderr, "generating data (client-side)...\n"); /* * we do all of this in one transaction to enable the backend's * data-loading optimizations */ executeStatement(con, "begin"); /* truncate away any old data */ initTruncateTables(con); initPQExpBuffer(&sql); /* * fill branches, tellers, accounts in that order in case foreign keys * already exist */ for (i = 0; i < nbranches * scale; i++) { /* "filler" column defaults to NULL */ printfPQExpBuffer(&sql, "insert into pgbench_branches(bid,bbalance) values(%d,0)", i + 1); executeStatement(con, sql.data); } for (i = 0; i < ntellers * scale; i++) { /* "filler" column defaults to NULL */ printfPQExpBuffer(&sql, "insert into pgbench_tellers(tid,bid,tbalance) values (%d,%d,0)", i + 1, i / ntellers + 1); executeStatement(con, sql.data); } /* * accounts is big enough to be worth using COPY and tracking runtime */ res = PQexec(con, "copy pgbench_accounts from stdin"); if (PQresultStatus(res) != PGRES_COPY_IN) { pg_log_fatal("unexpected copy in result: %s", PQerrorMessage(con)); exit(1); } PQclear(res); start = pg_time_now(); for (k = 0; k < (int64) naccounts * scale; k++) { int64 j = k + 1; /* "filler" column defaults to blank padded empty string */ printfPQExpBuffer(&sql, INT64_FORMAT "\t" INT64_FORMAT "\t%d\t\n", j, k / naccounts + 1, 0); if (PQputline(con, sql.data)) { pg_log_fatal("PQputline failed"); exit(1); } if (CancelRequested) break; /* * If we want to stick with the original logging, print a message each * 100k inserted rows. 
*/ if ((!use_quiet) && (j % 100000 == 0)) { double elapsed_sec = PG_TIME_GET_DOUBLE(pg_time_now() - start); double remaining_sec = ((double) scale * naccounts - j) * elapsed_sec / j; fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT " tuples (%d%%) done (elapsed %.2f s, remaining %.2f s)%c", j, (int64) naccounts * scale, (int) (((int64) j * 100) / (naccounts * (int64) scale)), elapsed_sec, remaining_sec, eol); } /* let's not call the timing for each row, but only each 100 rows */ else if (use_quiet && (j % 100 == 0)) { double elapsed_sec = PG_TIME_GET_DOUBLE(pg_time_now() - start); double remaining_sec = ((double) scale * naccounts - j) * elapsed_sec / j; /* have we reached the next interval (or end)? */ if ((j == scale * naccounts) || (elapsed_sec >= log_interval * LOG_STEP_SECONDS)) { fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT " tuples (%d%%) done (elapsed %.2f s, remaining %.2f s)%c", j, (int64) naccounts * scale, (int) (((int64) j * 100) / (naccounts * (int64) scale)), elapsed_sec, remaining_sec, eol); /* skip to the next interval */ log_interval = (int) ceil(elapsed_sec / LOG_STEP_SECONDS); } } } if (eol != '\n') fputc('\n', stderr); /* Need to move to next line */ if (PQputline(con, "\\.\n")) { pg_log_fatal("very last PQputline failed"); exit(1); } if (PQendcopy(con)) { pg_log_fatal("PQendcopy failed"); exit(1); } termPQExpBuffer(&sql); executeStatement(con, "commit"); } /* * Fill the standard tables with some data generated on the server * * As already the case with the client-side data generation, the filler * column defaults to NULL in pgbench_branches and pgbench_tellers, * and is a blank-padded string in pgbench_accounts. 
*/ static void initGenerateDataServerSide(PGconn *con) { PQExpBufferData sql; fprintf(stderr, "generating data (server-side)...\n"); /* * we do all of this in one transaction to enable the backend's * data-loading optimizations */ executeStatement(con, "begin"); /* truncate away any old data */ initTruncateTables(con); initPQExpBuffer(&sql); printfPQExpBuffer(&sql, "insert into pgbench_branches(bid,bbalance) " "select bid, 0 " "from generate_series(1, %d) as bid", nbranches * scale); executeStatement(con, sql.data); printfPQExpBuffer(&sql, "insert into pgbench_tellers(tid,bid,tbalance) " "select tid, (tid - 1) / %d + 1, 0 " "from generate_series(1, %d) as tid", ntellers, ntellers * scale); executeStatement(con, sql.data); printfPQExpBuffer(&sql, "insert into pgbench_accounts(aid,bid,abalance,filler) " "select aid, (aid - 1) / %d + 1, 0, '' " "from generate_series(1, " INT64_FORMAT ") as aid", naccounts, (int64) naccounts * scale); executeStatement(con, sql.data); termPQExpBuffer(&sql); executeStatement(con, "commit"); } /* * Invoke vacuum on the standard tables */ static void initVacuum(PGconn *con) { fprintf(stderr, "vacuuming...\n"); executeStatement(con, "vacuum analyze pgbench_branches"); executeStatement(con, "vacuum analyze pgbench_tellers"); executeStatement(con, "vacuum analyze pgbench_accounts"); executeStatement(con, "vacuum analyze pgbench_history"); } /* * Create primary keys on the standard tables */ static void initCreatePKeys(PGconn *con) { static const char *const DDLINDEXes[] = { "alter table pgbench_branches add primary key (bid)", "alter table pgbench_tellers add primary key (tid)", "alter table pgbench_accounts add primary key (aid)" }; int i; PQExpBufferData query; fprintf(stderr, "creating primary keys...\n"); initPQExpBuffer(&query); for (i = 0; i < lengthof(DDLINDEXes); i++) { resetPQExpBuffer(&query); appendPQExpBufferStr(&query, DDLINDEXes[i]); if (index_tablespace != NULL) { char *escape_tablespace; escape_tablespace = 
PQescapeIdentifier(con, index_tablespace, strlen(index_tablespace)); appendPQExpBuffer(&query, " using index tablespace %s", escape_tablespace); PQfreemem(escape_tablespace); } executeStatement(con, query.data); } termPQExpBuffer(&query); } /* * Create foreign key constraints between the standard tables */ static void initCreateFKeys(PGconn *con) { static const char *const DDLKEYs[] = { "alter table pgbench_tellers add constraint pgbench_tellers_bid_fkey foreign key (bid) references pgbench_branches", "alter table pgbench_accounts add constraint pgbench_accounts_bid_fkey foreign key (bid) references pgbench_branches", "alter table pgbench_history add constraint pgbench_history_bid_fkey foreign key (bid) references pgbench_branches", "alter table pgbench_history add constraint pgbench_history_tid_fkey foreign key (tid) references pgbench_tellers", "alter table pgbench_history add constraint pgbench_history_aid_fkey foreign key (aid) references pgbench_accounts" }; int i; fprintf(stderr, "creating foreign keys...\n"); for (i = 0; i < lengthof(DDLKEYs); i++) { executeStatement(con, DDLKEYs[i]); } } /* * Validate an initialization-steps string * * (We could just leave it to runInitSteps() to fail if there are wrong * characters, but since initialization can take awhile, it seems friendlier * to check during option parsing.) 
*/ static void checkInitSteps(const char *initialize_steps) { if (initialize_steps[0] == '\0') { pg_log_fatal("no initialization steps specified"); exit(1); } for (const char *step = initialize_steps; *step != '\0'; step++) { if (strchr(ALL_INIT_STEPS " ", *step) == NULL) { pg_log_fatal("unrecognized initialization step \"%c\"", *step); pg_log_info("Allowed step characters are: \"" ALL_INIT_STEPS "\"."); exit(1); } } } /* * Invoke each initialization step in the given string */ static void runInitSteps(const char *initialize_steps) { PQExpBufferData stats; PGconn *con; const char *step; double run_time = 0.0; bool first = true; initPQExpBuffer(&stats); if ((con = doConnect()) == NULL) exit(1); setup_cancel_handler(NULL); SetCancelConn(con); for (step = initialize_steps; *step != '\0'; step++) { char *op = NULL; pg_time_usec_t start = pg_time_now(); switch (*step) { case 'd': op = "drop tables"; initDropTables(con); break; case 't': op = "create tables"; initCreateTables(con); break; case 'g': op = "client-side generate"; initGenerateDataClientSide(con); break; case 'G': op = "server-side generate"; initGenerateDataServerSide(con); break; case 'v': op = "vacuum"; initVacuum(con); break; case 'p': op = "primary keys"; initCreatePKeys(con); break; case 'f': op = "foreign keys"; initCreateFKeys(con); break; case ' ': break; /* ignore */ default: pg_log_fatal("unrecognized initialization step \"%c\"", *step); PQfinish(con); exit(1); } if (op != NULL) { double elapsed_sec = PG_TIME_GET_DOUBLE(pg_time_now() - start); if (!first) appendPQExpBufferStr(&stats, ", "); else first = false; appendPQExpBuffer(&stats, "%s %.2f s", op, elapsed_sec); run_time += elapsed_sec; } } fprintf(stderr, "done in %.2f s (%s).\n", run_time, stats.data); ResetCancelConn(); PQfinish(con); termPQExpBuffer(&stats); } /* * Extract pgbench table information into global variables scale, * partition_method and partitions. 
*/
static void
GetTableInfo(PGconn *con, bool scale_given)
{
	PGresult   *res;

	/*
	 * get the scaling factor that should be same as count(*) from
	 * pgbench_branches if this is not a custom query
	 */
	res = PQexec(con, "select count(*) from pgbench_branches");
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		char	   *sqlState = PQresultErrorField(res, PG_DIAG_SQLSTATE);

		pg_log_fatal("could not count number of branches: %s", PQerrorMessage(con));

		/* a missing table most likely means "pgbench -i" was never run */
		if (sqlState && strcmp(sqlState, ERRCODE_UNDEFINED_TABLE) == 0)
			pg_log_info("Perhaps you need to do initialization (\"pgbench -i\") in database \"%s\"",
						PQdb(con));

		exit(1);
	}

	scale = atoi(PQgetvalue(res, 0, 0));
	if (scale < 0)
	{
		pg_log_fatal("invalid count(*) from pgbench_branches: \"%s\"",
					 PQgetvalue(res, 0, 0));
		exit(1);
	}
	PQclear(res);

	/* warn if we override user-given -s switch */
	if (scale_given)
		pg_log_warning("scale option ignored, using count from pgbench_branches table (%d)",
					   scale);

	/*
	 * Get the partition information for the first "pgbench_accounts" table
	 * found in search_path.
	 *
	 * The result is empty if no "pgbench_accounts" is found.
	 *
	 * Otherwise, it always returns one row even if the table is not
	 * partitioned (in which case the partition strategy is NULL).
	 *
	 * The number of partitions can be 0 even for partitioned tables, if no
	 * partition is attached.
	 *
	 * We assume no partitioning on any failure, so as to avoid failing on an
	 * old version without "pg_partitioned_table".
	 */
	res = PQexec(con,
				 "select o.n, p.partstrat, pg_catalog.count(i.inhparent) "
				 "from pg_catalog.pg_class as c "
				 "join pg_catalog.pg_namespace as n on (n.oid = c.relnamespace) "
				 "cross join lateral (select pg_catalog.array_position(pg_catalog.current_schemas(true), n.nspname)) as o(n) "
				 "left join pg_catalog.pg_partitioned_table as p on (p.partrelid = c.oid) "
				 "left join pg_catalog.pg_inherits as i on (c.oid = i.inhparent) "
				 "where c.relname = 'pgbench_accounts' and o.n is not null "
				 "group by 1, 2 "
				 "order by 1 asc "
				 "limit 1");

	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		/* probably an older version, coldly assume no partitioning */
		partition_method = PART_NONE;
		partitions = 0;
	}
	else if (PQntuples(res) == 0)
	{
		/*
		 * This case is unlikely as pgbench already found "pgbench_branches"
		 * above to compute the scale.
		 */
		pg_log_fatal("no pgbench_accounts table found in search_path");
		pg_log_info("Perhaps you need to do initialization (\"pgbench -i\") in database \"%s\".", PQdb(con));
		exit(1);
	}
	else						/* PQntuples(res) == 1 */
	{
		/* normal case, extract partition information */
		if (PQgetisnull(res, 0, 1))
		{
			/* NULL strategy: the table is not partitioned */
			partition_method = PART_NONE;
		}
		else
		{
			char	   *strategy = PQgetvalue(res, 0, 1);

			/* column must be there */
			Assert(strategy != NULL);

			if (strcmp(strategy, "r") == 0)
				partition_method = PART_RANGE;
			else if (strcmp(strategy, "h") == 0)
				partition_method = PART_HASH;
			else
			{
				/* possibly a newer version with new partition method */
				pg_log_fatal("unexpected partition method: \"%s\"", strategy);
				exit(1);
			}
		}

		partitions = atoi(PQgetvalue(res, 0, 2));
	}

	PQclear(res);
}

/*
 * Replace :param with $n throughout the command's SQL text, which
 * is a modifiable string in cmd->lines.
*/ static bool parseQuery(Command *cmd) { char *sql, *p; cmd->argc = 1; p = sql = pg_strdup(cmd->lines.data); while ((p = strchr(p, ':')) != NULL) { char var[13]; char *name; int eaten; name = parseVariable(p, &eaten); if (name == NULL) { while (*p == ':') { p++; } continue; } /* * cmd->argv[0] is the SQL statement itself, so the max number of * arguments is one less than MAX_ARGS */ if (cmd->argc >= MAX_ARGS) { pg_log_error("statement has too many arguments (maximum is %d): %s", MAX_ARGS - 1, cmd->lines.data); pg_free(name); return false; } sprintf(var, "$%d", cmd->argc); p = replaceVariable(&sql, p, eaten, var); cmd->argv[cmd->argc] = name; cmd->argc++; } Assert(cmd->argv[0] == NULL); cmd->argv[0] = sql; return true; } /* * syntax error while parsing a script (in practice, while parsing a * backslash command, because we don't detect syntax errors in SQL) * * source: source of script (filename or builtin-script ID) * lineno: line number within script (count from 1) * line: whole line of backslash command, if available * command: backslash command name, if available * msg: the actual error message * more: optional extra message * column: zero-based column number, or -1 if unknown */ void syntax_error(const char *source, int lineno, const char *line, const char *command, const char *msg, const char *more, int column) { PQExpBufferData buf; initPQExpBuffer(&buf); printfPQExpBuffer(&buf, "%s:%d: %s", source, lineno, msg); if (more != NULL) appendPQExpBuffer(&buf, " (%s)", more); if (column >= 0 && line == NULL) appendPQExpBuffer(&buf, " at column %d", column + 1); if (command != NULL) appendPQExpBuffer(&buf, " in command \"%s\"", command); pg_log_fatal("%s", buf.data); termPQExpBuffer(&buf); if (line != NULL) { fprintf(stderr, "%s\n", line); if (column >= 0) fprintf(stderr, "%*c error found here\n", column + 1, '^'); } exit(1); } /* * Return a pointer to the start of the SQL command, after skipping over * whitespace and "--" comments. 
* If the end of the string is reached, return NULL. */ static char * skip_sql_comments(char *sql_command) { char *p = sql_command; /* Skip any leading whitespace, as well as "--" style comments */ for (;;) { if (isspace((unsigned char) *p)) p++; else if (strncmp(p, "--", 2) == 0) { p = strchr(p, '\n'); if (p == NULL) return NULL; p++; } else break; } /* NULL if there's nothing but whitespace and comments */ if (*p == '\0') return NULL; return p; } /* * Parse a SQL command; return a Command struct, or NULL if it's a comment * * On entry, psqlscan.l has collected the command into "buf", so we don't * really need to do much here except check for comments and set up a Command * struct. */ static Command * create_sql_command(PQExpBuffer buf, const char *source) { Command *my_command; char *p = skip_sql_comments(buf->data); if (p == NULL) return NULL; /* Allocate and initialize Command structure */ my_command = (Command *) pg_malloc(sizeof(Command)); initPQExpBuffer(&my_command->lines); appendPQExpBufferStr(&my_command->lines, p); my_command->first_line = NULL; /* this is set later */ my_command->type = SQL_COMMAND; my_command->meta = META_NONE; my_command->argc = 0; memset(my_command->argv, 0, sizeof(my_command->argv)); my_command->varprefix = NULL; /* allocated later, if needed */ my_command->expr = NULL; initSimpleStats(&my_command->stats); return my_command; } /* Free a Command structure and associated data */ static void free_command(Command *command) { termPQExpBuffer(&command->lines); if (command->first_line) pg_free(command->first_line); for (int i = 0; i < command->argc; i++) pg_free(command->argv[i]); if (command->varprefix) pg_free(command->varprefix); /* * It should also free expr recursively, but this is currently not needed * as only gset commands (which do not have an expression) are freed. */ pg_free(command); } /* * Once an SQL command is fully parsed, possibly by accumulating several * parts, complete other fields of the Command structure. 
*/ static void postprocess_sql_command(Command *my_command) { char buffer[128]; Assert(my_command->type == SQL_COMMAND); /* Save the first line for error display. */ strlcpy(buffer, my_command->lines.data, sizeof(buffer)); buffer[strcspn(buffer, "\n\r")] = '\0'; my_command->first_line = pg_strdup(buffer); /* parse query if necessary */ switch (querymode) { case QUERY_SIMPLE: my_command->argv[0] = my_command->lines.data; my_command->argc++; break; case QUERY_EXTENDED: case QUERY_PREPARED: if (!parseQuery(my_command)) exit(1); break; default: exit(1); } } /* * Parse a backslash command; return a Command struct, or NULL if comment * * At call, we have scanned only the initial backslash. */ static Command * process_backslash_command(PsqlScanState sstate, const char *source) { Command *my_command; PQExpBufferData word_buf; int word_offset; int offsets[MAX_ARGS]; /* offsets of argument words */ int start_offset; int lineno; int j; initPQExpBuffer(&word_buf); /* Remember location of the backslash */ start_offset = expr_scanner_offset(sstate) - 1; lineno = expr_scanner_get_lineno(sstate, start_offset); /* Collect first word of command */ if (!expr_lex_one_word(sstate, &word_buf, &word_offset)) { termPQExpBuffer(&word_buf); return NULL; } /* Allocate and initialize Command structure */ my_command = (Command *) pg_malloc0(sizeof(Command)); my_command->type = META_COMMAND; my_command->argc = 0; initSimpleStats(&my_command->stats); /* Save first word (command name) */ j = 0; offsets[j] = word_offset; my_command->argv[j++] = pg_strdup(word_buf.data); my_command->argc++; /* ... 
and convert it to enum form */ my_command->meta = getMetaCommand(my_command->argv[0]); if (my_command->meta == META_SET || my_command->meta == META_IF || my_command->meta == META_ELIF) { yyscan_t yyscanner; /* For \set, collect var name */ if (my_command->meta == META_SET) { if (!expr_lex_one_word(sstate, &word_buf, &word_offset)) syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "missing argument", NULL, -1); offsets[j] = word_offset; my_command->argv[j++] = pg_strdup(word_buf.data); my_command->argc++; } /* then for all parse the expression */ yyscanner = expr_scanner_init(sstate, source, lineno, start_offset, my_command->argv[0]); if (expr_yyparse(yyscanner) != 0) { /* dead code: exit done from syntax_error called by yyerror */ exit(1); } my_command->expr = expr_parse_result; /* Save line, trimming any trailing newline */ my_command->first_line = expr_scanner_get_substring(sstate, start_offset, expr_scanner_offset(sstate), true); expr_scanner_finish(yyscanner); termPQExpBuffer(&word_buf); return my_command; } /* For all other commands, collect remaining words. 
*/ while (expr_lex_one_word(sstate, &word_buf, &word_offset)) { /* * my_command->argv[0] is the command itself, so the max number of * arguments is one less than MAX_ARGS */ if (j >= MAX_ARGS) syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "too many arguments", NULL, -1); offsets[j] = word_offset; my_command->argv[j++] = pg_strdup(word_buf.data); my_command->argc++; } /* Save line, trimming any trailing newline */ my_command->first_line = expr_scanner_get_substring(sstate, start_offset, expr_scanner_offset(sstate), true); if (my_command->meta == META_SLEEP) { if (my_command->argc < 2) syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "missing argument", NULL, -1); if (my_command->argc > 3) syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "too many arguments", NULL, offsets[3] - start_offset); /* * Split argument into number and unit to allow "sleep 1ms" etc. We * don't have to terminate the number argument with null because it * will be parsed with atoi, which ignores trailing non-digit * characters. */ if (my_command->argv[1][0] != ':') { char *c = my_command->argv[1]; bool have_digit = false; /* Skip sign */ if (*c == '+' || *c == '-') c++; /* Require at least one digit */ if (*c && isdigit((unsigned char) *c)) have_digit = true; /* Eat all digits */ while (*c && isdigit((unsigned char) *c)) c++; if (*c) { if (my_command->argc == 2 && have_digit) { my_command->argv[2] = c; offsets[2] = offsets[1] + (c - my_command->argv[1]); my_command->argc = 3; } else { /* * Raise an error if argument starts with non-digit * character (after sign). 
*/ syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "invalid sleep time, must be an integer", my_command->argv[1], offsets[1] - start_offset); } } } if (my_command->argc == 3) { if (pg_strcasecmp(my_command->argv[2], "us") != 0 && pg_strcasecmp(my_command->argv[2], "ms") != 0 && pg_strcasecmp(my_command->argv[2], "s") != 0) syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "unrecognized time unit, must be us, ms or s", my_command->argv[2], offsets[2] - start_offset); } } else if (my_command->meta == META_SETSHELL) { if (my_command->argc < 3) syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "missing argument", NULL, -1); } else if (my_command->meta == META_SHELL) { if (my_command->argc < 2) syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "missing command", NULL, -1); } else if (my_command->meta == META_ELSE || my_command->meta == META_ENDIF || my_command->meta == META_STARTPIPELINE || my_command->meta == META_ENDPIPELINE) { if (my_command->argc != 1) syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "unexpected argument", NULL, -1); } else if (my_command->meta == META_GSET || my_command->meta == META_ASET) { if (my_command->argc > 2) syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "too many arguments", NULL, -1); } else { /* my_command->meta == META_NONE */ syntax_error(source, lineno, my_command->first_line, my_command->argv[0], "invalid command", NULL, -1); } termPQExpBuffer(&word_buf); return my_command; } static void ConditionError(const char *desc, int cmdn, const char *msg) { pg_log_fatal("condition error in script \"%s\" command %d: %s", desc, cmdn, msg); exit(1); } /* * Partial evaluation of conditionals before recording and running the script. 
*/ static void CheckConditional(ParsedScript ps) { /* statically check conditional structure */ ConditionalStack cs = conditional_stack_create(); int i; for (i = 0; ps.commands[i] != NULL; i++) { Command *cmd = ps.commands[i]; if (cmd->type == META_COMMAND) { switch (cmd->meta) { case META_IF: conditional_stack_push(cs, IFSTATE_FALSE); break; case META_ELIF: if (conditional_stack_empty(cs)) ConditionError(ps.desc, i + 1, "\\elif without matching \\if"); if (conditional_stack_peek(cs) == IFSTATE_ELSE_FALSE) ConditionError(ps.desc, i + 1, "\\elif after \\else"); break; case META_ELSE: if (conditional_stack_empty(cs)) ConditionError(ps.desc, i + 1, "\\else without matching \\if"); if (conditional_stack_peek(cs) == IFSTATE_ELSE_FALSE) ConditionError(ps.desc, i + 1, "\\else after \\else"); conditional_stack_poke(cs, IFSTATE_ELSE_FALSE); break; case META_ENDIF: if (!conditional_stack_pop(cs)) ConditionError(ps.desc, i + 1, "\\endif without matching \\if"); break; default: /* ignore anything else... */ break; } } } if (!conditional_stack_empty(cs)) ConditionError(ps.desc, i + 1, "\\if without matching \\endif"); conditional_stack_destroy(cs); } /* * Parse a script (either the contents of a file, or a built-in script) * and add it to the list of scripts. */ static void ParseScript(const char *script, const char *desc, int weight) { ParsedScript ps; PsqlScanState sstate; PQExpBufferData line_buf; int alloc_num; int index; int lineno; int start_offset; #define COMMANDS_ALLOC_NUM 128 alloc_num = COMMANDS_ALLOC_NUM; /* Initialize all fields of ps */ ps.desc = desc; ps.weight = weight; ps.commands = (Command **) pg_malloc(sizeof(Command *) * alloc_num); initStats(&ps.stats, 0); /* Prepare to parse script */ sstate = psql_scan_create(&pgbench_callbacks); /* * Ideally, we'd scan scripts using the encoding and stdstrings settings * we get from a DB connection. 
However, without major rearrangement of * pgbench's argument parsing, we can't have a DB connection at the time * we parse scripts. Using SQL_ASCII (encoding 0) should work well enough * with any backend-safe encoding, though conceivably we could be fooled * if a script file uses a client-only encoding. We also assume that * stdstrings should be true, which is a bit riskier. */ psql_scan_setup(sstate, script, strlen(script), 0, true); start_offset = expr_scanner_offset(sstate) - 1; initPQExpBuffer(&line_buf); index = 0; for (;;) { PsqlScanResult sr; promptStatus_t prompt; Command *command = NULL; resetPQExpBuffer(&line_buf); lineno = expr_scanner_get_lineno(sstate, start_offset); sr = psql_scan(sstate, &line_buf, &prompt); /* If we collected a new SQL command, process that */ command = create_sql_command(&line_buf, desc); /* store new command */ if (command) ps.commands[index++] = command; /* If we reached a backslash, process that */ if (sr == PSCAN_BACKSLASH) { command = process_backslash_command(sstate, desc); if (command) { /* * If this is gset or aset, merge into the preceding command. * (We don't use a command slot in this case). */ if (command->meta == META_GSET || command->meta == META_ASET) { Command *cmd; if (index == 0) syntax_error(desc, lineno, NULL, NULL, "\\gset must follow an SQL command", NULL, -1); cmd = ps.commands[index - 1]; if (cmd->type != SQL_COMMAND || cmd->varprefix != NULL) syntax_error(desc, lineno, NULL, NULL, "\\gset must follow an SQL command", cmd->first_line, -1); /* get variable prefix */ if (command->argc <= 1 || command->argv[1][0] == '\0') cmd->varprefix = pg_strdup(""); else cmd->varprefix = pg_strdup(command->argv[1]); /* update the sql command meta */ cmd->meta = command->meta; /* cleanup unused command */ free_command(command); continue; } /* Attach any other backslash command as a new command */ ps.commands[index++] = command; } } /* * Since we used a command slot, allocate more if needed. 
Note we * always allocate one more in order to accommodate the NULL * terminator below. */ if (index >= alloc_num) { alloc_num += COMMANDS_ALLOC_NUM; ps.commands = (Command **) pg_realloc(ps.commands, sizeof(Command *) * alloc_num); } /* Done if we reached EOF */ if (sr == PSCAN_INCOMPLETE || sr == PSCAN_EOL) break; } ps.commands[index] = NULL; addScript(ps); termPQExpBuffer(&line_buf); psql_scan_finish(sstate); psql_scan_destroy(sstate); } /* * Read the entire contents of file fd, and return it in a malloc'd buffer. * * The buffer will typically be larger than necessary, but we don't care * in this program, because we'll free it as soon as we've parsed the script. */ static char * read_file_contents(FILE *fd) { char *buf; size_t buflen = BUFSIZ; size_t used = 0; buf = (char *) pg_malloc(buflen); for (;;) { size_t nread; nread = fread(buf + used, 1, BUFSIZ, fd); used += nread; /* If fread() read less than requested, must be EOF or error */ if (nread < BUFSIZ) break; /* Enlarge buf so we can read some more */ buflen += BUFSIZ; buf = (char *) pg_realloc(buf, buflen); } /* There is surely room for a terminator */ buf[used] = '\0'; return buf; } /* * Given a file name, read it and add its script to the list. * "-" means to read stdin. * NB: filename must be storage that won't disappear. */ static void process_file(const char *filename, int weight) { FILE *fd; char *buf; /* Slurp the file contents into "buf" */ if (strcmp(filename, "-") == 0) fd = stdin; else if ((fd = fopen(filename, "r")) == NULL) { pg_log_fatal("could not open file \"%s\": %m", filename); exit(1); } buf = read_file_contents(fd); if (ferror(fd)) { pg_log_fatal("could not read file \"%s\": %m", filename); exit(1); } if (fd != stdin) fclose(fd); ParseScript(buf, filename, weight); free(buf); } /* Parse the given builtin script and add it to the list. 
*/ static void process_builtin(const BuiltinScript *bi, int weight) { ParseScript(bi->script, bi->desc, weight); } /* show available builtin scripts */ static void listAvailableScripts(void) { int i; fprintf(stderr, "Available builtin scripts:\n"); for (i = 0; i < lengthof(builtin_script); i++) fprintf(stderr, " %13s: %s\n", builtin_script[i].name, builtin_script[i].desc); fprintf(stderr, "\n"); } /* return builtin script "name" if unambiguous, fails if not found */ static const BuiltinScript * findBuiltin(const char *name) { int i, found = 0, len = strlen(name); const BuiltinScript *result = NULL; for (i = 0; i < lengthof(builtin_script); i++) { if (strncmp(builtin_script[i].name, name, len) == 0) { result = &builtin_script[i]; found++; } } /* ok, unambiguous result */ if (found == 1) return result; /* error cases */ if (found == 0) pg_log_fatal("no builtin script found for name \"%s\"", name); else /* found > 1 */ pg_log_fatal("ambiguous builtin name: %d builtin scripts found for prefix \"%s\"", found, name); listAvailableScripts(); exit(1); } /* * Determine the weight specification from a script option (-b, -f), if any, * and return it as an integer (1 is returned if there's no weight). The * script name is returned in *script as a malloc'd string. */ static int parseScriptWeight(const char *option, char **script) { char *sep; int weight; if ((sep = strrchr(option, WSEP))) { int namelen = sep - option; long wtmp; char *badp; /* generate the script name */ *script = pg_malloc(namelen + 1); strncpy(*script, option, namelen); (*script)[namelen] = '\0'; /* process digits of the weight spec */ errno = 0; wtmp = strtol(sep + 1, &badp, 10); if (errno != 0 || badp == sep + 1 || *badp != '\0') { pg_log_fatal("invalid weight specification: %s", sep); exit(1); } if (wtmp > INT_MAX || wtmp < 0) { pg_log_fatal("weight specification out of range (0 .. 
%u): %lld", INT_MAX, (long long) wtmp); exit(1); } weight = wtmp; } else { *script = pg_strdup(option); weight = 1; } return weight; } /* append a script to the list of scripts to process */ static void addScript(ParsedScript script) { if (script.commands == NULL || script.commands[0] == NULL) { pg_log_fatal("empty command list for script \"%s\"", script.desc); exit(1); } if (num_scripts >= MAX_SCRIPTS) { pg_log_fatal("at most %d SQL scripts are allowed", MAX_SCRIPTS); exit(1); } CheckConditional(script); sql_script[num_scripts] = script; num_scripts++; } /* * Print progress report. * * On entry, *last and *last_report contain the statistics and time of last * progress report. On exit, they are updated with the new stats. */ static void printProgressReport(TState *threads, int64 test_start, pg_time_usec_t now, StatsData *last, int64 *last_report) { /* generate and show report */ pg_time_usec_t run = now - *last_report; int64 ntx; double tps, total_run, latency, sqlat, lag, stdev; char tbuf[315]; StatsData cur; /* * Add up the statistics of all threads. * * XXX: No locking. There is no guarantee that we get an atomic snapshot * of the transaction count and latencies, so these figures can well be * off by a small amount. The progress report's purpose is to give a * quick overview of how the test is going, so that shouldn't matter too * much. (If a read from a 64-bit integer is not atomic, you might get a * "torn" read and completely bogus latencies though!) 
*/ initStats(&cur, 0); for (int i = 0; i < nthreads; i++) { mergeSimpleStats(&cur.latency, &threads[i].stats.latency); mergeSimpleStats(&cur.lag, &threads[i].stats.lag); cur.cnt += threads[i].stats.cnt; cur.skipped += threads[i].stats.skipped; } /* we count only actually executed transactions */ ntx = (cur.cnt - cur.skipped) - (last->cnt - last->skipped); total_run = (now - test_start) / 1000000.0; tps = 1000000.0 * ntx / run; if (ntx > 0) { latency = 0.001 * (cur.latency.sum - last->latency.sum) / ntx; sqlat = 1.0 * (cur.latency.sum2 - last->latency.sum2) / ntx; stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency); lag = 0.001 * (cur.lag.sum - last->lag.sum) / ntx; } else { latency = sqlat = stdev = lag = 0; } if (progress_timestamp) { snprintf(tbuf, sizeof(tbuf), "%.3f s", PG_TIME_GET_DOUBLE(now + epoch_shift)); } else { /* round seconds are expected, but the thread may be late */ snprintf(tbuf, sizeof(tbuf), "%.1f s", total_run); } fprintf(stderr, "progress: %s, %.1f tps, lat %.3f ms stddev %.3f", tbuf, tps, latency, stdev); if (throttle_delay) { fprintf(stderr, ", lag %.3f ms", lag); if (latency_limit) fprintf(stderr, ", " INT64_FORMAT " skipped", cur.skipped - last->skipped); } fprintf(stderr, "\n"); *last = cur; *last_report = now; } static void printSimpleStats(const char *prefix, SimpleStats *ss) { if (ss->count > 0) { double latency = ss->sum / ss->count; double stddev = sqrt(ss->sum2 / ss->count - latency * latency); printf("%s average = %.3f ms\n", prefix, 0.001 * latency); printf("%s stddev = %.3f ms\n", prefix, 0.001 * stddev); } } /* print version banner */ static void printVersion(PGconn *con) { int server_ver = PQserverVersion(con); int client_ver = PG_VERSION_NUM; if (server_ver != client_ver) { const char *server_version; char sverbuf[32]; /* Try to get full text form, might include "devel" etc */ server_version = PQparameterStatus(con, "server_version"); /* Otherwise fall back on server_ver */ if (!server_version) { 
formatPGVersionNumber(server_ver, true, sverbuf, sizeof(sverbuf)); server_version = sverbuf; } printf(_("%s (%s, server %s)\n"), "pgbench", PG_VERSION, server_version); } /* For version match, only print pgbench version */ else printf("%s (%s)\n", "pgbench", PG_VERSION); fflush(stdout); } /* print out results */ static void printResults(StatsData *total, pg_time_usec_t total_duration, /* benchmarking time */ pg_time_usec_t conn_total_duration, /* is_connect */ pg_time_usec_t conn_elapsed_duration, /* !is_connect */ int64 latency_late) { /* tps is about actually executed transactions during benchmarking */ int64 ntx = total->cnt - total->skipped; double bench_duration = PG_TIME_GET_DOUBLE(total_duration); double tps = ntx / bench_duration; /* Report test parameters. */ printf("transaction type: %s\n", num_scripts == 1 ? sql_script[0].desc : "multiple scripts"); printf("scaling factor: %d\n", scale); /* only print partitioning information if some partitioning was detected */ if (partition_method != PART_NONE) printf("partition method: %s\npartitions: %d\n", PARTITION_METHOD[partition_method], partitions); printf("query mode: %s\n", QUERYMODE[querymode]); printf("number of clients: %d\n", nclients); printf("number of threads: %d\n", nthreads); if (duration <= 0) { printf("number of transactions per client: %d\n", nxacts); printf("number of transactions actually processed: " INT64_FORMAT "/%d\n", ntx, nxacts * nclients); } else { printf("duration: %d s\n", duration); printf("number of transactions actually processed: " INT64_FORMAT "\n", ntx); } /* Remaining stats are nonsensical if we failed to execute any xacts */ if (total->cnt <= 0) return; if (throttle_delay && latency_limit) printf("number of transactions skipped: " INT64_FORMAT " (%.3f %%)\n", total->skipped, 100.0 * total->skipped / total->cnt); if (latency_limit) printf("number of transactions above the %.1f ms latency limit: " INT64_FORMAT "/" INT64_FORMAT " (%.3f %%)\n", latency_limit / 1000.0, latency_late, 
ntx, (ntx > 0) ? 100.0 * latency_late / ntx : 0.0); if (throttle_delay || progress || latency_limit) printSimpleStats("latency", &total->latency); else { /* no measurement, show average latency computed from run time */ printf("latency average = %.3f ms\n", 0.001 * total_duration * nclients / total->cnt); } if (throttle_delay) { /* * Report average transaction lag under rate limit throttling. This * is the delay between scheduled and actual start times for the * transaction. The measured lag may be caused by thread/client load, * the database load, or the Poisson throttling process. */ printf("rate limit schedule lag: avg %.3f (max %.3f) ms\n", 0.001 * total->lag.sum / total->cnt, 0.001 * total->lag.max); } /* * Under -C/--connect, each transaction incurs a significant connection * cost, it would not make much sense to ignore it in tps, and it would * not be tps anyway. * * Otherwise connections are made just once at the beginning of the run * and should not impact performance but for very short run, so they are * (right)fully ignored in tps. 
*/ if (is_connect) { printf("average connection time = %.3f ms\n", 0.001 * conn_total_duration / total->cnt); printf("tps = %f (including reconnection times)\n", tps); } else { printf("initial connection time = %.3f ms\n", 0.001 * conn_elapsed_duration); printf("tps = %f (without initial connection time)\n", tps); } /* Report per-script/command statistics */ if (per_script_stats || report_per_command) { int i; for (i = 0; i < num_scripts; i++) { if (per_script_stats) { StatsData *sstats = &sql_script[i].stats; printf("SQL script %d: %s\n" " - weight: %d (targets %.1f%% of total)\n" " - " INT64_FORMAT " transactions (%.1f%% of total, tps = %f)\n", i + 1, sql_script[i].desc, sql_script[i].weight, 100.0 * sql_script[i].weight / total_weight, sstats->cnt, 100.0 * sstats->cnt / total->cnt, (sstats->cnt - sstats->skipped) / bench_duration); if (throttle_delay && latency_limit && sstats->cnt > 0) printf(" - number of transactions skipped: " INT64_FORMAT " (%.3f%%)\n", sstats->skipped, 100.0 * sstats->skipped / sstats->cnt); printSimpleStats(" - latency", &sstats->latency); } /* Report per-command latencies */ if (report_per_command) { Command **commands; if (per_script_stats) printf(" - statement latencies in milliseconds:\n"); else printf("statement latencies in milliseconds:\n"); for (commands = sql_script[i].commands; *commands != NULL; commands++) { SimpleStats *cstats = &(*commands)->stats; printf(" %11.3f %s\n", (cstats->count > 0) ? 1000.0 * cstats->sum / cstats->count : 0.0, (*commands)->first_line); } } } } } /* * Set up a random seed according to seed parameter (NULL means default), * and initialize base_random_sequence for use in initializing other sequences. 
*/ static bool set_random_seed(const char *seed) { uint64 iseed; if (seed == NULL || strcmp(seed, "time") == 0) { /* rely on current time */ iseed = pg_time_now(); } else if (strcmp(seed, "rand") == 0) { /* use some "strong" random source */ if (!pg_strong_random(&iseed, sizeof(iseed))) { pg_log_error("could not generate random seed"); return false; } } else { /* parse unsigned-int seed value */ unsigned long ulseed; char garbage; /* Don't try to use UINT64_FORMAT here; it might not work for sscanf */ if (sscanf(seed, "%lu%c", &ulseed, &garbage) != 1) { pg_log_error("unrecognized random seed option \"%s\"", seed); pg_log_info("Expecting an unsigned integer, \"time\" or \"rand\""); return false; } iseed = (uint64) ulseed; } if (seed != NULL) pg_log_info("setting random seed to %llu", (unsigned long long) iseed); random_seed = iseed; /* Fill base_random_sequence with low-order bits of seed */ base_random_sequence.xseed[0] = iseed & 0xFFFF; base_random_sequence.xseed[1] = (iseed >> 16) & 0xFFFF; base_random_sequence.xseed[2] = (iseed >> 32) & 0xFFFF; return true; } int main(int argc, char **argv) { static struct option long_options[] = { /* systematic long/short named options */ {"builtin", required_argument, NULL, 'b'}, {"client", required_argument, NULL, 'c'}, {"connect", no_argument, NULL, 'C'}, {"debug", no_argument, NULL, 'd'}, {"define", required_argument, NULL, 'D'}, {"file", required_argument, NULL, 'f'}, {"fillfactor", required_argument, NULL, 'F'}, {"host", required_argument, NULL, 'h'}, {"initialize", no_argument, NULL, 'i'}, {"init-steps", required_argument, NULL, 'I'}, {"jobs", required_argument, NULL, 'j'}, {"log", no_argument, NULL, 'l'}, {"latency-limit", required_argument, NULL, 'L'}, {"no-vacuum", no_argument, NULL, 'n'}, {"port", required_argument, NULL, 'p'}, {"progress", required_argument, NULL, 'P'}, {"protocol", required_argument, NULL, 'M'}, {"quiet", no_argument, NULL, 'q'}, {"report-latencies", no_argument, NULL, 'r'}, {"rate", 
required_argument, NULL, 'R'}, {"scale", required_argument, NULL, 's'}, {"select-only", no_argument, NULL, 'S'}, {"skip-some-updates", no_argument, NULL, 'N'}, {"time", required_argument, NULL, 'T'}, {"transactions", required_argument, NULL, 't'}, {"username", required_argument, NULL, 'U'}, {"vacuum-all", no_argument, NULL, 'v'}, /* long-named only options */ {"unlogged-tables", no_argument, NULL, 1}, {"tablespace", required_argument, NULL, 2}, {"index-tablespace", required_argument, NULL, 3}, {"sampling-rate", required_argument, NULL, 4}, {"aggregate-interval", required_argument, NULL, 5}, {"progress-timestamp", no_argument, NULL, 6}, {"log-prefix", required_argument, NULL, 7}, {"foreign-keys", no_argument, NULL, 8}, {"random-seed", required_argument, NULL, 9}, {"show-script", required_argument, NULL, 10}, {"partitions", required_argument, NULL, 11}, {"partition-method", required_argument, NULL, 12}, {NULL, 0, NULL, 0} }; int c; bool is_init_mode = false; /* initialize mode? */ char *initialize_steps = NULL; bool foreign_keys = false; bool is_no_vacuum = false; bool do_vacuum_accounts = false; /* vacuum accounts table? */ int optindex; bool scale_given = false; bool benchmarking_option_set = false; bool initialization_option_set = false; bool internal_script_used = false; CState *state; /* status of clients */ TState *threads; /* array of thread */ pg_time_usec_t start_time, /* start up time */ bench_start = 0, /* first recorded benchmarking time */ conn_total_duration; /* cumulated connection time in * threads */ int64 latency_late = 0; StatsData stats; int weight; int i; int nclients_dealt; #ifdef HAVE_GETRLIMIT struct rlimit rlim; #endif PGconn *con; char *env; int exit_code = 0; struct timeval tv; /* * Record difference between Unix time and instr_time time. We'll use * this for logging and aggregation. 
*/ gettimeofday(&tv, NULL); epoch_shift = tv.tv_sec * INT64CONST(1000000) + tv.tv_usec - pg_time_now(); pg_logging_init(argv[0]); progname = get_progname(argv[0]); if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { usage(); exit(0); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) { puts("pgbench (PostgreSQL) " PG_VERSION); exit(0); } } state = (CState *) pg_malloc0(sizeof(CState)); /* set random seed early, because it may be used while parsing scripts. */ if (!set_random_seed(getenv("PGBENCH_RANDOM_SEED"))) { pg_log_fatal("error while setting random seed from PGBENCH_RANDOM_SEED environment variable"); exit(1); } while ((c = getopt_long(argc, argv, "iI:h:nvp:dqb:SNc:j:Crs:t:T:U:lf:D:F:M:P:R:L:", long_options, &optindex)) != -1) { char *script; switch (c) { case 'i': is_init_mode = true; break; case 'I': if (initialize_steps) pg_free(initialize_steps); initialize_steps = pg_strdup(optarg); checkInitSteps(initialize_steps); initialization_option_set = true; break; case 'h': pghost = pg_strdup(optarg); break; case 'n': is_no_vacuum = true; break; case 'v': benchmarking_option_set = true; do_vacuum_accounts = true; break; case 'p': pgport = pg_strdup(optarg); break; case 'd': pg_logging_increase_verbosity(); break; case 'c': benchmarking_option_set = true; nclients = atoi(optarg); if (nclients <= 0) { pg_log_fatal("invalid number of clients: \"%s\"", optarg); exit(1); } #ifdef HAVE_GETRLIMIT #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */ if (getrlimit(RLIMIT_NOFILE, &rlim) == -1) #else /* but BSD doesn't ... 
*/ if (getrlimit(RLIMIT_OFILE, &rlim) == -1) #endif /* RLIMIT_NOFILE */ { pg_log_fatal("getrlimit failed: %m"); exit(1); } if (rlim.rlim_cur < nclients + 3) { pg_log_fatal("need at least %d open files, but system limit is %ld", nclients + 3, (long) rlim.rlim_cur); pg_log_info("Reduce number of clients, or use limit/ulimit to increase the system limit."); exit(1); } #endif /* HAVE_GETRLIMIT */ break; case 'j': /* jobs */ benchmarking_option_set = true; nthreads = atoi(optarg); if (nthreads <= 0) { pg_log_fatal("invalid number of threads: \"%s\"", optarg); exit(1); } #ifndef ENABLE_THREAD_SAFETY if (nthreads != 1) { pg_log_fatal("threads are not supported on this platform; use -j1"); exit(1); } #endif /* !ENABLE_THREAD_SAFETY */ break; case 'C': benchmarking_option_set = true; is_connect = true; break; case 'r': benchmarking_option_set = true; report_per_command = true; break; case 's': scale_given = true; scale = atoi(optarg); if (scale <= 0) { pg_log_fatal("invalid scaling factor: \"%s\"", optarg); exit(1); } break; case 't': benchmarking_option_set = true; nxacts = atoi(optarg); if (nxacts <= 0) { pg_log_fatal("invalid number of transactions: \"%s\"", optarg); exit(1); } break; case 'T': benchmarking_option_set = true; duration = atoi(optarg); if (duration <= 0) { pg_log_fatal("invalid duration: \"%s\"", optarg); exit(1); } break; case 'U': username = pg_strdup(optarg); break; case 'l': benchmarking_option_set = true; use_log = true; break; case 'q': initialization_option_set = true; use_quiet = true; break; case 'b': if (strcmp(optarg, "list") == 0) { listAvailableScripts(); exit(0); } weight = parseScriptWeight(optarg, &script); process_builtin(findBuiltin(script), weight); benchmarking_option_set = true; internal_script_used = true; break; case 'S': process_builtin(findBuiltin("select-only"), 1); benchmarking_option_set = true; internal_script_used = true; break; case 'N': process_builtin(findBuiltin("simple-update"), 1); benchmarking_option_set = true; 
internal_script_used = true; break; case 'f': weight = parseScriptWeight(optarg, &script); process_file(script, weight); benchmarking_option_set = true; break; case 'D': { char *p; benchmarking_option_set = true; if ((p = strchr(optarg, '=')) == NULL || p == optarg || *(p + 1) == '\0') { pg_log_fatal("invalid variable definition: \"%s\"", optarg); exit(1); } *p++ = '\0'; if (!putVariable(&state[0], "option", optarg, p)) exit(1); } break; case 'F': initialization_option_set = true; fillfactor = atoi(optarg); if (fillfactor < 10 || fillfactor > 100) { pg_log_fatal("invalid fillfactor: \"%s\"", optarg); exit(1); } break; case 'M': benchmarking_option_set = true; for (querymode = 0; querymode < NUM_QUERYMODE; querymode++) if (strcmp(optarg, QUERYMODE[querymode]) == 0) break; if (querymode >= NUM_QUERYMODE) { pg_log_fatal("invalid query mode (-M): \"%s\"", optarg); exit(1); } break; case 'P': benchmarking_option_set = true; progress = atoi(optarg); if (progress <= 0) { pg_log_fatal("invalid thread progress delay: \"%s\"", optarg); exit(1); } break; case 'R': { /* get a double from the beginning of option value */ double throttle_value = atof(optarg); benchmarking_option_set = true; if (throttle_value <= 0.0) { pg_log_fatal("invalid rate limit: \"%s\"", optarg); exit(1); } /* Invert rate limit into per-transaction delay in usec */ throttle_delay = 1000000.0 / throttle_value; } break; case 'L': { double limit_ms = atof(optarg); if (limit_ms <= 0.0) { pg_log_fatal("invalid latency limit: \"%s\"", optarg); exit(1); } benchmarking_option_set = true; latency_limit = (int64) (limit_ms * 1000); } break; case 1: /* unlogged-tables */ initialization_option_set = true; unlogged_tables = true; break; case 2: /* tablespace */ initialization_option_set = true; tablespace = pg_strdup(optarg); break; case 3: /* index-tablespace */ initialization_option_set = true; index_tablespace = pg_strdup(optarg); break; case 4: /* sampling-rate */ benchmarking_option_set = true; sample_rate = 
atof(optarg); if (sample_rate <= 0.0 || sample_rate > 1.0) { pg_log_fatal("invalid sampling rate: \"%s\"", optarg); exit(1); } break; case 5: /* aggregate-interval */ benchmarking_option_set = true; agg_interval = atoi(optarg); if (agg_interval <= 0) { pg_log_fatal("invalid number of seconds for aggregation: \"%s\"", optarg); exit(1); } break; case 6: /* progress-timestamp */ progress_timestamp = true; benchmarking_option_set = true; break; case 7: /* log-prefix */ benchmarking_option_set = true; logfile_prefix = pg_strdup(optarg); break; case 8: /* foreign-keys */ initialization_option_set = true; foreign_keys = true; break; case 9: /* random-seed */ benchmarking_option_set = true; if (!set_random_seed(optarg)) { pg_log_fatal("error while setting random seed from --random-seed option"); exit(1); } break; case 10: /* list */ { const BuiltinScript *s = findBuiltin(optarg); fprintf(stderr, "-- %s: %s\n%s\n", s->name, s->desc, s->script); exit(0); } break; case 11: /* partitions */ initialization_option_set = true; partitions = atoi(optarg); if (partitions < 0) { pg_log_fatal("invalid number of partitions: \"%s\"", optarg); exit(1); } break; case 12: /* partition-method */ initialization_option_set = true; if (pg_strcasecmp(optarg, "range") == 0) partition_method = PART_RANGE; else if (pg_strcasecmp(optarg, "hash") == 0) partition_method = PART_HASH; else { pg_log_fatal("invalid partition method, expecting \"range\" or \"hash\", got: \"%s\"", optarg); exit(1); } break; default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); exit(1); break; } } /* set default script if none */ if (num_scripts == 0 && !is_init_mode) { process_builtin(findBuiltin("tpcb-like"), 1); benchmarking_option_set = true; internal_script_used = true; } /* complete SQL command initialization and compute total weight */ for (i = 0; i < num_scripts; i++) { Command **commands = sql_script[i].commands; for (int j = 0; commands[j] != NULL; j++) if (commands[j]->type == 
SQL_COMMAND) postprocess_sql_command(commands[j]); /* cannot overflow: weight is 32b, total_weight 64b */ total_weight += sql_script[i].weight; } if (total_weight == 0 && !is_init_mode) { pg_log_fatal("total script weight must not be zero"); exit(1); } /* show per script stats if several scripts are used */ if (num_scripts > 1) per_script_stats = true; /* * Don't need more threads than there are clients. (This is not merely an * optimization; throttle_delay is calculated incorrectly below if some * threads have no clients assigned to them.) */ if (nthreads > nclients) nthreads = nclients; /* * Convert throttle_delay to a per-thread delay time. Note that this * might be a fractional number of usec, but that's OK, since it's just * the center of a Poisson distribution of delays. */ throttle_delay *= nthreads; if (argc > optind) dbName = argv[optind++]; else { if ((env = getenv("PGDATABASE")) != NULL && *env != '\0') dbName = env; else if ((env = getenv("PGUSER")) != NULL && *env != '\0') dbName = env; else dbName = get_user_name_or_exit(progname); } if (optind < argc) { pg_log_fatal("too many command-line arguments (first is \"%s\")", argv[optind]); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); exit(1); } if (is_init_mode) { if (benchmarking_option_set) { pg_log_fatal("some of the specified options cannot be used in initialization (-i) mode"); exit(1); } if (partitions == 0 && partition_method != PART_NONE) { pg_log_fatal("--partition-method requires greater than zero --partitions"); exit(1); } /* set default method */ if (partitions > 0 && partition_method == PART_NONE) partition_method = PART_RANGE; if (initialize_steps == NULL) initialize_steps = pg_strdup(DEFAULT_INIT_STEPS); if (is_no_vacuum) { /* Remove any vacuum step in initialize_steps */ char *p; while ((p = strchr(initialize_steps, 'v')) != NULL) *p = ' '; } if (foreign_keys) { /* Add 'f' to end of initialize_steps, if not already there */ if (strchr(initialize_steps, 'f') == 
NULL) { initialize_steps = (char *) pg_realloc(initialize_steps, strlen(initialize_steps) + 2); strcat(initialize_steps, "f"); } } runInitSteps(initialize_steps); exit(0); } else { if (initialization_option_set) { pg_log_fatal("some of the specified options cannot be used in benchmarking mode"); exit(1); } } if (nxacts > 0 && duration > 0) { pg_log_fatal("specify either a number of transactions (-t) or a duration (-T), not both"); exit(1); } /* Use DEFAULT_NXACTS if neither nxacts nor duration is specified. */ if (nxacts <= 0 && duration <= 0) nxacts = DEFAULT_NXACTS; /* --sampling-rate may be used only with -l */ if (sample_rate > 0.0 && !use_log) { pg_log_fatal("log sampling (--sampling-rate) is allowed only when logging transactions (-l)"); exit(1); } /* --sampling-rate may not be used with --aggregate-interval */ if (sample_rate > 0.0 && agg_interval > 0) { pg_log_fatal("log sampling (--sampling-rate) and aggregation (--aggregate-interval) cannot be used at the same time"); exit(1); } if (agg_interval > 0 && !use_log) { pg_log_fatal("log aggregation is allowed only when actually logging transactions"); exit(1); } if (!use_log && logfile_prefix) { pg_log_fatal("log file prefix (--log-prefix) is allowed only when logging transactions (-l)"); exit(1); } if (duration > 0 && agg_interval > duration) { pg_log_fatal("number of seconds for aggregation (%d) must not be higher than test duration (%d)", agg_interval, duration); exit(1); } if (duration > 0 && agg_interval > 0 && duration % agg_interval != 0) { pg_log_fatal("duration (%d) must be a multiple of aggregation interval (%d)", duration, agg_interval); exit(1); } if (progress_timestamp && progress == 0) { pg_log_fatal("--progress-timestamp is allowed only under --progress"); exit(1); } /* * save main process id in the global variable because process id will be * changed after fork. 
*/ main_pid = (int) getpid(); if (nclients > 1) { state = (CState *) pg_realloc(state, sizeof(CState) * nclients); memset(state + 1, 0, sizeof(CState) * (nclients - 1)); /* copy any -D switch values to all clients */ for (i = 1; i < nclients; i++) { int j; state[i].id = i; for (j = 0; j < state[0].nvariables; j++) { Variable *var = &state[0].variables[j]; if (var->value.type != PGBT_NO_VALUE) { if (!putVariableValue(&state[i], "startup", var->name, &var->value)) exit(1); } else { if (!putVariable(&state[i], "startup", var->name, var->svalue)) exit(1); } } } } /* other CState initializations */ for (i = 0; i < nclients; i++) { state[i].cstack = conditional_stack_create(); initRandomState(&state[i].cs_func_rs); } /* opening connection... */ con = doConnect(); if (con == NULL) exit(1); /* report pgbench and server versions */ printVersion(con); pg_log_debug("pghost: %s pgport: %s nclients: %d %s: %d dbName: %s", PQhost(con), PQport(con), nclients, duration <= 0 ? "nxacts" : "duration", duration <= 0 ? nxacts : duration, PQdb(con)); if (internal_script_used) GetTableInfo(con, scale_given); /* * :scale variables normally get -s or database scale, but don't override * an explicit -D switch */ if (lookupVariable(&state[0], "scale") == NULL) { for (i = 0; i < nclients; i++) { if (!putVariableInt(&state[i], "startup", "scale", scale)) exit(1); } } /* * Define a :client_id variable that is unique per connection. But don't * override an explicit -D switch. 
*/ if (lookupVariable(&state[0], "client_id") == NULL) { for (i = 0; i < nclients; i++) if (!putVariableInt(&state[i], "startup", "client_id", i)) exit(1); } /* set default seed for hash functions */ if (lookupVariable(&state[0], "default_seed") == NULL) { uint64 seed = ((uint64) pg_jrand48(base_random_sequence.xseed) & 0xFFFFFFFF) | (((uint64) pg_jrand48(base_random_sequence.xseed) & 0xFFFFFFFF) << 32); for (i = 0; i < nclients; i++) if (!putVariableInt(&state[i], "startup", "default_seed", (int64) seed)) exit(1); } /* set random seed unless overwritten */ if (lookupVariable(&state[0], "random_seed") == NULL) { for (i = 0; i < nclients; i++) if (!putVariableInt(&state[i], "startup", "random_seed", random_seed)) exit(1); } if (!is_no_vacuum) { fprintf(stderr, "starting vacuum..."); tryExecuteStatement(con, "vacuum pgbench_branches"); tryExecuteStatement(con, "vacuum pgbench_tellers"); tryExecuteStatement(con, "truncate pgbench_history"); fprintf(stderr, "end.\n"); if (do_vacuum_accounts) { fprintf(stderr, "starting vacuum pgbench_accounts..."); tryExecuteStatement(con, "vacuum analyze pgbench_accounts"); fprintf(stderr, "end.\n"); } } PQfinish(con); /* set up thread data structures */ threads = (TState *) pg_malloc(sizeof(TState) * nthreads); nclients_dealt = 0; for (i = 0; i < nthreads; i++) { TState *thread = &threads[i]; thread->tid = i; thread->state = &state[nclients_dealt]; thread->nstate = (nclients - nclients_dealt + nthreads - i - 1) / (nthreads - i); initRandomState(&thread->ts_choose_rs); initRandomState(&thread->ts_throttle_rs); initRandomState(&thread->ts_sample_rs); thread->logfile = NULL; /* filled in later */ thread->latency_late = 0; initStats(&thread->stats, 0); nclients_dealt += thread->nstate; } /* all clients must be assigned to a thread */ Assert(nclients_dealt == nclients); /* get start up time for the whole computation */ start_time = pg_time_now(); /* set alarm if duration is specified. 
*/ if (duration > 0) setalarm(duration); errno = THREAD_BARRIER_INIT(&barrier, nthreads); if (errno != 0) pg_log_fatal("could not initialize barrier: %m"); #ifdef ENABLE_THREAD_SAFETY /* start all threads but thread 0 which is executed directly later */ for (i = 1; i < nthreads; i++) { TState *thread = &threads[i]; thread->create_time = pg_time_now(); errno = THREAD_CREATE(&thread->thread, threadRun, thread); if (errno != 0) { pg_log_fatal("could not create thread: %m"); exit(1); } } #else Assert(nthreads == 1); #endif /* ENABLE_THREAD_SAFETY */ /* compute when to stop */ threads[0].create_time = pg_time_now(); if (duration > 0) end_time = threads[0].create_time + (int64) 1000000 * duration; /* run thread 0 directly */ (void) threadRun(&threads[0]); /* wait for other threads and accumulate results */ initStats(&stats, 0); conn_total_duration = 0; for (i = 0; i < nthreads; i++) { TState *thread = &threads[i]; #ifdef ENABLE_THREAD_SAFETY if (i > 0) THREAD_JOIN(thread->thread); #endif /* ENABLE_THREAD_SAFETY */ for (int j = 0; j < thread->nstate; j++) if (thread->state[j].state != CSTATE_FINISHED) exit_code = 2; /* aggregate thread level stats */ mergeSimpleStats(&stats.latency, &thread->stats.latency); mergeSimpleStats(&stats.lag, &thread->stats.lag); stats.cnt += thread->stats.cnt; stats.skipped += thread->stats.skipped; latency_late += thread->latency_late; conn_total_duration += thread->conn_duration; /* first recorded benchmarking start time */ if (bench_start == 0 || thread->bench_start < bench_start) bench_start = thread->bench_start; } /* * All connections should be already closed in threadRun(), so this * disconnect_all() will be a no-op, but clean up the connecions just to * be sure. We don't need to measure the disconnection delays here. */ disconnect_all(state, nclients); /* * Beware that performance of short benchmarks with many threads and * possibly long transactions can be deceptive because threads do not * start and finish at the exact same time. 
The total duration computed * here encompasses all transactions so that tps shown is somehow slightly * underestimated. */ printResults(&stats, pg_time_now() - bench_start, conn_total_duration, bench_start - start_time, latency_late); THREAD_BARRIER_DESTROY(&barrier); if (exit_code != 0) pg_log_fatal("Run was aborted; the above results are incomplete."); return exit_code; } static THREAD_FUNC_RETURN_TYPE THREAD_FUNC_CC threadRun(void *arg) { TState *thread = (TState *) arg; CState *state = thread->state; pg_time_usec_t start; int nstate = thread->nstate; int remains = nstate; /* number of remaining clients */ socket_set *sockets = alloc_socket_set(nstate); int64 thread_start, last_report, next_report; StatsData last, aggs; /* open log file if requested */ if (use_log) { char logpath[MAXPGPATH]; char *prefix = logfile_prefix ? logfile_prefix : "pgbench_log"; if (thread->tid == 0) snprintf(logpath, sizeof(logpath), "%s.%d", prefix, main_pid); else snprintf(logpath, sizeof(logpath), "%s.%d.%d", prefix, main_pid, thread->tid); thread->logfile = fopen(logpath, "w"); if (thread->logfile == NULL) { pg_log_fatal("could not open logfile \"%s\": %m", logpath); goto done; } } /* explicitly initialize the state machines */ for (int i = 0; i < nstate; i++) state[i].state = CSTATE_CHOOSE_SCRIPT; /* READY */ THREAD_BARRIER_WAIT(&barrier); thread_start = pg_time_now(); thread->started_time = thread_start; thread->conn_duration = 0; last_report = thread_start; next_report = last_report + (int64) 1000000 * progress; /* STEADY */ if (!is_connect) { /* make connections to the database before starting */ for (int i = 0; i < nstate; i++) { if ((state[i].con = doConnect()) == NULL) { /* * On connection failure, we meet the barrier here in place of * GO before proceeding to the "done" path which will cleanup, * so as to avoid locking the process. * * It is unclear whether it is worth doing anything rather * than coldly exiting with an error message. 
*/ THREAD_BARRIER_WAIT(&barrier); goto done; } } } /* GO */ THREAD_BARRIER_WAIT(&barrier); start = pg_time_now(); thread->bench_start = start; thread->throttle_trigger = start; /* * The log format currently has Unix epoch timestamps with whole numbers * of seconds. Round the first aggregate's start time down to the nearest * Unix epoch second (the very first aggregate might really have started a * fraction of a second later, but later aggregates are measured from the * whole number time that is actually logged). */ initStats(&aggs, (start + epoch_shift) / 1000000 * 1000000); last = aggs; /* loop till all clients have terminated */ while (remains > 0) { int nsocks; /* number of sockets to be waited for */ pg_time_usec_t min_usec; pg_time_usec_t now = 0; /* set this only if needed */ /* * identify which client sockets should be checked for input, and * compute the nearest time (if any) at which we need to wake up. */ clear_socket_set(sockets); nsocks = 0; min_usec = PG_INT64_MAX; for (int i = 0; i < nstate; i++) { CState *st = &state[i]; if (st->state == CSTATE_SLEEP || st->state == CSTATE_THROTTLE) { /* a nap from the script, or under throttling */ pg_time_usec_t this_usec; /* get current time if needed */ pg_time_now_lazy(&now); /* min_usec should be the minimum delay across all clients */ this_usec = (st->state == CSTATE_SLEEP ? st->sleep_until : st->txn_scheduled) - now; if (min_usec > this_usec) min_usec = this_usec; } else if (st->state == CSTATE_WAIT_RESULT) { /* * waiting for result from server - nothing to do unless the * socket is readable */ int sock = PQsocket(st->con); if (sock < 0) { pg_log_error("invalid socket: %s", PQerrorMessage(st->con)); goto done; } add_socket_to_set(sockets, sock, nsocks++); } else if (st->state != CSTATE_ABORTED && st->state != CSTATE_FINISHED) { /* * This client thread is ready to do something, so we don't * want to wait. No need to examine additional clients. 
*/ min_usec = 0; break; } } /* also wake up to print the next progress report on time */ if (progress && min_usec > 0 && thread->tid == 0) { pg_time_now_lazy(&now); if (now >= next_report) min_usec = 0; else if ((next_report - now) < min_usec) min_usec = next_report - now; } /* * If no clients are ready to execute actions, sleep until we receive * data on some client socket or the timeout (if any) elapses. */ if (min_usec > 0) { int rc = 0; if (min_usec != PG_INT64_MAX) { if (nsocks > 0) { rc = wait_on_socket_set(sockets, min_usec); } else /* nothing active, simple sleep */ { pg_usleep(min_usec); } } else /* no explicit delay, wait without timeout */ { rc = wait_on_socket_set(sockets, 0); } if (rc < 0) { if (errno == EINTR) { /* On EINTR, go back to top of loop */ continue; } /* must be something wrong */ pg_log_error("%s() failed: %m", SOCKET_WAIT_METHOD); goto done; } } else { /* min_usec <= 0, i.e. something needs to be executed now */ /* If we didn't wait, don't try to read any data */ clear_socket_set(sockets); } /* ok, advance the state machine of each connection */ nsocks = 0; for (int i = 0; i < nstate; i++) { CState *st = &state[i]; if (st->state == CSTATE_WAIT_RESULT) { /* don't call advanceConnectionState unless data is available */ int sock = PQsocket(st->con); if (sock < 0) { pg_log_error("invalid socket: %s", PQerrorMessage(st->con)); goto done; } if (!socket_has_input(sockets, sock, nsocks++)) continue; } else if (st->state == CSTATE_FINISHED || st->state == CSTATE_ABORTED) { /* this client is done, no need to consider it anymore */ continue; } advanceConnectionState(thread, st, &aggs); /* * If advanceConnectionState changed client to finished state, * that's one fewer client that remains. 
*/ if (st->state == CSTATE_FINISHED || st->state == CSTATE_ABORTED) remains--; } /* progress report is made by thread 0 for all threads */ if (progress && thread->tid == 0) { pg_time_usec_t now = pg_time_now(); if (now >= next_report) { /* * Horrible hack: this relies on the thread pointer we are * passed to be equivalent to threads[0], that is the first * entry of the threads array. That is why this MUST be done * by thread 0 and not any other. */ printProgressReport(thread, thread_start, now, &last, &last_report); /* * Ensure that the next report is in the future, in case * pgbench/postgres got stuck somewhere. */ do { next_report += (int64) 1000000 * progress; } while (now >= next_report); } } } done: disconnect_all(state, nstate); if (thread->logfile) { if (agg_interval > 0) { /* log aggregated but not yet reported transactions */ doLog(thread, state, &aggs, false, 0, 0); } fclose(thread->logfile); thread->logfile = NULL; } free_socket_set(sockets); THREAD_FUNC_RETURN; } static void finishCon(CState *st) { if (st->con != NULL) { PQfinish(st->con); st->con = NULL; } } /* * Support for duration option: set timer_exceeded after so many seconds. */ #ifndef WIN32 static void handle_sig_alarm(SIGNAL_ARGS) { timer_exceeded = true; } static void setalarm(int seconds) { pqsignal(SIGALRM, handle_sig_alarm); alarm(seconds); } #else /* WIN32 */ static VOID CALLBACK win32_timer_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired) { timer_exceeded = true; } static void setalarm(int seconds) { HANDLE queue; HANDLE timer; /* This function will be called at most once, so we can cheat a bit. */ queue = CreateTimerQueue(); if (seconds > ((DWORD) -1) / 1000 || !CreateTimerQueueTimer(&timer, queue, win32_timer_callback, NULL, seconds * 1000, 0, WT_EXECUTEINTIMERTHREAD | WT_EXECUTEONLYONCE)) { pg_log_fatal("failed to set timer"); exit(1); } } #endif /* WIN32 */ /* * These functions provide an abstraction layer that hides the syscall * we use to wait for input on a set of sockets. 
* * Currently there are two implementations, based on ppoll(2) and select(2). * ppoll() is preferred where available due to its typically higher ceiling * on the number of usable sockets. We do not use the more-widely-available * poll(2) because it only offers millisecond timeout resolution, which could * be problematic with high --rate settings. * * Function APIs: * * alloc_socket_set: allocate an empty socket set with room for up to * "count" sockets. * * free_socket_set: deallocate a socket set. * * clear_socket_set: reset a socket set to empty. * * add_socket_to_set: add socket with indicated FD to slot "idx" in the * socket set. Slots must be filled in order, starting with 0. * * wait_on_socket_set: wait for input on any socket in set, or for timeout * to expire. timeout is measured in microseconds; 0 means wait forever. * Returns result code of underlying syscall (>=0 if OK, else see errno). * * socket_has_input: after waiting, call this to see if given socket has * input. fd and idx parameters should match some previous call to * add_socket_to_set. * * Note that wait_on_socket_set destructively modifies the state of the * socket set. After checking for input, caller must apply clear_socket_set * and add_socket_to_set again before waiting again. 
*/

#ifdef POLL_USING_PPOLL

/*
 * ppoll() flavor: allocate an empty socket set whose pollfds[] array has
 * room for "count" entries.
 */
static socket_set *
alloc_socket_set(int count)
{
	socket_set *sa;

	/* header plus "count" trailing pollfd slots in a single allocation */
	sa = (socket_set *) pg_malloc0(offsetof(socket_set, pollfds) +
								   sizeof(struct pollfd) * count);
	sa->maxfds = count;
	sa->curfds = 0;
	return sa;
}

/*
 * ppoll() flavor: release a socket set obtained from alloc_socket_set.
 */
static void
free_socket_set(socket_set *sa)
{
	pg_free(sa);
}

/*
 * ppoll() flavor: mark the set as empty.  The pollfds[] entries themselves
 * are left alone; only the in-use count is reset.
 */
static void
clear_socket_set(socket_set *sa)
{
	sa->curfds = 0;
}

/*
 * ppoll() flavor: register "fd" in slot "idx", asking to be told about
 * readable input (POLLIN).  Slots must be filled in order, so "idx" has to
 * equal the current number of entries and be below the allocated size.
 */
static void
add_socket_to_set(socket_set *sa, int fd, int idx)
{
	Assert(idx < sa->maxfds && idx == sa->curfds);
	sa->pollfds[idx].fd = fd;
	sa->pollfds[idx].events = POLLIN;
	sa->pollfds[idx].revents = 0;
	sa->curfds++;
}

/*
 * ppoll() flavor: wait until some socket in the set has input, or until
 * "usecs" microseconds have passed.  usecs <= 0 means no timeout.
 * Returns ppoll()'s result (negative on failure, see errno).
 */
static int
wait_on_socket_set(socket_set *sa, int64 usecs)
{
	if (usecs > 0)
	{
		struct timespec timeout;

		/* split microseconds into whole seconds plus nanoseconds */
		timeout.tv_sec = usecs / 1000000;
		timeout.tv_nsec = (usecs % 1000000) * 1000;
		return ppoll(sa->pollfds, sa->curfds, &timeout, NULL);
	}
	else
	{
		return ppoll(sa->pollfds, sa->curfds, NULL, NULL);
	}
}

/*
 * ppoll() flavor: report whether the socket registered as ("fd", "idx")
 * was flagged with POLLIN by the last wait.
 */
static bool
socket_has_input(socket_set *sa, int fd, int idx)
{
	/*
	 * In some cases, threadRun will apply clear_socket_set and then try to
	 * apply socket_has_input anyway with arguments that it used before that,
	 * or might've used before that except that it exited its setup loop
	 * early.  Hence, if the socket set is empty, silently return false
	 * regardless of the parameters.  If it's not empty, we can Assert that
	 * the parameters match a previous call.
	 */
	if (sa->curfds == 0)
		return false;

	Assert(idx < sa->curfds && sa->pollfds[idx].fd == fd);
	return (sa->pollfds[idx].revents & POLLIN) != 0;
}

#endif							/* POLL_USING_PPOLL */

#ifdef POLL_USING_SELECT

/*
 * select() flavor: allocate a socket set.  "count" is not needed because
 * an fd_set has a fixed size.
 */
static socket_set *
alloc_socket_set(int count)
{
	return (socket_set *) pg_malloc0(sizeof(socket_set));
}

/*
 * select() flavor: release a socket set obtained from alloc_socket_set.
 */
static void
free_socket_set(socket_set *sa)
{
	pg_free(sa);
}

/*
 * select() flavor: empty the fd_set and forget the largest descriptor.
 */
static void
clear_socket_set(socket_set *sa)
{
	FD_ZERO(&sa->fds);
	sa->maxfd = -1;
}

/*
 * select() flavor: add "fd" to the set and keep track of the largest
 * descriptor seen.  "idx" is not used by this implementation.
 *
 * Exits the program if the socket cannot be represented in the fd_set.
 */
static void
add_socket_to_set(socket_set *sa, int fd, int idx)
{
	bool		no_room;

#ifdef WIN32

	/*
	 * On Windows an fd_set is a counted array of socket handles, so
	 * FD_SETSIZE limits how many sockets the set can hold and says nothing
	 * about the handle values themselves.  Comparing the handle against
	 * FD_SETSIZE would wrongly reject valid sockets, while FD_SET silently
	 * drops the socket once the array is full; hence check the count.
	 */
	no_room = (sa->fds.fd_count >= FD_SETSIZE);
#else
	/* elsewhere an fd_set is a bitmap indexed by descriptor value */
	no_room = (fd >= FD_SETSIZE);
#endif

	if (fd < 0 || no_room)
	{
		/*
		 * Doing a hard exit here is a bit grotty, but it doesn't seem worth
		 * complicating the API to make it less grotty.
		 */
		pg_log_fatal("too many client connections for select()");
		exit(1);
	}
	FD_SET(fd, &sa->fds);
	if (fd > sa->maxfd)
		sa->maxfd = fd;
}

/*
 * select() flavor: wait until some socket in the set has input, or until
 * "usecs" microseconds have passed.  usecs <= 0 means no timeout.
 * Returns select()'s result (negative on failure, see errno).  The fd_set
 * is overwritten by select() with the sockets that are ready.
 */
static int
wait_on_socket_set(socket_set *sa, int64 usecs)
{
	if (usecs > 0)
	{
		struct timeval timeout;

		/* split microseconds into whole seconds plus leftover usec */
		timeout.tv_sec = usecs / 1000000;
		timeout.tv_usec = usecs % 1000000;
		return select(sa->maxfd + 1, &sa->fds, NULL, NULL, &timeout);
	}
	else
	{
		return select(sa->maxfd + 1, &sa->fds, NULL, NULL, NULL);
	}
}

/*
 * select() flavor: report whether "fd" is still marked in the fd_set.
 * "idx" is not used by this implementation.
 */
static bool
socket_has_input(socket_set *sa, int fd, int idx)
{
	return (FD_ISSET(fd, &sa->fds) != 0);
}

#endif							/* POLL_USING_SELECT */