Diffstat (limited to 'src/sort.c')
-rw-r--r-- | src/sort.c | 4781 |
1 file changed, 4781 insertions, 0 deletions
diff --git a/src/sort.c b/src/sort.c new file mode 100644 index 0000000..329ed45 --- /dev/null +++ b/src/sort.c @@ -0,0 +1,4781 @@ +/* sort - sort lines of text (with all kinds of options). + Copyright (C) 1988-2020 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + + Written December 1988 by Mike Haertel. + The author may be reached (Email) at the address mike@gnu.ai.mit.edu, + or (US mail) as Mike Haertel c/o Free Software Foundation. + + Ørn E. Hansen added NLS support in 1997. */ + +#include <config.h> + +#include <getopt.h> +#include <pthread.h> +#include <sys/resource.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <signal.h> +#include <assert.h> +#include "system.h" +#include "argmatch.h" +#include "die.h" +#include "error.h" +#include "fadvise.h" +#include "filevercmp.h" +#include "flexmember.h" +#include "hard-locale.h" +#include "hash.h" +#include "heap.h" +#include "ignore-value.h" +#include "md5.h" +#include "mbswidth.h" +#include "nproc.h" +#include "physmem.h" +#include "posixver.h" +#include "quote.h" +#include "randread.h" +#include "readtokens0.h" +#include "stdlib--.h" +#include "strnumcmp.h" +#include "xmemcoll.h" +#include "xnanosleep.h" +#include "xstrtol.h" +#include "xstrtol-error.h" + +#ifndef RLIMIT_DATA +struct rlimit { size_t rlim_cur; }; +# define getrlimit(Resource, Rlp) (-1) +#endif + +/* The official name of this program (e.g., no 'g' prefix). */ +#define PROGRAM_NAME "sort" + +#define AUTHORS \ + proper_name ("Mike Haertel"), \ + proper_name ("Paul Eggert") + +#if HAVE_LANGINFO_CODESET +# include <langinfo.h> +#endif + +/* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is + present. */ +#ifndef SA_NOCLDSTOP +# define SA_NOCLDSTOP 0 +/* No sigprocmask. Always 'return' zero. */ +# define sigprocmask(How, Set, Oset) (0) +# define sigset_t int +# if ! HAVE_SIGINTERRUPT +# define siginterrupt(sig, flag) /* empty */ +# endif +#endif + +#if !defined OPEN_MAX && defined NR_OPEN +# define OPEN_MAX NR_OPEN +#endif +#if !defined OPEN_MAX +# define OPEN_MAX 20 +#endif + +#define UCHAR_LIM (UCHAR_MAX + 1) + +#ifndef DEFAULT_TMPDIR +# define DEFAULT_TMPDIR "/tmp" +#endif + +/* Maximum number of lines to merge every time a NODE is taken from + the merge queue. Node is at LEVEL in the binary merge tree, + and is responsible for merging TOTAL lines. */ +#define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1) + +/* Heuristic value for the number of lines for which it is worth creating + a subthread, during an internal merge sort. I.e., it is a small number + of "average" lines for which sorting via two threads is faster than + sorting via one on an "average" system. On a dual-core 2.0 GHz i686 + system with 3GB of RAM and 2MB of L2 cache, a file containing 128K + lines of gensort -a output is sorted slightly faster with --parallel=2 + than with --parallel=1. 
By contrast, using --parallel=1 is about 10% + faster than using --parallel=2 with a 64K-line input. */ +enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 }; +verify (4 <= SUBTHREAD_LINES_HEURISTIC); + +/* The number of threads after which there are + diminishing performance gains. */ +enum { DEFAULT_MAX_THREADS = 8 }; + +/* Exit statuses. */ +enum + { + /* POSIX says to exit with status 1 if invoked with -c and the + input is not properly sorted. */ + SORT_OUT_OF_ORDER = 1, + + /* POSIX says any other irregular exit must exit with a status + code greater than 1. */ + SORT_FAILURE = 2 + }; + +enum + { + /* The number of times we should try to fork a compression process + (we retry if the fork call fails). We don't _need_ to compress + temp files, this is just to reduce disk access, so this number + can be small. Each retry doubles in duration. */ + MAX_FORK_TRIES_COMPRESS = 4, + + /* The number of times we should try to fork a decompression process. + If we can't fork a decompression process, we can't sort, so this + number should be big. Each retry doubles in duration. */ + MAX_FORK_TRIES_DECOMPRESS = 9 + }; + +enum + { + /* Level of the end-of-merge node, one level above the root. */ + MERGE_END = 0, + + /* Level of the root node in merge tree. */ + MERGE_ROOT = 1 + }; + +/* The representation of the decimal point in the current locale. */ +static int decimal_point; + +/* Thousands separator; if -1, then there isn't one. */ +static int thousands_sep; + +/* Nonzero if the corresponding locales are hard. */ +static bool hard_LC_COLLATE; +#if HAVE_NL_LANGINFO +static bool hard_LC_TIME; +#endif + +#define NONZERO(x) ((x) != 0) + +/* The kind of blanks for '-b' to skip in various options. */ +enum blanktype { bl_start, bl_end, bl_both }; + +/* The character marking end of line. Default to \n. */ +static char eolchar = '\n'; + +/* Lines are held in core as counted strings. */ +struct line +{ + char *text; /* Text of the line. */ + size_t length; /* Length including final newline. */ + char *keybeg; /* Start of first key. */ + char *keylim; /* Limit of first key. */ +}; + +/* Input buffers. */ +struct buffer +{ + char *buf; /* Dynamically allocated buffer, + partitioned into 3 regions: + - input data; + - unused area; + - an array of lines, in reverse order. */ + size_t used; /* Number of bytes used for input data. */ + size_t nlines; /* Number of lines in the line array. */ + size_t alloc; /* Number of bytes allocated. */ + size_t left; /* Number of bytes left from previous reads. */ + size_t line_bytes; /* Number of bytes to reserve for each line. */ + bool eof; /* An EOF has been read. */ +}; + +/* Sort key. */ +struct keyfield +{ + size_t sword; /* Zero-origin 'word' to start at. */ + size_t schar; /* Additional characters to skip. */ + size_t eword; /* Zero-origin last 'word' of key. */ + size_t echar; /* Additional characters in field. */ + bool const *ignore; /* Boolean array of characters to ignore. */ + char const *translate; /* Translation applied to characters. */ + bool skipsblanks; /* Skip leading blanks when finding start. */ + bool skipeblanks; /* Skip leading blanks when finding end. */ + bool numeric; /* Flag for numeric comparison. Handle + strings of digits with optional decimal + point, but no exponential notation. */ + bool random; /* Sort by random hash of key. */ + bool general_numeric; /* Flag for general, numeric comparison. + Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI or IEC prefixes. 
*/ + bool month; /* Flag for comparison by month name. */ + bool reverse; /* Reverse the sense of comparison. */ + bool version; /* sort by version number */ + bool traditional_used; /* Traditional key option format is used. */ + struct keyfield *next; /* Next keyfield to try. */ +}; + +struct month +{ + char const *name; + int val; +}; + +/* Binary merge tree node. */ +struct merge_node +{ + struct line *lo; /* Lines to merge from LO child node. */ + struct line *hi; /* Lines to merge from HI child node. */ + struct line *end_lo; /* End of available lines from LO. */ + struct line *end_hi; /* End of available lines from HI. */ + struct line **dest; /* Pointer to destination of merge. */ + size_t nlo; /* Total Lines remaining from LO. */ + size_t nhi; /* Total lines remaining from HI. */ + struct merge_node *parent; /* Parent node. */ + struct merge_node *lo_child; /* LO child node. */ + struct merge_node *hi_child; /* HI child node. */ + unsigned int level; /* Level in merge tree. */ + bool queued; /* Node is already in heap. */ + pthread_mutex_t lock; /* Lock for node operations. */ +}; + +/* Priority queue of merge nodes. */ +struct merge_node_queue +{ + struct heap *priority_queue; /* Priority queue of merge tree nodes. */ + pthread_mutex_t mutex; /* Lock for queue operations. */ + pthread_cond_t cond; /* Conditional wait for empty queue to populate + when popping. */ +}; + +/* Used to implement --unique (-u). */ +static struct line saved_line; + +/* FIXME: None of these tables work with multibyte character sets. + Also, there are many other bugs when handling multibyte characters. + One way to fix this is to rewrite 'sort' to use wide characters + internally, but doing this with good performance is a bit + tricky. */ + +/* Table of blanks. */ +static bool blanks[UCHAR_LIM]; + +/* Table of non-printing characters. */ +static bool nonprinting[UCHAR_LIM]; + +/* Table of non-dictionary characters (not letters, digits, or blanks). */ +static bool nondictionary[UCHAR_LIM]; + +/* Translation table folding lower case to upper. */ +static char fold_toupper[UCHAR_LIM]; + +#define MONTHS_PER_YEAR 12 + +/* Table mapping month names to integers. + Alphabetic order allows binary search. */ +static struct month monthtab[] = +{ + {"APR", 4}, + {"AUG", 8}, + {"DEC", 12}, + {"FEB", 2}, + {"JAN", 1}, + {"JUL", 7}, + {"JUN", 6}, + {"MAR", 3}, + {"MAY", 5}, + {"NOV", 11}, + {"OCT", 10}, + {"SEP", 9} +}; + +/* During the merge phase, the number of files to merge at once. */ +#define NMERGE_DEFAULT 16 + +/* Minimum size for a merge or check buffer. */ +#define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line)) + +/* Minimum sort size; the code might not work with smaller sizes. */ +#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE) + +/* The number of bytes needed for a merge or check buffer, which can + function relatively efficiently even if it holds only one line. If + a longer line is seen, this value is increased. */ +static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024); + +/* The approximate maximum number of bytes of main memory to use, as + specified by the user. Zero if the user has not specified a size. */ +static size_t sort_size; + +/* The initial allocation factor for non-regular files. + This is used, e.g., when reading from a pipe. + Don't make it too big, since it is multiplied by ~130 to + obtain the size of the actual buffer sort will allocate. + Also, there may be 8 threads all doing this at the same time. 
*/ +#define INPUT_FILE_SIZE_GUESS (128 * 1024) + +/* Array of directory names in which any temporary files are to be created. */ +static char const **temp_dirs; + +/* Number of temporary directory names used. */ +static size_t temp_dir_count; + +/* Number of allocated slots in temp_dirs. */ +static size_t temp_dir_alloc; + +/* Flag to reverse the order of all comparisons. */ +static bool reverse; + +/* Flag for stable sort. This turns off the last ditch bytewise + comparison of lines, and instead leaves lines in the same order + they were read if all keys compare equal. */ +static bool stable; + +/* If TAB has this value, blanks separate fields. */ +enum { TAB_DEFAULT = CHAR_MAX + 1 }; + +/* Tab character separating fields. If TAB_DEFAULT, then fields are + separated by the empty string between a non-blank character and a blank + character. */ +static int tab = TAB_DEFAULT; + +/* Flag to remove consecutive duplicate lines from the output. + Only the last of a sequence of equal lines will be output. */ +static bool unique; + +/* Nonzero if any of the input files are the standard input. */ +static bool have_read_stdin; + +/* List of key field comparisons to be tried. */ +static struct keyfield *keylist; + +/* Program used to (de)compress temp files. Must accept -d. */ +static char const *compress_program; + +/* Annotate the output with extra info to aid the user. */ +static bool debug; + +/* Maximum number of files to merge in one go. If more than this + number are present, temp files will be used. */ +static unsigned int nmerge = NMERGE_DEFAULT; + +/* Output an error to stderr and exit using async-signal-safe routines. + This can be used safely from signal handlers, + and between fork and exec of multithreaded processes. */ + +static void async_safe_die (int, const char *) ATTRIBUTE_NORETURN; +static void +async_safe_die (int errnum, const char *errstr) +{ + ignore_value (write (STDERR_FILENO, errstr, strlen (errstr))); + + /* Even if defined HAVE_STRERROR_R, we can't use it, + as it may return a translated string etc. and even if not + may call malloc which is unsafe. We might improve this + by testing for sys_errlist and using that if available. + For now just report the error number. */ + if (errnum) + { + char errbuf[INT_BUFSIZE_BOUND (errnum)]; + char *p = inttostr (errnum, errbuf); + ignore_value (write (STDERR_FILENO, ": errno ", 8)); + ignore_value (write (STDERR_FILENO, p, strlen (p))); + } + + ignore_value (write (STDERR_FILENO, "\n", 1)); + + _exit (SORT_FAILURE); +} + +/* Report MESSAGE for FILE, then clean up and exit. + If FILE is null, it represents standard output. */ + +static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN; +static void +sort_die (char const *message, char const *file) +{ + die (SORT_FAILURE, errno, "%s: %s", message, + quotef (file ? file : _("standard output"))); +} + +void +usage (int status) +{ + if (status != EXIT_SUCCESS) + emit_try_help (); + else + { + printf (_("\ +Usage: %s [OPTION]... [FILE]...\n\ + or: %s [OPTION]... 
--files0-from=F\n\ +"), + program_name, program_name); + fputs (_("\ +Write sorted concatenation of all FILE(s) to standard output.\n\ +"), stdout); + + emit_stdin_note (); + emit_mandatory_arg_note (); + + fputs (_("\ +Ordering options:\n\ +\n\ +"), stdout); + fputs (_("\ + -b, --ignore-leading-blanks ignore leading blanks\n\ + -d, --dictionary-order consider only blanks and alphanumeric characters\ +\n\ + -f, --ignore-case fold lower case to upper case characters\n\ +"), stdout); + fputs (_("\ + -g, --general-numeric-sort compare according to general numerical value\n\ + -i, --ignore-nonprinting consider only printable characters\n\ + -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\ +"), stdout); + fputs (_("\ + -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); + fputs (_("\ + -n, --numeric-sort compare according to string numerical value\n\ + -R, --random-sort shuffle, but group identical keys. See shuf(1)\n\ + --random-source=FILE get random bytes from FILE\n\ + -r, --reverse reverse the result of comparisons\n\ +"), stdout); + fputs (_("\ + --sort=WORD sort according to WORD:\n\ + general-numeric -g, human-numeric -h, month -M,\ +\n\ + numeric -n, random -R, version -V\n\ + -V, --version-sort natural sort of (version) numbers within text\n\ +\n\ +"), stdout); + fputs (_("\ +Other options:\n\ +\n\ +"), stdout); + fputs (_("\ + --batch-size=NMERGE merge at most NMERGE inputs at once;\n\ + for more use temp files\n\ +"), stdout); + fputs (_("\ + -c, --check, --check=diagnose-first check for sorted input; do not sort\n\ + -C, --check=quiet, --check=silent like -c, but do not report first bad line\ +\n\ + --compress-program=PROG compress temporaries with PROG;\n\ + decompress them with PROG -d\n\ +"), stdout); + fputs (_("\ + --debug annotate the part of the line used to sort,\n\ + and warn about questionable usage to stderr\n\ + --files0-from=F read input from the files specified by\n\ + NUL-terminated names in file F;\n\ + If F is - then read names from standard input\n\ +"), stdout); + fputs (_("\ + -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\ + -m, --merge merge already sorted files; do not sort\n\ +"), stdout); + fputs (_("\ + -o, --output=FILE write result to FILE instead of standard output\n\ + -s, --stable stabilize sort by disabling last-resort comparison\ +\n\ + -S, --buffer-size=SIZE use SIZE for main memory buffer\n\ +"), stdout); + printf (_("\ + -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\ + -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\ + multiple options specify multiple directories\n\ + --parallel=N change the number of sorts run concurrently to N\n\ + -u, --unique with -c, check for strict ordering;\n\ + without -c, output only the first of an equal run\ +\n\ +"), DEFAULT_TMPDIR); + fputs (_("\ + -z, --zero-terminated line delimiter is NUL, not newline\n\ +"), stdout); + fputs (HELP_OPTION_DESCRIPTION, stdout); + fputs (VERSION_OPTION_DESCRIPTION, stdout); + fputs (_("\ +\n\ +KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\ +field number and C a character position in the field; both are origin 1, and\n\ +the stop position defaults to the line's end. If neither -t nor -b is in\n\ +effect, characters in a field are counted from the beginning of the preceding\n\ +whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\ +\n\ +which override global ordering options for that key. 
If no key is given, use\n\ +the entire line as the key. Use --debug to diagnose incorrect key usage.\n\ +\n\ +SIZE may be followed by the following multiplicative suffixes:\n\ +"), stdout); + fputs (_("\ +% 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\ +\n\ +*** WARNING ***\n\ +The locale specified by the environment affects sort order.\n\ +Set LC_ALL=C to get the traditional sort order that uses\n\ +native byte values.\n\ +"), stdout ); + emit_ancillary_info (PROGRAM_NAME); + } + + exit (status); +} + +/* For long options that have no equivalent short option, use a + non-character as a pseudo short option, starting with CHAR_MAX + 1. */ +enum +{ + CHECK_OPTION = CHAR_MAX + 1, + COMPRESS_PROGRAM_OPTION, + DEBUG_PROGRAM_OPTION, + FILES0_FROM_OPTION, + NMERGE_OPTION, + RANDOM_SOURCE_OPTION, + SORT_OPTION, + PARALLEL_OPTION +}; + +static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z"; + +static struct option const long_options[] = +{ + {"ignore-leading-blanks", no_argument, NULL, 'b'}, + {"check", optional_argument, NULL, CHECK_OPTION}, + {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION}, + {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION}, + {"dictionary-order", no_argument, NULL, 'd'}, + {"ignore-case", no_argument, NULL, 'f'}, + {"files0-from", required_argument, NULL, FILES0_FROM_OPTION}, + {"general-numeric-sort", no_argument, NULL, 'g'}, + {"ignore-nonprinting", no_argument, NULL, 'i'}, + {"key", required_argument, NULL, 'k'}, + {"merge", no_argument, NULL, 'm'}, + {"month-sort", no_argument, NULL, 'M'}, + {"numeric-sort", no_argument, NULL, 'n'}, + {"human-numeric-sort", no_argument, NULL, 'h'}, + {"version-sort", no_argument, NULL, 'V'}, + {"random-sort", no_argument, NULL, 'R'}, + {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, + {"sort", required_argument, NULL, SORT_OPTION}, + {"output", required_argument, NULL, 'o'}, + {"reverse", no_argument, NULL, 'r'}, + {"stable", no_argument, NULL, 's'}, + {"batch-size", required_argument, NULL, NMERGE_OPTION}, + {"buffer-size", required_argument, NULL, 'S'}, + {"field-separator", required_argument, NULL, 't'}, + {"temporary-directory", required_argument, NULL, 'T'}, + {"unique", no_argument, NULL, 'u'}, + {"zero-terminated", no_argument, NULL, 'z'}, + {"parallel", required_argument, NULL, PARALLEL_OPTION}, + {GETOPT_HELP_OPTION_DECL}, + {GETOPT_VERSION_OPTION_DECL}, + {NULL, 0, NULL, 0}, +}; + +#define CHECK_TABLE \ + _ct_("quiet", 'C') \ + _ct_("silent", 'C') \ + _ct_("diagnose-first", 'c') + +static char const *const check_args[] = +{ +#define _ct_(_s, _c) _s, + CHECK_TABLE NULL +#undef _ct_ +}; +static char const check_types[] = +{ +#define _ct_(_s, _c) _c, + CHECK_TABLE +#undef _ct_ +}; + +#define SORT_TABLE \ + _st_("general-numeric", 'g') \ + _st_("human-numeric", 'h') \ + _st_("month", 'M') \ + _st_("numeric", 'n') \ + _st_("random", 'R') \ + _st_("version", 'V') + +static char const *const sort_args[] = +{ +#define _st_(_s, _c) _s, + SORT_TABLE NULL +#undef _st_ +}; +static char const sort_types[] = +{ +#define _st_(_s, _c) _c, + SORT_TABLE +#undef _st_ +}; + +/* The set of signals that are caught. */ +static sigset_t caught_signals; + +/* Critical section status. */ +struct cs_status +{ + bool valid; + sigset_t sigs; +}; + +/* Enter a critical section. */ +static void +cs_enter (struct cs_status *status) +{ + int ret = pthread_sigmask (SIG_BLOCK, &caught_signals, &status->sigs); + status->valid = ret == 0; +} + +/* Leave a critical section. 
*/ +static void +cs_leave (struct cs_status const *status) +{ + if (status->valid) + { + /* Ignore failure when restoring the signal mask. */ + pthread_sigmask (SIG_SETMASK, &status->sigs, NULL); + } +} + +/* Possible states for a temp file. If compressed, the file's status + is unreaped or reaped, depending on whether 'sort' has waited for + the subprocess to finish. */ +enum { UNCOMPRESSED, UNREAPED, REAPED }; + +/* The list of temporary files. */ +struct tempnode +{ + struct tempnode *volatile next; + pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */ + char state; + char name[FLEXIBLE_ARRAY_MEMBER]; +}; +static struct tempnode *volatile temphead; +static struct tempnode *volatile *temptail = &temphead; + +/* A file to be sorted. */ +struct sortfile +{ + /* The file's name. */ + char const *name; + + /* Non-null if this is a temporary file, in which case NAME == TEMP->name. */ + struct tempnode *temp; +}; + +/* Map PIDs of unreaped subprocesses to their struct tempnode objects. */ +static Hash_table *proctab; + +enum { INIT_PROCTAB_SIZE = 47 }; + +static size_t +proctab_hasher (void const *entry, size_t tabsize) +{ + struct tempnode const *node = entry; + return node->pid % tabsize; +} + +static bool +proctab_comparator (void const *e1, void const *e2) +{ + struct tempnode const *n1 = e1; + struct tempnode const *n2 = e2; + return n1->pid == n2->pid; +} + +/* The number of unreaped child processes. */ +static pid_t nprocs; + +static bool delete_proc (pid_t); + +/* If PID is positive, wait for the child process with that PID to + exit, and assume that PID has already been removed from the process + table. If PID is 0 or -1, clean up some child that has exited (by + waiting for it, and removing it from the proc table) and return the + child's process ID. However, if PID is 0 and no children have + exited, return 0 without waiting. */ + +static pid_t +reap (pid_t pid) +{ + int status; + pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG)); + + if (cpid < 0) + die (SORT_FAILURE, errno, _("waiting for %s [-d]"), + quoteaf (compress_program)); + else if (0 < cpid && (0 < pid || delete_proc (cpid))) + { + if (! WIFEXITED (status) || WEXITSTATUS (status)) + die (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"), + quoteaf (compress_program)); + --nprocs; + } + + return cpid; +} + +/* TEMP represents a new process; add it to the process table. Create + the process table the first time it's called. */ + +static void +register_proc (struct tempnode *temp) +{ + if (! proctab) + { + proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL, + proctab_hasher, + proctab_comparator, + NULL); + if (! proctab) + xalloc_die (); + } + + temp->state = UNREAPED; + + if (! hash_insert (proctab, temp)) + xalloc_die (); +} + +/* If PID is in the process table, remove it and return true. + Otherwise, return false. */ + +static bool +delete_proc (pid_t pid) +{ + struct tempnode test; + + test.pid = pid; + struct tempnode *node = hash_delete (proctab, &test); + if (! node) + return false; + node->state = REAPED; + return true; +} + +/* Remove PID from the process table, and wait for it to exit if it + hasn't already. */ + +static void +wait_proc (pid_t pid) +{ + if (delete_proc (pid)) + reap (pid); +} + +/* Reap any exited children. Do not block; reap only those that have + already exited. */ + +static void +reap_exited (void) +{ + while (0 < nprocs && reap (0)) + continue; +} + +/* Reap at least one exited child, waiting if necessary. 
*/ + +static void +reap_some (void) +{ + reap (-1); + reap_exited (); +} + +/* Reap all children, waiting if necessary. */ + +static void +reap_all (void) +{ + while (0 < nprocs) + reap (-1); +} + +/* Clean up any remaining temporary files. */ + +static void +cleanup (void) +{ + struct tempnode const *node; + + for (node = temphead; node; node = node->next) + unlink (node->name); + temphead = NULL; +} + +/* Cleanup actions to take when exiting. */ + +static void +exit_cleanup (void) +{ + if (temphead) + { + /* Clean up any remaining temporary files in a critical section so + that a signal handler does not try to clean them too. */ + struct cs_status cs; + cs_enter (&cs); + cleanup (); + cs_leave (&cs); + } + + close_stdout (); +} + +/* Create a new temporary file, returning its newly allocated tempnode. + Store into *PFD the file descriptor open for writing. + If the creation fails, return NULL and store -1 into *PFD if the + failure is due to file descriptor exhaustion and + SURVIVE_FD_EXHAUSTION; otherwise, die. */ + +static struct tempnode * +create_temp_file (int *pfd, bool survive_fd_exhaustion) +{ + static char const slashbase[] = "/sortXXXXXX"; + static size_t temp_dir_index; + int fd; + int saved_errno; + char const *temp_dir = temp_dirs[temp_dir_index]; + size_t len = strlen (temp_dir); + struct tempnode *node = + xmalloc (FLEXSIZEOF (struct tempnode, name, len + sizeof slashbase)); + char *file = node->name; + struct cs_status cs; + + memcpy (file, temp_dir, len); + memcpy (file + len, slashbase, sizeof slashbase); + node->next = NULL; + if (++temp_dir_index == temp_dir_count) + temp_dir_index = 0; + + /* Create the temporary file in a critical section, to avoid races. */ + cs_enter (&cs); + fd = mkostemp (file, O_CLOEXEC); + if (0 <= fd) + { + *temptail = node; + temptail = &node->next; + } + saved_errno = errno; + cs_leave (&cs); + errno = saved_errno; + + if (fd < 0) + { + if (! (survive_fd_exhaustion && errno == EMFILE)) + die (SORT_FAILURE, errno, _("cannot create temporary file in %s"), + quoteaf (temp_dir)); + free (node); + node = NULL; + } + + *pfd = fd; + return node; +} + +/* Return a pointer to stdout status, or NULL on failure. */ + +static struct stat * +get_outstatus (void) +{ + static int outstat_errno; + static struct stat outstat; + if (outstat_errno == 0) + outstat_errno = fstat (STDOUT_FILENO, &outstat) == 0 ? -1 : errno; + return outstat_errno < 0 ? &outstat : NULL; +} + +/* Return a stream for FILE, opened with mode HOW. If HOW is "w", + the file is already open on standard output, and needs to be + truncated unless FILE is null. When opening for input, "-" + means standard input. To avoid confusion, do not return file + descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when + opening an ordinary FILE. Return NULL if unsuccessful. + + Use fadvise to specify an access pattern for input files. + There are a few hints we could possibly provide, + and after careful testing it was decided that + specifying FADVISE_SEQUENTIAL was not detrimental + to any cases. On Linux 2.6.31, this option doubles + the size of read ahead performed and thus was seen to + benefit these cases: + Merging + Sorting with a smaller internal buffer + Reading from faster flash devices + + In _addition_ one could also specify other hints... + + FADVISE_WILLNEED was tested, but Linux 2.6.31 + at least uses that to _synchronously_ prepopulate the cache + with the specified range. 
While sort does need to + read all of its input before outputting, a synchronous + read of the whole file up front precludes any processing + that sort could do in parallel with the system doing + read ahead of the data. This was seen to have negative effects + in a couple of cases: + Merging + Sorting with a smaller internal buffer + This option was seen to shorten the runtime for sort + on a multicore system with lots of RAM and other processes + competing for CPU. It could be argued that more explicit + scheduling hints with 'nice' et. al. are more appropriate + for this situation. + + FADVISE_NOREUSE is a possibility as it could lower + the priority of input data in the cache as sort will + only need to process it once. However its functionality + has changed over Linux kernel versions and as of 2.6.31 + it does nothing and thus we can't depend on what it might + do in future. + + FADVISE_DONTNEED is not appropriate for user specified + input files, but for temp files we do want to drop the + cache immediately after processing. This is done implicitly + however when the files are unlinked. */ + +static FILE * +stream_open (char const *file, char const *how) +{ + FILE *fp; + + if (*how == 'r') + { + if (STREQ (file, "-")) + { + have_read_stdin = true; + fp = stdin; + } + else + { + int fd = open (file, O_RDONLY | O_CLOEXEC); + fp = fd < 0 ? NULL : fdopen (fd, how); + } + fadvise (fp, FADVISE_SEQUENTIAL); + } + else if (*how == 'w') + { + if (file && ftruncate (STDOUT_FILENO, 0) != 0) + { + int ftruncate_errno = errno; + struct stat *outst = get_outstatus (); + if (!outst || S_ISREG (outst->st_mode) || S_TYPEISSHM (outst)) + die (SORT_FAILURE, ftruncate_errno, _("%s: error truncating"), + quotef (file)); + } + fp = stdout; + } + else + assert (!"unexpected mode passed to stream_open"); + + return fp; +} + +/* Same as stream_open, except always return a non-null value; die on + failure. */ + +static FILE * +xfopen (char const *file, char const *how) +{ + FILE *fp = stream_open (file, how); + if (!fp) + sort_die (_("open failed"), file); + return fp; +} + +/* Close FP, whose name is FILE, and report any errors. */ + +static void +xfclose (FILE *fp, char const *file) +{ + switch (fileno (fp)) + { + case STDIN_FILENO: + /* Allow reading stdin from tty more than once. */ + if (feof (fp)) + clearerr (fp); + break; + + case STDOUT_FILENO: + /* Don't close stdout just yet. close_stdout does that. */ + if (fflush (fp) != 0) + sort_die (_("fflush failed"), file); + break; + + default: + if (fclose (fp) != 0) + sort_die (_("close failed"), file); + break; + } +} + +/* Move OLDFD to NEWFD. If OLDFD != NEWFD, NEWFD is not close-on-exec. */ + +static void +move_fd (int oldfd, int newfd) +{ + if (oldfd != newfd) + { + /* This should never fail for our usage. */ + dup2 (oldfd, newfd); + close (oldfd); + } +} + +/* Fork a child process for piping to and do common cleanup. The + TRIES parameter specifies how many times to try to fork before + giving up. Return the PID of the child, or -1 (setting errno) + on failure. */ + +static pid_t +pipe_fork (int pipefds[2], size_t tries) +{ +#if HAVE_WORKING_FORK + struct tempnode *saved_temphead; + int saved_errno; + double wait_retry = 0.25; + pid_t pid IF_LINT ( = -1); + struct cs_status cs; + + if (pipe2 (pipefds, O_CLOEXEC) < 0) + return -1; + + /* At least NMERGE + 1 subprocesses are needed. More could be created, but + uncontrolled subprocess generation can hurt performance significantly. 
+ Allow at most NMERGE + 2 subprocesses, on the theory that there + may be some useful parallelism by letting compression for the + previous merge finish (1 subprocess) in parallel with the current + merge (NMERGE + 1 subprocesses). */ + + if (nmerge + 1 < nprocs) + reap_some (); + + while (tries--) + { + /* This is so the child process won't delete our temp files + if it receives a signal before exec-ing. */ + cs_enter (&cs); + saved_temphead = temphead; + temphead = NULL; + + pid = fork (); + saved_errno = errno; + if (pid) + temphead = saved_temphead; + + cs_leave (&cs); + errno = saved_errno; + + if (0 <= pid || errno != EAGAIN) + break; + else + { + xnanosleep (wait_retry); + wait_retry *= 2; + reap_exited (); + } + } + + if (pid < 0) + { + saved_errno = errno; + close (pipefds[0]); + close (pipefds[1]); + errno = saved_errno; + } + else if (pid == 0) + { + close (STDIN_FILENO); + close (STDOUT_FILENO); + } + else + ++nprocs; + + return pid; + +#else /* ! HAVE_WORKING_FORK */ + return -1; +#endif +} + +/* Create a temporary file and, if asked for, start a compressor + to that file. Set *PFP to the file handle and return + the address of the new temp node. If the creation + fails, return NULL if the failure is due to file descriptor + exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */ + +static struct tempnode * +maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion) +{ + int tempfd; + struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion); + if (! node) + return NULL; + + node->state = UNCOMPRESSED; + + if (compress_program) + { + int pipefds[2]; + + node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS); + if (0 < node->pid) + { + close (tempfd); + close (pipefds[0]); + tempfd = pipefds[1]; + + register_proc (node); + } + else if (node->pid == 0) + { + /* Being the child of a multithreaded program before exec, + we're restricted to calling async-signal-safe routines here. */ + close (pipefds[1]); + move_fd (tempfd, STDOUT_FILENO); + move_fd (pipefds[0], STDIN_FILENO); + + execlp (compress_program, compress_program, (char *) NULL); + + async_safe_die (errno, "couldn't execute compress program"); + } + } + + *pfp = fdopen (tempfd, "w"); + if (! *pfp) + sort_die (_("couldn't create temporary file"), node->name); + + return node; +} + +/* Create a temporary file and, if asked for, start a compressor + to that file. Set *PFP to the file handle and return the address + of the new temp node. Die on failure. */ + +static struct tempnode * +create_temp (FILE **pfp) +{ + return maybe_create_temp (pfp, false); +} + +/* Open a compressed temp file and start a decompression process through + which to filter the input. Return NULL (setting errno to + EMFILE) if we ran out of file descriptors, and die on any other + kind of failure. */ + +static FILE * +open_temp (struct tempnode *temp) +{ + int tempfd, pipefds[2]; + FILE *fp = NULL; + + if (temp->state == UNREAPED) + wait_proc (temp->pid); + + tempfd = open (temp->name, O_RDONLY); + if (tempfd < 0) + return NULL; + + pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS); + + switch (child) + { + case -1: + if (errno != EMFILE) + die (SORT_FAILURE, errno, _("couldn't create process for %s -d"), + quoteaf (compress_program)); + close (tempfd); + errno = EMFILE; + break; + + case 0: + /* Being the child of a multithreaded program before exec, + we're restricted to calling async-signal-safe routines here. 
*/ + close (pipefds[0]); + move_fd (tempfd, STDIN_FILENO); + move_fd (pipefds[1], STDOUT_FILENO); + + execlp (compress_program, compress_program, "-d", (char *) NULL); + + async_safe_die (errno, "couldn't execute compress program (with -d)"); + + default: + temp->pid = child; + register_proc (temp); + close (tempfd); + close (pipefds[1]); + + fp = fdopen (pipefds[0], "r"); + if (! fp) + { + int saved_errno = errno; + close (pipefds[0]); + errno = saved_errno; + } + break; + } + + return fp; +} + +/* Append DIR to the array of temporary directory names. */ +static void +add_temp_dir (char const *dir) +{ + if (temp_dir_count == temp_dir_alloc) + temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc); + + temp_dirs[temp_dir_count++] = dir; +} + +/* Remove NAME from the list of temporary files. */ + +static void +zaptemp (char const *name) +{ + struct tempnode *volatile *pnode; + struct tempnode *node; + struct tempnode *next; + int unlink_status; + int unlink_errno = 0; + struct cs_status cs; + + for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next) + continue; + + if (node->state == UNREAPED) + wait_proc (node->pid); + + /* Unlink the temporary file in a critical section to avoid races. */ + next = node->next; + cs_enter (&cs); + unlink_status = unlink (name); + unlink_errno = errno; + *pnode = next; + cs_leave (&cs); + + if (unlink_status != 0) + error (0, unlink_errno, _("warning: cannot remove: %s"), quotef (name)); + if (! next) + temptail = pnode; + free (node); +} + +#if HAVE_NL_LANGINFO + +static int +struct_month_cmp (void const *m1, void const *m2) +{ + struct month const *month1 = m1; + struct month const *month2 = m2; + return strcmp (month1->name, month2->name); +} + +#endif + +/* Initialize the character class tables. */ + +static void +inittables (void) +{ + size_t i; + + for (i = 0; i < UCHAR_LIM; ++i) + { + blanks[i] = field_sep (i); + nonprinting[i] = ! isprint (i); + nondictionary[i] = ! isalnum (i) && ! field_sep (i); + fold_toupper[i] = toupper (i); + } + +#if HAVE_NL_LANGINFO + /* If we're not in the "C" locale, read different names for months. */ + if (hard_LC_TIME) + { + for (i = 0; i < MONTHS_PER_YEAR; i++) + { + char const *s; + size_t s_len; + size_t j, k; + char *name; + + s = nl_langinfo (ABMON_1 + i); + s_len = strlen (s); + monthtab[i].name = name = xmalloc (s_len + 1); + monthtab[i].val = i + 1; + + for (j = k = 0; j < s_len; j++) + if (! isblank (to_uchar (s[j]))) + name[k++] = fold_toupper[to_uchar (s[j])]; + name[k] = '\0'; + } + qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp); + } +#endif +} + +/* Specify how many inputs may be merged at once. + This may be set on the command-line with the + --batch-size option. */ +static void +specify_nmerge (int oi, char c, char const *s) +{ + uintmax_t n; + struct rlimit rlimit; + enum strtol_error e = xstrtoumax (s, NULL, 10, &n, ""); + + /* Try to find out how many file descriptors we'll be able + to open. We need at least nmerge + 3 (STDIN_FILENO, + STDOUT_FILENO and STDERR_FILENO). */ + unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0 + ? 
rlimit.rlim_cur + : OPEN_MAX) + - 3); + + if (e == LONGINT_OK) + { + nmerge = n; + if (nmerge != n) + e = LONGINT_OVERFLOW; + else + { + if (nmerge < 2) + { + error (0, 0, _("invalid --%s argument %s"), + long_options[oi].name, quote (s)); + die (SORT_FAILURE, 0, + _("minimum --%s argument is %s"), + long_options[oi].name, quote ("2")); + } + else if (max_nmerge < nmerge) + { + e = LONGINT_OVERFLOW; + } + else + return; + } + } + + if (e == LONGINT_OVERFLOW) + { + char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)]; + error (0, 0, _("--%s argument %s too large"), + long_options[oi].name, quote (s)); + die (SORT_FAILURE, 0, + _("maximum --%s argument with current rlimit is %s"), + long_options[oi].name, + uinttostr (max_nmerge, max_nmerge_buf)); + } + else + xstrtol_fatal (e, oi, c, long_options, s); +} + +/* Specify the amount of main memory to use when sorting. */ +static void +specify_sort_size (int oi, char c, char const *s) +{ + uintmax_t n; + char *suffix; + enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ"); + + /* The default unit is KiB. */ + if (e == LONGINT_OK && ISDIGIT (suffix[-1])) + { + if (n <= UINTMAX_MAX / 1024) + n *= 1024; + else + e = LONGINT_OVERFLOW; + } + + /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */ + if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1]) + switch (suffix[0]) + { + case 'b': + e = LONGINT_OK; + break; + + case '%': + { + double mem = physmem_total () * n / 100; + + /* Use "<", not "<=", to avoid problems with rounding. */ + if (mem < UINTMAX_MAX) + { + n = mem; + e = LONGINT_OK; + } + else + e = LONGINT_OVERFLOW; + } + break; + } + + if (e == LONGINT_OK) + { + /* If multiple sort sizes are specified, take the maximum, so + that option order does not matter. */ + if (n < sort_size) + return; + + sort_size = n; + if (sort_size == n) + { + sort_size = MAX (sort_size, MIN_SORT_SIZE); + return; + } + + e = LONGINT_OVERFLOW; + } + + xstrtol_fatal (e, oi, c, long_options, s); +} + +/* Specify the number of threads to spawn during internal sort. */ +static size_t +specify_nthreads (int oi, char c, char const *s) +{ + uintmax_t nthreads; + enum strtol_error e = xstrtoumax (s, NULL, 10, &nthreads, ""); + if (e == LONGINT_OVERFLOW) + return SIZE_MAX; + if (e != LONGINT_OK) + xstrtol_fatal (e, oi, c, long_options, s); + if (SIZE_MAX < nthreads) + nthreads = SIZE_MAX; + if (nthreads == 0) + die (SORT_FAILURE, 0, _("number in parallel must be nonzero")); + return nthreads; +} + +/* Return the default sort size. */ +static size_t +default_sort_size (void) +{ + /* Let SIZE be MEM, but no more than the maximum object size, + total memory, or system resource limits. Don't bother to check + for values like RLIM_INFINITY since in practice they are not much + less than SIZE_MAX. */ + size_t size = SIZE_MAX; + struct rlimit rlimit; + if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size) + size = rlimit.rlim_cur; +#ifdef RLIMIT_AS + if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size) + size = rlimit.rlim_cur; +#endif + + /* Leave a large safety margin for the above limits, as failure can + occur when they are exceeded. */ + size /= 2; + +#ifdef RLIMIT_RSS + /* Leave a 1/16 margin for RSS to leave room for code, stack, etc. + Exceeding RSS is not fatal, but can be quite slow. 
*/ + if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size) + size = rlimit.rlim_cur / 16 * 15; +#endif + + /* Let MEM be available memory or 1/8 of total memory, whichever + is greater. */ + double avail = physmem_available (); + double total = physmem_total (); + double mem = MAX (avail, total / 8); + + /* Leave a 1/4 margin for physical memory. */ + if (total * 0.75 < size) + size = total * 0.75; + + /* Return the minimum of MEM and SIZE, but no less than + MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite + right when only one argument is floating point. */ + if (mem < size) + size = mem; + return MAX (size, MIN_SORT_SIZE); +} + +/* Return the sort buffer size to use with the input files identified + by FPS and FILES, which are alternate names of the same files. + NFILES gives the number of input files; NFPS may be less. Assume + that each input line requires LINE_BYTES extra bytes' worth of line + information. Do not exceed the size bound specified by the user + (or a default size bound, if the user does not specify one). */ + +static size_t +sort_buffer_size (FILE *const *fps, size_t nfps, + char *const *files, size_t nfiles, + size_t line_bytes) +{ + /* A bound on the input size. If zero, the bound hasn't been + determined yet. */ + static size_t size_bound; + + /* In the worst case, each input byte is a newline. */ + size_t worst_case_per_input_byte = line_bytes + 1; + + /* Keep enough room for one extra input line and an extra byte. + This extra room might be needed when preparing to read EOF. */ + size_t size = worst_case_per_input_byte + 1; + + for (size_t i = 0; i < nfiles; i++) + { + struct stat st; + off_t file_size; + size_t worst_case; + + if ((i < nfps ? fstat (fileno (fps[i]), &st) + : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st) + : stat (files[i], &st)) + != 0) + sort_die (_("stat failed"), files[i]); + + if (S_ISREG (st.st_mode)) + file_size = st.st_size; + else + { + /* The file has unknown size. If the user specified a sort + buffer size, use that; otherwise, guess the size. */ + if (sort_size) + return sort_size; + file_size = INPUT_FILE_SIZE_GUESS; + } + + if (! size_bound) + { + size_bound = sort_size; + if (! size_bound) + size_bound = default_sort_size (); + } + + /* Add the amount of memory needed to represent the worst case + where the input consists entirely of newlines followed by a + single non-newline. Check for overflow. */ + worst_case = file_size * worst_case_per_input_byte + 1; + if (file_size != worst_case / worst_case_per_input_byte + || size_bound - size <= worst_case) + return size_bound; + size += worst_case; + } + + return size; +} + +/* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES + must be at least sizeof (struct line). Allocate ALLOC bytes + initially. */ + +static void +initbuf (struct buffer *buf, size_t line_bytes, size_t alloc) +{ + /* Ensure that the line array is properly aligned. If the desired + size cannot be allocated, repeatedly halve it until allocation + succeeds. The smaller allocation may hurt overall performance, + but that's better than failing. */ + while (true) + { + alloc += sizeof (struct line) - alloc % sizeof (struct line); + buf->buf = malloc (alloc); + if (buf->buf) + break; + alloc /= 2; + if (alloc <= line_bytes + 1) + xalloc_die (); + } + + buf->line_bytes = line_bytes; + buf->alloc = alloc; + buf->used = buf->left = buf->nlines = 0; + buf->eof = false; +} + +/* Return one past the limit of the line array. 
*/ + +static inline struct line * +buffer_linelim (struct buffer const *buf) +{ + void *linelim = buf->buf + buf->alloc; + return linelim; +} + +/* Return a pointer to the first character of the field specified + by KEY in LINE. */ + +static char * +begfield (struct line const *line, struct keyfield const *key) +{ + char *ptr = line->text, *lim = ptr + line->length - 1; + size_t sword = key->sword; + size_t schar = key->schar; + + /* The leading field separator itself is included in a field when -t + is absent. */ + + if (tab != TAB_DEFAULT) + while (ptr < lim && sword--) + { + while (ptr < lim && *ptr != tab) + ++ptr; + if (ptr < lim) + ++ptr; + } + else + while (ptr < lim && sword--) + { + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + while (ptr < lim && !blanks[to_uchar (*ptr)]) + ++ptr; + } + + /* If we're ignoring leading blanks when computing the Start + of the field, skip past them here. */ + if (key->skipsblanks) + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + + /* Advance PTR by SCHAR (if possible), but no further than LIM. */ + ptr = MIN (lim, ptr + schar); + + return ptr; +} + +/* Return the limit of (a pointer to the first character after) the field + in LINE specified by KEY. */ + +static char * +limfield (struct line const *line, struct keyfield const *key) +{ + char *ptr = line->text, *lim = ptr + line->length - 1; + size_t eword = key->eword, echar = key->echar; + + if (echar == 0) + eword++; /* Skip all of end field. */ + + /* Move PTR past EWORD fields or to one past the last byte on LINE, + whichever comes first. If there are more than EWORD fields, leave + PTR pointing at the beginning of the field having zero-based index, + EWORD. If a delimiter character was specified (via -t), then that + 'beginning' is the first character following the delimiting TAB. + Otherwise, leave PTR pointing at the first 'blank' character after + the preceding field. */ + if (tab != TAB_DEFAULT) + while (ptr < lim && eword--) + { + while (ptr < lim && *ptr != tab) + ++ptr; + if (ptr < lim && (eword || echar)) + ++ptr; + } + else + while (ptr < lim && eword--) + { + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + while (ptr < lim && !blanks[to_uchar (*ptr)]) + ++ptr; + } + +#ifdef POSIX_UNSPECIFIED + /* The following block of code makes GNU sort incompatible with + standard Unix sort, so it's ifdef'd out for now. + The POSIX spec isn't clear on how to interpret this. + FIXME: request clarification. + + From: kwzh@gnu.ai.mit.edu (Karl Heuer) + Date: Thu, 30 May 96 12:20:41 -0400 + [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.] + + [...]I believe I've found another bug in 'sort'. + + $ cat /tmp/sort.in + a b c 2 d + pq rs 1 t + $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in + a b c 2 d + pq rs 1 t + $ /bin/sort -k1.7,1.7 </tmp/sort.in + pq rs 1 t + a b c 2 d + + Unix sort produced the answer I expected: sort on the single character + in column 7. GNU sort produced different results, because it disagrees + on the interpretation of the key-end spec "M.N". Unix sort reads this + as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean + "skip M-1 fields, then either N-1 characters or the rest of the current + field, whichever comes first". This extra clause applies only to + key-ends, not key-starts. + */ + + /* Make LIM point to the end of (one byte past) the current field. 
*/ + if (tab != TAB_DEFAULT) + { + char *newlim; + newlim = memchr (ptr, tab, lim - ptr); + if (newlim) + lim = newlim; + } + else + { + char *newlim; + newlim = ptr; + while (newlim < lim && blanks[to_uchar (*newlim)]) + ++newlim; + while (newlim < lim && !blanks[to_uchar (*newlim)]) + ++newlim; + lim = newlim; + } +#endif + + if (echar != 0) /* We need to skip over a portion of the end field. */ + { + /* If we're ignoring leading blanks when computing the End + of the field, skip past them here. */ + if (key->skipeblanks) + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + + /* Advance PTR by ECHAR (if possible), but no further than LIM. */ + ptr = MIN (lim, ptr + echar); + } + + return ptr; +} + +/* Fill BUF reading from FP, moving buf->left bytes from the end + of buf->buf to the beginning first. If EOF is reached and the + file wasn't terminated by a newline, supply one. Set up BUF's line + table too. FILE is the name of the file corresponding to FP. + Return true if some input was read. */ + +static bool +fillbuf (struct buffer *buf, FILE *fp, char const *file) +{ + struct keyfield const *key = keylist; + char eol = eolchar; + size_t line_bytes = buf->line_bytes; + size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE; + + if (buf->eof) + return false; + + if (buf->used != buf->left) + { + memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left); + buf->used = buf->left; + buf->nlines = 0; + } + + while (true) + { + char *ptr = buf->buf + buf->used; + struct line *linelim = buffer_linelim (buf); + struct line *line = linelim - buf->nlines; + size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr; + char *line_start = buf->nlines ? line->text + line->length : buf->buf; + + while (line_bytes + 1 < avail) + { + /* Read as many bytes as possible, but do not read so many + bytes that there might not be enough room for the + corresponding line array. The worst case is when the + rest of the input file consists entirely of newlines, + except that the last byte is not a newline. */ + size_t readsize = (avail - 1) / (line_bytes + 1); + size_t bytes_read = fread (ptr, 1, readsize, fp); + char *ptrlim = ptr + bytes_read; + char *p; + avail -= bytes_read; + + if (bytes_read != readsize) + { + if (ferror (fp)) + sort_die (_("read failed"), file); + if (feof (fp)) + { + buf->eof = true; + if (buf->buf == ptrlim) + return false; + if (line_start != ptrlim && ptrlim[-1] != eol) + *ptrlim++ = eol; + } + } + + /* Find and record each line in the just-read input. */ + while ((p = memchr (ptr, eol, ptrlim - ptr))) + { + /* Delimit the line with NUL. This eliminates the need to + temporarily replace the last byte with NUL when calling + xmemcoll, which increases performance. */ + *p = '\0'; + ptr = p + 1; + line--; + line->text = line_start; + line->length = ptr - line_start; + mergesize = MAX (mergesize, line->length); + avail -= line_bytes; + + if (key) + { + /* Precompute the position of the first key for + efficiency. */ + line->keylim = (key->eword == SIZE_MAX + ? 
p + : limfield (line, key)); + + if (key->sword != SIZE_MAX) + line->keybeg = begfield (line, key); + else + { + if (key->skipsblanks) + while (blanks[to_uchar (*line_start)]) + line_start++; + line->keybeg = line_start; + } + } + + line_start = ptr; + } + + ptr = ptrlim; + if (buf->eof) + break; + } + + buf->used = ptr - buf->buf; + buf->nlines = buffer_linelim (buf) - line; + if (buf->nlines != 0) + { + buf->left = ptr - line_start; + merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE; + return true; + } + + { + /* The current input line is too long to fit in the buffer. + Increase the buffer size and try again, keeping it properly + aligned. */ + size_t line_alloc = buf->alloc / sizeof (struct line); + buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line)); + buf->alloc = line_alloc * sizeof (struct line); + } + } +} + +/* Table that maps characters to order-of-magnitude values. */ +static char const unit_order[UCHAR_LIM] = + { +#if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \ + && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107) + /* This initializer syntax works on all C99 hosts. For now, use + it only on non-ASCII hosts, to ease the pain of porting to + pre-C99 ASCII hosts. */ + ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, + ['k']=1, +#else + /* Generate the following table with this command: + perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8); + foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\ + |fmt */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, + 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +#endif + }; + +/* Traverse number given as *number consisting of digits, thousands_sep, and + decimal_point chars only. Returns the highest digit found in the number, + or '\0' if no digit has been found. Upon return *number points at the + character that immediately follows after the given number. */ +static unsigned char +traverse_raw_number (char const **number) +{ + char const *p = *number; + unsigned char ch; + unsigned char max_digit = '\0'; + bool ends_with_thousands_sep = false; + + /* Scan to end of number. + Decimals or separators not followed by digits stop the scan. + Numbers ending in decimals or separators are thus considered + to be lacking in units. + FIXME: add support for multibyte thousands_sep and decimal_point. */ + + while (ISDIGIT (ch = *p++)) + { + if (max_digit < ch) + max_digit = ch; + + /* Allow to skip only one occurrence of thousands_sep to avoid finding + the unit in the next column in case thousands_sep matches as blank + and is used as column delimiter. */ + ends_with_thousands_sep = (*p == thousands_sep); + if (ends_with_thousands_sep) + ++p; + } + + if (ends_with_thousands_sep) + { + /* thousands_sep not followed by digit is not allowed. 
*/ + *number = p - 2; + return max_digit; + } + + if (ch == decimal_point) + while (ISDIGIT (ch = *p++)) + if (max_digit < ch) + max_digit = ch; + + *number = p - 1; + return max_digit; +} + +/* Return an integer that represents the order of magnitude of the + unit following the number. The number may contain thousands + separators and a decimal point, but it may not contain leading blanks. + Negative numbers get negative orders; zero numbers have a zero order. */ + +static int _GL_ATTRIBUTE_PURE +find_unit_order (char const *number) +{ + bool minus_sign = (*number == '-'); + char const *p = number + minus_sign; + unsigned char max_digit = traverse_raw_number (&p); + if ('0' < max_digit) + { + unsigned char ch = *p; + int order = unit_order[ch]; + return (minus_sign ? -order : order); + } + else + return 0; +} + +/* Compare numbers A and B ending in units with SI or IEC prefixes + <none/unknown> < K/k < M < G < T < P < E < Z < Y */ + +static int +human_numcompare (char const *a, char const *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + int diff = find_unit_order (a) - find_unit_order (b); + return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep)); +} + +/* Compare strings A and B as numbers without explicitly converting them to + machine numbers. Comparatively slow for short strings, but asymptotically + hideously fast. */ + +static int +numcompare (char const *a, char const *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + return strnumcmp (a, b, decimal_point, thousands_sep); +} + +/* Work around a problem whereby the long double value returned by glibc's + strtold ("NaN", ...) contains uninitialized bits: clear all bytes of + A and B before calling strtold. FIXME: remove this function if + gnulib guarantees that strtold's result is always well defined. */ +static int +nan_compare (char const *sa, char const *sb) +{ + long double a; + memset (&a, 0, sizeof a); + a = strtold (sa, NULL); + + long double b; + memset (&b, 0, sizeof b); + b = strtold (sb, NULL); + + return memcmp (&a, &b, sizeof a); +} + +static int +general_numcompare (char const *sa, char const *sb) +{ + /* FIXME: maybe add option to try expensive FP conversion + only if A and B can't be compared more cheaply/accurately. */ + + char *ea; + char *eb; + long double a = strtold (sa, &ea); + long double b = strtold (sb, &eb); + + /* Put conversion errors at the start of the collating sequence. */ + if (sa == ea) + return sb == eb ? 0 : -1; + if (sb == eb) + return 1; + + /* Sort numbers in the usual way, where -0 == +0. Put NaNs after + conversion errors but before numbers; sort them by internal + bit-pattern, for lack of a more portable alternative. */ + return (a < b ? -1 + : a > b ? 1 + : a == b ? 0 + : b == b ? -1 + : a == a ? 1 + : nan_compare (sa, sb)); +} + +/* Return an integer in 1..12 of the month name MONTH. + Return 0 if the name in S is not recognized. 
*/ + +static int +getmonth (char const *month, char **ea) +{ + size_t lo = 0; + size_t hi = MONTHS_PER_YEAR; + + while (blanks[to_uchar (*month)]) + month++; + + do + { + size_t ix = (lo + hi) / 2; + char const *m = month; + char const *n = monthtab[ix].name; + + for (;; m++, n++) + { + if (!*n) + { + if (ea) + *ea = (char *) m; + return monthtab[ix].val; + } + if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n)) + { + hi = ix; + break; + } + else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n)) + { + lo = ix + 1; + break; + } + } + } + while (lo < hi); + + return 0; +} + +/* A randomly chosen MD5 state, used for random comparison. */ +static struct md5_ctx random_md5_state; + +/* Initialize the randomly chosen MD5 state. */ + +static void +random_md5_state_init (char const *random_source) +{ + unsigned char buf[MD5_DIGEST_SIZE]; + struct randread_source *r = randread_new (random_source, sizeof buf); + if (! r) + sort_die (_("open failed"), random_source); + randread (r, buf, sizeof buf); + if (randread_free (r) != 0) + sort_die (_("close failed"), random_source); + md5_init_ctx (&random_md5_state); + md5_process_bytes (buf, sizeof buf, &random_md5_state); +} + +/* This is like strxfrm, except it reports any error and exits. */ + +static size_t +xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize) +{ + errno = 0; + size_t translated_size = strxfrm (dest, src, destsize); + + if (errno) + { + error (0, errno, _("string transformation failed")); + error (0, 0, _("set LC_ALL='C' to work around the problem")); + die (SORT_FAILURE, 0, + _("the untransformed string was %s"), + quotearg_n_style (0, locale_quoting_style, src)); + } + + return translated_size; +} + +/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB) + using one or more random hash functions. TEXTA[LENA] and + TEXTB[LENB] must be zero. */ + +static int +compare_random (char *restrict texta, size_t lena, + char *restrict textb, size_t lenb) +{ + /* XFRM_DIFF records the equivalent of memcmp on the transformed + data. This is used to break ties if there is a checksum + collision, and this is good enough given the astronomically low + probability of a collision. */ + int xfrm_diff = 0; + + char stackbuf[4000]; + char *buf = stackbuf; + size_t bufsize = sizeof stackbuf; + void *allocated = NULL; + uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)]; + struct md5_ctx s[2]; + s[0] = s[1] = random_md5_state; + + if (hard_LC_COLLATE) + { + char const *lima = texta + lena; + char const *limb = textb + lenb; + + while (true) + { + /* Transform the text into the basis of comparison, so that byte + strings that would otherwise considered to be equal are + considered equal here even if their bytes differ. + + Each time through this loop, transform one + null-terminated string's worth from TEXTA or from TEXTB + or both. That way, there's no need to store the + transformation of the whole line, if it contains many + null-terminated strings. */ + + /* Store the transformed data into a big-enough buffer. */ + + /* A 3X size guess avoids the overhead of calling strxfrm + twice on typical implementations. Don't worry about + size_t overflow, as the guess need not be correct. */ + size_t guess_bufsize = 3 * (lena + lenb) + 2; + if (bufsize < guess_bufsize) + { + bufsize = MAX (guess_bufsize, bufsize * 3 / 2); + free (allocated); + buf = allocated = malloc (bufsize); + if (! buf) + { + buf = stackbuf; + bufsize = sizeof stackbuf; + } + } + + size_t sizea = + (texta < lima ? 
xstrxfrm (buf, texta, bufsize) + 1 : 0); + bool a_fits = sizea <= bufsize; + size_t sizeb = + (textb < limb + ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb, + (a_fits ? bufsize - sizea : 0)) + + 1) + : 0); + + if (! (a_fits && sizea + sizeb <= bufsize)) + { + bufsize = sizea + sizeb; + if (bufsize < SIZE_MAX / 3) + bufsize = bufsize * 3 / 2; + free (allocated); + buf = allocated = xmalloc (bufsize); + if (texta < lima) + strxfrm (buf, texta, sizea); + if (textb < limb) + strxfrm (buf + sizea, textb, sizeb); + } + + /* Advance past NULs to the next part of each input string, + exiting the loop if both strings are exhausted. When + exiting the loop, prepare to finish off the tiebreaker + comparison properly. */ + if (texta < lima) + texta += strlen (texta) + 1; + if (textb < limb) + textb += strlen (textb) + 1; + if (! (texta < lima || textb < limb)) + { + lena = sizea; texta = buf; + lenb = sizeb; textb = buf + sizea; + break; + } + + /* Accumulate the transformed data in the corresponding + checksums. */ + md5_process_bytes (buf, sizea, &s[0]); + md5_process_bytes (buf + sizea, sizeb, &s[1]); + + /* Update the tiebreaker comparison of the transformed data. */ + if (! xfrm_diff) + { + xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb)); + if (! xfrm_diff) + xfrm_diff = (sizea > sizeb) - (sizea < sizeb); + } + } + } + + /* Compute and compare the checksums. */ + md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]); + md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]); + int diff = memcmp (dig[0], dig[1], sizeof dig[0]); + + /* Fall back on the tiebreaker if the checksums collide. */ + if (! diff) + { + if (! xfrm_diff) + { + xfrm_diff = memcmp (texta, textb, MIN (lena, lenb)); + if (! xfrm_diff) + xfrm_diff = (lena > lenb) - (lena < lenb); + } + + diff = xfrm_diff; + } + + free (allocated); + + return diff; +} + +/* Return the printable width of the block of memory starting at + TEXT and ending just before LIM, counting each tab as one byte. + FIXME: Should we generally be counting non printable chars? */ + +static size_t +debug_width (char const *text, char const *lim) +{ + size_t width = mbsnwidth (text, lim - text, 0); + while (text < lim) + width += (*text++ == '\t'); + return width; +} + +/* For debug mode, "underline" a key at the + specified offset and screen width. */ + +static void +mark_key (size_t offset, size_t width) +{ + while (offset--) + putchar (' '); + + if (!width) + printf (_("^ no match for key\n")); + else + { + do + putchar ('_'); + while (--width); + + putchar ('\n'); + } +} + +/* Return true if KEY is a numeric key. */ + +static inline bool +key_numeric (struct keyfield const *key) +{ + return key->numeric || key->general_numeric || key->human_numeric; +} + +/* For LINE, output a debugging line that underlines KEY in LINE. + If KEY is null, underline the whole line. 
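+ For example, 'sort --debug -k2,2n' underlines, under each output line,
+ just the part of field 2 consumed by the numeric comparison (or prints
+ "^ no match for key" if the field is absent), and adds a row of '_'
+ under the whole line for the last-resort comparison unless -s or -u
+ is in effect.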
*/ + +static void +debug_key (struct line const *line, struct keyfield const *key) +{ + char *text = line->text; + char *beg = text; + char *lim = text + line->length - 1; + + if (key) + { + if (key->sword != SIZE_MAX) + beg = begfield (line, key); + if (key->eword != SIZE_MAX) + lim = limfield (line, key); + + if ((key->skipsblanks && key->sword == SIZE_MAX) + || key->month || key_numeric (key)) + { + char saved = *lim; + *lim = '\0'; + + while (blanks[to_uchar (*beg)]) + beg++; + + char *tighter_lim = beg; + + if (lim < beg) + tighter_lim = lim; + else if (key->month) + getmonth (beg, &tighter_lim); + else if (key->general_numeric) + ignore_value (strtold (beg, &tighter_lim)); + else if (key->numeric || key->human_numeric) + { + char const *p = beg + (beg < lim && *beg == '-'); + unsigned char max_digit = traverse_raw_number (&p); + if ('0' <= max_digit) + { + unsigned char ch = *p; + tighter_lim = (char *) p + + (key->human_numeric && unit_order[ch]); + } + } + else + tighter_lim = lim; + + *lim = saved; + lim = tighter_lim; + } + } + + size_t offset = debug_width (text, beg); + size_t width = debug_width (beg, lim); + mark_key (offset, width); +} + +/* Debug LINE by underlining its keys. */ + +static void +debug_line (struct line const *line) +{ + struct keyfield const *key = keylist; + + do + debug_key (line, key); + while (key && ((key = key->next) || ! (unique || stable))); +} + +/* Return whether sorting options specified for key. */ + +static bool +default_key_compare (struct keyfield const *key) +{ + return ! (key->ignore + || key->translate + || key->skipsblanks + || key->skipeblanks + || key_numeric (key) + || key->month + || key->version + || key->random + /* || key->reverse */ + ); +} + +/* Convert a key to the short options used to specify it. */ + +static void +key_to_opts (struct keyfield const *key, char *opts) +{ + if (key->skipsblanks || key->skipeblanks) + *opts++ = 'b';/* either disables global -b */ + if (key->ignore == nondictionary) + *opts++ = 'd'; + if (key->translate) + *opts++ = 'f'; + if (key->general_numeric) + *opts++ = 'g'; + if (key->human_numeric) + *opts++ = 'h'; + if (key->ignore == nonprinting) + *opts++ = 'i'; + if (key->month) + *opts++ = 'M'; + if (key->numeric) + *opts++ = 'n'; + if (key->random) + *opts++ = 'R'; + if (key->reverse) + *opts++ = 'r'; + if (key->version) + *opts++ = 'V'; + *opts = '\0'; +} + +/* Output data independent key warnings to stderr. 
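+ For example, when --debug is given, the obsolescent usage 'sort +1 -2'
+ is diagnosed with a message like: obsolescent key '+1 -2' used;
+ consider '-k 2,2' instead.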
*/ + +static void +key_warnings (struct keyfield const *gkey, bool gkey_only) +{ + struct keyfield const *key; + struct keyfield ugkey = *gkey; + unsigned long keynum = 1; + + for (key = keylist; key; key = key->next, keynum++) + { + if (key->traditional_used) + { + size_t sword = key->sword; + size_t eword = key->eword; + char tmp[INT_BUFSIZE_BOUND (uintmax_t)]; + /* obsolescent syntax +A.x -B.y is equivalent to: + -k A+1.x+1,B.y (when y = 0) + -k A+1.x+1,B+1.y (when y > 0) */ + char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */ + char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */ + char *po = obuf; + char *pn = nbuf; + + if (sword == SIZE_MAX) + sword++; + + po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp)); + pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp)); + if (key->eword != SIZE_MAX) + { + stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp)); + stpcpy (stpcpy (pn, ","), + umaxtostr (eword + 1 + + (key->echar == SIZE_MAX), tmp)); + } + error (0, 0, _("obsolescent key %s used; consider %s instead"), + quote_n (0, obuf), quote_n (1, nbuf)); + } + + /* Warn about field specs that will never match. */ + bool zero_width = key->sword != SIZE_MAX && key->eword < key->sword; + if (zero_width) + error (0, 0, _("key %lu has zero width and will be ignored"), keynum); + + /* Warn about significant leading blanks. */ + bool implicit_skip = key_numeric (key) || key->month; + bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */ + if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset + && ((!key->skipsblanks && !implicit_skip) + || (!key->skipsblanks && key->schar) + || (!key->skipeblanks && key->echar))) + error (0, 0, _("leading blanks are significant in key %lu; " + "consider also specifying 'b'"), keynum); + + /* Warn about numeric comparisons spanning fields, + as field delimiters could be interpreted as part + of the number (maybe only in other locales). */ + if (!gkey_only && key_numeric (key)) + { + size_t sword = key->sword + 1; + size_t eword = key->eword + 1; + if (!sword) + sword++; + if (!eword || sword < eword) + error (0, 0, _("key %lu is numeric and spans multiple fields"), + keynum); + } + + /* Flag global options not copied or specified in any key. */ + if (ugkey.ignore && (ugkey.ignore == key->ignore)) + ugkey.ignore = NULL; + if (ugkey.translate && (ugkey.translate == key->translate)) + ugkey.translate = NULL; + ugkey.skipsblanks &= !key->skipsblanks; + ugkey.skipeblanks &= !key->skipeblanks; + ugkey.month &= !key->month; + ugkey.numeric &= !key->numeric; + ugkey.general_numeric &= !key->general_numeric; + ugkey.human_numeric &= !key->human_numeric; + ugkey.random &= !key->random; + ugkey.version &= !key->version; + ugkey.reverse &= !key->reverse; + } + + /* Warn about ignored global options flagged above. + This clears all flags if UGKEY is the only one in the list. */ + if (!default_key_compare (&ugkey) + || (ugkey.reverse && (stable || unique) && keylist)) + { + bool ugkey_reverse = ugkey.reverse; + if (!(stable || unique)) + ugkey.reverse = false; + /* The following is too big, but guaranteed to be "big enough". 
*/ + char opts[sizeof short_options]; + key_to_opts (&ugkey, opts); + error (0, 0, + ngettext ("option '-%s' is ignored", + "options '-%s' are ignored", + select_plural (strlen (opts))), opts); + ugkey.reverse = ugkey_reverse; + } + if (ugkey.reverse && !(stable || unique) && keylist) + error (0, 0, _("option '-r' only applies to last-resort comparison")); +} + +/* Compare two lines A and B trying every key in sequence until there + are no more keys or a difference is found. */ + +static int +keycompare (struct line const *a, struct line const *b) +{ + struct keyfield *key = keylist; + + /* For the first iteration only, the key positions have been + precomputed for us. */ + char *texta = a->keybeg; + char *textb = b->keybeg; + char *lima = a->keylim; + char *limb = b->keylim; + + int diff; + + while (true) + { + char const *translate = key->translate; + bool const *ignore = key->ignore; + + /* Treat field ends before field starts as empty fields. */ + lima = MAX (texta, lima); + limb = MAX (textb, limb); + + /* Find the lengths. */ + size_t lena = lima - texta; + size_t lenb = limb - textb; + + if (hard_LC_COLLATE || key_numeric (key) + || key->month || key->random || key->version) + { + char *ta; + char *tb; + size_t tlena; + size_t tlenb; + + char enda IF_LINT (= 0); + char endb IF_LINT (= 0); + void *allocated IF_LINT (= NULL); + char stackbuf[4000]; + + if (ignore || translate) + { + /* Compute with copies of the keys, which are the result of + translating or ignoring characters, and which need their + own storage. */ + + size_t i; + + /* Allocate space for copies. */ + size_t size = lena + 1 + lenb + 1; + if (size <= sizeof stackbuf) + ta = stackbuf, allocated = NULL; + else + ta = allocated = xmalloc (size); + tb = ta + lena + 1; + + /* Put into each copy a version of the key in which the + requested characters are ignored or translated. */ + for (tlena = i = 0; i < lena; i++) + if (! (ignore && ignore[to_uchar (texta[i])])) + ta[tlena++] = (translate + ? translate[to_uchar (texta[i])] + : texta[i]); + ta[tlena] = '\0'; + + for (tlenb = i = 0; i < lenb; i++) + if (! (ignore && ignore[to_uchar (textb[i])])) + tb[tlenb++] = (translate + ? translate[to_uchar (textb[i])] + : textb[i]); + tb[tlenb] = '\0'; + } + else + { + /* Use the keys in-place, temporarily null-terminated. */ + ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0'; + tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0'; + } + + if (key->numeric) + diff = numcompare (ta, tb); + else if (key->general_numeric) + diff = general_numcompare (ta, tb); + else if (key->human_numeric) + diff = human_numcompare (ta, tb); + else if (key->month) + diff = getmonth (ta, NULL) - getmonth (tb, NULL); + else if (key->random) + diff = compare_random (ta, tlena, tb, tlenb); + else if (key->version) + diff = filevercmp (ta, tb); + else + { + /* Locale-dependent string sorting. This is slower than + C-locale sorting, which is implemented below. */ + if (tlena == 0) + diff = - NONZERO (tlenb); + else if (tlenb == 0) + diff = 1; + else + diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1); + } + + if (ignore || translate) + free (allocated); + else + { + ta[tlena] = enda; + tb[tlenb] = endb; + } + } + else if (ignore) + { +#define CMP_WITH_IGNORE(A, B) \ + do \ + { \ + while (true) \ + { \ + while (texta < lima && ignore[to_uchar (*texta)]) \ + ++texta; \ + while (textb < limb && ignore[to_uchar (*textb)]) \ + ++textb; \ + if (! 
(texta < lima && textb < limb)) \ + break; \ + diff = to_uchar (A) - to_uchar (B); \ + if (diff) \ + goto not_equal; \ + ++texta; \ + ++textb; \ + } \ + \ + diff = (texta < lima) - (textb < limb); \ + } \ + while (0) + + if (translate) + CMP_WITH_IGNORE (translate[to_uchar (*texta)], + translate[to_uchar (*textb)]); + else + CMP_WITH_IGNORE (*texta, *textb); + } + else if (lena == 0) + diff = - NONZERO (lenb); + else if (lenb == 0) + goto greater; + else + { + if (translate) + { + while (texta < lima && textb < limb) + { + diff = (to_uchar (translate[to_uchar (*texta++)]) + - to_uchar (translate[to_uchar (*textb++)])); + if (diff) + goto not_equal; + } + } + else + { + diff = memcmp (texta, textb, MIN (lena, lenb)); + if (diff) + goto not_equal; + } + diff = lena < lenb ? -1 : lena != lenb; + } + + if (diff) + goto not_equal; + + key = key->next; + if (! key) + break; + + /* Find the beginning and limit of the next field. */ + if (key->eword != SIZE_MAX) + lima = limfield (a, key), limb = limfield (b, key); + else + lima = a->text + a->length - 1, limb = b->text + b->length - 1; + + if (key->sword != SIZE_MAX) + texta = begfield (a, key), textb = begfield (b, key); + else + { + texta = a->text, textb = b->text; + if (key->skipsblanks) + { + while (texta < lima && blanks[to_uchar (*texta)]) + ++texta; + while (textb < limb && blanks[to_uchar (*textb)]) + ++textb; + } + } + } + + return 0; + + greater: + diff = 1; + not_equal: + return key->reverse ? -diff : diff; +} + +/* Compare two lines A and B, returning negative, zero, or positive + depending on whether A compares less than, equal to, or greater than B. */ + +static int +compare (struct line const *a, struct line const *b) +{ + int diff; + size_t alen, blen; + + /* First try to compare on the specified keys (if any). + The only two cases with no key at all are unadorned sort, + and unadorned sort -r. */ + if (keylist) + { + diff = keycompare (a, b); + if (diff || unique || stable) + return diff; + } + + /* If the keys all compare equal (or no keys were specified) + fall through to the default comparison. */ + alen = a->length - 1, blen = b->length - 1; + + if (alen == 0) + diff = - NONZERO (blen); + else if (blen == 0) + diff = 1; + else if (hard_LC_COLLATE) + { + /* xmemcoll0 is a performance enhancement as + it will not unconditionally write '\0' after the + passed in buffers, which was seen to give around + a 3% increase in performance for short lines. */ + diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1); + } + else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen)))) + diff = alen < blen ? -1 : alen != blen; + + return reverse ? -diff : diff; +} + +/* Write LINE to output stream FP; the output file's name is + OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output + otherwise. If debugging is enabled and FP is standard output, + append some debugging information. */ + +static void +write_line (struct line const *line, FILE *fp, char const *output_file) +{ + char *buf = line->text; + size_t n_bytes = line->length; + char *ebuf = buf + n_bytes; + + if (!output_file && debug) + { + /* Convert TAB to '>' and EOL to \n, and then output debugging info. 
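+ For example, a line containing "foo<TAB>bar" is shown as "foo>bar",
+ so the '_' rows printed by debug_line line up one column per byte.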
*/ + char const *c = buf; + + while (c < ebuf) + { + char wc = *c++; + if (wc == '\t') + wc = '>'; + else if (c == ebuf) + wc = '\n'; + if (fputc (wc, fp) == EOF) + sort_die (_("write failed"), output_file); + } + + debug_line (line); + } + else + { + ebuf[-1] = eolchar; + if (fwrite (buf, 1, n_bytes, fp) != n_bytes) + sort_die (_("write failed"), output_file); + ebuf[-1] = '\0'; + } +} + +/* Check that the lines read from FILE_NAME come in order. Return + true if they are in order. If CHECKONLY == 'c', also print a + diagnostic (FILE_NAME, line number, contents of line) to stderr if + they are not in order. */ + +static bool +check (char const *file_name, char checkonly) +{ + FILE *fp = xfopen (file_name, "r"); + struct buffer buf; /* Input buffer. */ + struct line temp; /* Copy of previous line. */ + size_t alloc = 0; + uintmax_t line_number = 0; + struct keyfield const *key = keylist; + bool nonunique = ! unique; + bool ordered = true; + + initbuf (&buf, sizeof (struct line), + MAX (merge_buffer_size, sort_size)); + temp.text = NULL; + + while (fillbuf (&buf, fp, file_name)) + { + struct line const *line = buffer_linelim (&buf); + struct line const *linebase = line - buf.nlines; + + /* Make sure the line saved from the old buffer contents is + less than or equal to the first line of the new buffer. */ + if (alloc && nonunique <= compare (&temp, line - 1)) + { + found_disorder: + { + if (checkonly == 'c') + { + struct line const *disorder_line = line - 1; + uintmax_t disorder_line_number = + buffer_linelim (&buf) - disorder_line + line_number; + char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)]; + fprintf (stderr, _("%s: %s:%s: disorder: "), + program_name, file_name, + umaxtostr (disorder_line_number, hr_buf)); + write_line (disorder_line, stderr, _("standard error")); + } + + ordered = false; + break; + } + } + + /* Compare each line in the buffer with its successor. */ + while (linebase < --line) + if (nonunique <= compare (line, line - 1)) + goto found_disorder; + + line_number += buf.nlines; + + /* Save the last line of the buffer. */ + if (alloc < line->length) + { + do + { + alloc *= 2; + if (! alloc) + { + alloc = line->length; + break; + } + } + while (alloc < line->length); + + free (temp.text); + temp.text = xmalloc (alloc); + } + memcpy (temp.text, line->text, line->length); + temp.length = line->length; + if (key) + { + temp.keybeg = temp.text + (line->keybeg - line->text); + temp.keylim = temp.text + (line->keylim - line->text); + } + } + + xfclose (fp, file_name); + free (buf.buf); + free (temp.text); + return ordered; +} + +/* Open FILES (there are NFILES of them) and store the resulting array + of stream pointers into (*PFPS). Allocate the array. Return the + number of successfully opened files, setting errno if this value is + less than NFILES. */ + +static size_t +open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps) +{ + FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps); + int i; + + /* Open as many input files as we can. */ + for (i = 0; i < nfiles; i++) + { + fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED + ? open_temp (files[i].temp) + : stream_open (files[i].name, "r")); + if (!fps[i]) + break; + } + + return i; +} + +/* Merge lines from FILES onto OFP. NTEMPS is the number of temporary + files (all of which are at the start of the FILES array), and + NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE. + FPS is the vector of open stream corresponding to the files. 
+ Close input and output streams before returning. + OUTPUT_FILE gives the name of the output file. If it is NULL, + the output file is standard output. */ + +static void +mergefps (struct sortfile *files, size_t ntemps, size_t nfiles, + FILE *ofp, char const *output_file, FILE **fps) +{ + struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer); + /* Input buffers for each file. */ + struct line saved; /* Saved line storage for unique check. */ + struct line const *savedline = NULL; + /* &saved if there is a saved line. */ + size_t savealloc = 0; /* Size allocated for the saved line. */ + struct line const **cur = xnmalloc (nfiles, sizeof *cur); + /* Current line in each line table. */ + struct line const **base = xnmalloc (nfiles, sizeof *base); + /* Base of each line table. */ + size_t *ord = xnmalloc (nfiles, sizeof *ord); + /* Table representing a permutation of fps, + such that cur[ord[0]] is the smallest line + and will be next output. */ + size_t i; + size_t j; + size_t t; + struct keyfield const *key = keylist; + saved.text = NULL; + + /* Read initial lines from each input file. */ + for (i = 0; i < nfiles; ) + { + initbuf (&buffer[i], sizeof (struct line), + MAX (merge_buffer_size, sort_size / nfiles)); + if (fillbuf (&buffer[i], fps[i], files[i].name)) + { + struct line const *linelim = buffer_linelim (&buffer[i]); + cur[i] = linelim - 1; + base[i] = linelim - buffer[i].nlines; + i++; + } + else + { + /* fps[i] is empty; eliminate it from future consideration. */ + xfclose (fps[i], files[i].name); + if (i < ntemps) + { + ntemps--; + zaptemp (files[i].name); + } + free (buffer[i].buf); + --nfiles; + for (j = i; j < nfiles; ++j) + { + files[j] = files[j + 1]; + fps[j] = fps[j + 1]; + } + } + } + + /* Set up the ord table according to comparisons among input lines. + Since this only reorders two items if one is strictly greater than + the other, it is stable. */ + for (i = 0; i < nfiles; ++i) + ord[i] = i; + for (i = 1; i < nfiles; ++i) + if (0 < compare (cur[ord[i - 1]], cur[ord[i]])) + t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0; + + /* Repeatedly output the smallest line until no input remains. */ + while (nfiles) + { + struct line const *smallest = cur[ord[0]]; + + /* If uniquified output is turned on, output only the first of + an identical series of lines. */ + if (unique) + { + if (savedline && compare (savedline, smallest)) + { + savedline = NULL; + write_line (&saved, ofp, output_file); + } + if (!savedline) + { + savedline = &saved; + if (savealloc < smallest->length) + { + do + if (! savealloc) + { + savealloc = smallest->length; + break; + } + while ((savealloc *= 2) < smallest->length); + + free (saved.text); + saved.text = xmalloc (savealloc); + } + saved.length = smallest->length; + memcpy (saved.text, smallest->text, saved.length); + if (key) + { + saved.keybeg = + saved.text + (smallest->keybeg - smallest->text); + saved.keylim = + saved.text + (smallest->keylim - smallest->text); + } + } + } + else + write_line (smallest, ofp, output_file); + + /* Check if we need to read more lines into core. */ + if (base[ord[0]] < smallest) + cur[ord[0]] = smallest - 1; + else + { + if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name)) + { + struct line const *linelim = buffer_linelim (&buffer[ord[0]]); + cur[ord[0]] = linelim - 1; + base[ord[0]] = linelim - buffer[ord[0]].nlines; + } + else + { + /* We reached EOF on fps[ord[0]]. 
*/ + for (i = 1; i < nfiles; ++i) + if (ord[i] > ord[0]) + --ord[i]; + --nfiles; + xfclose (fps[ord[0]], files[ord[0]].name); + if (ord[0] < ntemps) + { + ntemps--; + zaptemp (files[ord[0]].name); + } + free (buffer[ord[0]].buf); + for (i = ord[0]; i < nfiles; ++i) + { + fps[i] = fps[i + 1]; + files[i] = files[i + 1]; + buffer[i] = buffer[i + 1]; + cur[i] = cur[i + 1]; + base[i] = base[i + 1]; + } + for (i = 0; i < nfiles; ++i) + ord[i] = ord[i + 1]; + continue; + } + } + + /* The new line just read in may be larger than other lines + already in main memory; push it back in the queue until we + encounter a line larger than it. Optimize for the common + case where the new line is smallest. */ + { + size_t lo = 1; + size_t hi = nfiles; + size_t probe = lo; + size_t ord0 = ord[0]; + size_t count_of_smaller_lines; + + while (lo < hi) + { + int cmp = compare (cur[ord0], cur[ord[probe]]); + if (cmp < 0 || (cmp == 0 && ord0 < ord[probe])) + hi = probe; + else + lo = probe + 1; + probe = (lo + hi) / 2; + } + + count_of_smaller_lines = lo - 1; + for (j = 0; j < count_of_smaller_lines; j++) + ord[j] = ord[j + 1]; + ord[count_of_smaller_lines] = ord0; + } + } + + if (unique && savedline) + { + write_line (&saved, ofp, output_file); + free (saved.text); + } + + xfclose (ofp, output_file); + free (fps); + free (buffer); + free (ord); + free (base); + free (cur); +} + +/* Merge lines from FILES onto OFP. NTEMPS is the number of temporary + files (all of which are at the start of the FILES array), and + NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE. + Close input and output files before returning. + OUTPUT_FILE gives the name of the output file. + + Return the number of files successfully merged. This number can be + less than NFILES if we ran low on file descriptors, but in this + case it is never less than 2. */ + +static size_t +mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles, + FILE *ofp, char const *output_file) +{ + FILE **fps; + size_t nopened = open_input_files (files, nfiles, &fps); + if (nopened < nfiles && nopened < 2) + sort_die (_("open failed"), files[nopened].name); + mergefps (files, ntemps, nopened, ofp, output_file, fps); + return nopened; +} + +/* Merge into T (of size NLINES) the two sorted arrays of lines + LO (with NLINES / 2 members), and + T - (NLINES / 2) (with NLINES - NLINES / 2 members). + T and LO point just past their respective arrays, and the arrays + are in reverse order. NLINES must be at least 2. */ + +static void +mergelines (struct line *restrict t, size_t nlines, + struct line const *restrict lo) +{ + size_t nlo = nlines / 2; + size_t nhi = nlines - nlo; + struct line *hi = t - nlo; + + while (true) + if (compare (lo - 1, hi - 1) <= 0) + { + *--t = *--lo; + if (! --nlo) + { + /* HI must equal T now, and there is no need to copy from + HI to T. */ + return; + } + } + else + { + *--t = *--hi; + if (! --nhi) + { + do + *--t = *--lo; + while (--nlo); + + return; + } + } +} + +/* Sort the array LINES with NLINES members, using TEMP for temporary space. + Do this all within one thread. NLINES must be at least 2. + If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES. + Otherwise the sort is in-place and TEMP is half-sized. + The input and output arrays are in reverse order, and LINES and + TEMP point just past the end of their respective arrays. + + Use a recursive divide-and-conquer algorithm, in the style + suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. 
Use + the optimization suggested by exercise 5.2.4-10; this requires room + for only 1.5*N lines, rather than the usual 2*N lines. Knuth + writes that this memory optimization was originally published by + D. A. Bell, Comp J. 1 (1958), 75. */ + +static void +sequential_sort (struct line *restrict lines, size_t nlines, + struct line *restrict temp, bool to_temp) +{ + if (nlines == 2) + { + /* Declare 'swap' as int, not bool, to work around a bug + <https://lists.gnu.org/r/bug-coreutils/2005-10/msg00086.html> + in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */ + int swap = (0 < compare (&lines[-1], &lines[-2])); + if (to_temp) + { + temp[-1] = lines[-1 - swap]; + temp[-2] = lines[-2 + swap]; + } + else if (swap) + { + temp[-1] = lines[-1]; + lines[-1] = lines[-2]; + lines[-2] = temp[-1]; + } + } + else + { + size_t nlo = nlines / 2; + size_t nhi = nlines - nlo; + struct line *lo = lines; + struct line *hi = lines - nlo; + + sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp); + if (1 < nlo) + sequential_sort (lo, nlo, temp, !to_temp); + else if (!to_temp) + temp[-1] = lo[-1]; + + struct line *dest; + struct line const *sorted_lo; + if (to_temp) + { + dest = temp; + sorted_lo = lines; + } + else + { + dest = lines; + sorted_lo = temp; + } + mergelines (dest, nlines, sorted_lo); + } +} + +static struct merge_node *init_node (struct merge_node *restrict, + struct merge_node *restrict, + struct line *, size_t, size_t, bool); + + +/* Create and return a merge tree for NTHREADS threads, sorting NLINES + lines, with destination DEST. */ +static struct merge_node * +merge_tree_init (size_t nthreads, size_t nlines, struct line *dest) +{ + struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads); + + struct merge_node *root = merge_tree; + root->lo = root->hi = root->end_lo = root->end_hi = NULL; + root->dest = NULL; + root->nlo = root->nhi = nlines; + root->parent = NULL; + root->level = MERGE_END; + root->queued = false; + pthread_mutex_init (&root->lock, NULL); + + init_node (root, root + 1, dest, nthreads, nlines, false); + return merge_tree; +} + +/* Destroy the merge tree. */ +static void +merge_tree_destroy (size_t nthreads, struct merge_node *merge_tree) +{ + size_t n_nodes = nthreads * 2; + struct merge_node *node = merge_tree; + + while (n_nodes--) + { + pthread_mutex_destroy (&node->lock); + node++; + } + + free (merge_tree); +} + +/* Initialize a merge tree node and its descendants. The node's + parent is PARENT. The node and its descendants are taken from the + array of nodes NODE_POOL. Their destination starts at DEST; they + will consume NTHREADS threads. The total number of sort lines is + TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of + its parent. */ + +static struct merge_node * +init_node (struct merge_node *restrict parent, + struct merge_node *restrict node_pool, + struct line *dest, size_t nthreads, + size_t total_lines, bool is_lo_child) +{ + size_t nlines = (is_lo_child ? parent->nlo : parent->nhi); + size_t nlo = nlines / 2; + size_t nhi = nlines - nlo; + struct line *lo = dest - total_lines; + struct line *hi = lo - nlo; + struct line **parent_end = (is_lo_child ? 
&parent->end_lo : &parent->end_hi); + + struct merge_node *node = node_pool++; + node->lo = node->end_lo = lo; + node->hi = node->end_hi = hi; + node->dest = parent_end; + node->nlo = nlo; + node->nhi = nhi; + node->parent = parent; + node->level = parent->level + 1; + node->queued = false; + pthread_mutex_init (&node->lock, NULL); + + if (nthreads > 1) + { + size_t lo_threads = nthreads / 2; + size_t hi_threads = nthreads - lo_threads; + node->lo_child = node_pool; + node_pool = init_node (node, node_pool, lo, lo_threads, + total_lines, true); + node->hi_child = node_pool; + node_pool = init_node (node, node_pool, hi, hi_threads, + total_lines, false); + } + else + { + node->lo_child = NULL; + node->hi_child = NULL; + } + return node_pool; +} + + +/* Compare two merge nodes A and B for priority. */ + +static int +compare_nodes (void const *a, void const *b) +{ + struct merge_node const *nodea = a; + struct merge_node const *nodeb = b; + if (nodea->level == nodeb->level) + return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi); + return nodea->level < nodeb->level; +} + +/* Lock a merge tree NODE. */ + +static inline void +lock_node (struct merge_node *node) +{ + pthread_mutex_lock (&node->lock); +} + +/* Unlock a merge tree NODE. */ + +static inline void +unlock_node (struct merge_node *node) +{ + pthread_mutex_unlock (&node->lock); +} + +/* Destroy merge QUEUE. */ + +static void +queue_destroy (struct merge_node_queue *queue) +{ + heap_free (queue->priority_queue); + pthread_cond_destroy (&queue->cond); + pthread_mutex_destroy (&queue->mutex); +} + +/* Initialize merge QUEUE, allocating space suitable for a maximum of + NTHREADS threads. */ + +static void +queue_init (struct merge_node_queue *queue, size_t nthreads) +{ + /* Though it's highly unlikely all nodes are in the heap at the same + time, the heap should accommodate all of them. Counting a NULL + dummy head for the heap, reserve 2 * NTHREADS nodes. */ + queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads); + pthread_mutex_init (&queue->mutex, NULL); + pthread_cond_init (&queue->cond, NULL); +} + +/* Insert NODE into QUEUE. The caller either holds a lock on NODE, or + does not need to lock NODE. */ + +static void +queue_insert (struct merge_node_queue *queue, struct merge_node *node) +{ + pthread_mutex_lock (&queue->mutex); + heap_insert (queue->priority_queue, node); + node->queued = true; + pthread_cond_signal (&queue->cond); + pthread_mutex_unlock (&queue->mutex); +} + +/* Pop the top node off the priority QUEUE, lock the node, return it. */ + +static struct merge_node * +queue_pop (struct merge_node_queue *queue) +{ + struct merge_node *node; + pthread_mutex_lock (&queue->mutex); + while (! (node = heap_remove_top (queue->priority_queue))) + pthread_cond_wait (&queue->cond, &queue->mutex); + pthread_mutex_unlock (&queue->mutex); + lock_node (node); + node->queued = false; + return node; +} + +/* Output LINE to TFP, unless -u is specified and the line compares + equal to the previous line. TEMP_OUTPUT is the name of TFP, or + is null if TFP is standard output. + + This function does not save the line for comparison later, so it is + appropriate only for internal sort. */ + +static void +write_unique (struct line const *line, FILE *tfp, char const *temp_output) +{ + if (unique) + { + if (saved_line.text && ! compare (line, &saved_line)) + return; + saved_line = *line; + } + + write_line (line, tfp, temp_output); +} + +/* Merge the lines currently available to a NODE in the binary + merge tree. 
Merge a number of lines appropriate for this merge + level, assuming TOTAL_LINES is the total number of lines. + + If merging at the top level, send output to TFP. TEMP_OUTPUT is + the name of TFP, or is null if TFP is standard output. */ + +static void +mergelines_node (struct merge_node *restrict node, size_t total_lines, + FILE *tfp, char const *temp_output) +{ + struct line *lo_orig = node->lo; + struct line *hi_orig = node->hi; + size_t to_merge = MAX_MERGE (total_lines, node->level); + size_t merged_lo; + size_t merged_hi; + + if (node->level > MERGE_ROOT) + { + /* Merge to destination buffer. */ + struct line *dest = *node->dest; + while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--) + if (compare (node->lo - 1, node->hi - 1) <= 0) + *--dest = *--node->lo; + else + *--dest = *--node->hi; + + merged_lo = lo_orig - node->lo; + merged_hi = hi_orig - node->hi; + + if (node->nhi == merged_hi) + while (node->lo != node->end_lo && to_merge--) + *--dest = *--node->lo; + else if (node->nlo == merged_lo) + while (node->hi != node->end_hi && to_merge--) + *--dest = *--node->hi; + *node->dest = dest; + } + else + { + /* Merge directly to output. */ + while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--) + { + if (compare (node->lo - 1, node->hi - 1) <= 0) + write_unique (--node->lo, tfp, temp_output); + else + write_unique (--node->hi, tfp, temp_output); + } + + merged_lo = lo_orig - node->lo; + merged_hi = hi_orig - node->hi; + + if (node->nhi == merged_hi) + { + while (node->lo != node->end_lo && to_merge--) + write_unique (--node->lo, tfp, temp_output); + } + else if (node->nlo == merged_lo) + { + while (node->hi != node->end_hi && to_merge--) + write_unique (--node->hi, tfp, temp_output); + } + } + + /* Update NODE. */ + merged_lo = lo_orig - node->lo; + merged_hi = hi_orig - node->hi; + node->nlo -= merged_lo; + node->nhi -= merged_hi; +} + +/* Into QUEUE, insert NODE if it is not already queued, and if one of + NODE's children has available lines and the other either has + available lines or has exhausted its lines. */ + +static void +queue_check_insert (struct merge_node_queue *queue, struct merge_node *node) +{ + if (! node->queued) + { + bool lo_avail = (node->lo - node->end_lo) != 0; + bool hi_avail = (node->hi - node->end_hi) != 0; + if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo) + queue_insert (queue, node); + } +} + +/* Into QUEUE, insert NODE's parent if the parent can now be worked on. */ + +static void +queue_check_insert_parent (struct merge_node_queue *queue, + struct merge_node *node) +{ + if (node->level > MERGE_ROOT) + { + lock_node (node->parent); + queue_check_insert (queue, node->parent); + unlock_node (node->parent); + } + else if (node->nlo + node->nhi == 0) + { + /* If the MERGE_ROOT NODE has finished merging, insert the + MERGE_END node. */ + queue_insert (queue, node->parent); + } +} + +/* Repeatedly pop QUEUE for a node with lines to merge, and merge at least + some of those lines, until the MERGE_END node is popped. + TOTAL_LINES is the total number of lines. If merging at the top + level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is + null if TFP is standard output. */ + +static void +merge_loop (struct merge_node_queue *queue, + size_t total_lines, FILE *tfp, char const *temp_output) +{ + while (1) + { + struct merge_node *node = queue_pop (queue); + + if (node->level == MERGE_END) + { + unlock_node (node); + /* Reinsert so other threads can pop it. 
*/ + queue_insert (queue, node); + break; + } + mergelines_node (node, total_lines, tfp, temp_output); + queue_check_insert (queue, node); + queue_check_insert_parent (queue, node); + + unlock_node (node); + } +} + + +static void sortlines (struct line *restrict, size_t, size_t, + struct merge_node *, struct merge_node_queue *, + FILE *, char const *); + +/* Thread arguments for sortlines_thread. */ + +struct thread_args +{ + /* Source, i.e., the array of lines to sort. This points just past + the end of the array. */ + struct line *lines; + + /* Number of threads to use. If 0 or 1, sort single-threaded. */ + size_t nthreads; + + /* Number of lines in LINES and DEST. */ + size_t const total_lines; + + /* Merge node. Lines from this node and this node's sibling will merged + to this node's parent. */ + struct merge_node *const node; + + /* The priority queue controlling available work for the entire + internal sort. */ + struct merge_node_queue *const queue; + + /* If at the top level, the file to output to, and the file's name. + If the file is standard output, the file's name is null. */ + FILE *tfp; + char const *output_temp; +}; + +/* Like sortlines, except with a signature acceptable to pthread_create. */ + +static void * +sortlines_thread (void *data) +{ + struct thread_args const *args = data; + sortlines (args->lines, args->nthreads, args->total_lines, + args->node, args->queue, args->tfp, + args->output_temp); + return NULL; +} + +/* Sort lines, possibly in parallel. The arguments are as in struct + thread_args above. + + The algorithm has three phases: node creation, sequential sort, + and binary merge. + + During node creation, sortlines recursively visits each node in the + binary merge tree and creates a NODE structure corresponding to all the + future line merging NODE is responsible for. For each call to + sortlines, half the available threads are assigned to each recursive + call, until a leaf node having only 1 available thread is reached. + + Each leaf node then performs two sequential sorts, one on each half of + the lines it is responsible for. It records in its NODE structure that + there are two sorted sublists available to merge from, and inserts its + NODE into the priority queue. + + The binary merge phase then begins. Each thread drops into a loop + where the thread retrieves a NODE from the priority queue, merges lines + available to that NODE, and potentially insert NODE or its parent back + into the queue if there are sufficient available lines for them to + merge. This continues until all lines at all nodes of the merge tree + have been merged. */ + +static void +sortlines (struct line *restrict lines, size_t nthreads, + size_t total_lines, struct merge_node *node, + struct merge_node_queue *queue, FILE *tfp, char const *temp_output) +{ + size_t nlines = node->nlo + node->nhi; + + /* Calculate thread arguments. */ + size_t lo_threads = nthreads / 2; + size_t hi_threads = nthreads - lo_threads; + pthread_t thread; + struct thread_args args = {lines, lo_threads, total_lines, + node->lo_child, queue, tfp, temp_output}; + + if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines + && pthread_create (&thread, NULL, sortlines_thread, &args) == 0) + { + sortlines (lines - node->nlo, hi_threads, total_lines, + node->hi_child, queue, tfp, temp_output); + pthread_join (thread, NULL); + } + else + { + /* Nthreads = 1, this is a leaf NODE, or pthread_create failed. + Sort with 1 thread. 
*/ + size_t nlo = node->nlo; + size_t nhi = node->nhi; + struct line *temp = lines - total_lines; + if (1 < nhi) + sequential_sort (lines - nlo, nhi, temp - nlo / 2, false); + if (1 < nlo) + sequential_sort (lines, nlo, temp, false); + + /* Update merge NODE. No need to lock yet. */ + node->lo = lines; + node->hi = lines - nlo; + node->end_lo = lines - nlo; + node->end_hi = lines - nlo - nhi; + + queue_insert (queue, node); + merge_loop (queue, total_lines, tfp, temp_output); + } +} + +/* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are + the same as OUTFILE. If found, replace each with the same + temporary copy that can be merged into OUTFILE without destroying + OUTFILE before it is completely read. This temporary copy does not + count as a merge temp, so don't worry about incrementing NTEMPS in + the caller; final cleanup will remove it, not zaptemp. + + This test ensures that an otherwise-erroneous use like + "sort -m -o FILE ... FILE ..." copies FILE before writing to it. + It's not clear that POSIX requires this nicety. + Detect common error cases, but don't try to catch obscure cases like + "cat ... FILE ... | sort -m -o FILE" + where traditional "sort" doesn't copy the input and where + people should know that they're getting into trouble anyway. + Catching these obscure cases would slow down performance in + common cases. */ + +static void +avoid_trashing_input (struct sortfile *files, size_t ntemps, + size_t nfiles, char const *outfile) +{ + struct tempnode *tempcopy = NULL; + + for (size_t i = ntemps; i < nfiles; i++) + { + bool is_stdin = STREQ (files[i].name, "-"); + bool same; + struct stat instat; + + if (outfile && STREQ (outfile, files[i].name) && !is_stdin) + same = true; + else + { + struct stat *outst = get_outstatus (); + if (!outst) + break; + + same = (((is_stdin + ? fstat (STDIN_FILENO, &instat) + : stat (files[i].name, &instat)) + == 0) + && SAME_INODE (instat, *outst)); + } + + if (same) + { + if (! tempcopy) + { + FILE *tftp; + tempcopy = create_temp (&tftp); + mergefiles (&files[i], 0, 1, tftp, tempcopy->name); + } + + files[i].name = tempcopy->name; + files[i].temp = tempcopy; + } + } +} + +/* Scan the input files to ensure all are accessible. + Otherwise exit with a diagnostic. + + This will catch common issues with permissions etc. + but will fail to notice issues where you can open but not read, + like when a directory is specified on some systems. + Catching these obscure cases could slow down performance in + common cases. */ + +static void +check_inputs (char *const *files, size_t nfiles) +{ + for (size_t i = 0; i < nfiles; i++) + { + if (STREQ (files[i], "-")) + continue; + + if (euidaccess (files[i], R_OK) != 0) + sort_die (_("cannot read"), files[i]); + } +} + +/* Ensure a specified output file can be created or written to, + and point stdout to it. Do not truncate the file. + Exit with a diagnostic on failure. */ + +static void +check_output (char const *outfile) +{ + if (outfile) + { + int oflags = O_WRONLY | O_BINARY | O_CLOEXEC | O_CREAT; + int outfd = open (outfile, oflags, MODE_RW_UGO); + if (outfd < 0) + sort_die (_("open failed"), outfile); + move_fd (outfd, STDOUT_FILENO); + } +} + +/* Merge the input FILES. NTEMPS is the number of files at the + start of FILES that are temporary; it is zero at the top level. + NFILES is the total number of files. Put the output in + OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. 
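+ For example, with the default NMERGE of 16 and 100 input files, the
+ loop below performs six 16-way merges into temporaries; the 4
+ leftover inputs plus those 6 temporaries then fit in one final
+ 10-way merge.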
*/ + +static void +merge (struct sortfile *files, size_t ntemps, size_t nfiles, + char const *output_file) +{ + while (nmerge < nfiles) + { + /* Number of input files processed so far. */ + size_t in; + + /* Number of output files generated so far. */ + size_t out; + + /* nfiles % NMERGE; this counts input files that are left over + after all full-sized merges have been done. */ + size_t remainder; + + /* Number of easily-available slots at the next loop iteration. */ + size_t cheap_slots; + + /* Do as many NMERGE-size merges as possible. In the case that + nmerge is bogus, increment by the maximum number of file + descriptors allowed. */ + for (out = in = 0; nmerge <= nfiles - in; out++) + { + FILE *tfp; + struct tempnode *temp = create_temp (&tfp); + size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge), + nmerge, tfp, temp->name); + ntemps -= MIN (ntemps, num_merged); + files[out].name = temp->name; + files[out].temp = temp; + in += num_merged; + } + + remainder = nfiles - in; + cheap_slots = nmerge - out % nmerge; + + if (cheap_slots < remainder) + { + /* So many files remain that they can't all be put into the last + NMERGE-sized output window. Do one more merge. Merge as few + files as possible, to avoid needless I/O. */ + size_t nshortmerge = remainder - cheap_slots + 1; + FILE *tfp; + struct tempnode *temp = create_temp (&tfp); + size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge), + nshortmerge, tfp, temp->name); + ntemps -= MIN (ntemps, num_merged); + files[out].name = temp->name; + files[out++].temp = temp; + in += num_merged; + } + + /* Put the remaining input files into the last NMERGE-sized output + window, so they will be merged in the next pass. */ + memmove (&files[out], &files[in], (nfiles - in) * sizeof *files); + ntemps += out; + nfiles -= in - out; + } + + avoid_trashing_input (files, ntemps, nfiles, output_file); + + /* We aren't guaranteed that this final mergefiles will work, therefore we + try to merge into the output, and then merge as much as we can into a + temp file if we can't. Repeat. */ + + while (true) + { + /* Merge directly into the output file if possible. */ + FILE **fps; + size_t nopened = open_input_files (files, nfiles, &fps); + + if (nopened == nfiles) + { + FILE *ofp = stream_open (output_file, "w"); + if (ofp) + { + mergefps (files, ntemps, nfiles, ofp, output_file, fps); + break; + } + if (errno != EMFILE || nopened <= 2) + sort_die (_("open failed"), output_file); + } + else if (nopened <= 2) + sort_die (_("open failed"), files[nopened].name); + + /* We ran out of file descriptors. Close one of the input + files, to gain a file descriptor. Then create a temporary + file with our spare file descriptor. Retry if that failed + (e.g., some other process could open a file between the time + we closed and tried to create). */ + FILE *tfp; + struct tempnode *temp; + do + { + nopened--; + xfclose (fps[nopened], files[nopened].name); + temp = maybe_create_temp (&tfp, ! (nopened <= 2)); + } + while (!temp); + + /* Merge into the newly allocated temporary. */ + mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name, + fps); + ntemps -= MIN (ntemps, nopened); + files[0].name = temp->name; + files[0].temp = temp; + + memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files); + ntemps++; + nfiles -= nopened - 1; + } +} + +/* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. 
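+ For example, with NTHREADS == 4 the "Get log P" loop below ends with
+ mult == 3, so the input buffer is sized allowing 3 * sizeof (struct
+ line) bytes of 'struct line' bookkeeping per line; a single-threaded
+ sort allows only 1.5 * sizeof (struct line).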
*/ + +static void +sort (char *const *files, size_t nfiles, char const *output_file, + size_t nthreads) +{ + struct buffer buf; + IF_LINT (buf.buf = NULL); + size_t ntemps = 0; + bool output_file_created = false; + + buf.alloc = 0; + + while (nfiles) + { + char const *temp_output; + char const *file = *files; + FILE *fp = xfopen (file, "r"); + FILE *tfp; + + size_t bytes_per_line; + if (nthreads > 1) + { + /* Get log P. */ + size_t tmp = 1; + size_t mult = 1; + while (tmp < nthreads) + { + tmp *= 2; + mult++; + } + bytes_per_line = (mult * sizeof (struct line)); + } + else + bytes_per_line = sizeof (struct line) * 3 / 2; + + if (! buf.alloc) + initbuf (&buf, bytes_per_line, + sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line)); + buf.eof = false; + files++; + nfiles--; + + while (fillbuf (&buf, fp, file)) + { + struct line *line; + + if (buf.eof && nfiles + && (bytes_per_line + 1 + < (buf.alloc - buf.used - bytes_per_line * buf.nlines))) + { + /* End of file, but there is more input and buffer room. + Concatenate the next input file; this is faster in + the usual case. */ + buf.left = buf.used; + break; + } + + saved_line.text = NULL; + line = buffer_linelim (&buf); + if (buf.eof && !nfiles && !ntemps && !buf.left) + { + xfclose (fp, file); + tfp = xfopen (output_file, "w"); + temp_output = output_file; + output_file_created = true; + } + else + { + ++ntemps; + temp_output = create_temp (&tfp)->name; + } + if (1 < buf.nlines) + { + struct merge_node_queue queue; + queue_init (&queue, nthreads); + struct merge_node *merge_tree = + merge_tree_init (nthreads, buf.nlines, line); + + sortlines (line, nthreads, buf.nlines, merge_tree + 1, + &queue, tfp, temp_output); + +#ifdef lint + merge_tree_destroy (nthreads, merge_tree); + queue_destroy (&queue); +#endif + } + else + write_unique (line - 1, tfp, temp_output); + + xfclose (tfp, temp_output); + + if (output_file_created) + goto finish; + } + xfclose (fp, file); + } + + finish: + free (buf.buf); + + if (! output_file_created) + { + struct tempnode *node = temphead; + struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles); + for (size_t i = 0; node; i++) + { + tempfiles[i].name = node->name; + tempfiles[i].temp = node; + node = node->next; + } + merge (tempfiles, ntemps, ntemps, output_file); + free (tempfiles); + } + + reap_all (); +} + +/* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */ + +static void +insertkey (struct keyfield *key_arg) +{ + struct keyfield **p; + struct keyfield *key = xmemdup (key_arg, sizeof *key); + + for (p = &keylist; *p; p = &(*p)->next) + continue; + *p = key; + key->next = NULL; +} + +/* Report a bad field specification SPEC, with extra info MSGID. */ + +static void badfieldspec (char const *, char const *) + ATTRIBUTE_NORETURN; +static void +badfieldspec (char const *spec, char const *msgid) +{ + die (SORT_FAILURE, 0, _("%s: invalid field specification %s"), + _(msgid), quote (spec)); +} + +/* Report incompatible options. */ + +static void incompatible_options (char const *) ATTRIBUTE_NORETURN; +static void +incompatible_options (char const *opts) +{ + die (SORT_FAILURE, 0, _("options '-%s' are incompatible"), (opts)); +} + +/* Check compatibility of ordering options. 
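+ For example, 'sort -k1nR' is rejected here with a diagnostic like
+ "options '-nR' are incompatible", since numeric and random ordering
+ cannot both apply to the same key.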
*/ + +static void +check_ordering_compatibility (void) +{ + struct keyfield *key; + + for (key = keylist; key; key = key->next) + if (1 < (key->numeric + key->general_numeric + key->human_numeric + + key->month + (key->version | key->random | !!key->ignore))) + { + /* The following is too big, but guaranteed to be "big enough". */ + char opts[sizeof short_options]; + /* Clear flags we're not interested in. */ + key->skipsblanks = key->skipeblanks = key->reverse = false; + key_to_opts (key, opts); + incompatible_options (opts); + } +} + +/* Parse the leading integer in STRING and store the resulting value + (which must fit into size_t) into *VAL. Return the address of the + suffix after the integer. If the value is too large, silently + substitute SIZE_MAX. If MSGID is NULL, return NULL after + failure; otherwise, report MSGID and exit on failure. */ + +static char const * +parse_field_count (char const *string, size_t *val, char const *msgid) +{ + char *suffix; + uintmax_t n; + + switch (xstrtoumax (string, &suffix, 10, &n, "")) + { + case LONGINT_OK: + case LONGINT_INVALID_SUFFIX_CHAR: + *val = n; + if (*val == n) + break; + FALLTHROUGH; + case LONGINT_OVERFLOW: + case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR: + *val = SIZE_MAX; + break; + + case LONGINT_INVALID: + if (msgid) + die (SORT_FAILURE, 0, _("%s: invalid count at start of %s"), + _(msgid), quote (string)); + return NULL; + } + + return suffix; +} + +/* Handle interrupts and hangups. */ + +static void +sighandler (int sig) +{ + if (! SA_NOCLDSTOP) + signal (sig, SIG_IGN); + + cleanup (); + + signal (sig, SIG_DFL); + raise (sig); +} + +/* Set the ordering options for KEY specified in S. + Return the address of the first character in S that + is not a valid ordering option. + BLANKTYPE is the kind of blanks that 'b' should skip. */ + +static char * +set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype) +{ + while (*s) + { + switch (*s) + { + case 'b': + if (blanktype == bl_start || blanktype == bl_both) + key->skipsblanks = true; + if (blanktype == bl_end || blanktype == bl_both) + key->skipeblanks = true; + break; + case 'd': + key->ignore = nondictionary; + break; + case 'f': + key->translate = fold_toupper; + break; + case 'g': + key->general_numeric = true; + break; + case 'h': + key->human_numeric = true; + break; + case 'i': + /* Option order should not matter, so don't let -i override + -d. -d implies -i, but -i does not imply -d. */ + if (! key->ignore) + key->ignore = nonprinting; + break; + case 'M': + key->month = true; + break; + case 'n': + key->numeric = true; + break; + case 'R': + key->random = true; + break; + case 'r': + key->reverse = true; + break; + case 'V': + key->version = true; + break; + default: + return (char *) s; + } + ++s; + } + return (char *) s; +} + +/* Initialize KEY. */ + +static struct keyfield * +key_init (struct keyfield *key) +{ + memset (key, 0, sizeof *key); + key->eword = SIZE_MAX; + return key; +} + +int +main (int argc, char **argv) +{ + struct keyfield *key; + struct keyfield key_buf; + struct keyfield gkey; + bool gkey_only = false; + char const *s; + int c = 0; + char checkonly = 0; + bool mergeonly = false; + char *random_source = NULL; + bool need_random = false; + size_t nthreads = 0; + size_t nfiles = 0; + bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL); + int posix_ver = posix2_version (); + bool traditional_usage = ! 
(200112 <= posix_ver && posix_ver < 200809); + char **files; + char *files_from = NULL; + struct Tokens tok; + char const *outfile = NULL; + bool locale_ok; + + initialize_main (&argc, &argv); + set_program_name (argv[0]); + locale_ok = !! setlocale (LC_ALL, ""); + bindtextdomain (PACKAGE, LOCALEDIR); + textdomain (PACKAGE); + + initialize_exit_failure (SORT_FAILURE); + + hard_LC_COLLATE = hard_locale (LC_COLLATE); +#if HAVE_NL_LANGINFO + hard_LC_TIME = hard_locale (LC_TIME); +#endif + + /* Get locale's representation of the decimal point. */ + { + struct lconv const *locale = localeconv (); + + /* If the locale doesn't define a decimal point, or if the decimal + point is multibyte, use the C locale's decimal point. FIXME: + add support for multibyte decimal points. */ + decimal_point = to_uchar (locale->decimal_point[0]); + if (! decimal_point || locale->decimal_point[1]) + decimal_point = '.'; + + /* FIXME: add support for multibyte thousands separators. */ + thousands_sep = to_uchar (*locale->thousands_sep); + if (! thousands_sep || locale->thousands_sep[1]) + thousands_sep = -1; + } + + have_read_stdin = false; + inittables (); + + { + size_t i; + static int const sig[] = + { + /* The usual suspects. */ + SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM, +#ifdef SIGPOLL + SIGPOLL, +#endif +#ifdef SIGPROF + SIGPROF, +#endif +#ifdef SIGVTALRM + SIGVTALRM, +#endif +#ifdef SIGXCPU + SIGXCPU, +#endif +#ifdef SIGXFSZ + SIGXFSZ, +#endif + }; + enum { nsigs = ARRAY_CARDINALITY (sig) }; + +#if SA_NOCLDSTOP + struct sigaction act; + + sigemptyset (&caught_signals); + for (i = 0; i < nsigs; i++) + { + sigaction (sig[i], NULL, &act); + if (act.sa_handler != SIG_IGN) + sigaddset (&caught_signals, sig[i]); + } + + act.sa_handler = sighandler; + act.sa_mask = caught_signals; + act.sa_flags = 0; + + for (i = 0; i < nsigs; i++) + if (sigismember (&caught_signals, sig[i])) + sigaction (sig[i], &act, NULL); +#else + for (i = 0; i < nsigs; i++) + if (signal (sig[i], SIG_IGN) != SIG_IGN) + { + signal (sig[i], sighandler); + siginterrupt (sig[i], 1); + } +#endif + } + signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */ + + /* The signal mask is known, so it is safe to invoke exit_cleanup. */ + atexit (exit_cleanup); + + key_init (&gkey); + gkey.sword = SIZE_MAX; + + files = xnmalloc (argc, sizeof *files); + + while (true) + { + /* Parse an operand as a file after "--" was seen; or if + pedantic and a file was seen, unless the POSIX version + is not 1003.1-2001 and -c was not seen and the operand is + "-o FILE" or "-oFILE". */ + int oi = -1; + + if (c == -1 + || (posixly_correct && nfiles != 0 + && ! (traditional_usage + && ! checkonly + && optind != argc + && argv[optind][0] == '-' && argv[optind][1] == 'o' + && (argv[optind][2] || optind + 1 != argc))) + || ((c = getopt_long (argc, argv, short_options, + long_options, &oi)) + == -1)) + { + if (argc <= optind) + break; + files[nfiles++] = argv[optind++]; + } + else switch (c) + { + case 1: + key = NULL; + if (optarg[0] == '+') + { + bool minus_pos_usage = (optind != argc && argv[optind][0] == '-' + && ISDIGIT (argv[optind][1])); + traditional_usage |= minus_pos_usage && !posixly_correct; + if (traditional_usage) + { + /* Treat +POS1 [-POS2] as a key if possible; but silently + treat an operand as a file if it is not a valid +POS1. */ + key = key_init (&key_buf); + s = parse_field_count (optarg + 1, &key->sword, NULL); + if (s && *s == '.') + s = parse_field_count (s + 1, &key->schar, NULL); + if (! 
(key->sword || key->schar)) + key->sword = SIZE_MAX; + if (! s || *set_ordering (s, key, bl_start)) + key = NULL; + else + { + if (minus_pos_usage) + { + char const *optarg1 = argv[optind++]; + s = parse_field_count (optarg1 + 1, &key->eword, + N_("invalid number after '-'")); + /* When called with a non-NULL message ID, + parse_field_count cannot return NULL. Tell static + analysis tools that dereferencing S is safe. */ + assert (s); + if (*s == '.') + s = parse_field_count (s + 1, &key->echar, + N_("invalid number after '.'")); + if (!key->echar && key->eword) + { + /* obsolescent syntax +A.x -B.y is equivalent to: + -k A+1.x+1,B.y (when y = 0) + -k A+1.x+1,B+1.y (when y > 0) + So eword is decremented as in the -k case + only when the end field (B) is specified and + echar (y) is 0. */ + key->eword--; + } + if (*set_ordering (s, key, bl_end)) + badfieldspec (optarg1, + N_("stray character in field spec")); + } + key->traditional_used = true; + insertkey (key); + } + } + } + if (! key) + files[nfiles++] = optarg; + break; + + case SORT_OPTION: + c = XARGMATCH ("--sort", optarg, sort_args, sort_types); + FALLTHROUGH; + case 'b': + case 'd': + case 'f': + case 'g': + case 'h': + case 'i': + case 'M': + case 'n': + case 'r': + case 'R': + case 'V': + { + char str[2]; + str[0] = c; + str[1] = '\0'; + set_ordering (str, &gkey, bl_both); + } + break; + + case CHECK_OPTION: + c = (optarg + ? XARGMATCH ("--check", optarg, check_args, check_types) + : 'c'); + FALLTHROUGH; + case 'c': + case 'C': + if (checkonly && checkonly != c) + incompatible_options ("cC"); + checkonly = c; + break; + + case COMPRESS_PROGRAM_OPTION: + if (compress_program && !STREQ (compress_program, optarg)) + die (SORT_FAILURE, 0, _("multiple compress programs specified")); + compress_program = optarg; + break; + + case DEBUG_PROGRAM_OPTION: + debug = true; + break; + + case FILES0_FROM_OPTION: + files_from = optarg; + break; + + case 'k': + key = key_init (&key_buf); + + /* Get POS1. */ + s = parse_field_count (optarg, &key->sword, + N_("invalid number at field start")); + if (! key->sword--) + { + /* Provoke with 'sort -k0' */ + badfieldspec (optarg, N_("field number is zero")); + } + if (*s == '.') + { + s = parse_field_count (s + 1, &key->schar, + N_("invalid number after '.'")); + if (! key->schar--) + { + /* Provoke with 'sort -k1.0' */ + badfieldspec (optarg, N_("character offset is zero")); + } + } + if (! (key->sword || key->schar)) + key->sword = SIZE_MAX; + s = set_ordering (s, key, bl_start); + if (*s != ',') + { + key->eword = SIZE_MAX; + key->echar = 0; + } + else + { + /* Get POS2. */ + s = parse_field_count (s + 1, &key->eword, + N_("invalid number after ','")); + if (! 
key->eword--) + { + /* Provoke with 'sort -k1,0' */ + badfieldspec (optarg, N_("field number is zero")); + } + if (*s == '.') + { + s = parse_field_count (s + 1, &key->echar, + N_("invalid number after '.'")); + } + s = set_ordering (s, key, bl_end); + } + if (*s) + badfieldspec (optarg, N_("stray character in field spec")); + insertkey (key); + break; + + case 'm': + mergeonly = true; + break; + + case NMERGE_OPTION: + specify_nmerge (oi, c, optarg); + break; + + case 'o': + if (outfile && !STREQ (outfile, optarg)) + die (SORT_FAILURE, 0, _("multiple output files specified")); + outfile = optarg; + break; + + case RANDOM_SOURCE_OPTION: + if (random_source && !STREQ (random_source, optarg)) + die (SORT_FAILURE, 0, _("multiple random sources specified")); + random_source = optarg; + break; + + case 's': + stable = true; + break; + + case 'S': + specify_sort_size (oi, c, optarg); + break; + + case 't': + { + char newtab = optarg[0]; + if (! newtab) + die (SORT_FAILURE, 0, _("empty tab")); + if (optarg[1]) + { + if (STREQ (optarg, "\\0")) + newtab = '\0'; + else + { + /* Provoke with 'sort -txx'. Complain about + "multi-character tab" instead of "multibyte tab", so + that the diagnostic's wording does not need to be + changed once multibyte characters are supported. */ + die (SORT_FAILURE, 0, _("multi-character tab %s"), + quote (optarg)); + } + } + if (tab != TAB_DEFAULT && tab != newtab) + die (SORT_FAILURE, 0, _("incompatible tabs")); + tab = newtab; + } + break; + + case 'T': + add_temp_dir (optarg); + break; + + case PARALLEL_OPTION: + nthreads = specify_nthreads (oi, c, optarg); + break; + + case 'u': + unique = true; + break; + + case 'y': + /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x + through Solaris 7. It is also accepted by many non-Solaris + "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5. + -y is marked as obsolete starting with Solaris 8 (1999), but is + still accepted as of Solaris 10 prerelease (2004). + + Solaris 2.5.1 "sort -y 100" reads the input file "100", but + emulate Solaris 8 and 9 "sort -y 100" which ignores the "100", + and which in general ignores the argument after "-y" if it + consists entirely of digits (it can even be empty). */ + if (optarg == argv[optind - 1]) + { + char const *p; + for (p = optarg; ISDIGIT (*p); p++) + continue; + optind -= (*p != '\0'); + } + break; + + case 'z': + eolchar = 0; + break; + + case_GETOPT_HELP_CHAR; + + case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); + + default: + usage (SORT_FAILURE); + } + } + + if (files_from) + { + /* When using --files0-from=F, you may not specify any files + on the command-line. */ + if (nfiles) + { + error (0, 0, _("extra operand %s"), quoteaf (files[0])); + fprintf (stderr, "%s\n", + _("file operands cannot be combined with --files0-from")); + usage (SORT_FAILURE); + } + + FILE *stream = xfopen (files_from, "r"); + + readtokens0_init (&tok); + + if (! readtokens0 (stream, &tok)) + die (SORT_FAILURE, 0, _("cannot read file names from %s"), + quoteaf (files_from)); + xfclose (stream, files_from); + + if (tok.n_tok) + { + free (files); + files = tok.tok; + nfiles = tok.n_tok; + for (size_t i = 0; i < nfiles; i++) + { + if (STREQ (files[i], "-")) + die (SORT_FAILURE, 0, _("when reading file names from stdin, " + "no file name of %s allowed"), + quoteaf (files[i])); + else if (files[i][0] == '\0') + { + /* Using the standard 'filename:line-number:' prefix here is + not totally appropriate, since NUL is the separator, + not NL, but it might be better than nothing. 
*/ + unsigned long int file_number = i + 1; + die (SORT_FAILURE, 0, + _("%s:%lu: invalid zero-length file name"), + quotef (files_from), file_number); + } + } + } + else + die (SORT_FAILURE, 0, _("no input from %s"), + quoteaf (files_from)); + } + + /* Inheritance of global options to individual keys. */ + for (key = keylist; key; key = key->next) + { + if (default_key_compare (key) && !key->reverse) + { + key->ignore = gkey.ignore; + key->translate = gkey.translate; + key->skipsblanks = gkey.skipsblanks; + key->skipeblanks = gkey.skipeblanks; + key->month = gkey.month; + key->numeric = gkey.numeric; + key->general_numeric = gkey.general_numeric; + key->human_numeric = gkey.human_numeric; + key->version = gkey.version; + key->random = gkey.random; + key->reverse = gkey.reverse; + } + + need_random |= key->random; + } + + if (!keylist && !default_key_compare (&gkey)) + { + gkey_only = true; + insertkey (&gkey); + need_random |= gkey.random; + } + + check_ordering_compatibility (); + + if (debug) + { + if (checkonly || outfile) + { + static char opts[] = "X --debug"; + opts[0] = (checkonly ? checkonly : 'o'); + incompatible_options (opts); + } + + /* Always output the locale in debug mode, since this + is such a common source of confusion. */ + + /* OpenBSD can only set some categories with LC_ALL above, + so set LC_COLLATE explicitly to flag errors. */ + if (locale_ok) + locale_ok = !! setlocale (LC_COLLATE, ""); + if (! locale_ok) + error (0, 0, "%s", _("failed to set locale")); + if (hard_LC_COLLATE) + error (0, 0, _("text ordering performed using %s sorting rules"), + quote (setlocale (LC_COLLATE, NULL))); + else + error (0, 0, "%s", + _("text ordering performed using simple byte comparison")); + + key_warnings (&gkey, gkey_only); + } + + reverse = gkey.reverse; + + if (need_random) + random_md5_state_init (random_source); + + if (temp_dir_count == 0) + { + char const *tmp_dir = getenv ("TMPDIR"); + add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR); + } + + if (nfiles == 0) + { + nfiles = 1; + free (files); + files = xmalloc (sizeof *files); + *files = (char *) "-"; + } + + /* Need to re-check that we meet the minimum requirement for memory + usage with the final value for NMERGE. */ + if (0 < sort_size) + sort_size = MAX (sort_size, MIN_SORT_SIZE); + + if (checkonly) + { + if (nfiles > 1) + die (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"), + quoteaf (files[1]), checkonly); + + if (outfile) + { + static char opts[] = {0, 'o', 0}; + opts[0] = checkonly; + incompatible_options (opts); + } + + /* POSIX requires that sort return 1 IFF invoked with -c or -C and the + input is not properly sorted. */ + return check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER; + } + + /* Check all inputs are accessible, or exit immediately. */ + check_inputs (files, nfiles); + + /* Check output is writable, or exit immediately. */ + check_output (outfile); + + if (mergeonly) + { + struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles); + + for (size_t i = 0; i < nfiles; ++i) + sortfiles[i].name = files[i]; + + merge (sortfiles, 0, nfiles, outfile); + IF_LINT (free (sortfiles)); + } + else + { + if (!nthreads) + { + unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE); + nthreads = MIN (np, DEFAULT_MAX_THREADS); + } + + /* Avoid integer overflow later. 
*/
+      size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
+      nthreads = MIN (nthreads, nthreads_max);
+
+      sort (files, nfiles, outfile, nthreads);
+    }
+
+#ifdef lint
+  if (files_from)
+    readtokens0_free (&tok);
+  else
+    free (files);
+#endif
+
+  if (have_read_stdin && fclose (stdin) == EOF)
+    sort_die (_("close failed"), "-");
+
+  return EXIT_SUCCESS;
+}