summaryrefslogtreecommitdiffstats
path: root/src/parsort
diff options
context:
space:
mode:
Diffstat (limited to 'src/parsort')
-rwxr-xr-xsrc/parsort459
1 files changed, 459 insertions, 0 deletions
diff --git a/src/parsort b/src/parsort
new file mode 100755
index 0000000..41a75da
--- /dev/null
+++ b/src/parsort
@@ -0,0 +1,459 @@
+#!/usr/bin/perl
+
+# SPDX-FileCopyrightText: 2021-2024 Ole Tange, http://ole.tange.dk and Free Software Foundation, Inc.
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+=pod
+
+=head1 NAME
+
+parsort - Sort (big files) in parallel
+
+
+=head1 SYNOPSIS
+
+B<parsort> I<options for sort>
+
+
+=head1 DESCRIPTION
+
+B<parsort> uses GNU B<sort> to sort in parallel. It works just like
+B<sort> but faster on inputs with more than 1 M lines, if you have a
+multicore machine.
+
+Hopefully these ideas will make it into GNU B<sort> in the future.
+
+
+=head1 OPTIONS
+
+Same as B<sort>. Except:
+
+=over 4
+
+=item B<--parallel=>I<N>
+
+Change the number of sorts run concurrently to I<N>. I<N> will be
+increased to number of files if B<parsort> is given more than I<N>
+files.
+
+=back
+
+
+=head1 EXAMPLE
+
+Sort files:
+
+ parsort *.txt > sorted.txt
+
+Sort stdin (standard input) numerically:
+
+ cat numbers | parsort -n > sorted.txt
+
+
+=head1 PERFORMANCE
+
+B<parsort> is faster on files than on stdin (standard input), because
+different parts of a file can be read in parallel.
+
+On a 48 core machine you should see a speedup of 3x over B<sort>.
+
+
+=head1 AUTHOR
+
+Copyright (C) 2020-2024 Ole Tange,
+http://ole.tange.dk and Free Software Foundation, Inc.
+
+
+=head1 LICENSE
+
+Copyright (C) 2012 Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+=head1 DEPENDENCIES
+
+B<parsort> uses B<sort>, B<bash>, and B<parallel>.
+
+
+=head1 SEE ALSO
+
+B<sort>
+
+
+=cut
+
+use strict;
+use Getopt::Long;
+use POSIX qw(mkfifo);
+
+# Allow bundling of short options and stop option parsing at the first
+# non-option argument (require_order).
+Getopt::Long::Configure("bundling","require_order");
+
+# GetOptions() removes the parsed options from @ARGV. Keep a copy of the
+# full command line so the options can be handed on to 'sort' below.
+my @ARGV_before = @ARGV;
+
+# Options read elsewhere in this script get their own $opt:: variable.
+# Options stored in $opt::dummy are only declared so GetOptions() accepts
+# them; their values are not read from $opt::dummy anywhere in this file.
+GetOptions(
+ "debug|D" => \$opt::D,
+ "version" => \$opt::version,
+ "verbose|v" => \$opt::verbose,
+ "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
+ "d|dictionary-order" => \$opt::dictionary_order,
+ "f|ignore-case" => \$opt::ignore_case,
+ "g|general-numeric-sort" => \$opt::general_numeric_sort,
+ "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
+ "M|month-sort" => \$opt::month_sort,
+ "h|human-numeric-sort" => \$opt::human_numeric_sort,
+ "n|numeric-sort" => \$opt::numeric_sort,
+ "N|numascii" => \$opt::numascii,
+ "r|reverse" => \$opt::reverse,
+ "R|random-sort" => \$opt::random_sort,
+ "sort=s" => \$opt::sort,
+ "V|version-sort" => \$opt::version_sort,
+ "k|key=s" => \@opt::key,
+ "t|field-separator=s" => \$opt::field_separator,
+ "z|zero-terminated" => \$opt::zero_terminated,
+ "files0-from=s" => \$opt::files0_from,
+ "random-source=s" => \$opt::dummy,
+ "batch-size=s" => \$opt::dummy,
+ "check=s" => \$opt::dummy,
+ "c" => \$opt::dummy,
+ "C" => \$opt::dummy,
+ "compress-program=s" => \$opt::dummy,
+ "T|temporary-directory=s" => \$opt::dummy,
+ "parallel=s" => \$opt::parallel,
+ "u|unique" => \$opt::dummy,
+ "S|buffer-size=s" => \$opt::dummy,
+ "s|stable" => \$opt::dummy,
+ "help" => \$opt::dummy,
+ ) || exit(255);
+# Name of this program without the leading directories
+$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
+$Global::version = 20240222;
+if($opt::version) { version(); exit 0; }
+# Remove -D and --parallel=N
+# The slice is the leading part of the original command line that
+# GetOptions() removed from @ARGV (what is left in @ARGV is the tail).
+my @s = (grep { ! /^-D$|^--parallel=\S+$/ }
+ @ARGV_before[0..($#ARGV_before-$#ARGV-1)]);
+my @sortoptions;
+while(@s) {
+ my $o = shift @s;
+ # Remove '--parallel N'
+ if($o eq "--parallel") {
+ # Discard the value that follows '--parallel', too
+ $o = shift @s;
+ } else {
+ push @sortoptions, $o;
+ }
+}
+# The options are interpolated into shell command lines later on,
+# so they are stored shell quoted.
+@Global::sortoptions = shell_quote(@sortoptions);
+# Default directory for the fifos created by tmpname()
+$ENV{'TMPDIR'} ||= "/tmp";
+
+sub merge {
+    # Combine commands into a single command that merges their output
+    # through a binary tree of 'sort -m'
+    # Input:
+    #   @_ = commands to 'cat' (part of) a file
+    #   'cat a' 'cat b' 'cat c' =>
+    #   sort -m <(sort -m <(cat a) <(cat b)) <(sort -m <(cat c))
+    # Uses:
+    #   @Global::sortoptions
+    # Returns:
+    #   @level = the combined command (the chomped input if it
+    #     contains fewer than two commands)
+    my @level = @_;
+    chomp(@level);
+    until($#level <= 0) {
+        my @next_level;
+        while(@level) {
+            # Take the commands two at a time. The last command of an
+            # odd-sized level has no partner: $right is then undef
+            my ($left, $right) = splice(@level, 0, 2);
+            $left &&= "<($left)";
+            $right &&= "<($right)";
+            # The trailing 'cat' looks useless, but it increases
+            # performance dramatically.
+            push @next_level,
+                "sort -m @Global::sortoptions $left $right | cat";
+        }
+        @level = @next_level;
+    }
+    return @level;
+}
+
+sub sort_files {
+    # Sort the given files in parallel and write the result to stdout
+    # Input:
+    #   @files = names of the files to sort
+    # Uses:
+    #   $opt::zero_terminated
+    #   $opt::parallel
+    #   @Global::sortoptions
+    # Returns: N/A
+    my @files = @_;
+    # Let GNU Parallel generate the commands to read parts of files
+    # The commands split at \n (or \0)
+    # and there will be at least one for each CPU thread
+    my @subopt;
+    # parallel is started without a shell, so nothing removes shell
+    # quoting: qw(--recend "\0") would pass the double quotes as part of
+    # the value. Pass the two characters \0 instead
+    # (NOTE(review): relies on GNU Parallel converting \0 to NUL).
+    if($opt::zero_terminated) { push @subopt, '--recend', '\0'; }
+    if($opt::parallel) { push @subopt, '--jobs', $opt::parallel; }
+    # $uniq is needed because @files could contain \n
+    my $uniq = join "", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..20);
+    if(not open(my $par,"-|",qw(parallel), @subopt,
+                qw(--pipepart --block -1 --dryrun -vv sort),
+                @Global::sortoptions, $uniq, '::::', @files)) {
+        ::error("Cannot run 'parallel': $!");
+        exit(255);
+    } else {
+        # Generated commands:
+        #   <file perl-catter | (sort ... $uniq )
+        # Use $uniq to split into commands
+        # (We cannot use \n because 'file' may contain newline)
+        my @cmd = map { "$_)\n" } split(/$uniq[)]\n/, join("",<$par>));
+        debug(1,@cmd);
+        close $par;
+        @cmd = merge(@cmd);
+        # The command uses <(...) so it is incompatible with /bin/sh
+        my $bash;
+        if(not open($bash,"|-","bash")) {
+            ::error("Cannot run 'bash': $!");
+            exit(255);
+        }
+        print $bash @cmd;
+        close $bash;
+    }
+}
+
+sub sort_stdin {
+    # Sort stdin in parallel and write the result to stdout
+    # Spread the input between n processes that each sort
+    # n = number of CPU threads (or --parallel N)
+    # Uses:
+    #   $opt::parallel
+    #   $opt::zero_terminated
+    #   @Global::sortoptions
+    # Returns: N/A
+    my $numthreads;
+    chomp($numthreads = $opt::parallel || `parallel --number-of-threads`);
+    # tmpfifo() creates the fifo, so no further mkfifo is needed
+    my @fifos = map { tmpfifo() } 1..$numthreads;
+    # This trick removes the fifo as soon as it is connected in the other end
+    # (rm fifo; ...) < fifo
+    my @cmd = (map { "(rm $_; sort @Global::sortoptions) < $_" }
+               map { Q($_) } @fifos);
+    @cmd = merge(@cmd);
+    my $pid = fork;
+    if(not defined $pid) {
+        # fork failed: do not let the parent exec parallel
+        my $err = $!;
+        unlink @fifos;
+        ::error("Cannot fork: $err");
+        exit(255);
+    }
+    if($pid == 0) {
+        # Child: distribute stdin round robin to the fifos
+        # parallel is started without a shell, so the record separator
+        # is passed as the two characters \0 without shell quotes
+        # (NOTE(review): relies on GNU Parallel converting \0 to NUL).
+        my @subopt = $opt::zero_terminated ? ('--recend', '\0') : ();
+        exec(qw(parallel -0 -j), $numthreads, @subopt,
+             # 286k is the best mean value after testing 250..350
+             qw(--block 286k --pipe --roundrobin cat > {} :::),@fifos);
+        # Only reached if exec failed: the child must not continue
+        # into the parent's code below
+        ::error("Cannot run 'parallel': $!");
+        exit(255);
+    }
+    # Parent: merge the output of the sorts reading from the fifos
+    # The command uses <(...) so it is incompatible with /bin/sh
+    my $bash;
+    if(not open($bash,"|-","bash")) {
+        ::error("Cannot run 'bash': $!");
+        exit(255);
+    }
+    print $bash @cmd;
+    close $bash;
+}
+
+sub tmpname {
+    # Select a name in $TMPDIR that does not exist
+    # Do not create the file as it may be used for creating a socket (by tmux)
+    # Input:
+    #   $prefix = first part of the file name
+    # Uses:
+    #   $ENV{'TMPDIR'}
+    # Returns:
+    #   $candidate = $TMPDIR/$prefix followed by 5 random chars
+    # Exits with 255 if $TMPDIR is missing or not writable
+    my $prefix = shift;
+    my $dir = $ENV{'TMPDIR'};
+    unless(-w $dir) {
+        if(-e $dir) {
+            ::error("Tmpdir '${dir}' is not writable.","Try 'chmod +w ".
+                    Q($dir)."'");
+        } else {
+            ::error("Tmpdir '${dir}' does not exist.","Try 'mkdir ".
+                    Q($dir)."'");
+        }
+        exit(255);
+    }
+    my $candidate;
+    while(1) {
+        my $suffix = join "", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..5);
+        $candidate = $dir."/".$prefix.$suffix;
+        # Keep drawing names until one is found that is not in use
+        last if not -e $candidate;
+    }
+    return $candidate;
+}
+
+sub tmpfifo {
+    # Find an unused name and mkfifo on it
+    # Returns:
+    #   $tmpfifo = name of the newly created fifo
+    # Exits with 255 if the fifo cannot be created
+    while(1) {
+        my $tmpfifo = tmpname("psort");
+        mkfifo($tmpfifo,0600) and return $tmpfifo;
+        # tmpname() only checks that the name is unused: the name can be
+        # taken before mkfifo runs. Try again with a new name then.
+        $!{EEXIST} and next;
+        ::error("Cannot create fifo '".$tmpfifo."': $!");
+        exit(255);
+    }
+}
+
+sub debug {
+    # Print debug output to stderr if -D/--debug is given
+    # Input:
+    #   $level = first argument; it is never printed
+    #   @msg = strings to print (undef is printed as "")
+    # Uses:
+    #   $opt::D
+    # Returns: N/A
+    $opt::D or return;
+    my ($level, @msg) = @_;
+    # map, not grep: grep would not replace undef, it would drop every
+    # false value, including the defined string "0"
+    print STDERR map { defined $_ ? $_ : "" } @msg;
+}
+
+sub version() {
+    # Print version and license information to stdout
+    # Uses:
+    #   $Global::progname
+    #   $Global::version
+    # Returns: N/A
+    my @lines = (
+        "GNU $Global::progname $Global::version",
+        "Copyright (C) 2020-2024 Ole Tange, http://ole.tange.dk and Free Software",
+        "Foundation, Inc.",
+        "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>",
+        "This is free software: you are free to change and redistribute it.",
+        "GNU $Global::progname comes with no warranty.",
+        "",
+        "Web site: https://www.gnu.org/software/parallel\n",
+    );
+    print join("\n", @lines);
+}
+
+sub shell_quote(@) {
+    # Quote strings for the shell
+    # Input:
+    #   @strings = strings to be quoted
+    # Returns:
+    #   list context: @quoted = each string quoted as needed by the shell
+    #   scalar context: the quoted strings joined by a space
+    my @quoted = map { Q($_) } @_;
+    if(wantarray) {
+        return @quoted;
+    }
+    return join(" ", @quoted);
+}
+
+sub shell_quote_scalar_rc($) {
+    # Quote for the rc-shell
+    # Inputs:
+    #   $string = string to be quoted
+    # Returns:
+    #   $quoted = quoted string (undef is returned as undef)
+    my $quoted = $_[0];
+    defined $quoted or return $quoted;
+    # Both substitutions must always run, in this order:
+    # first double every ', then wrap each run of special chars in '...'
+    my $doubled = ($quoted =~ s/'/''/g);
+    my $wrapped = ($quoted =~ s/[\n\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]+/'$&'/go);
+    if($doubled or $wrapped) {
+        # A string was replaced
+        # No need to test for "" or \0
+        return $quoted;
+    }
+    if($quoted eq "") {
+        return "''";
+    }
+    if($quoted eq "\0") {
+        return "";
+    }
+    return $quoted;
+}
+
+sub shell_quote_scalar_csh($) {
+    # Quote for (t)csh
+    # Inputs:
+    #   $string = string to be quoted
+    # Returns:
+    #   $quoted = quoted string (undef is returned as undef)
+    my $quoted = $_[0];
+    defined $quoted or return $quoted;
+    # Both substitutions must always run, in this order:
+    # first \-quote each special char ...
+    my $escaped = ($quoted =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]/\\$&/go);
+    # ... then quote newline in csh as "\<newline>"
+    my $newlines = ($quoted =~ s/[\n]/"\\\n"/go);
+    if($escaped or $newlines) {
+        # A string was replaced
+        # No need to test for "" or \0
+        return $quoted;
+    }
+    if($quoted eq "") {
+        return "''";
+    }
+    if($quoted eq "\0") {
+        return "";
+    }
+    return $quoted;
+}
+
+sub shell_quote_scalar_default($) {
+    # Quote for other shells (Bourne compatibles)
+    # Inputs:
+    #   $string = string to be quoted
+    # Returns:
+    #   $shell_quoted = string quoted as needed by the shell
+    my $str = $_[0];
+    # The empty string has to be quoted to survive as an argument
+    return "''" if $str eq "";
+    # Only these chars are known to be safe without quoting
+    return $str unless $str =~ /[^-_.+a-z0-9\/]/i;
+    # "-quote single quotes
+    $str =~ s/'/'"'"'/g;
+    # '-quote entire string
+    $str = "'".$str."'";
+    # Remove unneeded '' at ends
+    # (two substitutions are faster than s/^''|''$//g)
+    $str =~ s/^''//;
+    $str =~ s/''$//;
+    return $str;
+}
+
+sub shell_quote_scalar($) {
+ # Quote the string so the shell will not expand any special chars
+ # Inputs:
+ # $string = string to be quoted
+ # Uses:
+ # $Global::cshell
+ # $Global::shell
+ # Returns:
+ # $shell_quoted = string quoted as needed by the shell
+
+ # Speed optimization: Choose the correct shell_quote_scalar_*
+ # and call that directly from now on
+ # Replacing a sub at run time warns; silence that warning here
+ no warnings 'redefine';
+ if($Global::cshell) {
+ # (t)csh
+ *shell_quote_scalar = \&shell_quote_scalar_csh;
+ } elsif($Global::shell =~ m:(^|/)rc$:) {
+ # rc-shell
+ *shell_quote_scalar = \&shell_quote_scalar_rc;
+ } else {
+ # other shells
+ # NOTE(review): neither $Global::cshell nor $Global::shell is
+ # assigned in the visible part of this file, in which case this
+ # branch is the one taken - confirm.
+ *shell_quote_scalar = \&shell_quote_scalar_default;
+ }
+ # The sub is now redefined. Call it
+ return shell_quote_scalar($_[0]);
+}
+
+sub Q($) {
+ # Q alias for ::shell_quote_scalar
+ # Inputs:
+ # $string = string to be quoted
+ # Returns:
+ # $ret = string quoted as needed by the shell
+
+ # Quote first, then alias: the first call of shell_quote_scalar
+ # replaces itself with the shell specific quoting sub, so the
+ # reference taken below points at that replacement.
+ my $ret = shell_quote_scalar($_[0]);
+ no warnings 'redefine';
+ *Q = \&::shell_quote_scalar;
+ return $ret;
+}
+
+
+sub status(@) {
+    # Print each message followed by a newline and flush the filehandle
+    # Input:
+    #   @_ = messages to print
+    # Uses:
+    #   $Global::status_fd = filehandle to print to (default: STDERR)
+    # Returns: N/A
+    my @lines = @_;
+    my $out = $Global::status_fd || *STDERR;
+    print {$out} map { ($_, "\n") } @lines;
+    $out->flush();
+}
+
+sub status_no_nl(@) {
+    # Print the messages as they are (no newline is added) and flush
+    # the filehandle
+    # Input:
+    #   @_ = strings to print
+    # Uses:
+    #   $Global::status_fd = filehandle to print to (default: STDERR)
+    # Returns: N/A
+    my @parts = @_;
+    my $out = $Global::status_fd || *STDERR;
+    print {$out} @parts;
+    $out->flush();
+}
+
+sub warning(@) {
+    # Print each message as a warning line: "<progname>: Warning: <msg>"
+    # Input:
+    #   @_ = warning messages
+    # Uses:
+    #   $Global::progname (default: "parsort")
+    # Returns: N/A
+    my @messages = @_;
+    my $progname = $Global::progname || "parsort";
+    my @out;
+    for my $msg (@messages) {
+        push @out, $progname, ": Warning: ", $msg, "\n";
+    }
+    status_no_nl(@out);
+}
+
+{
+    # Warnings that have already been printed (key: the joined messages)
+    my %warnings;
+    sub warning_once(@) {
+        # Print a warning, but only the first time it is given
+        # Input:
+        #   @_ = warning messages
+        # Uses:
+        #   $Global::progname (default: "parsort")
+        # Returns: N/A
+        my @w = @_;
+        my $prog = $Global::progname || "parsort";
+        # An array used as a hash subscript is evaluated in scalar
+        # context, so $warnings{@w} would key on the number of messages
+        # and make different warnings suppress each other.
+        # Key on the text of the messages instead.
+        $warnings{join($;, @w)}++ or
+            status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
+    }
+}
+
+sub error(@) {
+    # Print each message as an error line: "<progname>: Error: <msg>"
+    # Input:
+    #   @_ = error messages
+    # Uses:
+    #   $Global::progname (default: "parsort")
+    # Returns: N/A
+    my @messages = @_;
+    my $prefix = ($Global::progname || "parsort").": Error: ";
+    status(map { $prefix.$_ } @messages);
+}
+
+sub die_bug($) {
+    # Tell the user how to report a bug and exit with 255
+    # Input:
+    #   $bugid = id that identifies where the bug was detected
+    # Uses:
+    #   $Global::progname
+    #   $Global::version
+    # Returns: never returns
+    my ($bugid) = @_;
+    my @report = (
+        "$Global::progname: This should not happen. You have found a bug. ",
+        "Please follow\n",
+        "https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n",
+        "\n",
+        "Include this in the report:\n",
+        "* The version number: $Global::version\n",
+        "* The bugid: $bugid\n",
+        "* The command line being run\n",
+        "* The files being read (put the files on a webserver if they are big)\n",
+        "\n",
+        "If you get the error on smaller/fewer files, please include those instead.\n",
+    );
+    print STDERR @report;
+    exit(255);
+}
+
+# Select the input: files on the command line, files listed in
+# --files0-from, or stdin
+if(@ARGV) {
+    sort_files(@ARGV);
+} elsif(length $opt::files0_from) {
+    # The file contains NUL separated file names
+    my $fh;
+    if(not open($fh,"<",$opt::files0_from)) {
+        ::error("Cannot open '".$opt::files0_from."': $!");
+        exit(255);
+    }
+    my @files;
+    {
+        # Only change the record separator while reading the names:
+        # merge() uses chomp() to strip "\n" from the generated commands,
+        # and chomp() strips $/
+        local $/ = "\0";
+        @files = <$fh>;
+        chomp(@files);
+    }
+    close $fh;
+    sort_files(@files);
+} else {
+    sort_stdin();
+}
+
+# Test
+# -z
+# OK: cat bigfile | parsort
+# OK: parsort -k4n files*.txt
+# OK: parsort files*.txt
+# OK: parsort "file with space"
+