diff options
Diffstat (limited to 'src/parsort')
-rwxr-xr-x | src/parsort | 459 |
1 files changed, 459 insertions, 0 deletions
diff --git a/src/parsort b/src/parsort new file mode 100755 index 0000000..41a75da --- /dev/null +++ b/src/parsort @@ -0,0 +1,459 @@ +#!/usr/bin/perl + +# SPDX-FileCopyrightText: 2021-2024 Ole Tange, http://ole.tange.dk and Free Software and Foundation, Inc. +# SPDX-License-Identifier: GPL-3.0-or-later + +=pod + +=head1 NAME + +parsort - Sort (big files) in parallel + + +=head1 SYNOPSIS + +B<parsort> I<options for sort> + + +=head1 DESCRIPTION + +B<parsort> uses GNU B<sort> to sort in parallel. It works just like +B<sort> but faster on inputs with more than 1 M lines, if you have a +multicore machine. + +Hopefully these ideas will make it into GNU B<sort> in the future. + + +=head1 OPTIONS + +Same as B<sort>. Except: + +=over 4 + +=item B<--parallel=>I<N> + +Change the number of sorts run concurrently to I<N>. I<N> will be +increased to number of files if B<parsort> is given more than I<N> +files. + +=back + + +=head1 EXAMPLE + +Sort files: + + parsort *.txt > sorted.txt + +Sort stdin (standard input) numerically: + + cat numbers | parsort -n > sorted.txt + + +=head1 PERFORMANCE + +B<parsort> is faster on files than on stdin (standard input), because +different parts of a file can be read in parallel. + +On a 48 core machine you should see a speedup of 3x over B<sort>. + + +=head1 AUTHOR + +Copyright (C) 2020-2024 Ole Tange, +http://ole.tange.dk and Free Software Foundation, Inc. + + +=head1 LICENSE + +Copyright (C) 2012 Free Software Foundation, Inc. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +at your option any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. + + +=head1 DEPENDENCIES + +B<parsort> uses B<sort>, B<bash>, and B<parallel>. + + +=head1 SEE ALSO + +B<sort> + + +=cut + +use strict; +use Getopt::Long; +use POSIX qw(mkfifo); + +Getopt::Long::Configure("bundling","require_order"); + +my @ARGV_before = @ARGV; + +GetOptions( + "debug|D" => \$opt::D, + "version" => \$opt::version, + "verbose|v" => \$opt::verbose, + "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks, + "d|dictionary-order" => \$opt::dictionary_order, + "f|ignore-case" => \$opt::ignore_case, + "g|general-numeric-sort" => \$opt::general_numeric_sort, + "i|ignore-nonprinting" => \$opt::ignore_nonprinting, + "M|month-sort" => \$opt::month_sort, + "h|human-numeric-sort" => \$opt::human_numeric_sort, + "n|numeric-sort" => \$opt::numeric_sort, + "N|numascii" => \$opt::numascii, + "r|reverse" => \$opt::reverse, + "R|random-sort" => \$opt::random_sort, + "sort=s" => \$opt::sort, + "V|version-sort" => \$opt::version_sort, + "k|key=s" => \@opt::key, + "t|field-separator=s" => \$opt::field_separator, + "z|zero-terminated" => \$opt::zero_terminated, + "files0-from=s" => \$opt::files0_from, + "random-source=s" => \$opt::dummy, + "batch-size=s" => \$opt::dummy, + "check=s" => \$opt::dummy, + "c" => \$opt::dummy, + "C" => \$opt::dummy, + "compress-program=s" => \$opt::dummy, + "T|temporary-directory=s" => \$opt::dummy, + "parallel=s" => \$opt::parallel, + "u|unique" => \$opt::dummy, + "S|buffer-size=s" => \$opt::dummy, + "s|stable" => \$opt::dummy, + "help" => \$opt::dummy, + ) || exit(255); +$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1]; +$Global::version = 20240222; +if($opt::version) { version(); exit 0; } +# Remove -D and --parallel=N +my @s = (grep { ! /^-D$|^--parallel=\S+$/ } + @ARGV_before[0..($#ARGV_before-$#ARGV-1)]); +my @sortoptions; +while(@s) { + my $o = shift @s; + # Remove '--parallel N' + if($o eq "--parallel") { + $o = shift @s; + } else { + push @sortoptions, $o; + } +} +@Global::sortoptions = shell_quote(@sortoptions); +$ENV{'TMPDIR'} ||= "/tmp"; + +sub merge { + # Input: + # @cmd = commands to 'cat' (part of) a file + # 'cat a' 'cat b' 'cat c' => + # sort -m <(sort -m <(cat a) <(cat b)) <(sort -m <(cat c)) + my @cmd = @_; + chomp(@cmd); + while($#cmd > 0) { + my @tmp; + while($#cmd >= 0) { + my $a = shift @cmd; + my $b = shift @cmd; + $a &&= "<($a)"; + $b &&= "<($b)"; + # This looks like useless use of 'cat', but contrary to + # naive belief it increases performance dramatically. + push @tmp, "sort -m @Global::sortoptions $a $b | cat" + } + @cmd = @tmp; + } + return @cmd; +} + +sub sort_files { + # Input is files + my @files = @_; + # Let GNU Parallel generate the commands to read parts of files + # The commands split at \n (or \0) + # and there will be at least one for each CPU thread + my @subopt; + if($opt::zero_terminated) { push @subopt, qw(--recend "\0"); } + if($opt::parallel) { push @subopt, qw(--jobs), $opt::parallel; } + # $uniq is needed because @files could contain \n + my $uniq = join "", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..20); + open(my $par,"-|",qw(parallel), @subopt, + qw(--pipepart --block -1 --dryrun -vv sort), + @Global::sortoptions, $uniq, '::::', @files) || die; + # Generated commands: + # <file perl-catter | (sort ... $uniq ) + # Use $uniq to split into commands + # (We cannot use \n because 'file' may contain newline) + my @cmd = map { "$_)\n" } split(/$uniq[)]\n/, join("",<$par>)); + debug(1,@cmd); + close $par; + @cmd = merge(@cmd); + # The command uses <(...) so it is incompatible with /bin/sh + open(my $bash,"|-","bash") || die; + print $bash @cmd; + close $bash; +} + +sub sort_stdin { + # Input is stdin + # Spread the input between n processes that each sort + # n = number of CPU threads + my $numthreads; + chomp($numthreads = $opt::parallel || `parallel --number-of-threads`); + my @fifos = map { tmpfifo() } 1..$numthreads; + map { mkfifo($_,0600) } @fifos; + # This trick removes the fifo as soon as it is connected in the other end + # (rm fifo; ...) < fifo + my @cmd = (map { "(rm $_; sort @Global::sortoptions) < $_" } + map { Q($_) } @fifos); + @cmd = merge(@cmd); + if(fork) { + } else { + my @subopt = $opt::zero_terminated ? qw(--recend "\0") : (); + exec(qw(parallel -0 -j), $numthreads, @subopt, + # 286k is the best mean value after testing 250..350 + qw(--block 286k --pipe --roundrobin cat > {} :::),@fifos); + } + # The command uses <(...) so it is incompatible with /bin/sh + open(my $bash,"|-","bash") || die; + print $bash @cmd; + close $bash; +} + +sub tmpname { + # Select a name that does not exist + # Do not create the file as it may be used for creating a socket (by tmux) + # Remember the name in $Global::unlink to avoid hitting the same name twice + my $name = shift; + my($tmpname); + if(not -w $ENV{'TMPDIR'}) { + if(not -e $ENV{'TMPDIR'}) { + ::error("Tmpdir '$ENV{'TMPDIR'}' does not exist.","Try 'mkdir ". + Q($ENV{'TMPDIR'})."'"); + } else { + ::error("Tmpdir '$ENV{'TMPDIR'}' is not writable.","Try 'chmod +w ". + Q($ENV{'TMPDIR'})."'"); + } + exit(255); + } + do { + $tmpname = $ENV{'TMPDIR'}."/".$name. + join"", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..5); + } while(-e $tmpname); + return $tmpname; +} + +sub tmpfifo { + # Find an unused name and mkfifo on it + my $tmpfifo = tmpname("psort"); + mkfifo($tmpfifo,0600); + return $tmpfifo; +} + +sub debug { + # Returns: N/A + $opt::D or return; + @_ = grep { defined $_ ? $_ : "" } @_; + print STDERR @_[1..$#_]; +} + +sub version() { + # Returns: N/A + print join + ("\n", + "GNU $Global::progname $Global::version", + "Copyright (C) 2020-2024 Ole Tange, http://ole.tange.dk and Free Software", + "Foundation, Inc.", + "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>", + "This is free software: you are free to change and redistribute it.", + "GNU $Global::progname comes with no warranty.", + "", + "Web site: https://www.gnu.org/software/parallel\n", + ); +} + +sub shell_quote(@) { + # Input: + # @strings = strings to be quoted + # Returns: + # @shell_quoted_strings = string quoted as needed by the shell + return wantarray ? (map { Q($_) } @_) : (join" ",map { Q($_) } @_); +} + +sub shell_quote_scalar_rc($) { + # Quote for the rc-shell + my $a = $_[0]; + if(defined $a) { + if(($a =~ s/'/''/g) + + + ($a =~ s/[\n\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]+/'$&'/go)) { + # A string was replaced + # No need to test for "" or \0 + } elsif($a eq "") { + $a = "''"; + } elsif($a eq "\0") { + $a = ""; + } + } + return $a; +} + +sub shell_quote_scalar_csh($) { + # Quote for (t)csh + my $a = $_[0]; + if(defined $a) { + # $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\>\<\~\|\; \"\!\$\&\'\202-\377])/\\$1/g; + # This is 1% faster than the above + if(($a =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]/\\$&/go) + + + # quote newline in csh as \\\n + ($a =~ s/[\n]/"\\\n"/go)) { + # A string was replaced + # No need to test for "" or \0 + } elsif($a eq "") { + $a = "''"; + } elsif($a eq "\0") { + $a = ""; + } + } + return $a; +} + +sub shell_quote_scalar_default($) { + # Quote for other shells (Bourne compatibles) + # Inputs: + # $string = string to be quoted + # Returns: + # $shell_quoted = string quoted as needed by the shell + my $s = $_[0]; + if($s =~ /[^-_.+a-z0-9\/]/i) { + $s =~ s/'/'"'"'/g; # "-quote single quotes + $s = "'$s'"; # '-quote entire string + $s =~ s/^''//; # Remove unneeded '' at ends + $s =~ s/''$//; # (faster than s/^''|''$//g) + return $s; + } elsif ($s eq "") { + return "''"; + } else { + # No quoting needed + return $s; + } +} + +sub shell_quote_scalar($) { + # Quote the string so the shell will not expand any special chars + # Inputs: + # $string = string to be quoted + # Returns: + # $shell_quoted = string quoted as needed by the shell + + # Speed optimization: Choose the correct shell_quote_scalar_* + # and call that directly from now on + no warnings 'redefine'; + if($Global::cshell) { + # (t)csh + *shell_quote_scalar = \&shell_quote_scalar_csh; + } elsif($Global::shell =~ m:(^|/)rc$:) { + # rc-shell + *shell_quote_scalar = \&shell_quote_scalar_rc; + } else { + # other shells + *shell_quote_scalar = \&shell_quote_scalar_default; + } + # The sub is now redefined. Call it + return shell_quote_scalar($_[0]); +} + +sub Q($) { + # Q alias for ::shell_quote_scalar + my $ret = shell_quote_scalar($_[0]); + no warnings 'redefine'; + *Q = \&::shell_quote_scalar; + return $ret; +} + + +sub status(@) { + my @w = @_; + my $fh = $Global::status_fd || *STDERR; + print $fh map { ($_, "\n") } @w; + flush $fh; +} + +sub status_no_nl(@) { + my @w = @_; + my $fh = $Global::status_fd || *STDERR; + print $fh @w; + flush $fh; +} + +sub warning(@) { + my @w = @_; + my $prog = $Global::progname || "parsort"; + status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w); +} + +{ + my %warnings; + sub warning_once(@) { + my @w = @_; + my $prog = $Global::progname || "parsort"; + $warnings{@w}++ or + status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w); + } +} + +sub error(@) { + my @w = @_; + my $prog = $Global::progname || "parsort"; + status(map { ($prog.": Error: ". $_); } @w); +} + +sub die_bug($) { + my $bugid = shift; + print STDERR + ("$Global::progname: This should not happen. You have found a bug. ", + "Please follow\n", + "https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n", + "\n", + "Include this in the report:\n", + "* The version number: $Global::version\n", + "* The bugid: $bugid\n", + "* The command line being run\n", + "* The files being read (put the files on a webserver if they are big)\n", + "\n", + "If you get the error on smaller/fewer files, please include those instead.\n"); + exit(255); +} + +if(@ARGV) { + sort_files(@ARGV); +} elsif(length $opt::files0_from) { + $/="\0"; + open(my $fh,"<",$opt::files0_from) || die; + my @files = <$fh>; + chomp(@files); + sort_files(@files); +} else { + sort_stdin(); +} + +# Test +# -z +# OK: cat bigfile | parsort +# OK: parsort -k4n files*.txt +# OK: parsort files*.txt +# OK: parsort "file with space" + |