summaryrefslogtreecommitdiffstats
path: root/man/fix-roff-punct
blob: 0a11f2f836a8a940b5f958a675843045795f3afd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#! /usr/bin/env perl
use strict;
use warnings;

# fix-roff-punct: Fix up punctuation usage in automatically-generated
# troff files (man pages).

# Authors:
#   Peter Moulder <pmoulder@mail.csse.monash.edu.au>
#
# Copyright (C) 2004 Monash University
#
# Gnu GPL v2+:
#
#   This program is free software; you can redistribute it and/or
#   modify it under the terms of the GNU General Public License as
#   published by the Free Software Foundation; either version 2 of the
#   License, or (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#   General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


# Background: Humans use a number of dash-like characters:
#
#   - ASCII hyphen/minus needed for command-line options and other computer
#     input;
#   - hyphen (`one-to-one');
#   - en dash (`2000-2003');
#   - em dash -- like this.  [Not currently handled.]
#
# Troff input spells them as \-, -, \[en], \[em] respectively.  (See the
# groff_char.7 man page for a full list of such punctuation characters.)  If
# you run `man' with your LC_CTYPE indicating a rich character set like Unicode
# (UTF-8 encoding), then it uses different output characters for each of the
# above.
#
# In particular, if your man page source has plain `-' when giving an example
# of a flag or command or other program input, then users won't be able to use
# mouse copy&paste from the formatted man page.

# This script is something of a hack: it is only big enough to handle a few man
# pages of interest (produced by pod2man).  You should manually check the
# changes it makes.

# Approach: we handle each line a word at a time, and typically make the same
# hyphen-vs-ASCII decision throughout the word.  We're a bit haphazard about
# word-splitting, but it's hard to find an example of where we'd be hurt by
# that, and by luck we would do the right thing for many gcc options like
# `-fconstant-string-class=\fICLASS-NAME\fR' (where CLASS-NAME should use a
# hyphen and the others should be ASCII hyphen-minus).
#
# Perl's /e (execute) flag for substitutions does just what we want
# for preserving non-word bits while transforming "words".
#
# We don't currently handle special things like `apt-get' that look like
# hyphenated English words but are actually program names.  In general the
# problem is AI complete, e.g. `apt-gettable' could be either hyphen (gettable
# by apt) or ASCII hyphen-minus (able to be processed by the `apt-get'
# program).
#
# We don't currently take hints from font choice.  (E.g. text in CR font should
# probably use ASCII hyphen-minus.)
#
# We currently only handle a couple troff requests and escapes (see groff.7).

sub frob ($);

# Four-digit years that we are prepared to recognise as years: 1960-2039.
# (The range is contiguous; an earlier version of this pattern accidentally
# skipped 2020-2029.)
my $yearRE = qr/(?:19[6-9]|20[0-3])[0-9]/;

# frob(WORD): return WORD with each dash (`-' or `\-') rewritten to the troff
# spelling that suits the kind of word it appears in:
#
#   - year ranges get an en dash (`\[en]');
#   - `32-bit'-style compounds and ISO dates are returned untouched;
#   - anything that looks like computer input gets ASCII hyphen-minus (`\-');
#   - everything else gets a plain hyphen (`-').
#
# WORD is split at the troff escapes matched below and at trailing dots, and
# each piece is classified separately; the escapes and dots themselves are
# passed through unchanged.
sub frob ($) {
    my ($word) = @_;

    # Consider splitting into two words: cut at the first escape of the
    # kinds listed in the pattern (e.g. `\fI', `\fR', `\s-1', `\&') and
    # classify the text on either side independently.
    if ($word =~ m{\A(.*?)(\\(?:[&/,~:d]|f[BRI]|s-?[0-9]+))(.*)\z}) {
        my ($head, $escape, $tail) = ($1, $2, $3);
        return frob($head) . $escape . frob($tail);
    }

    # Peel off trailing dots so that a sentence-ending full stop does not
    # take part in the classification below.
    if ($word =~ m{\A(.*?)(\.+)\z}) {
        my ($stem, $dots) = ($1, $2);
        return frob($stem) . $dots;
    }

    # `32-bit', `5-page': already a plain hyphen, leave as is.
    return $word
        if $word =~ m{\A[0-9]+-[a-z]+\z};

    # Year range: `(C) 1998-2003'.  Accepts either `-' or `\-' between the
    # years and replaces it with an en dash.
    if ($word =~ m{\A$yearRE\\?-$yearRE\z}) {
        $word =~ s{\\?-}{\\[en]};
        return $word;
    }

    # ISO date (`2004-05-17'): leave as is.
    return $word
        if $word =~ m{\A$yearRE-[01][0-9]-[0-3][0-9]\z};

    # Things likely to be computer input: contains a digit, contains a dot
    # followed by a letter, starts with `-', `/', `.' or `\-', or is wholly
    # enclosed in square brackets.  Use ASCII hyphen-minus throughout.
    if ($word =~ m{[0-9]|\.[a-zA-Z]|\A(?:[-/.]|\\-|\[.*\]\z)}) {
        $word =~ s/\\?-/\\-/g;
        return $word;
    }

    # Otherwise assume ordinary hyphenated prose.
    $word =~ s/\\?-/-/g;
    return $word;
}

# Main filter: read troff source line by line from the files named on the
# command line (or from standard input when there are none), fix up each
# line, and print the result to standard output.
while (<>) {
    # Get rid of pod2man's "helpful" munging of pipe symbol: drop this
    # exact `.tr' request from the output.
    next if $_ eq '.tr \(*W-|\(bv\*(Tr' . "\n";

    # Lines beginning with `\&' and four spaces are taken to be examples:
    # spell their apostrophes as `\[aq]' ("ascii quote") so that they stay
    # ASCII apostrophes in the formatted page.
    s/'/\\[aq]/g if /\A\\\&    /;

    if (/\A\.IP /) {
        # Every dash on an `.IP' line becomes ASCII hyphen-minus.
        s/\\?-/\\-/g;
        # The line above also rewrites the minus sign inside point-size
        # escapes (`\s-1' becomes `\s\-1'); put it back for any decrement,
        # not only `\s-1'.
        s/\\s\\-(?=[0-9])/\\s-/g;
    }
    elsif (/\A\.IX /) {
        # Every dash on an `.IX' line becomes a plain hyphen.
        s/\\?-/-/g;
    }
    elsif (!/\A\. *(?:\\"|ds|if|ie)/) {
        # Any other line, except comments and `.ds'/`.if'/`.ie' requests
        # (which are passed through untouched): hand each word to frob().
        # As an optimization, we process only words containing `-'.
        s{([.@/\\[:alnum:]]*-[-.@/\\[:alnum:]]*)}{frob($1)}ge;
    }

    print;
}