diff options
Diffstat (limited to 'src/sed/sed/fmt.c')
-rw-r--r-- | src/sed/sed/fmt.c | 587 |
1 files changed, 587 insertions, 0 deletions
diff --git a/src/sed/sed/fmt.c b/src/sed/sed/fmt.c new file mode 100644 index 0000000..64600a0 --- /dev/null +++ b/src/sed/sed/fmt.c @@ -0,0 +1,587 @@ +/* `L' command implementation for GNU sed, based on GNU fmt 1.22. + Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +/* GNU fmt was written by Ross Paterson <rap@doc.ic.ac.uk>. */ + +#include "sed.h" + +#include <stdio.h> +#include <ctype.h> +#include <sys/types.h> + +#if HAVE_LIMITS_H +# include <limits.h> +#endif + +#ifndef UINT_MAX +# define UINT_MAX ((unsigned int) ~(unsigned int) 0) +#endif + +#ifndef INT_MAX +# define INT_MAX ((int) (UINT_MAX >> 1)) +#endif + +/* The following parameters represent the program's idea of what is + "best". Adjust to taste, subject to the caveats given. */ + +/* Prefer lines to be LEEWAY % shorter than the maximum width, giving + room for optimization. */ +#define LEEWAY 7 + +/* Costs and bonuses are expressed as the equivalent departure from the + optimal line length, multiplied by 10. e.g. assigning something a + cost of 50 means that it is as bad as a line 5 characters too short + or too long. The definition of SHORT_COST(n) should not be changed. + However, EQUIV(n) may need tuning. */ + +typedef long COST; + +#define MAXCOST (~(((unsigned long) 1) << (8 * sizeof (COST) -1))) + +#define SQR(n) ((n) * (n)) +#define EQUIV(n) SQR ((COST) (n)) + +/* Cost of a filled line n chars longer or shorter than best_width. */ +#define SHORT_COST(n) EQUIV ((n) * 10) + +/* Cost of the difference between adjacent filled lines. */ +#define RAGGED_COST(n) (SHORT_COST (n) / 2) + +/* Basic cost per line. */ +#define LINE_COST EQUIV (70) + +/* Cost of breaking a line after the first word of a sentence, where + the length of the word is N. */ +#define WIDOW_COST(n) (EQUIV (200) / ((n) + 2)) + +/* Cost of breaking a line before the last word of a sentence, where + the length of the word is N. */ +#define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2)) + +/* Bonus for breaking a line at the end of a sentence. */ +#define SENTENCE_BONUS EQUIV (50) + +/* Cost of breaking a line after a period not marking end of a sentence. + With the definition of sentence we are using (borrowed from emacs, see + get_line()) such a break would then look like a sentence break. Hence + we assign a very high cost -- it should be avoided unless things are + really bad. */ +#define NOBREAK_COST EQUIV (600) + +/* Bonus for breaking a line before open parenthesis. */ +#define PAREN_BONUS EQUIV (40) + +/* Bonus for breaking a line after other punctuation. */ +#define PUNCT_BONUS EQUIV(40) + +/* Credit for breaking a long paragraph one line later. */ +#define LINE_CREDIT EQUIV(3) + +/* Size of paragraph buffer in words. Longer paragraphs are handled + neatly (cf. flush_paragraph()), so there's little to gain by making + these larger. */ +#define MAXWORDS 1000 + +#define GETC() (parabuf == end_of_parabuf ? EOF : *parabuf++) + +/* Extra ctype(3)-style macros. */ + +#define isopen(c) (strchr ("([`'\"", (c)) != NULL) +#define isclose(c) (strchr (")]'\"", (c)) != NULL) +#define isperiod(c) (strchr (".?!", (c)) != NULL) + +/* Size of a tab stop, for expansion on input and re-introduction on + output. */ +#define TABWIDTH 8 + +/* Word descriptor structure. */ + +typedef struct Word WORD; + +struct Word + { + + /* Static attributes determined during input. */ + + const char *text; /* the text of the word */ + short length; /* length of this word */ + short space; /* the size of the following space */ + unsigned paren:1; /* starts with open paren */ + unsigned period:1; /* ends in [.?!])* */ + unsigned punct:1; /* ends in punctuation */ + unsigned final:1; /* end of sentence */ + + /* The remaining fields are computed during the optimization. */ + + short line_length; /* length of the best line starting here */ + COST best_cost; /* cost of best paragraph starting here */ + WORD *next_break; /* break which achieves best_cost */ + }; + +/* Forward declarations. */ + +static bool get_paragraph P_ ((void)); +static int get_line P_ ((int c)); +static int get_space P_ ((int c)); +static int copy_rest P_ ((int c)); +static bool same_para P_ ((int c)); +static void flush_paragraph P_ ((void)); +static void fmt_paragraph P_ ((void)); +static void check_punctuation P_ ((WORD *w)); +static COST base_cost P_ ((WORD *this)); +static COST line_cost P_ ((WORD *next, int len)); +static void put_paragraph P_ ((WORD *finish)); +static void put_line P_ ((WORD *w, int indent)); +static void put_word P_ ((WORD *w)); +static void put_space P_ ((int space)); + +/* Option values. */ + +/* User-supplied maximum line width (default WIDTH). The only output + lines + longer than this will each comprise a single word. */ +static int max_width; + +/* Space for the paragraph text. */ +static const char *parabuf; + +/* End of space for the paragraph text. */ +static const char *end_of_parabuf; + +/* The file on which we output */ +static FILE *outfile; + +/* Values derived from the option values. */ + +/* The preferred width of text lines, set to LEEWAY % less than max_width. */ +static int best_width; + +/* Dynamic variables. */ + +/* Start column of the character most recently read from the input file. */ +static int in_column; + +/* Start column of the next character to be written to stdout. */ +static int out_column; + +/* The words of a paragraph -- longer paragraphs are handled neatly + (cf. flush_paragraph()). */ +static WORD words[MAXWORDS]; + +/* A pointer into the above word array, indicating the first position + after the last complete word. Sometimes it will point at an incomplete + word. */ +static WORD *word_limit; + +/* Indentation of the first line of the current paragraph. */ +static int first_indent; + +/* Indentation of other lines of the current paragraph */ +static int other_indent; + +/* The last character read from the input file. */ +static int next_char; + +/* If nonzero, the length of the last line output in the current + paragraph, used to charge for raggedness at the split point for long + paragraphs chosen by fmt_paragraph(). */ +static int last_line_length; + +/* read file F and send formatted output to stdout. */ + +void +fmt (const char *line, const char *line_end, int max_length, FILE *output_file) +{ + parabuf = line; + end_of_parabuf = line_end; + outfile = output_file; + + max_width = max_length; + best_width = max_width * (201 - 2 * LEEWAY) / 200; + + in_column = 0; + other_indent = 0; + next_char = GETC(); + while (get_paragraph ()) + { + fmt_paragraph (); + put_paragraph (word_limit); + } +} + +/* Read a paragraph from input file F. A paragraph consists of a + maximal number of non-blank (excluding any prefix) lines + with the same indent. + + Return false if end-of-file was encountered before the start of a + paragraph, else true. */ + +static bool +get_paragraph () +{ + register int c; + + last_line_length = 0; + c = next_char; + + /* Scan (and copy) blank lines, and lines not introduced by the prefix. */ + + while (c == '\n' || c == EOF) + { + c = copy_rest (c); + if (c == EOF) + { + next_char = EOF; + return false; + } + putc ('\n', outfile); + c = GETC(); + } + + /* Got a suitable first line for a paragraph. */ + + first_indent = in_column; + word_limit = words; + c = get_line (c); + + /* Read rest of paragraph. */ + + other_indent = in_column; + while (same_para (c) && in_column == other_indent) + c = get_line (c); + + (word_limit - 1)->period = (word_limit - 1)->final = true; + next_char = c; + return true; +} + +/* Copy to the output a blank line. In the latter, C is \n or EOF. + Return the character (\n or EOF) ending the line. */ + +static int +copy_rest (register int c) +{ + out_column = 0; + while (c != '\n' && c != EOF) + { + putc (c, outfile); + c = GETC(); + } + return c; +} + +/* Return true if a line whose first non-blank character after the + prefix (if any) is C could belong to the current paragraph, + otherwise false. */ + +static bool +same_para (register int c) +{ + return (c != '\n' && c != EOF); +} + +/* Read a line from the input data given first non-blank character C + after the prefix, and the following indent, and break it into words. + A word is a maximal non-empty string of non-white characters. A word + ending in [.?!]["')\]]* and followed by end-of-line or at least two + spaces ends a sentence, as in emacs. + + Return the first non-blank character of the next line. */ + +static int +get_line (register int c) +{ + int start; + register WORD *end_of_word; + + end_of_word = &words[MAXWORDS - 2]; + + do + { /* for each word in a line */ + + /* Scan word. */ + + word_limit->text = parabuf - 1; + do + c = GETC(); + while (c != EOF && !ISSPACE (c)); + word_limit->length = parabuf - word_limit->text - (c != EOF); + in_column += word_limit->length; + + check_punctuation (word_limit); + + /* Scan inter-word space. */ + + start = in_column; + c = get_space (c); + word_limit->space = in_column - start; + word_limit->final = (c == EOF + || (word_limit->period + && (c == '\n' || word_limit->space > 1))); + if (c == '\n' || c == EOF) + word_limit->space = word_limit->final ? 2 : 1; + if (word_limit == end_of_word) + flush_paragraph (); + word_limit++; + if (c == EOF) + { + in_column = first_indent; + return EOF; + } + } + while (c != '\n'); + + in_column = 0; + c = GETC(); + return get_space (c); +} + +/* Read blank characters from the input data, starting with C, and keeping + in_column up-to-date. Return first non-blank character. */ + +static int +get_space (register int c) +{ + for (;;) + { + if (c == ' ') + in_column++; + else if (c == '\t') + in_column = (in_column / TABWIDTH + 1) * TABWIDTH; + else + return c; + c = GETC(); + } +} + +/* Set extra fields in word W describing any attached punctuation. */ + +static void +check_punctuation (register WORD *w) +{ + register const char *start, *finish; + + start = w->text; + finish = start + (w->length - 1); + w->paren = isopen (*start); + w->punct = ISPUNCT (*finish); + while (isclose (*finish) && finish > start) + finish--; + w->period = isperiod (*finish); +} + +/* Flush part of the paragraph to make room. This function is called on + hitting the limit on the number of words or characters. */ + +static void +flush_paragraph (void) +{ + WORD *split_point; + register WORD *w; + COST best_break; + + /* - format what you have so far as a paragraph, + - find a low-cost line break near the end, + - output to there, + - make that the start of the paragraph. */ + + fmt_paragraph (); + + /* Choose a good split point. */ + + split_point = word_limit; + best_break = MAXCOST; + for (w = words->next_break; w != word_limit; w = w->next_break) + { + if (w->best_cost - w->next_break->best_cost < best_break) + { + split_point = w; + best_break = w->best_cost - w->next_break->best_cost; + } + if (best_break <= MAXCOST - LINE_CREDIT) + best_break += LINE_CREDIT; + } + put_paragraph (split_point); + + /* Copy words from split_point down to word -- we use memmove because + the source and target may overlap. */ + + memmove ((char *) words, (char *) split_point, + (word_limit - split_point + 1) * sizeof (WORD)); + word_limit -= split_point - words; +} + +/* Compute the optimal formatting for the whole paragraph by computing + and remembering the optimal formatting for each suffix from the empty + one to the whole paragraph. */ + +static void +fmt_paragraph (void) +{ + register WORD *start, *w; + register int len; + register COST wcost, best; + int saved_length; + + word_limit->best_cost = 0; + saved_length = word_limit->length; + word_limit->length = max_width; /* sentinel */ + + for (start = word_limit - 1; start >= words; start--) + { + best = MAXCOST; + len = start == words ? first_indent : other_indent; + + /* At least one word, however long, in the line. */ + + w = start; + len += w->length; + do + { + w++; + + /* Consider breaking before w. */ + + wcost = line_cost (w, len) + w->best_cost; + if (start == words && last_line_length > 0) + wcost += RAGGED_COST (len - last_line_length); + if (wcost < best) + { + best = wcost; + start->next_break = w; + start->line_length = len; + } + len += (w - 1)->space + w->length; /* w > start >= words */ + } + while (len < max_width); + start->best_cost = best + base_cost (start); + } + + word_limit->length = saved_length; +} + +/* Return the constant component of the cost of breaking before the + word THIS. */ + +static COST +base_cost (register WORD *this) +{ + register COST cost; + + cost = LINE_COST; + + if (this > words) + { + if ((this - 1)->period) + { + if ((this - 1)->final) + cost -= SENTENCE_BONUS; + else + cost += NOBREAK_COST; + } + else if ((this - 1)->punct) + cost -= PUNCT_BONUS; + else if (this > words + 1 && (this - 2)->final) + cost += WIDOW_COST ((this - 1)->length); + } + + if (this->paren) + cost -= PAREN_BONUS; + else if (this->final) + cost += ORPHAN_COST (this->length); + + return cost; +} + +/* Return the component of the cost of breaking before word NEXT that + depends on LEN, the length of the line beginning there. */ + +static COST +line_cost (register WORD *next, register int len) +{ + register int n; + register COST cost; + + if (next == word_limit) + return 0; + n = best_width - len; + cost = SHORT_COST (n); + if (next->next_break != word_limit) + { + n = len - next->line_length; + cost += RAGGED_COST (n); + } + return cost; +} + +/* Output to stdout a paragraph from word up to (but not including) + FINISH, which must be in the next_break chain from word. */ + +static void +put_paragraph (register WORD *finish) +{ + register WORD *w; + + put_line (words, first_indent); + for (w = words->next_break; w != finish; w = w->next_break) + put_line (w, other_indent); +} + +/* Output to stdout the line beginning with word W, beginning in column + INDENT, including the prefix (if any). */ + +static void +put_line (register WORD *w, int indent) +{ + register WORD *endline; + out_column = 0; + put_space (indent); + + endline = w->next_break - 1; + for (; w != endline; w++) + { + put_word (w); + put_space (w->space); + } + put_word (w); + last_line_length = out_column; + putc ('\n', outfile); +} + +/* Output to stdout the word W. */ + +static void +put_word (register WORD *w) +{ + register const char *s; + register int n; + + s = w->text; + for (n = w->length; n != 0; n--) + putc (*s++, outfile); + out_column += w->length; +} + +/* Output to stdout SPACE spaces, or equivalent tabs. */ + +static void +put_space (int space) +{ + out_column += space; + while (space--) + putc (' ', outfile); +} |