From d318611dd6f23fcfedd50e9b9e24620b102ba96a Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 15 Apr 2024 21:44:05 +0200 Subject: Adding upstream version 1.23.0. Signed-off-by: Daniel Baumann --- src/preproc/refer/TODO | 124 + src/preproc/refer/command.cpp | 814 ++++++ src/preproc/refer/command.h | 35 + src/preproc/refer/label.cpp | 2607 ++++++++++++++++++++ src/preproc/refer/label.hpp | 98 + src/preproc/refer/label.ypp | 1195 +++++++++ src/preproc/refer/ref.cpp | 1161 +++++++++ src/preproc/refer/ref.h | 127 + src/preproc/refer/refer.1.man | 2020 +++++++++++++++ src/preproc/refer/refer.am | 61 + src/preproc/refer/refer.cpp | 1267 ++++++++++ src/preproc/refer/refer.h | 81 + src/preproc/refer/tests/artifacts/62124.bib | 4 + .../refer/tests/report-correct-line-numbers.sh | 136 + src/preproc/refer/token.cpp | 377 +++ src/preproc/refer/token.h | 87 + 16 files changed, 10194 insertions(+) create mode 100644 src/preproc/refer/TODO create mode 100644 src/preproc/refer/command.cpp create mode 100644 src/preproc/refer/command.h create mode 100644 src/preproc/refer/label.cpp create mode 100644 src/preproc/refer/label.hpp create mode 100644 src/preproc/refer/label.ypp create mode 100644 src/preproc/refer/ref.cpp create mode 100644 src/preproc/refer/ref.h create mode 100644 src/preproc/refer/refer.1.man create mode 100644 src/preproc/refer/refer.am create mode 100644 src/preproc/refer/refer.cpp create mode 100644 src/preproc/refer/refer.h create mode 100644 src/preproc/refer/tests/artifacts/62124.bib create mode 100755 src/preproc/refer/tests/report-correct-line-numbers.sh create mode 100644 src/preproc/refer/token.cpp create mode 100644 src/preproc/refer/token.h (limited to 'src/preproc/refer') diff --git a/src/preproc/refer/TODO b/src/preproc/refer/TODO new file mode 100644 index 0000000..36f4508 --- /dev/null +++ b/src/preproc/refer/TODO @@ -0,0 +1,124 @@ +inline references + +Some sort of macro/subroutine that can cover several references. + +move-punctuation should ignore multiple punctuation characters. + +Make the index files machine independent. + +Allow search keys to be negated (with !) to indicate that the +reference should not contain the key. Ignore negated keys during +indexed searching. + +Provide an option with lkbib and lookbib that prints the location +(filename, position) of each reference. Need to map filename_id's +back to filenames. + +Rename join-authors to join-fields. Have a separate label-join-fields +command used by @ and #. + +Have some sort of quantifier: e.g., $.n#A means execute '$.n' for each +instance of an A field, setting $ to that field, and then join the +results using the join-authors command. + +no-text-in-bracket command which says not to allow post_text and +pre_text when the [] flags has been given. Useful for superscripted +footnotes. + +Make it possible to translate - to \(en in page ranges. + +Trim eign a bit. + +In indexed searching discard all numeric keys except dates. + +Allow '\ ' to separate article from first word. + +%also + +Option automatically to supply [] flags in every reference. + +See if we can avoid requiring a comma before jr. and so on +in find_last_name(). + +Cache sortified authors in authors string during tentative evaluation of +label specification. + +Possibly don't allow * and % expressions in the first part of ?:, | or +& expressions. + +Handle better the case where <> occurs inside functions and in the +first operand of ~. Or perhaps implement <> using some magic character +in the string. + +Should special treatment be given to lines beginning with . in +references? (Unix refer seems to treat them like '%'). + +Add global flag to control whether all files should be stat-ed after +loading, and whether they should be stat-ed before each search. +Perhaps make this dependent on the number of files there are. + +Option to truncate keys to truncate_len in linear searching. + +Allow multiple -f options in indxbib. + +In indxbib, possibly store common words rather than common words +filename. In this case store only words that are actually present in +the file. + +Perhaps we should put out an obnoxious copyright message when lookbib +starts up. + +Provide an option that writes a file containing just the references +actually used. Useful if you want to distribute a document. + +Have a magic token such that +%A +will print as though it were +%A +but sort as though it were +%A +Do we need this if we can specify author alternatives for sorting? +No, provided we have separate alternatives for @. + +In consider_authors when last names are ambiguous we might be able to +use just the first name and not Jr. bit. Or we might be able to +abbreviate the author. + +It ought to be possible to specify an alternative field to sort on +instead of date. (ie if there's a field giving the type of document -- +these references should sort after any years) + +Provide a way to execute a command using a command-line option. + +Option to set the label-spec as a command-line option (-L). + +Command to specify which fields can occur multiple times: +multiple AE + +Command to specify how various fields sort: +aort-as-name A +sort-as-date D +sort-as-title T +sort-as-other O + +Command to specify which fields are author fields: +# if we don't have A use field Q +author-fields AQ + +Commands to set properties of tokens. +sortify-token \(ae ae +uppercase-token \[ae] \[AE] + +Command to set the names of months: +months january february march april may ... + +Perhaps provide some sort of macro capability: +# perhaps a macro capability +defmacro foo +annotation-field $1 +endef + +Command to control strings used in capitalization +capitalize-start \s+2 +capitalize-end \s-2 +(perhaps make these arguments to the capitalize command.) diff --git a/src/preproc/refer/command.cpp b/src/preproc/refer/command.cpp new file mode 100644 index 0000000..b49e2be --- /dev/null +++ b/src/preproc/refer/command.cpp @@ -0,0 +1,814 @@ +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +#include "refer.h" +#include "refid.h" +#include "search.h" +#include "command.h" + +cset cs_field_name = csalpha; + +class input_item { + input_item *next; + char *filename; + int first_lineno; + string buffer; + const char *ptr; + const char *end; +public: + input_item(string &, const char *, int = 1); + ~input_item(); + int get_char(); + int peek_char(); + void skip_char(); + void get_location(const char **, int *); + + friend class input_stack; +}; + +input_item::input_item(string &s, const char *fn, int ln) +: filename(strsave(fn)), first_lineno(ln) +{ + buffer.move(s); + ptr = buffer.contents(); + end = ptr + buffer.length(); +} + +input_item::~input_item() +{ + delete[] filename; +} + +inline int input_item::peek_char() +{ + if (ptr >= end) + return EOF; + else + return (unsigned char)*ptr; +} + +inline int input_item::get_char() +{ + if (ptr >= end) + return EOF; + else + return (unsigned char)*ptr++; +} + +inline void input_item::skip_char() +{ + ptr++; +} + +void input_item::get_location(const char **filenamep, int *linenop) +{ + *filenamep = filename; + if (ptr == buffer.contents()) + *linenop = first_lineno; + else { + int ln = first_lineno; + const char *e = ptr - 1; + for (const char *p = buffer.contents(); p < e; p++) + if (*p == '\n') + ln++; + ln--; // Back up to identify line number _before_ last seen newline. + *linenop = ln; + } + return; +} + +class input_stack { + static input_item *top; +public: + static void init(); + static int get_char(); + static int peek_char(); + static void skip_char() { top->skip_char(); } + static void push_file(const char *); + static void push_string(string &, const char *, int); + static void error(const char *format, + const errarg &arg1 = empty_errarg, + const errarg &arg2 = empty_errarg, + const errarg &arg3 = empty_errarg); +}; + +input_item *input_stack::top = 0; + +void input_stack::init() +{ + while (top) { + input_item *tem = top; + top = top->next; + delete tem; + } +} + +int input_stack::get_char() +{ + while (top) { + int c = top->get_char(); + if (c >= 0) + return c; + input_item *tem = top; + top = top->next; + delete tem; + } + return -1; +} + +int input_stack::peek_char() +{ + while (top) { + int c = top->peek_char(); + if (c >= 0) + return c; + input_item *tem = top; + top = top->next; + delete tem; + } + return -1; +} + +void input_stack::push_file(const char *fn) +{ + FILE *fp; + if (strcmp(fn, "-") == 0) { + fp = stdin; + fn = ""; + } + else { + errno = 0; + fp = fopen(fn, "r"); + if (fp == 0) { + error("can't open '%1': %2", fn, strerror(errno)); + return; + } + } + string buf; + bool is_at_beginning_of_line = true; + int lineno = 1; + for (;;) { + int c = getc(fp); + if (is_at_beginning_of_line && c == '.') { + // replace lines beginning with .R1 or .R2 with a blank line + c = getc(fp); + if (c == 'R') { + c = getc(fp); + if (c == '1' || c == '2') { + int cc = c; + c = getc(fp); + if (compatible_flag || c == ' ' || c == '\n' || c == EOF) { + while (c != '\n' && c != EOF) + c = getc(fp); + } + else { + buf += '.'; + buf += 'R'; + buf += cc; + } + } + else { + buf += '.'; + buf += 'R'; + } + } + else + buf += '.'; + } + if (c == EOF) + break; + if (is_invalid_input_char(c)) + error_with_file_and_line(fn, lineno, + "invalid input character code %1", c); + else { + buf += c; + if (c == '\n') { + is_at_beginning_of_line = true; + lineno++; + } + else + is_at_beginning_of_line = false; + } + } + if (fp != stdin) + fclose(fp); + if (buf.length() > 0 && buf[buf.length() - 1] != '\n') + buf += '\n'; + input_item *it = new input_item(buf, fn); + it->next = top; + top = it; +} + +void input_stack::push_string(string &s, const char *filename, int lineno) +{ + input_item *it = new input_item(s, filename, lineno); + it->next = top; + top = it; +} + +void input_stack::error(const char *format, const errarg &arg1, + const errarg &arg2, const errarg &arg3) +{ + const char *filename; + int lineno; + for (input_item *it = top; it; it = it->next) { + it->get_location(&filename, &lineno); + error_with_file_and_line(filename, lineno, format, arg1, arg2, arg3); + return; + } + ::error(format, arg1, arg2, arg3); +} + +void command_error(const char *format, const errarg &arg1, + const errarg &arg2, const errarg &arg3) +{ + input_stack::error(format, arg1, arg2, arg3); +} + +// # not recognized in "" +// \ is recognized in "" +// # does not conceal newline +// if missing closing quote, word extends to end of line +// no special treatment of \ other than before newline +// \ not recognized after # +// ; allowed as alternative to newline +// ; not recognized in "" +// don't clear word_buffer; just append on +// return -1 for EOF, 0 for newline, 1 for word + +int get_word(string &word_buffer) +{ + int c = input_stack::get_char(); + for (;;) { + if (c == '#') { + do { + c = input_stack::get_char(); + } while (c != '\n' && c != EOF); + break; + } + if (c == '\\' && input_stack::peek_char() == '\n') + input_stack::skip_char(); + else if (c != ' ' && c != '\t') + break; + c = input_stack::get_char(); + } + if (c == EOF) + return -1; + if (c == '\n' || c == ';') + return 0; + if (c == '"') { + for (;;) { + c = input_stack::peek_char(); + if (c == EOF || c == '\n') + break; + input_stack::skip_char(); + if (c == '"') { + int d = input_stack::peek_char(); + if (d == '"') + input_stack::skip_char(); + else + break; + } + else if (c == '\\') { + int d = input_stack::peek_char(); + if (d == '\n') + input_stack::skip_char(); + else + word_buffer += '\\'; + } + else + word_buffer += c; + } + return 1; + } + word_buffer += c; + for (;;) { + c = input_stack::peek_char(); + if (c == ' ' || c == '\t' || c == '\n' || c == '#' || c == ';') + break; + input_stack::skip_char(); + if (c == '\\') { + int d = input_stack::peek_char(); + if (d == '\n') + input_stack::skip_char(); + else + word_buffer += '\\'; + } + else + word_buffer += c; + } + return 1; +} + +union argument { + const char *s; + int n; +}; + +// This is for debugging. + +static void echo_command(int argc, argument *argv) +{ + for (int i = 0; i < argc; i++) + fprintf(stderr, "%s\n", argv[i].s); +} + +static void include_command(int argc, argument *argv) +{ + assert(argc == 1); + input_stack::push_file(argv[0].s); +} + +static void capitalize_command(int argc, argument *argv) +{ + if (argc > 0) + capitalize_fields = argv[0].s; + else + capitalize_fields.clear(); +} + +static void accumulate_command(int, argument *) +{ + accumulate = 1; +} + +static void no_accumulate_command(int, argument *) +{ + accumulate = 0; +} + +static void move_punctuation_command(int, argument *) +{ + move_punctuation = 1; +} + +static void no_move_punctuation_command(int, argument *) +{ + move_punctuation = 0; +} + +static void sort_command(int argc, argument *argv) +{ + if (argc == 0) + sort_fields = "AD"; + else + sort_fields = argv[0].s; + accumulate = 1; +} + +static void no_sort_command(int, argument *) +{ + sort_fields.clear(); +} + +static void articles_command(int argc, argument *argv) +{ + articles.clear(); + int i; + for (i = 0; i < argc; i++) { + articles += argv[i].s; + articles += '\0'; + } + int len = articles.length(); + for (i = 0; i < len; i++) + articles[i] = cmlower(articles[i]); +} + +static void database_command(int argc, argument *argv) +{ + for (int i = 0; i < argc; i++) + database_list.add_file(argv[i].s); +} + +static void default_database_command(int, argument *) +{ + search_default = 1; +} + +static void no_default_database_command(int, argument *) +{ + search_default = 0; +} + +static void bibliography_command(int argc, argument *argv) +{ + have_bibliography = 1; + const char *saved_filename = current_filename; + int saved_lineno = current_lineno; + int saved_label_in_text = label_in_text; + label_in_text = 0; + if (!accumulate) + fputs(".]<\n", stdout); + for (int i = 0; i < argc; i++) + do_bib(argv[i].s); + if (accumulate) + output_references(); + else + fputs(".]>\n", stdout); + current_filename = saved_filename; + current_lineno = saved_lineno; + label_in_text = saved_label_in_text; +} + +static void annotate_command(int argc, argument *argv) +{ + if (argc > 0) + annotation_field = argv[0].s[0]; + else + annotation_field = 'X'; + if (argc == 2) + annotation_macro = argv[1].s; + else + annotation_macro = "AP"; +} + +static void no_annotate_command(int, argument *) +{ + annotation_macro.clear(); + annotation_field = -1; +} + +static void reverse_command(int, argument *argv) +{ + reverse_fields = argv[0].s; +} + +static void no_reverse_command(int, argument *) +{ + reverse_fields.clear(); +} + +static void abbreviate_command(int argc, argument *argv) +{ + abbreviate_fields = argv[0].s; + period_before_initial = argc > 1 ? argv[1].s : ". "; + period_before_last_name = argc > 2 ? argv[2].s : ". "; + period_before_other = argc > 3 ? argv[3].s : ". "; + period_before_hyphen = argc > 4 ? argv[4].s : "."; +} + +static void no_abbreviate_command(int, argument *) +{ + abbreviate_fields.clear(); +} + +string search_ignore_fields; + +static void search_ignore_command(int argc, argument *argv) +{ + if (argc > 0) + search_ignore_fields = argv[0].s; + else + search_ignore_fields = "XYZ"; + search_ignore_fields += '\0'; + linear_ignore_fields = search_ignore_fields.contents(); +} + +static void no_search_ignore_command(int, argument *) +{ + linear_ignore_fields = ""; +} + +static void search_truncate_command(int argc, argument *argv) +{ + if (argc > 0) + linear_truncate_len = argv[0].n; + else + linear_truncate_len = 6; +} + +static void no_search_truncate_command(int, argument *) +{ + linear_truncate_len = -1; +} + +static void discard_command(int argc, argument *argv) +{ + if (argc == 0) + discard_fields = "XYZ"; + else + discard_fields = argv[0].s; + accumulate = 1; +} + +static void no_discard_command(int, argument *) +{ + discard_fields.clear(); +} + +static void label_command(int, argument *argv) +{ + set_label_spec(argv[0].s); +} + +static void abbreviate_label_ranges_command(int argc, argument *argv) +{ + abbreviate_label_ranges = 1; + label_range_indicator = argc > 0 ? argv[0].s : "-"; +} + +static void no_abbreviate_label_ranges_command(int, argument *) +{ + abbreviate_label_ranges = 0; +} + +static void label_in_reference_command(int, argument *) +{ + label_in_reference = 1; +} + +static void no_label_in_reference_command(int, argument *) +{ + label_in_reference = 0; +} + +static void label_in_text_command(int, argument *) +{ + label_in_text = 1; +} + +static void no_label_in_text_command(int, argument *) +{ + label_in_text = 0; +} + +static void sort_adjacent_labels_command(int, argument *) +{ + sort_adjacent_labels = 1; +} + +static void no_sort_adjacent_labels_command(int, argument *) +{ + sort_adjacent_labels = 0; +} + +static void date_as_label_command(int argc, argument *argv) +{ + if (set_date_label_spec(argc > 0 ? argv[0].s : "D%a*")) + date_as_label = 1; +} + +static void no_date_as_label_command(int, argument *) +{ + date_as_label = 0; +} + +static void short_label_command(int, argument *argv) +{ + if (set_short_label_spec(argv[0].s)) + short_label_flag = 1; +} + +static void no_short_label_command(int, argument *) +{ + short_label_flag = 0; +} + +static void compatible_command(int, argument *) +{ + compatible_flag = 1; +} + +static void no_compatible_command(int, argument *) +{ + compatible_flag = 0; +} + +static void join_authors_command(int argc, argument *argv) +{ + join_authors_exactly_two = argv[0].s; + join_authors_default = argc > 1 ? argv[1].s : argv[0].s; + join_authors_last_two = argc == 3 ? argv[2].s : argv[0].s; +} + +static void bracket_label_command(int, argument *argv) +{ + pre_label = argv[0].s; + post_label = argv[1].s; + sep_label = argv[2].s; +} + +static void separate_label_second_parts_command(int, argument *argv) +{ + separate_label_second_parts = argv[0].s; +} + +static void et_al_command(int argc, argument *argv) +{ + et_al = argv[0].s; + et_al_min_elide = argv[1].n; + if (et_al_min_elide < 1) + et_al_min_elide = 1; + et_al_min_total = argc >= 3 ? argv[2].n : 0; +} + +static void no_et_al_command(int, argument *) +{ + et_al.clear(); + et_al_min_elide = 0; +} + +typedef void (*command_t)(int, argument *); + +/* arg_types is a string describing the numbers and types of arguments. +s means a string, i means an integer, f is a list of fields, F is +a single field, +? means that the previous argument is optional, * means that the +previous argument can occur any number of times. */ + +struct S { + const char *name; + command_t func; + const char *arg_types; +} command_table[] = { + { "include", include_command, "s" }, + { "echo", echo_command, "s*" }, + { "capitalize", capitalize_command, "f?" }, + { "accumulate", accumulate_command, "" }, + { "no-accumulate", no_accumulate_command, "" }, + { "move-punctuation", move_punctuation_command, "" }, + { "no-move-punctuation", no_move_punctuation_command, "" }, + { "sort", sort_command, "s?" }, + { "no-sort", no_sort_command, "" }, + { "articles", articles_command, "s*" }, + { "database", database_command, "ss*" }, + { "default-database", default_database_command, "" }, + { "no-default-database", no_default_database_command, "" }, + { "bibliography", bibliography_command, "ss*" }, + { "annotate", annotate_command, "F?s?" }, + { "no-annotate", no_annotate_command, "" }, + { "reverse", reverse_command, "s" }, + { "no-reverse", no_reverse_command, "" }, + { "abbreviate", abbreviate_command, "ss?s?s?s?" }, + { "no-abbreviate", no_abbreviate_command, "" }, + { "search-ignore", search_ignore_command, "f?" }, + { "no-search-ignore", no_search_ignore_command, "" }, + { "search-truncate", search_truncate_command, "i?" }, + { "no-search-truncate", no_search_truncate_command, "" }, + { "discard", discard_command, "f?" }, + { "no-discard", no_discard_command, "" }, + { "label", label_command, "s" }, + { "abbreviate-label-ranges", abbreviate_label_ranges_command, "s?" }, + { "no-abbreviate-label-ranges", no_abbreviate_label_ranges_command, "" }, + { "label-in-reference", label_in_reference_command, "" }, + { "no-label-in-reference", no_label_in_reference_command, "" }, + { "label-in-text", label_in_text_command, "" }, + { "no-label-in-text", no_label_in_text_command, "" }, + { "sort-adjacent-labels", sort_adjacent_labels_command, "" }, + { "no-sort-adjacent-labels", no_sort_adjacent_labels_command, "" }, + { "date-as-label", date_as_label_command, "s?" }, + { "no-date-as-label", no_date_as_label_command, "" }, + { "short-label", short_label_command, "s" }, + { "no-short-label", no_short_label_command, "" }, + { "compatible", compatible_command, "" }, + { "no-compatible", no_compatible_command, "" }, + { "join-authors", join_authors_command, "sss?" }, + { "bracket-label", bracket_label_command, "sss" }, + { "separate-label-second-parts", separate_label_second_parts_command, "s" }, + { "et-al", et_al_command, "sii?" }, + { "no-et-al", no_et_al_command, "" }, +}; + +static int check_args(const char *types, const char *name, + int argc, argument *argv) +{ + int argno = 0; + while (*types) { + if (argc == 0) { + if (types[1] == '?') + break; + else if (types[1] == '*') { + assert(types[2] == '\0'); + break; + } + else { + input_stack::error("missing argument for command '%1'", name); + return 0; + } + } + switch (*types) { + case 's': + break; + case 'i': + { + char *ptr; + long n = strtol(argv->s, &ptr, 10); + if ((n == 0 && ptr == argv->s) + || *ptr != '\0') { + input_stack::error("argument %1 for command '%2' must be an integer", + argno + 1, name); + return 0; + } + argv->n = (int)n; + break; + } + case 'f': + { + for (const char *ptr = argv->s; *ptr != '\0'; ptr++) + if (!cs_field_name(*ptr)) { + input_stack::error("argument %1 for command '%2' must be a list of fields", + argno + 1, name); + return 0; + } + break; + } + case 'F': + if (argv->s[0] == '\0' || argv->s[1] != '\0' + || !cs_field_name(argv->s[0])) { + input_stack::error("argument %1 for command '%2' must be a field name", + argno + 1, name); + return 0; + } + break; + default: + assert(0); + } + if (types[1] == '?') + types += 2; + else if (types[1] != '*') + types += 1; + --argc; + ++argv; + ++argno; + } + if (argc > 0) { + input_stack::error("too many arguments for command '%1'", name); + return 0; + } + return 1; +} + +static void execute_command(const char *name, int argc, argument *argv) +{ + for (unsigned int i = 0; + i < sizeof(command_table)/sizeof(command_table[0]); i++) + if (strcmp(name, command_table[i].name) == 0) { + if (check_args(command_table[i].arg_types, name, argc, argv)) + (*command_table[i].func)(argc, argv); + return; + } + input_stack::error("unknown command '%1'", name); +} + +static void command_loop() +{ + string command; + for (;;) { + command.clear(); + int res = get_word(command); + if (res != 1) { + if (res == 0) + continue; + break; + } + int argc = 0; + command += '\0'; + while ((res = get_word(command)) == 1) { + argc++; + command += '\0'; + } + argument *argv = new argument[argc]; + const char *ptr = command.contents(); + for (int i = 0; i < argc; i++) + argv[i].s = ptr = strchr(ptr, '\0') + 1; + execute_command(command.contents(), argc, argv); + delete[] argv; + if (res == -1) + break; + } +} + +void process_commands(string &s, const char *file, int lineno) +{ + const char *saved_filename = current_filename; + int saved_lineno = current_lineno; + input_stack::init(); + current_filename = file; + // Report diagnostics with respect to line _before_ last newline seen. + current_lineno = lineno - 1; + input_stack::push_string(s, file, lineno); + command_loop(); + current_filename = saved_filename; + current_lineno = saved_lineno; +} + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: diff --git a/src/preproc/refer/command.h b/src/preproc/refer/command.h new file mode 100644 index 0000000..db850f4 --- /dev/null +++ b/src/preproc/refer/command.h @@ -0,0 +1,35 @@ +// -*- C++ -*- +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +void process_commands(string &s, const char *file, int lineno); + +extern int have_bibliography; +extern int accumulate; +extern int move_punctuation; +extern int search_default; +extern search_list database_list; +extern int label_in_text; +extern int label_in_reference; +extern int sort_adjacent_labels; +extern string pre_label; +extern string post_label; +extern string sep_label; + +extern void do_bib(const char *); +extern void output_references(); diff --git a/src/preproc/refer/label.cpp b/src/preproc/refer/label.cpp new file mode 100644 index 0000000..f8c1645 --- /dev/null +++ b/src/preproc/refer/label.cpp @@ -0,0 +1,2607 @@ +/* A Bison parser, made by GNU Bison 3.8.2. */ + +/* Bison implementation for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation, + Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* C LALR(1) parser skeleton written by Richard Stallman, by + simplifying the original so-called "semantic" parser. */ + +/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual, + especially those whose name start with YY_ or yy_. They are + private implementation details that can be changed or removed. */ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output, and Bison version. */ +#define YYBISON 30802 + +/* Bison version string. */ +#define YYBISON_VERSION "3.8.2" + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 0 + +/* Push parsers. */ +#define YYPUSH 0 + +/* Pull parsers. */ +#define YYPULL 1 + + + + +/* First part of user prologue. */ +#line 19 "../src/preproc/refer/label.ypp" + + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" + +int yylex(); +void yyerror(const char *); + +static const char *format_serial(char c, int n); + +struct label_info { + int start; + int length; + int count; + int total; + label_info(const string &); +}; + +label_info *lookup_label(const string &label); + +struct expression { + enum { + // Does the tentative label depend on the reference? + CONTAINS_VARIABLE = 01, + CONTAINS_STAR = 02, + CONTAINS_FORMAT = 04, + CONTAINS_AT = 010 + }; + virtual ~expression() { } + virtual void evaluate(int, const reference &, string &, + substring_position &) = 0; + virtual unsigned analyze() { return 0; } +}; + +class at_expr : public expression { +public: + at_expr() { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; } +}; + +class format_expr : public expression { + char type; + int width; + int first_number; +public: + format_expr(char c, int w = 0, int f = 1) + : type(c), width(w), first_number(f) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_FORMAT; } +}; + +class field_expr : public expression { + int number; + char name; +public: + field_expr(char nm, int num) : number(num), name(nm) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_VARIABLE; } +}; + +class literal_expr : public expression { + string s; +public: + literal_expr(const char *ptr, int len) : s(ptr, len) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class unary_expr : public expression { +protected: + expression *expr; +public: + unary_expr(expression *e) : expr(e) { } + ~unary_expr() { delete expr; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { return expr ? expr->analyze() : 0; } +}; + +// This caches the analysis of an expression. + +class analyzed_expr : public unary_expr { + unsigned flags; +public: + analyzed_expr(expression *); + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return flags; } +}; + +class star_expr : public unary_expr { +public: + star_expr(expression *e) : unary_expr(e) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { + return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0) + | CONTAINS_STAR); + } +}; + +typedef void map_func(const char *, const char *, string &); + +class map_expr : public unary_expr { + map_func *func; +public: + map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +typedef const char *extractor_func(const char *, const char *, const char **); + +class extractor_expr : public unary_expr { + int part; + extractor_func *func; +public: + enum { BEFORE = +1, MATCH = 0, AFTER = -1 }; + extractor_expr(expression *e, extractor_func *f, int pt) + : unary_expr(e), part(pt), func(f) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class truncate_expr : public unary_expr { + int n; +public: + truncate_expr(expression *e, int i) : unary_expr(e), n(i) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class separator_expr : public unary_expr { +public: + separator_expr(expression *e) : unary_expr(e) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class binary_expr : public expression { +protected: + expression *expr1; + expression *expr2; +public: + binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { } + ~binary_expr() { delete expr1; delete expr2; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { + return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0); + } +}; + +class alternative_expr : public binary_expr { +public: + alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class list_expr : public binary_expr { +public: + list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class substitute_expr : public binary_expr { +public: + substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class ternary_expr : public expression { +protected: + expression *expr1; + expression *expr2; + expression *expr3; +public: + ternary_expr(expression *e1, expression *e2, expression *e3) + : expr1(e1), expr2(e2), expr3(e3) { } + ~ternary_expr() { delete expr1; delete expr2; delete expr3; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { + return ((expr1 ? expr1->analyze() : 0) + | (expr2 ? expr2->analyze() : 0) + | (expr3 ? expr3->analyze() : 0)); + } +}; + +class conditional_expr : public ternary_expr { +public: + conditional_expr(expression *e1, expression *e2, expression *e3) + : ternary_expr(e1, e2, e3) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +static expression *parsed_label = 0; +static expression *parsed_date_label = 0; +static expression *parsed_short_label = 0; + +static expression *parse_result; + +string literals; + + +#line 270 "src/preproc/refer/label.cpp" + +# ifndef YY_CAST +# ifdef __cplusplus +# define YY_CAST(Type, Val) static_cast (Val) +# define YY_REINTERPRET_CAST(Type, Val) reinterpret_cast (Val) +# else +# define YY_CAST(Type, Val) ((Type) (Val)) +# define YY_REINTERPRET_CAST(Type, Val) ((Type) (Val)) +# endif +# endif +# ifndef YY_NULLPTR +# if defined __cplusplus +# if 201103L <= __cplusplus +# define YY_NULLPTR nullptr +# else +# define YY_NULLPTR 0 +# endif +# else +# define YY_NULLPTR ((void*)0) +# endif +# endif + +/* Use api.header.include to #include this header + instead of duplicating it here. */ +#ifndef YY_YY_SRC_PREPROC_REFER_LABEL_HPP_INCLUDED +# define YY_YY_SRC_PREPROC_REFER_LABEL_HPP_INCLUDED +/* Debug traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif +#if YYDEBUG +extern int yydebug; +#endif + +/* Token kinds. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + enum yytokentype + { + YYEMPTY = -2, + YYEOF = 0, /* "end of file" */ + YYerror = 256, /* error */ + YYUNDEF = 257, /* "invalid token" */ + TOKEN_LETTER = 258, /* TOKEN_LETTER */ + TOKEN_LITERAL = 259, /* TOKEN_LITERAL */ + TOKEN_DIGIT = 260 /* TOKEN_DIGIT */ + }; + typedef enum yytokentype yytoken_kind_t; +#endif +/* Token kinds. */ +#define YYEMPTY -2 +#define YYEOF 0 +#define YYerror 256 +#define YYUNDEF 257 +#define TOKEN_LETTER 258 +#define TOKEN_LITERAL 259 +#define TOKEN_DIGIT 260 + +/* Value type. */ +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +union YYSTYPE +{ +#line 218 "../src/preproc/refer/label.ypp" + + int num; + expression *expr; + struct { int ndigits; int val; } dig; + struct { int start; int len; } str; + +#line 340 "src/preproc/refer/label.cpp" + +}; +typedef union YYSTYPE YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define YYSTYPE_IS_DECLARED 1 +#endif + + +extern YYSTYPE yylval; + + +int yyparse (void); + + +#endif /* !YY_YY_SRC_PREPROC_REFER_LABEL_HPP_INCLUDED */ +/* Symbol kind. */ +enum yysymbol_kind_t +{ + YYSYMBOL_YYEMPTY = -2, + YYSYMBOL_YYEOF = 0, /* "end of file" */ + YYSYMBOL_YYerror = 1, /* error */ + YYSYMBOL_YYUNDEF = 2, /* "invalid token" */ + YYSYMBOL_TOKEN_LETTER = 3, /* TOKEN_LETTER */ + YYSYMBOL_TOKEN_LITERAL = 4, /* TOKEN_LITERAL */ + YYSYMBOL_TOKEN_DIGIT = 5, /* TOKEN_DIGIT */ + YYSYMBOL_6_ = 6, /* '?' */ + YYSYMBOL_7_ = 7, /* ':' */ + YYSYMBOL_8_ = 8, /* '|' */ + YYSYMBOL_9_ = 9, /* '&' */ + YYSYMBOL_10_ = 10, /* '~' */ + YYSYMBOL_11_ = 11, /* '@' */ + YYSYMBOL_12_ = 12, /* '%' */ + YYSYMBOL_13_ = 13, /* '.' */ + YYSYMBOL_14_ = 14, /* '+' */ + YYSYMBOL_15_ = 15, /* '-' */ + YYSYMBOL_16_ = 16, /* '*' */ + YYSYMBOL_17_ = 17, /* '(' */ + YYSYMBOL_18_ = 18, /* ')' */ + YYSYMBOL_19_ = 19, /* '<' */ + YYSYMBOL_20_ = 20, /* '>' */ + YYSYMBOL_YYACCEPT = 21, /* $accept */ + YYSYMBOL_expr = 22, /* expr */ + YYSYMBOL_conditional = 23, /* conditional */ + YYSYMBOL_optional_conditional = 24, /* optional_conditional */ + YYSYMBOL_alternative = 25, /* alternative */ + YYSYMBOL_list = 26, /* list */ + YYSYMBOL_substitute = 27, /* substitute */ + YYSYMBOL_string = 28, /* string */ + YYSYMBOL_optional_number = 29, /* optional_number */ + YYSYMBOL_number = 30, /* number */ + YYSYMBOL_digits = 31, /* digits */ + YYSYMBOL_flag = 32 /* flag */ +}; +typedef enum yysymbol_kind_t yysymbol_kind_t; + + + + +#ifdef short +# undef short +#endif + +/* On compilers that do not define __PTRDIFF_MAX__ etc., make sure + and (if available) are included + so that the code can choose integer types of a good width. */ + +#ifndef __PTRDIFF_MAX__ +# include /* INFRINGES ON USER NAME SPACE */ +# if defined __STDC_VERSION__ && 199901 <= __STDC_VERSION__ +# include /* INFRINGES ON USER NAME SPACE */ +# define YY_STDINT_H +# endif +#endif + +/* Narrow types that promote to a signed type and that can represent a + signed or unsigned integer of at least N bits. In tables they can + save space and decrease cache pressure. Promoting to a signed type + helps avoid bugs in integer arithmetic. */ + +#ifdef __INT_LEAST8_MAX__ +typedef __INT_LEAST8_TYPE__ yytype_int8; +#elif defined YY_STDINT_H +typedef int_least8_t yytype_int8; +#else +typedef signed char yytype_int8; +#endif + +#ifdef __INT_LEAST16_MAX__ +typedef __INT_LEAST16_TYPE__ yytype_int16; +#elif defined YY_STDINT_H +typedef int_least16_t yytype_int16; +#else +typedef short yytype_int16; +#endif + +/* Work around bug in HP-UX 11.23, which defines these macros + incorrectly for preprocessor constants. This workaround can likely + be removed in 2023, as HPE has promised support for HP-UX 11.23 + (aka HP-UX 11i v2) only through the end of 2022; see Table 2 of + . */ +#ifdef __hpux +# undef UINT_LEAST8_MAX +# undef UINT_LEAST16_MAX +# define UINT_LEAST8_MAX 255 +# define UINT_LEAST16_MAX 65535 +#endif + +#if defined __UINT_LEAST8_MAX__ && __UINT_LEAST8_MAX__ <= __INT_MAX__ +typedef __UINT_LEAST8_TYPE__ yytype_uint8; +#elif (!defined __UINT_LEAST8_MAX__ && defined YY_STDINT_H \ + && UINT_LEAST8_MAX <= INT_MAX) +typedef uint_least8_t yytype_uint8; +#elif !defined __UINT_LEAST8_MAX__ && UCHAR_MAX <= INT_MAX +typedef unsigned char yytype_uint8; +#else +typedef short yytype_uint8; +#endif + +#if defined __UINT_LEAST16_MAX__ && __UINT_LEAST16_MAX__ <= __INT_MAX__ +typedef __UINT_LEAST16_TYPE__ yytype_uint16; +#elif (!defined __UINT_LEAST16_MAX__ && defined YY_STDINT_H \ + && UINT_LEAST16_MAX <= INT_MAX) +typedef uint_least16_t yytype_uint16; +#elif !defined __UINT_LEAST16_MAX__ && USHRT_MAX <= INT_MAX +typedef unsigned short yytype_uint16; +#else +typedef int yytype_uint16; +#endif + +#ifndef YYPTRDIFF_T +# if defined __PTRDIFF_TYPE__ && defined __PTRDIFF_MAX__ +# define YYPTRDIFF_T __PTRDIFF_TYPE__ +# define YYPTRDIFF_MAXIMUM __PTRDIFF_MAX__ +# elif defined PTRDIFF_MAX +# ifndef ptrdiff_t +# include /* INFRINGES ON USER NAME SPACE */ +# endif +# define YYPTRDIFF_T ptrdiff_t +# define YYPTRDIFF_MAXIMUM PTRDIFF_MAX +# else +# define YYPTRDIFF_T long +# define YYPTRDIFF_MAXIMUM LONG_MAX +# endif +#endif + +#ifndef YYSIZE_T +# ifdef __SIZE_TYPE__ +# define YYSIZE_T __SIZE_TYPE__ +# elif defined size_t +# define YYSIZE_T size_t +# elif defined __STDC_VERSION__ && 199901 <= __STDC_VERSION__ +# include /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned +# endif +#endif + +#define YYSIZE_MAXIMUM \ + YY_CAST (YYPTRDIFF_T, \ + (YYPTRDIFF_MAXIMUM < YY_CAST (YYSIZE_T, -1) \ + ? YYPTRDIFF_MAXIMUM \ + : YY_CAST (YYSIZE_T, -1))) + +#define YYSIZEOF(X) YY_CAST (YYPTRDIFF_T, sizeof (X)) + + +/* Stored state numbers (used for stacks). */ +typedef yytype_int8 yy_state_t; + +/* State numbers in computations. */ +typedef int yy_state_fast_t; + +#ifndef YY_ +# if defined YYENABLE_NLS && YYENABLE_NLS +# if ENABLE_NLS +# include /* INFRINGES ON USER NAME SPACE */ +# define YY_(Msgid) dgettext ("bison-runtime", Msgid) +# endif +# endif +# ifndef YY_ +# define YY_(Msgid) Msgid +# endif +#endif + + +#ifndef YY_ATTRIBUTE_PURE +# if defined __GNUC__ && 2 < __GNUC__ + (96 <= __GNUC_MINOR__) +# define YY_ATTRIBUTE_PURE __attribute__ ((__pure__)) +# else +# define YY_ATTRIBUTE_PURE +# endif +#endif + +#ifndef YY_ATTRIBUTE_UNUSED +# if defined __GNUC__ && 2 < __GNUC__ + (7 <= __GNUC_MINOR__) +# define YY_ATTRIBUTE_UNUSED __attribute__ ((__unused__)) +# else +# define YY_ATTRIBUTE_UNUSED +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YY_USE(E) ((void) (E)) +#else +# define YY_USE(E) /* empty */ +#endif + +/* Suppress an incorrect diagnostic about yylval being uninitialized. */ +#if defined __GNUC__ && ! defined __ICC && 406 <= __GNUC__ * 100 + __GNUC_MINOR__ +# if __GNUC__ * 100 + __GNUC_MINOR__ < 407 +# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wuninitialized\"") +# else +# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wuninitialized\"") \ + _Pragma ("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") +# endif +# define YY_IGNORE_MAYBE_UNINITIALIZED_END \ + _Pragma ("GCC diagnostic pop") +#else +# define YY_INITIAL_VALUE(Value) Value +#endif +#ifndef YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN +# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN +# define YY_IGNORE_MAYBE_UNINITIALIZED_END +#endif +#ifndef YY_INITIAL_VALUE +# define YY_INITIAL_VALUE(Value) /* Nothing. */ +#endif + +#if defined __cplusplus && defined __GNUC__ && ! defined __ICC && 6 <= __GNUC__ +# define YY_IGNORE_USELESS_CAST_BEGIN \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wuseless-cast\"") +# define YY_IGNORE_USELESS_CAST_END \ + _Pragma ("GCC diagnostic pop") +#endif +#ifndef YY_IGNORE_USELESS_CAST_BEGIN +# define YY_IGNORE_USELESS_CAST_BEGIN +# define YY_IGNORE_USELESS_CAST_END +#endif + + +#define YY_ASSERT(E) ((void) (0 && (E))) + +#if !defined yyoverflow + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include /* INFRINGES ON USER NAME SPACE */ +# elif defined _AIX +# define YYSTACK_ALLOC __alloca +# elif defined _MSC_VER +# include /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS +# include /* INFRINGES ON USER NAME SPACE */ + /* Use EXIT_SUCCESS as a witness for stdlib.h. */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's 'empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (0) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined EXIT_SUCCESS \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined EXIT_SUCCESS +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined EXIT_SUCCESS +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* !defined yyoverflow */ + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + yy_state_t yyss_alloc; + YYSTYPE yyvs_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (YYSIZEOF (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (YYSIZEOF (yy_state_t) + YYSIZEOF (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +# define YYCOPY_NEEDED 1 + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYPTRDIFF_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * YYSIZEOF (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / YYSIZEOF (*yyptr); \ + } \ + while (0) + +#endif + +#if defined YYCOPY_NEEDED && YYCOPY_NEEDED +/* Copy COUNT objects from SRC to DST. The source and destination do + not overlap. */ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(Dst, Src, Count) \ + __builtin_memcpy (Dst, Src, YY_CAST (YYSIZE_T, (Count)) * sizeof (*(Src))) +# else +# define YYCOPY(Dst, Src, Count) \ + do \ + { \ + YYPTRDIFF_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (Dst)[yyi] = (Src)[yyi]; \ + } \ + while (0) +# endif +# endif +#endif /* !YYCOPY_NEEDED */ + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 21 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 39 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 21 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 12 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 34 +/* YYNSTATES -- Number of states. */ +#define YYNSTATES 49 + +/* YYMAXUTOK -- Last valid token kind. */ +#define YYMAXUTOK 260 + + +/* YYTRANSLATE(TOKEN-NUM) -- Symbol number corresponding to TOKEN-NUM + as returned by yylex, with out-of-bounds checking. */ +#define YYTRANSLATE(YYX) \ + (0 <= (YYX) && (YYX) <= YYMAXUTOK \ + ? YY_CAST (yysymbol_kind_t, yytranslate[YYX]) \ + : YYSYMBOL_YYUNDEF) + +/* YYTRANSLATE[TOKEN-NUM] -- Symbol number corresponding to TOKEN-NUM + as returned by yylex. */ +static const yytype_int8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 12, 9, 2, + 17, 18, 16, 14, 2, 15, 13, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 7, 2, + 19, 2, 20, 6, 11, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 8, 2, 10, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5 +}; + +#if YYDEBUG +/* YYRLINE[YYN] -- Source line where rule number YYN was defined. */ +static const yytype_int16 yyrline[] = +{ + 0, 246, 246, 251, 253, 259, 260, 265, 267, 269, + 274, 276, 281, 283, 288, 290, 295, 297, 299, 315, + 319, 350, 352, 354, 356, 358, 364, 365, 370, 372, + 377, 379, 386, 387, 389 +}; +#endif + +/** Accessing symbol of state STATE. */ +#define YY_ACCESSING_SYMBOL(State) YY_CAST (yysymbol_kind_t, yystos[State]) + +#if YYDEBUG || 0 +/* The user-facing name of the symbol whose (internal) number is + YYSYMBOL. No bounds checking. */ +static const char *yysymbol_name (yysymbol_kind_t yysymbol) YY_ATTRIBUTE_UNUSED; + +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. */ +static const char *const yytname[] = +{ + "\"end of file\"", "error", "\"invalid token\"", "TOKEN_LETTER", + "TOKEN_LITERAL", "TOKEN_DIGIT", "'?'", "':'", "'|'", "'&'", "'~'", "'@'", + "'%'", "'.'", "'+'", "'-'", "'*'", "'('", "')'", "'<'", "'>'", "$accept", + "expr", "conditional", "optional_conditional", "alternative", "list", + "substitute", "string", "optional_number", "number", "digits", "flag", YY_NULLPTR +}; + +static const char * +yysymbol_name (yysymbol_kind_t yysymbol) +{ + return yytname[yysymbol]; +} +#endif + +#define YYPACT_NINF (-26) + +#define yypact_value_is_default(Yyn) \ + ((Yyn) == YYPACT_NINF) + +#define YYTABLE_NINF (-1) + +#define yytable_value_is_error(Yyn) \ + 0 + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +static const yytype_int8 yypact[] = +{ + 2, 11, -26, -26, 12, 2, 2, 24, -26, -26, + 21, 2, 18, -6, -26, 26, -26, -26, 27, 15, + 14, -26, 2, 2, 2, 18, 2, -3, 11, 11, + -26, -26, -26, -26, -26, 28, 2, 2, -6, -26, + -26, 33, 26, 26, 2, 11, -26, -26, 26 +}; + +/* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM. + Performed when YYTABLE does not specify something else to do. Zero + means the default is an error. */ +static const yytype_int8 yydefact[] = +{ + 5, 16, 15, 14, 0, 5, 5, 0, 6, 2, + 3, 7, 10, 12, 28, 17, 18, 30, 19, 0, + 0, 1, 5, 0, 0, 11, 0, 32, 0, 0, + 23, 29, 31, 24, 25, 0, 8, 9, 13, 33, + 34, 0, 21, 22, 0, 26, 4, 20, 27 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const yytype_int8 yypgoto[] = +{ + -26, -26, -7, -4, -26, -1, -11, 13, -26, -25, + -26, -26 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int8 yydefgoto[] = +{ + 0, 7, 8, 9, 10, 11, 12, 13, 47, 15, + 18, 41 +}; + +/* YYTABLE[YYPACT[STATE-NUM]] -- What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule whose + number is the opposite. If YYTABLE_NINF, syntax error. */ +static const yytype_int8 yytable[] = +{ + 25, 19, 20, 42, 43, 1, 2, 27, 28, 29, + 30, 39, 40, 3, 4, 16, 14, 17, 35, 5, + 48, 6, 36, 37, 21, 25, 25, 22, 26, 23, + 24, 31, 32, 33, 34, 44, 45, 46, 0, 38 +}; + +static const yytype_int8 yycheck[] = +{ + 11, 5, 6, 28, 29, 3, 4, 13, 14, 15, + 16, 14, 15, 11, 12, 3, 5, 5, 22, 17, + 45, 19, 23, 24, 0, 36, 37, 6, 10, 8, + 9, 5, 5, 18, 20, 7, 3, 44, -1, 26 +}; + +/* YYSTOS[STATE-NUM] -- The symbol kind of the accessing symbol of + state STATE-NUM. */ +static const yytype_int8 yystos[] = +{ + 0, 3, 4, 11, 12, 17, 19, 22, 23, 24, + 25, 26, 27, 28, 5, 30, 3, 5, 31, 24, + 24, 0, 6, 8, 9, 27, 10, 13, 14, 15, + 16, 5, 5, 18, 20, 24, 26, 26, 28, 14, + 15, 32, 30, 30, 7, 3, 23, 29, 30 +}; + +/* YYR1[RULE-NUM] -- Symbol kind of the left-hand side of rule RULE-NUM. */ +static const yytype_int8 yyr1[] = +{ + 0, 21, 22, 23, 23, 24, 24, 25, 25, 25, + 26, 26, 27, 27, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 29, 29, 30, 30, + 31, 31, 32, 32, 32 +}; + +/* YYR2[RULE-NUM] -- Number of symbols on the right-hand side of rule RULE-NUM. */ +static const yytype_int8 yyr2[] = +{ + 0, 2, 1, 1, 5, 0, 1, 1, 3, 3, + 1, 2, 1, 3, 1, 1, 1, 2, 2, 2, + 5, 3, 3, 2, 3, 3, 0, 1, 1, 2, + 1, 2, 0, 1, 1 +}; + + +enum { YYENOMEM = -2 }; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab +#define YYNOMEM goto yyexhaustedlab + + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ + do \ + if (yychar == YYEMPTY) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + YYPOPSTACK (yylen); \ + yystate = *yyssp; \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ + while (0) + +/* Backward compatibility with an undocumented macro. + Use YYerror or YYUNDEF. */ +#define YYERRCODE YYUNDEF + + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (0) + + + + +# define YY_SYMBOL_PRINT(Title, Kind, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Kind, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (0) + + +/*-----------------------------------. +| Print this symbol's value on YYO. | +`-----------------------------------*/ + +static void +yy_symbol_value_print (FILE *yyo, + yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep) +{ + FILE *yyoutput = yyo; + YY_USE (yyoutput); + if (!yyvaluep) + return; + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + YY_USE (yykind); + YY_IGNORE_MAYBE_UNINITIALIZED_END +} + + +/*---------------------------. +| Print this symbol on YYO. | +`---------------------------*/ + +static void +yy_symbol_print (FILE *yyo, + yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep) +{ + YYFPRINTF (yyo, "%s %s (", + yykind < YYNTOKENS ? "token" : "nterm", yysymbol_name (yykind)); + + yy_symbol_value_print (yyo, yykind, yyvaluep); + YYFPRINTF (yyo, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +static void +yy_stack_print (yy_state_t *yybottom, yy_state_t *yytop) +{ + YYFPRINTF (stderr, "Stack now"); + for (; yybottom <= yytop; yybottom++) + { + int yybot = *yybottom; + YYFPRINTF (stderr, " %d", yybot); + } + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (0) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +static void +yy_reduce_print (yy_state_t *yyssp, YYSTYPE *yyvsp, + int yyrule) +{ + int yylno = yyrline[yyrule]; + int yynrhs = yyr2[yyrule]; + int yyi; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %d):\n", + yyrule - 1, yylno); + /* The symbols being reduced. */ + for (yyi = 0; yyi < yynrhs; yyi++) + { + YYFPRINTF (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, + YY_ACCESSING_SYMBOL (+yyssp[yyi + 1 - yynrhs]), + &yyvsp[(yyi + 1) - (yynrhs)]); + YYFPRINTF (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyssp, yyvsp, Rule); \ +} while (0) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) ((void) 0) +# define YY_SYMBOL_PRINT(Title, Kind, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. */ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + + + + + +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +static void +yydestruct (const char *yymsg, + yysymbol_kind_t yykind, YYSTYPE *yyvaluep) +{ + YY_USE (yyvaluep); + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yykind, yyvaluep, yylocationp); + + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + YY_USE (yykind); + YY_IGNORE_MAYBE_UNINITIALIZED_END +} + + +/* Lookahead token kind. */ +int yychar; + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval; +/* Number of syntax errors so far. */ +int yynerrs; + + + + +/*----------. +| yyparse. | +`----------*/ + +int +yyparse (void) +{ + yy_state_fast_t yystate = 0; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus = 0; + + /* Refer to the stacks through separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* Their size. */ + YYPTRDIFF_T yystacksize = YYINITDEPTH; + + /* The state stack: array, bottom, top. */ + yy_state_t yyssa[YYINITDEPTH]; + yy_state_t *yyss = yyssa; + yy_state_t *yyssp = yyss; + + /* The semantic value stack: array, bottom, top. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs = yyvsa; + YYSTYPE *yyvsp = yyvs; + + int yyn; + /* The return value of yyparse. */ + int yyresult; + /* Lookahead symbol kind. */ + yysymbol_kind_t yytoken = YYSYMBOL_YYEMPTY; + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + + + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N)) + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yychar = YYEMPTY; /* Cause a token to be read. */ + + goto yysetstate; + + +/*------------------------------------------------------------. +| yynewstate -- push a new state, which is found in yystate. | +`------------------------------------------------------------*/ +yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + +/*--------------------------------------------------------------------. +| yysetstate -- set current state (the top of the stack) to yystate. | +`--------------------------------------------------------------------*/ +yysetstate: + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + YY_ASSERT (0 <= yystate && yystate < YYNSTATES); + YY_IGNORE_USELESS_CAST_BEGIN + *yyssp = YY_CAST (yy_state_t, yystate); + YY_IGNORE_USELESS_CAST_END + YY_STACK_PRINT (yyss, yyssp); + + if (yyss + yystacksize - 1 <= yyssp) +#if !defined yyoverflow && !defined YYSTACK_RELOCATE + YYNOMEM; +#else + { + /* Get the current used size of the three stacks, in elements. */ + YYPTRDIFF_T yysize = yyssp - yyss + 1; + +# if defined yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + yy_state_t *yyss1 = yyss; + YYSTYPE *yyvs1 = yyvs; + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * YYSIZEOF (*yyssp), + &yyvs1, yysize * YYSIZEOF (*yyvsp), + &yystacksize); + yyss = yyss1; + yyvs = yyvs1; + } +# else /* defined YYSTACK_RELOCATE */ + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + YYNOMEM; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yy_state_t *yyss1 = yyss; + union yyalloc *yyptr = + YY_CAST (union yyalloc *, + YYSTACK_ALLOC (YY_CAST (YYSIZE_T, YYSTACK_BYTES (yystacksize)))); + if (! yyptr) + YYNOMEM; + YYSTACK_RELOCATE (yyss_alloc, yyss); + YYSTACK_RELOCATE (yyvs_alloc, yyvs); +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + YY_IGNORE_USELESS_CAST_BEGIN + YYDPRINTF ((stderr, "Stack size increased to %ld\n", + YY_CAST (long, yystacksize))); + YY_IGNORE_USELESS_CAST_END + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } +#endif /* !defined yyoverflow && !defined YYSTACK_RELOCATE */ + + + if (yystate == YYFINAL) + YYACCEPT; + + goto yybackup; + + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + /* Do appropriate processing given the current state. Read a + lookahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to lookahead token. */ + yyn = yypact[yystate]; + if (yypact_value_is_default (yyn)) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. */ + + /* YYCHAR is either empty, or end-of-input, or a valid lookahead. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token\n")); + yychar = yylex (); + } + + if (yychar <= YYEOF) + { + yychar = YYEOF; + yytoken = YYSYMBOL_YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else if (yychar == YYerror) + { + /* The scanner already issued an error message, process directly + to error recovery. But do not keep the error token as + lookahead, it is too special and may lead us to an endless + loop in error recovery. */ + yychar = YYUNDEF; + yytoken = YYSYMBOL_YYerror; + goto yyerrlab1; + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yytable_value_is_error (yyn)) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the lookahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + yystate = yyn; + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + *++yyvsp = yylval; + YY_IGNORE_MAYBE_UNINITIALIZED_END + + /* Discard the shifted token. */ + yychar = YYEMPTY; + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + '$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. */ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 2: /* expr: optional_conditional */ +#line 247 "../src/preproc/refer/label.ypp" + { parse_result = ((yyvsp[0].expr) ? new analyzed_expr((yyvsp[0].expr)) : 0); } +#line 1370 "src/preproc/refer/label.cpp" + break; + + case 3: /* conditional: alternative */ +#line 252 "../src/preproc/refer/label.ypp" + { (yyval.expr) = (yyvsp[0].expr); } +#line 1376 "src/preproc/refer/label.cpp" + break; + + case 4: /* conditional: alternative '?' optional_conditional ':' conditional */ +#line 254 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new conditional_expr((yyvsp[-4].expr), (yyvsp[-2].expr), (yyvsp[0].expr)); } +#line 1382 "src/preproc/refer/label.cpp" + break; + + case 5: /* optional_conditional: %empty */ +#line 259 "../src/preproc/refer/label.ypp" + { (yyval.expr) = 0; } +#line 1388 "src/preproc/refer/label.cpp" + break; + + case 6: /* optional_conditional: conditional */ +#line 261 "../src/preproc/refer/label.ypp" + { (yyval.expr) = (yyvsp[0].expr); } +#line 1394 "src/preproc/refer/label.cpp" + break; + + case 7: /* alternative: list */ +#line 266 "../src/preproc/refer/label.ypp" + { (yyval.expr) = (yyvsp[0].expr); } +#line 1400 "src/preproc/refer/label.cpp" + break; + + case 8: /* alternative: alternative '|' list */ +#line 268 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new alternative_expr((yyvsp[-2].expr), (yyvsp[0].expr)); } +#line 1406 "src/preproc/refer/label.cpp" + break; + + case 9: /* alternative: alternative '&' list */ +#line 270 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new conditional_expr((yyvsp[-2].expr), (yyvsp[0].expr), 0); } +#line 1412 "src/preproc/refer/label.cpp" + break; + + case 10: /* list: substitute */ +#line 275 "../src/preproc/refer/label.ypp" + { (yyval.expr) = (yyvsp[0].expr); } +#line 1418 "src/preproc/refer/label.cpp" + break; + + case 11: /* list: list substitute */ +#line 277 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new list_expr((yyvsp[-1].expr), (yyvsp[0].expr)); } +#line 1424 "src/preproc/refer/label.cpp" + break; + + case 12: /* substitute: string */ +#line 282 "../src/preproc/refer/label.ypp" + { (yyval.expr) = (yyvsp[0].expr); } +#line 1430 "src/preproc/refer/label.cpp" + break; + + case 13: /* substitute: substitute '~' string */ +#line 284 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new substitute_expr((yyvsp[-2].expr), (yyvsp[0].expr)); } +#line 1436 "src/preproc/refer/label.cpp" + break; + + case 14: /* string: '@' */ +#line 289 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new at_expr; } +#line 1442 "src/preproc/refer/label.cpp" + break; + + case 15: /* string: TOKEN_LITERAL */ +#line 291 "../src/preproc/refer/label.ypp" + { + (yyval.expr) = new literal_expr(literals.contents() + (yyvsp[0].str).start, + (yyvsp[0].str).len); + } +#line 1451 "src/preproc/refer/label.cpp" + break; + + case 16: /* string: TOKEN_LETTER */ +#line 296 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new field_expr((yyvsp[0].num), 0); } +#line 1457 "src/preproc/refer/label.cpp" + break; + + case 17: /* string: TOKEN_LETTER number */ +#line 298 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new field_expr((yyvsp[-1].num), (yyvsp[0].num) - 1); } +#line 1463 "src/preproc/refer/label.cpp" + break; + + case 18: /* string: '%' TOKEN_LETTER */ +#line 300 "../src/preproc/refer/label.ypp" + { + switch ((yyvsp[0].num)) { + case 'I': + case 'i': + case 'A': + case 'a': + (yyval.expr) = new format_expr((yyvsp[0].num)); + break; + default: + command_error("unrecognized format '%1'", char((yyvsp[0].num))); + (yyval.expr) = new format_expr('a'); + break; + } + } +#line 1482 "src/preproc/refer/label.cpp" + break; + + case 19: /* string: '%' digits */ +#line 316 "../src/preproc/refer/label.ypp" + { + (yyval.expr) = new format_expr('0', (yyvsp[0].dig).ndigits, (yyvsp[0].dig).val); + } +#line 1490 "src/preproc/refer/label.cpp" + break; + + case 20: /* string: string '.' flag TOKEN_LETTER optional_number */ +#line 320 "../src/preproc/refer/label.ypp" + { + switch ((yyvsp[-1].num)) { + case 'l': + (yyval.expr) = new map_expr((yyvsp[-4].expr), lowercase); + break; + case 'u': + (yyval.expr) = new map_expr((yyvsp[-4].expr), uppercase); + break; + case 'c': + (yyval.expr) = new map_expr((yyvsp[-4].expr), capitalize); + break; + case 'r': + (yyval.expr) = new map_expr((yyvsp[-4].expr), reverse_name); + break; + case 'a': + (yyval.expr) = new map_expr((yyvsp[-4].expr), abbreviate_name); + break; + case 'y': + (yyval.expr) = new extractor_expr((yyvsp[-4].expr), find_year, (yyvsp[-2].num)); + break; + case 'n': + (yyval.expr) = new extractor_expr((yyvsp[-4].expr), find_last_name, (yyvsp[-2].num)); + break; + default: + (yyval.expr) = (yyvsp[-4].expr); + command_error("unknown function '%1'", char((yyvsp[-1].num))); + break; + } + } +#line 1524 "src/preproc/refer/label.cpp" + break; + + case 21: /* string: string '+' number */ +#line 351 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new truncate_expr((yyvsp[-2].expr), (yyvsp[0].num)); } +#line 1530 "src/preproc/refer/label.cpp" + break; + + case 22: /* string: string '-' number */ +#line 353 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new truncate_expr((yyvsp[-2].expr), -(yyvsp[0].num)); } +#line 1536 "src/preproc/refer/label.cpp" + break; + + case 23: /* string: string '*' */ +#line 355 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new star_expr((yyvsp[-1].expr)); } +#line 1542 "src/preproc/refer/label.cpp" + break; + + case 24: /* string: '(' optional_conditional ')' */ +#line 357 "../src/preproc/refer/label.ypp" + { (yyval.expr) = (yyvsp[-1].expr); } +#line 1548 "src/preproc/refer/label.cpp" + break; + + case 25: /* string: '<' optional_conditional '>' */ +#line 359 "../src/preproc/refer/label.ypp" + { (yyval.expr) = new separator_expr((yyvsp[-1].expr)); } +#line 1554 "src/preproc/refer/label.cpp" + break; + + case 26: /* optional_number: %empty */ +#line 364 "../src/preproc/refer/label.ypp" + { (yyval.num) = -1; } +#line 1560 "src/preproc/refer/label.cpp" + break; + + case 27: /* optional_number: number */ +#line 366 "../src/preproc/refer/label.ypp" + { (yyval.num) = (yyvsp[0].num); } +#line 1566 "src/preproc/refer/label.cpp" + break; + + case 28: /* number: TOKEN_DIGIT */ +#line 371 "../src/preproc/refer/label.ypp" + { (yyval.num) = (yyvsp[0].num); } +#line 1572 "src/preproc/refer/label.cpp" + break; + + case 29: /* number: number TOKEN_DIGIT */ +#line 373 "../src/preproc/refer/label.ypp" + { (yyval.num) = (yyvsp[-1].num)*10 + (yyvsp[0].num); } +#line 1578 "src/preproc/refer/label.cpp" + break; + + case 30: /* digits: TOKEN_DIGIT */ +#line 378 "../src/preproc/refer/label.ypp" + { (yyval.dig).ndigits = 1; (yyval.dig).val = (yyvsp[0].num); } +#line 1584 "src/preproc/refer/label.cpp" + break; + + case 31: /* digits: digits TOKEN_DIGIT */ +#line 380 "../src/preproc/refer/label.ypp" + { (yyval.dig).ndigits = (yyvsp[-1].dig).ndigits + 1; (yyval.dig).val = (yyvsp[-1].dig).val*10 + (yyvsp[0].num); } +#line 1590 "src/preproc/refer/label.cpp" + break; + + case 32: /* flag: %empty */ +#line 386 "../src/preproc/refer/label.ypp" + { (yyval.num) = 0; } +#line 1596 "src/preproc/refer/label.cpp" + break; + + case 33: /* flag: '+' */ +#line 388 "../src/preproc/refer/label.ypp" + { (yyval.num) = 1; } +#line 1602 "src/preproc/refer/label.cpp" + break; + + case 34: /* flag: '-' */ +#line 390 "../src/preproc/refer/label.ypp" + { (yyval.num) = -1; } +#line 1608 "src/preproc/refer/label.cpp" + break; + + +#line 1612 "src/preproc/refer/label.cpp" + + default: break; + } + /* User semantic actions sometimes alter yychar, and that requires + that yytoken be updated with the new translation. We take the + approach of translating immediately before every use of yytoken. + One alternative is translating here after every semantic action, + but that translation would be missed if the semantic action invokes + YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or + if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an + incorrect destructor might then be invoked immediately. In the + case of YYERROR or YYBACKUP, subsequent parser actions might lead + to an incorrect destructor call or verbose syntax error message + before the lookahead is translated. */ + YY_SYMBOL_PRINT ("-> $$ =", YY_CAST (yysymbol_kind_t, yyr1[yyn]), &yyval, &yyloc); + + YYPOPSTACK (yylen); + yylen = 0; + + *++yyvsp = yyval; + + /* Now 'shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + { + const int yylhs = yyr1[yyn] - YYNTOKENS; + const int yyi = yypgoto[yylhs] + *yyssp; + yystate = (0 <= yyi && yyi <= YYLAST && yycheck[yyi] == *yyssp + ? yytable[yyi] + : yydefgoto[yylhs]); + } + + goto yynewstate; + + +/*--------------------------------------. +| yyerrlab -- here on detecting error. | +`--------------------------------------*/ +yyerrlab: + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = yychar == YYEMPTY ? YYSYMBOL_YYEMPTY : YYTRANSLATE (yychar); + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; + yyerror (YY_("syntax error")); + } + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + YYABORT; + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + /* Pacify compilers when the user code never invokes YYERROR and the + label yyerrorlab therefore never appears in user code. */ + if (0) + YYERROR; + ++yynerrs; + + /* Do not reclaim the symbols of the rule whose action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + /* Pop stack until we find a state that shifts the error token. */ + for (;;) + { + yyn = yypact[yystate]; + if (!yypact_value_is_default (yyn)) + { + yyn += YYSYMBOL_YYerror; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYSYMBOL_YYerror) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + YYABORT; + + + yydestruct ("Error: popping", + YY_ACCESSING_SYMBOL (yystate), yyvsp); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + *++yyvsp = yylval; + YY_IGNORE_MAYBE_UNINITIALIZED_END + + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", YY_ACCESSING_SYMBOL (yyn), yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturnlab; + + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. | +`-----------------------------------*/ +yyabortlab: + yyresult = 1; + goto yyreturnlab; + + +/*-----------------------------------------------------------. +| yyexhaustedlab -- YYNOMEM (memory exhaustion) comes here. | +`-----------------------------------------------------------*/ +yyexhaustedlab: + yyerror (YY_("memory exhausted")); + yyresult = 2; + goto yyreturnlab; + + +/*----------------------------------------------------------. +| yyreturnlab -- parsing is finished, clean up and return. | +`----------------------------------------------------------*/ +yyreturnlab: + if (yychar != YYEMPTY) + { + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = YYTRANSLATE (yychar); + yydestruct ("Cleanup: discarding lookahead", + yytoken, &yylval); + } + /* Do not reclaim the symbols of the rule whose action triggered + this YYABORT or YYACCEPT. */ + YYPOPSTACK (yylen); + YY_STACK_PRINT (yyss, yyssp); + while (yyssp != yyss) + { + yydestruct ("Cleanup: popping", + YY_ACCESSING_SYMBOL (+*yyssp), yyvsp); + YYPOPSTACK (1); + } +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif + + return yyresult; +} + +#line 393 "../src/preproc/refer/label.ypp" + + +/* bison defines const to be empty unless __STDC__ is defined, which it +isn't under cfront */ + +#ifdef const +#undef const +#endif + +const char *spec_ptr; +const char *spec_end; +const char *spec_cur; + +static char uppercase_array[] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', + 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', + 'Y', 'Z', +}; + +static char lowercase_array[] = { + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', + 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', + 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', + 'y', 'z', +}; + +int yylex() +{ + while (spec_ptr < spec_end && csspace(*spec_ptr)) + spec_ptr++; + spec_cur = spec_ptr; + if (spec_ptr >= spec_end) + return 0; + unsigned char c = *spec_ptr++; + if (csalpha(c)) { + yylval.num = c; + return TOKEN_LETTER; + } + if (csdigit(c)) { + yylval.num = c - '0'; + return TOKEN_DIGIT; + } + if (c == '\'') { + yylval.str.start = literals.length(); + for (; spec_ptr < spec_end; spec_ptr++) { + if (*spec_ptr == '\'') { + if (++spec_ptr < spec_end && *spec_ptr == '\'') + literals += '\''; + else { + yylval.str.len = literals.length() - yylval.str.start; + return TOKEN_LITERAL; + } + } + else + literals += *spec_ptr; + } + yylval.str.len = literals.length() - yylval.str.start; + return TOKEN_LITERAL; + } + return c; +} + +int set_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_label; + parsed_label = parse_result; + return 1; +} + +int set_date_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_date_label; + parsed_date_label = parse_result; + return 1; +} + +int set_short_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_short_label; + parsed_short_label = parse_result; + return 1; +} + +void yyerror(const char *message) +{ + if (spec_cur < spec_end) + command_error("label specification %1 before '%2'", message, spec_cur); + else + command_error("label specification %1 at end of string", + message, spec_cur); +} + +void at_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (tentative) + ref.canonicalize_authors(result); + else { + const char *end, *start = ref.get_authors(&end); + if (start) + result.append(start, end - start); + } +} + +void format_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (tentative) + return; + const label_info *lp = ref.get_label_ptr(); + int num = lp == 0 ? ref.get_number() : lp->count; + if (type != '0') + result += format_serial(type, num + 1); + else { + const char *ptr = i_to_a(num + first_number); + int pad = width - strlen(ptr); + while (--pad >= 0) + result += '0'; + result += ptr; + } +} + +static const char *format_serial(char c, int n) +{ + assert(n > 0); + static char buf[128]; // more than enough. + switch (c) { + case 'i': + case 'I': + { + char *p = buf; + // troff uses z and w to represent 10000 and 5000 in Roman + // numerals; I can find no historical basis for this usage + const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI"; + if (n >= 40000) + return i_to_a(n); + while (n >= 10000) { + *p++ = s[0]; + n -= 10000; + } + for (int i = 1000; i > 0; i /= 10, s += 2) { + int m = n/i; + n -= m*i; + switch (m) { + case 3: + *p++ = s[2]; + /* falls through */ + case 2: + *p++ = s[2]; + /* falls through */ + case 1: + *p++ = s[2]; + break; + case 4: + *p++ = s[2]; + *p++ = s[1]; + break; + case 8: + *p++ = s[1]; + *p++ = s[2]; + *p++ = s[2]; + *p++ = s[2]; + break; + case 7: + *p++ = s[1]; + *p++ = s[2]; + *p++ = s[2]; + break; + case 6: + *p++ = s[1]; + *p++ = s[2]; + break; + case 5: + *p++ = s[1]; + break; + case 9: + *p++ = s[2]; + *p++ = s[0]; + } + } + *p = 0; + break; + } + case 'a': + case 'A': + { + char *p = buf; + // this is derived from troff/reg.c + while (n > 0) { + int d = n % 26; + if (d == 0) + d = 26; + n -= d; + n /= 26; + *p++ = c == 'a' ? lowercase_array[d - 1] : + uppercase_array[d - 1]; + } + *p-- = 0; + // Reverse it. + char *q = buf; + while (q < p) { + char temp = *q; + *q = *p; + *p = temp; + --p; + ++q; + } + break; + } + default: + assert(0); + } + return buf; +} + +void field_expr::evaluate(int, const reference &ref, + string &result, substring_position &) +{ + const char *end; + const char *start = ref.get_field(name, &end); + if (start) { + start = nth_field(number, start, &end); + if (start) + result.append(start, end - start); + } +} + +void literal_expr::evaluate(int, const reference &, + string &result, substring_position &) +{ + result += s; +} + +analyzed_expr::analyzed_expr(expression *e) +: unary_expr(e), flags(e ? e->analyze() : 0) +{ +} + +void analyzed_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + if (expr) + expr->evaluate(tentative, ref, result, pos); +} + +void star_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + const label_info *lp = ref.get_label_ptr(); + if (!tentative + && (lp == 0 || lp->total > 1) + && expr) + expr->evaluate(tentative, ref, result, pos); +} + +void separator_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + int is_first = pos.start < 0; + if (expr) + expr->evaluate(tentative, ref, result, pos); + if (is_first) { + pos.start = start_length; + pos.length = result.length() - start_length; + } +} + +void map_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + (*func)(temp.contents(), temp.contents() + temp.length(), result); + } +} + +void extractor_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + const char *end, *start = (*func)(temp.contents(), + temp.contents() + temp.length(), + &end); + switch (part) { + case BEFORE: + if (start) + result.append(temp.contents(), start - temp.contents()); + else + result += temp; + break; + case MATCH: + if (start) + result.append(start, end - start); + break; + case AFTER: + if (start) + result.append(end, temp.contents() + temp.length() - end); + break; + default: + assert(0); + } + } +} + +static void first_part(int len, const char *ptr, const char *end, + string &result) +{ + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(token_start, ptr); + int counts = ti->sortify_non_empty(token_start, ptr); + if (counts && --len < 0) + break; + if (counts || ti->is_accent()) + result.append(token_start, ptr - token_start); + } +} + +static void last_part(int len, const char *ptr, const char *end, + string &result) +{ + const char *start = ptr; + int count = 0; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(token_start, ptr); + if (ti->sortify_non_empty(token_start, ptr)) + count++; + } + ptr = start; + int skip = count - len; + if (skip > 0) { + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + assert(0); + const token_info *ti = lookup_token(token_start, ptr); + if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) { + ptr = token_start; + break; + } + } + } + first_part(len, ptr, end, result); +} + +void truncate_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + const char *start = temp.contents(); + const char *end = start + temp.length(); + if (n > 0) + first_part(n, start, end, result); + else if (n < 0) + last_part(-n, start, end, result); + } +} + +void alternative_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (result.length() == start_length && expr2) + expr2->evaluate(tentative, ref, result, pos); +} + +void list_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (expr2) + expr2->evaluate(tentative, ref, result, pos); +} + +void substitute_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (result.length() > start_length && result[result.length() - 1] == '-') { + // ought to see if pos covers the - + result.set_length(result.length() - 1); + if (expr2) + expr2->evaluate(tentative, ref, result, pos); + } +} + +void conditional_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + string temp; + substring_position temp_pos; + if (expr1) + expr1->evaluate(tentative, ref, temp, temp_pos); + if (temp.length() > 0) { + if (expr2) + expr2->evaluate(tentative, ref, result, pos); + } + else { + if (expr3) + expr3->evaluate(tentative, ref, result, pos); + } +} + +void reference::pre_compute_label() +{ + if (parsed_label != 0 + && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) { + label.clear(); + substring_position temp_pos; + parsed_label->evaluate(1, *this, label, temp_pos); + label_ptr = lookup_label(label); + } +} + +void reference::compute_label() +{ + label.clear(); + if (parsed_label) + parsed_label->evaluate(0, *this, label, separator_pos); + if (short_label_flag && parsed_short_label) + parsed_short_label->evaluate(0, *this, short_label, short_separator_pos); + if (date_as_label) { + string new_date; + if (parsed_date_label) { + substring_position temp_pos; + parsed_date_label->evaluate(0, *this, new_date, temp_pos); + } + set_date(new_date); + } + if (label_ptr) + label_ptr->count += 1; +} + +void reference::immediate_compute_label() +{ + if (label_ptr) + label_ptr->total = 2; // force use of disambiguator + compute_label(); +} + +int reference::merge_labels(reference **v, int n, label_type type, + string &result) +{ + if (abbreviate_label_ranges) + return merge_labels_by_number(v, n, type, result); + else + return merge_labels_by_parts(v, n, type, result); +} + +int reference::merge_labels_by_number(reference **v, int n, label_type type, + string &result) +{ + if (n <= 1) + return 0; + int num = get_number(); + // Only merge three or more labels. + if (v[0]->get_number() != num + 1 + || v[1]->get_number() != num + 2) + return 0; + int i; + for (i = 2; i < n; i++) + if (v[i]->get_number() != num + i + 1) + break; + result = get_label(type); + result += label_range_indicator; + result += v[i - 1]->get_label(type); + return i; +} + +const substring_position &reference::get_separator_pos(label_type type) const +{ + if (type == SHORT_LABEL && short_label_flag) + return short_separator_pos; + else + return separator_pos; +} + +const string &reference::get_label(label_type type) const +{ + if (type == SHORT_LABEL && short_label_flag) + return short_label; + else + return label; +} + +int reference::merge_labels_by_parts(reference **v, int n, label_type type, + string &result) +{ + if (n <= 0) + return 0; + const string &lb = get_label(type); + const substring_position &sp = get_separator_pos(type); + if (sp.start < 0 + || sp.start != v[0]->get_separator_pos(type).start + || memcmp(lb.contents(), v[0]->get_label(type).contents(), + sp.start) != 0) + return 0; + result = lb; + int i = 0; + do { + result += separate_label_second_parts; + const substring_position &s = v[i]->get_separator_pos(type); + int sep_end_pos = s.start + s.length; + result.append(v[i]->get_label(type).contents() + sep_end_pos, + v[i]->get_label(type).length() - sep_end_pos); + } while (++i < n + && sp.start == v[i]->get_separator_pos(type).start + && memcmp(lb.contents(), v[i]->get_label(type).contents(), + sp.start) == 0); + return i; +} + +string label_pool; + +label_info::label_info(const string &s) +: start(label_pool.length()), length(s.length()), count(0), total(1) +{ + label_pool += s; +} + +static label_info **label_table = 0; +static int label_table_size = 0; +static int label_table_used = 0; + +label_info *lookup_label(const string &label) +{ + if (label_table == 0) { + label_table = new label_info *[17]; + label_table_size = 17; + for (int i = 0; i < 17; i++) + label_table[i] = 0; + } + unsigned h = hash_string(label.contents(), label.length()) % label_table_size; + label_info **ptr; + for (ptr = label_table + h; + *ptr != 0; + (ptr == label_table) + ? (ptr = label_table + label_table_size - 1) + : ptr--) + if ((*ptr)->length == label.length() + && memcmp(label_pool.contents() + (*ptr)->start, label.contents(), + label.length()) == 0) { + (*ptr)->total += 1; + return *ptr; + } + label_info *result = *ptr = new label_info(label); + if (++label_table_used * 2 > label_table_size) { + // Rehash the table. + label_info **old_table = label_table; + int old_size = label_table_size; + label_table_size = next_size(label_table_size); + label_table = new label_info *[label_table_size]; + int i; + for (i = 0; i < label_table_size; i++) + label_table[i] = 0; + for (i = 0; i < old_size; i++) + if (old_table[i]) { + h = hash_string(label_pool.contents() + old_table[i]->start, + old_table[i]->length); + label_info **p; + for (p = label_table + (h % label_table_size); + *p != 0; + (p == label_table) + ? (p = label_table + label_table_size - 1) + : --p) + ; + *p = old_table[i]; + } + delete[] old_table; + } + return result; +} + +void clear_labels() +{ + for (int i = 0; i < label_table_size; i++) { + delete label_table[i]; + label_table[i] = 0; + } + label_table_used = 0; + label_pool.clear(); +} + +static void consider_authors(reference **start, reference **end, int i); + +void compute_labels(reference **v, int n) +{ + if (parsed_label + && (parsed_label->analyze() & expression::CONTAINS_AT) + && sort_fields.length() >= 2 + && sort_fields[0] == 'A' + && sort_fields[1] == '+') + consider_authors(v, v + n, 0); + for (int i = 0; i < n; i++) + v[i]->compute_label(); +} + + +/* A reference with a list of authors _needs_ author i +where 0 <= i <= N if there exists a reference with a list of authors + such that != and M >= i +and Aj = Bj for 0 <= j < i. In this case if we can't say "A0, +A1,...,A(i-1) et al" because this would match both and +. If a reference needs author i we only have to call +need_author(j) for some j >= i such that the reference also needs +author j. */ + +/* This function handles 2 tasks: +determine which authors are needed (cannot be elided with et al.); +determine which authors can have only last names in the labels. + +References >= start and < end have the same first i author names. +Also they're sorted by A+. */ + +static void consider_authors(reference **start, reference **end, int i) +{ + if (start >= end) + return; + reference **p = start; + if (i >= (*p)->get_nauthors()) { + for (++p; p < end && i >= (*p)->get_nauthors(); p++) + ; + if (p < end && i > 0) { + // If we have an author list and an author list , + // then both lists need C. + for (reference **q = start; q < end; q++) + (*q)->need_author(i - 1); + } + start = p; + } + while (p < end) { + reference **last_name_start = p; + reference **name_start = p; + for (++p; + p < end && i < (*p)->get_nauthors() + && same_author_last_name(**last_name_start, **p, i); + p++) { + if (!same_author_name(**name_start, **p, i)) { + consider_authors(name_start, p, i + 1); + name_start = p; + } + } + consider_authors(name_start, p, i + 1); + if (last_name_start == name_start) { + for (reference **q = last_name_start; q < p; q++) + (*q)->set_last_name_unambiguous(i); + } + // If we have an author list and , then the lists + // need author D and E respectively. + if (name_start > start || p < end) { + for (reference **q = last_name_start; q < p; q++) + (*q)->need_author(i); + } + } +} + +int same_author_last_name(const reference &r1, const reference &r2, int n) +{ + const char *ae1; + const char *as1 = r1.get_sort_field(0, n, 0, &ae1); + const char *ae2; + const char *as2 = r2.get_sort_field(0, n, 0, &ae2); + if (!as1 && !as2) return 1; // they are the same + if (!as1 || !as2) return 0; + return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; +} + +int same_author_name(const reference &r1, const reference &r2, int n) +{ + const char *ae1; + const char *as1 = r1.get_sort_field(0, n, -1, &ae1); + const char *ae2; + const char *as2 = r2.get_sort_field(0, n, -1, &ae2); + if (!as1 && !as2) return 1; // they are the same + if (!as1 || !as2) return 0; + return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; +} + + +void int_set::set(int i) +{ + assert(i >= 0); + int bytei = i >> 3; + if (bytei >= v.length()) { + int old_length = v.length(); + v.set_length(bytei + 1); + for (int j = old_length; j <= bytei; j++) + v[j] = 0; + } + v[bytei] |= 1 << (i & 7); +} + +int int_set::get(int i) const +{ + assert(i >= 0); + int bytei = i >> 3; + return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0; +} + +void reference::set_last_name_unambiguous(int i) +{ + last_name_unambiguous.set(i); +} + +void reference::need_author(int n) +{ + if (n > last_needed_author) + last_needed_author = n; +} + +const char *reference::get_authors(const char **end) const +{ + if (!computed_authors) { + ((reference *)this)->computed_authors = 1; + string &result = ((reference *)this)->authors; + int na = get_nauthors(); + result.clear(); + for (int i = 0; i < na; i++) { + if (last_name_unambiguous.get(i)) { + const char *e, *start = get_author_last_name(i, &e); + assert(start != 0); + result.append(start, e - start); + } + else { + const char *e, *start = get_author(i, &e); + assert(start != 0); + result.append(start, e - start); + } + if (i == last_needed_author + && et_al.length() > 0 + && et_al_min_elide > 0 + && last_needed_author + et_al_min_elide < na + && na >= et_al_min_total) { + result += et_al; + break; + } + if (i < na - 1) { + if (na == 2) + result += join_authors_exactly_two; + else if (i < na - 2) + result += join_authors_default; + else + result += join_authors_last_two; + } + } + } + const char *start = authors.contents(); + *end = start + authors.length(); + return start; +} + +int reference::get_nauthors() const +{ + if (nauthors < 0) { + const char *dummy; + int na; + for (na = 0; get_author(na, &dummy) != 0; na++) + ; + ((reference *)this)->nauthors = na; + } + return nauthors; +} + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: diff --git a/src/preproc/refer/label.hpp b/src/preproc/refer/label.hpp new file mode 100644 index 0000000..9f79fd2 --- /dev/null +++ b/src/preproc/refer/label.hpp @@ -0,0 +1,98 @@ +/* A Bison parser, made by GNU Bison 3.8.2. */ + +/* Bison interface for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation, + Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual, + especially those whose name start with YY_ or yy_. They are + private implementation details that can be changed or removed. */ + +#ifndef YY_YY_SRC_PREPROC_REFER_LABEL_HPP_INCLUDED +# define YY_YY_SRC_PREPROC_REFER_LABEL_HPP_INCLUDED +/* Debug traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif +#if YYDEBUG +extern int yydebug; +#endif + +/* Token kinds. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + enum yytokentype + { + YYEMPTY = -2, + YYEOF = 0, /* "end of file" */ + YYerror = 256, /* error */ + YYUNDEF = 257, /* "invalid token" */ + TOKEN_LETTER = 258, /* TOKEN_LETTER */ + TOKEN_LITERAL = 259, /* TOKEN_LITERAL */ + TOKEN_DIGIT = 260 /* TOKEN_DIGIT */ + }; + typedef enum yytokentype yytoken_kind_t; +#endif +/* Token kinds. */ +#define YYEMPTY -2 +#define YYEOF 0 +#define YYerror 256 +#define YYUNDEF 257 +#define TOKEN_LETTER 258 +#define TOKEN_LITERAL 259 +#define TOKEN_DIGIT 260 + +/* Value type. */ +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +union YYSTYPE +{ +#line 218 "../src/preproc/refer/label.ypp" + + int num; + expression *expr; + struct { int ndigits; int val; } dig; + struct { int start; int len; } str; + +#line 84 "src/preproc/refer/label.hpp" + +}; +typedef union YYSTYPE YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define YYSTYPE_IS_DECLARED 1 +#endif + + +extern YYSTYPE yylval; + + +int yyparse (void); + + +#endif /* !YY_YY_SRC_PREPROC_REFER_LABEL_HPP_INCLUDED */ diff --git a/src/preproc/refer/label.ypp b/src/preproc/refer/label.ypp new file mode 100644 index 0000000..f5210d5 --- /dev/null +++ b/src/preproc/refer/label.ypp @@ -0,0 +1,1195 @@ +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +%{ + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" + +int yylex(); +void yyerror(const char *); + +static const char *format_serial(char c, int n); + +struct label_info { + int start; + int length; + int count; + int total; + label_info(const string &); +}; + +label_info *lookup_label(const string &label); + +struct expression { + enum { + // Does the tentative label depend on the reference? + CONTAINS_VARIABLE = 01, + CONTAINS_STAR = 02, + CONTAINS_FORMAT = 04, + CONTAINS_AT = 010 + }; + virtual ~expression() { } + virtual void evaluate(int, const reference &, string &, + substring_position &) = 0; + virtual unsigned analyze() { return 0; } +}; + +class at_expr : public expression { +public: + at_expr() { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; } +}; + +class format_expr : public expression { + char type; + int width; + int first_number; +public: + format_expr(char c, int w = 0, int f = 1) + : type(c), width(w), first_number(f) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_FORMAT; } +}; + +class field_expr : public expression { + int number; + char name; +public: + field_expr(char nm, int num) : number(num), name(nm) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_VARIABLE; } +}; + +class literal_expr : public expression { + string s; +public: + literal_expr(const char *ptr, int len) : s(ptr, len) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class unary_expr : public expression { +protected: + expression *expr; +public: + unary_expr(expression *e) : expr(e) { } + ~unary_expr() { delete expr; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { return expr ? expr->analyze() : 0; } +}; + +// This caches the analysis of an expression. + +class analyzed_expr : public unary_expr { + unsigned flags; +public: + analyzed_expr(expression *); + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return flags; } +}; + +class star_expr : public unary_expr { +public: + star_expr(expression *e) : unary_expr(e) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { + return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0) + | CONTAINS_STAR); + } +}; + +typedef void map_func(const char *, const char *, string &); + +class map_expr : public unary_expr { + map_func *func; +public: + map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +typedef const char *extractor_func(const char *, const char *, const char **); + +class extractor_expr : public unary_expr { + int part; + extractor_func *func; +public: + enum { BEFORE = +1, MATCH = 0, AFTER = -1 }; + extractor_expr(expression *e, extractor_func *f, int pt) + : unary_expr(e), part(pt), func(f) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class truncate_expr : public unary_expr { + int n; +public: + truncate_expr(expression *e, int i) : unary_expr(e), n(i) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class separator_expr : public unary_expr { +public: + separator_expr(expression *e) : unary_expr(e) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class binary_expr : public expression { +protected: + expression *expr1; + expression *expr2; +public: + binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { } + ~binary_expr() { delete expr1; delete expr2; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { + return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0); + } +}; + +class alternative_expr : public binary_expr { +public: + alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class list_expr : public binary_expr { +public: + list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class substitute_expr : public binary_expr { +public: + substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class ternary_expr : public expression { +protected: + expression *expr1; + expression *expr2; + expression *expr3; +public: + ternary_expr(expression *e1, expression *e2, expression *e3) + : expr1(e1), expr2(e2), expr3(e3) { } + ~ternary_expr() { delete expr1; delete expr2; delete expr3; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { + return ((expr1 ? expr1->analyze() : 0) + | (expr2 ? expr2->analyze() : 0) + | (expr3 ? expr3->analyze() : 0)); + } +}; + +class conditional_expr : public ternary_expr { +public: + conditional_expr(expression *e1, expression *e2, expression *e3) + : ternary_expr(e1, e2, e3) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +static expression *parsed_label = 0; +static expression *parsed_date_label = 0; +static expression *parsed_short_label = 0; + +static expression *parse_result; + +string literals; + +%} + +%union { + int num; + expression *expr; + struct { int ndigits; int val; } dig; + struct { int start; int len; } str; +} + +/* uppercase or lowercase letter */ +%token TOKEN_LETTER +/* literal characters */ +%token TOKEN_LITERAL +/* digit */ +%token TOKEN_DIGIT + +%type conditional +%type alternative +%type list +%type string +%type substitute +%type optional_conditional +%type number +%type digits +%type optional_number +%type flag + +%% + +expr: + optional_conditional + { parse_result = ($1 ? new analyzed_expr($1) : 0); } + ; + +conditional: + alternative + { $$ = $1; } + | alternative '?' optional_conditional ':' conditional + { $$ = new conditional_expr($1, $3, $5); } + ; + +optional_conditional: + /* empty */ + { $$ = 0; } + | conditional + { $$ = $1; } + ; + +alternative: + list + { $$ = $1; } + | alternative '|' list + { $$ = new alternative_expr($1, $3); } + | alternative '&' list + { $$ = new conditional_expr($1, $3, 0); } + ; + +list: + substitute + { $$ = $1; } + | list substitute + { $$ = new list_expr($1, $2); } + ; + +substitute: + string + { $$ = $1; } + | substitute '~' string + { $$ = new substitute_expr($1, $3); } + ; + +string: + '@' + { $$ = new at_expr; } + | TOKEN_LITERAL + { + $$ = new literal_expr(literals.contents() + $1.start, + $1.len); + } + | TOKEN_LETTER + { $$ = new field_expr($1, 0); } + | TOKEN_LETTER number + { $$ = new field_expr($1, $2 - 1); } + | '%' TOKEN_LETTER + { + switch ($2) { + case 'I': + case 'i': + case 'A': + case 'a': + $$ = new format_expr($2); + break; + default: + command_error("unrecognized format '%1'", char($2)); + $$ = new format_expr('a'); + break; + } + } + + | '%' digits + { + $$ = new format_expr('0', $2.ndigits, $2.val); + } + | string '.' flag TOKEN_LETTER optional_number + { + switch ($4) { + case 'l': + $$ = new map_expr($1, lowercase); + break; + case 'u': + $$ = new map_expr($1, uppercase); + break; + case 'c': + $$ = new map_expr($1, capitalize); + break; + case 'r': + $$ = new map_expr($1, reverse_name); + break; + case 'a': + $$ = new map_expr($1, abbreviate_name); + break; + case 'y': + $$ = new extractor_expr($1, find_year, $3); + break; + case 'n': + $$ = new extractor_expr($1, find_last_name, $3); + break; + default: + $$ = $1; + command_error("unknown function '%1'", char($4)); + break; + } + } + + | string '+' number + { $$ = new truncate_expr($1, $3); } + | string '-' number + { $$ = new truncate_expr($1, -$3); } + | string '*' + { $$ = new star_expr($1); } + | '(' optional_conditional ')' + { $$ = $2; } + | '<' optional_conditional '>' + { $$ = new separator_expr($2); } + ; + +optional_number: + /* empty */ + { $$ = -1; } + | number + { $$ = $1; } + ; + +number: + TOKEN_DIGIT + { $$ = $1; } + | number TOKEN_DIGIT + { $$ = $1*10 + $2; } + ; + +digits: + TOKEN_DIGIT + { $$.ndigits = 1; $$.val = $1; } + | digits TOKEN_DIGIT + { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; } + ; + + +flag: + /* empty */ + { $$ = 0; } + | '+' + { $$ = 1; } + | '-' + { $$ = -1; } + ; + +%% + +/* bison defines const to be empty unless __STDC__ is defined, which it +isn't under cfront */ + +#ifdef const +#undef const +#endif + +const char *spec_ptr; +const char *spec_end; +const char *spec_cur; + +static char uppercase_array[] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', + 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', + 'Y', 'Z', +}; + +static char lowercase_array[] = { + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', + 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', + 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', + 'y', 'z', +}; + +int yylex() +{ + while (spec_ptr < spec_end && csspace(*spec_ptr)) + spec_ptr++; + spec_cur = spec_ptr; + if (spec_ptr >= spec_end) + return 0; + unsigned char c = *spec_ptr++; + if (csalpha(c)) { + yylval.num = c; + return TOKEN_LETTER; + } + if (csdigit(c)) { + yylval.num = c - '0'; + return TOKEN_DIGIT; + } + if (c == '\'') { + yylval.str.start = literals.length(); + for (; spec_ptr < spec_end; spec_ptr++) { + if (*spec_ptr == '\'') { + if (++spec_ptr < spec_end && *spec_ptr == '\'') + literals += '\''; + else { + yylval.str.len = literals.length() - yylval.str.start; + return TOKEN_LITERAL; + } + } + else + literals += *spec_ptr; + } + yylval.str.len = literals.length() - yylval.str.start; + return TOKEN_LITERAL; + } + return c; +} + +int set_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_label; + parsed_label = parse_result; + return 1; +} + +int set_date_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_date_label; + parsed_date_label = parse_result; + return 1; +} + +int set_short_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_short_label; + parsed_short_label = parse_result; + return 1; +} + +void yyerror(const char *message) +{ + if (spec_cur < spec_end) + command_error("label specification %1 before '%2'", message, spec_cur); + else + command_error("label specification %1 at end of string", + message, spec_cur); +} + +void at_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (tentative) + ref.canonicalize_authors(result); + else { + const char *end, *start = ref.get_authors(&end); + if (start) + result.append(start, end - start); + } +} + +void format_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (tentative) + return; + const label_info *lp = ref.get_label_ptr(); + int num = lp == 0 ? ref.get_number() : lp->count; + if (type != '0') + result += format_serial(type, num + 1); + else { + const char *ptr = i_to_a(num + first_number); + int pad = width - strlen(ptr); + while (--pad >= 0) + result += '0'; + result += ptr; + } +} + +static const char *format_serial(char c, int n) +{ + assert(n > 0); + static char buf[128]; // more than enough. + switch (c) { + case 'i': + case 'I': + { + char *p = buf; + // troff uses z and w to represent 10000 and 5000 in Roman + // numerals; I can find no historical basis for this usage + const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI"; + if (n >= 40000) + return i_to_a(n); + while (n >= 10000) { + *p++ = s[0]; + n -= 10000; + } + for (int i = 1000; i > 0; i /= 10, s += 2) { + int m = n/i; + n -= m*i; + switch (m) { + case 3: + *p++ = s[2]; + /* falls through */ + case 2: + *p++ = s[2]; + /* falls through */ + case 1: + *p++ = s[2]; + break; + case 4: + *p++ = s[2]; + *p++ = s[1]; + break; + case 8: + *p++ = s[1]; + *p++ = s[2]; + *p++ = s[2]; + *p++ = s[2]; + break; + case 7: + *p++ = s[1]; + *p++ = s[2]; + *p++ = s[2]; + break; + case 6: + *p++ = s[1]; + *p++ = s[2]; + break; + case 5: + *p++ = s[1]; + break; + case 9: + *p++ = s[2]; + *p++ = s[0]; + } + } + *p = 0; + break; + } + case 'a': + case 'A': + { + char *p = buf; + // this is derived from troff/reg.c + while (n > 0) { + int d = n % 26; + if (d == 0) + d = 26; + n -= d; + n /= 26; + *p++ = c == 'a' ? lowercase_array[d - 1] : + uppercase_array[d - 1]; + } + *p-- = 0; + // Reverse it. + char *q = buf; + while (q < p) { + char temp = *q; + *q = *p; + *p = temp; + --p; + ++q; + } + break; + } + default: + assert(0); + } + return buf; +} + +void field_expr::evaluate(int, const reference &ref, + string &result, substring_position &) +{ + const char *end; + const char *start = ref.get_field(name, &end); + if (start) { + start = nth_field(number, start, &end); + if (start) + result.append(start, end - start); + } +} + +void literal_expr::evaluate(int, const reference &, + string &result, substring_position &) +{ + result += s; +} + +analyzed_expr::analyzed_expr(expression *e) +: unary_expr(e), flags(e ? e->analyze() : 0) +{ +} + +void analyzed_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + if (expr) + expr->evaluate(tentative, ref, result, pos); +} + +void star_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + const label_info *lp = ref.get_label_ptr(); + if (!tentative + && (lp == 0 || lp->total > 1) + && expr) + expr->evaluate(tentative, ref, result, pos); +} + +void separator_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + int is_first = pos.start < 0; + if (expr) + expr->evaluate(tentative, ref, result, pos); + if (is_first) { + pos.start = start_length; + pos.length = result.length() - start_length; + } +} + +void map_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + (*func)(temp.contents(), temp.contents() + temp.length(), result); + } +} + +void extractor_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + const char *end, *start = (*func)(temp.contents(), + temp.contents() + temp.length(), + &end); + switch (part) { + case BEFORE: + if (start) + result.append(temp.contents(), start - temp.contents()); + else + result += temp; + break; + case MATCH: + if (start) + result.append(start, end - start); + break; + case AFTER: + if (start) + result.append(end, temp.contents() + temp.length() - end); + break; + default: + assert(0); + } + } +} + +static void first_part(int len, const char *ptr, const char *end, + string &result) +{ + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(token_start, ptr); + int counts = ti->sortify_non_empty(token_start, ptr); + if (counts && --len < 0) + break; + if (counts || ti->is_accent()) + result.append(token_start, ptr - token_start); + } +} + +static void last_part(int len, const char *ptr, const char *end, + string &result) +{ + const char *start = ptr; + int count = 0; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(token_start, ptr); + if (ti->sortify_non_empty(token_start, ptr)) + count++; + } + ptr = start; + int skip = count - len; + if (skip > 0) { + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + assert(0); + const token_info *ti = lookup_token(token_start, ptr); + if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) { + ptr = token_start; + break; + } + } + } + first_part(len, ptr, end, result); +} + +void truncate_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + const char *start = temp.contents(); + const char *end = start + temp.length(); + if (n > 0) + first_part(n, start, end, result); + else if (n < 0) + last_part(-n, start, end, result); + } +} + +void alternative_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (result.length() == start_length && expr2) + expr2->evaluate(tentative, ref, result, pos); +} + +void list_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (expr2) + expr2->evaluate(tentative, ref, result, pos); +} + +void substitute_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (result.length() > start_length && result[result.length() - 1] == '-') { + // ought to see if pos covers the - + result.set_length(result.length() - 1); + if (expr2) + expr2->evaluate(tentative, ref, result, pos); + } +} + +void conditional_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + string temp; + substring_position temp_pos; + if (expr1) + expr1->evaluate(tentative, ref, temp, temp_pos); + if (temp.length() > 0) { + if (expr2) + expr2->evaluate(tentative, ref, result, pos); + } + else { + if (expr3) + expr3->evaluate(tentative, ref, result, pos); + } +} + +void reference::pre_compute_label() +{ + if (parsed_label != 0 + && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) { + label.clear(); + substring_position temp_pos; + parsed_label->evaluate(1, *this, label, temp_pos); + label_ptr = lookup_label(label); + } +} + +void reference::compute_label() +{ + label.clear(); + if (parsed_label) + parsed_label->evaluate(0, *this, label, separator_pos); + if (short_label_flag && parsed_short_label) + parsed_short_label->evaluate(0, *this, short_label, short_separator_pos); + if (date_as_label) { + string new_date; + if (parsed_date_label) { + substring_position temp_pos; + parsed_date_label->evaluate(0, *this, new_date, temp_pos); + } + set_date(new_date); + } + if (label_ptr) + label_ptr->count += 1; +} + +void reference::immediate_compute_label() +{ + if (label_ptr) + label_ptr->total = 2; // force use of disambiguator + compute_label(); +} + +int reference::merge_labels(reference **v, int n, label_type type, + string &result) +{ + if (abbreviate_label_ranges) + return merge_labels_by_number(v, n, type, result); + else + return merge_labels_by_parts(v, n, type, result); +} + +int reference::merge_labels_by_number(reference **v, int n, label_type type, + string &result) +{ + if (n <= 1) + return 0; + int num = get_number(); + // Only merge three or more labels. + if (v[0]->get_number() != num + 1 + || v[1]->get_number() != num + 2) + return 0; + int i; + for (i = 2; i < n; i++) + if (v[i]->get_number() != num + i + 1) + break; + result = get_label(type); + result += label_range_indicator; + result += v[i - 1]->get_label(type); + return i; +} + +const substring_position &reference::get_separator_pos(label_type type) const +{ + if (type == SHORT_LABEL && short_label_flag) + return short_separator_pos; + else + return separator_pos; +} + +const string &reference::get_label(label_type type) const +{ + if (type == SHORT_LABEL && short_label_flag) + return short_label; + else + return label; +} + +int reference::merge_labels_by_parts(reference **v, int n, label_type type, + string &result) +{ + if (n <= 0) + return 0; + const string &lb = get_label(type); + const substring_position &sp = get_separator_pos(type); + if (sp.start < 0 + || sp.start != v[0]->get_separator_pos(type).start + || memcmp(lb.contents(), v[0]->get_label(type).contents(), + sp.start) != 0) + return 0; + result = lb; + int i = 0; + do { + result += separate_label_second_parts; + const substring_position &s = v[i]->get_separator_pos(type); + int sep_end_pos = s.start + s.length; + result.append(v[i]->get_label(type).contents() + sep_end_pos, + v[i]->get_label(type).length() - sep_end_pos); + } while (++i < n + && sp.start == v[i]->get_separator_pos(type).start + && memcmp(lb.contents(), v[i]->get_label(type).contents(), + sp.start) == 0); + return i; +} + +string label_pool; + +label_info::label_info(const string &s) +: start(label_pool.length()), length(s.length()), count(0), total(1) +{ + label_pool += s; +} + +static label_info **label_table = 0; +static int label_table_size = 0; +static int label_table_used = 0; + +label_info *lookup_label(const string &label) +{ + if (label_table == 0) { + label_table = new label_info *[17]; + label_table_size = 17; + for (int i = 0; i < 17; i++) + label_table[i] = 0; + } + unsigned h = hash_string(label.contents(), label.length()) % label_table_size; + label_info **ptr; + for (ptr = label_table + h; + *ptr != 0; + (ptr == label_table) + ? (ptr = label_table + label_table_size - 1) + : ptr--) + if ((*ptr)->length == label.length() + && memcmp(label_pool.contents() + (*ptr)->start, label.contents(), + label.length()) == 0) { + (*ptr)->total += 1; + return *ptr; + } + label_info *result = *ptr = new label_info(label); + if (++label_table_used * 2 > label_table_size) { + // Rehash the table. + label_info **old_table = label_table; + int old_size = label_table_size; + label_table_size = next_size(label_table_size); + label_table = new label_info *[label_table_size]; + int i; + for (i = 0; i < label_table_size; i++) + label_table[i] = 0; + for (i = 0; i < old_size; i++) + if (old_table[i]) { + h = hash_string(label_pool.contents() + old_table[i]->start, + old_table[i]->length); + label_info **p; + for (p = label_table + (h % label_table_size); + *p != 0; + (p == label_table) + ? (p = label_table + label_table_size - 1) + : --p) + ; + *p = old_table[i]; + } + delete[] old_table; + } + return result; +} + +void clear_labels() +{ + for (int i = 0; i < label_table_size; i++) { + delete label_table[i]; + label_table[i] = 0; + } + label_table_used = 0; + label_pool.clear(); +} + +static void consider_authors(reference **start, reference **end, int i); + +void compute_labels(reference **v, int n) +{ + if (parsed_label + && (parsed_label->analyze() & expression::CONTAINS_AT) + && sort_fields.length() >= 2 + && sort_fields[0] == 'A' + && sort_fields[1] == '+') + consider_authors(v, v + n, 0); + for (int i = 0; i < n; i++) + v[i]->compute_label(); +} + + +/* A reference with a list of authors _needs_ author i +where 0 <= i <= N if there exists a reference with a list of authors + such that != and M >= i +and Aj = Bj for 0 <= j < i. In this case if we can't say "A0, +A1,...,A(i-1) et al" because this would match both and +. If a reference needs author i we only have to call +need_author(j) for some j >= i such that the reference also needs +author j. */ + +/* This function handles 2 tasks: +determine which authors are needed (cannot be elided with et al.); +determine which authors can have only last names in the labels. + +References >= start and < end have the same first i author names. +Also they're sorted by A+. */ + +static void consider_authors(reference **start, reference **end, int i) +{ + if (start >= end) + return; + reference **p = start; + if (i >= (*p)->get_nauthors()) { + for (++p; p < end && i >= (*p)->get_nauthors(); p++) + ; + if (p < end && i > 0) { + // If we have an author list and an author list , + // then both lists need C. + for (reference **q = start; q < end; q++) + (*q)->need_author(i - 1); + } + start = p; + } + while (p < end) { + reference **last_name_start = p; + reference **name_start = p; + for (++p; + p < end && i < (*p)->get_nauthors() + && same_author_last_name(**last_name_start, **p, i); + p++) { + if (!same_author_name(**name_start, **p, i)) { + consider_authors(name_start, p, i + 1); + name_start = p; + } + } + consider_authors(name_start, p, i + 1); + if (last_name_start == name_start) { + for (reference **q = last_name_start; q < p; q++) + (*q)->set_last_name_unambiguous(i); + } + // If we have an author list and , then the lists + // need author D and E respectively. + if (name_start > start || p < end) { + for (reference **q = last_name_start; q < p; q++) + (*q)->need_author(i); + } + } +} + +int same_author_last_name(const reference &r1, const reference &r2, int n) +{ + const char *ae1; + const char *as1 = r1.get_sort_field(0, n, 0, &ae1); + const char *ae2; + const char *as2 = r2.get_sort_field(0, n, 0, &ae2); + if (!as1 && !as2) return 1; // they are the same + if (!as1 || !as2) return 0; + return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; +} + +int same_author_name(const reference &r1, const reference &r2, int n) +{ + const char *ae1; + const char *as1 = r1.get_sort_field(0, n, -1, &ae1); + const char *ae2; + const char *as2 = r2.get_sort_field(0, n, -1, &ae2); + if (!as1 && !as2) return 1; // they are the same + if (!as1 || !as2) return 0; + return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; +} + + +void int_set::set(int i) +{ + assert(i >= 0); + int bytei = i >> 3; + if (bytei >= v.length()) { + int old_length = v.length(); + v.set_length(bytei + 1); + for (int j = old_length; j <= bytei; j++) + v[j] = 0; + } + v[bytei] |= 1 << (i & 7); +} + +int int_set::get(int i) const +{ + assert(i >= 0); + int bytei = i >> 3; + return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0; +} + +void reference::set_last_name_unambiguous(int i) +{ + last_name_unambiguous.set(i); +} + +void reference::need_author(int n) +{ + if (n > last_needed_author) + last_needed_author = n; +} + +const char *reference::get_authors(const char **end) const +{ + if (!computed_authors) { + ((reference *)this)->computed_authors = 1; + string &result = ((reference *)this)->authors; + int na = get_nauthors(); + result.clear(); + for (int i = 0; i < na; i++) { + if (last_name_unambiguous.get(i)) { + const char *e, *start = get_author_last_name(i, &e); + assert(start != 0); + result.append(start, e - start); + } + else { + const char *e, *start = get_author(i, &e); + assert(start != 0); + result.append(start, e - start); + } + if (i == last_needed_author + && et_al.length() > 0 + && et_al_min_elide > 0 + && last_needed_author + et_al_min_elide < na + && na >= et_al_min_total) { + result += et_al; + break; + } + if (i < na - 1) { + if (na == 2) + result += join_authors_exactly_two; + else if (i < na - 2) + result += join_authors_default; + else + result += join_authors_last_two; + } + } + } + const char *start = authors.contents(); + *end = start + authors.length(); + return start; +} + +int reference::get_nauthors() const +{ + if (nauthors < 0) { + const char *dummy; + int na; + for (na = 0; get_author(na, &dummy) != 0; na++) + ; + ((reference *)this)->nauthors = na; + } + return nauthors; +} + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: diff --git a/src/preproc/refer/ref.cpp b/src/preproc/refer/ref.cpp new file mode 100644 index 0000000..9e1b5e7 --- /dev/null +++ b/src/preproc/refer/ref.cpp @@ -0,0 +1,1161 @@ +// -*- C++ -*- +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. +Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" + +static const char *find_day(const char *, const char *, const char **); +static int find_month(const char *start, const char *end); +static void abbreviate_names(string &); + +#define DEFAULT_ARTICLES "the\000a\000an" + +string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES)); + +// Multiple occurrences of fields are separated by FIELD_SEPARATOR. +const char FIELD_SEPARATOR = '\0'; + +const char MULTI_FIELD_NAMES[] = "AE"; +const char *AUTHOR_FIELDS = "AQ"; + +enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM }; + +const char *reference_types[] = { + "other", + "journal-article", + "book", + "article-in-book", + "tech-report", + "bell-tm", +}; + +static string temp_fields[256]; + +reference::reference(const char *start, int len, reference_id *ridp) +: h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0), + computed_authors(0), last_needed_author(-1), nauthors(-1) +{ + int i; + for (i = 0; i < 256; i++) + field_index[i] = NULL_FIELD_INDEX; + if (ridp) + rid = *ridp; + if (start == 0) + return; + if (len <= 0) + return; + const char *end = start + len; + const char *ptr = start; + assert(*ptr == '%'); + while (ptr < end) { + if (ptr + 1 < end && ptr[1] != '\0' + && ((ptr[1] != '%' && ptr[1] == annotation_field) + || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0' + && discard_fields.search(ptr[2]) < 0))) { + if (ptr[1] == '%') + ptr++; + string &f = temp_fields[(unsigned char)ptr[1]]; + ptr += 2; + while (ptr < end && csspace(*ptr)) + ptr++; + for (;;) { + for (;;) { + if (ptr >= end) { + f += '\n'; + break; + } + f += *ptr; + if (*ptr++ == '\n') + break; + } + if (ptr >= end || *ptr == '%') + break; + } + } + else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%' + && discard_fields.search(ptr[1]) < 0) { + string &f = temp_fields[(unsigned char)ptr[1]]; + if (f.length() > 0) { + if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0) + f += FIELD_SEPARATOR; + else + f.clear(); + } + ptr += 2; + if (ptr < end) { + if (*ptr == ' ') + ptr++; + for (;;) { + const char *p = ptr; + while (ptr < end && *ptr != '\n') + ptr++; + // strip trailing white space + const char *q = ptr; + while (q > p && q[-1] != '\n' && csspace(q[-1])) + q--; + while (p < q) + f += *p++; + if (ptr >= end) + break; + ptr++; + if (ptr >= end) + break; + if (*ptr == '%') + break; + f += ' '; + } + } + } + else { + // skip this field + for (;;) { + while (ptr < end && *ptr++ != '\n') + ; + if (ptr >= end || *ptr == '%') + break; + } + } + } + for (i = 0; i < 256; i++) + if (temp_fields[i].length() > 0) + nfields++; + field = new string[nfields]; + int j = 0; + for (i = 0; i < 256; i++) + if (temp_fields[i].length() > 0) { + field[j].move(temp_fields[i]); + if (abbreviate_fields.search(i) >= 0) + abbreviate_names(field[j]); + field_index[i] = j; + j++; + } +} + +reference::~reference() +{ + if (nfields > 0) + delete[] field; +} + +// ref is the inline, this is the database ref + +void reference::merge(reference &ref) +{ + int i; + for (i = 0; i < 256; i++) + if (field_index[i] != NULL_FIELD_INDEX) + temp_fields[i].move(field[field_index[i]]); + for (i = 0; i < 256; i++) + if (ref.field_index[i] != NULL_FIELD_INDEX) + temp_fields[i].move(ref.field[ref.field_index[i]]); + for (i = 0; i < 256; i++) + field_index[i] = NULL_FIELD_INDEX; + int old_nfields = nfields; + nfields = 0; + for (i = 0; i < 256; i++) + if (temp_fields[i].length() > 0) + nfields++; + if (nfields != old_nfields) { + if (old_nfields > 0) + delete[] field; + field = new string[nfields]; + } + int j = 0; + for (i = 0; i < 256; i++) + if (temp_fields[i].length() > 0) { + field[j].move(temp_fields[i]); + field_index[i] = j; + j++; + } + merged = 1; +} + +void reference::insert_field(unsigned char c, string &s) +{ + assert(s.length() > 0); + if (field_index[c] != NULL_FIELD_INDEX) { + field[field_index[c]].move(s); + return; + } + assert(field_index[c] == NULL_FIELD_INDEX); + string *old_field = field; + field = new string[nfields + 1]; + int pos = 0; + int i; + for (i = 0; i < int(c); i++) + if (field_index[i] != NULL_FIELD_INDEX) + pos++; + for (i = 0; i < pos; i++) + field[i].move(old_field[i]); + field[pos].move(s); + for (i = pos; i < nfields; i++) + field[i + 1].move(old_field[i]); + if (nfields > 0) + delete[] old_field; + nfields++; + field_index[c] = pos; + for (i = c + 1; i < 256; i++) + if (field_index[i] != NULL_FIELD_INDEX) + field_index[i] += 1; +} + +void reference::delete_field(unsigned char c) +{ + if (field_index[c] == NULL_FIELD_INDEX) + return; + string *old_field = field; + field = new string[nfields - 1]; + int i; + for (i = 0; i < int(field_index[c]); i++) + field[i].move(old_field[i]); + for (i = field_index[c]; i < nfields - 1; i++) + field[i].move(old_field[i + 1]); + if (nfields > 0) + delete[] old_field; + nfields--; + field_index[c] = NULL_FIELD_INDEX; + for (i = c + 1; i < 256; i++) + if (field_index[i] != NULL_FIELD_INDEX) + field_index[i] -= 1; +} + +void reference::compute_hash_code() +{ + if (!rid.is_null()) + h = rid.hash(); + else { + h = 0; + for (int i = 0; i < nfields; i++) + if (field[i].length() > 0) { + h <<= 4; + h ^= hash_string(field[i].contents(), field[i].length()); + } + } +} + +void reference::set_number(int n) +{ + no = n; +} + +const char SORT_SEP = '\001'; +const char SORT_SUB_SEP = '\002'; +const char SORT_SUB_SUB_SEP = '\003'; + +// sep specifies additional word separators + +void sortify_words(const char *s, const char *end, const char *sep, + string &result) +{ + int non_empty = 0; + int need_separator = 0; + for (;;) { + const char *token_start = s; + if (!get_token(&s, end)) + break; + if ((s - token_start == 1 + && (*token_start == ' ' + || *token_start == '\n' + || (sep && *token_start != '\0' + && strchr(sep, *token_start) != 0))) + || (s - token_start == 2 + && token_start[0] == '\\' && token_start[1] == ' ')) { + if (non_empty) + need_separator = 1; + } + else { + const token_info *ti = lookup_token(token_start, s); + if (ti->sortify_non_empty(token_start, s)) { + if (need_separator) { + result += ' '; + need_separator = 0; + } + ti->sortify(token_start, s, result); + non_empty = 1; + } + } + } +} + +void sortify_word(const char *s, const char *end, string &result) +{ + for (;;) { + const char *token_start = s; + if (!get_token(&s, end)) + break; + const token_info *ti = lookup_token(token_start, s); + ti->sortify(token_start, s, result); + } +} + +void sortify_other(const char *s, int len, string &key) +{ + sortify_words(s, s + len, 0, key); +} + +void sortify_title(const char *s, int len, string &key) +{ + const char *end = s + len; + for (; s < end && (*s == ' ' || *s == '\n'); s++) + ; + const char *ptr = s; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + if (ptr - token_start == 1 + && (*token_start == ' ' || *token_start == '\n')) + break; + } + if (ptr < end) { + unsigned int first_word_len = ptr - s - 1; + const char *ae = articles.contents() + articles.length(); + for (const char *a = articles.contents(); + a < ae; + a = strchr(a, '\0') + 1) + if (first_word_len == strlen(a)) { + unsigned int j; + for (j = 0; j < first_word_len; j++) + if (a[j] != cmlower(s[j])) + break; + if (j >= first_word_len) { + s = ptr; + for (; s < end && (*s == ' ' || *s == '\n'); s++) + ; + break; + } + } + } + sortify_words(s, end, 0, key); +} + +void sortify_name(const char *s, int len, string &key) +{ + const char *last_name_end; + const char *last_name = find_last_name(s, s + len, &last_name_end); + sortify_word(last_name, last_name_end, key); + key += SORT_SUB_SUB_SEP; + if (last_name > s) + sortify_words(s, last_name, ".", key); + key += SORT_SUB_SUB_SEP; + if (last_name_end < s + len) + sortify_words(last_name_end, s + len, ".,", key); +} + +void sortify_date(const char *s, int len, string &key) +{ + const char *year_end; + const char *year_start = find_year(s, s + len, &year_end); + if (!year_start) { + // Things without years are often 'forthcoming', so it makes sense + // that they sort after things with explicit years. + key += 'A'; + sortify_words(s, s + len, 0, key); + return; + } + int n = year_end - year_start; + while (n < 4) { + key += '0'; + n++; + } + while (year_start < year_end) + key += *year_start++; + int m = find_month(s, s + len); + if (m < 0) + return; + key += 'A' + m; + const char *day_end; + const char *day_start = find_day(s, s + len, &day_end); + if (!day_start) + return; + if (day_end - day_start == 1) + key += '0'; + while (day_start < day_end) + key += *day_start++; +} + +// SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification. + +void sortify_label(const char *s, int len, string &key) +{ + const char *end = s + len; + for (;;) { + const char *ptr; + for (ptr = s; + ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP; + ptr++) + ; + if (ptr > s) + sortify_words(s, ptr, 0, key); + s = ptr; + if (s >= end) + break; + key += *s++; + } +} + +void reference::compute_sort_key() +{ + if (sort_fields.length() == 0) + return; + sort_fields += '\0'; + const char *sf = sort_fields.contents(); + int first_time = 1; + while (*sf != '\0') { + if (!first_time) + sort_key += SORT_SEP; + first_time = 0; + char f = *sf++; + int n = 1; + if (*sf == '+') { + n = INT_MAX; + sf++; + } + else if (csdigit(*sf)) { + char *ptr; + long l = strtol(sf, &ptr, 10); + if (l == 0 && ptr == sf) + ; + else { + sf = ptr; + if (l < 0) { + n = 1; + } + else { + n = int(l); + } + } + } + if (f == '.') + sortify_label(label.contents(), label.length(), sort_key); + else if (f == AUTHOR_FIELDS[0]) + sortify_authors(n, sort_key); + else + sortify_field(f, n, sort_key); + } + sort_fields.set_length(sort_fields.length() - 1); +} + +void reference::sortify_authors(int n, string &result) const +{ + for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++) + if (contains_field(*p)) { + sortify_field(*p, n, result); + return; + } + sortify_field(AUTHOR_FIELDS[0], n, result); +} + +void reference::canonicalize_authors(string &result) const +{ + int len = result.length(); + sortify_authors(INT_MAX, result); + if (result.length() > len) + result += SORT_SUB_SEP; +} + +void reference::sortify_field(unsigned char f, int n, string &result) const +{ + typedef void (*sortify_t)(const char *, int, string &); + sortify_t sortifier = sortify_other; + switch (f) { + case 'A': + case 'E': + sortifier = sortify_name; + break; + case 'D': + sortifier = sortify_date; + break; + case 'B': + case 'J': + case 'T': + sortifier = sortify_title; + break; + } + int fi = field_index[(unsigned char)f]; + if (fi != NULL_FIELD_INDEX) { + string &str = field[fi]; + const char *start = str.contents(); + const char *end = start + str.length(); + for (int i = 0; i < n && start < end; i++) { + const char *p = start; + while (start < end && *start != FIELD_SEPARATOR) + start++; + if (i > 0) + result += SORT_SUB_SEP; + (*sortifier)(p, start - p, result); + if (start < end) + start++; + } + } +} + +int compare_reference(const reference &r1, const reference &r2) +{ + assert(r1.no >= 0); + assert(r2.no >= 0); + const char *s1 = r1.sort_key.contents(); + int n1 = r1.sort_key.length(); + const char *s2 = r2.sort_key.contents(); + int n2 = r2.sort_key.length(); + for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2) + if (*s1 != *s2) + return (int)(unsigned char)*s1 - (int)(unsigned char)*s2; + if (n2 > 0) + return -1; + if (n1 > 0) + return 1; + return r1.no - r2.no; +} + +int same_reference(const reference &r1, const reference &r2) +{ + if (!r1.rid.is_null() && r1.rid == r2.rid) + return 1; + if (r1.h != r2.h) + return 0; + if (r1.nfields != r2.nfields) + return 0; + int i = 0; + for (i = 0; i < 256; i++) + if (r1.field_index != r2.field_index) + return 0; + for (i = 0; i < r1.nfields; i++) + if (r1.field[i] != r2.field[i]) + return 0; + return 1; +} + +const char *find_last_name(const char *start, const char *end, + const char **endp) +{ + const char *ptr = start; + const char *last_word = start; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + if (ptr - token_start == 1) { + if (*token_start == ',') { + *endp = token_start; + return last_word; + } + else if (*token_start == ' ' || *token_start == '\n') { + if (ptr < end && *ptr != ' ' && *ptr != '\n') + last_word = ptr; + } + } + } + *endp = end; + return last_word; +} + +void abbreviate_name(const char *ptr, const char *end, string &result) +{ + const char *last_name_end; + const char *last_name_start = find_last_name(ptr, end, &last_name_end); + int need_period = 0; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, last_name_start)) + break; + const token_info *ti = lookup_token(token_start, ptr); + if (need_period) { + if ((ptr - token_start == 1 && *token_start == ' ') + || (ptr - token_start == 2 && token_start[0] == '\\' + && token_start[1] == ' ')) + continue; + if (ti->is_upper()) + result += period_before_initial; + else + result += period_before_other; + need_period = 0; + } + result.append(token_start, ptr - token_start); + if (ti->is_upper()) { + const char *lower_ptr = ptr; + int first_token = 1; + for (;;) { + token_start = ptr; + if (!get_token(&ptr, last_name_start)) + break; + if ((ptr - token_start == 1 && *token_start == ' ') + || (ptr - token_start == 2 && token_start[0] == '\\' + && token_start[1] == ' ')) + break; + ti = lookup_token(token_start, ptr); + if (ti->is_hyphen()) { + const char *ptr1 = ptr; + if (get_token(&ptr1, last_name_start)) { + ti = lookup_token(ptr, ptr1); + if (ti->is_upper()) { + result += period_before_hyphen; + result.append(token_start, ptr1 - token_start); + ptr = ptr1; + } + } + } + else if (ti->is_upper()) { + // MacDougal -> MacD. + result.append(lower_ptr, ptr - lower_ptr); + lower_ptr = ptr; + first_token = 1; + } + else if (first_token && ti->is_accent()) { + result.append(token_start, ptr - token_start); + lower_ptr = ptr; + } + first_token = 0; + } + need_period = 1; + } + } + if (need_period) + result += period_before_last_name; + result.append(last_name_start, end - last_name_start); +} + +static void abbreviate_names(string &result) +{ + string str; + str.move(result); + const char *ptr = str.contents(); + const char *end = ptr + str.length(); + while (ptr < end) { + const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr); + if (name_end == 0) + name_end = end; + abbreviate_name(ptr, name_end, result); + if (name_end >= end) + break; + ptr = name_end + 1; + result += FIELD_SEPARATOR; + } +} + +void reverse_name(const char *ptr, const char *name_end, string &result) +{ + const char *last_name_end; + const char *last_name_start = find_last_name(ptr, name_end, &last_name_end); + result.append(last_name_start, last_name_end - last_name_start); + while (last_name_start > ptr + && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n')) + last_name_start--; + if (last_name_start > ptr) { + result += ", "; + result.append(ptr, last_name_start - ptr); + } + if (last_name_end < name_end) + result.append(last_name_end, name_end - last_name_end); +} + +void reverse_names(string &result, int n) +{ + if (n <= 0) + return; + string str; + str.move(result); + const char *ptr = str.contents(); + const char *end = ptr + str.length(); + while (ptr < end) { + if (--n < 0) { + result.append(ptr, end - ptr); + break; + } + const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr); + if (name_end == 0) + name_end = end; + reverse_name(ptr, name_end, result); + if (name_end >= end) + break; + ptr = name_end + 1; + result += FIELD_SEPARATOR; + } +} + +// Return number of field separators. + +int join_fields(string &f) +{ + const char *ptr = f.contents(); + int len = f.length(); + int nfield_seps = 0; + int j; + for (j = 0; j < len; j++) + if (ptr[j] == FIELD_SEPARATOR) + nfield_seps++; + if (nfield_seps == 0) + return 0; + string temp; + int field_seps_left = nfield_seps; + for (j = 0; j < len; j++) { + if (ptr[j] == FIELD_SEPARATOR) { + if (nfield_seps == 1) + temp += join_authors_exactly_two; + else if (--field_seps_left == 0) + temp += join_authors_last_two; + else + temp += join_authors_default; + } + else + temp += ptr[j]; + } + f = temp; + return nfield_seps; +} + +void uppercase(const char *start, const char *end, string &result) +{ + for (;;) { + const char *token_start = start; + if (!get_token(&start, end)) + break; + const token_info *ti = lookup_token(token_start, start); + ti->upper_case(token_start, start, result); + } +} + +void lowercase(const char *start, const char *end, string &result) +{ + for (;;) { + const char *token_start = start; + if (!get_token(&start, end)) + break; + const token_info *ti = lookup_token(token_start, start); + ti->lower_case(token_start, start, result); + } +} + +void capitalize(const char *ptr, const char *end, string &result) +{ + int in_small_point_size = 0; + for (;;) { + const char *start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(start, ptr); + const char *char_end = ptr; + int is_lower = ti->is_lower(); + if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) { + const token_info *ti2 = lookup_token(char_end, ptr); + if (!ti2->is_accent()) + ptr = char_end; + } + if (is_lower) { + if (!in_small_point_size) { + result += "\\s-2"; + in_small_point_size = 1; + } + ti->upper_case(start, char_end, result); + result.append(char_end, ptr - char_end); + } + else { + if (in_small_point_size) { + result += "\\s+2"; + in_small_point_size = 0; + } + result.append(start, ptr - start); + } + } + if (in_small_point_size) + result += "\\s+2"; +} + +void capitalize_field(string &str) +{ + string temp; + capitalize(str.contents(), str.contents() + str.length(), temp); + str.move(temp); +} + +int is_terminated(const char *ptr, const char *end) +{ + const char *last_token = end; + for (;;) { + const char *p = ptr; + if (!get_token(&ptr, end)) + break; + last_token = p; + } + return end - last_token == 1 + && (*last_token == '.' || *last_token == '!' || *last_token == '?'); +} + +void reference::output(FILE *fp) +{ + fputs(".]-\n", fp); + for (int i = 0; i < 256; i++) + if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) { + string &f = field[field_index[i]]; + if (!csdigit(i)) { + int j = reverse_fields.search(i); + if (j >= 0) { + int n; + int len = reverse_fields.length(); + if (++j < len && csdigit(reverse_fields[j])) { + n = reverse_fields[j] - '0'; + for (++j; j < len && csdigit(reverse_fields[j]); j++) + // should check for overflow + n = n*10 + reverse_fields[j] - '0'; + } + else + n = INT_MAX; + reverse_names(f, n); + } + } + int is_multiple = join_fields(f) > 0; + if (capitalize_fields.search(i) >= 0) + capitalize_field(f); + if (memchr(f.contents(), '\n', f.length()) == 0) { + fprintf(fp, ".ds [%c ", i); + if (f[0] == ' ' || f[0] == '\\' || f[0] == '"') + putc('"', fp); + put_string(f, fp); + putc('\n', fp); + } + else { + fprintf(fp, ".de [%c\n", i); + put_string(f, fp); + fputs("..\n", fp); + } + if (i == 'P') { + int multiple_pages = 0; + const char *s = f.contents(); + const char *end = f.contents() + f.length(); + for (;;) { + const char *token_start = s; + if (!get_token(&s, end)) + break; + const token_info *ti = lookup_token(token_start, s); + if (ti->is_hyphen() || ti->is_range_sep()) { + multiple_pages = 1; + break; + } + } + fprintf(fp, ".nr [P %d\n", multiple_pages); + } + else if (i == 'E') + fprintf(fp, ".nr [E %d\n", is_multiple); + } + for (const char *p = "TAO"; *p; p++) { + int fi = field_index[(unsigned char)*p]; + if (fi != NULL_FIELD_INDEX) { + string &f = field[fi]; + fprintf(fp, ".nr [%c %d\n", *p, + is_terminated(f.contents(), f.contents() + f.length())); + } + } + int t = classify(); + fprintf(fp, ".][ %d %s\n", t, reference_types[t]); + if (annotation_macro.length() > 0 && annotation_field >= 0 + && field_index[annotation_field] != NULL_FIELD_INDEX) { + putc('.', fp); + put_string(annotation_macro, fp); + putc('\n', fp); + put_string(field[field_index[annotation_field]], fp); + } +} + +void reference::print_sort_key_comment(FILE *fp) +{ + fputs(".\\\"", fp); + put_string(sort_key, fp); + putc('\n', fp); +} + +const char *find_year(const char *start, const char *end, const char **endp) +{ + for (;;) { + while (start < end && !csdigit(*start)) + start++; + const char *ptr = start; + if (start == end) + break; + while (ptr < end && csdigit(*ptr)) + ptr++; + if (ptr - start == 4 || ptr - start == 3 + || (ptr - start == 2 + && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) { + *endp = ptr; + return start; + } + start = ptr; + } + return 0; +} + +static const char *find_day(const char *start, const char *end, + const char **endp) +{ + for (;;) { + while (start < end && !csdigit(*start)) + start++; + const char *ptr = start; + if (start == end) + break; + while (ptr < end && csdigit(*ptr)) + ptr++; + if ((ptr - start == 1 && start[0] != '0') + || (ptr - start == 2 && + (start[0] == '1' + || start[0] == '2' + || (start[0] == '3' && start[1] <= '1') + || (start[0] == '0' && start[1] != '0')))) { + *endp = ptr; + return start; + } + start = ptr; + } + return 0; +} + +static int find_month(const char *start, const char *end) +{ + static const char *months[] = { + "january", + "february", + "march", + "april", + "may", + "june", + "july", + "august", + "september", + "october", + "november", + "december", + }; + for (;;) { + while (start < end && !csalpha(*start)) + start++; + const char *ptr = start; + if (start == end) + break; + while (ptr < end && csalpha(*ptr)) + ptr++; + if (ptr - start >= 3) { + for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) { + const char *q = months[i]; + const char *p = start; + for (; p < ptr; p++, q++) + if (cmlower(*p) != *q) + break; + if (p >= ptr) + return i; + } + } + start = ptr; + } + return -1; +} + +int reference::contains_field(char c) const +{ + return field_index[(unsigned char)c] != NULL_FIELD_INDEX; +} + +int reference::classify() +{ + if (contains_field('J')) + return JOURNAL_ARTICLE; + if (contains_field('B')) + return ARTICLE_IN_BOOK; + if (contains_field('G')) + return TECH_REPORT; + if (contains_field('R')) + return TECH_REPORT; + if (contains_field('I')) + return BOOK; + if (contains_field('M')) + return BELL_TM; + return OTHER; +} + +const char *reference::get_year(const char **endp) const +{ + if (field_index['D'] != NULL_FIELD_INDEX) { + string &date = field[field_index['D']]; + const char *start = date.contents(); + const char *end = start + date.length(); + return find_year(start, end, endp); + } + else + return 0; +} + +const char *reference::get_field(unsigned char c, const char **endp) const +{ + if (field_index[c] != NULL_FIELD_INDEX) { + string &f = field[field_index[c]]; + const char *start = f.contents(); + *endp = start + f.length(); + return start; + } + else + return 0; +} + +const char *reference::get_date(const char **endp) const +{ + return get_field('D', endp); +} + +const char *nth_field(int i, const char *start, const char **endp) +{ + while (--i >= 0) { + start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start); + if (!start) + return 0; + start++; + } + const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start); + if (e) + *endp = e; + return start; +} + +const char *reference::get_author(int i, const char **endp) const +{ + for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) { + const char *start = get_field(*f, endp); + if (start) { + if (strchr(MULTI_FIELD_NAMES, *f) != 0) + return nth_field(i, start, endp); + else if (i == 0) + return start; + else + return 0; + } + } + return 0; +} + +const char *reference::get_author_last_name(int i, const char **endp) const +{ + for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) { + const char *start = get_field(*f, endp); + if (start) { + if (strchr(MULTI_FIELD_NAMES, *f) != 0) { + start = nth_field(i, start, endp); + if (!start) + return 0; + } + if (*f == 'A') + return find_last_name(start, *endp, endp); + else + return start; + } + } + return 0; +} + +void reference::set_date(string &d) +{ + if (d.length() == 0) + delete_field('D'); + else + insert_field('D', d); +} + +int same_year(const reference &r1, const reference &r2) +{ + const char *ye1; + const char *ys1 = r1.get_year(&ye1); + const char *ye2; + const char *ys2 = r2.get_year(&ye2); + if (ys1 == 0) { + if (ys2 == 0) + return same_date(r1, r2); + else + return 0; + } + else if (ys2 == 0) + return 0; + else if (ye1 - ys1 != ye2 - ys2) + return 0; + else + return memcmp(ys1, ys2, ye1 - ys1) == 0; +} + +int same_date(const reference &r1, const reference &r2) +{ + const char *e1; + const char *s1 = r1.get_date(&e1); + const char *e2; + const char *s2 = r2.get_date(&e2); + if (s1 == 0) + return s2 == 0; + else if (s2 == 0) + return 0; + else if (e1 - s1 != e2 - s2) + return 0; + else + return memcmp(s1, s2, e1 - s1) == 0; +} + +const char *reference::get_sort_field(int i, int si, int ssi, + const char **endp) const +{ + const char *start = sort_key.contents(); + const char *end = start + sort_key.length(); + if (i < 0) { + *endp = end; + return start; + } + while (--i >= 0) { + start = (char *)memchr(start, SORT_SEP, end - start); + if (!start) + return 0; + start++; + } + const char *e = (char *)memchr(start, SORT_SEP, end - start); + if (e) + end = e; + if (si < 0) { + *endp = end; + return start; + } + while (--si >= 0) { + start = (char *)memchr(start, SORT_SUB_SEP, end - start); + if (!start) + return 0; + start++; + } + e = (char *)memchr(start, SORT_SUB_SEP, end - start); + if (e) + end = e; + if (ssi < 0) { + *endp = end; + return start; + } + while (--ssi >= 0) { + start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start); + if (!start) + return 0; + start++; + } + e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start); + if (e) + end = e; + *endp = end; + return start; +} + diff --git a/src/preproc/refer/ref.h b/src/preproc/refer/ref.h new file mode 100644 index 0000000..1205a28 --- /dev/null +++ b/src/preproc/refer/ref.h @@ -0,0 +1,127 @@ +// -*- C++ -*- +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +// declarations to avoid friend name injection problems +int compare_reference(const reference &, const reference &); +int same_reference(const reference &, const reference &); +int same_year(const reference &, const reference &); +int same_date(const reference &, const reference &); +int same_author_last_name(const reference &, const reference &, int); +int same_author_name(const reference &, const reference &, int); + +struct label_info; + +enum label_type { NORMAL_LABEL, SHORT_LABEL }; +const int N_LABEL_TYPES = 2; + +struct substring_position { + int start; + int length; + substring_position() : start(-1) { } +}; + +class int_set { + string v; +public: + int_set() { } + void set(int i); + int get(int i) const; +}; + +class reference { +private: + unsigned h; + reference_id rid; + int merged; + string sort_key; + int no; + string *field; + int nfields; + unsigned char field_index[256]; + enum { NULL_FIELD_INDEX = 255 }; + string label; + substring_position separator_pos; + string short_label; + substring_position short_separator_pos; + label_info *label_ptr; + string authors; + int computed_authors; + int last_needed_author; + int nauthors; + int_set last_name_unambiguous; + + int contains_field(char) const; + void insert_field(unsigned char, string &s); + void delete_field(unsigned char); + void set_date(string &); + const char *get_sort_field(int i, int si, int ssi, const char **endp) const; + int merge_labels_by_parts(reference **, int, label_type, string &); + int merge_labels_by_number(reference **, int, label_type, string &); +public: + reference(const char * = 0, int = -1, reference_id * = 0); + ~reference(); + void output(FILE *); + void print_sort_key_comment(FILE *); + void set_number(int); + int get_number() const { return no; } + unsigned hash() const { return h; } + const string &get_label(label_type type) const; + const substring_position &get_separator_pos(label_type) const; + int is_merged() const { return merged; } + void compute_sort_key(); + void compute_hash_code(); + void pre_compute_label(); + void compute_label(); + void immediate_compute_label(); + int classify(); + void merge(reference &); + int merge_labels(reference **, int, label_type, string &); + int get_nauthors() const; + void need_author(int); + void set_last_name_unambiguous(int); + void sortify_authors(int, string &) const; + void canonicalize_authors(string &) const; + void sortify_field(unsigned char, int, string &) const; + const char *get_author(int, const char **) const; + const char *get_author_last_name(int, const char **) const; + const char *get_date(const char **) const; + const char *get_year(const char **) const; + const char *get_field(unsigned char, const char **) const; + const label_info *get_label_ptr() const { return label_ptr; } + const char *get_authors(const char **) const; + // for sorting + friend int compare_reference(const reference &r1, const reference &r2); + // for merging + friend int same_reference(const reference &, const reference &); + friend int same_year(const reference &, const reference &); + friend int same_date(const reference &, const reference &); + friend int same_author_last_name(const reference &, const reference &, int); + friend int same_author_name(const reference &, const reference &, int); +}; + +const char *find_year(const char *, const char *, const char **); +const char *find_last_name(const char *, const char *, const char **); + +const char *nth_field(int i, const char *start, const char **endp); + +void capitalize(const char *ptr, const char *end, string &result); +void reverse_name(const char *ptr, const char *end, string &result); +void uppercase(const char *ptr, const char *end, string &result); +void lowercase(const char *ptr, const char *end, string &result); +void abbreviate_name(const char *ptr, const char *end, string &result); diff --git a/src/preproc/refer/refer.1.man b/src/preproc/refer/refer.1.man new file mode 100644 index 0000000..210afe7 --- /dev/null +++ b/src/preproc/refer/refer.1.man @@ -0,0 +1,2020 @@ +.TH @g@refer @MAN1EXT@ "@MDATE@" "groff @VERSION@" +.SH Name +@g@refer \- process bibliographic references for +.I groff +. +. +.\" ==================================================================== +.\" Legal Terms +.\" ==================================================================== +.\" +.\" Copyright (C) 1989-2021 Free Software Foundation, Inc. +.\" +.\" Permission is granted to make and distribute verbatim copies of this +.\" manual provided the copyright notice and this permission notice are +.\" preserved on all copies. +.\" +.\" Permission is granted to copy and distribute modified versions of +.\" this manual under the conditions for verbatim copying, provided that +.\" the entire resulting derived work is distributed under the terms of +.\" a permission notice identical to this one. +.\" +.\" Permission is granted to copy and distribute translations of this +.\" manual into another language, under the above conditions for +.\" modified versions, except that this permission notice may be +.\" included in translations approved by the Free Software Foundation +.\" instead of in the original English. +. +. +.\" Save and disable compatibility mode (for, e.g., Solaris 10/11). +.do nr *groff_refer_1_man_C \n[.cp] +.cp 0 +. +.\" Define fallback for groff 1.23's MR macro if the system lacks it. +.nr do-fallback 0 +.if !\n(.f .nr do-fallback 1 \" mandoc +.if \n(.g .if !d MR .nr do-fallback 1 \" older groff +.if !\n(.g .nr do-fallback 1 \" non-groff *roff +.if \n[do-fallback] \{\ +. de MR +. ie \\n(.$=1 \ +. I \%\\$1 +. el \ +. IR \%\\$1 (\\$2)\\$3 +. . +.\} +.rr do-fallback +. +. +.\" ==================================================================== +.SH Synopsis +.\" ==================================================================== +. +.SY @g@refer +.RB [ \-bCenPRS ] +.RB [ \-a\~\c +.IR n ] +.RB [ \-B +.IB field . macro\c +] +.RB [ \-c\~\c +.IR fields ] +.RB [ \-f\~\c +.IR n ] +.RB [ \-i\~\c +.IR fields ] +.RB [ \-k\~\c +.IR field ] +.RB [ \-l\~\c +.IR range-expression ] +.RB [ \-p\~\c +.IR database-file ] +.RB [ \-s\~\c +.IR fields ] +.RB [ \-t\~\c +.IR n ] +.RI [ file\~ .\|.\|.] +.YS +. +. +.SY @g@refer +.B \-\-help +.YS +. +. +.SY @g@refer +.B \-v +. +.SY @g@refer +.B \-\-version +.YS +. +. +.\" ==================================================================== +.SH Description +.\" ==================================================================== +. +The GNU implementation of +.I \%refer \" generic +is part of the +.MR groff @MAN1EXT@ +document formatting system. +. +.I @g@refer +is a +.MR @g@troff @MAN1EXT@ +preprocessor that prepares bibilographic citations by looking up +keywords specified in a +.MR roff @MAN7EXT@ +input document, +obviating the need to type such annotations, +and permitting the citation style in formatted output to be altered +independently and systematically. +. +It copies the contents of each +.I file +to the standard output stream, +except that it interprets lines between +.B .[ +and +.B .]\& +as citations to be translated into +.I groff +input, +and lines between +.B .R1 +and +.B .R2 +as instructions regarding how citations are to be processed. +. +Normally, +.I @g@refer +is not executed directly by the user, +but invoked by specifying the +.B \-R +option to +.MR groff @MAN1EXT@ . +. +If no +.I file +operands are given on the command line, +or if +.I file +is +.RB \[lq] \- \[rq], +the standard input stream is read. +. +. +.LP +Each citation specifies a reference. +. +The citation can specify a reference that is contained in a +bibliographic database by giving a set of keywords that only that +reference contains. +. +Alternatively it can specify a reference by supplying a database record +in the citation. +. +A combination of these alternatives is also possible. +. +. +.LP +For each citation, +.I @g@refer +can produce a mark in the text. +. +This mark consists of some label which can be separated from the text +and from other labels in various ways. +. +For each reference it also outputs +.MR groff @MAN7EXT@ +language commands that can be used by a macro package to produce a +formatted reference for each citation. +. +The output of +.I @g@refer +must therefore be processed using a suitable macro package, +such as +.\" .IR man , +.IR me , +.IR mm , +.IR mom , +or +.IR ms . +. +The commands to format a citation's reference can be output immediately +after the citation, +or the references may be accumulated, +and the commands output at some later point. +. +If the references are accumulated, +then multiple citations of the same reference will produce a single +formatted reference. +. +. +.LP +The interpretation of lines between +.B .R1 +and +.B .R2 +as prepreocessor commands is a feature of GNU +.IR \%refer . \" GNU +. +Documents making use of this feature can still be processed by AT&T +.I \%refer \" AT&T +just by adding the lines +. +.RS +.EX +\&.de R1 +\&.ig R2 +\&.. +.EE +.RE +. +to the beginning of the document. +. +This will cause +.MR @g@troff @MAN1EXT@ +to ignore everything between +.B .R1 +and +.BR .R2 . +. +The effect of some commands can also be achieved by options. +. +These options are supported mainly for compatibility with AT&T +.IR \%refer . \" AT&T +. +It is usually more convenient to use commands. +. +. +.LP +.I @g@refer +generates +.B .lf +requests so that file names and line numbers in messages produced by +commands that read +.I @g@refer +output will be correct; +it also interprets lines beginning with +.B .lf +so that file names and line numbers in the messages and +.B .lf +lines that it produces will be accurate even if the input has been +preprocessed by a command such as +.MR @g@soelim @MAN1EXT@ . +. +. +.\" ==================================================================== +.SS "Bibliographic databases" +.\" ==================================================================== +. +The bibliographic database is a text file consisting of records +separated by one or more blank lines. +. +Within each record fields start with a +.B % +at the beginning of a line. +. +Each field has a one character name that immediately follows the +.BR % . +It is best to use only upper and lower case letters for the names +of fields. +. +The name of the field should be followed by exactly one space, +and then by the contents of the field. +. +Empty fields are ignored. +. +The conventional meaning of each field is as follows: +. +. +.TP +.B %A +The name of an author. +. +If the name contains a suffix such as \[lq]Jr.\&\[rq], +it should be separated from the last name by a comma. +. +There can be multiple occurrences of the +.B %A +field. +. +The order is significant. +. +It is a good idea always to supply an +.B %A +field or a +.B %Q +field. +. +. +.TP +.B %B +For an article that is part of a book, +the title of the book. +. +. +.TP +.B %C +The place (city) of publication. +. +. +.TP +.B %D +The date of publication. +. +The year should be specified in full. +. +If the month is specified, +the name rather than the number of the month should be used, +but only the first three letters are required. +. +It is a good idea always to supply a +.B %D +field; +if the date is unknown, +a value such as +.B in press +or +.B unknown +can be used. +. +. +.TP +.B %E +For an article that is part of a book, +the name of an editor of the book. +. +Where the work has editors and no authors, +the names of the editors should be given as +.B %A +fields and +.RB \[lq] ,\~(ed.)\& \[rq] +or +.RB \[lq] ,\~(eds.)\& \[rq] +should be appended to the last author. +. +. +.TP +.B %G +U.S. government ordering number. +. +. +.TP +.B %I +The publisher (issuer). +. +. +.TP +.B %J +For an article in a journal, +the name of the journal. +. +. +.TP +.B %K +Keywords to be used for searching. +. +. +.TP +.B %L +Label. +. +. +.TP +.B %N +Journal issue number. +. +. +.TP +.B %O +Other information. +. +This is usually printed at the end of the reference. +. +. +.TP +.B %P +Page number. +. +A range of pages can be specified as +.IB m \- \c +.IR n . +. +. +.TP +.B %Q +The name of the author, +if the author is not a person. +. +This will only be used if there are no +.B %A +fields. +. +There can only be one +.B %Q +field. +. +. +.TP +.B %R +Technical report number. +. +. +.TP +.B %S +Series name. +. +. +.TP +.B %T +Title. +. +For an article in a book or journal, +this should be the title of the article. +. +. +.TP +.B %V +Volume number of the journal or book. +. +. +.TP +.B %X +Annotation. +. +. +.LP +For all fields except +.B %A +and +.BR %E , +if there is more than one occurrence of a particular field in a record, +only the last such field will be used. +. +. +.P +If accent strings are used, +they should follow the character to be accented. +. +This means that an +.I ms +document must call the +.B .AM +macro when it initializes. +. +Accent strings should not be quoted: +use one +.B \e +rather than two. +. +Accent strings are an obsolescent feature of the +.I me +and +.I ms +macro packages; +modern documents should use +.I groff +special character escape sequences instead; +see +.MR groff_char @MAN7EXT@ . +. +. +.\" ==================================================================== +.SS Citations +.\" ==================================================================== +. +Citations have a characteristic format. +. +.RS +.EX +.BI .[ opening-text +.I flags keywords +.I fields +.BI .] closing-text +.EE +.RE +. +. +.LP +The +.IR opening-text , +.IR closing-text , +and +.I flags +components are optional. +. +Only one of the +.I keywords +and +.I fields +components need be specified. +. +. +.LP +The +.I keywords +component says to search the bibliographic databases for a reference +that contains all the words in +.IR keywords . +. +It is an error if more than one reference is found. +. +. +.LP +The +.I fields +components specifies additional fields to replace or supplement those +specified in the reference. +. +When references are being accumulated and the +.I keywords +component is non-empty, +then additional fields should be specified only on the first occasion +that a particular reference is cited, +and will apply to all citations of that reference. +. +. +.br +.ne 2v +.LP +The +.I opening-text +and +.I closing-text +components specify strings to be used to bracket the label instead of +those in the +.B \%bracket\-label +command. +. +If either of these components is non-empty, +the strings specified in the +.B \%bracket\-label +command will not be used; +this behavior can be altered using the +.B [ +and +.B ] +flags. +. +Leading and trailing spaces are significant for these components. +. +. +.LP +The +.I flags +component is a list of non-alphanumeric characters each of which +modifies the treatment of this particular citation. +. +AT&T +.I \%refer \" AT&T +will treat these flags as part of the keywords and so will ignore them +since they are non-alphanumeric. +. +The following flags are currently recognized. +. +. +.TP +.B # +Use the label specified by the +.B \%short\-label +command, +instead of that specified by the +.B \%label +command. +. +If no short label has been specified, +the normal label will be used. +. +Typically the short label is used with author-date labels and consists +of only the date and possibly a disambiguating letter; +the +.RB \[lq] # \[rq] +is supposed to be suggestive of a numeric type of label. +. +. +.TP +.B [ +Precede +.I opening-text +with the first string specified in the +.B \%bracket\-label +command. +. +. +.TP +.B ] +Follow +.I closing-text +with the second string specified in the +.B \%bracket\-label +command. +. +. +.LP +An advantage of using the +.B [ +and +.B ] +flags rather than including the brackets in +.I opening-text +and +.I closing-text +is that +. +you can change the style of bracket used in the document just by +changing the +.B \%bracket\-label +command. +. +Another is that sorting and merging of citations will not necessarily be +inhibited if the flags are used. +. +. +.LP +If a label is to be inserted into the text, +it will be attached to the line preceding the +.B .[ +line. +. +If there is no such line, +then an extra line will be inserted before the +.B .[ +line and a warning will be given. +. +. +.LP +There is no special notation for making a citation to multiple +references. +. +Just use a sequence of citations, +one for each reference. +. +Don't put anything between the citations. +. +The labels for all the citations will be attached to the line preceding +the first citation. +. +The labels may also be sorted or merged. +. +See the description of the +.B <> +label expression, +and of the +.B \%sort\-adjacent\-labels +and +.B \%abbreviate\-label\-ranges +commands. +. +A label will not be merged if its citation has a non-empty +.I opening-text +or +.IR closing-text . +. +However, +the labels for a citation using the +.B ] +flag and without any +.I closing-text +immediately followed by a citation using the +.B [ +flag and without any +.I opening-text +may be sorted and merged +even though the first citation's +.I opening-text +or the second citation's +.I closing-text +is non-empty. +. +(If you wish to prevent this, +use the dummy character escape sequence +.B \[rs]& +as the first citation's +.IR closing-text .) +. +. +.\" ==================================================================== +.SS Commands +.\" ==================================================================== +. +Commands are contained between lines starting with +.B .R1 +and +.BR .R2 . +. +Recognition of these lines can be prevented by the +.B \-R +option. +. +When a +.B .R1 +line is recognized any accumulated references are flushed out. +. +Neither +.B .R1 +nor +.B .R2 +lines, +nor anything between them, +is output. +. +. +.P +Commands are separated by newlines or semicolons. +. +A number sign +.RB ( # ) +introduces a comment that extends to the end of the line, +but does not conceal the newline. +. +Each command is broken up into words. +. +Words are separated by spaces or tabs. +. +A word that begins with a (neutral) double quote +.RB ( \[dq] ) +extends to the next double quote that is not followed by another double +quote. +. +If there is no such double quote, +the word extends to the end of the line. +. +Pairs of double quotes in a word beginning with a double quote collapse +to one double quote. +. +Neither a number sign nor a semicolon is recognized inside double +quotes. +. +A line can be continued by ending it with a backslash +.RB \[lq] \[rs] \[rq]; +this works everywhere except after a number sign. +. +. +.LP +.ds n \fR*\fP\" +Each command +.I name +that is marked with \*n has an associated negative command +.BI no\- name +that undoes the effect of +.IR name . +. +For example, +the +.B no\-sort +command specifies that references should not be sorted. +. +The negative commands take no arguments. +. +. +.LP +In the following description each argument must be a single word; +.I field +is used for a single upper or lower case letter naming a field; +.I fields +is used for a sequence of such letters; +.I m +and +.I n +are used for a non-negative numbers; +.I string +is used for an arbitrary string; +.I file +is used for the name of a file. +. +. +.TP +.BI abbreviate\*n\~ fields\~string1\~string2\~string3\~string4 +Abbreviate the first names of +.IR fields . +. +An initial letter will be separated from another initial letter by +.IR string1 , +from the last name by +.IR string2 , +and from anything else +(such as \[lq]von\[rq] or \[lq]de\[rq]) +by +.IR string3 . +. +These default to a period followed by a space. +. +In a hyphenated first name, +the initial of the first part of the name will be separated from the +hyphen by +.IR string4 ; +this defaults to a period. +. +No attempt is made to handle any ambiguities that might +result from abbreviation. +. +Names are abbreviated before sorting and before label construction. +. +. +.TP +.BI abbreviate\-label\-ranges\*n\~ string +. +Three or more adjacent labels that refer to consecutive references +will be abbreviated to a label consisting of the first label, +followed by +.IR string , +followed by the last label. +. +This is mainly useful with numeric labels. +. +If +.I string +is omitted, +it defaults to +.RB \[lq] \- \[rq]. +. +. +.TP +.B accumulate\*n +Accumulate references instead of writing out each reference +as it is encountered. +. +Accumulated references will be written out whenever a reference +of the form +. +.RS +.RS +.EX +.B .[ +.B $LIST$ +.B .] +.EE +.RE +. +is encountered, +after all input files have been processed, +and whenever a +.B .R1 +line is recognized. +.RE +. +. +.TP +.BI annotate\*n\~ "field string" +.I field +is an annotation; +print it at the end of the reference as a paragraph preceded by the line +. +.RS +.IP +.BI . string +. +. +.LP +If +.I string +is omitted, +it will default to +.BR AP ; +if +.I field +is also omitted it will default to +.BR X . +. +Only one field can be an annotation. +.RE +. +. +.TP +.BI articles\~ string\~\c +\&.\|.\|. +Each +.I string +is a definite or indefinite article, +and should be ignored at the beginning of +.B T +fields when sorting. +. +Initially, +\[lq]a\[rq], +\[lq]an\[rq], +and +\[lq]the\[rq] are recognized as articles. +. +. +.TP +.BI bibliography\~ file\~\c +\&.\|.\|. +. +Write out all the references contained in each bibliographic database +.IR file . +. +This command should come last in an +.BR .R1 / .R2 +block. +. +. +.TP +.BI bracket\-label\~ "string1 string2 string3" +In the text, +bracket each label with +.I string1 +and +.IR string2 . +. +An occurrence of +.I string2 +immediately followed by +.I string1 +will be turned into +.IR string3 . +. +The default behavior is as follows. +. +.RS \" RS twice to get inboard of the tagged paragraph indentation. +.RS +.EX +.B bracket\-label \e*([. \e*(.] \[dq], \[dq] +.EE +.RE +.RE +. +. +.TP +.BI capitalize\~ fields +Convert +.I fields +to caps and small caps. +. +. +.TP +.B compatible\*n +Recognize +.B .R1 +and +.B .R2 +even when followed by a character other than space or newline. +. +. +.TP +.BI database\~ file\~\c +\&.\|.\|. +Search each bibliographic database +.IR file . +. +For each +.IR file , +if an index +.RI file @INDEX_SUFFIX@ +created by +.MR @g@indxbib @MAN1EXT@ +exists, +then it will be searched instead; +each index can cover multiple databases. +. +. +.TP +.BI date\-as\-label\*n\~ string +.I string +is a label expression that specifies a string with which to replace the +.B D +field after constructing the label. +. +See subsection \[lq]Label expressions\[rq] below for a description of +label expressions. +. +This command is useful if you do not want explicit labels in the +reference list, +but instead want to handle any necessary disambiguation by qualifying +the date in some way. +. +The label used in the text would typically be some combination of the +author and date. +. +In most cases you should also use the +.B \%no\-label\-in\-reference +command. +. +For example, +. +.RS \" RS twice to get inboard of the tagged paragraph indentation. +.RS +.EX +.B date\-as\-label D.+yD.y%a*D.\-y +.EE +.RE +. +would attach a disambiguating letter to the year part of the +.B D +field in the reference. +.RE +. +. +.TP +.B default\-database\*n +The default database should be searched. +. +This is the default behavior, +so the negative version of this command is more useful. +. +.I @g@refer +determines whether the default database should be searched +on the first occasion that it needs to do a search. +. +Thus a +.B \%no\-default\-database +command must be given before then, +in order to be effective. +. +. +.TP +.BI discard\*n\~ fields +When the reference is read, +.I fields +should be discarded; +no string definitions for +.I fields +will be output. +. +Initially, +.I fields +are +.BR XYZ . +. +. +.TP +.BI et\-al\*n\~ "string m n" +Control use of +.B et al.\& +in the evaluation of +.B @ +expressions in label expressions. +. +If the number of authors needed to make the author sequence unambiguous +is +.I u +and the total number of authors is +.I t +then the last +.IR t \|\-\| u +authors will be replaced by +.I string +provided that +.IR t \|\-\| u +is not less than +.I m +and +.I t +is not less than +.IR n . +. +The default behavior is as follows. +. +.RS \" RS twice to get inboard of the tagged paragraph indentation. +.RS +.EX +.B et\-al \[dq] et al\[dq] 2 3 +.EE +.RE +. +Note the absence of a dot from the end of the abbreviation, +which is arguably not correct. +. +.RI ( "Et al" [.] +is short for +.IR "et alli" , +as +.I etc.\& +is short for +.IR "et cetera".) +.RE +. +. +.TP +.BI include\~ file +Include +.I file +and interpret the contents as commands. +. +. +.TP +.BI join\-authors\~ "string1 string2 string3" +Join multiple authors together with +.IR string s. +. +When there are exactly two authors, +they will be joined with +.IR string1 . +. +When there are more than two authors, +all but the last two will be joined with +.IR string2 , +and the last two authors will be joined with +.IR string3 . +. +If +.I string3 +is omitted, +it will default to +.IR string1 ; +if +.I string2 +is also omitted it will also default to +.IR string1 . +. +For example, +. +.RS +.RS +.EX +join\-authors \[dq] and \[dq] \[dq], \[dq] \[dq], and \[dq] +.EE +.RE +. +will restore the default method for joining authors. +.RE +. +. +.TP +.B label\-in\-reference\*n +When outputting the reference, +define the string +.B [F +to be the reference's label. +. +This is the default behavior, +so the negative version of this command is more useful. +. +. +.TP +.B label\-in\-text\*n +For each reference output a label in the text. +. +The label will be separated from the surrounding text as described in +the +.B \%bracket\-label +command. +. +This is the default behavior, +so the negative version of this command is more useful. +. +. +.TP +.BI label\~ string +.I string +is a label expression describing how to label each reference. +. +. +.TP +.BI separate\-label\-second\-parts\~ string +When merging two-part labels, +separate the second part of the second label from the first label with +.IR string . +. +See the description of the +.B <> +label expression. +. +. +.TP +.B move\-punctuation\*n +In the text, +move any punctuation at the end of line past the label. +. +It is usually a good idea to give this command unless you are using +superscripted numbers as labels. +. +. +.TP +.BI reverse\*n\~ string +Reverse the fields whose names +are in +.IR string . +. +Each field name can be followed by a number which says how many such +fields should be reversed. +. +If no number is given for a field, +all such fields will be reversed. +. +. +.TP +.BI search\-ignore\*n\~ fields +While searching for keys in databases for which no index exists, +ignore the contents of +.IR fields . +. +Initially, +fields +.B XYZ +are ignored. +. +. +.TP +.BI search\-truncate\*n\~ n +Only require the first +.I n +characters of keys to be given. +. +In effect when searching for a given key words in the database are +truncated to the maximum of +.I n +and the length of the key. +. +Initially, +.I n +is\~6. +. +. +.TP +.BI short\-label\*n\~ string +.I string +is a label expression that specifies an alternative +(usually shorter) +style of label. +. +This is used when the +.B # +flag is given in the citation. +. +When using author-date style labels, +the identity of the author or authors is sometimes clear from the +context, +and so it may be desirable to omit the author or authors from the label. +. +The +.B \%short\-label +command will typically be used to specify a label containing just +a date and possibly a disambiguating letter. +. +. +.TP +.BI sort\*n\~ string +Sort references according to +.IR string . +. +References will automatically be accumulated. +. +.I string +should be a list of field names, +each followed by a number, +indicating how many fields with the name should be used for sorting. +. +.RB \[lq] + \[rq] +can be used to indicate that all the fields with the name should be +used. +. +Also +.B .\& +can be used to indicate the references should be sorted using the +(tentative) label. +. +(Subsection \[lq]Label expressions\[rq] below describes the concept of a +tentative label.) +. +. +.TP +.B sort\-adjacent\-labels\*n +Sort labels that are adjacent in the text according to their position +in the reference list. +. +This command should usually be given if the +.B \%abbreviate\-label\-ranges +command has been given, +or if the label expression contains a +.B <> +expression. +. +This will have no effect unless references are being accumulated. +. +. +.\" ==================================================================== +.SS "Label expressions" +.\" ==================================================================== +. +Label expressions can be evaluated both normally and tentatively. +. +The result of normal evaluation is used for output. +. +The result of tentative evaluation, +called the +.IR "tentative label" , +is used to gather the information that normal evaluation needs to +disambiguate the label. +. +Label expressions specified by the +.B \%date\-as\-label +and +.B \%short\-label +commands are not evaluated tentatively. +. +Normal and tentative evaluation are the same for all types of expression +other than +.BR @ , +.BR * , +and +.B % +expressions. +. +The description below applies to normal evaluation, +except where otherwise specified. +. +. +.TP +.I field +.TQ +.I field\~n +The +.IR n -th +part of +.IR field . +. +If +.I n +is omitted, +it defaults to\~1. +. +. +.TP +.BI \[aq] string \[aq] +The characters in +.I string +literally. +. +. +.TP +.B @ +All the authors joined as specified by the +.B \%join\-authors +command. +. +The whole of each author's name will be used. +. +However, +if the references are sorted by author +(that is, +the sort specification starts with +.RB \[lq] A+ \[rq]), +then authors' last names will be used instead, +provided that this does not introduce ambiguity, +and also an initial subsequence of the authors may be used instead of +all the authors, +again provided that this does not introduce ambiguity. +. +The use of only the last name for the +.IR i -th +author of some reference +is considered to be ambiguous if +there is some other reference, +such that the first +.IR i \|\-\|1 +authors of the references are the same, +the +.IR i -th +authors are not the same, +but the +.IR i -th +authors last names are the same. +. +A proper initial subsequence of the sequence of authors for some +reference is considered to be ambiguous if there is a reference with +some other sequence of authors which also has that subsequence as a +proper initial subsequence. +. +When an initial subsequence of authors is used, +the remaining authors are replaced by the string specified by the +.B \%et\-al +command; +this command may also specify additional requirements that must be +met before an initial subsequence can be used. +. +.B @ +tentatively evaluates to a canonical representation of the authors, +such that authors that compare equally for sorting purpose will have +the same representation. +. +. +.TP +.BI % n +.TQ +.B %a +.TQ +.B %A +.TQ +.B %i +.TQ +.B %I +The serial number of the reference formatted according to the +character following the +.BR % . +The serial number of a reference is\~1 plus the number of earlier +references with same tentative label as this reference. +. +These expressions tentatively evaluate to an empty string. +. +.TP +.IB expr * +If there is another reference with the same tentative label as this +reference, +then +.IR expr , +otherwise an empty string. +. +It tentatively evaluates to an empty string. +. +. +.TP +.IB expr + n +.TQ +.IB expr \- n +The first +.RB ( + ) +or last +.RB ( \- ) +.I n +upper or lower case letters or digits of +.IR expr . +. +.I roff +special characters +(such as +.BR \e(\[aq]a ) +count as a single letter. +. +Accent strings are retained but do not count towards the total. +. +. +.TP +.IB expr .l +.I expr +converted to lowercase. +. +. +.TP +.IB expr .u +.I expr +converted to uppercase. +. +. +.TP +.IB expr .c +.I expr +converted to caps and small caps. +. +. +.TP +.IB expr .r +.I expr +reversed so that the last name is first. +. +. +.TP +.IB expr .a +.I expr +with first names abbreviated. +. +Fields specified in the +.B \%abbreviate +command are abbreviated before any labels are evaluated. +. +Thus +.B .a +is useful only when you want a field to be abbreviated in a label +but not in a reference. +. +. +.TP +.IB expr .y +The year part of +.IR expr . +. +. +.TP +.IB expr .+y +The part of +.I expr +before the year, +or the whole of +.I expr +if it does not contain a year. +. +. +.TP +.IB expr .\-y +The part of +.I expr +after the year, +or an empty string if +.I expr +does not contain a year. +. +. +.TP +.IB expr .n +The last name part of +.IR expr . +. +. +.TP +.IB expr1 \[ti] expr2 +.I expr1 +except that if the last character of +.I expr1 +is +.B \- +then it will be replaced by +.IR expr2 . +. +. +.TP +.I expr1 expr2 +The concatenation of +.I expr1 +and +.IR expr2 . +. +. +.TP +.IB expr1 | expr2 +If +.I expr1 +is non-empty then +.I expr1 +otherwise +.IR expr2 . +. +. +.TP +.IB expr1 & expr2 +If +.I expr1 +is non-empty +then +.I expr2 +otherwise an empty string. +. +. +.TP +.IB expr1 ? expr2 : expr3 +If +.I expr1 +is non-empty +then +.I expr2 +otherwise +.IR expr3 . +. +. +.TP +.BI < expr > +The label is in two parts, +which are separated by +.IR expr . +. +Two adjacent two-part labels which have the same first part will be +merged by appending the second part of the second label onto the first +label separated by the string specified in the +.B \%separate\-label\-second\-parts +command +(initially, +a comma followed by a space); +the resulting label will also be a two-part label with the same first +part as before merging, +and so additional labels can be merged into it. +. +It is permissible for the first part to be empty; +this may be desirable for expressions used in the +.B \%short\-label +command. +. +. +.TP +.BI ( expr ) +The same as +.IR expr . +. +Used for grouping. +. +. +.LP +The above expressions are listed in order of precedence +(highest first); +.B & +and +.B | +have the same precedence. +. +. +.\" ==================================================================== +.SS "Macro interface" +.\" ==================================================================== +. +Each reference starts with a call to the macro +.BR ]\- . +. +The string +.B [F +will be defined to be the label for this reference, +unless the +.B \%no\-label\-in\-reference +command has been given. +. +There then follows a series of string definitions, +one for each field: +string +.BI [ X +corresponds to field +.IR X . +. +The register +.B [P +is set to\~1 if the +.B P +field contains a range of pages. +. +The +.BR [T , +.B [A +and +.B [O +registers are set to\~1 according as the +.BR T , +.B A +and +.B O +fields end with any of +.B .?!\& +(an end-of-sentence character). +. +The +.B [E +register will be set to\~1 if the +.B [E +string contains more than one name. +. +The reference is followed by a call to the +.B ][ +macro. +. +The first argument to this macro gives a number representing +the type of the reference. +. +If a reference contains a +.B J +field, +it will be classified as type\~1, +otherwise if it contains a +.B B +field, +it will be type\~3, +otherwise if it contains a +.B G +or +.B R +field it will be type\~4, +otherwise if it contains an +.B I +field it will be type\~2, +otherwise it will be type\~0. +. +The second argument is a symbolic name for the type: +.BR other , +.BR \%journal\-article , +.BR book , +.BR \%article\-in\-book , +or +.BR \%tech\-report . +. +Groups of references that have been accumulated or are produced by the +.B \%bibliography +command are preceded by a call to the +.B ]< +macro and followed by a call to the +.B ]> +macro. +. +. +.br +.ne 4v +.\" ==================================================================== +.SH Options +.\" ==================================================================== +. +.B \-\-help +displays a usage message, +while +.B \-v +and +.B \-\-version +show version information; +all exit afterward. +. +. +.TP +.B \-R +Don't recognize lines beginning with +.BR .R1 / .R2 . +. +. +.P +Other options are equivalent to +.I @g@refer +commands. +. +. +.TP 16n +.BI \-a\~ n +.B reverse +.BI A n +. +. +.TP +.B \-b +.B "\%no\-label\-in\-text; \%no\-label\-in\-reference" +. +. +.TP +.B \-B +See below. +. +. +.TP +.BI \-c\~ fields +.B capitalize +.I fields +. +. +.TP +.B \-C +.B compatible +. +. +.TP +.B \-e +.B accumulate +. +. +.TP +.BI \-f\~ n +.B \%label +.BI % n +. +. +.TP +.BI \-i\~ fields +.B search\-ignore +.I fields +. +. +.TP +.B \-k +.B \%label +.B L\[ti]%a +. +. +.TP +.BI \-k\~ field +.B \%label +.IB field \[ti]%a +. +. +.TP +.B \-l +.B \%label +.B A.nD.y%a +. +. +.TP +.BI \-l\~ m +.B \%label +.BI A.n+ m D.y%a +. +. +.TP +.BI \-l\~, n +.B \%label +.BI A.nD.y\- n %a +. +. +.TP +.BI \-l\~ m , n +.B \%label +.BI A.n+ m D.y\- n %a +. +. +.TP +.B \-n +.B \%no\-default\-database +. +. +.TP +.BI \-p\~ db-file +.B database +.I db-file +. +. +.TP +.B \-P +.B move\-punctuation +. +. +.TP +.BI \-s\~ spec +.B sort +.I spec +. +. +.TP +.B \-S +.B \%label \[dq](A.n|Q) \[aq], \[aq] (D.y|D)\[dq]; \ +\%bracket-\%label \[dq]\~(\[dq]\~)\~\[dq];\~\[dq] +. +. +.TP +.BI \-t\~ n +.B search\-truncate +.I n +. +. +.P +The +.B B +option has command equivalents with the addition that the file names +specified on the command line are processed as if they were arguments to +the +.B \%bibliography +command instead of in the normal way. +. +. +.TP 16n +.B \-B +.B "annotate X AP; \%no\-label\-in\-reference" +. +. +.TP +.BI \-B\~ field . macro +.B annotate +.I field +.IB macro ; +.B \%no\-label\-in\-reference +. +. +.\" ==================================================================== +.SH Environment +.\" ==================================================================== +. +.TP +.I REFER +If set, +overrides the default database. +. +. +.\" ==================================================================== +.SH Files +.\" ==================================================================== +. +.TP +.I @DEFAULT_INDEX@ +Default database. +. +. +.TP +.RI file @INDEX_SUFFIX@ +Index files. +. +. +.TP +.I @MACRODIR@/\:refer\:.tmac +defines macros and strings facilitating integration with macro packages +that wish to support +.IR @g@refer . +. +. +.LP +.I @g@refer +uses temporary files. +. +See the +.MR groff @MAN1EXT@ +man page for details of where such files are created. +. +. +.\" ==================================================================== +.SH Bugs +.\" ==================================================================== +. +In label expressions, +.B <> +expressions are ignored inside +.BI . char +expressions. +. +. +.\" ==================================================================== +.SH Examples +.\" ==================================================================== +. +We can illustrate the operation of +.I @g@refer +with a sample bibliographic database containing one entry and a simple +.I roff +document to cite that entry. +. +. +.P +.RS +.EX +$ \c +.B cat > my\-db\-file +.B %A Daniel P.\[rs]& Friedman +.B %A Matthias Felleisen +.B %C Cambridge, Massachusetts +.B %D 1996 +.B %I The MIT Press +.B %T The Little Schemer, Fourth Edition +$ \c +.B refer -p my\-db\-file +.B Read the book +.B .[ +.B friedman +.B .] +.B on your summer vacation. +.I +\&.lf 1 \- +Read the book\[rs]*([.1\[rs]*(.] +\&.ds [F 1 +\&.]\- +\&.ds [A Daniel P. Friedman and Matthias Felleisen +\&.ds [C Cambridge, Massachusetts +\&.ds [D 1996 +\&.ds [I The MIT Press +\&.ds [T The Little Schemer, Fourth Edition +\&.nr [T 0 +\&.nr [A 0 +\&.][ 2 book +\&.lf 5 \- +on your summer vacation. +.EE +.RE +. +. +.P +The foregoing shows us that +.I @g@refer +(a) produces a label \[lq]1\[rq]; +(b) brackets that label with interpolations of the +.RB \[lq] [. \[rq] +and +.RB \[lq] .] \[rq] +strings; +(c) calls a macro +.RB \[lq] ]\- \[rq]; +(d) defines strings and registers containing the label and bibliographic +data for the reference; +(e) calls a macro +.RB \[lq] ][ \[rq]; +and (f) uses the +.B lf +request to restore the line numbers of the original input. +. +As discussed in subsection \[lq]Macro interface\[rq] above, +it is up to the document or a macro package to employ and format this +information usefully. +. +Let us see how we might turn +.MR groff_ms @MAN7EXT@ +to this task. +. +. +.P +.RS +.EX +$ \c +.B REFER=my\-db\-file groff \-R \-ms +.B .LP +.B Read the book +.B .[ +.B friedman +.B .] +.B on your summer vacation. +.B Commentary is available.\[rs]*{*\[rs]*} +.B .FS \[rs]*{*\[rs]*} +.B Space reserved for penetrating insight. +.B .FE +.EE +.RE +. +. +.LP +.IR ms 's +automatic footnote numbering mechanism is not aware of +.IR @g@refer 's +label numbering, +so we have manually specified a (superscripted) symbolic footnote for +our non-bibliographic aside. +. +. +.\" ==================================================================== +.SH "See also" +.\" ==================================================================== +. +\[lq]Some Applications of Inverted Indexes on the Unix System\[rq], +by M.\& E.\& Lesk, +1978, +AT&T Bell Laboratories Computing Science Technical Report No.\& 69. +. +. +.LP +.MR @g@indxbib @MAN1EXT@ , +.MR @g@lookbib @MAN1EXT@ , +.MR lkbib @MAN1EXT@ +. +. +.\" Restore compatibility mode (for, e.g., Solaris 10/11). +.cp \n[*groff_refer_1_man_C] +.do rr *groff_refer_1_man_C +. +. +.\" Local Variables: +.\" fill-column: 72 +.\" mode: nroff +.\" End: +.\" vim: set filetype=groff textwidth=72: diff --git a/src/preproc/refer/refer.am b/src/preproc/refer/refer.am new file mode 100644 index 0000000..273f334 --- /dev/null +++ b/src/preproc/refer/refer.am @@ -0,0 +1,61 @@ +# Copyright (C) 2014-2020 Free Software Foundation, Inc. +# +# This file is part of groff. +# +# groff is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# groff is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +prefixexecbin_PROGRAMS += refer +refer_CPPFLAGS = $(AM_CPPFLAGS) -I $(top_srcdir)/src/preproc/refer +refer_LDADD = libbib.a libgroff.a $(LIBM) lib/libgnu.a +refer_SOURCES = \ + src/preproc/refer/command.cpp \ + src/preproc/refer/ref.cpp \ + src/preproc/refer/refer.cpp \ + src/preproc/refer/token.cpp \ + src/preproc/refer/label.ypp \ + src/preproc/refer/refer.h \ + src/preproc/refer/ref.h \ + src/preproc/refer/token.h \ + src/preproc/refer/command.h + +PREFIXMAN1 += src/preproc/refer/refer.1 +EXTRA_DIST += \ + src/preproc/refer/TODO \ + src/preproc/refer/refer.1.man + +# Since refer_CPPFLAGS was set, all .o files have a 'refer-' prefix. +src/preproc/refer/refer-command.$(OBJEXT): defs.h +src/preproc/refer/refer-ref.$(OBJEXT): defs.h +src/preproc/refer/refer-refer.$(OBJEXT): defs.h +src/preproc/refer/refer-token.$(OBJEXT): defs.h +src/preproc/refer/refer-label.$(OBJEXT): defs.h + +MAINTAINERCLEANFILES += \ + src/preproc/refer/label.cpp \ + src/preproc/refer/label.hpp \ + src/preproc/refer/label.output + +refer_TESTS = \ + src/preproc/refer/tests/report-correct-line-numbers.sh +TESTS += $(refer_TESTS) +EXTRA_DIST += \ + $(refer_TESTS) \ + src/preproc/refer/tests/artifacts/62124.bib + + +# Local Variables: +# fill-column: 72 +# mode: makefile-automake +# End: +# vim: set autoindent filetype=automake textwidth=72: diff --git a/src/preproc/refer/refer.cpp b/src/preproc/refer/refer.cpp new file mode 100644 index 0000000..a5c291e --- /dev/null +++ b/src/preproc/refer/refer.cpp @@ -0,0 +1,1267 @@ +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" +#include "search.h" +#include "command.h" + +extern "C" const char *Version_string; + +const char PRE_LABEL_MARKER = '\013'; +const char POST_LABEL_MARKER = '\014'; +const char LABEL_MARKER = '\015'; // label_type is added on + +#define FORCE_LEFT_BRACKET 04 +#define FORCE_RIGHT_BRACKET 010 + +static FILE *outfp = stdout; + +string capitalize_fields; +string reverse_fields; +string abbreviate_fields; +string period_before_last_name = ". "; +string period_before_initial = "."; +string period_before_hyphen = ""; +string period_before_other = ". "; +string sort_fields; +int annotation_field = -1; +string annotation_macro; +string discard_fields = "XYZ"; +string pre_label = "\\*([."; +string post_label = "\\*(.]"; +string sep_label = ", "; +int have_bibliography = 0; +int accumulate = 0; +int move_punctuation = 0; +int abbreviate_label_ranges = 0; +string label_range_indicator; +int label_in_text = 1; +int label_in_reference = 1; +int date_as_label = 0; +int sort_adjacent_labels = 0; +// Join exactly two authors with this. +string join_authors_exactly_two = " and "; +// When there are more than two authors join the last two with this. +string join_authors_last_two = ", and "; +// Otherwise join authors with this. +string join_authors_default = ", "; +string separate_label_second_parts = ", "; +// Use this string to represent that there are other authors. +string et_al = " et al"; +// Use et al only if it can replace at least this many authors. +int et_al_min_elide = 2; +// Use et al only if the total number of authors is at least this. +int et_al_min_total = 3; + + +int compatible_flag = 0; + +int short_label_flag = 0; + +static bool recognize_R1_R2 = true; + +search_list database_list; +int search_default = 1; +static int default_database_loaded = 0; + +static reference **citation = 0; +static int ncitations = 0; +static int citation_max = 0; + +static reference **reference_hash_table = 0; +static int hash_table_size; +static int nreferences = 0; + +static int need_syncing = 0; +string pending_line; +string pending_lf_lines; + +static void output_pending_line(); +static unsigned immediately_handle_reference(const string &); +static void immediately_output_references(); +static unsigned store_reference(const string &); +static void divert_to_temporary_file(); +static reference *make_reference(const string &, unsigned *); +static void usage(FILE *stream); +static void do_file(const char *); +static void split_punct(string &line, string &punct); +static void output_citation_group(reference **v, int n, label_type, + FILE *fp); +static void possibly_load_default_database(); + +int main(int argc, char **argv) +{ + program_name = argv[0]; + static char stderr_buf[BUFSIZ]; + setbuf(stderr, stderr_buf); + outfp = stdout; + int finished_options = 0; + int bib_flag = 0; + int done_spec = 0; + + // TODO: Migrate to getopt_long; see, e.g., src/preproc/eqn/main.cpp. + for (--argc, ++argv; + !finished_options && argc > 0 && argv[0][0] == '-' + && argv[0][1] != '\0'; + argv++, argc--) { + const char *opt = argv[0] + 1; + while (opt != 0 && *opt != '\0') { + switch (*opt) { + case 'C': + compatible_flag = 1; + opt++; + break; + case 'B': + bib_flag = 1; + label_in_reference = 0; + label_in_text = 0; + ++opt; + if (*opt == '\0') { + annotation_field = 'X'; + annotation_macro = "AP"; + } + else if (csalnum(opt[0]) && opt[1] == '.' && opt[2] != '\0') { + annotation_field = opt[0]; + annotation_macro = opt + 2; + } + opt = 0; + break; + case 'P': + move_punctuation = 1; + opt++; + break; + case 'R': + recognize_R1_R2 = false; + opt++; + break; + case 'S': + // Not a very useful spec. + set_label_spec("(A.n|Q)', '(D.y|D)"); + done_spec = 1; + pre_label = " ("; + post_label = ")"; + sep_label = "; "; + opt++; + break; + case 'V': + do_verify = true; + opt++; + break; + case 'f': + { + const char *num = 0; + if (*++opt == '\0') { + if (argc > 1) { + num = *++argv; + --argc; + } + else { + error("'f' option requires an argument"); + usage(stderr); + exit(1); + } + } + else { + num = opt; + opt = 0; + } + const char *ptr; + for (ptr = num; *ptr; ptr++) + if (!csdigit(*ptr)) { + error("invalid character '%1' in argument to 'f' option", + *ptr); + break; + } + if (*ptr == '\0') { + string spec; + spec = '%'; + spec += num; + spec += '\0'; + set_label_spec(spec.contents()); + done_spec = 1; + } + break; + } + case 'b': + label_in_text = 0; + label_in_reference = 0; + opt++; + break; + case 'e': + accumulate = 1; + opt++; + break; + case 'c': + capitalize_fields = ++opt; + opt = 0; + break; + case 'k': + { + char buf[5]; + if (csalpha(*++opt)) + buf[0] = *opt++; + else { + if (*opt != '\0') + error("invalid field name '%1' in argument to 'k' option", + *opt++); + buf[0] = 'L'; + } + buf[1] = '~'; + buf[2] = '%'; + buf[3] = 'a'; + buf[4] = '\0'; + set_label_spec(buf); + done_spec = 1; + } + break; + case 'a': + { + const char *ptr; + for (ptr = ++opt; *ptr; ptr++) + if (!csdigit(*ptr)) { + error("'a' option argument must be an integer"); + break; + } + if (*ptr == '\0') { + reverse_fields = 'A'; + reverse_fields += opt; + } + opt = 0; + } + break; + case 'i': + linear_ignore_fields = ++opt; + opt = 0; + break; + case 'l': + { + char buf[INT_DIGITS*2 + 11]; // A.n+2D.y-3%a + strcpy(buf, "A.n"); + if (*++opt != '\0' && *opt != ',') { + char *ptr; + long n = strtol(opt, &ptr, 10); + if (n == 0 && ptr == opt) { + error("invalid integer '%1' in 'l' option argument", opt); + opt = 0; + break; + } + if (n < 0) + n = 0; + opt = ptr; + sprintf(strchr(buf, '\0'), "+%ld", n); + } + strcat(buf, "D.y"); + if (*opt == ',') + opt++; + if (*opt != '\0') { + char *ptr; + long n = strtol(opt, &ptr, 10); + if (n == 0 && ptr == opt) { + error("invalid integer '%1' in 'l' option argument", opt); + opt = 0; + break; + } + if (n < 0) + n = 0; + sprintf(strchr(buf, '\0'), "-%ld", n); + opt = ptr; + if (*opt != '\0') + error("argument to 'l' option not of form 'm,n'"); + } + strcat(buf, "%a"); + if (!set_label_spec(buf)) + assert(0 == "set_label_spec() failed"); + done_spec = 1; + } + break; + case 'n': + search_default = 0; + opt++; + break; + case 'p': + { + const char *filename = 0; + if (*++opt == '\0') { + if (argc > 1) { + filename = *++argv; + argc--; + } + else { + error("option 'p' requires an argument"); + usage(stderr); + exit(1); + } + } + else { + filename = opt; + opt = 0; + } + database_list.add_file(filename); + } + break; + case 's': + if (*++opt == '\0') + sort_fields = "AD"; + else { + sort_fields = opt; + opt = 0; + } + accumulate = 1; + break; + case 't': + { + char *ptr; + long n = strtol(opt, &ptr, 10); + if (n == 0 && ptr == opt) { + error("invalid integer '%1' in 't' option argument", opt); + opt = 0; + break; + } + if (n < 1) + n = 1; + linear_truncate_len = int(n); + opt = ptr; + break; + } + case '-': + if (opt[1] == '\0') { + finished_options = 1; + opt++; + break; + } + if (strcmp(opt, "-version") == 0) { + case 'v': + printf("GNU refer (groff) version %s\n", Version_string); + exit(0); + break; + } + if (strcmp(opt, "-help") == 0) { + usage(stdout); + exit(0); + break; + } + // fall through + default: + error("unrecognized option '%1'", opt); + usage(stderr); + exit(1); + break; + } + } + } + if (!done_spec) + set_label_spec("%1"); + if (argc <= 0) { + if (bib_flag) + do_bib("-"); + else + do_file("-"); + } + else { + for (int i = 0; i < argc; i++) { + if (bib_flag) + do_bib(argv[i]); + else + do_file(argv[i]); + } + } + if (accumulate) + output_references(); + if (fflush(stdout) < 0) + fatal("output error: %1", strerror(errno)); + return 0; +} + +static void usage(FILE *stream) +{ + fprintf(stream, +"usage: %s [-bCenPRS] [-aN] [-cXYZ] [-fN] [-iXYZ] [-kX] [-lM,N]" +" [-p db-file] [-sXYZ] [-tN] [-Bl.m] [file ...]\n" +"usage: %s {-v | --version}\n" +"usage: %s --help\n", + program_name, program_name, program_name); +} + +static void possibly_load_default_database() +{ + if (search_default && !default_database_loaded) { + char *filename = getenv("REFER"); + if (filename) + database_list.add_file(filename); + else + database_list.add_file(DEFAULT_INDEX, 1); + default_database_loaded = 1; + } +} + +static bool is_list(const string &str) +{ + const char *start = str.contents(); + const char *end = start + str.length(); + while (end > start && csspace(end[-1])) + end--; + while (start < end && csspace(*start)) + start++; + return end - start == 6 && memcmp(start, "$LIST$", 6) == 0; +} + +static void do_file(const char *filename) +{ + FILE *fp; + if (strcmp(filename, "-") == 0) { + fp = stdin; + } + else { + errno = 0; + fp = fopen(filename, "r"); + if (fp == 0) { + error("can't open '%1': %2", filename, strerror(errno)); + return; + } + } + string fn(filename); + fn += '\0'; + normalize_for_lf(fn); + current_filename = fn.contents(); + fprintf(outfp, ".lf 1 %s\n", current_filename); + current_lineno = 1; + string line; + for (;;) { + line.clear(); + for (;;) { + int c = getc(fp); + if (EOF == c) { + if (line.length() > 0) + line += '\n'; + break; + } + if (is_invalid_input_char(c)) + error("invalid input character code %1", c); + else { + line += c; + if ('\n' == c) + break; + } + } + int len = line.length(); + if (len == 0) + break; + current_lineno++; + if (len >= 2 && line[0] == '.' && line[1] == '[') { + int start_lineno = current_lineno; + bool at_start_of_line = true; + string str; + string post; + string pre(line.contents() + 2, line.length() - 3); + for (;;) { + int c = getc(fp); + if (EOF == c) { + error_with_file_and_line(current_filename, start_lineno, + "missing '.]' line"); + break; + } + if (at_start_of_line) + current_lineno++; + if (at_start_of_line && '.' == c) { + int d = getc(fp); + if (d == ']') { + while ((d = getc(fp)) != '\n' && d != EOF) { + if (is_invalid_input_char(d)) + error("invalid input character code %1", d); + else + post += d; + } + break; + } + if (d != EOF) + ungetc(d, fp); + } + if (is_invalid_input_char(c)) + error("invalid input character code %1", c); + else + str += c; + at_start_of_line = ('\n' == c); + } + if (is_list(str)) { + output_pending_line(); + if (accumulate) + output_references(); + else + error("found '$LIST$' but not accumulating references"); + } + else { + unsigned flags = (accumulate + ? store_reference(str) + : immediately_handle_reference(str)); + if (label_in_text) { + if (accumulate && outfp == stdout) + divert_to_temporary_file(); + if (pending_line.length() == 0) { + warning("can't attach citation to previous line"); + } + else + pending_line.set_length(pending_line.length() - 1); + string punct; + if (move_punctuation) + split_punct(pending_line, punct); + int have_text = pre.length() > 0 || post.length() > 0; + label_type lt = label_type(flags & ~(FORCE_LEFT_BRACKET + |FORCE_RIGHT_BRACKET)); + if ((flags & FORCE_LEFT_BRACKET) || !have_text) + pending_line += PRE_LABEL_MARKER; + pending_line += pre; + char lm = LABEL_MARKER + (int)lt; + pending_line += lm; + pending_line += post; + if ((flags & FORCE_RIGHT_BRACKET) || !have_text) + pending_line += POST_LABEL_MARKER; + pending_line += punct; + pending_line += '\n'; + } + } + need_syncing = 1; + } + else if (len >= 4 + && '.' == line[0] && 'l' == line[1] && 'f' == line[2] + && (compatible_flag || '\n' == line[3] || ' ' == line[3])) + { + pending_lf_lines += line; + line += '\0'; + if (interpret_lf_args(line.contents() + 3)) + current_lineno--; + } + else if (recognize_R1_R2 + && len >= 4 + && '.' == line[0] && 'R' == line[1] && '1' == line[2] + && (compatible_flag || '\n' == line[3] || ' ' == line[3])) + { + line.clear(); + int start_lineno = current_lineno; + bool at_start_of_line = true; + for (;;) { + int c = getc(fp); + if (c != EOF && at_start_of_line) + current_lineno++; + if (at_start_of_line && '.' == c) { + c = getc(fp); + if ('R' == c) { + c = getc(fp); + if ('2' == c) { + c = getc(fp); + if (compatible_flag || ' ' == c || '\n' == c || EOF == c) + { + while (c != EOF && c != '\n') + c = getc(fp); + break; + } + else { + line += '.'; + line += 'R'; + line += '2'; + } + } + else { + line += '.'; + line += 'R'; + } + } + else + line += '.'; + } + if (EOF == c) { + error_with_file_and_line(current_filename, start_lineno, + "missing '.R2' line"); + break; + } + if (is_invalid_input_char(c)) + error_with_file_and_line(current_filename, start_lineno, + "invalid input character code %1", + c); + else { + line += c; + at_start_of_line = ('\n' == c); + } + } + output_pending_line(); + if (accumulate) + output_references(); + else + nreferences = 0; + process_commands(line, current_filename, start_lineno + 1); + need_syncing = 1; + } + else { + output_pending_line(); + pending_line = line; + } + } + need_syncing = 0; + output_pending_line(); + if (fp != stdin) + fclose(fp); +} + +class label_processing_state { + enum { + NORMAL, + PENDING_LABEL, + PENDING_LABEL_POST, + PENDING_LABEL_POST_PRE, + PENDING_POST + } state; + label_type type; // type of pending labels + int count; // number of pending labels + reference **rptr; // pointer to next reference + int rcount; // number of references left + FILE *fp; + int handle_pending(int c); +public: + label_processing_state(reference **, int, FILE *); + ~label_processing_state(); + void process(int c); +}; + +static void output_pending_line() +{ + if (label_in_text && !accumulate && ncitations > 0) { + label_processing_state state(citation, ncitations, outfp); + int len = pending_line.length(); + for (int i = 0; i < len; i++) + state.process((unsigned char)(pending_line[i])); + } + else + put_string(pending_line, outfp); + pending_line.clear(); + if (pending_lf_lines.length() > 0) { + put_string(pending_lf_lines, outfp); + pending_lf_lines.clear(); + } + if (!accumulate) + immediately_output_references(); + if (need_syncing) { + fprintf(outfp, ".lf %d %s\n", current_lineno, current_filename); + need_syncing = 0; + } +} + +static void split_punct(string &line, string &punct) +{ + const char *start = line.contents(); + const char *end = start + line.length(); + const char *ptr = start; + const char *last_token_start = 0; + for (;;) { + if (ptr >= end) + break; + last_token_start = ptr; + if (*ptr == PRE_LABEL_MARKER || *ptr == POST_LABEL_MARKER + || (*ptr >= LABEL_MARKER + && *ptr < LABEL_MARKER + N_LABEL_TYPES)) + ptr++; + else if (!get_token(&ptr, end)) + break; + } + if (last_token_start) { + const token_info *ti = lookup_token(last_token_start, end); + if (ti->is_punct()) { + punct.append(last_token_start, end - last_token_start); + line.set_length(last_token_start - start); + } + } +} + +static void divert_to_temporary_file() +{ + outfp = xtmpfile(); +} + +static void store_citation(reference *ref) +{ + if (ncitations >= citation_max) { + if (citation == 0) + citation = new reference*[citation_max = 100]; + else { + reference **old_citation = citation; + citation_max *= 2; + citation = new reference *[citation_max]; + memcpy(citation, old_citation, ncitations*sizeof(reference *)); + delete[] old_citation; + } + } + citation[ncitations++] = ref; +} + +static unsigned store_reference(const string &str) +{ + if (reference_hash_table == 0) { + reference_hash_table = new reference *[17]; + hash_table_size = 17; + for (int i = 0; i < hash_table_size; i++) + reference_hash_table[i] = 0; + } + unsigned flags; + reference *ref = make_reference(str, &flags); + ref->compute_hash_code(); + unsigned h = ref->hash(); + reference **ptr; + for (ptr = reference_hash_table + (h % hash_table_size); + *ptr != 0; + ((ptr == reference_hash_table) + ? (ptr = reference_hash_table + hash_table_size - 1) + : --ptr)) + if (same_reference(**ptr, *ref)) + break; + if (*ptr != 0) { + if (ref->is_merged()) + warning("fields ignored because reference already used"); + delete ref; + ref = *ptr; + } + else { + *ptr = ref; + ref->set_number(nreferences); + nreferences++; + ref->pre_compute_label(); + ref->compute_sort_key(); + if (nreferences*2 >= hash_table_size) { + // Rehash it. + reference **old_table = reference_hash_table; + int old_size = hash_table_size; + hash_table_size = next_size(hash_table_size); + reference_hash_table = new reference*[hash_table_size]; + int i; + for (i = 0; i < hash_table_size; i++) + reference_hash_table[i] = 0; + for (i = 0; i < old_size; i++) + if (old_table[i]) { + reference **p; + for (p = (reference_hash_table + + (old_table[i]->hash() % hash_table_size)); + *p; + ((p == reference_hash_table) + ? (p = reference_hash_table + hash_table_size - 1) + : --p)) + ; + *p = old_table[i]; + } + delete[] old_table; + } + } + if (label_in_text) + store_citation(ref); + return flags; +} + +unsigned immediately_handle_reference(const string &str) +{ + unsigned flags; + reference *ref = make_reference(str, &flags); + ref->set_number(nreferences); + if (label_in_text || label_in_reference) { + ref->pre_compute_label(); + ref->immediate_compute_label(); + } + nreferences++; + store_citation(ref); + return flags; +} + +static void immediately_output_references() +{ + for (int i = 0; i < ncitations; i++) { + reference *ref = citation[i]; + if (label_in_reference) { + fputs(".ds [F ", outfp); + const string &label = ref->get_label(NORMAL_LABEL); + if (label.length() > 0 + && (label[0] == ' ' || label[0] == '\\' || label[0] == '"')) + putc('"', outfp); + put_string(label, outfp); + putc('\n', outfp); + } + ref->output(outfp); + delete ref; + } + ncitations = 0; +} + +static void output_citation_group(reference **v, int n, label_type type, + FILE *fp) +{ + if (sort_adjacent_labels) { + // Do an insertion sort. Usually n will be very small. + for (int i = 1; i < n; i++) { + int num = v[i]->get_number(); + reference *temp = v[i]; + int j; + for (j = i - 1; j >= 0 && v[j]->get_number() > num; j--) + v[j + 1] = v[j]; + v[j + 1] = temp; + } + } + // This messes up if !accumulate. + if (accumulate && n > 1) { + // remove duplicates + int j = 1; + for (int i = 1; i < n; i++) + if (v[i]->get_label(type) != v[i - 1]->get_label(type)) + v[j++] = v[i]; + n = j; + } + string merged_label; + for (int i = 0; i < n; i++) { + int nmerged = v[i]->merge_labels(v + i + 1, n - i - 1, type, + merged_label); + if (nmerged > 0) { + put_string(merged_label, fp); + i += nmerged; + } + else + put_string(v[i]->get_label(type), fp); + if (i < n - 1) + put_string(sep_label, fp); + } +} + + +label_processing_state::label_processing_state(reference **p, int n, + FILE *f) +: state(NORMAL), count(0), rptr(p), rcount(n), fp(f) +{ +} + +label_processing_state::~label_processing_state() +{ + int handled = handle_pending(EOF); + assert(!handled); + assert(rcount == 0); +} + +int label_processing_state::handle_pending(int c) +{ + switch (state) { + case NORMAL: + break; + case PENDING_LABEL: + if (POST_LABEL_MARKER == c) { + state = PENDING_LABEL_POST; + return 1; + } + else { + output_citation_group(rptr, count, type, fp); + rptr += count ; + rcount -= count; + state = NORMAL; + } + break; + case PENDING_LABEL_POST: + if (PRE_LABEL_MARKER == c) { + state = PENDING_LABEL_POST_PRE; + return 1; + } + else { + output_citation_group(rptr, count, type, fp); + rptr += count; + rcount -= count; + put_string(post_label, fp); + state = NORMAL; + } + break; + case PENDING_LABEL_POST_PRE: + if (c >= LABEL_MARKER + && c < LABEL_MARKER + N_LABEL_TYPES + && c - LABEL_MARKER == type) { + count += 1; + state = PENDING_LABEL; + return 1; + } + else { + output_citation_group(rptr, count, type, fp); + rptr += count; + rcount -= count; + put_string(sep_label, fp); + state = NORMAL; + } + break; + case PENDING_POST: + if (PRE_LABEL_MARKER == c) { + put_string(sep_label, fp); + state = NORMAL; + return 1; + } + else { + put_string(post_label, fp); + state = NORMAL; + } + break; + } + return 0; +} + +void label_processing_state::process(int c) +{ + if (handle_pending(c)) + return; + assert(state == NORMAL); + switch (c) { + case PRE_LABEL_MARKER: + put_string(pre_label, fp); + state = NORMAL; + break; + case POST_LABEL_MARKER: + state = PENDING_POST; + break; + case LABEL_MARKER: + case LABEL_MARKER + 1: + count = 1; + state = PENDING_LABEL; + type = label_type(c - LABEL_MARKER); + break; + default: + state = NORMAL; + putc(c, fp); + break; + } +} + +extern "C" { + +int rcompare(const void *p1, const void *p2) +{ + return compare_reference(**(reference **)p1, **(reference **)p2); +} + +} + +void output_references() +{ + assert(accumulate); + if (!hash_table_size) { + if (have_bibliography) + error("nothing to reference (probably 'bibliography' before" + " 'sort')"); + accumulate = 0; + nreferences = 0; + return; + } + if (nreferences > 0) { + int j = 0; + int i; + for (i = 0; i < hash_table_size; i++) + if (reference_hash_table[i] != 0) + reference_hash_table[j++] = reference_hash_table[i]; + assert(j == nreferences); + for (; j < hash_table_size; j++) + reference_hash_table[j] = 0; + qsort(reference_hash_table, nreferences, sizeof(reference*), + rcompare); + for (i = 0; i < nreferences; i++) + reference_hash_table[i]->set_number(i); + compute_labels(reference_hash_table, nreferences); + } + if (outfp != stdout) { + rewind(outfp); + { + label_processing_state state(citation, ncitations, stdout); + int c; + while ((c = getc(outfp)) != EOF) + state.process(c); + } + ncitations = 0; + fclose(outfp); + outfp = stdout; + } + if (nreferences > 0) { + fputs(".]<\n", outfp); + for (int i = 0; i < nreferences; i++) { + if (sort_fields.length() > 0) + reference_hash_table[i]->print_sort_key_comment(outfp); + if (label_in_reference) { + fputs(".ds [F ", outfp); + const string &label + = reference_hash_table[i]->get_label(NORMAL_LABEL); + if (label.length() > 0 + && (label[0] == ' ' || label[0] == '\\' || label[0] == '"')) + putc('"', outfp); + put_string(label, outfp); + putc('\n', outfp); + } + reference_hash_table[i]->output(outfp); + delete reference_hash_table[i]; + reference_hash_table[i] = 0; + } + fputs(".]>\n", outfp); + nreferences = 0; + } + clear_labels(); +} + +static reference *find_reference(const char *query, int query_len) +{ + // This is so that error messages look better. + while (query_len > 0 && csspace(query[query_len - 1])) + query_len--; + string str; + for (int i = 0; i < query_len; i++) + str += query[i] == '\n' ? ' ' : query[i]; + str += '\0'; + possibly_load_default_database(); + search_list_iterator iter(&database_list, str.contents()); + reference_id rid; + const char *start; + int len; + if (!iter.next(&start, &len, &rid)) { + error("no matches for '%1'", str.contents()); + return 0; + } + const char *end = start + len; + while (start < end) { + if (*start == '%') + break; + while (start < end && *start++ != '\n') + ; + } + if (start >= end) { + error("found a reference for '%1' but it didn't contain any fields", + str.contents()); + return 0; + } + reference *result = new reference(start, end - start, &rid); + if (iter.next(&start, &len, &rid)) + warning("multiple matches for '%1'", str.contents()); + return result; +} + +static reference *make_reference(const string &str, unsigned *flagsp) +{ + const char *start = str.contents(); + const char *end = start + str.length(); + const char *ptr = start; + while (ptr < end) { + if (*ptr == '%') + break; + while (ptr < end && *ptr++ != '\n') + ; + } + *flagsp = 0; + for (; start < ptr; start++) { + if (*start == '#') + *flagsp = (SHORT_LABEL | (*flagsp & (FORCE_RIGHT_BRACKET + | FORCE_LEFT_BRACKET))); + else if (*start == '[') + *flagsp |= FORCE_LEFT_BRACKET; + else if (*start == ']') + *flagsp |= FORCE_RIGHT_BRACKET; + else if (!csspace(*start)) + break; + } + if (start >= end) { + error("empty reference"); + return new reference; + } + reference *database_ref = 0; + if (start < ptr) + database_ref = find_reference(start, ptr - start); + reference *inline_ref = 0; + if (ptr < end) + inline_ref = new reference(ptr, end - ptr); + if (inline_ref) { + if (database_ref) { + database_ref->merge(*inline_ref); + delete inline_ref; + return database_ref; + } + else + return inline_ref; + } + else if (database_ref) + return database_ref; + else + return new reference; +} + +static void do_ref(const string &str) +{ + if (accumulate) + (void)store_reference(str); + else { + (void)immediately_handle_reference(str); + immediately_output_references(); + } +} + +static void trim_blanks(string &str) +{ + const char *start = str.contents(); + const char *end = start + str.length(); + while (end > start && end[-1] != '\n' && csspace(end[-1])) + --end; + str.set_length(end - start); +} + +void do_bib(const char *filename) +{ + FILE *fp; + if (strcmp(filename, "-") == 0) + fp = stdin; + else { + errno = 0; + fp = fopen(filename, "r"); + if (fp == 0) { + error("can't open '%1': %2", filename, strerror(errno)); + return; + } + current_filename = filename; + } + current_lineno = 1; + enum { + START, MIDDLE, BODY, BODY_START, BODY_BLANK, BODY_DOT + } state = START; + string body; + for (;;) { + int c = getc(fp); + if (EOF == c) + break; + if (is_invalid_input_char(c)) { + error("invalid input character code %1", c); + continue; + } + switch (state) { + case START: + if ('%' == c) { + body = c; + state = BODY; + } + else if (c != '\n') + state = MIDDLE; + break; + case MIDDLE: + if ('\n' == c) + state = START; + break; + case BODY: + body += c; + if ('\n' == c) + state = BODY_START; + break; + case BODY_START: + if ('\n' == c) { + do_ref(body); + state = START; + } + else if ('.' == c) + state = BODY_DOT; + else if (csspace(c)) { + state = BODY_BLANK; + body += c; + } + else { + body += c; + state = BODY; + } + break; + case BODY_BLANK: + if ('\n' == c) { + trim_blanks(body); + do_ref(body); + state = START; + } + else if (csspace(c)) + body += c; + else { + body += c; + state = BODY; + } + break; + case BODY_DOT: + if (']' == c) { + do_ref(body); + state = MIDDLE; + } + else { + body += '.'; + body += c; + state = ('\n' == c) ? BODY_START : BODY; + } + break; + default: + assert(0 == "unhandled case while parsing bibliography file"); + } + if ('\n' == c) + current_lineno++; + } + switch (state) { + case START: + case MIDDLE: + break; + case BODY: + body += '\n'; + do_ref(body); + break; + case BODY_DOT: + case BODY_START: + do_ref(body); + break; + case BODY_BLANK: + trim_blanks(body); + do_ref(body); + break; + } + fclose(fp); +} + +// from the Dragon Book + +unsigned hash_string(const char *s, int len) +{ + const char *end = s + len; + unsigned h = 0, g; + while (s < end) { + h <<= 4; + h += *s++; + if ((g = h & 0xf0000000) != 0) { + h ^= g >> 24; + h ^= g; + } + } + return h; +} + +int next_size(int n) +{ + static const int table_sizes[] = { + 101, 503, 1009, 2003, 3001, 4001, 5003, 10007, 20011, 40009, + 80021, 160001, 500009, 1000003, 2000003, 4000037, 8000009, + 16000057, 32000011, 64000031, 128000003, 0 + }; + + const int *p; + for (p = table_sizes; *p <= n && *p != 0; p++) + ; + assert(*p != 0); + return *p; +} + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: diff --git a/src/preproc/refer/refer.h b/src/preproc/refer/refer.h new file mode 100644 index 0000000..3ebff27 --- /dev/null +++ b/src/preproc/refer/refer.h @@ -0,0 +1,81 @@ +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +#include "lib.h" + +#include +#include + +#include "errarg.h" +#include "error.h" +#include "stringclass.h" +#include "cset.h" +#include "cmap.h" +#include "lf.h" + +#include "defs.h" + +unsigned hash_string(const char *, int); +int next_size(int); + +extern string capitalize_fields; +extern string reverse_fields; +extern string abbreviate_fields; +extern string period_before_last_name; +extern string period_before_initial; +extern string period_before_hyphen; +extern string period_before_other; +extern string sort_fields; +extern int annotation_field; +extern string annotation_macro; +extern string discard_fields; +extern string articles; +extern int abbreviate_label_ranges; +extern string label_range_indicator; +extern int date_as_label; +extern string join_authors_exactly_two; +extern string join_authors_last_two; +extern string join_authors_default; +extern string separate_label_second_parts; +extern string et_al; +extern int et_al_min_elide; +extern int et_al_min_total; + +extern int compatible_flag; + +extern int set_label_spec(const char *); +extern int set_date_label_spec(const char *); +extern int set_short_label_spec(const char *); + +extern int short_label_flag; + +void clear_labels(); +void command_error(const char *, + const errarg &arg1 = empty_errarg, + const errarg &arg2 = empty_errarg, + const errarg &arg3 = empty_errarg); + +class reference; + +void compute_labels(reference **, int); + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: diff --git a/src/preproc/refer/tests/artifacts/62124.bib b/src/preproc/refer/tests/artifacts/62124.bib new file mode 100644 index 0000000..1093837 --- /dev/null +++ b/src/preproc/refer/tests/artifacts/62124.bib @@ -0,0 +1,4 @@ +%A Irritablé, X. +%T ˆUniversit\*'e de Grenoble. ‰Cours donn\*'es aux Houches. +%Z ˆMon dieu, Consiel !‰ +%Z NOTE: This file is deliberately not valid UTF-8. Try Latin-1. diff --git a/src/preproc/refer/tests/report-correct-line-numbers.sh b/src/preproc/refer/tests/report-correct-line-numbers.sh new file mode 100755 index 0000000..19cae53 --- /dev/null +++ b/src/preproc/refer/tests/report-correct-line-numbers.sh @@ -0,0 +1,136 @@ +#!/bin/sh +# +# Copyright (C) 2022 Free Software Foundation, Inc. +# +# This file is part of groff. +# +# groff is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# groff is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +refer="${abs_top_builddir:-.}/refer" + +fail= + +wail () { + echo FAILED >&2 + fail=YES +} + +# Regression-test Savannah #62124. Ensure correct line numbers in +# diagnostics on bibliography files. + +# Locate directory containing our test artifacts. +artifact_dir= + +for buildroot in . .. ../.. +do + d=$buildroot/src/preproc/refer/tests/artifacts + if [ -d "$d" ] + then + artifact_dir=$d + break + fi +done + +# If we can't find it, we can't test. +test -z "$artifact_dir" && exit 77 # skip + +input=". +.R1 +bibliography $artifact_dir/62124.bib +cattywumpus +.R2 +. +.R1 +bibliography $artifact_dir/62124.bib +cattywumpus +.R2" + +# We want standard error _only_. +output=$(echo "$input" | "$refer" -e -p "$artifact_dir"/62124.bib \ + 2>&1 >/dev/null) + +# We should get every complaint about the bibliography twice because it +# is dumped twice; the line numbers should not change because they're +# problems with the bibliography file, not the input file. + +# We're pattern-matching diagnostic output here, which is a delicate +# thing to do. If a test failure occurs, ensure the diagnostic message +# text hasn't changed before assuming a deeper logic problem. + +echo "checking line number of invalid character on bibliography line 1" +count=$(echo "$output" | grep -c "refer:.*/62124.bib:1:.*code 129") +test $count -eq 2 || wail + +echo "checking line number of first invalid character on bibliography" \ + "line 2" +count=$(echo "$output" | grep -c "refer:.*/62124.bib:2:.*code 136") +test $count -eq 2 || wail + +echo "checking line number of second invalid character on" \ + "bibliography line 2" +count=$(echo "$output" | grep -c "refer:.*/62124.bib:2:.*code 137") +test $count -eq 2 || wail + +echo "checking line number of first invalid character on" \ + "bibliography line 3" +count=$(echo "$output" | grep -c "refer:.*/62124.bib:3:.*code 136") +test $count -eq 2 || wail + +echo "checking line number of second invalid character on" \ + "bibliography line 3" +count=$(echo "$output" | grep -c "refer:.*/62124.bib:3:.*code 137") +test $count -eq 2 || wail + +# Problems with the input file should also be accurately located. + +echo "checking line number of invalid refer(1) command on input line 4" +echo "$output" +echo "$output" | grep -q "refer:.*:4:.*unknown command" || wail + +echo "checking line number of invalid refer(1) command on input line 9" +echo "$output" +echo "$output" | grep -q "refer:.*:9:.*unknown command" || wail + +# Regression-test Savannah #62391. + +output=$(printf '\0201\n' | "$refer" 2>&1 >/dev/null) + +echo "checking line number of invalid input character on input line 1" +echo "$output" | grep -q "refer:.*:1:.*invalid input character" \ + || wail + +output=$(printf '.R1\nbogus \0200\n.R2\n' | "$refer" 2>&1 >/dev/null) + +echo "checking line number of invalid input character after refer(1)" \ + "command on input line 2" +echo "$output" | grep -q "refer:.*:2:.*invalid input character" \ + || wail + +output=$(printf '.R1\ndatabase nonexistent.bib\n.R2\n' | "$refer" 2>&1 \ + >/dev/null) + +echo "checking line number of attempt to load nonexistent database" +echo "$output" | grep -q "refer:.*:2:.*can't open 'nonexistent\.bib':" \ + || wail + +output=$(printf '.R1\ninclude nonexistent.bib\n.R2\n' | "$refer" 2>&1 \ + >/dev/null) + +echo "checking line number of attempt to load nonexistent inclusion" +echo "$output" | grep -q "refer:.*:2:.*can't open 'nonexistent\.bib':" \ + || wail +test -z "$fail" || exit 1 + +# vim:set ai et sw=4 ts=4 tw=72: diff --git a/src/preproc/refer/token.cpp b/src/preproc/refer/token.cpp new file mode 100644 index 0000000..e643cbd --- /dev/null +++ b/src/preproc/refer/token.cpp @@ -0,0 +1,377 @@ +// -*- C++ -*- +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +#include "refer.h" +#include "token.h" + +#define TOKEN_TABLE_SIZE 1009 +// I believe in Icelandic thorn sorts after z. +#define THORN_SORT_KEY "{" + +struct token_table_entry { + const char *tok; + token_info ti; + token_table_entry(); +}; + +token_table_entry token_table[TOKEN_TABLE_SIZE]; +int ntokens = 0; + +static void skip_name(const char **ptr, const char *end) +{ + if (*ptr < end) { + switch (*(*ptr)++) { + case '(': + if (*ptr < end) { + *ptr += 1; + if (*ptr < end) + *ptr += 1; + } + break; + case '[': + while (*ptr < end) + if (*(*ptr)++ == ']') + break; + break; + } + } +} + +int get_token(const char **ptr, const char *end) +{ + if (*ptr >= end) + return 0; + char c = *(*ptr)++; + if (c == '\\' && *ptr < end) { + switch (**ptr) { + default: + *ptr += 1; + break; + case '(': + case '[': + skip_name(ptr, end); + break; + case '*': + case 'f': + *ptr += 1; + skip_name(ptr, end); + break; + } + } + return 1; +} + +token_info::token_info() +: type(TOKEN_OTHER), sort_key(0), other_case(0) +{ +} + +void token_info::set(token_type t, const char *sk, const char *oc) +{ + assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER); + type = t; + sort_key = sk; + other_case = oc; +} + +void token_info::sortify(const char *start, const char *end, string &result) + const +{ + if (sort_key) + result += sort_key; + else if (type == TOKEN_UPPER || type == TOKEN_LOWER) { + for (; start < end; start++) + if (csalpha(*start)) + result += cmlower(*start); + } +} + +int token_info::sortify_non_empty(const char *start, const char *end) const +{ + if (sort_key) + return *sort_key != '\0'; + if (type != TOKEN_UPPER && type != TOKEN_LOWER) + return 0; + for (; start < end; start++) + if (csalpha(*start)) + return 1; + return 0; +} + + +void token_info::lower_case(const char *start, const char *end, + string &result) const +{ + if (type != TOKEN_UPPER) { + while (start < end) + result += *start++; + } + else if (other_case) + result += other_case; + else { + while (start < end) + result += cmlower(*start++); + } +} + +void token_info::upper_case(const char *start, const char *end, + string &result) const +{ + if (type != TOKEN_LOWER) { + while (start < end) + result += *start++; + } + else if (other_case) + result += other_case; + else { + while (start < end) + result += cmupper(*start++); + } +} + +token_table_entry::token_table_entry() +: tok(0) +{ +} + +static void store_token(const char *tok, token_type typ, + const char *sk = 0, const char *oc = 0) +{ + unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE; + for (;;) { + if (token_table[n].tok == 0) { + if (++ntokens == TOKEN_TABLE_SIZE) + assert(0); + token_table[n].tok = tok; + break; + } + if (strcmp(tok, token_table[n].tok) == 0) + break; + if (n == 0) + n = TOKEN_TABLE_SIZE - 1; + else + --n; + } + token_table[n].ti.set(typ, sk, oc); +} + + +token_info default_token_info; + +const token_info *lookup_token(const char *start, const char *end) +{ + unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE; + for (;;) { + if (token_table[n].tok == 0) + break; + if (strlen(token_table[n].tok) == size_t(end - start) + && memcmp(token_table[n].tok, start, end - start) == 0) + return &(token_table[n].ti); + if (n == 0) + n = TOKEN_TABLE_SIZE - 1; + else + --n; + } + return &default_token_info; +} + +static void init_ascii() +{ + const char *p; + for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) { + char buf[2]; + buf[0] = *p; + buf[1] = '\0'; + store_token(strsave(buf), TOKEN_LOWER); + buf[0] = cmupper(buf[0]); + store_token(strsave(buf), TOKEN_UPPER); + } + for (p = "0123456789"; *p; p++) { + char buf[2]; + buf[0] = *p; + buf[1] = '\0'; + const char *s = strsave(buf); + store_token(s, TOKEN_OTHER, s); + } + for (p = ".,:;?!"; *p; p++) { + char buf[2]; + buf[0] = *p; + buf[1] = '\0'; + store_token(strsave(buf), TOKEN_PUNCT); + } + store_token("-", TOKEN_HYPHEN); +} + +static void store_letter(const char *lower, const char *upper, + const char *sort_key = 0) +{ + store_token(lower, TOKEN_LOWER, sort_key, upper); + store_token(upper, TOKEN_UPPER, sort_key, lower); +} + +static void init_letter(unsigned char uc_code, unsigned char lc_code, + const char *sort_key) +{ + char lbuf[2]; + lbuf[0] = lc_code; + lbuf[1] = 0; + char ubuf[2]; + ubuf[0] = uc_code; + ubuf[1] = 0; + store_letter(strsave(lbuf), strsave(ubuf), sort_key); +} + +static void init_latin1() +{ + init_letter(0xc0, 0xe0, "a"); + init_letter(0xc1, 0xe1, "a"); + init_letter(0xc2, 0xe2, "a"); + init_letter(0xc3, 0xe3, "a"); + init_letter(0xc4, 0xe4, "a"); + init_letter(0xc5, 0xe5, "a"); + init_letter(0xc6, 0xe6, "ae"); + init_letter(0xc7, 0xe7, "c"); + init_letter(0xc8, 0xe8, "e"); + init_letter(0xc9, 0xe9, "e"); + init_letter(0xca, 0xea, "e"); + init_letter(0xcb, 0xeb, "e"); + init_letter(0xcc, 0xec, "i"); + init_letter(0xcd, 0xed, "i"); + init_letter(0xce, 0xee, "i"); + init_letter(0xcf, 0xef, "i"); + + init_letter(0xd0, 0xf0, "d"); + init_letter(0xd1, 0xf1, "n"); + init_letter(0xd2, 0xf2, "o"); + init_letter(0xd3, 0xf3, "o"); + init_letter(0xd4, 0xf4, "o"); + init_letter(0xd5, 0xf5, "o"); + init_letter(0xd6, 0xf6, "o"); + init_letter(0xd8, 0xf8, "o"); + init_letter(0xd9, 0xf9, "u"); + init_letter(0xda, 0xfa, "u"); + init_letter(0xdb, 0xfb, "u"); + init_letter(0xdc, 0xfc, "u"); + init_letter(0xdd, 0xfd, "y"); + init_letter(0xde, 0xfe, THORN_SORT_KEY); + + store_token("\337", TOKEN_LOWER, "ss", "SS"); + store_token("\377", TOKEN_LOWER, "y", "Y"); +} + +static void init_two_char_letter(char l1, char l2, char u1, char u2, + const char *sk = 0) +{ + char buf[6]; + buf[0] = '\\'; + buf[1] = '('; + buf[2] = l1; + buf[3] = l2; + buf[4] = '\0'; + const char *p = strsave(buf); + buf[2] = u1; + buf[3] = u2; + store_letter(p, strsave(buf), sk); + buf[1] = '['; + buf[4] = ']'; + buf[5] = '\0'; + p = strsave(buf); + buf[2] = l1; + buf[3] = l2; + store_letter(strsave(buf), p, sk); + +} + +static void init_special_chars() +{ + const char *p; + for (p = "':^`~"; *p; p++) + for (const char *q = "aeiouy"; *q; q++) { + // Use a variable to work around bug in gcc 2.0 + char c = cmupper(*q); + init_two_char_letter(*p, *q, *p, c); + } + for (p = "/l/o~n,coeaeij"; *p; p += 2) { + // Use variables to work around bug in gcc 2.0 + char c0 = cmupper(p[0]); + char c1 = cmupper(p[1]); + init_two_char_letter(p[0], p[1], c0, c1); + } + init_two_char_letter('v', 's', 'v', 'S', "s"); + init_two_char_letter('v', 'z', 'v', 'Z', "z"); + init_two_char_letter('o', 'a', 'o', 'A', "a"); + init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY); + init_two_char_letter('-', 'd', '-', 'D'); + + store_token("\\(ss", TOKEN_LOWER, 0, "SS"); + store_token("\\[ss]", TOKEN_LOWER, 0, "SS"); + + store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D"); + store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]"); + store_token("\\(hy", TOKEN_HYPHEN); + store_token("\\[hy]", TOKEN_HYPHEN); + store_token("\\(en", TOKEN_RANGE_SEP); + store_token("\\[en]", TOKEN_RANGE_SEP); +} + +static void init_strings() +{ + char buf[6]; + buf[0] = '\\'; + buf[1] = '*'; + for (const char *p = "'`^^,:~v_o./;"; *p; p++) { + buf[2] = *p; + buf[3] = '\0'; + store_token(strsave(buf), TOKEN_ACCENT); + buf[2] = '['; + buf[3] = *p; + buf[4] = ']'; + buf[5] = '\0'; + store_token(strsave(buf), TOKEN_ACCENT); + } + + // -ms special letters + store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY); + store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY); + store_letter("\\*(d-", "\\*(D-"); + store_letter("\\*[d-]", "\\*[D-]"); + store_letter("\\*(ae", "\\*(Ae", "ae"); + store_letter("\\*[ae]", "\\*[Ae]", "ae"); + store_letter("\\*(oe", "\\*(Oe", "oe"); + store_letter("\\*[oe]", "\\*[Oe]", "oe"); + + store_token("\\*3", TOKEN_LOWER, "y", "Y"); + store_token("\\*8", TOKEN_LOWER, "ss", "SS"); + store_token("\\*q", TOKEN_LOWER, "o", "O"); +} + +struct token_initer { + token_initer(); +}; + +static token_initer the_token_initer; + +token_initer::token_initer() +{ + init_ascii(); + init_latin1(); + init_special_chars(); + init_strings(); + default_token_info.set(TOKEN_OTHER); +} diff --git a/src/preproc/refer/token.h b/src/preproc/refer/token.h new file mode 100644 index 0000000..9cd688c --- /dev/null +++ b/src/preproc/refer/token.h @@ -0,0 +1,87 @@ +// -*- C++ -*- +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . */ + +enum token_type { + TOKEN_OTHER, + TOKEN_UPPER, + TOKEN_LOWER, + TOKEN_ACCENT, + TOKEN_PUNCT, + TOKEN_HYPHEN, + TOKEN_RANGE_SEP +}; + +class token_info { +private: + token_type type; + const char *sort_key; + const char *other_case; +public: + token_info(); + void set(token_type, const char *sk = 0, const char *oc = 0); + void lower_case(const char *start, const char *end, string &result) const; + void upper_case(const char *start, const char *end, string &result) const; + void sortify(const char *start, const char *end, string &result) const; + int sortify_non_empty(const char *start, const char *end) const; + int is_upper() const; + int is_lower() const; + int is_accent() const; + int is_other() const; + int is_punct() const; + int is_hyphen() const; + int is_range_sep() const; +}; + +inline int token_info::is_upper() const +{ + return type == TOKEN_UPPER; +} + +inline int token_info::is_lower() const +{ + return type == TOKEN_LOWER; +} + +inline int token_info::is_accent() const +{ + return type == TOKEN_ACCENT; +} + +inline int token_info::is_other() const +{ + return type == TOKEN_OTHER; +} + +inline int token_info::is_punct() const +{ + return type == TOKEN_PUNCT; +} + +inline int token_info::is_hyphen() const +{ + return type == TOKEN_HYPHEN; +} + +inline int token_info::is_range_sep() const +{ + return type == TOKEN_RANGE_SEP; +} + +int get_token(const char **ptr, const char *end); +const token_info *lookup_token(const char *start, const char *end); -- cgit v1.2.3