1 files changed, 377 insertions, 0 deletions
diff --git a/src/preproc/refer/token.cpp b/src/preproc/refer/token.cpp
new file mode 100644
index 0000000..e643cbd
--- /dev/null
+++ b/src/preproc/refer/token.cpp
@@ -0,0 +1,377 @@
+// -*- C++ -*-
+/* Copyright (C) 1989-2020 Free Software Foundation, Inc.
+     Written by James Clark (jjc@jclark.com)
+
+This file is part of groff.
+
+groff is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+groff is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include "refer.h"
+#include "token.h"
+
+#define TOKEN_TABLE_SIZE 1009
+// I believe that in Icelandic, thorn sorts after z; "{" follows "z" in ASCII.
+#define THORN_SORT_KEY "{"
+
+struct token_table_entry {	// one slot of the token hash table
+  const char *tok;		// token text; null while the slot is unused
+  token_info ti;		// the information stored for tok
+  token_table_entry();
+};
+
+token_table_entry token_table[TOKEN_TABLE_SIZE]; // hash table, linear probing
+int ntokens = 0;		// number of slots of token_table in use
+
+// Skip the name part of an escape sequence at *ptr without ever moving
+// *ptr past end.  One character is consumed if available.  When that
+// character is '(' up to two more are consumed; when it is '[' input
+// is consumed through the next ']' (or to end if there is none).  Any
+// other character ends the name at once.
+static void skip_name(const char **ptr, const char *end)
+{
+  if (*ptr >= end)
+    return;
+  const char opener = *(*ptr)++;
+  if (opener == '(') {
+    for (int taken = 0; taken < 2 && *ptr < end; taken++)
+      ++*ptr;
+  }
+  else if (opener == '[') {
+    while (*ptr < end && *(*ptr)++ != ']')
+      ;
+  }
+}
+
+// Advance *ptr past one token and return 1, or return 0 (leaving *ptr
+// alone) when *ptr is already at end.  A token is a single character,
+// except that a backslash also takes what follows it: a "(xx" or
+// "[...]" name directly, a name after '*' or 'f', and otherwise
+// exactly one character.  *ptr never moves past end.
+int get_token(const char **ptr, const char *end)
+{
+  if (*ptr >= end)
+    return 0;
+  const char first = *(*ptr)++;
+  if (first != '\\' || *ptr >= end)
+    return 1;
+  const char kind = **ptr;
+  if (kind == '(' || kind == '[')
+    skip_name(ptr, end);
+  else {
+    // Consume the escape character; '*' and 'f' are followed by a name.
+    ++*ptr;
+    if (kind == '*' || kind == 'f')
+      skip_name(ptr, end);
+  }
+  return 1;
+}
+
+// A fresh token_info is TOKEN_OTHER with no sort key and no other case.
+token_info::token_info() : type(TOKEN_OTHER), sort_key(0), other_case(0)
+{
+}
+
+// Replace every field.  oc may be non-null only for TOKEN_UPPER/TOKEN_LOWER.
+void token_info::set(token_type t, const char *sk, const char *oc)
+{
+  assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER);
+  sort_key = sk;  other_case = oc;  // both are kept by pointer
+  type = t;
+}
+
+// Append sort_key if set, else (letters only) cmlower() of each csalpha() char.
+void token_info::sortify(const char *start, const char *end,
+                         string &result) const
+{
+  if (sort_key)
+    result += sort_key;
+  else if (type == TOKEN_UPPER || type == TOKEN_LOWER)
+    for (const char *p = start; p < end; p++)
+      if (csalpha(*p))
+        result += cmlower(*p);
+}
+
+// Return 1 if sortify() would append something for [start, end), else 0.
+int token_info::sortify_non_empty(const char *start, const char *end) const
+{
+  if (sort_key)
+    return sort_key[0] != '\0';
+  const int is_letter = (type == TOKEN_UPPER || type == TOKEN_LOWER);
+  for (const char *p = start; is_letter && p < end; p++)
+    if (csalpha(*p))
+      return 1;
+  return 0;
+}
+
+
+// Append token [start, end) to result in lower case.  Only TOKEN_UPPER
+// tokens change: via other_case when set, else cmlower() per character.
+void token_info::lower_case(const char *start, const char *end,
+                            string &result) const
+{
+  if (type == TOKEN_UPPER && other_case)
+    result += other_case;
+  else if (type == TOKEN_UPPER)
+    for (const char *p = start; p < end; p++)
+      result += cmlower(*p);
+  else
+    for (const char *p = start; p < end; p++)
+      result += *p;
+}
+
+// Append token [start, end) to result in upper case.  Only TOKEN_LOWER
+// tokens change: via other_case when set, else cmupper() per character.
+void token_info::upper_case(const char *start, const char *end,
+                            string &result) const
+{
+  if (type == TOKEN_LOWER && other_case)
+    result += other_case;
+  else if (type == TOKEN_LOWER)
+    for (const char *p = start; p < end; p++)
+      result += cmupper(*p);
+  else
+    for (const char *p = start; p < end; p++)
+      result += *p;
+}
+
+// An entry starts out empty: a null tok marks an unused slot.
+token_table_entry::token_table_entry() : tok(0)
+{
+}
+
+// Find tok's slot in token_table, claiming a free one if tok is not
+// yet present, and (re)set that slot's info to (typ, sk, oc).  The
+// search starts at the hash slot and steps downwards, wrapping from
+// slot 0 to the last slot.  tok is kept by pointer, not copied, so the
+// caller must pass storage that stays valid.
+static void store_token(const char *tok, token_type typ,
+                        const char *sk = 0, const char *oc = 0)
+{
+  unsigned slot = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE;
+  while (token_table[slot].tok != 0
+         && strcmp(tok, token_table[slot].tok) != 0)
+    slot = (slot == 0 ? TOKEN_TABLE_SIZE - 1 : slot - 1);
+  if (token_table[slot].tok == 0) {
+    // A completely full table would make the probe loops endless.
+    if (++ntokens == TOKEN_TABLE_SIZE)
+      assert(0);
+    token_table[slot].tok = tok;
+  }
+  token_table[slot].ti.set(typ, sk, oc);
+}
+
+
+token_info default_token_info; // what lookup_token() returns for unknown tokens
+
+// Return the info registered for the token [start, end), or a pointer
+// to default_token_info when the token is not in token_table.  The
+// search starts at the hash slot and steps downwards (wrapping at 0)
+// until it reaches an unused slot.
+const token_info *lookup_token(const char *start, const char *end)
+{
+  const size_t len = size_t(end - start);
+  unsigned slot = hash_string(start, end - start) % TOKEN_TABLE_SIZE;
+  while (token_table[slot].tok != 0) {
+    const char *cand = token_table[slot].tok;
+    if (strlen(cand) == len && memcmp(cand, start, len) == 0)
+      return &token_table[slot].ti;
+    slot = (slot == 0 ? TOKEN_TABLE_SIZE - 1 : slot - 1);
+  }
+  return &default_token_info;
+}
+
+// Register the one-character ASCII tokens: a-z as TOKEN_LOWER and
+// their cmupper() images as TOKEN_UPPER, the digits as TOKEN_OTHER
+// (each digit is its own sort key), ".,:;?!" as TOKEN_PUNCT and "-" as
+// TOKEN_HYPHEN.  Strings built in buf go through strsave() first --
+// presumably a heap copy, needed because the table keeps the pointer.
+static void init_ascii()
+{
+  char buf[2] = { '\0', '\0' };
+  const char *p;
+  for (p = "abcdefghijklmnopqrstuvwxyz"; *p != '\0'; p++) {
+    buf[0] = *p;
+    store_token(strsave(buf), TOKEN_LOWER);
+    buf[0] = cmupper(*p);
+    store_token(strsave(buf), TOKEN_UPPER);
+  }
+  for (p = "0123456789"; *p != '\0'; p++) {
+    buf[0] = *p;
+    const char *digit = strsave(buf);
+    store_token(digit, TOKEN_OTHER, digit);
+  }
+  for (p = ".,:;?!"; *p != '\0'; p++) {
+    buf[0] = *p;
+    store_token(strsave(buf), TOKEN_PUNCT);
+  }
+  store_token("-", TOKEN_HYPHEN);
+}
+
+static void store_letter(const char *lower, const char *upper,
+		  const char *sort_key = 0)
+{ // Register a lower/upper pair; both tokens share sort_key.
+  store_token(lower, TOKEN_LOWER, sort_key, upper); // other case: upper
+  store_token(upper, TOKEN_UPPER, sort_key, lower); // other case: lower
+}
+
+// Register a pair of one-character letters: uc_code is the upper-case
+// character code, lc_code the lower-case one, and both sort as
+// sort_key.  Each code becomes a one-character string that is passed
+// through strsave() before being stored.
+static void init_letter(unsigned char uc_code, unsigned char lc_code,
+                        const char *sort_key)
+{
+  const char lower[2] = { char(lc_code), '\0' };
+  const char upper[2] = { char(uc_code), '\0' };
+  store_letter(strsave(lower), strsave(upper), sort_key);
+}
+
+// Register the Latin-1 letters above ASCII.  Every upper-case code in
+// 0xc0-0xde except 0xd7 is paired with the lower-case code 0x20 above
+// it, and both get the sort key of their run.  0xdf and 0xff are
+// lower-case letters registered without an upper-case token; their
+// other-case forms are the plain strings "SS" and "Y".
+static void init_latin1()
+{
+  // Consecutive upper-case codes that share one sort key.
+  static const struct {
+    unsigned char first;
+    unsigned char last;
+    const char *sort_key;
+  } runs[] = {
+    { 0xc0, 0xc5, "a" },
+    { 0xc6, 0xc6, "ae" },
+    { 0xc7, 0xc7, "c" },
+    { 0xc8, 0xcb, "e" },
+    { 0xcc, 0xcf, "i" },
+    { 0xd0, 0xd0, "d" },
+    { 0xd1, 0xd1, "n" },
+    { 0xd2, 0xd6, "o" },
+    // 0xd7 is not registered.
+    { 0xd8, 0xd8, "o" },
+    { 0xd9, 0xdc, "u" },
+    { 0xdd, 0xdd, "y" },
+    { 0xde, 0xde, THORN_SORT_KEY }
+  };
+  for (unsigned i = 0; i < sizeof runs / sizeof runs[0]; i++)
+    for (unsigned uc = runs[i].first; uc <= runs[i].last; uc++) {
+      const unsigned lc = uc + 0x20;
+      init_letter(static_cast<unsigned char>(uc),
+                  static_cast<unsigned char>(lc), runs[i].sort_key);
+    }
+
+  store_token("\337", TOKEN_LOWER, "ss", "SS");
+  store_token("\377", TOKEN_LOWER, "y", "Y");
+}
+
+// Register one special-character letter in both of its spellings.
+// The lower-case character is named l1 l2 and the upper-case one
+// u1 u2, so the tokens stored are
+//     \(l1l2  paired with  \(u1u2
+//     \[l1l2] paired with  \[u1u2]
+// and all four use sk as their sort key (0 means none is set here).
+// For example ('o', 'a', 'o', 'A', "a") registers \(oa with \(oA and
+// \[oa] with \[oA], each sorting as "a".
+// Every spelling is passed through strsave() before it is stored.
+static void init_two_char_letter(char l1, char l2, char u1, char u2,
+                                 const char *sk = 0)
+{
+  // Two-character form: \(xx
+  const char paren_lower[] = { '\\', '(', l1, l2, '\0' };
+  const char paren_upper[] = { '\\', '(', u1, u2, '\0' };
+  store_letter(strsave(paren_lower), strsave(paren_upper), sk);
+
+  // Bracketed form: \[xx]
+  const char bracket_lower[] = { '\\', '[', l1, l2, ']', '\0' };
+  const char bracket_upper[] = { '\\', '[', u1, u2, ']', '\0' };
+  store_letter(strsave(bracket_lower), strsave(bracket_upper), sk);
+}
+
+// Register the special characters that count as letters (both \(xx
+// and \[xx] spellings), then the hyphen and en-dash escapes.
+static void init_special_chars()
+{
+  for (const char *accent = "':^`~"; *accent != '\0'; accent++)
+    for (const char *vowel = "aeiouy"; *vowel != '\0'; vowel++) {
+      // Use a variable to work around bug in gcc 2.0
+      const char upper_vowel = cmupper(*vowel);
+      init_two_char_letter(*accent, *vowel, *accent, upper_vowel);
+    }
+  for (const char *name = "/l/o~n,coeaeij"; name[0] != '\0'; name += 2) {
+    // Use variables to work around bug in gcc 2.0
+    const char upper0 = cmupper(name[0]);
+    const char upper1 = cmupper(name[1]);
+    init_two_char_letter(name[0], name[1], upper0, upper1);
+  }
+  init_two_char_letter('v', 's', 'v', 'S', "s");
+  init_two_char_letter('v', 'z', 'v', 'Z', "z");
+  init_two_char_letter('o', 'a', 'o', 'A', "a");
+  init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY);
+  init_two_char_letter('-', 'd', '-', 'D');
+
+  store_token("\\(ss", TOKEN_LOWER, 0, "SS");
+  store_token("\\[ss]", TOKEN_LOWER, 0, "SS");
+  store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D");
+  store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]");
+  store_token("\\(hy", TOKEN_HYPHEN);
+  store_token("\\[hy]", TOKEN_HYPHEN);
+  store_token("\\(en", TOKEN_RANGE_SEP);
+  store_token("\\[en]", TOKEN_RANGE_SEP);
+}
+
+// Register string escapes.  Each character of the accent list is
+// stored as TOKEN_ACCENT in the forms \*x and \*[x]; the -ms strings
+// for special letters are stored as letter pairs, and three further
+// strings as lower-case letters only.
+static void init_strings()
+{
+  // NOTE(review): '^' occurs twice in the list; the second pass only
+  // re-registers tokens that are already present.
+  for (const char *accent = "'`^^,:~v_o./;"; *accent != '\0'; accent++) {
+    const char plain[] = { '\\', '*', *accent, '\0' };
+    store_token(strsave(plain), TOKEN_ACCENT);
+    const char bracketed[] = { '\\', '*', '[', *accent, ']', '\0' };
+    store_token(strsave(bracketed), TOKEN_ACCENT);
+  }
+
+  // -ms special letters
+  store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY);
+  store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY);
+  store_letter("\\*(d-", "\\*(D-");
+  store_letter("\\*[d-]", "\\*[D-]");
+  store_letter("\\*(ae", "\\*(Ae", "ae");
+  store_letter("\\*[ae]", "\\*[Ae]", "ae");
+  store_letter("\\*(oe", "\\*(Oe", "oe");
+  store_letter("\\*[oe]", "\\*[Oe]", "oe");
+
+  // No upper-case token is registered for these; other_case is text.
+  store_token("\\*3", TOKEN_LOWER, "y", "Y");
+  store_token("\\*8", TOKEN_LOWER, "ss", "SS");
+  store_token("\\*q", TOKEN_LOWER, "o", "O");
+}
+
+struct token_initer {   // helper whose constructor fills token_table
+  token_initer();
+};
+
+static token_initer the_token_initer; // constructing it runs the set-up below
+
+token_initer::token_initer()
+{
+  init_ascii();          // a-z, A-Z, 0-9, ".,:;?!" and "-"
+  init_latin1();         // one-character letters with codes 0xc0-0xff
+  init_special_chars();  // \(xx and \[xx] letters, hyphen, en dash
+  init_strings();        // \*x accent strings and -ms letter strings
+  default_token_info.set(TOKEN_OTHER); // info used for unknown tokens
+}