summary refs log tree commit diff stats
path: root/src/lexgrog.l
diff options
context:
space:
mode:
Diffstat (limited to 'src/lexgrog.l')
-rw-r--r-- src/lexgrog.l 981
1 files changed, 981 insertions, 0 deletions
diff --git a/src/lexgrog.l b/src/lexgrog.l
new file mode 100644
index 0000000..41527a4
--- /dev/null
+++ b/src/lexgrog.l
@@ -0,0 +1,981 @@
+%top{
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif /* HAVE_CONFIG_H */
+
+#include "manconfig.h"
+
+/* Flex emits several functions which might reasonably have various
+ * attributes applied and many unused macros; none of these are our problem.
+ */
+#if GNUC_PREREQ(8,0)
+# pragma GCC diagnostic ignored "-Wsuggest-attribute=malloc"
+#endif
+#pragma GCC diagnostic ignored "-Wsuggest-attribute=pure"
+#pragma GCC diagnostic ignored "-Wunused-macros"
+}
+
+%{
+
+/*
+ * lexgrog.l: extract 'whatis' info from nroff man / formatted cat pages.
+ *
+ * Copyright (C) 1994, 1995 Graeme W. Wilford. (Wilf.)
+ * Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+ * 2011, 2012 Colin Watson.
+ *
+ * This file is part of man-db.
+ *
+ * man-db is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * man-db is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with man-db; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Wed Oct 12 18:46:11 BST 1994 Wilf. (G.Wilford@ee.surrey.ac.uk)
+ *
+ * CJW: Detect grap and vgrind. Understand fill requests. Other improvements
+ * in the syntax accepted.
+ */
+
+#include <sys/stat.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "error.h"
+#include "xalloc.h"
+
+#include "gettext.h"
+#define _(String) gettext (String)
+
+#include "encodings.h"
+#include "pipeline.h"
+#include "sandbox.h"
+#include "security.h"
+#include "util.h"
+
+#include "decompress.h"
+#include "lexgrog.h"
+#include "manconv.h"
+#include "manconv_client.h"
+
+#define YY_READ_BUF_SIZE 1024
+#define MAX_NAME 8192
+
+/* defines the ordered list of filters detected by lexgrog */
+/* Each constant is an index into the file-scope filters[] array; the
+ * MAN_REST rules store the filter's letter at that index.  MAX_FILTERS
+ * is not a filter itself: it counts the entries and sizes the array.
+ */
+enum {
+ TBL_FILTER = 0, /* tbl */
+ EQN_FILTER, /* eqn */
+ PIC_FILTER, /* pic */
+ GRAP_FILTER, /* grap */
+ REF_FILTER, /* refer */
+ VGRIND_FILTER, /* vgrind */
+ MAX_FILTERS /* delimiter */
+};
+
+#define ARRAY_SIZE(array) (sizeof (array) / sizeof ((array)[0]))
+
+extern man_sandbox *sandbox;
+
+/* One entry of a lookup table mapping an escape name to replacement text. */
+struct macro {
+ const char *name; /* key compared by compare_macro; tables are sorted on it */
+ const char *value; /* text appended to the whatis string on a match */
+};
+
+/* Replacement text for two-character roff glyph names, looked up by
+ * add_glyph_to_whatis for the \(xx and \[xx] escapes.
+ */
+static const struct macro glyphs[] = {
+ /* It is vital to keep these in strcmp order (sort -t\" -k2)! They
+ * will be searched using bsearch.
+ * Data from groff_char(7), although I have omitted some that are
+ * particularly unlikely to be used in NAME sections.
+ */
+ { "'A", "Á" },
+ { "'C", "Ć" },
+ { "'E", "É" },
+ { "'I", "Í" },
+ { "'O", "Ó" },
+ { "'U", "Ú" },
+ { "'Y", "Ý" },
+ { "'a", "á" },
+ { "'c", "ć" },
+ { "'e", "é" },
+ { "'i", "í" },
+ { "'o", "ó" },
+ { "'u", "ú" },
+ { "'y", "ý" },
+ { ",C", "Ç" },
+ { ",c", "ç" },
+ { "-D", "Ð" },
+ { ".i", "ı" },
+ { "/L", "Ł" },
+ { "/O", "Ø" },
+ { "/l", "ł" },
+ { "/o", "ø" },
+ { ":A", "Ä" },
+ { ":E", "Ë" },
+ { ":I", "Ï" },
+ { ":O", "Ö" },
+ { ":U", "Ü" },
+ { ":Y", "Ÿ" },
+ { ":a", "ä" },
+ { ":e", "ë" },
+ { ":i", "ï" },
+ { ":o", "ö" },
+ { ":u", "ü" },
+ { ":y", "ÿ" },
+ { "AE", "Æ" },
+ { "Bq", "„" },
+ { "Fc", "»" },
+ { "Fi", "ffi" },
+ { "Fl", "ffl" },
+ { "Fo", "«" },
+ { "IJ", "IJ" },
+ { "OE", "Œ" },
+ { "Sd", "ð" },
+ { "TP", "Þ" },
+ { "Tp", "þ" },
+ { "^A", "Â" },
+ { "^E", "Ê" },
+ { "^I", "Î" },
+ { "^O", "Ô" },
+ { "^U", "Û" },
+ { "^a", "â" },
+ { "^e", "ê" },
+ { "^i", "î" },
+ { "^o", "ô" },
+ { "^u", "û" },
+ { "`A", "À" },
+ { "`E", "È" },
+ { "`I", "Ì" },
+ { "`O", "Ò" },
+ { "`U", "Ù" },
+ { "`a", "à" },
+ { "`e", "è" },
+ { "`i", "ì" },
+ { "`o", "ò" },
+ { "`u", "ù" },
+ { "a\"", "˝" },
+ { "a-", "¯" },
+ { "a.", "˙" },
+ { "a^", "^" },
+ { "aa", "´" },
+ { "ab", "˘" },
+ { "ac", "¸" },
+ { "ad", "¨" },
+ { "ae", "æ" },
+ { "ah", "ˇ" },
+ { "ao", "˚" },
+ { "aq", "'" },
+ { "a~", "~" },
+ { "bq", "‚" },
+ { "cq", "’" },
+ { "dq", "\"" },
+ { "em", "—" },
+ { "en", "–" },
+ { "fc", "›" },
+ { "ff", "ff" },
+ { "fi", "fi" },
+ { "fl", "fl" },
+ { "fo", "‹" },
+ { "ga", "`" },
+ { "ha", "^" },
+ { "ho", "˛" },
+ { "hy", "‐" },
+ { "ij", "ij" },
+ { "lq", "“" },
+ { "oA", "Å" },
+ { "oa", "å" },
+ { "oe", "œ" },
+ { "oq", "‘" },
+ { "r!", "¡" },
+ { "r?", "¿" },
+ { "rq", "”" },
+ { "ss", "ß" },
+ { "ti", "~" },
+ { "vS", "Š" },
+ { "vZ", "Ž" },
+ { "vs", "š" },
+ { "vz", "ž" },
+ { "~A", "Ã" },
+ { "~N", "Ñ" },
+ { "~O", "Õ" },
+ { "~a", "ã" },
+ { "~n", "ñ" },
+ { "~o", "õ" }
+};
+
+/* Replacement text for roff string names, looked up by
+ * add_perldoc_to_whatis for the \*x, \*(xx and \*[xx] escapes.
+ */
+static const struct macro perldocs[] = {
+ /* It is vital to keep these in strcmp order (sort -t\" -k2)! They
+ * will be searched using bsearch.
+ * Data from Pod/Man.pm.
+ */
+ { "--", "-" },
+ { "Aq", "'" },
+ { "C'", "'" },
+ { "C+", "C++" },
+ { "C`", "`" },
+ { "L\"", "\"" },
+ { "PI", "π" },
+ { "R\"", "\"" }
+};
+
+static void add_str_to_whatis (const char *string, size_t length);
+static void add_char_to_whatis (unsigned char c);
+static void add_separator_to_whatis (void);
+static void add_wordn_to_whatis (const char *string, size_t length);
+static void add_word_to_whatis (const char *string);
+static void add_glyph_to_whatis (const char *string, size_t length);
+static void add_perldoc_to_whatis (const char *string, size_t length);
+static void mdoc_text (const char *string);
+static void newline_found (void);
+
+/* Scanner state shared between the rule actions and the helpers below.
+ * find_name_decompressed resets all of it before each scan.
+ */
+static char newname[MAX_NAME]; /* whatis text accumulated so far */
+static char *p_name; /* next free byte in newname */
+static const char *fname; /* page name quoted in the too_big warning */
+static char filters[MAX_FILTERS]; /* filter letter per slot, '_' if not seen */
+
+static bool fill_mode; /* cleared by .nf, set by .fi; read by newline_found */
+static bool waiting_for_quote; /* when set, add_char_to_whatis drops the next '"' */
+
+static decompress *decomp; /* input source read by YY_INPUT */
+
+/* Scanner input hook: read from the current decompressor (decomp).
+ * Asks decompress_read for up to max_size bytes; the size it reports back
+ * is the number of bytes copied into buf and stored in result (presumably
+ * decompress_read updates size to the amount actually returned -- confirm
+ * against decompress.h).  A NUL is written just past the copied bytes.
+ * A NULL block or a zero size is reported to the scanner as YY_NULL.
+ */
+#define YY_INPUT(buf,result,max_size) { \
+ size_t size = max_size; \
+ const char *block = decompress_read (decomp, &size); \
+ if (block && size != 0) { \
+ memcpy (buf, block, size); \
+ buf[size] = '\0'; \
+ result = size; \
+ } else \
+ result = YY_NULL; \
+}
+#define YY_NO_INPUT
+%}
+
+%option ecs meta-ecs
+%option 8bit batch caseful never-interactive
+%option nostdinit
+%option warn
+%option noyywrap nounput
+
+%x MAN_PRENAME
+%x MAN_NAME
+%x MAN_DESC
+%x MAN_DESC_AT
+%x MAN_DESC_BSX
+%x MAN_DESC_BX
+%x MAN_DESC_BX_RELEASE
+%x MAN_DESC_DQ
+%x MAN_DESC_FX
+%x MAN_DESC_NX
+%x MAN_DESC_OX
+%x CAT_NAME
+%x CAT_FILE
+%x MAN_FILE
+%x CAT_REST
+%x MAN_REST
+%x FORCE_EXIT
+
+digit [[:digit:]]
+upper [[:upper:]]
+alpha [[:alpha:]]
+blank [[:blank:]]
+blank_eol [[:blank:]\r\n]
+word [[:alnum:]][^[:blank:]\r\n]*
+eol \r?\n
+bol {eol}+
+next {eol}*
+empty {eol}{blank}*
+indent {eol}{blank}+
+dbl_quote \"
+font_change \\f([[:upper:]1-4]|\({upper}{2})
+size_change \\s[+-]?{digit}
+style_change ({font_change}{size_change}?|{size_change}{font_change}?)
+typeface \.(B[IR]?|I[BR]?|R[BI]|S[BM])
+sec_request \.[Ss][HhYySs]
+comment ['.]\\{dbl_quote}
+
+ /* Please add to this list if you know how. */
+ /* Note that, since flex only supports UTF-8 by accident, character classes
+ * including non-ASCII characters must be written out as (a|b|c|d) rather
+ * than [abcd].
+ */
+ar_name (اﻹسم|الإسم)
+ /* ИМЕ also works for mk */
+bg_name И(М|м)(Е|е)
+cs_name (J[Mm](É|é|\\\('[Ee]|E|e)[Nn][Oo]|N(Á|á)[Zz][Ee][Vv])
+da_name N[Aa][Vv][Nn]
+de_name B[Ee][Zz][Ee][Ii][Cc][Hh][Nn][Uu][Nn][Gg]
+en_name N[Aa][Mm][Ee]
+eo_name N[Oo][Mm][Oo]
+es_name N[Oo][Mm][Bb][Rr][Ee]
+fa_name نام
+fi_name N[Ii][Mm][Ii]
+fr_name N[Oo][Mm]
+hu_name N(É|é|\\\('[Ee]|E|e)[Vv]
+id_name N[Aa][Mm][Aa]
+ /* NOME also works for gl, pt */
+it_name N[Oo][Mm][Ee]
+ja_name (名|̾)(前|称)
+ko_name (이름|명칭)
+latin_name N[Oo][Mm][Ee][Nn]
+lt_name P[Aa][Vv][Aa][Dd][Ii][Nn][Ii][Mm][Aa][Ss]
+nl_name N[Aa][Aa][Mm]
+pl_name N[Aa][Zz][Ww][Aa]
+ro_name N[Uu][Mm][Ee]
+ru_name (И(М|м)(Я|я)|Н(А|а)(З|з)(В|в)(А|а)(Н|н)(И|и)(Е|е)|Н(А|а)(И|и)(М|м)(Е|е)(Н|н)(О|о)(В|в)(А|а)(Н|н)(И|и)(Е|е))
+sk_name M[Ee][Nn][Oo]
+sr_name (И(М|м)(Е|е)|Н(А|а)(З|з)(И|и)(В|в))
+srlatin_name (I[Mm][Ee]|N[Aa][Zz][Ii][Vv])
+sv_name N[Aa][Mm][Nn]
+ta_name பெய
+tr_name (A[Dd]|(İ|i)S(İ|i)M)
+uk_name Н(А|а)(З|з)(В|в)(А|а)
+vi_name T(Ê|ê)[Nn]
+zh_CN_name 名{blank}?(称|字){blank}?.*
+zh_TW_name (名{blank}?(稱|字)|命令名){blank}?.*
+name ({ar_name}|{bg_name}|{cs_name}|{da_name}|{de_name}|{en_name}|{eo_name}|{es_name}|{fa_name}|{fi_name}|{fr_name}|{hu_name}|{id_name}|{it_name}|{ja_name}|{ko_name}|{latin_name}|{lt_name}|{nl_name}|{pl_name}|{ro_name}|{ru_name}|{sk_name}|{sr_name}|{srlatin_name}|{sv_name}|{ta_name}|{tr_name}|{uk_name}|{vi_name}|{zh_CN_name}|{zh_TW_name})
+name_sec {dbl_quote}?{style_change}?{name}{style_change}?({blank}*{dbl_quote})?
+
+ /* eptgrv : eqn, pic, tbl, grap, refer, vgrind */
+tbl_request \.TS
+eqn_request \.EQ
+pic_request \.PS
+grap_request \.G1
+ref1_request \.R1
+ref2_request \.\[
+vgrind_request \.vS
+
+%%
+
+ /* begin NAME section processing */
+<MAN_FILE>{sec_request}{blank_eol}+{name_sec}{blank}* BEGIN (MAN_PRENAME);
+<CAT_FILE>{empty}{2,}{name}{blank}*{indent} BEGIN (CAT_NAME);
+
+ /* general text matching */
+<MAN_FILE>{
+ \.[^Ss\r\n].* |
+ \..{0,3}{dbl_quote}?.{0,4}{dbl_quote}? |
+ {comment}.* |
+ .|{eol}
+}
+
+<CAT_FILE>{
+ .{1,9} |
+ [ ]* |
+ {eol}{2,} |
+ .|{eol}
+}
+
+<MAN_REST>{
+ {bol}{tbl_request} filters[TBL_FILTER] = 't';
+ {bol}{eqn_request} filters[EQN_FILTER] = 'e';
+ {bol}{pic_request} filters[PIC_FILTER] = 'p';
+ {bol}{grap_request} filters[GRAP_FILTER] = 'g';
+ {bol}{ref1_request} |
+ {bol}{ref2_request} filters[REF_FILTER] = 'r';
+ {bol}{vgrind_request} filters[VGRIND_FILTER] = 'v';
+}
+<MAN_REST><<EOF>> { /* exit */
+ *p_name = '\0'; /* terminate the string */
+ yyterminate ();
+}
+<MAN_REST>.+|{eol}
+
+ /* rules to end NAME section processing */
+<FORCE_EXIT>.|{eol} { /* forced exit */
+ *p_name = '\0'; /* terminate the string */
+ yyterminate ();
+}
+
+<MAN_PRENAME>{bol}{sec_request}{blank}* |
+<MAN_PRENAME><<EOF>> { /* no NAME at all */
+ *p_name = '\0';
+ BEGIN (MAN_REST);
+}
+
+ /* need to match whole string so that we beat the following roff catch-all,
+ so use yyless to push back the name */
+<MAN_PRENAME>{
+ {bol}{typeface}{blank}.* |
+ {bol}\.Tn{blank}.* |
+ {bol}\.ft{blank}.* |
+ {bol}\.V[be]{blank}.* |
+ {bol}\.IX{blank}.* |
+ {bol}\.Nm{blank}.* {
+ yyless (0);
+ BEGIN (MAN_NAME);
+ }
+}
+
+ /* Skip over initial roff requests in NAME section. The use of yyless here
+ is evil. */
+<MAN_PRENAME>{bol}['.].*
+
+<MAN_PRENAME>{empty}{eol} yyless (1);
+
+<MAN_PRENAME>.|{eol} {
+ yyless (0);
+ BEGIN (MAN_NAME);
+}
+
+<MAN_NAME,MAN_DESC>{
+ {bol}{sec_request}{blank}* | /* Another section */
+ {bol}\.X{upper}{blank}+ | /* special - hpux */
+ {bol}\.sp{blank}* | /* vertical spacing */
+ {bol}\.ig{blank}* | /* block comment */
+ {bol}\.de[1i]?{blank}* | /* macro definition */
+ {bol}\.i[ef]{blank}* | /* conditional */
+ {empty}{bol}.+ |
+ <<EOF>> { /* terminate the string */
+ *p_name = '\0';
+ BEGIN (MAN_REST);
+ }
+}
+
+<CAT_NAME>{
+ {bol}S[yYeE] |
+ {eol}{2,}.+ |
+ {next}__ { /* terminate the string */
+ *p_name = '\0';
+ BEGIN (CAT_REST);
+ yyterminate ();
+ }
+}
+
+ /* ROFF request removal */
+ /* Each of these requests is dropped from the output; only the line
+ break it sits on is recorded, via newline_found (). */
+<MAN_NAME,MAN_DESC>{
+ /* some include quoting; dealing with this is unpleasant */
+ /* The opening quote is consumed by the pattern; setting
+ waiting_for_quote makes add_char_to_whatis drop the next '"'. */
+ {bol}{typeface}{blank}+\" {
+ newline_found ();
+ waiting_for_quote = true;
+ }
+
+ {bol}{typeface}{blank}+ | /* type face commands */
+ {bol}\.Tn{blank}+ | /* mdoc trade name */
+ {bol}\.ft{blank}.* | /* font change */
+ {bol}\.V[be]{blank}.* | /* pod2man, verbatim mode */
+ {bol}\.IX{blank}.* | /* .IX line */
+ {bol}\.Nm{blank}+ | /* mdoc name */
+ {bol}\.PD{blank}* | /* paragraph spacing */
+ {bol}\\& | /* \& at line start: zero-width non-printing character */
+ {next}{comment}.* { /* per line comments */
+ newline_found ();
+ }
+}
+
+ /* No-op requests */
+<MAN_NAME,MAN_DESC>{
+ {bol}\.{blank}*$ newline_found ();
+ {bol}\.\.$ newline_found ();
+}
+
+ /* Toggle fill mode */
+<MAN_NAME,MAN_DESC>{
+ {bol}\.nf.* fill_mode = false;
+ {bol}\.fi.* fill_mode = true;
+}
+
+<CAT_NAME>-{eol}{blank_eol}* /* strip continuations */
+
+ /* convert to DASH */
+<MAN_NAME>{
+ {next}{blank}*\\\((mi|hy|em|en){blank}* |
+ {next}{blank}*\\\[(mi|hy|em|en)\]{blank}* |
+ {next}{blank_eol}+[-\\]-{blank}* |
+ {next}{blank_eol}*[-\\]-{blank}+ |
+ {bol}\.Nd{blank}* {
+ add_separator_to_whatis ();
+ BEGIN (MAN_DESC);
+ }
+}
+<CAT_NAME>{next}{blank}+-{1,2}{blank_eol}+ add_separator_to_whatis ();
+
+	/* escape sequences and special characters */
+	/* Most patterns start with {next}, so any newlines directly before
+	   the escape are swallowed into yytext without adding a space.
+	   Actions that read the escape's name must therefore index from the
+	   END of the match, never from a fixed offset at its start. */
+<MAN_NAME,MAN_DESC>{
+	{next}\\[\\e]			add_char_to_whatis ('\\');
+	{next}\\('|\(aa)		add_char_to_whatis ('\'');
+	{next}\\(`|\(ga)		add_char_to_whatis ('`');
+	{next}\\(-|\((mi|hy|em|en))	add_char_to_whatis ('-');
+	{next}\\\[(mi|hy|em|en)\]	add_char_to_whatis ('-');
+	{next}\\\.			add_char_to_whatis ('.');
+	{next}((\\[ 0t~])|[ ]|\t)*	add_char_to_whatis (' ');
+	{next}\\\((ru|ul)		add_char_to_whatis ('_');
+	{next}\\\\t			add_char_to_whatis ('\t');
+
+	{next}\\[|^&!%acdpruz{}\r\n]	/* various useless control chars */
+	{next}\\[bhlLvx]{blank}*'[^']+'	/* various inline functions */
+
+	{next}\\\$[1-9]			/* interpolate arg */
+
+	/* roff named glyphs */
+	/* \(xx may follow newlines matched by {next}: the name is always
+	   the last two characters of the match. */
+	{next}\\\(..			add_glyph_to_whatis (yytext + yyleng - 2, 2);
+	/* \[xx] has no {next} prefix, so the name sits at a fixed offset. */
+	\\\[..\]			add_glyph_to_whatis (yytext + 2, 2);
+	/* perldoc strings */
+	/* \*(xx and \*x may follow newlines as well; \*[xx] may not. */
+	{next}\\\*\(..			add_perldoc_to_whatis (yytext + yyleng - 2, 2);
+	\\\*\[..\]			add_perldoc_to_whatis (yytext + 3, 2);
+	{next}\\\*.			add_perldoc_to_whatis (yytext + yyleng - 1, 1);
+
+	{next}\\["#].*			/* comment */
+
+	{next}{font_change}		/* font changes */
+	{next}\\k{alpha}		/* mark input place in register */
+
+	{next}\\n(\({alpha})?{alpha}	/* interpolate number register */
+	{next}\\o\"[^"]+\"		/* overstrike chars */
+
+	{next}{size_change}		/* size changes */
+	{next}\\w{blank}*'[^']+'[^ \t]*	/* width of string */
+
+	{next}\\			/* catch all */
+
+	{next}\(\\\|\){blank}*		/* function() in hpux */
+}
+
+ /* some people rather ambitiously use non-trivial mdoc macros in NAME
+ sections; cope with those that have been seen in the wild, and a few
+ more */
+<MAN_DESC>{
+ {bol}\.At{blank}* BEGIN (MAN_DESC_AT);
+ {bol}\.Bsx{blank}* BEGIN (MAN_DESC_BSX);
+ {bol}\.Bx{blank}* BEGIN (MAN_DESC_BX);
+ {bol}\.Fx{blank}* BEGIN (MAN_DESC_FX);
+ {bol}\.Nx{blank}* BEGIN (MAN_DESC_NX);
+ {bol}\.Ox{blank}* BEGIN (MAN_DESC_OX);
+ {bol}\.Ux{blank}* add_word_to_whatis ("UNIX");
+
+ {bol}\.Dq{blank}* {
+ add_word_to_whatis ("\"");
+ BEGIN (MAN_DESC_DQ);
+ }
+}
+
+<MAN_DESC_AT>{
+ 32v{blank}* mdoc_text ("Version 32V AT&T UNIX");
+ v1{blank}* mdoc_text ("Version 1 AT&T UNIX");
+ v2{blank}* mdoc_text ("Version 2 AT&T UNIX");
+ v3{blank}* mdoc_text ("Version 3 AT&T UNIX");
+ v4{blank}* mdoc_text ("Version 4 AT&T UNIX");
+ v5{blank}* mdoc_text ("Version 5 AT&T UNIX");
+ v6{blank}* mdoc_text ("Version 6 AT&T UNIX");
+ v7{blank}* mdoc_text ("Version 7 AT&T UNIX");
+ V{blank}* mdoc_text ("AT&T System V UNIX");
+ V.1{blank}* mdoc_text ("AT&T System V.1 UNIX");
+ V.2{blank}* mdoc_text ("AT&T System V.2 UNIX");
+ V.3{blank}* mdoc_text ("AT&T System V.3 UNIX");
+ V.4{blank}* mdoc_text ("AT&T System V.4 UNIX");
+ .|{eol} {
+ yyless (0);
+ mdoc_text ("AT&T UNIX");
+ }
+}
+
+<MAN_DESC_BSX>{
+ {word} {
+ add_word_to_whatis ("BSD/OS");
+ add_wordn_to_whatis (yytext, yyleng);
+ BEGIN (MAN_DESC);
+ }
+ .|{eol} {
+ yyless (0);
+ mdoc_text ("BSD/OS");
+ }
+}
+
+	/* Argument of the mdoc .Bx macro.  The -alpha/-beta/-devel keywords
+	   expand to fixed phrases; any other word is taken as a version,
+	   gets "BSD" appended directly after it, and may be followed by a
+	   release name handled in MAN_DESC_BX_RELEASE.  With no argument the
+	   input is pushed back and plain "BSD" is emitted. */
+<MAN_DESC_BX>{
+	-alpha{blank}*		mdoc_text ("BSD (currently in alpha test)");
+	-beta{blank}*		mdoc_text ("BSD (currently in beta test)");
+	-devel{blank}*		mdoc_text ("BSD (currently under development)");
+	{word}{blank}*		{
+		add_wordn_to_whatis (yytext, yyleng);
+		add_str_to_whatis ("BSD", 3);
+		BEGIN (MAN_DESC_BX_RELEASE);
+	}
+	.|{eol}			{
+		yyless (0);
+		mdoc_text ("BSD");
+	}
+}
+
+<MAN_DESC_BX_RELEASE>{
+ [Rr]eno{blank}* {
+ add_str_to_whatis ("-Reno", 5);
+ BEGIN (MAN_DESC);
+ }
+ [Tt]ahoe{blank}* {
+ add_str_to_whatis ("-Tahoe", 6);
+ BEGIN (MAN_DESC);
+ }
+ [Ll]ite{blank}* {
+ add_str_to_whatis ("-Lite", 5);
+ BEGIN (MAN_DESC);
+ }
+ [Ll]ite2{blank}* {
+ add_str_to_whatis ("-Lite2", 6);
+ BEGIN (MAN_DESC);
+ }
+ .|{eol} {
+ yyless (0);
+ BEGIN (MAN_DESC);
+ }
+}
+
+<MAN_DESC_DQ>.* {
+ add_str_to_whatis (yytext, yyleng);
+ add_char_to_whatis ('"');
+ BEGIN (MAN_DESC);
+}
+
+<MAN_DESC_FX>{
+ {word} {
+ add_word_to_whatis ("FreeBSD");
+ add_wordn_to_whatis (yytext, yyleng);
+ BEGIN (MAN_DESC);
+ }
+ .|{eol} {
+ yyless (0);
+ mdoc_text ("FreeBSD");
+ }
+}
+
+<MAN_DESC_NX>{
+ {word} {
+ add_word_to_whatis ("NetBSD");
+ add_wordn_to_whatis (yytext, yyleng);
+ BEGIN (MAN_DESC);
+ }
+ .|{eol} {
+ yyless (0);
+ mdoc_text ("NetBSD");
+ }
+}
+
+<MAN_DESC_OX>{
+ {word} {
+ add_word_to_whatis ("OpenBSD");
+ add_wordn_to_whatis (yytext, yyleng);
+ BEGIN (MAN_DESC);
+ }
+ .|{eol} {
+ yyless (0);
+ mdoc_text ("OpenBSD");
+ }
+}
+
+ /* collapse spaces, escaped spaces, tabs, newlines to a single space */
+<CAT_NAME>{next}((\\[ ])|{blank})* add_char_to_whatis (' ');
+
+ /* a ROFF break request, a paragraph request, or an indentation change
+ usually means we have multiple whatis definitions, provide a separator
+ for later processing */
+<MAN_NAME,MAN_DESC>{
+ {bol}\.br{blank}* |
+ {bol}\.LP{blank}* |
+ {bol}\.PP{blank}* |
+ {bol}\.P{blank}* |
+ {bol}\.IP{blank}.* |
+ {bol}\.HP{blank}.* |
+ {bol}\.RS{blank}.* |
+ {bol}\.RE{blank}.* {
+ add_char_to_whatis ((char) 0x11);
+ BEGIN (MAN_NAME);
+ }
+}
+
+ /* any other roff request we don't recognise terminates definitions */
+<MAN_NAME,MAN_DESC>{bol}['.] {
+ *p_name = '\0';
+ BEGIN (MAN_REST);
+}
+
+ /* pass words as a chunk. speed optimization */
+<MAN_NAME,MAN_DESC>[[:alnum:]]* add_str_to_whatis (yytext, yyleng);
+
+ /* normalise the comma (,) separators */
+<CAT_NAME>{blank}*,[ \t\r\n]* |
+<MAN_NAME,MAN_DESC>{blank}*,{blank}* add_str_to_whatis (", ", 2);
+
+<CAT_NAME,MAN_NAME,MAN_DESC>{bol}. {
+ newline_found ();
+ add_char_to_whatis (yytext[yyleng - 1]);
+}
+
+<CAT_NAME,MAN_NAME,MAN_DESC>. add_char_to_whatis (*yytext);
+
+ /* default EOF rule */
+<<EOF>> return 1;
+
+%%
+
+/* print warning and force scanner to terminate */
+/* Called by the add_*_to_whatis helpers when appending would reach
+ * MAX_NAME.  Reports the page name held in fname and switches the scanner
+ * to FORCE_EXIT, whose rule terminates the string and stops scanning on
+ * the next character.
+ */
+static void too_big (void)
+{
+ /* Even though MAX_NAME is a macro expanding to a constant, we
+ * translate it using ngettext anyway because that will make it
+ * easier to change the macro later.
+ */
+ error (0, 0,
+ ngettext ("warning: whatis for %s exceeds %d byte, "
+ "truncating.",
+ "warning: whatis for %s exceeds %d bytes, "
+ "truncating.", MAX_NAME),
+ fname, MAX_NAME);
+
+ BEGIN (FORCE_EXIT);
+}
+
+/* Append exactly `length' bytes of `string' to newname and advance p_name.
+ * If the result would not leave room for the terminating NUL, nothing is
+ * copied and too_big () is called instead.
+ */
+static void add_str_to_whatis (const char *string, size_t length)
+{
+	const size_t used = p_name - newname;
+
+	if (used + length >= MAX_NAME) {
+		too_big ();
+		return;
+	}
+
+	(void) strncpy (p_name, string, length);
+	p_name += length;
+}
+
+/* Append one byte to newname.
+ * A full buffer triggers too_big ().  Otherwise, while waiting_for_quote
+ * is set, a '"' is swallowed and the flag cleared; any other byte is
+ * stored and p_name advanced.
+ */
+static void add_char_to_whatis (unsigned char c)
+{
+	if ((p_name - newname) + 1 >= MAX_NAME) {
+		too_big ();
+		return;
+	}
+
+	if (waiting_for_quote && c == '"') {
+		waiting_for_quote = false;
+		return;
+	}
+
+	*p_name = c;
+	p_name++;
+}
+
+/* Append the name/description separator.  "- " is always added; a leading
+ * space is added first only when newname is non-empty and does not already
+ * end in a space.
+ */
+static void add_separator_to_whatis (void)
+{
+	const bool have_text = (p_name != newname);
+
+	if (have_text && p_name[-1] != ' ')
+		add_char_to_whatis (' ');
+
+	add_str_to_whatis ("- ", 2);
+}
+
+/* Append the first `length' bytes of `string' as a separate word: a space
+ * is inserted first unless newname is empty or already ends in one, and
+ * trailing spaces of the word itself are not copied.
+ */
+static void add_wordn_to_whatis (const char *string, size_t length)
+{
+	size_t kept = length;
+
+	if (p_name != newname && p_name[-1] != ' ')
+		add_char_to_whatis (' ');
+
+	/* Strip trailing spaces from the word. */
+	for (; kept > 0 && string[kept - 1] == ' '; kept--)
+		;
+
+	if (kept == 0)
+		return;
+	add_str_to_whatis (string, kept);
+}
+
+/* NUL-terminated convenience wrapper around add_wordn_to_whatis. */
+static void add_word_to_whatis (const char *string)
+{
+	const size_t length = strlen (string);
+
+	add_wordn_to_whatis (string, length);
+}
+
+/* bsearch key for the macro tables: a name given as pointer plus length,
+ * so it can point straight into yytext without being NUL-terminated.
+ */
+struct compare_macro_key {
+ const char *string; /* first byte of the name */
+ size_t length; /* number of bytes in the name */
+};
+
+/* bsearch comparison callback: `left' is a struct compare_macro_key,
+ * `right' a struct macro table entry.  Returns the strncmp result over the
+ * key's length; when that prefix is equal, returns -1 if the entry's name
+ * is longer than the key and 0 if it ends there.
+ */
+static int compare_macro (const void *left, const void *right)
+{
+	const struct compare_macro_key *wanted = left;
+	const struct macro *entry = right;
+	const int prefix_cmp = strncmp (wanted->string, entry->name,
+					wanted->length);
+
+	if (prefix_cmp != 0)
+		return prefix_cmp;
+
+	/* The first wanted->length bytes are equal, so entry->name has at
+	 * least that many bytes before its NUL and this index is in bounds.
+	 */
+	return entry->name[wanted->length] != '\0' ? -1 : 0;
+}
+
+/* Look up the `length'-byte name at `string' in the sorted table `macros'
+ * (n_macros entries) and append its replacement text to newname.  A name
+ * that is not in the table adds nothing.
+ */
+static void add_macro_to_whatis (const struct macro *macros, size_t n_macros,
+				 const char *string, size_t length)
+{
+	const struct compare_macro_key wanted = { string, length };
+	const struct macro *found;
+
+	found = bsearch (&wanted, macros, n_macros, sizeof *macros,
+			 compare_macro);
+	if (!found)
+		return;
+
+	add_str_to_whatis (found->value, strlen (found->value));
+}
+
+/* Append the replacement for a roff glyph name, looked up in glyphs[]. */
+static void add_glyph_to_whatis (const char *string, size_t length)
+{
+	const size_t n_glyphs = sizeof (glyphs) / sizeof (glyphs[0]);
+
+	add_macro_to_whatis (glyphs, n_glyphs, string, length);
+}
+
+/* Append the replacement for a roff string name, looked up in perldocs[]. */
+static void add_perldoc_to_whatis (const char *string, size_t length)
+{
+	const size_t n_perldocs = sizeof (perldocs) / sizeof (perldocs[0]);
+
+	add_macro_to_whatis (perldocs, n_perldocs, string, length);
+}
+
+/* Emit the expansion of an mdoc macro as a separate word and switch the
+ * scanner back to the MAN_DESC start condition.
+ */
+static void mdoc_text (const char *string)
+{
+	add_wordn_to_whatis (string, strlen (string));
+	BEGIN (MAN_DESC);
+}
+
+/* Record a line break in the NAME section.
+ * Nothing is added when newname is empty or already ends in a space.
+ * Otherwise fill mode adds a single space; no-fill mode adds the 0x11
+ * separator byte and switches the scanner to MAN_NAME.  Any pending
+ * quote-skip is cancelled in all cases.
+ */
+static void newline_found (void)
+{
+	const bool mid_name = p_name != newname && p_name[-1] != ' ';
+
+	if (mid_name && fill_mode)
+		add_char_to_whatis (' ');
+	else if (mid_name) {
+		add_char_to_whatis ((char) 0x11);
+		BEGIN (MAN_NAME);
+	}
+
+	waiting_for_quote = false;
+}
+
+/* Open a manual page, set up decoding, and scan it for its whatis text.
+ *
+ * file      path of the page, or "-" to read a duplicate of stdin
+ * filename  passed through to find_name_decompressed
+ * p_lg      p_lg->type selects cat-page handling; results are stored by
+ *           find_name_decompressed
+ * encoding  source encoding of the page; if NULL and `file' is a real
+ *           path, it is derived from the page's language directory
+ *
+ * Returns 0 if the page cannot be opened or converted (an error has been
+ * reported), otherwise the result of find_name_decompressed.
+ */
+int find_name (const char *file, const char *filename, lexgrog *p_lg,
+	       const char *encoding)
+{
+	int ret = 0;
+	decompress *d;
+	char *page_encoding = NULL;
+	bool run_col = p_lg->type == CATPAGE && *PROG_COL != '\0';
+
+	if (strcmp (file, "-") == 0) {
+		int fd = dup (STDIN_FILENO);
+
+		/* dup can fail (e.g. too many open files); do not hand an
+		 * invalid descriptor to the decompressor.
+		 */
+		if (fd < 0) {
+			error (0, errno, _("can't open %s"), file);
+			return 0;
+		}
+		d = decompress_fdopen (fd);
+	} else {
+		struct stat st;
+		int decompress_flags;
+		char *lang;
+
+		if (stat (file, &st)) {
+			error (0, errno, "%s", file);
+			return 0;
+		}
+
+		if (S_ISDIR (st.st_mode)) {
+			error (0, EISDIR, "%s", file);
+			return 0;
+		}
+
+		drop_effective_privs ();
+		decompress_flags = 0;
+		/* If we're looking at a cat page, then we need to run col
+		 * over it, which doesn't work conveniently with an
+		 * in-process decompressor.
+		 */
+		if (!run_col)
+			decompress_flags |= DECOMPRESS_ALLOW_INPROCESS;
+		d = decompress_open (file, decompress_flags);
+		if (!d) {
+			error (0, errno, _("can't open %s"), file);
+			regain_effective_privs ();
+			return 0;
+		}
+		regain_effective_privs ();
+
+		if (!encoding) {
+			lang = lang_dir (file);
+			page_encoding = get_page_encoding (lang);
+			free (lang);
+		}
+	}
+	if (!page_encoding && encoding)
+		page_encoding = xstrdup (encoding);
+	if (page_encoding) {
+		/* Convert the page to UTF-8, either as an extra pipeline
+		 * stage or in-process, matching how it is decompressed.
+		 */
+		if (decompress_is_pipeline (d))
+			add_manconv (decompress_get_pipeline (d),
+				     page_encoding, "UTF-8");
+		else if (manconv_inprocess (d, page_encoding, "UTF-8") != 0)
+			/* manconv should already have written to stderr, so
+			 * just return zero (i.e. no result).
+			 */
+			goto out;
+	}
+	if (run_col) {
+		pipecmd *col_cmd;
+		col_cmd = pipecmd_new_args
+			(PROG_COL, "-b", "-p", "-x", (void *) 0);
+		pipecmd_pre_exec (col_cmd, sandbox_load, sandbox_free,
+				  sandbox);
+		pipeline_command (decompress_get_pipeline (d), col_cmd);
+	}
+	decompress_start (d);
+
+	ret = find_name_decompressed (d, filename, p_lg);
+
+out:
+	free (page_encoding);
+	decompress_free (d);
+	return ret;
+}
+
+/* Run the scanner over an already started decompressor.
+ *
+ * d         input source; installed as the scanner's reader
+ * filename  page name quoted in the "whatis exceeds" warning
+ * p_lg      p_lg->type selects the CAT_FILE or MAN_FILE start condition
+ *
+ * If the scanner returns nonzero (the default EOF rule, i.e. it never
+ * terminated a NAME section), returns 0 and leaves p_lg's result fields
+ * untouched.  Otherwise stores newly allocated strings in p_lg->whatis
+ * (the text with leading and trailing spaces removed) and p_lg->filters
+ * (the detected filter letters, or "-" if none), and returns the first
+ * byte of the whatis text, which is 0 when that text is empty.
+ */
+int find_name_decompressed (decompress *d, const char *filename, lexgrog *p_lg)
+{
+	int ret;
+
+	decomp = d;
+
+	/* Reset all per-page scanner state. */
+	fname = filename;
+	*(p_name = newname) = '\0';
+	memset (filters, '_', sizeof (filters));
+
+	fill_mode = true;
+	waiting_for_quote = false;
+
+	if (p_lg->type == CATPAGE)
+		BEGIN (CAT_FILE);
+	else
+		BEGIN (MAN_FILE);
+
+	drop_effective_privs ();
+
+	yyrestart (NULL);
+	ret = yylex ();
+
+	regain_effective_privs ();
+
+	decompress_wait (decomp);
+
+	if (ret)
+		return 0;
+	else {
+		/* One byte more than the number of filters, so the string
+		 * is still NUL-terminated when every filter was detected.
+		 */
+		char f_tmp[MAX_FILTERS + 1];
+		int j, k;
+
+		/* wipe out any leading or trailing spaces */
+		if (*newname) {
+			/* Stop at the start of the buffer: a name made up
+			 * only of spaces must not be scanned past
+			 * newname[0].
+			 */
+			for (p_name = strchr (newname, '\0');
+			     p_name > newname && *(p_name - 1) == ' ';
+			     p_name--);
+			if (*p_name == ' ')
+				*p_name = '\0';
+		}
+		for (p_name = newname; *p_name == ' '; p_name++);
+		p_lg->whatis = xstrdup (p_name);
+		memset (f_tmp, '\0', sizeof (f_tmp));
+		f_tmp[0] = '-';
+		for (j = k = 0; j < MAX_FILTERS; j++)
+			if (filters[j] != '_')
+				f_tmp[k++] = filters[j];
+		p_lg->filters = xstrdup (f_tmp);
+		return p_name[0];
+	}
+}