diff options
Diffstat (limited to 'src/lexgrog.l')
-rw-r--r-- | src/lexgrog.l | 981 |
1 files changed, 981 insertions, 0 deletions
diff --git a/src/lexgrog.l b/src/lexgrog.l new file mode 100644 index 0000000..41527a4 --- /dev/null +++ b/src/lexgrog.l @@ -0,0 +1,981 @@ +%top{ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif /* HAVE_CONFIG_H */ + +#include "manconfig.h" + +/* Flex emits several functions which might reasonably have various + * attributes applied and many unused macros; none of these are our problem. + */ +#if GNUC_PREREQ(8,0) +# pragma GCC diagnostic ignored "-Wsuggest-attribute=malloc" +#endif +#pragma GCC diagnostic ignored "-Wsuggest-attribute=pure" +#pragma GCC diagnostic ignored "-Wunused-macros" +} + +%{ + +/* + * lexgrog.l: extract 'whatis' info from nroff man / formatted cat pages. + * + * Copyright (C) 1994, 1995 Graeme W. Wilford. (Wilf.) + * Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, + * 2011, 2012 Colin Watson. + * + * This file is part of man-db. + * + * man-db is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * man-db is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with man-db; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Wed Oct 12 18:46:11 BST 1994 Wilf. (G.Wilford@ee.surrey.ac.uk) + * + * CJW: Detect grap and vgrind. Understand fill requests. Other improvements + * in the syntax accepted. + */ + +#include <sys/stat.h> +#include <errno.h> +#include <stdbool.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> + +#include "error.h" +#include "xalloc.h" + +#include "gettext.h" +#define _(String) gettext (String) + +#include "encodings.h" +#include "pipeline.h" +#include "sandbox.h" +#include "security.h" +#include "util.h" + +#include "decompress.h" +#include "lexgrog.h" +#include "manconv.h" +#include "manconv_client.h" + +#define YY_READ_BUF_SIZE 1024 +#define MAX_NAME 8192 + +/* defines the ordered list of filters detected by lexgrog */ +enum { + TBL_FILTER = 0, /* tbl */ + EQN_FILTER, /* eqn */ + PIC_FILTER, /* pic */ + GRAP_FILTER, /* grap */ + REF_FILTER, /* refer */ + VGRIND_FILTER, /* vgrind */ + MAX_FILTERS /* delimiter */ +}; + +#define ARRAY_SIZE(array) (sizeof (array) / sizeof ((array)[0])) + +extern man_sandbox *sandbox; + +struct macro { + const char *name; + const char *value; +}; + +static const struct macro glyphs[] = { + /* It is vital to keep these in strcmp order (sort -t\" -k2)! They + * will be searched using bsearch. + * Data from groff_char(7), although I have omitted some that are + * particularly unlikely to be used in NAME sections. + */ + { "'A", "Á" }, + { "'C", "Ć" }, + { "'E", "É" }, + { "'I", "Í" }, + { "'O", "Ó" }, + { "'U", "Ú" }, + { "'Y", "Ý" }, + { "'a", "á" }, + { "'c", "ć" }, + { "'e", "é" }, + { "'i", "í" }, + { "'o", "ó" }, + { "'u", "ú" }, + { "'y", "ý" }, + { ",C", "Ç" }, + { ",c", "ç" }, + { "-D", "Ð" }, + { ".i", "ı" }, + { "/L", "Ł" }, + { "/O", "Ø" }, + { "/l", "ł" }, + { "/o", "ø" }, + { ":A", "Ä" }, + { ":E", "Ë" }, + { ":I", "Ï" }, + { ":O", "Ö" }, + { ":U", "Ü" }, + { ":Y", "Ÿ" }, + { ":a", "ä" }, + { ":e", "ë" }, + { ":i", "ï" }, + { ":o", "ö" }, + { ":u", "ü" }, + { ":y", "ÿ" }, + { "AE", "Æ" }, + { "Bq", "„" }, + { "Fc", "»" }, + { "Fi", "ffi" }, + { "Fl", "ffl" }, + { "Fo", "«" }, + { "IJ", "IJ" }, + { "OE", "Œ" }, + { "Sd", "ð" }, + { "TP", "Þ" }, + { "Tp", "þ" }, + { "^A", "Â" }, + { "^E", "Ê" }, + { "^I", "Î" }, + { "^O", "Ô" }, + { "^U", "Û" }, + { "^a", "â" }, + { "^e", "ê" }, + { "^i", "î" }, + { "^o", "ô" }, + { "^u", "û" }, + { "`A", "À" }, + { "`E", "È" }, + { "`I", "Ì" }, + { "`O", "Ò" }, + { "`U", "Ù" }, + { "`a", "à" }, + { "`e", "è" }, + { "`i", "ì" }, + { "`o", "ò" }, + { "`u", "ù" }, + { "a\"", "˝" }, + { "a-", "¯" }, + { "a.", "˙" }, + { "a^", "^" }, + { "aa", "´" }, + { "ab", "˘" }, + { "ac", "¸" }, + { "ad", "¨" }, + { "ae", "æ" }, + { "ah", "ˇ" }, + { "ao", "˚" }, + { "aq", "'" }, + { "a~", "~" }, + { "bq", "‚" }, + { "cq", "’" }, + { "dq", "\"" }, + { "em", "—" }, + { "en", "–" }, + { "fc", "›" }, + { "ff", "ff" }, + { "fi", "fi" }, + { "fl", "fl" }, + { "fo", "‹" }, + { "ga", "`" }, + { "ha", "^" }, + { "ho", "˛" }, + { "hy", "‐" }, + { "ij", "ij" }, + { "lq", "“" }, + { "oA", "Å" }, + { "oa", "å" }, + { "oe", "œ" }, + { "oq", "‘" }, + { "r!", "¡" }, + { "r?", "¿" }, + { "rq", "”" }, + { "ss", "ß" }, + { "ti", "~" }, + { "vS", "Š" }, + { "vZ", "Ž" }, + { "vs", "š" }, + { "vz", "ž" }, + { "~A", "Ã" }, + { "~N", "Ñ" }, + { "~O", "Õ" }, + { "~a", "ã" }, + { "~n", "ñ" }, + { "~o", "õ" } +}; + +static const struct macro perldocs[] = { + /* It is vital to keep these in strcmp order (sort -t\" -k2)! They + * will be searched using bsearch. + * Data from Pod/Man.pm. + */ + { "--", "-" }, + { "Aq", "'" }, + { "C'", "'" }, + { "C+", "C++" }, + { "C`", "`" }, + { "L\"", "\"" }, + { "PI", "π" }, + { "R\"", "\"" } +}; + +static void add_str_to_whatis (const char *string, size_t length); +static void add_char_to_whatis (unsigned char c); +static void add_separator_to_whatis (void); +static void add_wordn_to_whatis (const char *string, size_t length); +static void add_word_to_whatis (const char *string); +static void add_glyph_to_whatis (const char *string, size_t length); +static void add_perldoc_to_whatis (const char *string, size_t length); +static void mdoc_text (const char *string); +static void newline_found (void); + +static char newname[MAX_NAME]; +static char *p_name; +static const char *fname; +static char filters[MAX_FILTERS]; + +static bool fill_mode; +static bool waiting_for_quote; + +static decompress *decomp; + +#define YY_INPUT(buf,result,max_size) { \ + size_t size = max_size; \ + const char *block = decompress_read (decomp, &size); \ + if (block && size != 0) { \ + memcpy (buf, block, size); \ + buf[size] = '\0'; \ + result = size; \ + } else \ + result = YY_NULL; \ +} +#define YY_NO_INPUT +%} + +%option ecs meta-ecs +%option 8bit batch caseful never-interactive +%option nostdinit +%option warn +%option noyywrap nounput + +%x MAN_PRENAME +%x MAN_NAME +%x MAN_DESC +%x MAN_DESC_AT +%x MAN_DESC_BSX +%x MAN_DESC_BX +%x MAN_DESC_BX_RELEASE +%x MAN_DESC_DQ +%x MAN_DESC_FX +%x MAN_DESC_NX +%x MAN_DESC_OX +%x CAT_NAME +%x CAT_FILE +%x MAN_FILE +%x CAT_REST +%x MAN_REST +%x FORCE_EXIT + +digit [[:digit:]] +upper [[:upper:]] +alpha [[:alpha:]] +blank [[:blank:]] +blank_eol [[:blank:]\r\n] +word [[:alnum:]][^[:blank:]\r\n]* +eol \r?\n +bol {eol}+ +next {eol}* +empty {eol}{blank}* +indent {eol}{blank}+ +dbl_quote \" +font_change \\f([[:upper:]1-4]|\({upper}{2}) +size_change \\s[+-]?{digit} +style_change ({font_change}{size_change}?|{size_change}{font_change}?) +typeface \.(B[IR]?|I[BR]?|R[BI]|S[BM]) +sec_request \.[Ss][HhYySs] +comment ['.]\\{dbl_quote} + + /* Please add to this list if you know how. */ + /* Note that, since flex only supports UTF-8 by accident, character classes + * including non-ASCII characters must be written out as (a|b|c|d) rather + * than [abcd]. + */ +ar_name (اﻹسم|الإسم) + /* ИМЕ also works for mk */ +bg_name И(М|м)(Е|е) +cs_name (J[Mm](É|é|\\\('[Ee]|E|e)[Nn][Oo]|N(Á|á)[Zz][Ee][Vv]) +da_name N[Aa][Vv][Nn] +de_name B[Ee][Zz][Ee][Ii][Cc][Hh][Nn][Uu][Nn][Gg] +en_name N[Aa][Mm][Ee] +eo_name N[Oo][Mm][Oo] +es_name N[Oo][Mm][Bb][Rr][Ee] +fa_name نام +fi_name N[Ii][Mm][Ii] +fr_name N[Oo][Mm] +hu_name N(É|é|\\\('[Ee]|E|e)[Vv] +id_name N[Aa][Mm][Aa] + /* NOME also works for gl, pt */ +it_name N[Oo][Mm][Ee] +ja_name (名|̾)(前|称) +ko_name (이름|명칭) +latin_name N[Oo][Mm][Ee][Nn] +lt_name P[Aa][Vv][Aa][Dd][Ii][Nn][Ii][Mm][Aa][Ss] +nl_name N[Aa][Aa][Mm] +pl_name N[Aa][Zz][Ww][Aa] +ro_name N[Uu][Mm][Ee] +ru_name (И(М|м)(Я|я)|Н(А|а)(З|з)(В|в)(А|а)(Н|н)(И|и)(Е|е)|Н(А|а)(И|и)(М|м)(Е|е)(Н|н)(О|о)(В|в)(А|а)(Н|н)(И|и)(Е|е)) +sk_name M[Ee][Nn][Oo] +sr_name (И(М|м)(Е|е)|Н(А|а)(З|з)(И|и)(В|в)) +srlatin_name (I[Mm][Ee]|N[Aa][Zz][Ii][Vv]) +sv_name N[Aa][Mm][Nn] +ta_name பெய +tr_name (A[Dd]|(İ|i)S(İ|i)M) +uk_name Н(А|а)(З|з)(В|в)(А|а) +vi_name T(Ê|ê)[Nn] +zh_CN_name 名{blank}?(称|字){blank}?.* +zh_TW_name (名{blank}?(稱|字)|命令名){blank}?.* +name ({ar_name}|{bg_name}|{cs_name}|{da_name}|{de_name}|{en_name}|{eo_name}|{es_name}|{fa_name}|{fi_name}|{fr_name}|{hu_name}|{id_name}|{it_name}|{ja_name}|{ko_name}|{latin_name}|{lt_name}|{nl_name}|{pl_name}|{ro_name}|{ru_name}|{sk_name}|{sr_name}|{srlatin_name}|{sv_name}|{ta_name}|{tr_name}|{uk_name}|{vi_name}|{zh_CN_name}|{zh_TW_name}) +name_sec {dbl_quote}?{style_change}?{name}{style_change}?({blank}*{dbl_quote})? + + /* eptgrv : eqn, pic, tbl, grap, refer, vgrind */ +tbl_request \.TS +eqn_request \.EQ +pic_request \.PS +grap_request \.G1 +ref1_request \.R1 +ref2_request \.\[ +vgrind_request \.vS + +%% + + /* begin NAME section processing */ +<MAN_FILE>{sec_request}{blank_eol}+{name_sec}{blank}* BEGIN (MAN_PRENAME); +<CAT_FILE>{empty}{2,}{name}{blank}*{indent} BEGIN (CAT_NAME); + + /* general text matching */ +<MAN_FILE>{ + \.[^Ss\r\n].* | + \..{0,3}{dbl_quote}?.{0,4}{dbl_quote}? | + {comment}.* | + .|{eol} +} + +<CAT_FILE>{ + .{1,9} | + [ ]* | + {eol}{2,} | + .|{eol} +} + +<MAN_REST>{ + {bol}{tbl_request} filters[TBL_FILTER] = 't'; + {bol}{eqn_request} filters[EQN_FILTER] = 'e'; + {bol}{pic_request} filters[PIC_FILTER] = 'p'; + {bol}{grap_request} filters[GRAP_FILTER] = 'g'; + {bol}{ref1_request} | + {bol}{ref2_request} filters[REF_FILTER] = 'r'; + {bol}{vgrind_request} filters[VGRIND_FILTER] = 'v'; +} +<MAN_REST><<EOF>> { /* exit */ + *p_name = '\0'; /* terminate the string */ + yyterminate (); +} +<MAN_REST>.+|{eol} + + /* rules to end NAME section processing */ +<FORCE_EXIT>.|{eol} { /* forced exit */ + *p_name = '\0'; /* terminate the string */ + yyterminate (); +} + +<MAN_PRENAME>{bol}{sec_request}{blank}* | +<MAN_PRENAME><<EOF>> { /* no NAME at all */ + *p_name = '\0'; + BEGIN (MAN_REST); +} + + /* need to match whole string so that we beat the following roff catch-all, + so use yyless to push back the name */ +<MAN_PRENAME>{ + {bol}{typeface}{blank}.* | + {bol}\.Tn{blank}.* | + {bol}\.ft{blank}.* | + {bol}\.V[be]{blank}.* | + {bol}\.IX{blank}.* | + {bol}\.Nm{blank}.* { + yyless (0); + BEGIN (MAN_NAME); + } +} + + /* Skip over initial roff requests in NAME section. The use of yyless here + is evil. */ +<MAN_PRENAME>{bol}['.].* + +<MAN_PRENAME>{empty}{eol} yyless (1); + +<MAN_PRENAME>.|{eol} { + yyless (0); + BEGIN (MAN_NAME); +} + +<MAN_NAME,MAN_DESC>{ + {bol}{sec_request}{blank}* | /* Another section */ + {bol}\.X{upper}{blank}+ | /* special - hpux */ + {bol}\.sp{blank}* | /* vertical spacing */ + {bol}\.ig{blank}* | /* block comment */ + {bol}\.de[1i]?{blank}* | /* macro definition */ + {bol}\.i[ef]{blank}* | /* conditional */ + {empty}{bol}.+ | + <<EOF>> { /* terminate the string */ + *p_name = '\0'; + BEGIN (MAN_REST); + } +} + +<CAT_NAME>{ + {bol}S[yYeE] | + {eol}{2,}.+ | + {next}__ { /* terminate the string */ + *p_name = '\0'; + BEGIN (CAT_REST); + yyterminate (); + } +} + + /* ROFF request removal */ +<MAN_NAME,MAN_DESC>{ + /* some include quoting; dealing with this is unpleasant */ + {bol}{typeface}{blank}+\" { + newline_found (); + waiting_for_quote = true; + } + + {bol}{typeface}{blank}+ | /* type face commands */ + {bol}\.Tn{blank}+ | /* mdoc trade name */ + {bol}\.ft{blank}.* | /* font change */ + {bol}\.V[be]{blank}.* | /* pod2man, verbatim mode */ + {bol}\.IX{blank}.* | /* .IX line */ + {bol}\.Nm{blank}+ | /* mdoc name */ + {bol}\.PD{blank}* | /* paragraph spacing */ + {bol}\\& | /* non-breaking space */ + {next}{comment}.* { /* per line comments */ + newline_found (); + } +} + + /* No-op requests */ +<MAN_NAME,MAN_DESC>{ + {bol}\.{blank}*$ newline_found (); + {bol}\.\.$ newline_found (); +} + + /* Toggle fill mode */ +<MAN_NAME,MAN_DESC>{ + {bol}\.nf.* fill_mode = false; + {bol}\.fi.* fill_mode = true; +} + +<CAT_NAME>-{eol}{blank_eol}* /* strip continuations */ + + /* convert to DASH */ +<MAN_NAME>{ + {next}{blank}*\\\((mi|hy|em|en){blank}* | + {next}{blank}*\\\[(mi|hy|em|en)\]{blank}* | + {next}{blank_eol}+[-\\]-{blank}* | + {next}{blank_eol}*[-\\]-{blank}+ | + {bol}\.Nd{blank}* { + add_separator_to_whatis (); + BEGIN (MAN_DESC); + } +} +<CAT_NAME>{next}{blank}+-{1,2}{blank_eol}+ add_separator_to_whatis (); + + /* escape sequences and special characters */ +<MAN_NAME,MAN_DESC>{ + {next}\\[\\e] add_char_to_whatis ('\\'); + {next}\\('|\(aa) add_char_to_whatis ('\''); + {next}\\(`|\(ga) add_char_to_whatis ('`'); + {next}\\(-|\((mi|hy|em|en)) add_char_to_whatis ('-'); + {next}\\\[(mi|hy|em|en)\] add_char_to_whatis ('-'); + {next}\\\. add_char_to_whatis ('.'); + {next}((\\[ 0t~])|[ ]|\t)* add_char_to_whatis (' '); + {next}\\\((ru|ul) add_char_to_whatis ('_'); + {next}\\\\t add_char_to_whatis ('\t'); + + {next}\\[|^&!%acdpruz{}\r\n] /* various useless control chars */ + {next}\\[bhlLvx]{blank}*'[^']+' /* various inline functions */ + + {next}\\\$[1-9] /* interpolate arg */ + + /* roff named glyphs */ + {next}\\\(..|\\\[..\] add_glyph_to_whatis (yytext + 2, 2); + /* perldoc strings */ + {next}\\\*\(..|\\\*\[..\] add_perldoc_to_whatis (yytext + 3, 2); + {next}\\\*. add_perldoc_to_whatis (yytext + 2, 1); + + {next}\\["#].* /* comment */ + + {next}{font_change} /* font changes */ + {next}\\k{alpha} /* mark input place in register */ + + {next}\\n(\({alpha})?{alpha} /* interpolate number register */ + {next}\\o\"[^"]+\" /* overstrike chars */ + + {next}{size_change} /* size changes */ + {next}\\w{blank}*'[^']+'[^ \t]* /* width of string */ + + {next}\\ /* catch all */ + + {next}\(\\\|\){blank}* /* function() in hpux */ +} + + /* some people rather ambitiously use non-trivial mdoc macros in NAME + sections; cope with those that have been seen in the wild, and a few + more */ +<MAN_DESC>{ + {bol}\.At{blank}* BEGIN (MAN_DESC_AT); + {bol}\.Bsx{blank}* BEGIN (MAN_DESC_BSX); + {bol}\.Bx{blank}* BEGIN (MAN_DESC_BX); + {bol}\.Fx{blank}* BEGIN (MAN_DESC_FX); + {bol}\.Nx{blank}* BEGIN (MAN_DESC_NX); + {bol}\.Ox{blank}* BEGIN (MAN_DESC_OX); + {bol}\.Ux{blank}* add_word_to_whatis ("UNIX"); + + {bol}\.Dq{blank}* { + add_word_to_whatis ("\""); + BEGIN (MAN_DESC_DQ); + } +} + +<MAN_DESC_AT>{ + 32v{blank}* mdoc_text ("Version 32V AT&T UNIX"); + v1{blank}* mdoc_text ("Version 1 AT&T UNIX"); + v2{blank}* mdoc_text ("Version 2 AT&T UNIX"); + v3{blank}* mdoc_text ("Version 3 AT&T UNIX"); + v4{blank}* mdoc_text ("Version 4 AT&T UNIX"); + v5{blank}* mdoc_text ("Version 5 AT&T UNIX"); + v6{blank}* mdoc_text ("Version 6 AT&T UNIX"); + v7{blank}* mdoc_text ("Version 7 AT&T UNIX"); + V{blank}* mdoc_text ("AT&T System V UNIX"); + V.1{blank}* mdoc_text ("AT&T System V.1 UNIX"); + V.2{blank}* mdoc_text ("AT&T System V.2 UNIX"); + V.3{blank}* mdoc_text ("AT&T System V.3 UNIX"); + V.4{blank}* mdoc_text ("AT&T System V.4 UNIX"); + .|{eol} { + yyless (0); + mdoc_text ("AT&T UNIX"); + } +} + +<MAN_DESC_BSX>{ + {word} { + add_word_to_whatis ("BSD/OS"); + add_wordn_to_whatis (yytext, yyleng); + BEGIN (MAN_DESC); + } + .|{eol} { + yyless (0); + mdoc_text ("BSD/OS"); + } +} + +<MAN_DESC_BX>{ + -alpha{blank}* mdoc_text ("BSD (currently in alpha test)"); + -beta{blank}* mdoc_text ("BSD (currently in beta test)"); + -devel{blank}* mdoc_text ("BSD (currently under development"); + {word}{blank}* { + add_wordn_to_whatis (yytext, yyleng); + add_str_to_whatis ("BSD", 3); + BEGIN (MAN_DESC_BX_RELEASE); + } + .|{eol} { + yyless (0); + mdoc_text ("BSD"); + } +} + +<MAN_DESC_BX_RELEASE>{ + [Rr]eno{blank}* { + add_str_to_whatis ("-Reno", 5); + BEGIN (MAN_DESC); + } + [Tt]ahoe{blank}* { + add_str_to_whatis ("-Tahoe", 6); + BEGIN (MAN_DESC); + } + [Ll]ite{blank}* { + add_str_to_whatis ("-Lite", 5); + BEGIN (MAN_DESC); + } + [Ll]ite2{blank}* { + add_str_to_whatis ("-Lite2", 6); + BEGIN (MAN_DESC); + } + .|{eol} { + yyless (0); + BEGIN (MAN_DESC); + } +} + +<MAN_DESC_DQ>.* { + add_str_to_whatis (yytext, yyleng); + add_char_to_whatis ('"'); + BEGIN (MAN_DESC); +} + +<MAN_DESC_FX>{ + {word} { + add_word_to_whatis ("FreeBSD"); + add_wordn_to_whatis (yytext, yyleng); + BEGIN (MAN_DESC); + } + .|{eol} { + yyless (0); + mdoc_text ("FreeBSD"); + } +} + +<MAN_DESC_NX>{ + {word} { + add_word_to_whatis ("NetBSD"); + add_wordn_to_whatis (yytext, yyleng); + BEGIN (MAN_DESC); + } + .|{eol} { + yyless (0); + mdoc_text ("NetBSD"); + } +} + +<MAN_DESC_OX>{ + {word} { + add_word_to_whatis ("OpenBSD"); + add_wordn_to_whatis (yytext, yyleng); + BEGIN (MAN_DESC); + } + .|{eol} { + yyless (0); + mdoc_text ("OpenBSD"); + } +} + + /* collapse spaces, escaped spaces, tabs, newlines to a single space */ +<CAT_NAME>{next}((\\[ ])|{blank})* add_char_to_whatis (' '); + + /* a ROFF break request, a paragraph request, or an indentation change + usually means we have multiple whatis definitions, provide a separator + for later processing */ +<MAN_NAME,MAN_DESC>{ + {bol}\.br{blank}* | + {bol}\.LP{blank}* | + {bol}\.PP{blank}* | + {bol}\.P{blank}* | + {bol}\.IP{blank}.* | + {bol}\.HP{blank}.* | + {bol}\.RS{blank}.* | + {bol}\.RE{blank}.* { + add_char_to_whatis ((char) 0x11); + BEGIN (MAN_NAME); + } +} + + /* any other roff request we don't recognise terminates definitions */ +<MAN_NAME,MAN_DESC>{bol}['.] { + *p_name = '\0'; + BEGIN (MAN_REST); +} + + /* pass words as a chunk. speed optimization */ +<MAN_NAME,MAN_DESC>[[:alnum:]]* add_str_to_whatis (yytext, yyleng); + + /* normalise the comma (,) separators */ +<CAT_NAME>{blank}*,[ \t\r\n]* | +<MAN_NAME,MAN_DESC>{blank}*,{blank}* add_str_to_whatis (", ", 2); + +<CAT_NAME,MAN_NAME,MAN_DESC>{bol}. { + newline_found (); + add_char_to_whatis (yytext[yyleng - 1]); +} + +<CAT_NAME,MAN_NAME,MAN_DESC>. add_char_to_whatis (*yytext); + + /* default EOF rule */ +<<EOF>> return 1; + +%% + +/* print warning and force scanner to terminate */ +static void too_big (void) +{ + /* Even though MAX_NAME is a macro expanding to a constant, we + * translate it using ngettext anyway because that will make it + * easier to change the macro later. + */ + error (0, 0, + ngettext ("warning: whatis for %s exceeds %d byte, " + "truncating.", + "warning: whatis for %s exceeds %d bytes, " + "truncating.", MAX_NAME), + fname, MAX_NAME); + + BEGIN (FORCE_EXIT); +} + +/* append a string to newname if enough room */ +static void add_str_to_whatis (const char *string, size_t length) +{ + if (p_name - newname + length >= MAX_NAME) + too_big (); + else { + (void) strncpy (p_name, string, length); + p_name += length; + } +} + +/* append a char to newname if enough room */ +static void add_char_to_whatis (unsigned char c) +{ + if (p_name - newname + 1 >= MAX_NAME) + too_big (); + else if (waiting_for_quote && c == '"') + waiting_for_quote = false; + else + *p_name++ = c; +} + +/* append the " - " separator to newname, trimming the first space if one's + * already there + */ +static void add_separator_to_whatis (void) +{ + if (p_name != newname && *(p_name - 1) != ' ') + add_char_to_whatis (' '); + add_str_to_whatis ("- ", 2); +} + +/* append a word to newname if enough room, ensuring only necessary + surrounding space */ +static void add_wordn_to_whatis (const char *string, size_t length) +{ + if (p_name != newname && *(p_name - 1) != ' ') + add_char_to_whatis (' '); + while (length && string[length - 1] == ' ') + --length; + if (length) + add_str_to_whatis (string, length); +} + +static void add_word_to_whatis (const char *string) +{ + add_wordn_to_whatis (string, strlen (string)); +} + +struct compare_macro_key { + const char *string; + size_t length; +}; + +static int compare_macro (const void *left, const void *right) +{ + const struct compare_macro_key *key = left; + const struct macro *value = right; + int cmp; + + cmp = strncmp (key->string, value->name, key->length); + if (cmp) + return cmp; + /* equal up to key->length, so value->name must be at least size + * key->length + 1 + */ + else if (value->name[key->length]) + return -1; + else + return 0; +} + +static void add_macro_to_whatis (const struct macro *macros, size_t n_macros, + const char *string, size_t length) +{ + struct compare_macro_key key; + const struct macro *macro; + + key.string = string; + key.length = length; + macro = bsearch (&key, macros, n_macros, sizeof (struct macro), + compare_macro); + if (macro) + add_str_to_whatis (macro->value, strlen (macro->value)); +} + +static void add_glyph_to_whatis (const char *string, size_t length) +{ + add_macro_to_whatis (glyphs, ARRAY_SIZE (glyphs), string, length); +} + +static void add_perldoc_to_whatis (const char *string, size_t length) +{ + add_macro_to_whatis (perldocs, ARRAY_SIZE (perldocs), string, length); +} + +static void mdoc_text (const char *string) +{ + add_word_to_whatis (string); + BEGIN (MAN_DESC); +} + +static void newline_found (void) +{ + /* If we are mid p_name and the last added char was not a space, + * best add one. + */ + if (p_name != newname && *(p_name - 1) != ' ') { + if (fill_mode) + add_char_to_whatis (' '); + else { + add_char_to_whatis ((char) 0x11); + BEGIN (MAN_NAME); + } + } + waiting_for_quote = false; +} + +int find_name (const char *file, const char *filename, lexgrog *p_lg, + const char *encoding) +{ + int ret = 0; + decompress *d; + char *page_encoding = NULL; + bool run_col = p_lg->type == CATPAGE && *PROG_COL != '\0'; + + if (strcmp (file, "-") == 0) { + d = decompress_fdopen (dup (STDIN_FILENO)); + } else { + struct stat st; + int decompress_flags; + char *lang; + + if (stat (file, &st)) { + error (0, errno, "%s", file); + return 0; + } + + if (S_ISDIR (st.st_mode)) { + error (0, EISDIR, "%s", file); + return 0; + } + + drop_effective_privs (); + decompress_flags = 0; + /* If we're looking at a cat page, then we need to run col + * over it, which doesn't work conveniently with an + * in-process decompressor. + */ + if (!run_col) + decompress_flags |= DECOMPRESS_ALLOW_INPROCESS; + d = decompress_open (file, decompress_flags); + if (!d) { + error (0, errno, _("can't open %s"), file); + regain_effective_privs (); + return 0; + } + regain_effective_privs (); + + if (!encoding) { + lang = lang_dir (file); + page_encoding = get_page_encoding (lang); + free (lang); + } + } + if (!page_encoding && encoding) + page_encoding = xstrdup (encoding); + if (page_encoding) { + if (decompress_is_pipeline (d)) + add_manconv (decompress_get_pipeline (d), + page_encoding, "UTF-8"); + else if (manconv_inprocess (d, page_encoding, "UTF-8") != 0) + /* manconv should already have written to stderr, so + * just return zero (i.e. no result). + */ + goto out; + } + if (run_col) { + pipecmd *col_cmd; + col_cmd = pipecmd_new_args + (PROG_COL, "-b", "-p", "-x", (void *) 0); + pipecmd_pre_exec (col_cmd, sandbox_load, sandbox_free, + sandbox); + pipeline_command (decompress_get_pipeline (d), col_cmd); + } + decompress_start (d); + + ret = find_name_decompressed (d, filename, p_lg); + +out: + free (page_encoding); + decompress_free (d); + return ret; +} + +int find_name_decompressed (decompress *d, const char *filename, lexgrog *p_lg) +{ + int ret; + + decomp = d; + + fname = filename; + *(p_name = newname) = '\0'; + memset (filters, '_', sizeof (filters)); + + fill_mode = true; + waiting_for_quote = false; + + if (p_lg->type == CATPAGE) + BEGIN (CAT_FILE); + else + BEGIN (MAN_FILE); + + drop_effective_privs (); + + yyrestart (NULL); + ret = yylex (); + + regain_effective_privs (); + + decompress_wait (decomp); + + if (ret) + return 0; + else { + char f_tmp[MAX_FILTERS]; + int j, k; + + /* wipe out any leading or trailing spaces */ + if (*newname) { + for (p_name = strchr (newname, '\0'); + *(p_name - 1) == ' '; + p_name--); + if (*p_name == ' ') + *p_name = '\0'; + } + for (p_name = newname; *p_name == ' '; p_name++); + p_lg->whatis = xstrdup (p_name); + memset (f_tmp, '\0', MAX_FILTERS); + f_tmp[0] = '-'; + for (j = k = 0; j < MAX_FILTERS; j++) + if (filters[j] != '_') + f_tmp[k++] = filters[j]; + p_lg->filters = xstrdup (f_tmp); + return p_name[0]; + } +} |