diff options
Diffstat (limited to '')
-rw-r--r-- | src/global/uxtext.c | 278 |
1 files changed, 278 insertions, 0 deletions
diff --git a/src/global/uxtext.c b/src/global/uxtext.c new file mode 100644 index 0000000..6dfe94c --- /dev/null +++ b/src/global/uxtext.c @@ -0,0 +1,278 @@ +/*++ +/* NAME +/* uxtext 3 +/* SUMMARY +/* quote/unquote text, xtext style. +/* SYNOPSIS +/* #include <uxtext.h> +/* +/* VSTRING *uxtext_quote(quoted, unquoted, special) +/* VSTRING *quoted; +/* const char *unquoted; +/* const char *special; +/* +/* VSTRING *uxtext_quote_append(unquoted, quoted, special) +/* VSTRING *unquoted; +/* const char *quoted; +/* const char *special; +/* +/* VSTRING *uxtext_unquote(unquoted, quoted) +/* VSTRING *unquoted; +/* const char *quoted; +/* +/* VSTRING *uxtext_unquote_append(unquoted, quoted) +/* VSTRING *unquoted; +/* const char *quoted; +/* DESCRIPTION +/* uxtext_quote() takes a null-terminated UTF8 string and +/* replaces characters \, <33(10) and >126(10), as well as +/* characters specified with "special" with \x{XX}, XX being +/* a 2-6-digit uppercase hexadecimal equivalent. +/* +/* uxtext_quote_append() is like uxtext_quote(), but appends +/* the conversion result to the result buffer. +/* +/* uxtext_unquote() performs the opposite transformation. This +/* function understands lowercase, uppercase, and mixed case +/* \x{XX...} sequences. The result value is the unquoted +/* argument in case of success, a null pointer otherwise. +/* +/* uxtext_unquote_append() is like uxtext_unquote(), but appends +/* the conversion result to the result buffer. +/* BUGS +/* This module cannot process null characters in data. +/* LICENSE +/* .ad +/* .fi +/* The Secure Mailer license must be distributed with this software. +/* AUTHOR(S) +/* Arnt Gulbrandsen +/* +/* Wietse Venema +/* IBM T.J. Watson Research +/* P.O. Box 704 +/* Yorktown Heights, NY 10598, USA +/* +/* Wietse Venema +/* Google, Inc. +/* 111 8th Avenue +/* New York, NY 10011, USA +/*--*/ + +/* System library. */ + +#include <sys_defs.h> +#include <string.h> +#include <ctype.h> + +/* Utility library. */ + +#include "msg.h" +#include "vstring.h" +#include "uxtext.h" + +/* Application-specific. */ + +#define STR(x) vstring_str(x) +#define LEN(x) VSTRING_LEN(x) + +/* uxtext_quote_append - append unquoted data to quoted data */ + +VSTRING *uxtext_quote_append(VSTRING *quoted, const char *unquoted, + const char *special) +{ + unsigned const char *cp; + int ch; + + for (cp = (unsigned const char *) unquoted; (ch = *cp) != 0; cp++) { + /* Fix 20140709: the '\' character must always be quoted. */ + if (ch != '\\' && ch > 32 && ch < 127 + && (*special == 0 || strchr(special, ch) == 0)) { + VSTRING_ADDCH(quoted, ch); + } else { + + /* + * had RFC6533 been written like 6531 and 6532, this else clause + * would be one line long. + */ + int unicode = 0; + int pick = 0; + + if (ch < 0x80) { + //0000 0000 - 0000 007 F 0x xxxxxx + unicode = ch; + } else if ((ch & 0xe0) == 0xc0) { + //0000 0080 - 0000 07 FF 110 xxxxx 10 xxxxxx + unicode = (ch & 0x1f); + pick = 1; + } else if ((ch & 0xf0) == 0xe0) { + //0000 0800 - 0000 FFFF 1110 xxxx 10 xxxxxx 10 xxxxxx + unicode = (ch & 0x0f); + pick = 2; + } else if ((ch & 0xf8) == 0xf0) { + //0001 0000 - 001 F FFFF 11110 xxx 10 xxxxxx 10 xxxxxx 10 xxxxxx + unicode = (ch & 0x07); + pick = 3; + } else if ((ch & 0xfc) == 0xf8) { + //0020 0000 - 03 FF FFFF 111110 xx 10 xxxxxx 10 xxxxxx...10 xxxxxx + unicode = (ch & 0x03); + pick = 4; + } else if ((ch & 0xfe) == 0xfc) { + //0400 0000 - 7 FFF FFFF 1111110 x 10 xxxxxx...10 xxxxxx + unicode = (ch & 0x01); + pick = 5; + } else { + return (0); + } + while (pick > 0) { + ch = *++cp; + if ((ch & 0xc0) != 0x80) + return (0); + unicode = unicode << 6 | (ch & 0x3f); + pick--; + } + vstring_sprintf_append(quoted, "\\x{%02X}", unicode); + } + } + VSTRING_TERMINATE(quoted); + return (quoted); +} + +/* uxtext_quote - unquoted data to quoted */ + +VSTRING *uxtext_quote(VSTRING *quoted, const char *unquoted, const char *special) +{ + VSTRING_RESET(quoted); + uxtext_quote_append(quoted, unquoted, special); + return (quoted); +} + +/* uxtext_unquote_append - quoted data to unquoted */ + +VSTRING *uxtext_unquote_append(VSTRING *unquoted, const char *quoted) +{ + const unsigned char *cp; + int ch; + + for (cp = (const unsigned char *) quoted; (ch = *cp) != 0; cp++) { + if (ch == '\\' && cp[1] == 'x' && cp[2] == '{') { + int unicode = 0; + + cp += 2; + while ((ch = *++cp) != '}') { + if (ISDIGIT(ch)) + unicode = (unicode << 4) + (ch - '0'); + else if (ch >= 'a' && ch <= 'f') + unicode = (unicode << 4) + (ch - 'a' + 10); + else if (ch >= 'A' && ch <= 'F') + unicode = (unicode << 4) + (ch - 'A' + 10); + else + return (0); /* also covers the null + * terminator */ + if (unicode > 0x10ffff) + return (0); + } + + /* + * the following block is from + * https://github.com/aox/aox/blob/master/encodings/utf.cpp, with + * permission by the authors. + */ + if (unicode < 0x80) { + VSTRING_ADDCH(unquoted, (char) unicode); + } else if (unicode < 0x800) { + VSTRING_ADDCH(unquoted, 0xc0 | ((char) (unicode >> 6))); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode & 0x3f))); + } else if (unicode < 0x10000) { + VSTRING_ADDCH(unquoted, 0xe0 | ((char) (unicode >> 12))); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode >> 6) & 0x3f)); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode & 0x3f))); + } else if (unicode < 0x200000) { + VSTRING_ADDCH(unquoted, 0xf0 | ((char) (unicode >> 18))); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode >> 12) & 0x3f)); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode >> 6) & 0x3f)); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode & 0x3f))); + } else if (unicode < 0x4000000) { + VSTRING_ADDCH(unquoted, 0xf8 | ((char) (unicode >> 24))); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode >> 18) & 0x3f)); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode >> 12) & 0x3f)); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode >> 6) & 0x3f)); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode & 0x3f))); + } else { + VSTRING_ADDCH(unquoted, 0xfc | ((char) (unicode >> 30))); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode >> 24) & 0x3f)); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode >> 18) & 0x3f)); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode >> 12) & 0x3f)); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode >> 6) & 0x3f)); + VSTRING_ADDCH(unquoted, 0x80 | ((char) (unicode & 0x3f))); + } + } else { + VSTRING_ADDCH(unquoted, ch); + } + } + VSTRING_TERMINATE(unquoted); + return (unquoted); +} + +/* uxtext_unquote - quoted data to unquoted */ + +VSTRING *uxtext_unquote(VSTRING *unquoted, const char *quoted) +{ + VSTRING_RESET(unquoted); + return (uxtext_unquote_append(unquoted, quoted) ? unquoted : 0); +} + +#ifdef TEST + + /* + * Proof-of-concept test program: convert to quoted and back. + */ +#include <vstream.h> + +#define BUFLEN 1024 + +static ssize_t read_buf(VSTREAM *fp, VSTRING *buf) +{ + ssize_t len; + + len = vstream_fread_buf(fp, buf, BUFLEN); + VSTRING_TERMINATE(buf); + return (len); +} + +int main(int unused_argc, char **unused_argv) +{ + VSTRING *unquoted = vstring_alloc(BUFLEN); + VSTRING *quoted = vstring_alloc(100); + ssize_t len; + + /* + * Negative tests. + */ + if (uxtext_unquote(unquoted, "\\x{x1}") != 0) + msg_warn("undetected error pattern 1"); + if (uxtext_unquote(unquoted, "\\x{2x}") != 0) + msg_warn("undetected error pattern 2"); + if (uxtext_unquote(unquoted, "\\x{33") != 0) + msg_warn("undetected error pattern 3"); + + /* + * Positive tests. + */ + while ((len = read_buf(VSTREAM_IN, unquoted)) > 0) { + uxtext_quote(quoted, STR(unquoted), "+="); + if (uxtext_unquote(unquoted, STR(quoted)) == 0) + msg_fatal("bad input: %.100s", STR(quoted)); + if (LEN(unquoted) != len) + msg_fatal("len %ld != unquoted len %ld", + (long) len, (long) LEN(unquoted)); + if (vstream_fwrite(VSTREAM_OUT, STR(unquoted), LEN(unquoted)) != LEN(unquoted)) + msg_fatal("write error: %m"); + } + vstream_fflush(VSTREAM_OUT); + vstring_free(unquoted); + vstring_free(quoted); + return (0); +} + +#endif |