1 files changed, 345 insertions, 0 deletions
diff --git a/src/rfc2047.c b/src/rfc2047.c
new file mode 100644
index 0000000..1ed1dd8
--- /dev/null
+++ b/src/rfc2047.c
@@ -0,0 +1,345 @@
+/*************************************************
+*     Exim - an Internet mail transport agent    *
+*************************************************/
+
+/* Copyright (c) The Exim Maintainers 2020 - 2022 */
+/* Copyright (c) University of Cambridge 1995 - 2018 */
+/* See the file NOTICE for conditions of use and distribution. */
+
+/* This file contains a function for decoding message header lines that may
+contain encoded "words" according to the rules described in
+
+  RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt
+
+The function is a rewritten version of code created by Norihisa Washitake.
+The original could be used both inside Exim (as part of a patch) or in a
+freestanding form. The original contained some built-in code conversions; I
+have chosen only to do code conversions if iconv() is supported by the OS.
+Because there were quite a lot of hacks to be done, for a variety of reasons,
+I rewrote the code.
+
+You can find the latest version of the original library at
+
+  http://washitake.com/mail/exim/mime/
+
+The code below is almost completely unlike the original. */
+
+
+#include "exim.h"
+
+
+/*************************************************
+*                Do a QP conversion              *
+*************************************************/
+
+/* This function decodes "quoted printable" into bytes.
+
+Arguments:
+  string      the string that includes QP escapes
+  ptrptr      where to return pointer to the decoded string
+
+Returns:      the length of the decoded string, or -1 on failure
+*/
+
+static int
+rfc2047_qpdecode(uschar *string, uschar **ptrptr)
+{
+int len = 0;
+uschar *ptr;
+
+ptr = *ptrptr = store_get(Ustrlen(string) + 1, string);  /* No longer than this */
+
+while (*string != 0)
+  {
+  int ch = *string++;
+
+  if (ch == '_') *ptr++ = ' ';
+  else if (ch == '=')
+    {
+    int a = *string;
+    int b = (a == 0)? 0 : string[1];
+    if (!isxdigit(a) || !isxdigit(b)) return -1;  /* Bad QP string */
+    *ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
+               Ustrchr(hex_digits, tolower(b)) - hex_digits;
+    string += 2;
+    }
+  else if (ch == ' ' || ch == '\t') return -1;    /* Whitespace is illegal */
+  else *ptr++ = ch;
+
+  len++;
+  }
+
+*ptr = 0;
+return len;
+}
+
+
+
+/*************************************************
+*            Decode next MIME word               *
+*************************************************/
+
+/* Scan a string to see if a MIME word exists; pass back the separator
+points in the string.
+
+Arguments:
+  string     subject string
+  lencheck   TRUE to enforce maximum length check
+  q1ptr      pass back address of first question mark
+  q2ptr      pass back address of second question mark
+  endptr     pass back address of final ?=
+  dlenptr    pass back length of decoded string
+  dptrptr    pass back pointer to decoded string
+
+Returns:     address of =? or NULL if not present
+*/
+
+static uschar *
+decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
+  uschar **endptr, size_t *dlenptr, uschar **dptrptr)
+{
+uschar *mimeword;
+for (;; string = mimeword + 2)
+  {
+  int encoding;
+  int dlen = -1;
+
+  if ((mimeword = Ustrstr(string, "=?"))  == NULL ||
+      (*q1ptr = Ustrchr(mimeword+2, '?')) == NULL ||
+      (*q2ptr = Ustrchr(*q1ptr+1, '?')) == NULL ||
+      (*endptr = Ustrstr(*q2ptr+1, "?=")) == NULL) return NULL;
+
+  /* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
+  length, and that the second field is just one character long. If not,
+  continue the loop to search again. We must start just after the initial =?
+  because we might have found =?xxx=?xxx?xxx?xxx?=. */
+
+  if ((lencheck && *endptr - mimeword > 73) || *q2ptr - *q1ptr != 2) continue;
+
+  /* Get the encoding letter, and decode the data string. */
+
+  encoding = toupper((*q1ptr)[1]);
+  **endptr = 0;
+  if (encoding == 'B')
+    dlen = b64decode(*q2ptr+1, dptrptr);
+  else if (encoding == 'Q')
+    dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
+  **endptr = '?';   /* restore */
+
+  /* If the decoding succeeded, we are done. Set the length of the decoded
+  string, and pass back the initial pointer. Otherwise, the loop continues. */
+
+  if (dlen >= 0)
+    {
+    *dlenptr = (size_t)dlen;
+    return mimeword;
+    }
+  }
+
+/* Control should never actually get here */
+}
+
+
+
+/*************************************************
+*    Decode and convert an RFC 2047 string       *
+*************************************************/
+
+/* There are two functions defined here. The original one was rfc2047_decode()
+and it was documented in the local_scan() interface. I needed to add an extra
+argument for use by expand_string(), so I created rfc2047_decode2() for that
+purpose. The original function became a stub that just supplies NULL for the
+new argument (sizeptr).
+
+An RFC 2047-encoded string may contain one or more "words", each of the
+form  =?...?.?...?=  with the first ... specifying the character code, the
+second being Q (for quoted printable) or B for Base64 encoding. The third ...
+is the actual data.
+
+This function first decodes each "word" into bytes from the Q or B encoding.
+Then, if provided with the name of a charset encoding, and if iconv() is
+available, it attempts to translate the result to the named character set.
+If this fails, the binary string is returned with an error message.
+
+If a binary zero is encountered in the decoded string, it is replaced by the
+contents of the zeroval argument. For use with Exim headers, the value must not
+be 0 because they are handled as zero-terminated strings. When zeroval==0,
+lenptr should not be NULL.
+
+Arguments:
+    string       the subject string
+    lencheck     TRUE to enforce maximum MIME word length
+    target       the name of the target encoding for MIME words, or NULL for
+                   no charset translation
+    zeroval      the value to use for binary zero bytes
+    lenptr       if not NULL, the length of the result is returned via
+                   this variable
+    sizeptr      if not NULL, the length of a new store block in which the
+                   result is built is placed here; if no new store is obtained,
+                   the value is not changed
+    error        for error messages; NULL if no problem; this can be set
+                   when the yield is non-NULL if there was a charset
+                   translation problem
+
+Returns:         the decoded, converted string, or NULL on error; if there are
+                   no MIME words in the string, the original string is returned
+*/
+
+uschar *
+rfc2047_decode2(uschar *string, BOOL lencheck, const uschar *target,
+  int zeroval, int *lenptr, int *sizeptr, uschar **error)
+{
+int size = Ustrlen(string);
+size_t dlen;
+uschar *dptr;
+gstring *yield;
+uschar *mimeword, *q1, *q2, *endword;
+
+*error = NULL;
+mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
+
+if (!mimeword)
+  {
+  if (lenptr) *lenptr = size;
+  return string;
+  }
+
+/* Scan through the string, decoding MIME words and copying intermediate text,
+building the result as we go. The result may be longer than the input if it is
+translated into a multibyte code such as UTF-8. That's why we use the dynamic
+string building code. */
+
+yield = store_get(sizeof(gstring) + ++size, string);
+yield->size = size;
+yield->ptr = 0;
+yield->s = US(yield + 1);
+
+while (mimeword)
+  {
+
+  #if HAVE_ICONV
+  iconv_t icd = (iconv_t)(-1);
+  #endif
+
+  if (mimeword != string)
+    yield = string_catn(yield, string, mimeword - string);
+/*XXX that might have to convert an untainted string to a tainted one */
+
+  /* Do a charset translation if required. This is supported only on hosts
+  that have the iconv() function. Translation errors set error, but carry on,
+  using the untranslated data. If there is more than one error, the message
+  passed back refers to the final one. We use a loop to cater for the case
+  of long strings - the RFC puts limits on the length, but it's best to be
+  robust. */
+
+  #if HAVE_ICONV
+  *q1 = 0;
+  if (target && strcmpic(target, mimeword+2) != 0)
+    if ((icd = iconv_open(CS target, CS(mimeword+2))) == (iconv_t)-1)
+      *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
+        target, mimeword+2, strerror(errno),
+        (errno == EINVAL)? " (maybe unsupported conversion)" : "");
+  *q1 = '?';
+  #endif
+
+  while (dlen > 0)
+    {
+    uschar *tptr = NULL;   /* Stops compiler warning */
+    int tlen = -1;
+
+    #if HAVE_ICONV
+    uschar tbuffer[256];
+    uschar *outptr = tbuffer;
+    size_t outleft = sizeof(tbuffer);
+
+    /* If translation is required, go for it. */
+
+    if (icd != (iconv_t)(-1))
+      {
+      (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);
+
+      /* If outptr has been adjusted, there is some output. Set up to add it to
+      the output buffer. The function will have adjusted dptr and dlen. If
+      iconv() stopped because of an error, we'll pick it up next time when
+      there's no output.
+
+      If there is no output, we expect there to have been a translation
+      error, because we know there was at least one input byte. We leave the
+      value of tlen as -1, which causes the rest of the input to be copied
+      verbatim. */
+
+      if (outptr > tbuffer)
+        {
+        tptr = tbuffer;
+        tlen = outptr - tbuffer;
+        }
+      else
+        {
+        DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
+        "%s\n", (int)(endword + 2 - mimeword), mimeword, target, strerror(errno));
+        }
+      }
+
+    #endif
+
+    /* No charset translation is happening or there was a translation error;
+    just set up the original as the string to be added, and mark it all used.
+    */
+
+    if (tlen == -1)
+      {
+      tptr = dptr;
+      tlen = dlen;
+      dlen = 0;
+      }
+
+    /* Deal with zero values; convert them if requested. */
+
+    if (zeroval != 0)
+      for (int i = 0; i < tlen; i++)
+        if (tptr[i] == 0) tptr[i] = zeroval;
+
+    /* Add the new string onto the result */
+
+    yield = string_catn(yield, tptr, tlen);
+    }
+
+  #if HAVE_ICONV
+  if (icd != (iconv_t)(-1))  iconv_close(icd);
+  #endif
+
+  /* Update string past the MIME word; skip any white space if the next thing
+  is another MIME word. */
+
+  string = endword + 2;
+  mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
+  if (mimeword)
+    {
+    uschar *s = string;
+    while (isspace(*s)) s++;
+    if (s == mimeword) string = s;
+    }
+  }
+
+/* Copy the remaining characters of the string, zero-terminate it, and return
+the length as well if requested. */
+
+yield = string_cat(yield, string);
+
+if (lenptr) *lenptr = yield->ptr;
+if (sizeptr) *sizeptr = yield->size;
+return string_from_gstring(yield);
+}
+
+
+/* This is the stub that provides the original interface without the sizeptr
+argument. */
+
+uschar *
+rfc2047_decode(uschar *string, BOOL lencheck, const uschar *target, int zeroval,
+  int *lenptr, uschar **error)
+{
+return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
+}
+
+/* End of rfc2047.c */