summaryrefslogtreecommitdiffstats
path: root/src/rfc2047.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 16:16:13 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 16:16:13 +0000
commite90fcc54809db2591dc083f43ef54c6ec8c60847 (patch)
treef20bc206c3c2d5d59d37c46c5cf5d53a20642556 /src/rfc2047.c
parentInitial commit. (diff)
downloadexim4-e90fcc54809db2591dc083f43ef54c6ec8c60847.tar.xz
exim4-e90fcc54809db2591dc083f43ef54c6ec8c60847.zip
Adding upstream version 4.96.upstream/4.96upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rfc2047.c')
-rw-r--r--src/rfc2047.c345
1 files changed, 345 insertions, 0 deletions
diff --git a/src/rfc2047.c b/src/rfc2047.c
new file mode 100644
index 0000000..1ed1dd8
--- /dev/null
+++ b/src/rfc2047.c
@@ -0,0 +1,345 @@
+/*************************************************
+* Exim - an Internet mail transport agent *
+*************************************************/
+
+/* Copyright (c) The Exim Maintainers 2020 - 2022 */
+/* Copyright (c) University of Cambridge 1995 - 2018 */
+/* See the file NOTICE for conditions of use and distribution. */
+
+/* This file contains a function for decoding message header lines that may
+contain encoded "words" according to the rules described in
+
+ RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt
+
+The function is a rewritten version of code created by Norihisa Washitake.
+The original could be used both inside Exim (as part of a patch) or in a
+freestanding form. The original contained some built-in code conversions; I
+have chosen only to do code conversions if iconv() is supported by the OS.
+Because there were quite a lot of hacks to be done, for a variety of reasons,
+I rewrote the code.
+
+You can find the latest version of the original library at
+
+ http://washitake.com/mail/exim/mime/
+
+The code below is almost completely unlike the original. */
+
+
+#include "exim.h"
+
+
+/*************************************************
+* Do a QP conversion *
+*************************************************/
+
+/* This function decodes "quoted printable" into bytes.
+
+Arguments:
+ string the string that includes QP escapes
+ ptrptr where to return pointer to the decoded string
+
+Returns: the length of the decoded string, or -1 on failure
+*/
+
+static int
+rfc2047_qpdecode(uschar *string, uschar **ptrptr)
+{
+int len = 0;
+uschar *ptr;
+
+ptr = *ptrptr = store_get(Ustrlen(string) + 1, string); /* No longer than this */
+
+while (*string != 0)
+ {
+ int ch = *string++;
+
+ if (ch == '_') *ptr++ = ' ';
+ else if (ch == '=')
+ {
+ int a = *string;
+ int b = (a == 0)? 0 : string[1];
+ if (!isxdigit(a) || !isxdigit(b)) return -1; /* Bad QP string */
+ *ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
+ Ustrchr(hex_digits, tolower(b)) - hex_digits;
+ string += 2;
+ }
+ else if (ch == ' ' || ch == '\t') return -1; /* Whitespace is illegal */
+ else *ptr++ = ch;
+
+ len++;
+ }
+
+*ptr = 0;
+return len;
+}
+
+
+
+/*************************************************
+* Decode next MIME word *
+*************************************************/
+
+/* Scan a string to see if a MIME word exists; pass back the separator
+points in the string.
+
+Arguments:
+ string subject string
+ lencheck TRUE to enforce maximum length check
+ q1ptr pass back address of first question mark
+ q2ptr pass back address of second question mark
+ endptr pass back address of final ?=
+ dlenptr pass back length of decoded string
+ dptrptr pass back pointer to decoded string
+
+Returns: address of =? or NULL if not present
+*/
+
+static uschar *
+decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
+ uschar **endptr, size_t *dlenptr, uschar **dptrptr)
+{
+uschar *mimeword;
+for (;; string = mimeword + 2)
+ {
+ int encoding;
+ int dlen = -1;
+
+ if ((mimeword = Ustrstr(string, "=?")) == NULL ||
+ (*q1ptr = Ustrchr(mimeword+2, '?')) == NULL ||
+ (*q2ptr = Ustrchr(*q1ptr+1, '?')) == NULL ||
+ (*endptr = Ustrstr(*q2ptr+1, "?=")) == NULL) return NULL;
+
+ /* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
+ length, and that the second field is just one character long. If not,
+ continue the loop to search again. We must start just after the initial =?
+ because we might have found =?xxx=?xxx?xxx?xxx?=. */
+
+ if ((lencheck && *endptr - mimeword > 73) || *q2ptr - *q1ptr != 2) continue;
+
+ /* Get the encoding letter, and decode the data string. */
+
+ encoding = toupper((*q1ptr)[1]);
+ **endptr = 0;
+ if (encoding == 'B')
+ dlen = b64decode(*q2ptr+1, dptrptr);
+ else if (encoding == 'Q')
+ dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
+ **endptr = '?'; /* restore */
+
+ /* If the decoding succeeded, we are done. Set the length of the decoded
+ string, and pass back the initial pointer. Otherwise, the loop continues. */
+
+ if (dlen >= 0)
+ {
+ *dlenptr = (size_t)dlen;
+ return mimeword;
+ }
+ }
+
+/* Control should never actually get here */
+}
+
+
+
+/*************************************************
+* Decode and convert an RFC 2047 string *
+*************************************************/
+
+/* There are two functions defined here. The original one was rfc2047_decode()
+and it was documented in the local_scan() interface. I needed to add an extra
+argument for use by expand_string(), so I created rfc2047_decode2() for that
+purpose. The original function became a stub that just supplies NULL for the
+new argument (sizeptr).
+
+An RFC 2047-encoded string may contain one or more "words", each of the
+form =?...?.?...?= with the first ... specifying the character code, the
+second being Q (for quoted printable) or B for Base64 encoding. The third ...
+is the actual data.
+
+This function first decodes each "word" into bytes from the Q or B encoding.
+Then, if provided with the name of a charset encoding, and if iconv() is
+available, it attempts to translate the result to the named character set.
+If this fails, the binary string is returned with an error message.
+
+If a binary zero is encountered in the decoded string, it is replaced by the
+contents of the zeroval argument. For use with Exim headers, the value must not
+be 0 because they are handled as zero-terminated strings. When zeroval==0,
+lenptr should not be NULL.
+
+Arguments:
+ string the subject string
+ lencheck TRUE to enforce maximum MIME word length
+ target the name of the target encoding for MIME words, or NULL for
+ no charset translation
+ zeroval the value to use for binary zero bytes
+ lenptr if not NULL, the length of the result is returned via
+ this variable
+ sizeptr if not NULL, the length of a new store block in which the
+ result is built is placed here; if no new store is obtained,
+ the value is not changed
+ error for error messages; NULL if no problem; this can be set
+ when the yield is non-NULL if there was a charset
+ translation problem
+
+Returns: the decoded, converted string, or NULL on error; if there are
+ no MIME words in the string, the original string is returned
+*/
+
+uschar *
+rfc2047_decode2(uschar *string, BOOL lencheck, const uschar *target,
+ int zeroval, int *lenptr, int *sizeptr, uschar **error)
+{
+int size = Ustrlen(string);
+size_t dlen;
+uschar *dptr;
+gstring *yield;
+uschar *mimeword, *q1, *q2, *endword;
+
+*error = NULL;
+mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
+
+if (!mimeword)
+ {
+ if (lenptr) *lenptr = size;
+ return string;
+ }
+
+/* Scan through the string, decoding MIME words and copying intermediate text,
+building the result as we go. The result may be longer than the input if it is
+translated into a multibyte code such as UTF-8. That's why we use the dynamic
+string building code. */
+
+yield = store_get(sizeof(gstring) + ++size, string);
+yield->size = size;
+yield->ptr = 0;
+yield->s = US(yield + 1);
+
+while (mimeword)
+ {
+
+ #if HAVE_ICONV
+ iconv_t icd = (iconv_t)(-1);
+ #endif
+
+ if (mimeword != string)
+ yield = string_catn(yield, string, mimeword - string);
+/*XXX that might have to convert an untainted string to a tainted one */
+
+ /* Do a charset translation if required. This is supported only on hosts
+ that have the iconv() function. Translation errors set error, but carry on,
+ using the untranslated data. If there is more than one error, the message
+ passed back refers to the final one. We use a loop to cater for the case
+ of long strings - the RFC puts limits on the length, but it's best to be
+ robust. */
+
+ #if HAVE_ICONV
+ *q1 = 0;
+ if (target && strcmpic(target, mimeword+2) != 0)
+ if ((icd = iconv_open(CS target, CS(mimeword+2))) == (iconv_t)-1)
+ *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
+ target, mimeword+2, strerror(errno),
+ (errno == EINVAL)? " (maybe unsupported conversion)" : "");
+ *q1 = '?';
+ #endif
+
+ while (dlen > 0)
+ {
+ uschar *tptr = NULL; /* Stops compiler warning */
+ int tlen = -1;
+
+ #if HAVE_ICONV
+ uschar tbuffer[256];
+ uschar *outptr = tbuffer;
+ size_t outleft = sizeof(tbuffer);
+
+ /* If translation is required, go for it. */
+
+ if (icd != (iconv_t)(-1))
+ {
+ (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);
+
+ /* If outptr has been adjusted, there is some output. Set up to add it to
+ the output buffer. The function will have adjusted dptr and dlen. If
+ iconv() stopped because of an error, we'll pick it up next time when
+ there's no output.
+
+ If there is no output, we expect there to have been a translation
+ error, because we know there was at least one input byte. We leave the
+ value of tlen as -1, which causes the rest of the input to be copied
+ verbatim. */
+
+ if (outptr > tbuffer)
+ {
+ tptr = tbuffer;
+ tlen = outptr - tbuffer;
+ }
+ else
+ {
+ DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
+ "%s\n", (int)(endword + 2 - mimeword), mimeword, target, strerror(errno));
+ }
+ }
+
+ #endif
+
+ /* No charset translation is happening or there was a translation error;
+ just set up the original as the string to be added, and mark it all used.
+ */
+
+ if (tlen == -1)
+ {
+ tptr = dptr;
+ tlen = dlen;
+ dlen = 0;
+ }
+
+ /* Deal with zero values; convert them if requested. */
+
+ if (zeroval != 0)
+ for (int i = 0; i < tlen; i++)
+ if (tptr[i] == 0) tptr[i] = zeroval;
+
+ /* Add the new string onto the result */
+
+ yield = string_catn(yield, tptr, tlen);
+ }
+
+ #if HAVE_ICONV
+ if (icd != (iconv_t)(-1)) iconv_close(icd);
+ #endif
+
+ /* Update string past the MIME word; skip any white space if the next thing
+ is another MIME word. */
+
+ string = endword + 2;
+ mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
+ if (mimeword)
+ {
+ uschar *s = string;
+ while (isspace(*s)) s++;
+ if (s == mimeword) string = s;
+ }
+ }
+
+/* Copy the remaining characters of the string, zero-terminate it, and return
+the length as well if requested. */
+
+yield = string_cat(yield, string);
+
+if (lenptr) *lenptr = yield->ptr;
+if (sizeptr) *sizeptr = yield->size;
+return string_from_gstring(yield);
+}
+
+
+/* This is the stub that provides the original interface without the sizeptr
+argument. */
+
+uschar *
+rfc2047_decode(uschar *string, BOOL lencheck, const uschar *target, int zeroval,
+ int *lenptr, uschar **error)
+{
+return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
+}
+
+/* End of rfc2047.c */