src/rfc2047.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345

/*************************************************
*     Exim - an Internet mail transport agent    *
*************************************************/

/* Copyright (c) The Exim Maintainers 2020 - 2022 */
/* Copyright (c) University of Cambridge 1995 - 2018 */
/* See the file NOTICE for conditions of use and distribution. */

/* This file contains a function for decoding message header lines that may
contain encoded "words" according to the rules described in

  RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt

The function is a rewritten version of code created by Norihisa Washitake.
The original could be used both inside Exim (as part of a patch) or in a
freestanding form. The original contained some built-in code conversions; I
have chosen only to do code conversions if iconv() is supported by the OS.
Because there were quite a lot of hacks to be done, for a variety of reasons,
I rewrote the code.

You can find the latest version of the original library at

  http://washitake.com/mail/exim/mime/

The code below is almost completely unlike the original. */


#include "exim.h"


/*************************************************
*                Do a QP conversion              *
*************************************************/

/* This function decodes "quoted printable" into bytes.

Arguments:
  string      the string that includes QP escapes
  ptrptr      where to return pointer to the decoded string

Returns:      the length of the decoded string, or -1 on failure
*/

static int
rfc2047_qpdecode(uschar *string, uschar **ptrptr)
{
int len = 0;
uschar *ptr;

ptr = *ptrptr = store_get(Ustrlen(string) + 1, string);  /* No longer than this */

while (*string != 0)
  {
  int ch = *string++;

  if (ch == '_') *ptr++ = ' ';
  else if (ch == '=')
    {
    int a = *string;
    int b = (a == 0)? 0 : string[1];
    if (!isxdigit(a) || !isxdigit(b)) return -1;  /* Bad QP string */
    *ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
               Ustrchr(hex_digits, tolower(b)) - hex_digits;
    string += 2;
    }
  else if (ch == ' ' || ch == '\t') return -1;    /* Whitespace is illegal */
  else *ptr++ = ch;

  len++;
  }

*ptr = 0;
return len;
}


/*************************************************
*            Decode next MIME word               *
*************************************************/

/* Scan a string to see if a MIME word exists; pass back the separator
points in the string.

Arguments:
  string     subject string
  lencheck   TRUE to enforce maximum length check
  q1ptr      pass back address of first question mark
  q2ptr      pass back address of second question mark
  endptr     pass back address of final ?=
  dlenptr    pass back length of decoded string
  dptrptr    pass back pointer to decoded string

Returns:     address of =? or NULL if not present
*/

static uschar *
decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
  uschar **endptr, size_t *dlenptr, uschar **dptrptr)
{
uschar *mimeword;
for (;; string = mimeword + 2)
  {
  int encoding;
  int dlen = -1;

  if ((mimeword = Ustrstr(string, "=?"))  == NULL ||
      (*q1ptr = Ustrchr(mimeword+2, '?')) == NULL ||
      (*q2ptr = Ustrchr(*q1ptr+1, '?')) == NULL ||
      (*endptr = Ustrstr(*q2ptr+1, "?=")) == NULL) return NULL;

  /* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
  length, and that the second field is just one character long. If not,
  continue the loop to search again. We must start just after the initial =?
  because we might have found =?xxx=?xxx?xxx?xxx?=. */

  if ((lencheck && *endptr - mimeword > 73) || *q2ptr - *q1ptr != 2) continue;

  /* Get the encoding letter, and decode the data string. */

  encoding = toupper((*q1ptr)[1]);
  **endptr = 0;
  if (encoding == 'B')
    dlen = b64decode(*q2ptr+1, dptrptr);
  else if (encoding == 'Q')
    dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
  **endptr = '?';   /* restore */

  /* If the decoding succeeded, we are done. Set the length of the decoded
  string, and pass back the initial pointer. Otherwise, the loop continues. */

  if (dlen >= 0)
    {
    *dlenptr = (size_t)dlen;
    return mimeword;
    }
  }

/* Control should never actually get here */
}


/*************************************************
*    Decode and convert an RFC 2047 string       *
*************************************************/

/* There are two functions defined here. The original one was rfc2047_decode()
and it was documented in the local_scan() interface. I needed to add an extra
argument for use by expand_string(), so I created rfc2047_decode2() for that
purpose. The original function became a stub that just supplies NULL for the
new argument (sizeptr).

An RFC 2047-encoded string may contain one or more "words", each of the
form  =?...?.?...?=  with the first ... specifying the character code, the
second being Q (for quoted printable) or B for Base64 encoding. The third ...
is the actual data.

This function first decodes each "word" into bytes from the Q or B encoding.
Then, if provided with the name of a charset encoding, and if iconv() is
available, it attempts to translate the result to the named character set.
If this fails, the binary string is returned with an error message.

If a binary zero is encountered in the decoded string, it is replaced by the
contents of the zeroval argument. For use with Exim headers, the value must not
be 0 because they are handled as zero-terminated strings. When zeroval==0,
lenptr should not be NULL.

Arguments:
    string       the subject string
    lencheck     TRUE to enforce maximum MIME word length
    target       the name of the target encoding for MIME words, or NULL for
                   no charset translation
    zeroval      the value to use for binary zero bytes
    lenptr       if not NULL, the length of the result is returned via
                   this variable
    sizeptr      if not NULL, the length of a new store block in which the
                   result is built is placed here; if no new store is obtained,
                   the value is not changed
    error        for error messages; NULL if no problem; this can be set
                   when the yield is non-NULL if there was a charset
                   translation problem

Returns:         the decoded, converted string, or NULL on error; if there are
                   no MIME words in the string, the original string is returned
*/

uschar *
rfc2047_decode2(uschar *string, BOOL lencheck, const uschar *target,
  int zeroval, int *lenptr, int *sizeptr, uschar **error)
{
int size = Ustrlen(string);
size_t dlen;
uschar *dptr;
gstring *yield;
uschar *mimeword, *q1, *q2, *endword;

*error = NULL;
mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);

if (!mimeword)
  {
  if (lenptr) *lenptr = size;
  return string;
  }

/* Scan through the string, decoding MIME words and copying intermediate text,
building the result as we go. The result may be longer than the input if it is
translated into a multibyte code such as UTF-8. That's why we use the dynamic
string building code. */

yield = store_get(sizeof(gstring) + ++size, string);
yield->size = size;
yield->ptr = 0;
yield->s = US(yield + 1);

while (mimeword)
  {

  #if HAVE_ICONV
  iconv_t icd = (iconv_t)(-1);
  #endif

  if (mimeword != string)
    yield = string_catn(yield, string, mimeword - string);
/*XXX that might have to convert an untainted string to a tainted one */

  /* Do a charset translation if required. This is supported only on hosts
  that have the iconv() function. Translation errors set error, but carry on,
  using the untranslated data. If there is more than one error, the message
  passed back refers to the final one. We use a loop to cater for the case
  of long strings - the RFC puts limits on the length, but it's best to be
  robust. */

  #if HAVE_ICONV
  *q1 = 0;
  if (target && strcmpic(target, mimeword+2) != 0)
    if ((icd = iconv_open(CS target, CS(mimeword+2))) == (iconv_t)-1)
      *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
        target, mimeword+2, strerror(errno),
        (errno == EINVAL)? " (maybe unsupported conversion)" : "");
  *q1 = '?';
  #endif

  while (dlen > 0)
    {
    uschar *tptr = NULL;   /* Stops compiler warning */
    int tlen = -1;

    #if HAVE_ICONV
    uschar tbuffer[256];
    uschar *outptr = tbuffer;
    size_t outleft = sizeof(tbuffer);

    /* If translation is required, go for it. */

    if (icd != (iconv_t)(-1))
      {
      (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);

      /* If outptr has been adjusted, there is some output. Set up to add it to
      the output buffer. The function will have adjusted dptr and dlen. If
      iconv() stopped because of an error, we'll pick it up next time when
      there's no output.

      If there is no output, we expect there to have been a translation
      error, because we know there was at least one input byte. We leave the
      value of tlen as -1, which causes the rest of the input to be copied
      verbatim. */

      if (outptr > tbuffer)
        {
        tptr = tbuffer;
        tlen = outptr - tbuffer;
        }
      else
        {
        DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
        "%s\n", (int)(endword + 2 - mimeword), mimeword, target, strerror(errno));
        }
      }

    #endif

    /* No charset translation is happening or there was a translation error;
    just set up the original as the string to be added, and mark it all used.
    */

    if (tlen == -1)
      {
      tptr = dptr;
      tlen = dlen;
      dlen = 0;
      }

    /* Deal with zero values; convert them if requested. */

    if (zeroval != 0)
      for (int i = 0; i < tlen; i++)
        if (tptr[i] == 0) tptr[i] = zeroval;

    /* Add the new string onto the result */

    yield = string_catn(yield, tptr, tlen);
    }

  #if HAVE_ICONV
  if (icd != (iconv_t)(-1))  iconv_close(icd);
  #endif

  /* Update string past the MIME word; skip any white space if the next thing
  is another MIME word. */

  string = endword + 2;
  mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
  if (mimeword)
    {
    uschar *s = string;
    while (isspace(*s)) s++;
    if (s == mimeword) string = s;
    }
  }

/* Copy the remaining characters of the string, zero-terminate it, and return
the length as well if requested. */

yield = string_cat(yield, string);

if (lenptr) *lenptr = yield->ptr;
if (sizeptr) *sizeptr = yield->size;
return string_from_gstring(yield);
}


/* This is the stub that provides the original interface without the sizeptr
argument. */

uschar *
rfc2047_decode(uschar *string, BOOL lencheck, const uschar *target, int zeroval,
  int *lenptr, uschar **error)
{
return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
}

/* End of rfc2047.c */