1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
From 1209e3e19e292cee517e43a2ccfe9b44b33bb1dc Mon Sep 17 00:00:00 2001
From: Jasen Betts <jasen@xnet.co.nz>
Date: Sun, 23 Jul 2023 13:43:59 +0100
Subject: [PATCH] Expansions: disallow UTF-16 surrogates from ${utf8clean:...}.
Bug 2998
---
doc/ChangeLog | 4 ++++
src/expand.c | 27 +++++++++++++++++----------
2 files changed, 21 insertions(+), 10 deletions(-)
--- a/src/expand.c
+++ b/src/expand.c
@@ -7731,11 +7731,11 @@ NOT_ITEM: ;
case EOP_UTF8CLEAN:
{
int seq_len = 0, index = 0;
int bytes_left = 0;
- long codepoint = -1;
+ ulong codepoint = (ulong)-1;
int complete;
uschar seq_buff[4]; /* accumulate utf-8 here */
/* Manually track tainting, as we deal in individual chars below */
@@ -7761,40 +7761,47 @@ NOT_ITEM: ;
codepoint = (codepoint << 6) | (c & 0x3f);
seq_buff[index++] = c;
if (--bytes_left == 0) /* codepoint complete */
if(codepoint > 0x10FFFF) /* is it too large? */
complete = -1; /* error (RFC3629 limit) */
+ else if ( (codepoint & 0x1FF800 ) == 0xD800 ) /* surrogate */
+ /* A UTF-16 surrogate (which should be one of a pair that
+ encode a Unicode codepoint that is outside the Basic
+ Multilingual Plane). Error, not UTF8.
+ RFC2279.2 is slightly unclear on this, but
+ https://unicodebook.readthedocs.io/issues.html#strict-utf8-decoder
+ says "Surrogates characters are also invalid in UTF-8:
+ characters in U+D800—U+DFFF have to be rejected." */
+ complete = -1;
else
{ /* finished; output utf-8 sequence */
yield = string_catn(yield, seq_buff, seq_len);
index = 0;
}
}
}
else /* no bytes left: new sequence */
{
- if(!(c & 0x80)) /* 1-byte sequence, US-ASCII, keep it */
+ if (!(c & 0x80)) /* 1-byte sequence, US-ASCII, keep it */
{
yield = string_catn(yield, &c, 1);
continue;
}
- if((c & 0xe0) == 0xc0) /* 2-byte sequence */
- {
- if(c == 0xc0 || c == 0xc1) /* 0xc0 and 0xc1 are illegal */
+ if ((c & 0xe0) == 0xc0) /* 2-byte sequence */
+ if (c == 0xc0 || c == 0xc1) /* 0xc0 and 0xc1 are illegal */
complete = -1;
else
{
- bytes_left = 1;
- codepoint = c & 0x1f;
+ bytes_left = 1;
+ codepoint = c & 0x1f;
}
- }
- else if((c & 0xf0) == 0xe0) /* 3-byte sequence */
+ else if ((c & 0xf0) == 0xe0) /* 3-byte sequence */
{
bytes_left = 2;
codepoint = c & 0x0f;
}
- else if((c & 0xf8) == 0xf0) /* 4-byte sequence */
+ else if ((c & 0xf8) == 0xf0) /* 4-byte sequence */
{
bytes_left = 3;
codepoint = c & 0x07;
}
else /* invalid or too long (RFC3629 allows only 4 bytes) */
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -69,10 +69,13 @@ JH/28 Bug 2996: Fix a crash in the smtp
to close it tried to use an uninitialized variable. This would afftect
high-volume sites more, especially when running mailing-list-style loads.
Pollution of logs was the major effect, as the other process delivered
the message. Found and partly investigated by Graeme Fowler.
+JH/31 Bug 2998: Fix ${utf8clean:...} to disallow UTF-16 surrogate codepoints.
+ Found and fixed by Jasen Betts. No testcase for this as my usual text
+ editor insists on emitting only valid UTF-8.
Exim version 4.96
-----------------
JH/01 Move the wait-for-next-tick (needed for unique message IDs) from
|