diff options
Diffstat (limited to 'debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch')
-rw-r--r-- | debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch | 99 |
1 files changed, 99 insertions, 0 deletions
diff --git a/debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch b/debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch new file mode 100644 index 0000000..713435b --- /dev/null +++ b/debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch @@ -0,0 +1,99 @@ +From 1209e3e19e292cee517e43a2ccfe9b44b33bb1dc Mon Sep 17 00:00:00 2001 +From: Jasen Betts <jasen@xnet.co.nz> +Date: Sun, 23 Jul 2023 13:43:59 +0100 +Subject: [PATCH] Expansions: disallow UTF-16 surrogates from ${utf8clean:...}. + Bug 2998 + +--- + doc/ChangeLog | 4 ++++ + src/expand.c | 27 +++++++++++++++++---------- + 2 files changed, 21 insertions(+), 10 deletions(-) + +--- a/src/expand.c ++++ b/src/expand.c +@@ -7731,11 +7731,11 @@ NOT_ITEM: ; + + case EOP_UTF8CLEAN: + { + int seq_len = 0, index = 0; + int bytes_left = 0; +- long codepoint = -1; ++ ulong codepoint = (ulong)-1; + int complete; + uschar seq_buff[4]; /* accumulate utf-8 here */ + + /* Manually track tainting, as we deal in individual chars below */ + +@@ -7761,40 +7761,47 @@ NOT_ITEM: ; + codepoint = (codepoint << 6) | (c & 0x3f); + seq_buff[index++] = c; + if (--bytes_left == 0) /* codepoint complete */ + if(codepoint > 0x10FFFF) /* is it too large? */ + complete = -1; /* error (RFC3629 limit) */ ++ else if ( (codepoint & 0x1FF800 ) == 0xD800 ) /* surrogate */ ++ /* A UTF-16 surrogate (which should be one of a pair that ++ encode a Unicode codepoint that is outside the Basic ++ Multilingual Plane). Error, not UTF8. ++ RFC2279.2 is slightly unclear on this, but ++ https://unicodebook.readthedocs.io/issues.html#strict-utf8-decoder ++ says "Surrogates characters are also invalid in UTF-8: ++ characters in U+D800—U+DFFF have to be rejected." */ ++ complete = -1; + else + { /* finished; output utf-8 sequence */ + yield = string_catn(yield, seq_buff, seq_len); + index = 0; + } + } + } + else /* no bytes left: new sequence */ + { +- if(!(c & 0x80)) /* 1-byte sequence, US-ASCII, keep it */ ++ if (!(c & 0x80)) /* 1-byte sequence, US-ASCII, keep it */ + { + yield = string_catn(yield, &c, 1); + continue; + } +- if((c & 0xe0) == 0xc0) /* 2-byte sequence */ +- { +- if(c == 0xc0 || c == 0xc1) /* 0xc0 and 0xc1 are illegal */ ++ if ((c & 0xe0) == 0xc0) /* 2-byte sequence */ ++ if (c == 0xc0 || c == 0xc1) /* 0xc0 and 0xc1 are illegal */ + complete = -1; + else + { +- bytes_left = 1; +- codepoint = c & 0x1f; ++ bytes_left = 1; ++ codepoint = c & 0x1f; + } +- } +- else if((c & 0xf0) == 0xe0) /* 3-byte sequence */ ++ else if ((c & 0xf0) == 0xe0) /* 3-byte sequence */ + { + bytes_left = 2; + codepoint = c & 0x0f; + } +- else if((c & 0xf8) == 0xf0) /* 4-byte sequence */ ++ else if ((c & 0xf8) == 0xf0) /* 4-byte sequence */ + { + bytes_left = 3; + codepoint = c & 0x07; + } + else /* invalid or too long (RFC3629 allows only 4 bytes) */ +--- a/doc/ChangeLog ++++ b/doc/ChangeLog +@@ -69,10 +69,13 @@ JH/28 Bug 2996: Fix a crash in the smtp + to close it tried to use an uninitialized variable. This would afftect + high-volume sites more, especially when running mailing-list-style loads. + Pollution of logs was the major effect, as the other process delivered + the message. Found and partly investigated by Graeme Fowler. + ++JH/31 Bug 2998: Fix ${utf8clean:...} to disallow UTF-16 surrogate codepoints. ++ Found and fixed by Jasen Betts. No testcase for this as my usual text ++ editor insists on emitting only valid UTF-8. + + Exim version 4.96 + ----------------- + + JH/01 Move the wait-for-next-tick (needed for unique message IDs) from |