summary refs log tree commit diff stats
path: root/debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch
diff options
context:
space:
mode:
Diffstat (limited to 'debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch')
-rw-r--r-- debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch 99
1 files changed, 99 insertions, 0 deletions
diff --git a/debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch b/debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch
new file mode 100644
index 0000000..713435b
--- /dev/null
+++ b/debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch
@@ -0,0 +1,99 @@
+From 1209e3e19e292cee517e43a2ccfe9b44b33bb1dc Mon Sep 17 00:00:00 2001
+From: Jasen Betts <jasen@xnet.co.nz>
+Date: Sun, 23 Jul 2023 13:43:59 +0100
+Subject: [PATCH] Expansions: disallow UTF-16 surrogates from ${utf8clean:...}.
+ Bug 2998
+
+---
+ doc/ChangeLog | 4 ++++
+ src/expand.c | 27 +++++++++++++++++----------
+ 2 files changed, 21 insertions(+), 10 deletions(-)
+
+--- a/src/expand.c
++++ b/src/expand.c
+@@ -7731,11 +7731,11 @@ NOT_ITEM: ;
+
+ case EOP_UTF8CLEAN:
+ {
+ int seq_len = 0, index = 0;
+ int bytes_left = 0;
+- long codepoint = -1;
++ ulong codepoint = (ulong)-1;
+ int complete;
+ uschar seq_buff[4]; /* accumulate utf-8 here */
+
+ /* Manually track tainting, as we deal in individual chars below */
+
+@@ -7761,40 +7761,47 @@ NOT_ITEM: ;
+ codepoint = (codepoint << 6) | (c & 0x3f);
+ seq_buff[index++] = c;
+ if (--bytes_left == 0) /* codepoint complete */
+ if(codepoint > 0x10FFFF) /* is it too large? */
+ complete = -1; /* error (RFC3629 limit) */
++ else if ( (codepoint & 0x1FF800 ) == 0xD800 ) /* surrogate */
++ /* A UTF-16 surrogate (which should be one of a pair that
++ encode a Unicode codepoint that is outside the Basic
++ Multilingual Plane). Error, not UTF8.
++ RFC2279.2 is unclear on this, but RFC3629.3 prohibits it, and
++ https://unicodebook.readthedocs.io/issues.html#strict-utf8-decoder
++ says "Surrogates characters are also invalid in UTF-8:
++ characters in U+D800—U+DFFF have to be rejected." */
++ complete = -1;
+ else
+ { /* finished; output utf-8 sequence */
+ yield = string_catn(yield, seq_buff, seq_len);
+ index = 0;
+ }
+ }
+ }
+ else /* no bytes left: new sequence */
+ {
+- if(!(c & 0x80)) /* 1-byte sequence, US-ASCII, keep it */
++ if (!(c & 0x80)) /* 1-byte sequence, US-ASCII, keep it */
+ {
+ yield = string_catn(yield, &c, 1);
+ continue;
+ }
+- if((c & 0xe0) == 0xc0) /* 2-byte sequence */
+- {
+- if(c == 0xc0 || c == 0xc1) /* 0xc0 and 0xc1 are illegal */
++ if ((c & 0xe0) == 0xc0) /* 2-byte sequence */
++ if (c == 0xc0 || c == 0xc1) /* 0xc0 and 0xc1 are illegal */
+ complete = -1;
+ else
+ {
+- bytes_left = 1;
+- codepoint = c & 0x1f;
++ bytes_left = 1;
++ codepoint = c & 0x1f;
+ }
+- }
+- else if((c & 0xf0) == 0xe0) /* 3-byte sequence */
++ else if ((c & 0xf0) == 0xe0) /* 3-byte sequence */
+ {
+ bytes_left = 2;
+ codepoint = c & 0x0f;
+ }
+- else if((c & 0xf8) == 0xf0) /* 4-byte sequence */
++ else if ((c & 0xf8) == 0xf0) /* 4-byte sequence */
+ {
+ bytes_left = 3;
+ codepoint = c & 0x07;
+ }
+ else /* invalid or too long (RFC3629 allows only 4 bytes) */
+--- a/doc/ChangeLog
++++ b/doc/ChangeLog
+@@ -69,10 +69,13 @@ JH/28 Bug 2996: Fix a crash in the smtp
+ to close it tried to use an uninitialized variable. This would afftect
+ high-volume sites more, especially when running mailing-list-style loads.
+ Pollution of logs was the major effect, as the other process delivered
+ the message. Found and partly investigated by Graeme Fowler.
+
++JH/31 Bug 2998: Fix ${utf8clean:...} to disallow UTF-16 surrogate codepoints.
++ Found and fixed by Jasen Betts. No testcase for this as my usual text
++ editor insists on emitting only valid UTF-8.
+
+ Exim version 4.96
+ -----------------
+
+ JH/01 Move the wait-for-next-tick (needed for unique message IDs) from