1 files changed, 99 insertions, 0 deletions
diff --git a/debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch b/debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch
new file mode 100644
index 0000000..713435b
--- /dev/null
+++ b/debian/patches/75_76-Expansions-disallow-UTF-16-surrogates-from-utf8clean.patch
@@ -0,0 +1,99 @@
+From 1209e3e19e292cee517e43a2ccfe9b44b33bb1dc Mon Sep 17 00:00:00 2001
+From: Jasen Betts <jasen@xnet.co.nz>
+Date: Sun, 23 Jul 2023 13:43:59 +0100
+Subject: [PATCH] Expansions: disallow UTF-16 surrogates from ${utf8clean:...}.
+  Bug 2998
+
+---
+ doc/ChangeLog |  3 +++
+ src/expand.c  | 27 +++++++++++++++++----------
+ 2 files changed, 20 insertions(+), 10 deletions(-)
+
+--- a/src/expand.c
++++ b/src/expand.c
+@@ -7731,11 +7731,11 @@ NOT_ITEM: ;
+ 
+ 	case EOP_UTF8CLEAN:
+ 	  {
+ 	  int seq_len = 0, index = 0;
+ 	  int bytes_left = 0;
+-	  long codepoint = -1;
++	  ulong codepoint = (ulong)-1;
+ 	  int complete;
+ 	  uschar seq_buff[4];			/* accumulate utf-8 here */
+ 
+ 	  /* Manually track tainting, as we deal in individual chars below */
+ 
+@@ -7761,40 +7761,47 @@ NOT_ITEM: ;
+ 		codepoint = (codepoint << 6) | (c & 0x3f);
+ 		seq_buff[index++] = c;
+ 		if (--bytes_left == 0)		/* codepoint complete */
+ 		  if(codepoint > 0x10FFFF)	/* is it too large? */
+ 		    complete = -1;	/* error (RFC3629 limit) */
++		  else if ( (codepoint & 0x1FF800 ) == 0xD800 ) /* surrogate */
++		    /* A UTF-16 surrogate (which should be one of a pair that
++		    encode a Unicode codepoint that is outside the Basic
++		    Multilingual Plane).  Error, not UTF8.
+		    RFC2279.2 is slightly unclear, but RFC3629.3 forbids it and
++		    https://unicodebook.readthedocs.io/issues.html#strict-utf8-decoder
++		    says "Surrogates characters are also invalid in UTF-8:
++		    characters in U+D800—U+DFFF have to be rejected." */
++		    complete = -1;
+ 		  else
+ 		    {		/* finished; output utf-8 sequence */
+ 		    yield = string_catn(yield, seq_buff, seq_len);
+ 		    index = 0;
+ 		    }
+ 		}
+ 	      }
+ 	    else	/* no bytes left: new sequence */
+ 	      {
+-	      if(!(c & 0x80))	/* 1-byte sequence, US-ASCII, keep it */
++	      if (!(c & 0x80))	/* 1-byte sequence, US-ASCII, keep it */
+ 		{
+ 		yield = string_catn(yield, &c, 1);
+ 		continue;
+ 		}
+-	      if((c & 0xe0) == 0xc0)		/* 2-byte sequence */
+-		{
+-		if(c == 0xc0 || c == 0xc1)	/* 0xc0 and 0xc1 are illegal */
++	      if ((c & 0xe0) == 0xc0)		/* 2-byte sequence */
++		if (c == 0xc0 || c == 0xc1)	/* 0xc0 and 0xc1 are illegal */
+ 		  complete = -1;
+ 		else
+ 		  {
+-		    bytes_left = 1;
+-		    codepoint = c & 0x1f;
++		  bytes_left = 1;
++		  codepoint = c & 0x1f;
+ 		  }
+-		}
+-	      else if((c & 0xf0) == 0xe0)		/* 3-byte sequence */
++	      else if ((c & 0xf0) == 0xe0)		/* 3-byte sequence */
+ 		{
+ 		bytes_left = 2;
+ 		codepoint = c & 0x0f;
+ 		}
+-	      else if((c & 0xf8) == 0xf0)		/* 4-byte sequence */
++	      else if ((c & 0xf8) == 0xf0)		/* 4-byte sequence */
+ 		{
+ 		bytes_left = 3;
+ 		codepoint = c & 0x07;
+ 		}
+ 	      else	/* invalid or too long (RFC3629 allows only 4 bytes) */
+--- a/doc/ChangeLog
++++ b/doc/ChangeLog
+@@ -69,10 +69,13 @@ JH/28 Bug 2996: Fix a crash in the smtp
+       to close it tried to use an uninitialized variable.  This would afftect
+       high-volume sites more, especially when running mailing-list-style loads.
+       Pollution of logs was the major effect, as the other process delivered
+       the message.  Found and partly investigated by Graeme Fowler.
+ 
++JH/31 Bug 2998: Fix ${utf8clean:...} to disallow UTF-16 surrogate codepoints.
++      Found and fixed by Jasen Betts. No testcase for this as my usual text
++      editor insists on emitting only valid UTF-8.
+ 
+ Exim version 4.96
+ -----------------
+ 
+ JH/01 Move the wait-for-next-tick (needed for unique message IDs) from