1 files changed, 1515 insertions, 0 deletions
diff --git a/libfreerdp/primitives/prim_YUV_ssse3.c b/libfreerdp/primitives/prim_YUV_ssse3.c
new file mode 100644
index 0000000..2fbef3e
--- /dev/null
+++ b/libfreerdp/primitives/prim_YUV_ssse3.c
@@ -0,0 +1,1515 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Optimized YUV/RGB conversion operations
+ *
+ * Copyright 2014 Thomas Erbesdobler
+ * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
+ * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
+ * Copyright 2016-2017 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include <winpr/crt.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#if !defined(WITH_SSE2)
+#error "This file needs WITH_SSE2 enabled!"
+#endif
+
+static primitives_t* generic = NULL;
+
+/****************************************************************************/
+/* SSSE3 YUV420 -> RGB conversion                                           */
+/****************************************************************************/
+static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
+                                  __m128i Vraw, UINT8 pos)
+{
+	/* Visual Studio 2010 doesn't like _mm_set_epi32 in array initializer list */
+	/* Note: This also applies to Visual Studio 2013 before Update 4 */
+#if !defined(_MSC_VER) || (_MSC_VER > 1600)
+	const __m128i mapY[] = { _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
+		                     _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
+		                     _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
+		                     _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
+	const __m128i mapUV[] = { _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
+		                      _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
+		                      _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
+		                      _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
+	const __m128i mask[] = { _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
+		                     _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
+		                     _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
+#else
+	/* Note: must be in little-endian format ! */
+	const __m128i mapY[] = { { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
+		                       0x80, 0x80, 0x03, 0x80, 0x80 },
+		                     { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80,
+		                       0x80, 0x80, 0x07, 0x80, 0x80 },
+		                     { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80,
+		                       0x80, 0x80, 0x0b, 0x80, 0x80 },
+		                     { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80,
+		                       0x80, 0x80, 0x0f, 0x80, 0x80 }
+
+	};
+	const __m128i mapUV[] = { { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01,
+		                        0x80, 0x02, 0x80, 0x03, 0x80 },
+		                      { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05,
+		                        0x80, 0x06, 0x80, 0x07, 0x80 },
+		                      { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09,
+		                        0x80, 0x0a, 0x80, 0x0b, 0x80 },
+		                      { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d,
+		                        0x80, 0x0e, 0x80, 0x0f, 0x80 } };
+	const __m128i mask[] = { { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02,
+		                       0x80, 0x80, 0x80, 0x03, 0x80 },
+		                     { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
+		                       0x80, 0x80, 0x03, 0x80, 0x80 },
+		                     { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80,
+		                       0x80, 0x03, 0x80, 0x80, 0x80 } };
+#endif
+	const __m128i c128 = _mm_set1_epi16(128);
+	__m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst),
+	                             _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
+	{
+		__m128i C;
+		__m128i D;
+		__m128i E;
+		/* Load Y values and expand to 32 bit */
+		{
+			C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */
+		}
+		/* Load U values and expand to 32 bit */
+		{
+			const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */
+			D = _mm_sub_epi16(U, c128);                           /* D = U - 128 */
+		}
+		/* Load V values and expand to 32 bit */
+		{
+			const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */
+			E = _mm_sub_epi16(V, c128);                           /* E = V - 128 */
+		}
+		/* Get the R value */
+		{
+			const __m128i c403 = _mm_set1_epi16(403);
+			const __m128i e403 =
+			    _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
+			const __m128i Rs = _mm_add_epi32(C, e403);
+			const __m128i R32 = _mm_srai_epi32(Rs, 8);
+			const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
+			const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
+			const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
+			BGRX = _mm_or_si128(BGRX, packed);
+		}
+		/* Get the G value */
+		{
+			const __m128i c48 = _mm_set1_epi16(48);
+			const __m128i d48 =
+			    _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
+			const __m128i c120 = _mm_set1_epi16(120);
+			const __m128i e120 =
+			    _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
+			const __m128i de = _mm_add_epi32(d48, e120);
+			const __m128i Gs = _mm_sub_epi32(C, de);
+			const __m128i G32 = _mm_srai_epi32(Gs, 8);
+			const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
+			const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
+			const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
+			BGRX = _mm_or_si128(BGRX, packed);
+		}
+		/* Get the B value */
+		{
+			const __m128i c475 = _mm_set1_epi16(475);
+			const __m128i d475 =
+			    _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
+			const __m128i Bs = _mm_add_epi32(C, d475);
+			const __m128i B32 = _mm_srai_epi32(Bs, 8);
+			const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
+			const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
+			const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
+			BGRX = _mm_or_si128(BGRX, packed);
+		}
+	}
+	_mm_storeu_si128(dst++, BGRX);
+	return dst;
+}
+
+static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* const WINPR_RESTRICT pSrc[],
+                                        const UINT32* WINPR_RESTRICT srcStep,
+                                        BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+                                        const prim_size_t* WINPR_RESTRICT roi)
+{
+	const UINT32 nWidth = roi->width;
+	const UINT32 nHeight = roi->height;
+	const UINT32 pad = roi->width % 16;
+	const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+
+	for (UINT32 y = 0; y < nHeight; y++)
+	{
+		__m128i* dst = (__m128i*)(pDst + dstStep * y);
+		const BYTE* YData = pSrc[0] + y * srcStep[0];
+		const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
+		const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];
+
+		for (UINT32 x = 0; x < nWidth - pad; x += 16)
+		{
+			const __m128i Y = _mm_loadu_si128((const __m128i*)YData);
+			const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData);
+			const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData);
+			const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
+			const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
+			YData += 16;
+			UData += 8;
+			VData += 8;
+			dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
+			dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
+			dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
+			dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
+		}
+
+		for (UINT32 x = 0; x < pad; x++)
+		{
+			const BYTE Y = *YData++;
+			const BYTE U = *UData;
+			const BYTE V = *VData;
+			const BYTE r = YUV2R(Y, U, V);
+			const BYTE g = YUV2G(Y, U, V);
+			const BYTE b = YUV2B(Y, U, V);
+			dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
+
+			if (x % 2)
+			{
+				UData++;
+				VData++;
+			}
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_YUV420ToRGB(const BYTE* const WINPR_RESTRICT pSrc[3],
+                                   const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+                                   UINT32 dstStep, UINT32 DstFormat,
+                                   const prim_size_t* WINPR_RESTRICT roi)
+{
+	switch (DstFormat)
+	{
+		case PIXEL_FORMAT_BGRX32:
+		case PIXEL_FORMAT_BGRA32:
+			return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);
+
+		default:
+			return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+	}
+}
+
+static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* const WINPR_RESTRICT pSrc[],
+                                                  const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
+                                                  UINT32 dstStep,
+                                                  const prim_size_t* WINPR_RESTRICT roi)
+{
+	const UINT32 nWidth = roi->width;
+	const UINT32 nHeight = roi->height;
+	const UINT32 pad = roi->width % 16;
+
+	for (UINT32 y = 0; y < nHeight; y++)
+	{
+		__m128i* dst = (__m128i*)(pDst + dstStep * y);
+		const BYTE* YData = pSrc[0] + y * srcStep[0];
+		const BYTE* UData = pSrc[1] + y * srcStep[1];
+		const BYTE* VData = pSrc[2] + y * srcStep[2];
+
+		for (UINT32 x = 0; x < nWidth - pad; x += 16)
+		{
+			__m128i Y = _mm_load_si128((const __m128i*)YData);
+			__m128i U = _mm_load_si128((const __m128i*)UData);
+			__m128i V = _mm_load_si128((const __m128i*)VData);
+			YData += 16;
+			UData += 16;
+			VData += 16;
+			dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
+			dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
+			dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
+			dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
+		}
+
+		for (UINT32 x = 0; x < pad; x++)
+		{
+			const BYTE Y = *YData++;
+			const BYTE U = *UData++;
+			const BYTE V = *VData++;
+			const BYTE r = YUV2R(Y, U, V);
+			const BYTE g = YUV2G(Y, U, V);
+			const BYTE b = YUV2B(Y, U, V);
+			dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[],
+                                             const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
+                                             UINT32 dstStep, UINT32 DstFormat,
+                                             const prim_size_t* WINPR_RESTRICT roi)
+{
+	if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 ||
+	    srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
+		return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+
+	switch (DstFormat)
+	{
+		case PIXEL_FORMAT_BGRX32:
+		case PIXEL_FORMAT_BGRA32:
+			return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
+
+		default:
+			return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+	}
+}
+
+/****************************************************************************/
+/* SSSE3 RGB -> YUV420 conversion                                          **/
+/****************************************************************************/
+
+/**
+ * Note (nfedera):
+ * The used forward transformation factors from RGB to YUV are based on the
+ * values specified in [Rec. ITU-R BT.709-6] Section 3:
+ * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en
+ *
+ * Y =  0.21260 * R + 0.71520 * G + 0.07220 * B +   0;
+ * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128;
+ * V =  0.50000 * R - 0.45415 * G - 0.04585 * B + 128;
+ *
+ * The most accurate integer arithmetic approximation when using 8-bit signed
+ * integer factors with 16-bit signed integer intermediate results is:
+ *
+ * Y = ( ( 27 * R + 92 * G +  9 * B) >> 7 );
+ * U = ( (-29 * R - 99 * G + 128 * B) >> 8 ) + 128;
+ * V = ( ( 128 * R - 116 * G -  12 * B) >> 8 ) + 128;
+ *
+ * Due to signed 8bit range being [-128,127] the U and V constants of 128 are
+ * rounded to 127
+ */
+
+#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
+#define BGRX_U_FACTORS \
+	_mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
+#define BGRX_V_FACTORS \
+	_mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
+#define CONST128_FACTORS _mm_set1_epi8(-128)
+
+#define Y_SHIFT 7
+#define U_SHIFT 8
+#define V_SHIFT 8
+
+/*
+TODO:
+RGB[AX] can simply be supported using the following factors. And instead of loading the
+globals directly the functions below could be passed pointers to the correct vectors
+depending on the source picture format.
+
+PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
+      27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0
+};
+PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
+     -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0
+};
+PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
+      64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0
+};
+*/
+
+/* compute the luma (Y) component from a single rgb source line */
+
+static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
+{
+	__m128i x0;
+	__m128i x1;
+	__m128i x2;
+	__m128i x3;
+	const __m128i y_factors = BGRX_Y_FACTORS;
+	const __m128i* argb = (const __m128i*)src;
+	__m128i* ydst = (__m128i*)dst;
+
+	for (UINT32 x = 0; x < width; x += 16)
+	{
+		/* store 16 rgba pixels in 4 128 bit registers */
+		x0 = _mm_load_si128(argb++); // 1st 4 pixels
+		x1 = _mm_load_si128(argb++); // 2nd 4 pixels
+		x2 = _mm_load_si128(argb++); // 3rd 4 pixels
+		x3 = _mm_load_si128(argb++); // 4th 4 pixels
+		/* multiplications and subtotals */
+		x0 = _mm_maddubs_epi16(x0, y_factors);
+		x1 = _mm_maddubs_epi16(x1, y_factors);
+		x2 = _mm_maddubs_epi16(x2, y_factors);
+		x3 = _mm_maddubs_epi16(x3, y_factors);
+		/* the total sums */
+		x0 = _mm_hadd_epi16(x0, x1);
+		x2 = _mm_hadd_epi16(x2, x3);
+		/* shift the results */
+		x0 = _mm_srli_epi16(x0, Y_SHIFT);
+		x2 = _mm_srli_epi16(x2, Y_SHIFT);
+		/* pack the 16 words into bytes */
+		x0 = _mm_packus_epi16(x0, x2);
+		/* save to y plane */
+		_mm_storeu_si128(ydst++, x0);
+	}
+}
+
+/* compute the chrominance (UV) components from two rgb source lines */
+
+static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
+                                             const BYTE* WINPR_RESTRICT src2,
+                                             BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
+                                             UINT32 width)
+{
+	const __m128i u_factors = BGRX_U_FACTORS;
+	const __m128i v_factors = BGRX_V_FACTORS;
+	const __m128i vector128 = CONST128_FACTORS;
+	__m128i x0;
+	__m128i x1;
+	__m128i x2;
+	__m128i x3;
+	__m128i x4;
+	__m128i x5;
+	const __m128i* rgb1 = (const __m128i*)src1;
+	const __m128i* rgb2 = (const __m128i*)src2;
+	__m64* udst = (__m64*)dst1;
+	__m64* vdst = (__m64*)dst2;
+
+	for (UINT32 x = 0; x < width; x += 16)
+	{
+		/* subsample 16x2 pixels into 16x1 pixels */
+		x0 = _mm_load_si128(rgb1++);
+		x4 = _mm_load_si128(rgb2++);
+		x0 = _mm_avg_epu8(x0, x4);
+		x1 = _mm_load_si128(rgb1++);
+		x4 = _mm_load_si128(rgb2++);
+		x1 = _mm_avg_epu8(x1, x4);
+		x2 = _mm_load_si128(rgb1++);
+		x4 = _mm_load_si128(rgb2++);
+		x2 = _mm_avg_epu8(x2, x4);
+		x3 = _mm_load_si128(rgb1++);
+		x4 = _mm_load_si128(rgb2++);
+		x3 = _mm_avg_epu8(x3, x4);
+		/* subsample these 16x1 pixels into 8x1 pixels */
+		/**
+		 * shuffle controls
+		 * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88
+		 * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd
+		 */
+		x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
+		x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
+		x0 = _mm_avg_epu8(x0, x4);
+		x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
+		x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
+		x1 = _mm_avg_epu8(x1, x4);
+		/* multiplications and subtotals */
+		x2 = _mm_maddubs_epi16(x0, u_factors);
+		x3 = _mm_maddubs_epi16(x1, u_factors);
+		x4 = _mm_maddubs_epi16(x0, v_factors);
+		x5 = _mm_maddubs_epi16(x1, v_factors);
+		/* the total sums */
+		x0 = _mm_hadd_epi16(x2, x3);
+		x1 = _mm_hadd_epi16(x4, x5);
+		/* shift the results */
+		x0 = _mm_srai_epi16(x0, U_SHIFT);
+		x1 = _mm_srai_epi16(x1, V_SHIFT);
+		/* pack the 16 words into bytes */
+		x0 = _mm_packs_epi16(x0, x1);
+		/* add 128 */
+		x0 = _mm_sub_epi8(x0, vector128);
+		/* the lower 8 bytes go to the u plane */
+		_mm_storel_pi(udst++, _mm_castsi128_ps(x0));
+		/* the upper 8 bytes go to the v plane */
+		_mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
+	}
+}
+
+static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
+                                        const UINT32 dstStep[],
+                                        const prim_size_t* WINPR_RESTRICT roi)
+{
+	const BYTE* argb = pSrc;
+	BYTE* ydst = pDst[0];
+	BYTE* udst = pDst[1];
+	BYTE* vdst = pDst[2];
+
+	if (roi->height < 1 || roi->width < 1)
+	{
+		return !PRIMITIVES_SUCCESS;
+	}
+
+	if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
+	{
+		return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
+	}
+
+	for (UINT32 y = 0; y < roi->height - 1; y += 2)
+	{
+		const BYTE* line1 = argb;
+		const BYTE* line2 = argb + srcStep;
+		ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
+		ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
+		ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
+		argb += 2 * srcStep;
+		ydst += 2 * dstStep[0];
+		udst += 1 * dstStep[1];
+		vdst += 1 * dstStep[2];
+	}
+
+	if (roi->height & 1)
+	{
+		/* pass the same last line of an odd height twice for UV */
+		ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
+		ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+                                   UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
+                                   const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi)
+{
+	switch (srcFormat)
+	{
+		case PIXEL_FORMAT_BGRX32:
+		case PIXEL_FORMAT_BGRA32:
+			return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
+
+		default:
+			return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
+	}
+}
+
+/****************************************************************************/
+/* SSSE3 RGB -> AVC444-YUV conversion                                      **/
+/****************************************************************************/
+
+static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
+    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
+    BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
+    BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
+    BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
+{
+	const __m128i* argbEven = (const __m128i*)srcEven;
+	const __m128i* argbOdd = (const __m128i*)srcOdd;
+	const __m128i y_factors = BGRX_Y_FACTORS;
+	const __m128i u_factors = BGRX_U_FACTORS;
+	const __m128i v_factors = BGRX_V_FACTORS;
+	const __m128i vector128 = CONST128_FACTORS;
+
+	for (UINT32 x = 0; x < width; x += 16)
+	{
+		/* store 16 rgba pixels in 4 128 bit registers */
+		const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
+		const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
+		const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
+		const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
+		const __m128i xo1 = _mm_load_si128(argbOdd++);  // 1st 4 pixels
+		const __m128i xo2 = _mm_load_si128(argbOdd++);  // 2nd 4 pixels
+		const __m128i xo3 = _mm_load_si128(argbOdd++);  // 3rd 4 pixels
+		const __m128i xo4 = _mm_load_si128(argbOdd++);  // 4th 4 pixels
+		{
+			/* Y: multiplications with subtotals and horizontal sums */
+			const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
+			                                                  _mm_maddubs_epi16(xe2, y_factors)),
+			                                   Y_SHIFT);
+			const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
+			                                                  _mm_maddubs_epi16(xe4, y_factors)),
+			                                   Y_SHIFT);
+			const __m128i ye = _mm_packus_epi16(ye1, ye2);
+			const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
+			                                                  _mm_maddubs_epi16(xo2, y_factors)),
+			                                   Y_SHIFT);
+			const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
+			                                                  _mm_maddubs_epi16(xo4, y_factors)),
+			                                   Y_SHIFT);
+			const __m128i yo = _mm_packus_epi16(yo1, yo2);
+			/* store y [b1] */
+			_mm_storeu_si128((__m128i*)b1Even, ye);
+			b1Even += 16;
+
+			if (b1Odd)
+			{
+				_mm_storeu_si128((__m128i*)b1Odd, yo);
+				b1Odd += 16;
+			}
+		}
+		{
+			/* We have now
+			 * 16 even U values in ue
+			 * 16 odd U values in uo
+			 *
+			 * We need to split these according to
+			 * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
+			__m128i ue;
+			__m128i uo = { 0 };
+			{
+				const __m128i ue1 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
+				                                  _mm_maddubs_epi16(xe2, u_factors)),
+				                   U_SHIFT);
+				const __m128i ue2 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
+				                                  _mm_maddubs_epi16(xe4, u_factors)),
+				                   U_SHIFT);
+				ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
+			}
+
+			if (b1Odd)
+			{
+				const __m128i uo1 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
+				                                  _mm_maddubs_epi16(xo2, u_factors)),
+				                   U_SHIFT);
+				const __m128i uo2 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
+				                                  _mm_maddubs_epi16(xo4, u_factors)),
+				                   U_SHIFT);
+				uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
+			}
+
+			/* Now we need the following storage distribution:
+			 * 2x   2y    -> b2
+			 * x    2y+1  -> b4
+			 * 2x+1 2y    -> b6 */
+			if (b1Odd) /* b2 */
+			{
+				const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
+				const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
+				const __m128i hi = _mm_add_epi16(ueh, uoh);
+				const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
+				const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
+				const __m128i lo = _mm_add_epi16(uel, uol);
+				const __m128i added = _mm_hadd_epi16(lo, hi);
+				const __m128i avg16 = _mm_srai_epi16(added, 2);
+				const __m128i avg = _mm_packus_epi16(avg16, avg16);
+				_mm_storel_epi64((__m128i*)b2, avg);
+			}
+			else
+			{
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+				const __m128i ud = _mm_shuffle_epi8(ue, mask);
+				_mm_storel_epi64((__m128i*)b2, ud);
+			}
+
+			b2 += 8;
+
+			if (b1Odd) /* b4 */
+			{
+				_mm_store_si128((__m128i*)b4, uo);
+				b4 += 16;
+			}
+
+			{
+				/* b6 */
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+				const __m128i ude = _mm_shuffle_epi8(ue, mask);
+				_mm_storel_epi64((__m128i*)b6, ude);
+				b6 += 8;
+			}
+		}
+		{
+			/* We have now
+			 * 16 even V values in ue
+			 * 16 odd V values in uo
+			 *
+			 * We need to split these according to
+			 * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
+			__m128i ve;
+			__m128i vo = { 0 };
+			{
+				const __m128i ve1 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
+				                                  _mm_maddubs_epi16(xe2, v_factors)),
+				                   V_SHIFT);
+				const __m128i ve2 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
+				                                  _mm_maddubs_epi16(xe4, v_factors)),
+				                   V_SHIFT);
+				ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
+			}
+
+			if (b1Odd)
+			{
+				const __m128i vo1 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
+				                                  _mm_maddubs_epi16(xo2, v_factors)),
+				                   V_SHIFT);
+				const __m128i vo2 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
+				                                  _mm_maddubs_epi16(xo4, v_factors)),
+				                   V_SHIFT);
+				vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
+			}
+
+			/* Now we need the following storage distribution:
+			 * 2x   2y    -> b3
+			 * x    2y+1  -> b5
+			 * 2x+1 2y    -> b7 */
+			if (b1Odd) /* b3 */
+			{
+				const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
+				const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
+				const __m128i hi = _mm_add_epi16(veh, voh);
+				const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
+				const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
+				const __m128i lo = _mm_add_epi16(vel, vol);
+				const __m128i added = _mm_hadd_epi16(lo, hi);
+				const __m128i avg16 = _mm_srai_epi16(added, 2);
+				const __m128i avg = _mm_packus_epi16(avg16, avg16);
+				_mm_storel_epi64((__m128i*)b3, avg);
+			}
+			else
+			{
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+				const __m128i vd = _mm_shuffle_epi8(ve, mask);
+				_mm_storel_epi64((__m128i*)b3, vd);
+			}
+
+			b3 += 8;
+
+			if (b1Odd) /* b5 */
+			{
+				_mm_store_si128((__m128i*)b5, vo);
+				b5 += 16;
+			}
+
+			{
+				/* b7 */
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+				const __m128i vde = _mm_shuffle_epi8(ve, mask);
+				_mm_storel_epi64((__m128i*)b7, vde);
+				b7 += 8;
+			}
+		}
+	}
+}
+
+static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+                                           UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
+                                           const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
+                                           const UINT32 dst2Step[],
+                                           const prim_size_t* WINPR_RESTRICT roi)
+{
+	const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
+
+	if (roi->height < 1 || roi->width < 1)
+		return !PRIMITIVES_SUCCESS;
+
+	if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
+		return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
+		                               roi);
+
+	for (UINT32 y = 0; y < roi->height; y += 2)
+	{
+		const BOOL last = (y >= (roi->height - 1));
+		const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
+		const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
+		const UINT32 i = y >> 1;
+		const UINT32 n = (i & ~7) + i;
+		BYTE* b1Even = pDst1[0] + y * dst1Step[0];
+		BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
+		BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
+		BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
+		BYTE* b4 = pDst2[0] + dst2Step[0] * n;
+		BYTE* b5 = b4 + 8 * dst2Step[0];
+		BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
+		BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
+		ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
+		                                     roi->width);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+                                      UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
+                                      const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
+                                      const UINT32 dst2Step[],
+                                      const prim_size_t* WINPR_RESTRICT roi)
+{
+	switch (srcFormat)
+	{
+		case PIXEL_FORMAT_BGRX32:
+		case PIXEL_FORMAT_BGRA32:
+			return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
+			                                 dst2Step, roi);
+
+		default:
+			return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
+			                               dst2Step, roi);
+	}
+}
+
+/* Mapping of arguments:
+ *
+ * b1 [even lines] -> yLumaDstEven
+ * b1 [odd lines]  -> yLumaDstOdd
+ * b2              -> uLumaDst
+ * b3              -> vLumaDst
+ * b4              -> yChromaDst1
+ * b5              -> yChromaDst2
+ * b6              -> uChromaDst1
+ * b7              -> uChromaDst2
+ * b8              -> vChromaDst1
+ * b9              -> vChromaDst2
+ */
+static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
+    const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
+    BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
+    BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
+    BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
+    BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
+    BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
+    BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
+{
+	const __m128i vector128 = CONST128_FACTORS;
+	const __m128i* argbEven = (const __m128i*)srcEven;
+	const __m128i* argbOdd = (const __m128i*)srcOdd;
+
+	for (UINT32 x = 0; x < width; x += 16)
+	{
+		/* store 16 rgba pixels in 4 128 bit registers
+		 * for even and odd rows.
+		 */
+		const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */
+		const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */
+		const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */
+		const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */
+		const __m128i xo1 = _mm_load_si128(argbOdd++);  /* 1st 4 pixels */
+		const __m128i xo2 = _mm_load_si128(argbOdd++);  /* 2nd 4 pixels */
+		const __m128i xo3 = _mm_load_si128(argbOdd++);  /* 3rd 4 pixels */
+		const __m128i xo4 = _mm_load_si128(argbOdd++);  /* 4th 4 pixels */
+		{
+			/* Y: multiplications with subtotals and horizontal sums */
+			const __m128i y_factors = BGRX_Y_FACTORS;
+			const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
+			                                                  _mm_maddubs_epi16(xe2, y_factors)),
+			                                   Y_SHIFT);
+			const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
+			                                                  _mm_maddubs_epi16(xe4, y_factors)),
+			                                   Y_SHIFT);
+			const __m128i ye = _mm_packus_epi16(ye1, ye2);
+			/* store y [b1] */
+			_mm_storeu_si128((__m128i*)yLumaDstEven, ye);
+			yLumaDstEven += 16;
+		}
+
+		if (yLumaDstOdd)
+		{
+			const __m128i y_factors = BGRX_Y_FACTORS;
+			const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
+			                                                  _mm_maddubs_epi16(xo2, y_factors)),
+			                                   Y_SHIFT);
+			const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
+			                                                  _mm_maddubs_epi16(xo4, y_factors)),
+			                                   Y_SHIFT);
+			const __m128i yo = _mm_packus_epi16(yo1, yo2);
+			_mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
+			yLumaDstOdd += 16;
+		}
+
+		{
+			/* We have now
+			 * 16 even U values in ue
+			 * 16 odd U values in uo
+			 *
+			 * We need to split these according to
+			 * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */
+			/* U: multiplications with subtotals and horizontal sums */
+			__m128i ue;
+			__m128i uo;
+			__m128i uavg;
+			{
+				const __m128i u_factors = BGRX_U_FACTORS;
+				const __m128i ue1 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
+				                                  _mm_maddubs_epi16(xe2, u_factors)),
+				                   U_SHIFT);
+				const __m128i ue2 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
+				                                  _mm_maddubs_epi16(xe4, u_factors)),
+				                   U_SHIFT);
+				const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
+				ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
+				uavg = ueavg;
+			}
+			{
+				const __m128i u_factors = BGRX_U_FACTORS;
+				const __m128i uo1 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
+				                                  _mm_maddubs_epi16(xo2, u_factors)),
+				                   U_SHIFT);
+				const __m128i uo2 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
+				                                  _mm_maddubs_epi16(xo4, u_factors)),
+				                   U_SHIFT);
+				const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
+				uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
+				uavg = _mm_add_epi16(uavg, uoavg);
+				uavg = _mm_srai_epi16(uavg, 2);
+				uavg = _mm_packs_epi16(uavg, uoavg);
+				uavg = _mm_sub_epi8(uavg, vector128);
+			}
+			/* Now we need the following storage distribution:
+			 * 2x   2y    -> uLumaDst
+			 * 2x+1  y    -> yChromaDst1
+			 * 4x   2y+1  -> uChromaDst1
+			 * 4x+2 2y+1  -> vChromaDst1 */
+			{
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+				const __m128i ude = _mm_shuffle_epi8(ue, mask);
+				_mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
+				yEvenChromaDst1 += 8;
+			}
+
+			if (yLumaDstOdd)
+			{
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+				const __m128i udo = _mm_shuffle_epi8(uo, mask);
+				_mm_storel_epi64((__m128i*)yOddChromaDst1, udo);
+				yOddChromaDst1 += 8;
+			}
+
+			if (yLumaDstOdd)
+			{
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
+				const __m128i ud = _mm_shuffle_epi8(uo, mask);
+				int* uDst1 = (int*)uChromaDst1;
+				int* vDst1 = (int*)vChromaDst1;
+				const int* src = (const int*)&ud;
+				_mm_stream_si32(uDst1, src[0]);
+				_mm_stream_si32(vDst1, src[1]);
+				uChromaDst1 += 4;
+				vChromaDst1 += 4;
+			}
+
+			if (yLumaDstOdd)
+			{
+				_mm_storel_epi64((__m128i*)uLumaDst, uavg);
+				uLumaDst += 8;
+			}
+			else
+			{
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+				const __m128i ud = _mm_shuffle_epi8(ue, mask);
+				_mm_storel_epi64((__m128i*)uLumaDst, ud);
+				uLumaDst += 8;
+			}
+		}
+
+		{
+			/* V: multiplications with subtotals and horizontal sums */
+			__m128i ve;
+			__m128i vo;
+			__m128i vavg;
+			{
+				const __m128i v_factors = BGRX_V_FACTORS;
+				const __m128i ve1 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
+				                                  _mm_maddubs_epi16(xe2, v_factors)),
+				                   V_SHIFT);
+				const __m128i ve2 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
+				                                  _mm_maddubs_epi16(xe4, v_factors)),
+				                   V_SHIFT);
+				const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
+				ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
+				vavg = veavg;
+			}
+			{
+				const __m128i v_factors = BGRX_V_FACTORS;
+				const __m128i vo1 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
+				                                  _mm_maddubs_epi16(xo2, v_factors)),
+				                   V_SHIFT);
+				const __m128i vo2 =
+				    _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
+				                                  _mm_maddubs_epi16(xo4, v_factors)),
+				                   V_SHIFT);
+				const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
+				vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
+				vavg = _mm_add_epi16(vavg, voavg);
+				vavg = _mm_srai_epi16(vavg, 2);
+				vavg = _mm_packs_epi16(vavg, voavg);
+				vavg = _mm_sub_epi8(vavg, vector128);
+			}
+			/* Now we need the following storage distribution:
+			 * 2x   2y    -> vLumaDst
+			 * 2x+1  y    -> yChromaDst2
+			 * 4x   2y+1  -> uChromaDst2
+			 * 4x+2 2y+1  -> vChromaDst2 */
+			{
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+				__m128i vde = _mm_shuffle_epi8(ve, mask);
+				_mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
+				yEvenChromaDst2 += 8;
+			}
+
+			if (yLumaDstOdd)
+			{
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+				__m128i vdo = _mm_shuffle_epi8(vo, mask);
+				_mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
+				yOddChromaDst2 += 8;
+			}
+
+			if (yLumaDstOdd)
+			{
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
+				const __m128i vd = _mm_shuffle_epi8(vo, mask);
+				int* uDst2 = (int*)uChromaDst2;
+				int* vDst2 = (int*)vChromaDst2;
+				const int* src = (const int*)&vd;
+				_mm_stream_si32(uDst2, src[0]);
+				_mm_stream_si32(vDst2, src[1]);
+				uChromaDst2 += 4;
+				vChromaDst2 += 4;
+			}
+
+			if (yLumaDstOdd)
+			{
+				_mm_storel_epi64((__m128i*)vLumaDst, vavg);
+				vLumaDst += 8;
+			}
+			else
+			{
+				const __m128i mask =
+				    _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+				                 (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+				__m128i vd = _mm_shuffle_epi8(ve, mask);
+				_mm_storel_epi64((__m128i*)vLumaDst, vd);
+				vLumaDst += 8;
+			}
+		}
+	}
+}
+
+static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+                                             UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
+                                             const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
+                                             const UINT32 dst2Step[],
+                                             const prim_size_t* WINPR_RESTRICT roi)
+{
+	if (roi->height < 1 || roi->width < 1)
+		return !PRIMITIVES_SUCCESS;
+
+	if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
+		return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
+		                                 roi);
+
+	for (UINT32 y = 0; y < roi->height; y += 2)
+	{
+		const BYTE* srcEven = (pSrc + y * srcStep);
+		const BYTE* srcOdd = (srcEven + srcStep);
+		BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
+		BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL;
+		BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
+		BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
+		BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
+		BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
+		BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
+		BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
+		BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
+		BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
+		BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
+		BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
+		ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
+		                                       dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
+		                                       dstOddChromaY1, dstOddChromaY2, dstChromaU1,
+		                                       dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+                                        UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
+                                        const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
+                                        const UINT32 dst2Step[],
+                                        const prim_size_t* WINPR_RESTRICT roi)
+{
+	switch (srcFormat)
+	{
+		case PIXEL_FORMAT_BGRX32:
+		case PIXEL_FORMAT_BGRA32:
+			return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
+			                                   dst2Step, roi);
+
+		default:
+			return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
+			                                 dst2Step, roi);
+	}
+}
+
+static pstatus_t ssse3_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[],
+                                    const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDstRaw[],
+                                    const UINT32 dstStep[], const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+	const UINT32 nWidth = roi->right - roi->left;
+	const UINT32 nHeight = roi->bottom - roi->top;
+	const UINT32 halfWidth = (nWidth + 1) / 2;
+	const UINT32 halfPad = halfWidth % 16;
+	const UINT32 halfHeight = (nHeight + 1) / 2;
+	const UINT32 oddY = 1;
+	const UINT32 evenY = 0;
+	const UINT32 oddX = 1;
+	const UINT32 evenX = 0;
+	const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+		                    pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+		                    pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+	BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+		              pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+		              pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+
+	/* Y data is already here... */
+	/* B1 */
+	for (UINT32 y = 0; y < nHeight; y++)
+	{
+		const BYTE* Ym = pSrc[0] + srcStep[0] * y;
+		BYTE* pY = pDst[0] + dstStep[0] * y;
+		memcpy(pY, Ym, nWidth);
+	}
+
+	/* The first half of U, V are already here part of this frame. */
+	/* B2 and B3 */
+	for (UINT32 y = 0; y < halfHeight; y++)
+	{
+		const UINT32 val2y = (2 * y + evenY);
+		const UINT32 val2y1 = val2y + oddY;
+		const BYTE* Um = pSrc[1] + srcStep[1] * y;
+		const BYTE* Vm = pSrc[2] + srcStep[2] * y;
+		BYTE* pU = pDst[1] + dstStep[1] * val2y;
+		BYTE* pV = pDst[2] + dstStep[2] * val2y;
+		BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
+		BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
+
+		UINT32 x = 0;
+		for (; x < halfWidth - halfPad; x += 16)
+		{
+			const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+			const __m128i unpackLow =
+			    _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
+			{
+				const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]);
+				const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
+				const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
+				_mm_storeu_si128((__m128i*)&pU[2 * x], uHigh);
+				_mm_storeu_si128((__m128i*)&pU[2 * x + 16], uLow);
+				_mm_storeu_si128((__m128i*)&pU1[2 * x], uHigh);
+				_mm_storeu_si128((__m128i*)&pU1[2 * x + 16], uLow);
+			}
+			{
+				const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]);
+				const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
+				const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
+				_mm_storeu_si128((__m128i*)&pV[2 * x], uHigh);
+				_mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow);
+				_mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh);
+				_mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow);
+			}
+		}
+
+		for (; x < halfWidth; x++)
+		{
+			const UINT32 val2x = 2 * x + evenX;
+			const UINT32 val2x1 = val2x + oddX;
+			pU[val2x] = Um[x];
+			pV[val2x] = Vm[x];
+			pU[val2x1] = Um[x];
+			pV[val2x1] = Vm[x];
+			pU1[val2x] = Um[x];
+			pV1[val2x] = Vm[x];
+			pU1[val2x1] = Um[x];
+			pV1[val2x1] = Vm[x];
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2)
+{
+	const __m128i even = _mm_set_epi8((char)0x80, 14, (char)0x80, 12, (char)0x80, 10, (char)0x80, 8,
+	                                  (char)0x80, 6, (char)0x80, 4, (char)0x80, 2, (char)0x80, 0);
+	const __m128i odd = _mm_set_epi8((char)0x80, 15, (char)0x80, 13, (char)0x80, 11, (char)0x80, 9,
+	                                 (char)0x80, 7, (char)0x80, 5, (char)0x80, 3, (char)0x80, 1);
+	const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
+	const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst);
+	const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2);
+	const __m128i uEven = _mm_shuffle_epi8(u, even);
+	const __m128i uEven4 = _mm_slli_epi16(uEven, 2);
+	const __m128i uOdd = _mm_shuffle_epi8(u, odd);
+	const __m128i u1Even = _mm_shuffle_epi8(u1, even);
+	const __m128i u1Odd = _mm_shuffle_epi8(u1, odd);
+	const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even);
+	const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd);
+	const __m128i result = _mm_sub_epi16(uEven4, tmp2);
+	const __m128i packed = _mm_packus_epi16(result, uOdd);
+	const __m128i interleaved = _mm_shuffle_epi8(packed, interleave);
+	_mm_storeu_si128((__m128i*)pSrcDst, interleaved);
+}
+
+static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
+                                    const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+	const UINT32 oddY = 1;
+	const UINT32 evenY = 0;
+	const UINT32 nWidth = roi->right - roi->left;
+	const UINT32 nHeight = roi->bottom - roi->top;
+	const UINT32 halfHeight = (nHeight + 1) / 2;
+	const UINT32 halfWidth = (nWidth + 1) / 2;
+	const UINT32 halfPad = halfWidth % 16;
+
+	/* Filter */
+	for (UINT32 y = roi->top; y < halfHeight + roi->top; y++)
+	{
+		UINT32 x = roi->left;
+		const UINT32 val2y = (y * 2 + evenY);
+		const UINT32 val2y1 = val2y + oddY;
+		BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
+		BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
+		BYTE* pU = pDst[1] + dstStep[1] * val2y;
+		BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+		if (val2y1 > nHeight)
+			continue;
+
+		for (; x < halfWidth + roi->left - halfPad; x += 16)
+		{
+			ssse3_filter(&pU[2 * x], &pU1[2 * x]);
+			ssse3_filter(&pV[2 * x], &pV1[2 * x]);
+		}
+
+		for (; x < halfWidth + roi->left; x++)
+		{
+			const UINT32 val2x = (x * 2);
+			const UINT32 val2x1 = val2x + 1;
+			const BYTE inU = pU[val2x];
+			const BYTE inV = pV[val2x];
+			const INT32 up = inU * 4;
+			const INT32 vp = inV * 4;
+			INT32 u2020 = 0;
+			INT32 v2020 = 0;
+
+			if (val2x1 > nWidth)
+				continue;
+
+			u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
+			v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
+			pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
+			pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
+                                        const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
+                                        const UINT32 dstStep[3],
+                                        const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+	const UINT32 mod = 16;
+	UINT32 uY = 0;
+	UINT32 vY = 0;
+	const UINT32 nWidth = roi->right - roi->left;
+	const UINT32 nHeight = roi->bottom - roi->top;
+	const UINT32 halfWidth = (nWidth + 1) / 2;
+	const UINT32 halfPad = halfWidth % 16;
+	const UINT32 halfHeight = (nHeight + 1) / 2;
+	const UINT32 oddY = 1;
+	const UINT32 evenY = 0;
+	const UINT32 oddX = 1;
+	/* The auxilary frame is aligned to multiples of 16x16.
+	 * We need the padded height for B4 and B5 conversion. */
+	const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
+	const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+		                    pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+		                    pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+	BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+		              pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+		              pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
+	                                  (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
+
+	/* The second half of U and V is a bit more tricky... */
+	/* B4 and B5 */
+	for (UINT32 y = 0; y < padHeigth; y++)
+	{
+		const BYTE* Ya = pSrc[0] + srcStep[0] * y;
+		BYTE* pX = NULL;
+
+		if ((y) % mod < (mod + 1) / 2)
+		{
+			const UINT32 pos = (2 * uY++ + oddY);
+
+			if (pos >= nHeight)
+				continue;
+
+			pX = pDst[1] + dstStep[1] * pos;
+		}
+		else
+		{
+			const UINT32 pos = (2 * vY++ + oddY);
+
+			if (pos >= nHeight)
+				continue;
+
+			pX = pDst[2] + dstStep[2] * pos;
+		}
+
+		memcpy(pX, Ya, nWidth);
+	}
+
+	/* B6 and B7 */
+	for (UINT32 y = 0; y < halfHeight; y++)
+	{
+		const UINT32 val2y = (y * 2 + evenY);
+		const BYTE* Ua = pSrc[1] + srcStep[1] * y;
+		const BYTE* Va = pSrc[2] + srcStep[2] * y;
+		BYTE* pU = pDst[1] + dstStep[1] * val2y;
+		BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+		UINT32 x = 0;
+		for (; x < halfWidth - halfPad; x += 16)
+		{
+			{
+				const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]);
+				const __m128i u2 = _mm_unpackhi_epi8(u, zero);
+				const __m128i u1 = _mm_unpacklo_epi8(u, zero);
+				_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
+				_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
+			}
+			{
+				const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]);
+				const __m128i u2 = _mm_unpackhi_epi8(u, zero);
+				const __m128i u1 = _mm_unpacklo_epi8(u, zero);
+				_mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
+				_mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
+			}
+		}
+
+		for (; x < halfWidth; x++)
+		{
+			const UINT32 val2x1 = (x * 2 + oddX);
+			pU[val2x1] = Ua[x];
+			pV[val2x1] = Va[x];
+		}
+	}
+
+	/* Filter */
+	return ssse3_ChromaFilter(pDst, dstStep, roi);
+}
+
+static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3],
+                                        const UINT32 srcStep[3], UINT32 nTotalWidth,
+                                        UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3],
+                                        const UINT32 dstStep[3],
+                                        const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+	const UINT32 nWidth = roi->right - roi->left;
+	const UINT32 nHeight = roi->bottom - roi->top;
+	const UINT32 halfWidth = (nWidth + 1) / 2;
+	const UINT32 halfPad = halfWidth % 16;
+	const UINT32 halfHeight = (nHeight + 1) / 2;
+	const UINT32 quaterWidth = (nWidth + 3) / 4;
+	const UINT32 quaterPad = quaterWidth % 16;
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
+	                                  (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0);
+	const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80,
+	                                   0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
+	const __m128i shuffle1 =
+	    _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11,
+	                 (char)0x80, 10, (char)0x80, 9, (char)0x80, 8);
+	const __m128i shuffle2 =
+	    _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3,
+	                 (char)0x80, 2, (char)0x80, 1, (char)0x80, 0);
+
+	/* B4 and B5: odd UV values for width/2, height */
+	for (UINT32 y = 0; y < nHeight; y++)
+	{
+		const UINT32 yTop = y + roi->top;
+		const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
+		const BYTE* pYaV = pYaU + nTotalWidth / 2;
+		BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
+		BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
+
+		UINT32 x = 0;
+		for (; x < halfWidth - halfPad; x += 16)
+		{
+			{
+				const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]);
+				const __m128i u2 = _mm_unpackhi_epi8(zero, u);
+				const __m128i u1 = _mm_unpacklo_epi8(zero, u);
+				_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
+				_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
+			}
+			{
+				const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]);
+				const __m128i v2 = _mm_unpackhi_epi8(zero, v);
+				const __m128i v1 = _mm_unpacklo_epi8(zero, v);
+				_mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
+				_mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
+			}
+		}
+
+		for (; x < halfWidth; x++)
+		{
+			const UINT32 odd = 2 * x + 1;
+			pU[odd] = pYaU[x];
+			pV[odd] = pYaV[x];
+		}
+	}
+
+	/* B6 - B9 */
+	for (UINT32 y = 0; y < halfHeight; y++)
+	{
+		const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
+		const BYTE* pUaV = pUaU + nTotalWidth / 4;
+		const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
+		const BYTE* pVaV = pVaU + nTotalWidth / 4;
+		BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
+		BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
+
+		UINT32 x = 0;
+		for (; x < quaterWidth - quaterPad; x += 16)
+		{
+			{
+				const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]);
+				const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]);
+				const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
+				const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
+				const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
+				const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
+				const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
+				const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
+				_mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
+				_mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
+				_mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
+				_mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
+			}
+			{
+				const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]);
+				const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]);
+				const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
+				const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
+				const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
+				const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
+				const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
+				const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
+				_mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
+				_mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
+				_mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
+				_mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
+			}
+		}
+
+		for (; x < quaterWidth; x++)
+		{
+			pU[4 * x + 0] = pUaU[x];
+			pV[4 * x + 0] = pUaV[x];
+			pU[4 * x + 2] = pVaU[x];
+			pV[4 * x + 2] = pVaV[x];
+		}
+	}
+
+	return ssse3_ChromaFilter(pDst, dstStep, roi);
+}
+
+static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
+                                             const BYTE* const WINPR_RESTRICT pSrc[3],
+                                             const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
+                                             BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
+                                             const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+	if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
+		return -1;
+
+	if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
+		return -1;
+
+	if (!roi)
+		return -1;
+
+	switch (type)
+	{
+		case AVC444_LUMA:
+			return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+		case AVC444_CHROMAv1:
+			return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+		case AVC444_CHROMAv2:
+			return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
+
+		default:
+			return -1;
+	}
+}
+
+void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	generic = primitives_get_generic();
+	primitives_init_YUV(prims);
+
+	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
+	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+	{
+		prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
+		prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV;
+		prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2;
+		prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
+		prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
+		prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
+	}
+}