Diffstat (limited to 'libfreerdp/primitives')
41 files changed, 15663 insertions, 0 deletions
diff --git a/libfreerdp/primitives/README.txt b/libfreerdp/primitives/README.txt new file mode 100644 index 0000000..81c7e97 --- /dev/null +++ b/libfreerdp/primitives/README.txt @@ -0,0 +1,101 @@ +The Primitives Library
+
+Introduction
+------------
+The purpose of the primitives library is to give the freerdp code easy
+access to *run-time* optimization via SIMD operations. When the library
+is initialized, dynamic checks of processor features are run (such as
+support for SSE3 or NEON), and entrypoints are wired up through
+function pointers to provide the fastest implementation available. All
+routines offer a generic C alternative as a fallback.
+
+Run-time optimization has the advantage of allowing a single executable
+to run fast on multiple platforms with different SIMD capabilities.
+
+
+Use In Code
+-----------
+A pointer to a singleton structure containing the function pointers
+is obtained through primitives_get(). The function pointers can then
+be called through that structure, e.g.
+
+ primitives_t *prims = primitives_get();
+ prims->shiftC_16s(buffer, shifts, buffer, 256);
+
+Of course, there is some overhead in calling through the function pointer
+and setting up the SIMD operations, so it would be counterproductive to
+call the primitives library for very small operations, e.g. initializing an
+array of eight values to a constant. The primitives library is intended
+for larger-scale operations, e.g. arrays of size 64 and larger.
+
+
+Initialization and Cleanup
+--------------------------
+Library initialization is done the first time primitives_init() is called
+or the first time primitives_get() is used. Cleanup (if any) is done by
+primitives_deinit().
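+
+For example, an application that wants explicit control over the
+library lifetime might do the following (a minimal sketch, reusing the
+shiftC_16s entrypoint from the example above):
+
+    #include <freerdp/primitives.h>
+
+    primitives_init();
+    primitives_t *prims = primitives_get();
+    prims->shiftC_16s(buffer, shifts, buffer, 256);
+    /* ... */
+    primitives_deinit();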
+
+
+Intel Integrated Performance Primitives (IPP)
+---------------------------------------------
+If freerdp is compiled with IPP support (-DWITH_IPP=ON), the IPP function
+calls will be used (where available) to fill the function pointers.
+Where possible, function names and parameter lists match the IPP conventions so
+that the IPP functions can be plugged into the function pointers without
+a wrapper layer. Use of IPP is completely optional, and in many cases
+the SSE operations in the primitives library itself are faster or similar
+in performance.
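+
+For illustration, assuming an entrypoint whose signature matches its
+IPP counterpart (add_16s / ippsAdd_16s are used here as a plausible
+pair; check primitives.h and the IPP manual for the exact signatures),
+the assignment is direct:
+
+    #ifdef WITH_IPP
+    prims->add_16s = ippsAdd_16s;  /* no wrapper layer needed */
+    #endif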
+
+
+Coverage
+--------
+The primitives library is not meant to be comprehensive, with
+entrypoints for every operation and operand type. Instead, the coverage
+is focused on operations known to be performance bottlenecks in the code.
+For instance, 16-bit signed operations are used widely in the RemoteFX
+software, so you'll find 16s versions of several operations, but there
+is no attempt to provide (unused) copies of the same code for 8u, 16u,
+32s, etc.
+
+
+New Optimizations
+-----------------
+As the need arises, new optimizations can be added to the library,
+including NEON, AVX, and perhaps OpenCL or other SIMD implementations.
+The CPU feature detection is done in winpr/sysinfo.
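+
+A sketch of how a module typically installs an optimized entrypoint at
+run time (the foo names are hypothetical; IsProcessorFeaturePresent()
+is the winpr/sysinfo API also used by the existing modules):
+
+    #include <winpr/sysinfo.h>
+
+    void primitives_init_foo_opt(primitives_t *prims)
+    {
+        /* Install the generic C fallback first... */
+        primitives_init_foo(prims);
+
+        /* ...then override it if the CPU supports NEON. */
+        if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+            prims->foo_16s = neon_foo_16s;
+    }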
+
+
+Adding Entrypoints
+------------------
+As the need for new operations or operands arises, new entrypoints can
+be added (see the sketch after the list):
+ 1) Function prototypes and pointers are added to
+ include/freerdp/primitives.h
+ 2) New module initialization and cleanup function prototypes are added
+ to prim_internal.h and called in primitives.c (primitives_init()
+ and primitives_deinit()).
+ 3) Operation names and parameter lists should be compatible with the IPP.
+ IPP manuals are available online at software.intel.com.
+ 4) A generic C entrypoint must be available as a fallback.
+ 5) prim_templates.h contains macro-based templates for simple operations,
+ such as applying a single SSE operation to arrays of data.
+ The template functions can frequently be used to extend the
+ operations without writing a lot of new code.
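+
+A minimal sketch of steps 1, 2 and 4 for a hypothetical scale_16s
+operation (all names below are for illustration only):
+
+    /* 1) include/freerdp/primitives.h: prototype and pointer */
+    typedef pstatus_t (*__scale_16s_t)(const INT16 *pSrc, INT16 val,
+                                       INT16 *pDst, UINT32 len);
+    /* ...and inside struct primitives_t:  __scale_16s_t scale_16s; */
+
+    /* 2, 4) prim_scale.c: generic C fallback and module init */
+    static pstatus_t general_scale_16s(const INT16 *pSrc, INT16 val,
+                                       INT16 *pDst, UINT32 len)
+    {
+        for (UINT32 i = 0; i < len; i++)
+            pDst[i] = (INT16)(pSrc[i] * val);
+
+        return PRIMITIVES_SUCCESS;
+    }
+
+    void primitives_init_scale(primitives_t *prims)
+    {
+        prims->scale_16s = general_scale_16s;
+    }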
+
+Cache Management
+----------------
+I haven't found much speed improvement by attempting prefetch, and in
+fact it seems to have a negative impact in some cases. Done correctly,
+however, the routines could perhaps be further accelerated by proper
+use of prefetch, fences, etc.
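+
+Anyone experimenting here could start from the usual pattern (a rough
+sketch; process_block() stands in for the real per-block work, and the
+prefetch distance and hint level need per-platform tuning):
+
+    #include <xmmintrin.h>
+
+    for (UINT32 i = 0; i < len; i += 16)
+    {
+        /* Hint the cache line we will need a few iterations ahead. */
+        _mm_prefetch((const char *)&src[i + 64], _MM_HINT_T0);
+        process_block(&src[i]);
+    }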
+
+
+Testing
+-------
+In the test subdirectory is an executable (prim_test) that tests both
+functionality and speed of primitives library operations. Any new
+modules should be added to that test, following the conventions already
+established in that directory. The program can be executed on various
+target hardware to compare generic C, optimized, and IPP performance
+with various array sizes.
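+
+A typical invocation from the build tree (sketch only; the binary
+location depends on the build configuration):
+
+    ./libfreerdp/primitives/test/prim_test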
+
diff --git a/libfreerdp/primitives/prim_YCoCg.c b/libfreerdp/primitives/prim_YCoCg.c new file mode 100644 index 0000000..7c1a429 --- /dev/null +++ b/libfreerdp/primitives/prim_YCoCg.c @@ -0,0 +1,73 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * YCoCg<->RGB Color conversion operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2014 Hewlett-Packard Development Company, L.P. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> + +#include "prim_internal.h" + +/* helper function to convert raw 8 bit values to signed 16bit values. + */ +static INT16 convert(UINT8 raw, int shift) +{ + const int cll = shift - 1; /* -1 builds in the /2's */ + return (INT16)((INT8)(raw << cll)); +} + +/* ------------------------------------------------------------------------- */ +static pstatus_t general_YCoCgToRGB_8u_AC4R(const BYTE* pSrc, INT32 srcStep, BYTE* pDst, + UINT32 DstFormat, INT32 dstStep, UINT32 width, + UINT32 height, UINT8 shift, BOOL withAlpha) +{ + const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); + fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, TRUE); + + for (UINT32 y = 0; y < height; y++) + { + const BYTE* sptr = &pSrc[srcStep * y]; + BYTE* dptr = &pDst[dstStep * y]; + for (UINT32 x = 0; x < width; x++) + { + /* Note: shifts must be done before sign-conversion. */ + const INT16 Cg = convert(*sptr++, shift); + const INT16 Co = convert(*sptr++, shift); + const INT16 Y = *sptr++; /* UINT8->INT16 */ + const INT16 T = Y - Cg; + const INT16 B = T + Co; + const INT16 G = Y + Cg; + const INT16 R = T - Co; + BYTE A = *sptr++; + + if (!withAlpha) + A = 0xFFU; + + dptr = writePixel(dptr, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), A); + } + } + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +void primitives_init_YCoCg(primitives_t* prims) +{ + prims->YCoCgToRGB_8u_AC4R = general_YCoCgToRGB_8u_AC4R; +} diff --git a/libfreerdp/primitives/prim_YCoCg_opt.c b/libfreerdp/primitives/prim_YCoCg_opt.c new file mode 100644 index 0000000..bba13fa --- /dev/null +++ b/libfreerdp/primitives/prim_YCoCg_opt.c @@ -0,0 +1,589 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized YCoCg<->RGB conversion operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2014 Hewlett-Packard Development Company, L.P. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include <winpr/sysinfo.h> + +#ifdef WITH_SSE2 +#include <emmintrin.h> +#include <tmmintrin.h> +#elif defined(WITH_NEON) +#include <arm_neon.h> +#endif /* WITH_SSE2 else WITH_NEON */ + +#include "prim_internal.h" +#include "prim_templates.h" + +static primitives_t* generic = NULL; + +#ifdef WITH_SSE2 +/* ------------------------------------------------------------------------- */ +static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep, + BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, + UINT32 dstStep, UINT32 width, UINT32 height, + UINT8 shift, BOOL withAlpha) +{ + const BYTE* sptr = pSrc; + BYTE* dptr = (BYTE*)pDst; + int sRowBump = srcStep - width * sizeof(UINT32); + int dRowBump = dstStep - width * sizeof(UINT32); + /* Shift left by "shift" and divide by two is the same as shift + * left by "shift-1". + */ + int dataShift = shift - 1; + BYTE mask = (BYTE)(0xFFU << dataShift); + + /* Let's say the data is of the form: + * y0y0o0g0 a1y1o1g1 a2y2o2g2... + * Apply: + * |R| | 1 1/2 -1/2 | |y| + * |G| = | 1 0 1/2 | * |o| + * |B| | 1 -1/2 -1/2 | |g| + * where Y is 8-bit unsigned and o & g are 8-bit signed. + */ + + if ((width < 8) || (ULONG_PTR)dptr & 0x03) + { + /* Too small, or we'll never hit a 16-byte boundary. Punt. */ + return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, withAlpha); + } + + for (UINT32 h = 0; h < height; h++) + { + UINT32 w = width; + BOOL onStride = 0; + + /* Get to a 16-byte destination boundary. */ + if ((ULONG_PTR)dptr & 0x0f) + { + pstatus_t status = 0; + UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4; + + if (startup > width) + startup = width; + + status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup, + 1, shift, withAlpha); + + if (status != PRIMITIVES_SUCCESS) + return status; + + sptr += startup * sizeof(UINT32); + dptr += startup * sizeof(UINT32); + w -= startup; + } + + /* Each loop handles eight pixels at a time. */ + onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE; + + while (w >= 8) + { + __m128i R0; + __m128i R1; + __m128i R2; + __m128i R3; + __m128i R4; + __m128i R5; + __m128i R6; + __m128i R7; + + if (onStride) + { + /* The faster path, 16-byte aligned load. */ + R0 = _mm_load_si128((const __m128i*)sptr); + sptr += (128 / 8); + R1 = _mm_load_si128((const __m128i*)sptr); + sptr += (128 / 8); + } + else + { + /* Off-stride, slower LDDQU load. */ + R0 = _mm_lddqu_si128((const __m128i*)sptr); + sptr += (128 / 8); + R1 = _mm_lddqu_si128((const __m128i*)sptr); + sptr += (128 / 8); + } + + /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */ + /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */ + /* Shuffle to pack all the like types together. */ + R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); + R3 = _mm_shuffle_epi8(R0, R2); + R4 = _mm_shuffle_epi8(R1, R2); + /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */ + /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */ + R5 = _mm_unpackhi_epi32(R3, R4); + R6 = _mm_unpacklo_epi32(R3, R4); + + /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */ + /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */ + /* Save alphas aside */ + if (withAlpha) + R7 = _mm_unpackhi_epi64(R5, R5); + else + R7 = _mm_set1_epi32(0xFFFFFFFFU); + + /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */ + /* Expand Y's from 8-bit unsigned to 16-bit signed. 
*/ + R1 = _mm_set1_epi32(0); + R0 = _mm_unpacklo_epi8(R5, R1); + /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */ + /* Shift Co's and Cg's by (shift-1). -1 covers division by two. + * Note: this must be done before sign-conversion. + * Note also there is no slli_epi8, so we have to use a 16-bit + * version and then mask. + */ + R6 = _mm_slli_epi16(R6, dataShift); + R1 = _mm_set1_epi8(mask); + R6 = _mm_and_si128(R6, R1); + /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */ + /* Expand Co's from 8-bit signed to 16-bit signed */ + R1 = _mm_unpackhi_epi8(R6, R6); + R1 = _mm_srai_epi16(R1, 8); + /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */ + /* Expand Cg's form 8-bit signed to 16-bit signed */ + R2 = _mm_unpacklo_epi8(R6, R6); + R2 = _mm_srai_epi16(R2, 8); + /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */ + /* Get Y - halfCg and save */ + R6 = _mm_subs_epi16(R0, R2); + /* R = (Y-halfCg) + halfCo */ + R3 = _mm_adds_epi16(R6, R1); + /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */ + /* G = Y + Cg(/2) */ + R4 = _mm_adds_epi16(R0, R2); + /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */ + /* B = (Y-halfCg) - Co(/2) */ + R5 = _mm_subs_epi16(R6, R1); + /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */ + /* Repack R's & B's. */ + R0 = _mm_packus_epi16(R3, R5); + /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */ + /* Repack G's. */ + R1 = _mm_packus_epi16(R4, R4); + /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */ + /* And add the A's. */ + R1 = _mm_unpackhi_epi64(R1, R7); + /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */ + /* Now do interleaving again. */ + R2 = _mm_unpacklo_epi8(R0, R1); + /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */ + R3 = _mm_unpackhi_epi8(R0, R1); + /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */ + R4 = _mm_unpacklo_epi16(R2, R3); + /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */ + R5 = _mm_unpackhi_epi16(R2, R3); + /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */ + _mm_store_si128((__m128i*)dptr, R4); + dptr += (128 / 8); + _mm_store_si128((__m128i*)dptr, R5); + dptr += (128 / 8); + w -= 8; + } + + /* Handle any remainder pixels. */ + if (w > 0) + { + pstatus_t status = 0; + status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1, + shift, withAlpha); + + if (status != PRIMITIVES_SUCCESS) + return status; + + sptr += w * sizeof(UINT32); + dptr += w * sizeof(UINT32); + } + + sptr += sRowBump; + dptr += dRowBump; + } + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, + UINT32 DstFormat, UINT32 dstStep, UINT32 width, + UINT32 height, UINT8 shift, BOOL withAlpha) +{ + const BYTE* sptr = pSrc; + BYTE* dptr = (BYTE*)pDst; + int sRowBump = srcStep - width * sizeof(UINT32); + int dRowBump = dstStep - width * sizeof(UINT32); + /* Shift left by "shift" and divide by two is the same as shift + * left by "shift-1". + */ + int dataShift = shift - 1; + BYTE mask = (BYTE)(0xFFU << dataShift); + + /* Let's say the data is of the form: + * y0y0o0g0 a1y1o1g1 a2y2o2g2... + * Apply: + * |R| | 1 1/2 -1/2 | |y| + * |G| = | 1 0 1/2 | * |o| + * |B| | 1 -1/2 -1/2 | |g| + * where Y is 8-bit unsigned and o & g are 8-bit signed. + */ + + if ((width < 8) || (ULONG_PTR)dptr & 0x03) + { + /* Too small, or we'll never hit a 16-byte boundary. Punt. 
*/ + return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, withAlpha); + } + + for (UINT32 h = 0; h < height; h++) + { + int w = width; + BOOL onStride = 0; + + /* Get to a 16-byte destination boundary. */ + if ((ULONG_PTR)dptr & 0x0f) + { + pstatus_t status = 0; + UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4; + + if (startup > width) + startup = width; + + status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup, + 1, shift, withAlpha); + + if (status != PRIMITIVES_SUCCESS) + return status; + + sptr += startup * sizeof(UINT32); + dptr += startup * sizeof(UINT32); + w -= startup; + } + + /* Each loop handles eight pixels at a time. */ + onStride = (((const ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE; + + while (w >= 8) + { + __m128i R0; + __m128i R1; + __m128i R2; + __m128i R3; + __m128i R4; + __m128i R5; + __m128i R6; + __m128i R7; + + if (onStride) + { + /* The faster path, 16-byte aligned load. */ + R0 = _mm_load_si128((const __m128i*)sptr); + sptr += (128 / 8); + R1 = _mm_load_si128((const __m128i*)sptr); + sptr += (128 / 8); + } + else + { + /* Off-stride, slower LDDQU load. */ + R0 = _mm_lddqu_si128((const __m128i*)sptr); + sptr += (128 / 8); + R1 = _mm_lddqu_si128((const __m128i*)sptr); + sptr += (128 / 8); + } + + /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */ + /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */ + /* Shuffle to pack all the like types together. */ + R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); + R3 = _mm_shuffle_epi8(R0, R2); + R4 = _mm_shuffle_epi8(R1, R2); + /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */ + /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */ + R5 = _mm_unpackhi_epi32(R3, R4); + R6 = _mm_unpacklo_epi32(R3, R4); + + /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */ + /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */ + /* Save alphas aside */ + if (withAlpha) + R7 = _mm_unpackhi_epi64(R5, R5); + else + R7 = _mm_set1_epi32(0xFFFFFFFFU); + + /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */ + /* Expand Y's from 8-bit unsigned to 16-bit signed. */ + R1 = _mm_set1_epi32(0); + R0 = _mm_unpacklo_epi8(R5, R1); + /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */ + /* Shift Co's and Cg's by (shift-1). -1 covers division by two. + * Note: this must be done before sign-conversion. + * Note also there is no slli_epi8, so we have to use a 16-bit + * version and then mask. + */ + R6 = _mm_slli_epi16(R6, dataShift); + R1 = _mm_set1_epi8(mask); + R6 = _mm_and_si128(R6, R1); + /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */ + /* Expand Co's from 8-bit signed to 16-bit signed */ + R1 = _mm_unpackhi_epi8(R6, R6); + R1 = _mm_srai_epi16(R1, 8); + /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */ + /* Expand Cg's form 8-bit signed to 16-bit signed */ + R2 = _mm_unpacklo_epi8(R6, R6); + R2 = _mm_srai_epi16(R2, 8); + /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */ + /* Get Y - halfCg and save */ + R6 = _mm_subs_epi16(R0, R2); + /* R = (Y-halfCg) + halfCo */ + R3 = _mm_adds_epi16(R6, R1); + /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */ + /* G = Y + Cg(/2) */ + R4 = _mm_adds_epi16(R0, R2); + /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */ + /* B = (Y-halfCg) - Co(/2) */ + R5 = _mm_subs_epi16(R6, R1); + /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */ + /* Repack R's & B's. */ + /* This line is the only diff between inverted and non-inverted. + * Unfortunately, it would be expensive to check "inverted" + * every time through this loop. 
+ */ + R0 = _mm_packus_epi16(R5, R3); + /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */ + /* Repack G's. */ + R1 = _mm_packus_epi16(R4, R4); + /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */ + /* And add the A's. */ + R1 = _mm_unpackhi_epi64(R1, R7); + /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */ + /* Now do interleaving again. */ + R2 = _mm_unpacklo_epi8(R0, R1); + /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */ + R3 = _mm_unpackhi_epi8(R0, R1); + /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */ + R4 = _mm_unpacklo_epi16(R2, R3); + /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */ + R5 = _mm_unpackhi_epi16(R2, R3); + /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */ + _mm_store_si128((__m128i*)dptr, R4); + dptr += (128 / 8); + _mm_store_si128((__m128i*)dptr, R5); + dptr += (128 / 8); + w -= 8; + } + + /* Handle any remainder pixels. */ + if (w > 0) + { + pstatus_t status = 0; + status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1, + shift, withAlpha); + + if (status != PRIMITIVES_SUCCESS) + return status; + + sptr += w * sizeof(UINT32); + dptr += w * sizeof(UINT32); + } + + sptr += sRowBump; + dptr += dRowBump; + } + + return PRIMITIVES_SUCCESS; +} +#endif /* WITH_SSE2 */ + +#ifdef WITH_SSE2 +/* ------------------------------------------------------------------------- */ +static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, + BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, + INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, + BOOL withAlpha) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width, + height, shift, withAlpha); + + case PIXEL_FORMAT_RGBX32: + case PIXEL_FORMAT_RGBA32: + return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, DstFormat, dstStep, + width, height, shift, withAlpha); + + default: + return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, + height, shift, withAlpha); + } +} +#elif defined(WITH_NEON) + +static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, + BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep, + UINT32 width, UINT32 height, UINT8 shift, BYTE bPos, + BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha) +{ + BYTE* dptr = pDst; + const BYTE* sptr = pSrc; + const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); + const int8_t cll = shift - 1; /* -1 builds in the /2's */ + const UINT32 srcPad = srcStep - (width * 4); + const UINT32 dstPad = dstStep - (width * formatSize); + const UINT32 pad = width % 8; + const uint8x8_t aVal = vdup_n_u8(0xFF); + const int8x8_t cllv = vdup_n_s8(cll); + + for (UINT32 y = 0; y < height; y++) + { + for (UINT32 x = 0; x < width - pad; x += 8) + { + /* Note: shifts must be done before sign-conversion. 
*/ + const uint8x8x4_t raw = vld4_u8(sptr); + const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv)); + const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv)); + const int16x8_t Cg = vmovl_s8(CgRaw); + const int16x8_t Co = vmovl_s8(CoRaw); + const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */ + const int16x8_t T = vsubq_s16(Y, Cg); + const int16x8_t R = vaddq_s16(T, Co); + const int16x8_t G = vaddq_s16(Y, Cg); + const int16x8_t B = vsubq_s16(T, Co); + uint8x8x4_t bgrx; + bgrx.val[bPos] = vqmovun_s16(B); + bgrx.val[gPos] = vqmovun_s16(G); + bgrx.val[rPos] = vqmovun_s16(R); + + if (alpha) + bgrx.val[aPos] = raw.val[3]; + else + bgrx.val[aPos] = aVal; + + vst4_u8(dptr, bgrx); + sptr += sizeof(raw); + dptr += sizeof(bgrx); + } + + for (UINT32 x = 0; x < pad; x++) + { + /* Note: shifts must be done before sign-conversion. */ + const INT16 Cg = (INT16)((INT8)((*sptr++) << cll)); + const INT16 Co = (INT16)((INT8)((*sptr++) << cll)); + const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */ + const INT16 T = Y - Cg; + const INT16 R = T + Co; + const INT16 G = Y + Cg; + const INT16 B = T - Co; + BYTE bgra[4]; + bgra[bPos] = CLIP(B); + bgra[gPos] = CLIP(G); + bgra[rPos] = CLIP(R); + bgra[aPos] = *sptr++; + + if (!alpha) + bgra[aPos] = 0xFF; + + *dptr++ = bgra[0]; + *dptr++ = bgra[1]; + *dptr++ = bgra[2]; + *dptr++ = bgra[3]; + } + + sptr += srcPad; + dptr += dstPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, + BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep, + UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 2, 1, 0, 3, withAlpha); + + case PIXEL_FORMAT_BGRX32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 2, 1, 0, 3, withAlpha); + + case PIXEL_FORMAT_RGBA32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 0, 1, 2, 3, withAlpha); + + case PIXEL_FORMAT_RGBX32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 0, 1, 2, 3, withAlpha); + + case PIXEL_FORMAT_ARGB32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 1, 2, 3, 0, withAlpha); + + case PIXEL_FORMAT_XRGB32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 1, 2, 3, 0, withAlpha); + + case PIXEL_FORMAT_ABGR32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 3, 2, 1, 0, withAlpha); + + case PIXEL_FORMAT_XBGR32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 3, 2, 1, 0, withAlpha); + + default: + return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, + height, shift, withAlpha); + } +} +#endif /* WITH_SSE2 */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims) +{ + generic = primitives_get_generic(); + primitives_init_YCoCg(prims); + /* While IPP acknowledges the existence of YCoCg-R, it doesn't currently + * include any routines to work with it, especially with variable shift + * width. 
+ */ +#if defined(WITH_SSE2) + + if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && + IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) + { + prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R; + } + +#elif defined(WITH_NEON) + + if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) + { + prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R; + } + +#endif /* WITH_SSE2 */ +} diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c new file mode 100644 index 0000000..ec02139 --- /dev/null +++ b/libfreerdp/primitives/prim_YUV.c @@ -0,0 +1,1877 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Generic YUV/RGB conversion operations + * + * Copyright 2014 Marc-Andre Moreau <marcandre.moreau@gmail.com> + * Copyright 2015-2017 Armin Novak <armin.novak@thincast.com> + * Copyright 2015-2017 Norbert Federa <norbert.federa@thincast.com> + * Copyright 2015-2017 Vic Lee + * Copyright 2015-2017 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <winpr/wtypes.h> + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include <freerdp/codec/color.h> +#include "prim_internal.h" + +static pstatus_t general_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3], + const UINT32 dstStep[3], + const RECTANGLE_16* WINPR_RESTRICT roi) +{ + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 oddX = 1; + const UINT32 evenX = 0; + const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left, + pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, + pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 }; + BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left, + pDstRaw[1] + roi->top * dstStep[1] + roi->left, + pDstRaw[2] + roi->top * dstStep[2] + roi->left }; + + /* Y data is already here... */ + /* B1 */ + for (UINT32 y = 0; y < nHeight; y++) + { + const BYTE* Ym = pSrc[0] + srcStep[0] * y; + BYTE* pY = pDst[0] + dstStep[0] * y; + memcpy(pY, Ym, nWidth); + } + + /* The first half of U, V are already here part of this frame. 
*/ + /* B2 and B3 */ + for (UINT32 y = 0; y < halfHeight; y++) + { + const UINT32 val2y = (2 * y + evenY); + const UINT32 val2y1 = val2y + oddY; + const BYTE* Um = pSrc[1] + srcStep[1] * y; + const BYTE* Vm = pSrc[2] + srcStep[2] * y; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; + BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; + + for (UINT32 x = 0; x < halfWidth; x++) + { + const UINT32 val2x = 2 * x + evenX; + const UINT32 val2x1 = val2x + oddX; + pU[val2x] = Um[x]; + pV[val2x] = Vm[x]; + pU[val2x1] = Um[x]; + pV[val2x1] = Vm[x]; + pU1[val2x] = Um[x]; + pV1[val2x] = Vm[x]; + pU1[val2x1] = Um[x]; + pV1[val2x1] = Vm[x]; + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t general_ChromaFilter(BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* WINPR_RESTRICT roi) +{ + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 halfWidth = (nWidth + 1) / 2; + + /* Filter */ + for (UINT32 y = roi->top; y < halfHeight + roi->top; y++) + { + const UINT32 val2y = (y * 2 + evenY); + const UINT32 val2y1 = val2y + oddY; + BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; + BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + + if (val2y1 > nHeight) + continue; + + for (UINT32 x = roi->left; x < halfWidth + roi->left; x++) + { + const UINT32 val2x = (x * 2); + const UINT32 val2x1 = val2x + 1; + const BYTE inU = pU[val2x]; + const BYTE inV = pV[val2x]; + const INT32 up = inU * 4; + const INT32 vp = inV * 4; + INT32 u2020 = 0; + INT32 v2020 = 0; + + if (val2x1 > nWidth) + continue; + + u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1]; + v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1]; + + pU[val2x] = CONDITIONAL_CLIP(u2020, inU); + pV[val2x] = CONDITIONAL_CLIP(v2020, inV); + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t general_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3], + const UINT32 dstStep[3], + const RECTANGLE_16* WINPR_RESTRICT roi) +{ + const UINT32 mod = 16; + UINT32 uY = 0; + UINT32 vY = 0; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth) / 2; + const UINT32 halfHeight = (nHeight) / 2; + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 oddX = 1; + /* The auxilary frame is aligned to multiples of 16x16. + * We need the padded height for B4 and B5 conversion. */ + const UINT32 padHeigth = nHeight + 16 - nHeight % 16; + const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left, + pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, + pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 }; + BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left, + pDstRaw[1] + roi->top * dstStep[1] + roi->left, + pDstRaw[2] + roi->top * dstStep[2] + roi->left }; + + /* The second half of U and V is a bit more tricky... 
*/ + /* B4 and B5 */ + for (UINT32 y = 0; y < padHeigth; y++) + { + const BYTE* Ya = pSrc[0] + srcStep[0] * y; + BYTE* pX = NULL; + + if ((y) % mod < (mod + 1) / 2) + { + const UINT32 pos = (2 * uY++ + oddY); + + if (pos >= nHeight) + continue; + + pX = pDst[1] + dstStep[1] * pos; + } + else + { + const UINT32 pos = (2 * vY++ + oddY); + + if (pos >= nHeight) + continue; + + pX = pDst[2] + dstStep[2] * pos; + } + + memcpy(pX, Ya, nWidth); + } + + /* B6 and B7 */ + for (UINT32 y = 0; y < halfHeight; y++) + { + const UINT32 val2y = (y * 2 + evenY); + const BYTE* Ua = pSrc[1] + srcStep[1] * y; + const BYTE* Va = pSrc[2] + srcStep[2] * y; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + + for (UINT32 x = 0; x < halfWidth; x++) + { + const UINT32 val2x1 = (x * 2 + oddX); + pU[val2x1] = Ua[x]; + pV[val2x1] = Va[x]; + } + } + + /* Filter */ + return general_ChromaFilter(pDst, dstStep, roi); +} + +static pstatus_t general_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], UINT32 nTotalWidth, + UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3], + const UINT32 dstStep[3], + const RECTANGLE_16* WINPR_RESTRICT roi) +{ + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 quaterWidth = (nWidth + 3) / 4; + + /* B4 and B5: odd UV values for width/2, height */ + for (UINT32 y = 0; y < nHeight; y++) + { + const UINT32 yTop = y + roi->top; + const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2; + const BYTE* pYaV = pYaU + nTotalWidth / 2; + BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left; + BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left; + + for (UINT32 x = 0; x < halfWidth; x++) + { + const UINT32 odd = 2 * x + 1; + pU[odd] = *pYaU++; + pV[odd] = *pYaV++; + } + } + + /* B6 - B9 */ + for (UINT32 y = 0; y < halfHeight; y++) + { + const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4; + const BYTE* pUaV = pUaU + nTotalWidth / 4; + const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4; + const BYTE* pVaV = pVaU + nTotalWidth / 4; + BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left; + BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left; + + for (UINT32 x = 0; x < quaterWidth; x++) + { + pU[4 * x + 0] = *pUaU++; + pV[4 * x + 0] = *pUaV++; + pU[4 * x + 2] = *pVaU++; + pV[4 * x + 2] = *pVaV++; + } + } + + return general_ChromaFilter(pDst, dstStep, roi); +} + +static pstatus_t general_YUV420CombineToYUV444(avc444_frame_type type, + const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], UINT32 nWidth, + UINT32 nHeight, BYTE* WINPR_RESTRICT pDst[3], + const UINT32 dstStep[3], + const RECTANGLE_16* WINPR_RESTRICT roi) +{ + if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2]) + return -1; + + if (!pDst || !pDst[0] || !pDst[1] || !pDst[2]) + return -1; + + if (!roi) + return -1; + + switch (type) + { + case AVC444_LUMA: + return general_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi); + + case AVC444_CHROMAv1: + return general_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi); + + case AVC444_CHROMAv2: + return general_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi); + + default: + return -1; + } +} + +static pstatus_t +general_YUV444SplitToYUV420(const BYTE* const WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3], + BYTE* WINPR_RESTRICT pMainDst[3], const UINT32 
dstMainStep[3], + BYTE* WINPR_RESTRICT pAuxDst[3], const UINT32 dstAuxStep[3], + const prim_size_t* WINPR_RESTRICT roi) +{ + UINT32 uY = 0; + UINT32 vY = 0; + UINT32 halfWidth = 0; + UINT32 halfHeight = 0; + /* The auxilary frame is aligned to multiples of 16x16. + * We need the padded height for B4 and B5 conversion. */ + const UINT32 padHeigth = roi->height + 16 - roi->height % 16; + halfWidth = (roi->width + 1) / 2; + halfHeight = (roi->height + 1) / 2; + + /* B1 */ + for (UINT32 y = 0; y < roi->height; y++) + { + const BYTE* pSrcY = pSrc[0] + y * srcStep[0]; + BYTE* pY = pMainDst[0] + y * dstMainStep[0]; + memcpy(pY, pSrcY, roi->width); + } + + /* B2 and B3 */ + for (UINT32 y = 0; y < halfHeight; y++) + { + const BYTE* pSrcU = pSrc[1] + 2 * y * srcStep[1]; + const BYTE* pSrcV = pSrc[2] + 2 * y * srcStep[2]; + const BYTE* pSrcU1 = pSrc[1] + (2 * y + 1) * srcStep[1]; + const BYTE* pSrcV1 = pSrc[2] + (2 * y + 1) * srcStep[2]; + BYTE* pU = pMainDst[1] + y * dstMainStep[1]; + BYTE* pV = pMainDst[2] + y * dstMainStep[2]; + + for (UINT32 x = 0; x < halfWidth; x++) + { + /* Filter */ + const INT32 u = pSrcU[2 * x] + pSrcU[2 * x + 1] + pSrcU1[2 * x] + pSrcU1[2 * x + 1]; + const INT32 v = pSrcV[2 * x] + pSrcV[2 * x + 1] + pSrcV1[2 * x] + pSrcV1[2 * x + 1]; + pU[x] = CLIP(u / 4L); + pV[x] = CLIP(v / 4L); + } + } + + /* B4 and B5 */ + for (UINT32 y = 0; y < padHeigth; y++) + { + BYTE* pY = pAuxDst[0] + y * dstAuxStep[0]; + + if (y % 16 < 8) + { + const UINT32 pos = (2 * uY++ + 1); + const BYTE* pSrcU = pSrc[1] + pos * srcStep[1]; + + if (pos >= roi->height) + continue; + + memcpy(pY, pSrcU, roi->width); + } + else + { + const UINT32 pos = (2 * vY++ + 1); + const BYTE* pSrcV = pSrc[2] + pos * srcStep[2]; + + if (pos >= roi->height) + continue; + + memcpy(pY, pSrcV, roi->width); + } + } + + /* B6 and B7 */ + for (UINT32 y = 0; y < halfHeight; y++) + { + const BYTE* pSrcU = pSrc[1] + 2 * y * srcStep[1]; + const BYTE* pSrcV = pSrc[2] + 2 * y * srcStep[2]; + BYTE* pU = pAuxDst[1] + y * dstAuxStep[1]; + BYTE* pV = pAuxDst[2] + y * dstAuxStep[2]; + + for (UINT32 x = 0; x < halfWidth; x++) + { + pU[x] = pSrcU[2 * x + 1]; + pV[x] = pSrcV[2 * x + 1]; + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t general_YUV444ToRGB_8u_P3AC4R_general(const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], + BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, + UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); + fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE); + + WINPR_ASSERT(pSrc); + WINPR_ASSERT(pDst); + WINPR_ASSERT(roi); + + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + + for (UINT32 y = 0; y < nHeight; y++) + { + const BYTE* pY = pSrc[0] + y * srcStep[0]; + const BYTE* pU = pSrc[1] + y * srcStep[1]; + const BYTE* pV = pSrc[2] + y * srcStep[2]; + BYTE* pRGB = pDst + y * dstStep; + + for (UINT32 x = 0; x < nWidth; x++) + { + const BYTE Y = pY[x]; + const BYTE U = pU[x]; + const BYTE V = pV[x]; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0); + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t general_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], + BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, + UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + const DWORD formatSize = 
FreeRDPGetBytesPerPixel(DstFormat); + + WINPR_ASSERT(pSrc); + WINPR_ASSERT(pDst); + WINPR_ASSERT(roi); + + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + + for (UINT32 y = 0; y < nHeight; y++) + { + const BYTE* pY = pSrc[0] + y * srcStep[0]; + const BYTE* pU = pSrc[1] + y * srcStep[1]; + const BYTE* pV = pSrc[2] + y * srcStep[2]; + BYTE* pRGB = pDst + y * dstStep; + + for (UINT32 x = 0; x < nWidth; x++) + { + const BYTE Y = pY[x]; + const BYTE U = pU[x]; + const BYTE V = pV[x]; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + pRGB = writePixelBGRX(pRGB, formatSize, DstFormat, r, g, b, 0); + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t general_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return general_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + + default: + return general_YUV444ToRGB_8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat, + roi); + } +} +/** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ +static pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + UINT32 dstPad = 0; + UINT32 srcPad[3]; + BYTE Y = 0; + BYTE U = 0; + BYTE V = 0; + UINT32 halfWidth = 0; + UINT32 halfHeight = 0; + const BYTE* pY = NULL; + const BYTE* pU = NULL; + const BYTE* pV = NULL; + BYTE* pRGB = pDst; + UINT32 nWidth = 0; + UINT32 nHeight = 0; + UINT32 lastRow = 0; + UINT32 lastCol = 0; + const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); + fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE); + pY = pSrc[0]; + pU = pSrc[1]; + pV = pSrc[2]; + lastCol = roi->width & 0x01; + lastRow = roi->height & 0x01; + nWidth = (roi->width + 1) & ~0x0001; + nHeight = (roi->height + 1) & ~0x0001; + halfWidth = nWidth / 2; + halfHeight = nHeight / 2; + srcPad[0] = (srcStep[0] - nWidth); + srcPad[1] = (srcStep[1] - halfWidth); + srcPad[2] = (srcStep[2] - halfWidth); + dstPad = (dstStep - (nWidth * 4)); + + for (UINT32 y = 0; y < halfHeight;) + { + if (++y == halfHeight) + lastRow <<= 1; + + for (UINT32 x = 0; x < halfWidth;) + { + BYTE r = 0; + BYTE g = 0; + BYTE b = 0; + + if (++x == halfWidth) + lastCol <<= 1; + + U = *pU++; + V = *pV++; + /* 1st pixel */ + Y = *pY++; + r = YUV2R(Y, U, V); + g = YUV2G(Y, U, V); + b = YUV2B(Y, U, V); + pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0); + + /* 2nd pixel */ + if (!(lastCol & 0x02)) + { + Y = *pY++; + r = YUV2R(Y, U, V); + g = YUV2G(Y, U, V); + b = YUV2B(Y, U, V); + pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0); + } + else + { + pY++; + pRGB += formatSize; + lastCol >>= 1; + } + } + + pY += srcPad[0]; + pU -= halfWidth; + pV -= halfWidth; + pRGB += dstPad; + + if (lastRow & 0x02) + break; + + for (UINT32 x = 0; x < halfWidth;) + { + BYTE r = 0; + BYTE g = 0; + BYTE b = 0; + + if (++x == halfWidth) + lastCol <<= 1; + + U = *pU++; + V = *pV++; + /* 3rd pixel */ + Y = *pY++; + r = YUV2R(Y, U, V); + g = YUV2G(Y, U, V); + b = YUV2B(Y, U, V); + pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0); + + /* 4th pixel */ + if 
(!(lastCol & 0x02)) + { + Y = *pY++; + r = YUV2R(Y, U, V); + g = YUV2G(Y, U, V); + b = YUV2B(Y, U, V); + pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0); + } + else + { + pY++; + pRGB += formatSize; + lastCol >>= 1; + } + } + + pY += srcPad[0]; + pU += srcPad[1]; + pV += srcPad[2]; + pRGB += dstPad; + } + + return PRIMITIVES_SUCCESS; +} + +/** + * | Y | ( | 54 183 18 | | R | ) | 0 | + * | U | = ( | -29 -99 128 | | G | ) >> 8 + | 128 | + * | V | ( | 128 -116 -12 | | B | ) | 128 | + */ +static INLINE BYTE RGB2Y(BYTE R, BYTE G, BYTE B) +{ + return (54 * R + 183 * G + 18 * B) >> 8; +} + +static INLINE BYTE RGB2U(BYTE R, BYTE G, BYTE B) +{ + return ((-29 * R - 99 * G + 128 * B) >> 8) + 128; +} + +static INLINE BYTE RGB2V(INT32 R, INT32 G, INT32 B) +{ + return ((128 * R - 116 * G - 12 * B) >> 8) + 128; +} + +static pstatus_t general_RGBToYUV444_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc, UINT32 SrcFormat, + const UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[3], + UINT32 dstStep[3], + const prim_size_t* WINPR_RESTRICT roi) +{ + const UINT32 bpp = FreeRDPGetBytesPerPixel(SrcFormat); + UINT32 nWidth = 0; + UINT32 nHeight = 0; + nWidth = roi->width; + nHeight = roi->height; + + for (UINT32 y = 0; y < nHeight; y++) + { + const BYTE* pRGB = pSrc + y * srcStep; + BYTE* pY = pDst[0] + y * dstStep[0]; + BYTE* pU = pDst[1] + y * dstStep[1]; + BYTE* pV = pDst[2] + y * dstStep[2]; + + for (UINT32 x = 0; x < nWidth; x++) + { + BYTE B = 0; + BYTE G = 0; + BYTE R = 0; + const UINT32 color = FreeRDPReadColor(&pRGB[x * bpp], SrcFormat); + FreeRDPSplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL); + pY[x] = RGB2Y(R, G, B); + pU[x] = RGB2U(R, G, B); + pV[x] = RGB2V(R, G, B); + } + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE pstatus_t general_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep, + BYTE* WINPR_RESTRICT pDst[3], + const UINT32 dstStep[3], + const prim_size_t* WINPR_RESTRICT roi) +{ + UINT32 i = 0; + size_t x1 = 0; + size_t x2 = 4; + size_t x3 = srcStep; + size_t x4 = srcStep + 4; + size_t y1 = 0; + size_t y2 = 1; + size_t y3 = dstStep[0]; + size_t y4 = dstStep[0] + 1; + UINT32 max_x = roi->width - 1; + UINT32 max_y = roi->height - 1; + + for (UINT32 y = i = 0; y < roi->height; y += 2, i++) + { + const BYTE* src = pSrc + y * srcStep; + BYTE* ydst = pDst[0] + y * dstStep[0]; + BYTE* udst = pDst[1] + i * dstStep[1]; + BYTE* vdst = pDst[2] + i * dstStep[2]; + + for (UINT32 x = 0; x < roi->width; x += 2) + { + BYTE R = 0; + BYTE G = 0; + BYTE B = 0; + INT32 Ra = 0; + INT32 Ga = 0; + INT32 Ba = 0; + /* row 1, pixel 1 */ + Ba = B = *(src + x1 + 0); + Ga = G = *(src + x1 + 1); + Ra = R = *(src + x1 + 2); + ydst[y1] = RGB2Y(R, G, B); + + if (x < max_x) + { + /* row 1, pixel 2 */ + Ba += B = *(src + x2 + 0); + Ga += G = *(src + x2 + 1); + Ra += R = *(src + x2 + 2); + ydst[y2] = RGB2Y(R, G, B); + } + + if (y < max_y) + { + /* row 2, pixel 1 */ + Ba += B = *(src + x3 + 0); + Ga += G = *(src + x3 + 1); + Ra += R = *(src + x3 + 2); + ydst[y3] = RGB2Y(R, G, B); + + if (x < max_x) + { + /* row 2, pixel 2 */ + Ba += B = *(src + x4 + 0); + Ga += G = *(src + x4 + 1); + Ra += R = *(src + x4 + 2); + ydst[y4] = RGB2Y(R, G, B); + } + } + + Ba >>= 2; + Ga >>= 2; + Ra >>= 2; + *udst++ = RGB2U(Ra, Ga, Ba); + *vdst++ = RGB2V(Ra, Ga, Ba); + ydst += 2; + src += 8; + } + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE pstatus_t general_RGBToYUV420_RGBX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep, + BYTE* WINPR_RESTRICT pDst[3], + const UINT32 dstStep[3], + const prim_size_t* WINPR_RESTRICT 
roi) +{ + size_t x1 = 0; + size_t x2 = 4; + size_t x3 = srcStep; + size_t x4 = srcStep + 4; + size_t y1 = 0; + size_t y2 = 1; + size_t y3 = dstStep[0]; + size_t y4 = dstStep[0] + 1; + UINT32 max_x = roi->width - 1; + UINT32 max_y = roi->height - 1; + + for (UINT32 y = 0, i = 0; y < roi->height; y += 2, i++) + { + const BYTE* src = pSrc + y * srcStep; + BYTE* ydst = pDst[0] + y * dstStep[0]; + BYTE* udst = pDst[1] + i * dstStep[1]; + BYTE* vdst = pDst[2] + i * dstStep[2]; + + for (UINT32 x = 0; x < roi->width; x += 2) + { + BYTE R = 0; + BYTE G = 0; + BYTE B = 0; + INT32 Ra = 0; + INT32 Ga = 0; + INT32 Ba = 0; + /* row 1, pixel 1 */ + Ra = R = *(src + x1 + 0); + Ga = G = *(src + x1 + 1); + Ba = B = *(src + x1 + 2); + ydst[y1] = RGB2Y(R, G, B); + + if (x < max_x) + { + /* row 1, pixel 2 */ + Ra += R = *(src + x2 + 0); + Ga += G = *(src + x2 + 1); + Ba += B = *(src + x2 + 2); + ydst[y2] = RGB2Y(R, G, B); + } + + if (y < max_y) + { + /* row 2, pixel 1 */ + Ra += R = *(src + x3 + 0); + Ga += G = *(src + x3 + 1); + Ba += B = *(src + x3 + 2); + ydst[y3] = RGB2Y(R, G, B); + + if (x < max_x) + { + /* row 2, pixel 2 */ + Ra += R = *(src + x4 + 0); + Ga += G = *(src + x4 + 1); + Ba += B = *(src + x4 + 2); + ydst[y4] = RGB2Y(R, G, B); + } + } + + Ba >>= 2; + Ga >>= 2; + Ra >>= 2; + *udst++ = RGB2U(Ra, Ga, Ba); + *vdst++ = RGB2V(Ra, Ga, Ba); + ydst += 2; + src += 8; + } + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE pstatus_t general_RGBToYUV420_ANY(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[3], + const UINT32 dstStep[3], + const prim_size_t* WINPR_RESTRICT roi) +{ + const UINT32 bpp = FreeRDPGetBytesPerPixel(srcFormat); + size_t x1 = 0; + size_t x2 = bpp; + size_t x3 = srcStep; + size_t x4 = srcStep + bpp; + size_t y1 = 0; + size_t y2 = 1; + size_t y3 = dstStep[0]; + size_t y4 = dstStep[0] + 1; + UINT32 max_x = roi->width - 1; + UINT32 max_y = roi->height - 1; + + for (UINT32 y = 0, i = 0; y < roi->height; y += 2, i++) + { + const BYTE* src = pSrc + y * srcStep; + BYTE* ydst = pDst[0] + y * dstStep[0]; + BYTE* udst = pDst[1] + i * dstStep[1]; + BYTE* vdst = pDst[2] + i * dstStep[2]; + + for (UINT32 x = 0; x < roi->width; x += 2) + { + BYTE R = 0; + BYTE G = 0; + BYTE B = 0; + INT32 Ra = 0; + INT32 Ga = 0; + INT32 Ba = 0; + UINT32 color = 0; + /* row 1, pixel 1 */ + color = FreeRDPReadColor(src + x1, srcFormat); + FreeRDPSplitColor(color, srcFormat, &R, &G, &B, NULL, NULL); + Ra = R; + Ga = G; + Ba = B; + ydst[y1] = RGB2Y(R, G, B); + + if (x < max_x) + { + /* row 1, pixel 2 */ + color = FreeRDPReadColor(src + x2, srcFormat); + FreeRDPSplitColor(color, srcFormat, &R, &G, &B, NULL, NULL); + Ra += R; + Ga += G; + Ba += B; + ydst[y2] = RGB2Y(R, G, B); + } + + if (y < max_y) + { + /* row 2, pixel 1 */ + color = FreeRDPReadColor(src + x3, srcFormat); + FreeRDPSplitColor(color, srcFormat, &R, &G, &B, NULL, NULL); + Ra += R; + Ga += G; + Ba += B; + ydst[y3] = RGB2Y(R, G, B); + + if (x < max_x) + { + /* row 2, pixel 2 */ + color = FreeRDPReadColor(src + x4, srcFormat); + FreeRDPSplitColor(color, srcFormat, &R, &G, &B, NULL, NULL); + Ra += R; + Ga += G; + Ba += B; + ydst[y4] = RGB2Y(R, G, B); + } + } + + Ra >>= 2; + Ga >>= 2; + Ba >>= 2; + *udst++ = RGB2U(Ra, Ga, Ba); + *vdst++ = RGB2V(Ra, Ga, Ba); + ydst += 2; + src += 2 * bpp; + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t general_RGBToYUV420_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[3], + const UINT32 dstStep[3], + const 
prim_size_t* WINPR_RESTRICT roi) +{ + switch (srcFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return general_RGBToYUV420_BGRX(pSrc, srcStep, pDst, dstStep, roi); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return general_RGBToYUV420_RGBX(pSrc, srcStep, pDst, dstStep, roi); + + default: + return general_RGBToYUV420_ANY(pSrc, srcFormat, srcStep, pDst, dstStep, roi); + } +} + +static INLINE void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW( + const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, + BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2, + BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5, + BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width) +{ + for (UINT32 x = 0; x < width; x += 2) + { + const BOOL lastX = (x + 1) >= width; + BYTE Y1e = 0; + BYTE Y2e = 0; + BYTE U1e = 0; + BYTE V1e = 0; + BYTE U2e = 0; + BYTE V2e = 0; + BYTE Y1o = 0; + BYTE Y2o = 0; + BYTE U1o = 0; + BYTE V1o = 0; + BYTE U2o = 0; + BYTE V2o = 0; + /* Read 4 pixels, 2 from even, 2 from odd lines */ + { + const BYTE b = *srcEven++; + const BYTE g = *srcEven++; + const BYTE r = *srcEven++; + srcEven++; + Y1e = Y2e = Y1o = Y2o = RGB2Y(r, g, b); + U1e = U2e = U1o = U2o = RGB2U(r, g, b); + V1e = V2e = V1o = V2o = RGB2V(r, g, b); + } + + if (!lastX) + { + const BYTE b = *srcEven++; + const BYTE g = *srcEven++; + const BYTE r = *srcEven++; + srcEven++; + Y2e = RGB2Y(r, g, b); + U2e = RGB2U(r, g, b); + V2e = RGB2V(r, g, b); + } + + if (b1Odd) + { + const BYTE b = *srcOdd++; + const BYTE g = *srcOdd++; + const BYTE r = *srcOdd++; + srcOdd++; + Y1o = Y2o = RGB2Y(r, g, b); + U1o = U2o = RGB2U(r, g, b); + V1o = V2o = RGB2V(r, g, b); + } + + if (b1Odd && !lastX) + { + const BYTE b = *srcOdd++; + const BYTE g = *srcOdd++; + const BYTE r = *srcOdd++; + srcOdd++; + Y2o = RGB2Y(r, g, b); + U2o = RGB2U(r, g, b); + V2o = RGB2V(r, g, b); + } + + /* We have 4 Y pixels, so store them. */ + *b1Even++ = Y1e; + *b1Even++ = Y2e; + + if (b1Odd) + { + *b1Odd++ = Y1o; + *b1Odd++ = Y2o; + } + + /* 2x 2y pixel in luma UV plane use averaging + */ + { + const BYTE Uavg = ((UINT16)U1e + (UINT16)U2e + (UINT16)U1o + (UINT16)U2o) / 4; + const BYTE Vavg = ((UINT16)V1e + (UINT16)V2e + (UINT16)V1o + (UINT16)V2o) / 4; + *b2++ = Uavg; + *b3++ = Vavg; + } + + /* UV from 2x, 2y+1 */ + if (b1Odd) + { + *b4++ = U1o; + *b5++ = V1o; + + if (!lastX) + { + *b4++ = U2o; + *b5++ = V2o; + } + } + + /* UV from 2x+1, 2y */ + if (!lastX) + { + *b6++ = U2e; + *b7++ = V2e; + } + } +} + +static INLINE pstatus_t general_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep, + BYTE* WINPR_RESTRICT pDst1[3], + const UINT32 dst1Step[3], + BYTE* WINPR_RESTRICT pDst2[3], + const UINT32 dst2Step[3], + const prim_size_t* WINPR_RESTRICT roi) +{ + /** + * Note: + * Read information in function general_RGBToAVC444YUV_ANY below ! + */ + const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep; + + for (UINT32 y = 0; y < roi->height; y += 2) + { + const BOOL last = (y >= (roi->height - 1)); + const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc; + const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc; + const UINT32 i = y >> 1; + const UINT32 n = (i & ~7) + i; + BYTE* b1Even = pDst1[0] + y * dst1Step[0]; + BYTE* b1Odd = !last ? 
(b1Even + dst1Step[0]) : NULL; + BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1]; + BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2]; + BYTE* b4 = pDst2[0] + dst2Step[0] * n; + BYTE* b5 = b4 + 8 * dst2Step[0]; + BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1]; + BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2]; + general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, + b7, roi->width); + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE void general_RGBToAVC444YUV_RGBX_DOUBLE_ROW( + const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, + BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2, + BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5, + BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width) +{ + for (UINT32 x = 0; x < width; x += 2) + { + const BOOL lastX = (x + 1) >= width; + BYTE Y1e = 0; + BYTE Y2e = 0; + BYTE U1e = 0; + BYTE V1e = 0; + BYTE U2e = 0; + BYTE V2e = 0; + BYTE Y1o = 0; + BYTE Y2o = 0; + BYTE U1o = 0; + BYTE V1o = 0; + BYTE U2o = 0; + BYTE V2o = 0; + /* Read 4 pixels, 2 from even, 2 from odd lines */ + { + const BYTE r = *srcEven++; + const BYTE g = *srcEven++; + const BYTE b = *srcEven++; + srcEven++; + Y1e = Y2e = Y1o = Y2o = RGB2Y(r, g, b); + U1e = U2e = U1o = U2o = RGB2U(r, g, b); + V1e = V2e = V1o = V2o = RGB2V(r, g, b); + } + + if (!lastX) + { + const BYTE r = *srcEven++; + const BYTE g = *srcEven++; + const BYTE b = *srcEven++; + srcEven++; + Y2e = RGB2Y(r, g, b); + U2e = RGB2U(r, g, b); + V2e = RGB2V(r, g, b); + } + + if (b1Odd) + { + const BYTE r = *srcOdd++; + const BYTE g = *srcOdd++; + const BYTE b = *srcOdd++; + srcOdd++; + Y1o = Y2o = RGB2Y(r, g, b); + U1o = U2o = RGB2U(r, g, b); + V1o = V2o = RGB2V(r, g, b); + } + + if (b1Odd && !lastX) + { + const BYTE r = *srcOdd++; + const BYTE g = *srcOdd++; + const BYTE b = *srcOdd++; + srcOdd++; + Y2o = RGB2Y(r, g, b); + U2o = RGB2U(r, g, b); + V2o = RGB2V(r, g, b); + } + + /* We have 4 Y pixels, so store them. */ + *b1Even++ = Y1e; + *b1Even++ = Y2e; + + if (b1Odd) + { + *b1Odd++ = Y1o; + *b1Odd++ = Y2o; + } + + /* 2x 2y pixel in luma UV plane use averaging + */ + { + const BYTE Uavg = ((UINT16)U1e + (UINT16)U2e + (UINT16)U1o + (UINT16)U2o) / 4; + const BYTE Vavg = ((UINT16)V1e + (UINT16)V2e + (UINT16)V1o + (UINT16)V2o) / 4; + *b2++ = Uavg; + *b3++ = Vavg; + } + + /* UV from 2x, 2y+1 */ + if (b1Odd) + { + *b4++ = U1o; + *b5++ = V1o; + + if (!lastX) + { + *b4++ = U2o; + *b5++ = V2o; + } + } + + /* UV from 2x+1, 2y */ + if (!lastX) + { + *b6++ = U2e; + *b7++ = V2e; + } + } +} + +static INLINE pstatus_t general_RGBToAVC444YUV_RGBX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep, + BYTE* WINPR_RESTRICT pDst1[3], + const UINT32 dst1Step[3], + BYTE* WINPR_RESTRICT pDst2[3], + const UINT32 dst2Step[3], + const prim_size_t* WINPR_RESTRICT roi) +{ + /** + * Note: + * Read information in function general_RGBToAVC444YUV_ANY below ! + */ + const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep; + + for (UINT32 y = 0; y < roi->height; y += 2) + { + const BOOL last = (y >= (roi->height - 1)); + const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc; + const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc; + const UINT32 i = y >> 1; + const UINT32 n = (i & ~7) + i; + BYTE* b1Even = pDst1[0] + y * dst1Step[0]; + BYTE* b1Odd = !last ? 
(b1Even + dst1Step[0]) : NULL; + BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1]; + BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2]; + BYTE* b4 = pDst2[0] + dst2Step[0] * n; + BYTE* b5 = b4 + 8 * dst2Step[0]; + BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1]; + BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2]; + general_RGBToAVC444YUV_RGBX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, + b7, roi->width); + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE void general_RGBToAVC444YUV_ANY_DOUBLE_ROW( + const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, UINT32 srcFormat, + BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2, + BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5, + BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width) +{ + const UINT32 bpp = FreeRDPGetBytesPerPixel(srcFormat); + for (UINT32 x = 0; x < width; x += 2) + { + const BOOL lastX = (x + 1) >= width; + BYTE Y1e = 0; + BYTE Y2e = 0; + BYTE U1e = 0; + BYTE V1e = 0; + BYTE U2e = 0; + BYTE V2e = 0; + BYTE Y1o = 0; + BYTE Y2o = 0; + BYTE U1o = 0; + BYTE V1o = 0; + BYTE U2o = 0; + BYTE V2o = 0; + /* Read 4 pixels, 2 from even, 2 from odd lines */ + { + BYTE r = 0; + BYTE g = 0; + BYTE b = 0; + const UINT32 color = FreeRDPReadColor(srcEven, srcFormat); + srcEven += bpp; + FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL); + Y1e = Y2e = Y1o = Y2o = RGB2Y(r, g, b); + U1e = U2e = U1o = U2o = RGB2U(r, g, b); + V1e = V2e = V1o = V2o = RGB2V(r, g, b); + } + + if (!lastX) + { + BYTE r = 0; + BYTE g = 0; + BYTE b = 0; + const UINT32 color = FreeRDPReadColor(srcEven, srcFormat); + srcEven += bpp; + FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL); + Y2e = RGB2Y(r, g, b); + U2e = RGB2U(r, g, b); + V2e = RGB2V(r, g, b); + } + + if (b1Odd) + { + BYTE r = 0; + BYTE g = 0; + BYTE b = 0; + const UINT32 color = FreeRDPReadColor(srcOdd, srcFormat); + srcOdd += bpp; + FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL); + Y1o = Y2o = RGB2Y(r, g, b); + U1o = U2o = RGB2U(r, g, b); + V1o = V2o = RGB2V(r, g, b); + } + + if (b1Odd && !lastX) + { + BYTE r = 0; + BYTE g = 0; + BYTE b = 0; + const UINT32 color = FreeRDPReadColor(srcOdd, srcFormat); + srcOdd += bpp; + FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL); + Y2o = RGB2Y(r, g, b); + U2o = RGB2U(r, g, b); + V2o = RGB2V(r, g, b); + } + + /* We have 4 Y pixels, so store them. */ + *b1Even++ = Y1e; + *b1Even++ = Y2e; + + if (b1Odd) + { + *b1Odd++ = Y1o; + *b1Odd++ = Y2o; + } + + /* 2x 2y pixel in luma UV plane use averaging + */ + { + const BYTE Uavg = ((UINT16)U1e + (UINT16)U2e + (UINT16)U1o + (UINT16)U2o) / 4; + const BYTE Vavg = ((UINT16)V1e + (UINT16)V2e + (UINT16)V1o + (UINT16)V2o) / 4; + *b2++ = Uavg; + *b3++ = Vavg; + } + + /* UV from 2x, 2y+1 */ + if (b1Odd) + { + *b4++ = U1o; + *b5++ = V1o; + + if (!lastX) + { + *b4++ = U2o; + *b5++ = V2o; + } + } + + /* UV from 2x+1, 2y */ + if (!lastX) + { + *b6++ = U2e; + *b7++ = V2e; + } + } +} + +static INLINE pstatus_t general_RGBToAVC444YUV_ANY( + const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, UINT32 srcStep, + BYTE* WINPR_RESTRICT pDst1[3], const UINT32 dst1Step[3], BYTE* WINPR_RESTRICT pDst2[3], + const UINT32 dst2Step[3], const prim_size_t* WINPR_RESTRICT roi) +{ + /** + * Note: According to [MS-RDPEGFX 2.2.4.4 RFX_AVC420_BITMAP_STREAM] the + * width and height of the MPEG-4 AVC/H.264 codec bitstream MUST be aligned + * to a multiple of 16. 
+     * Hence the passed destination YUV420/CHROMA420 buffers must have been
+     * allocated accordingly !!
+     */
+    /**
+     * [MS-RDPEGFX 3.3.8.3.2 YUV420p Stream Combination] defines the following "Bx areas":
+     *
+     * YUV420 frame (main view):
+     * B1: From Y444 all pixels
+     * B2: From U444 all pixels in even rows with even columns
+     * B3: From V444 all pixels in even rows with even columns
+     *
+     * Chroma420 frame (auxiliary view):
+     * B45: From U444 and V444 all pixels from all odd rows
+     *      (The odd U444 and V444 rows must be interleaved in 8-line blocks in B45 !!!)
+     * B6: From U444 all pixels in even rows with odd columns
+     * B7: From V444 all pixels in even rows with odd columns
+     *
+     * Microsoft's horribly unclear description in MS-RDPEGFX translated to pseudo code looks
+     * like this:
+     *
+     * for (y = 0; y < fullHeight; y++)
+     * {
+     *     for (x = 0; x < fullWidth; x++)
+     *     {
+     *         B1[x,y] = Y444[x,y];
+     *     }
+     * }
+     *
+     * for (y = 0; y < halfHeight; y++)
+     * {
+     *     for (x = 0; x < halfWidth; x++)
+     *     {
+     *         B2[x,y] = U444[2 * x, 2 * y];
+     *         B3[x,y] = V444[2 * x, 2 * y];
+     *         B6[x,y] = U444[2 * x + 1, 2 * y];
+     *         B7[x,y] = V444[2 * x + 1, 2 * y];
+     *     }
+     * }
+     *
+     * for (y = 0; y < halfHeight; y++)
+     * {
+     *     yU = (y / 8) * 16; // identify first row of correct 8-line U block in B45
+     *     yU += (y % 8);     // add offset rows in destination block
+     *     yV = yU + 8;       // the corresponding v line is always 8 rows ahead
+     *
+     *     for (x = 0; x < fullWidth; x++)
+     *     {
+     *         B45[x,yU] = U444[x, 2 * y + 1];
+     *         B45[x,yV] = V444[x, 2 * y + 1];
+     *     }
+     * }
+     *
+     */
+    const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
+
+    for (UINT32 y = 0; y < roi->height; y += 2)
+    {
+        const BOOL last = (y >= (roi->height - 1));
+        const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
+        const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
+        const UINT32 i = y >> 1;
+        const UINT32 n = (i & ~7) + i;
+        BYTE* b1Even = pDst1[0] + y * dst1Step[0];
+        BYTE* b1Odd = !last ?
(b1Even + dst1Step[0]) : NULL; + BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1]; + BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2]; + BYTE* b4 = pDst2[0] + dst2Step[0] * n; + BYTE* b5 = b4 + 8 * dst2Step[0]; + BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1]; + BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2]; + general_RGBToAVC444YUV_ANY_DOUBLE_ROW(srcEven, srcOdd, srcFormat, b1Even, b1Odd, b2, b3, b4, + b5, b6, b7, roi->width); + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE pstatus_t general_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[3], + const UINT32 dst1Step[3], + BYTE* WINPR_RESTRICT pDst2[3], + const UINT32 dst2Step[3], + const prim_size_t* WINPR_RESTRICT roi) +{ + if (!pSrc || !pDst1 || !dst1Step || !pDst2 || !dst2Step) + return -1; + + if (!pDst1[0] || !pDst1[1] || !pDst1[2]) + return -1; + + if (!dst1Step[0] || !dst1Step[1] || !dst1Step[2]) + return -1; + + if (!pDst2[0] || !pDst2[1] || !pDst2[2]) + return -1; + + if (!dst2Step[0] || !dst2Step[1] || !dst2Step[2]) + return -1; + + switch (srcFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return general_RGBToAVC444YUV_BGRX(pSrc, srcStep, pDst1, dst1Step, pDst2, dst2Step, + roi); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return general_RGBToAVC444YUV_RGBX(pSrc, srcStep, pDst1, dst1Step, pDst2, dst2Step, + roi); + + default: + return general_RGBToAVC444YUV_ANY(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, + dst2Step, roi); + } + + return !PRIMITIVES_SUCCESS; +} + +static INLINE void general_RGBToAVC444YUVv2_ANY_DOUBLE_ROW( + const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, UINT32 srcFormat, + BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd, + BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst, + BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2, + BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2, + BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2, + BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width) +{ + const UINT32 bpp = FreeRDPGetBytesPerPixel(srcFormat); + + for (UINT32 x = 0; x < width; x += 2) + { + BYTE Ya = 0; + BYTE Ua = 0; + BYTE Va = 0; + BYTE Yb = 0; + BYTE Ub = 0; + BYTE Vb = 0; + BYTE Yc = 0; + BYTE Uc = 0; + BYTE Vc = 0; + BYTE Yd = 0; + BYTE Ud = 0; + BYTE Vd = 0; + { + BYTE b = 0; + BYTE g = 0; + BYTE r = 0; + const UINT32 color = FreeRDPReadColor(srcEven, srcFormat); + srcEven += bpp; + FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL); + Ya = RGB2Y(r, g, b); + Ua = RGB2U(r, g, b); + Va = RGB2V(r, g, b); + } + + if (x < width - 1) + { + BYTE b = 0; + BYTE g = 0; + BYTE r = 0; + const UINT32 color = FreeRDPReadColor(srcEven, srcFormat); + srcEven += bpp; + FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL); + Yb = RGB2Y(r, g, b); + Ub = RGB2U(r, g, b); + Vb = RGB2V(r, g, b); + } + else + { + Yb = Ya; + Ub = Ua; + Vb = Va; + } + + if (srcOdd) + { + BYTE b = 0; + BYTE g = 0; + BYTE r = 0; + const UINT32 color = FreeRDPReadColor(srcOdd, srcFormat); + srcOdd += bpp; + FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL); + Yc = RGB2Y(r, g, b); + Uc = RGB2U(r, g, b); + Vc = RGB2V(r, g, b); + } + else + { + Yc = Ya; + Uc = Ua; + Vc = Va; + } + + if (srcOdd && (x < width - 1)) + { + BYTE b = 0; + BYTE g = 0; + BYTE r = 0; + const UINT32 color = FreeRDPReadColor(srcOdd, srcFormat); + srcOdd += bpp; + FreeRDPSplitColor(color, srcFormat, &r, &g, &b, 
NULL, NULL);
+            Yd = RGB2Y(r, g, b);
+            Ud = RGB2U(r, g, b);
+            Vd = RGB2V(r, g, b);
+        }
+        else
+        {
+            Yd = Ya;
+            Ud = Ua;
+            Vd = Va;
+        }
+
+        /* Y [b1] */
+        *yLumaDstEven++ = Ya;
+
+        if (x < width - 1)
+            *yLumaDstEven++ = Yb;
+
+        if (srcOdd)
+            *yLumaDstOdd++ = Yc;
+
+        if (srcOdd && (x < width - 1))
+            *yLumaDstOdd++ = Yd;
+
+        /* 2x 2y [b2,b3] */
+        *uLumaDst++ = (Ua + Ub + Uc + Ud) / 4;
+        *vLumaDst++ = (Va + Vb + Vc + Vd) / 4;
+
+        /* 2x+1, y [b4,b5] even */
+        if (x < width - 1)
+        {
+            *yEvenChromaDst1++ = Ub;
+            *yEvenChromaDst2++ = Vb;
+        }
+
+        if (srcOdd)
+        {
+            /* 2x+1, y [b4,b5] odd */
+            if (x < width - 1)
+            {
+                *yOddChromaDst1++ = Ud;
+                *yOddChromaDst2++ = Vd;
+            }
+
+            /* 4x 2y+1 [b6, b7] */
+            if (x % 4 == 0)
+            {
+                *uChromaDst1++ = Uc;
+                *uChromaDst2++ = Vc;
+            }
+            /* 4x+2 2y+1 [b8, b9] */
+            else
+            {
+                *vChromaDst1++ = Uc;
+                *vChromaDst2++ = Vc;
+            }
+        }
+    }
+}
+
+static INLINE pstatus_t general_RGBToAVC444YUVv2_ANY(
+    const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, UINT32 srcStep,
+    BYTE* WINPR_RESTRICT pDst1[3], const UINT32 dst1Step[3], BYTE* WINPR_RESTRICT pDst2[3],
+    const UINT32 dst2Step[3], const prim_size_t* WINPR_RESTRICT roi)
+{
+    /**
+     * Note: According to [MS-RDPEGFX 2.2.4.4 RFX_AVC420_BITMAP_STREAM] the
+     * width and height of the MPEG-4 AVC/H.264 codec bitstream MUST be aligned
+     * to a multiple of 16.
+     * Hence the passed destination YUV420/CHROMA420 buffers must have been
+     * allocated accordingly !!
+     */
+    /**
+     * [MS-RDPEGFX 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode] defines the following
+     * "Bx areas":
+     *
+     * YUV420 frame (main view):
+     * B1: From Y444 all pixels
+     * B2: From U444 all pixels in even rows with even columns
+     * B3: From V444 all pixels in even rows with even columns
+     *
+     * Chroma420 frame (auxiliary view):
+     * B45: From U444 and V444 all pixels from all odd columns
+     * B67: From U444 and V444 every 4th pixel in odd rows
+     * B89: From U444 and V444 every 4th pixel (initial offset of 2) in odd rows
+     *
+     * Chroma Bxy areas correspond to the left and right half of the YUV420 plane.
+     * for (y = 0; y < fullHeight; y++)
+     * {
+     *     for (x = 0; x < fullWidth; x++)
+     *     {
+     *         B1[x,y] = Y444[x,y];
+     *     }
+     *
+     *     for (x = 0; x < halfWidth; x++)
+     *     {
+     *         B4[x,y] = U444[2 * x + 1, y];
+     *         B5[x,y] = V444[2 * x + 1, y];
+     *     }
+     * }
+     *
+     * for (y = 0; y < halfHeight; y++)
+     * {
+     *     for (x = 0; x < halfWidth; x++)
+     *     {
+     *         B2[x,y] = U444[2 * x, 2 * y];
+     *         B3[x,y] = V444[2 * x, 2 * y];
+     *         B6[x,y] = U444[4 * x, 2 * y + 1];
+     *         B7[x,y] = V444[4 * x, 2 * y + 1];
+     *         B8[x,y] = U444[4 * x + 2, 2 * y + 1];
+     *         B9[x,y] = V444[4 * x + 2, 2 * y + 1];
+     *     }
+     * }
+     *
+     */
+    if (roi->height < 1 || roi->width < 1)
+        return !PRIMITIVES_SUCCESS;
+
+    for (UINT32 y = 0; y < roi->height; y += 2)
+    {
+        const BYTE* srcEven = (pSrc + y * srcStep);
+        const BYTE* srcOdd = (y < roi->height - 1) ?
(srcEven + srcStep) : NULL; + BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]); + BYTE* dstLumaYOdd = (dstLumaYEven + dst1Step[0]); + BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]); + BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]); + BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]); + BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2; + BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0]; + BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0]; + BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]); + BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]); + BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4; + BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4; + general_RGBToAVC444YUVv2_ANY_DOUBLE_ROW( + srcEven, srcOdd, srcFormat, dstLumaYEven, dstLumaYOdd, dstLumaU, dstLumaV, + dstEvenChromaY1, dstEvenChromaY2, dstOddChromaY1, dstOddChromaY2, dstChromaU1, + dstChromaU2, dstChromaV1, dstChromaV2, roi->width); + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW( + const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, + BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd, + BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst, + BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2, + BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2, + BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2, + BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width) +{ + for (UINT32 x = 0; x < width; x += 2) + { + BYTE Ya = 0; + BYTE Ua = 0; + BYTE Va = 0; + BYTE Yb = 0; + BYTE Ub = 0; + BYTE Vb = 0; + BYTE Yc = 0; + BYTE Uc = 0; + BYTE Vc = 0; + BYTE Yd = 0; + BYTE Ud = 0; + BYTE Vd = 0; + { + const BYTE b = *srcEven++; + const BYTE g = *srcEven++; + const BYTE r = *srcEven++; + srcEven++; + Ya = RGB2Y(r, g, b); + Ua = RGB2U(r, g, b); + Va = RGB2V(r, g, b); + } + + if (x < width - 1) + { + const BYTE b = *srcEven++; + const BYTE g = *srcEven++; + const BYTE r = *srcEven++; + srcEven++; + Yb = RGB2Y(r, g, b); + Ub = RGB2U(r, g, b); + Vb = RGB2V(r, g, b); + } + else + { + Yb = Ya; + Ub = Ua; + Vb = Va; + } + + if (srcOdd) + { + const BYTE b = *srcOdd++; + const BYTE g = *srcOdd++; + const BYTE r = *srcOdd++; + srcOdd++; + Yc = RGB2Y(r, g, b); + Uc = RGB2U(r, g, b); + Vc = RGB2V(r, g, b); + } + else + { + Yc = Ya; + Uc = Ua; + Vc = Va; + } + + if (srcOdd && (x < width - 1)) + { + const BYTE b = *srcOdd++; + const BYTE g = *srcOdd++; + const BYTE r = *srcOdd++; + srcOdd++; + Yd = RGB2Y(r, g, b); + Ud = RGB2U(r, g, b); + Vd = RGB2V(r, g, b); + } + else + { + Yd = Ya; + Ud = Ua; + Vd = Va; + } + + /* Y [b1] */ + *yLumaDstEven++ = Ya; + + if (x < width - 1) + *yLumaDstEven++ = Yb; + + if (srcOdd) + *yLumaDstOdd++ = Yc; + + if (srcOdd && (x < width - 1)) + *yLumaDstOdd++ = Yd; + + /* 2x 2y [b2,b3] */ + *uLumaDst++ = (Ua + Ub + Uc + Ud) / 4; + *vLumaDst++ = (Va + Vb + Vc + Vd) / 4; + + /* 2x+1, y [b4,b5] even */ + if (x < width - 1) + { + *yEvenChromaDst1++ = Ub; + *yEvenChromaDst2++ = Vb; + } + + if (srcOdd) + { + /* 2x+1, y [b4,b5] odd */ + if (x < width - 1) + { + *yOddChromaDst1++ = Ud; + *yOddChromaDst2++ = Vd; + } + + /* 4x 2y+1 [b6, b7] */ + if (x % 4 == 0) + { + *uChromaDst1++ = Uc; + *uChromaDst2++ = Vc; + } + /* 4x+2 2y+1 [b8, b9] */ + else + { + *vChromaDst1++ = Uc; + *vChromaDst2++ = Vc; + } + } + } +} + +static INLINE pstatus_t general_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, + UINT32 srcStep, BYTE* 
WINPR_RESTRICT pDst1[3], + const UINT32 dst1Step[3], + BYTE* WINPR_RESTRICT pDst2[3], + const UINT32 dst2Step[3], + const prim_size_t* WINPR_RESTRICT roi) +{ + if (roi->height < 1 || roi->width < 1) + return !PRIMITIVES_SUCCESS; + + for (UINT32 y = 0; y < roi->height; y += 2) + { + const BYTE* srcEven = (pSrc + y * srcStep); + const BYTE* srcOdd = (y < roi->height - 1) ? (srcEven + srcStep) : NULL; + BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]); + BYTE* dstLumaYOdd = (dstLumaYEven + dst1Step[0]); + BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]); + BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]); + BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]); + BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2; + BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0]; + BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0]; + BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]); + BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]); + BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4; + BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4; + general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW( + srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU, dstLumaV, dstEvenChromaY1, + dstEvenChromaY2, dstOddChromaY1, dstOddChromaY2, dstChromaU1, dstChromaU2, dstChromaV1, + dstChromaV2, roi->width); + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE pstatus_t general_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[3], + const UINT32 dst1Step[3], + BYTE* WINPR_RESTRICT pDst2[3], + const UINT32 dst2Step[3], + const prim_size_t* WINPR_RESTRICT roi) +{ + switch (srcFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return general_RGBToAVC444YUVv2_BGRX(pSrc, srcStep, pDst1, dst1Step, pDst2, dst2Step, + roi); + + default: + return general_RGBToAVC444YUVv2_ANY(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, + dst2Step, roi); + } + + return !PRIMITIVES_SUCCESS; +} + +void primitives_init_YUV(primitives_t* WINPR_RESTRICT prims) +{ + prims->YUV420ToRGB_8u_P3AC4R = general_YUV420ToRGB_8u_P3AC4R; + prims->YUV444ToRGB_8u_P3AC4R = general_YUV444ToRGB_8u_P3AC4R; + prims->RGBToYUV420_8u_P3AC4R = general_RGBToYUV420_8u_P3AC4R; + prims->RGBToYUV444_8u_P3AC4R = general_RGBToYUV444_8u_P3AC4R; + prims->YUV420CombineToYUV444 = general_YUV420CombineToYUV444; + prims->YUV444SplitToYUV420 = general_YUV444SplitToYUV420; + prims->RGBToAVC444YUV = general_RGBToAVC444YUV; + prims->RGBToAVC444YUVv2 = general_RGBToAVC444YUVv2; +} diff --git a/libfreerdp/primitives/prim_YUV_neon.c b/libfreerdp/primitives/prim_YUV_neon.c new file mode 100644 index 0000000..5e2039e --- /dev/null +++ b/libfreerdp/primitives/prim_YUV_neon.c @@ -0,0 +1,751 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Optimized YUV/RGB conversion operations + * + * Copyright 2014 Thomas Erbesdobler + * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com> + * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com> + * Copyright 2016-2017 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include <winpr/crt.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+#if !defined(WITH_NEON)
+#error "This file must only be included if WITH_NEON is active!"
+#endif
+
+#include <arm_neon.h>
+
+static primitives_t* generic = NULL;
+
+static INLINE uint8x8_t neon_YUV2R(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
+                                   int16x4_t Eh, int16x4_t El)
+{
+    /* R = (256 * Y + 403 * (V - 128)) >> 8 */
+    const int16x4_t c403 = vdup_n_s16(403);
+    const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
+    const int32x4_t CEl = vmlal_s16(Cl, El, c403);
+    const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
+    const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
+    const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
+    return vqmovun_s16(R);
+}
+
+static INLINE uint8x8_t neon_YUV2G(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
+                                   int16x4_t Eh, int16x4_t El)
+{
+    /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
+    const int16x4_t c48 = vdup_n_s16(48);
+    const int16x4_t c120 = vdup_n_s16(120);
+    const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
+    const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
+    const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
+    const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
+    const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
+    const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
+    const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
+    return vqmovun_s16(G);
+}
+
+static INLINE uint8x8_t neon_YUV2B(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
+                                   int16x4_t Eh, int16x4_t El)
+{
+    /* B = (256L * Y + 475 * (U - 128)) >> 8 */
+    const int16x4_t c475 = vdup_n_s16(475);
+    const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
+    const int32x4_t CDl = vmlal_s16(Cl, Dl, c475);
+    const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
+    const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
+    const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
+    return vqmovun_s16(B);
+}
+
+static INLINE BYTE* neon_YuvToRgbPixel(BYTE* pRGB, int16x8_t Y, int16x8_t D, int16x8_t E,
+                                       const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,
+                                       const uint8_t aPos)
+{
+    uint8x8x4_t bgrx;
+    const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y)), 256); /* Y * 256 */
+    const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y)), 256);  /* Y * 256 */
+    const int16x4_t Dh = vget_high_s16(D);
+    const int16x4_t Dl = vget_low_s16(D);
+    const int16x4_t Eh = vget_high_s16(E);
+    const int16x4_t El = vget_low_s16(E);
+    {
+        /* B = (256L * Y + 475 * (U - 128)) >> 8 */
+        const int16x4_t c475 = vdup_n_s16(475);
+        const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
+        const int32x4_t CDl = vmlal_s16(Cl, Dl, c475);
+        const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
+        const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
+        const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
+        bgrx.val[bPos] = vqmovun_s16(B);
+    }
+    {
+        /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
+        const int16x4_t c48 = vdup_n_s16(48);
+        const int16x4_t c120 = vdup_n_s16(120);
+        const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
+        const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
+        const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
+        const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
+        const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
+        const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
+        const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
+        bgrx.val[gPos] =
vqmovun_s16(G);
+    }
+    {
+        /* R = (256 * Y + 403 * (V - 128)) >> 8 */
+        const int16x4_t c403 = vdup_n_s16(403);
+        const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
+        const int32x4_t CEl = vmlal_s16(Cl, El, c403);
+        const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
+        const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
+        const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
+        bgrx.val[rPos] = vqmovun_s16(R);
+    }
+    {
+        /* A */
+        bgrx.val[aPos] = vdup_n_u8(0xFF);
+    }
+    vst4_u8(pRGB, bgrx);
+    pRGB += 32;
+    return pRGB;
+}
+
+static INLINE pstatus_t neon_YUV420ToX(const BYTE* const WINPR_RESTRICT pSrc[3],
+                                       const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+                                       UINT32 dstStep, const prim_size_t* WINPR_RESTRICT roi,
+                                       const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,
+                                       const uint8_t aPos)
+{
+    const UINT32 nWidth = roi->width;
+    const UINT32 nHeight = roi->height;
+    const DWORD pad = nWidth % 16;
+    const UINT32 yPad = srcStep[0] - roi->width;
+    const UINT32 uPad = srcStep[1] - roi->width / 2;
+    const UINT32 vPad = srcStep[2] - roi->width / 2;
+    const UINT32 dPad = dstStep - roi->width * 4;
+    const int16x8_t c128 = vdupq_n_s16(128);
+
+    for (UINT32 y = 0; y < nHeight; y += 2)
+    {
+        const uint8_t* pY1 = pSrc[0] + y * srcStep[0];
+        const uint8_t* pY2 = pY1 + srcStep[0];
+        const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1];
+        const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2];
+        uint8_t* pRGB1 = pDst + y * dstStep;
+        uint8_t* pRGB2 = pRGB1 + dstStep;
+        const BOOL lastY = y >= nHeight - 1;
+        UINT32 x = 0; /* declared outside the loops, the scalar tail loop continues from here */
+
+        for (; x < nWidth - pad;)
+        {
+            const uint8x8_t Uraw = vld1_u8(pU);
+            const uint8x8x2_t Uu = vzip_u8(Uraw, Uraw);
+            const int16x8_t U1 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[0]));
+            const int16x8_t U2 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[1]));
+            const uint8x8_t Vraw = vld1_u8(pV);
+            const uint8x8x2_t Vu = vzip_u8(Vraw, Vraw);
+            const int16x8_t V1 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[0]));
+            const int16x8_t V2 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[1]));
+            const int16x8_t D1 = vsubq_s16(U1, c128);
+            const int16x8_t E1 = vsubq_s16(V1, c128);
+            const int16x8_t D2 = vsubq_s16(U2, c128);
+            const int16x8_t E2 = vsubq_s16(V2, c128);
+            {
+                const uint8x8_t Y1u = vld1_u8(pY1);
+                const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
+                pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D1, E1, rPos, gPos, bPos, aPos);
+                pY1 += 8;
+                x += 8;
+            }
+            {
+                const uint8x8_t Y1u = vld1_u8(pY1);
+                const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
+                pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D2, E2, rPos, gPos, bPos, aPos);
+                pY1 += 8;
+                x += 8;
+            }
+
+            if (!lastY)
+            {
+                {
+                    const uint8x8_t Y2u = vld1_u8(pY2);
+                    const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
+                    pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D1, E1, rPos, gPos, bPos, aPos);
+                    pY2 += 8;
+                }
+                {
+                    const uint8x8_t Y2u = vld1_u8(pY2);
+                    const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
+                    pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D2, E2, rPos, gPos, bPos, aPos);
+                    pY2 += 8;
+                }
+            }
+
+            pU += 8;
+            pV += 8;
+        }
+
+        for (; x < nWidth; x++)
+        {
+            const BYTE U = *pU;
+            const BYTE V = *pV;
+            {
+                const BYTE Y = *pY1++;
+                const BYTE r = YUV2R(Y, U, V);
+                const BYTE g = YUV2G(Y, U, V);
+                const BYTE b = YUV2B(Y, U, V);
+                pRGB1[aPos] = 0xFF;
+                pRGB1[rPos] = r;
+                pRGB1[gPos] = g;
+                pRGB1[bPos] = b;
+                pRGB1 += 4;
+            }
+
+            if (!lastY)
+            {
+                const BYTE Y = *pY2++;
+                const BYTE r = YUV2R(Y, U, V);
+                const BYTE g = YUV2G(Y, U, V);
+                const BYTE b = YUV2B(Y, U, V);
+                pRGB2[aPos] = 0xFF;
+                pRGB2[rPos] = r;
+                pRGB2[gPos] = g;
+                pRGB2[bPos] = b;
+                pRGB2 += 4;
+            }
+
+            if (x
% 2) + { + pU++; + pV++; + } + } + + pRGB1 += dPad; + pRGB2 += dPad; + pY1 += yPad; + pY2 += yPad; + pU += uPad; + pV += vPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT const pSrc[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); + + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); + + default: + return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +static INLINE pstatus_t neon_YUV444ToX(const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, const prim_size_t* WINPR_RESTRICT roi, + const uint8_t rPos, const uint8_t gPos, const uint8_t bPos, + const uint8_t aPos) +{ + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + const UINT32 yPad = srcStep[0] - roi->width; + const UINT32 uPad = srcStep[1] - roi->width; + const UINT32 vPad = srcStep[2] - roi->width; + const UINT32 dPad = dstStep - roi->width * 4; + const uint8_t* pY = pSrc[0]; + const uint8_t* pU = pSrc[1]; + const uint8_t* pV = pSrc[2]; + uint8_t* pRGB = pDst; + const int16x8_t c128 = vdupq_n_s16(128); + const DWORD pad = nWidth % 8; + + for (UINT32 y = 0; y < nHeight; y++) + { + for (UINT32 x = 0; x < nWidth - pad; x += 8) + { + const uint8x8_t Yu = vld1_u8(pY); + const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(Yu)); + const uint8x8_t Uu = vld1_u8(pU); + const int16x8_t U = vreinterpretq_s16_u16(vmovl_u8(Uu)); + const uint8x8_t Vu = vld1_u8(pV); + const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vu)); + /* Do the calculations on Y in 32bit width, the result of 255 * 256 does not fit + * a signed 16 bit value. 
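+             * Worked numbers: the largest Y term is 255 * 256 = 65280 while
+             * INT16_MAX is only 32767, which is why neon_YuvToRgbPixel widens
+             * Y with vmulq_n_s32 before the final vrshrq_n_s32(..., 8) and
+             * the saturating narrow back to 8 bit.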
*/ + const int16x8_t D = vsubq_s16(U, c128); + const int16x8_t E = vsubq_s16(V, c128); + pRGB = neon_YuvToRgbPixel(pRGB, Y, D, E, rPos, gPos, bPos, aPos); + pY += 8; + pU += 8; + pV += 8; + } + + for (UINT32 x = 0; x < pad; x++) + { + const BYTE Y = *pY++; + const BYTE U = *pU++; + const BYTE V = *pV++; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + pRGB[aPos] = 0xFF; + pRGB[rPos] = r; + pRGB[gPos] = g; + pRGB[bPos] = b; + pRGB += 4; + } + + pRGB += dPad; + pY += yPad; + pU += uPad; + pV += vPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT const pSrc[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); + + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); + + default: + return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +static pstatus_t neon_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3], + const UINT32 dstStep[3], const RECTANGLE_16* WINPR_RESTRICT roi) +{ + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 evenY = 0; + const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left, + pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, + pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 }; + BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left, + pDstRaw[1] + roi->top * dstStep[1] + roi->left, + pDstRaw[2] + roi->top * dstStep[2] + roi->left }; + + /* Y data is already here... */ + /* B1 */ + for (UINT32 y = 0; y < nHeight; y++) + { + const BYTE* Ym = pSrc[0] + srcStep[0] * y; + BYTE* pY = pDst[0] + dstStep[0] * y; + memcpy(pY, Ym, nWidth); + } + + /* The first half of U, V are already here part of this frame. 
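+     * Each half-resolution U/V sample below is expanded to a 2x2 block:
+     * storing the same vector through both lanes of vst2q_u8 duplicates
+     * every byte horizontally, and writing the identical row to pU1/pV1
+     * duplicates it vertically.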
 */
+    /* B2 and B3 */
+    for (UINT32 y = 0; y < halfHeight; y++)
+    {
+        const UINT32 val2y = (2 * y + evenY);
+        const BYTE* Um = pSrc[1] + srcStep[1] * y;
+        const BYTE* Vm = pSrc[2] + srcStep[2] * y;
+        BYTE* pU = pDst[1] + dstStep[1] * val2y;
+        BYTE* pV = pDst[2] + dstStep[2] * val2y;
+        BYTE* pU1 = pU + dstStep[1];
+        BYTE* pV1 = pV + dstStep[2];
+        UINT32 x = 0; /* shared by the vector loop and the scalar tail loop */
+
+        for (; x + 16 < halfWidth; x += 16)
+        {
+            {
+                const uint8x16_t u = vld1q_u8(Um);
+                uint8x16x2_t u2x;
+                u2x.val[0] = u;
+                u2x.val[1] = u;
+                vst2q_u8(pU, u2x);
+                vst2q_u8(pU1, u2x);
+                Um += 16;
+                pU += 32;
+                pU1 += 32;
+            }
+            {
+                const uint8x16_t v = vld1q_u8(Vm);
+                uint8x16x2_t v2x;
+                v2x.val[0] = v;
+                v2x.val[1] = v;
+                vst2q_u8(pV, v2x);
+                vst2q_u8(pV1, v2x);
+                Vm += 16;
+                pV += 32;
+                pV1 += 32;
+            }
+        }
+
+        for (; x < halfWidth; x++)
+        {
+            const BYTE u = *Um++;
+            const BYTE v = *Vm++;
+            *pU++ = u;
+            *pU++ = u;
+            *pU1++ = u;
+            *pU1++ = u;
+            *pV++ = v;
+            *pV++ = v;
+            *pV1++ = v;
+            *pV1++ = v;
+        }
+    }
+
+    return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_ChromaFilter(BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
+                                   const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+    const UINT32 oddY = 1;
+    const UINT32 evenY = 0;
+    const UINT32 nWidth = roi->right - roi->left;
+    const UINT32 nHeight = roi->bottom - roi->top;
+    const UINT32 halfHeight = (nHeight + 1) / 2;
+    const UINT32 halfWidth = (nWidth + 1) / 2;
+    const UINT32 halfPad = halfWidth % 16;
+
+    /* Filter */
+    for (UINT32 y = roi->top; y < halfHeight + roi->top; y++)
+    {
+        const UINT32 val2y = (y * 2 + evenY);
+        const UINT32 val2y1 = val2y + oddY;
+        BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
+        BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
+        BYTE* pU = pDst[1] + dstStep[1] * val2y;
+        BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+        if (val2y1 > nHeight)
+            continue;
+
+        UINT32 x = roi->left / 2; /* shared by the vector loop and the scalar tail loop */
+
+        for (; x < halfWidth + roi->left / 2 - halfPad; x += 16)
+        {
+            {
+                /* U = (U2x,2y << 2) - U2x1,2y - U2x,2y1 - U2x1,2y1 */
+                uint8x8x2_t u = vld2_u8(&pU[2 * x]);
+                const int16x8_t up =
+                    vreinterpretq_s16_u16(vshll_n_u8(u.val[0], 2)); /* Ux2,2y << 2 */
+                const uint8x8x2_t u1 = vld2_u8(&pU1[2 * x]);
+                const uint16x8_t usub = vaddl_u8(u1.val[1], u1.val[0]); /* U2x,2y1 + U2x1,2y1 */
+                const int16x8_t us = vreinterpretq_s16_u16(
+                    vaddw_u8(usub, u.val[1])); /* U2x1,2y + U2x,2y1 + U2x1,2y1 */
+                const int16x8_t un = vsubq_s16(up, us);
+                const uint8x8_t u8 = vqmovun_s16(un); /* CLIP(un) */
+                u.val[0] = u8;
+                vst2_u8(&pU[2 * x], u);
+            }
+            {
+                /* V = (V2x,2y << 2) - V2x1,2y - V2x,2y1 - V2x1,2y1 */
+                uint8x8x2_t v = vld2_u8(&pV[2 * x]);
+                const int16x8_t vp =
+                    vreinterpretq_s16_u16(vshll_n_u8(v.val[0], 2)); /* Vx2,2y << 2 */
+                const uint8x8x2_t v1 = vld2_u8(&pV1[2 * x]);
+                const uint16x8_t vsub = vaddl_u8(v1.val[1], v1.val[0]); /* V2x,2y1 + V2x1,2y1 */
+                const int16x8_t vs = vreinterpretq_s16_u16(
+                    vaddw_u8(vsub, v.val[1])); /* V2x1,2y + V2x,2y1 + V2x1,2y1 */
+                const int16x8_t vn = vsubq_s16(vp, vs);
+                const uint8x8_t v8 = vqmovun_s16(vn); /* CLIP(vn) */
+                v.val[0] = v8;
+                vst2_u8(&pV[2 * x], v);
+            }
+        }
+
+        for (; x < halfWidth + roi->left / 2; x++)
+        {
+            const UINT32 val2x = (x * 2);
+            const UINT32 val2x1 = val2x + 1;
+            const BYTE inU = pU[val2x];
+            const BYTE inV = pV[val2x];
+            const INT32 up = inU * 4;
+            const INT32 vp = inV * 4;
+            INT32 u2020;
+            INT32 v2020;
+
+            if (val2x1 > nWidth)
+                continue;
+
+            u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
+            v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
+            pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
+            pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
+        }
+    }
+
+    return
PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
+                                       const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
+                                       const UINT32 dstStep[3],
+                                       const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+    const UINT32 mod = 16;
+    UINT32 uY = 0;
+    UINT32 vY = 0;
+    const UINT32 nWidth = roi->right - roi->left;
+    const UINT32 nHeight = roi->bottom - roi->top;
+    const UINT32 halfWidth = (nWidth) / 2;
+    const UINT32 halfHeight = (nHeight) / 2;
+    const UINT32 oddY = 1;
+    const UINT32 evenY = 0;
+    const UINT32 oddX = 1;
+    /* The auxiliary frame is aligned to multiples of 16x16.
+     * We need the padded height for B4 and B5 conversion. */
+    const UINT32 padHeight = nHeight + 16 - nHeight % 16;
+    const UINT32 halfPad = halfWidth % 16;
+    const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+                            pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+                            pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+    BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+                      pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+                      pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+
+    /* The second half of U and V is a bit more tricky... */
+    /* B4 and B5 */
+    for (UINT32 y = 0; y < padHeight; y++)
+    {
+        const BYTE* Ya = pSrc[0] + srcStep[0] * y;
+        BYTE* pX;
+
+        if ((y) % mod < (mod + 1) / 2)
+        {
+            const UINT32 pos = (2 * uY++ + oddY);
+
+            if (pos >= nHeight)
+                continue;
+
+            pX = pDst[1] + dstStep[1] * pos;
+        }
+        else
+        {
+            const UINT32 pos = (2 * vY++ + oddY);
+
+            if (pos >= nHeight)
+                continue;
+
+            pX = pDst[2] + dstStep[2] * pos;
+        }
+
+        memcpy(pX, Ya, nWidth);
+    }
+
+    /* B6 and B7 */
+    for (UINT32 y = 0; y < halfHeight; y++)
+    {
+        const UINT32 val2y = (y * 2 + evenY);
+        const BYTE* Ua = pSrc[1] + srcStep[1] * y;
+        const BYTE* Va = pSrc[2] + srcStep[2] * y;
+        BYTE* pU = pDst[1] + dstStep[1] * val2y;
+        BYTE* pV = pDst[2] + dstStep[2] * val2y;
+        UINT32 x = 0; /* shared by the vector loop and the scalar tail loop */
+
+        for (; x < halfWidth - halfPad; x += 16)
+        {
+            {
+                uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
+                u.val[1] = vld1q_u8(&Ua[x]);
+                vst2q_u8(&pU[2 * x], u);
+            }
+            {
+                uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
+                v.val[1] = vld1q_u8(&Va[x]);
+                vst2q_u8(&pV[2 * x], v);
+            }
+        }
+
+        for (; x < halfWidth; x++)
+        {
+            const UINT32 val2x1 = (x * 2 + oddX);
+            pU[val2x1] = Ua[x];
+            pV[val2x1] = Va[x];
+        }
+    }
+
+    /* Filter */
+    return neon_ChromaFilter(pDst, dstStep, roi);
+}
+
+static pstatus_t neon_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3],
+                                       const UINT32 srcStep[3], UINT32 nTotalWidth,
+                                       UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3],
+                                       const UINT32 dstStep[3],
+                                       const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+    const UINT32 nWidth = roi->right - roi->left;
+    const UINT32 nHeight = roi->bottom - roi->top;
+    const UINT32 halfWidth = (nWidth + 1) / 2;
+    const UINT32 halfPad = halfWidth % 16;
+    const UINT32 halfHeight = (nHeight + 1) / 2;
+    const UINT32 quarterWidth = (nWidth + 3) / 4;
+    const UINT32 quarterPad = quarterWidth % 16;
+
+    /* B4 and B5: odd UV values for width/2, height */
+    for (UINT32 y = 0; y < nHeight; y++)
+    {
+        const UINT32 yTop = y + roi->top;
+        const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
+        const BYTE* pYaV = pYaU + nTotalWidth / 2;
+        BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
+        BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
+        UINT32 x = 0; /* shared by the vector loop and the scalar tail loop */
+
+        for (; x < halfWidth - halfPad; x += 16)
+        {
+            {
+                uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
+                u.val[1] = vld1q_u8(&pYaU[x]);
+                vst2q_u8(&pU[2 * x], u);
+            }
+            {
+                uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
+                v.val[1] =
vld1q_u8(&pYaV[x]);
+                vst2q_u8(&pV[2 * x], v);
+            }
+        }
+
+        for (; x < halfWidth; x++)
+        {
+            const UINT32 odd = 2 * x + 1;
+            pU[odd] = pYaU[x];
+            pV[odd] = pYaV[x];
+        }
+    }
+
+    /* B6 - B9 */
+    for (UINT32 y = 0; y < halfHeight; y++)
+    {
+        const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
+        const BYTE* pUaV = pUaU + nTotalWidth / 4;
+        const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
+        const BYTE* pVaV = pVaU + nTotalWidth / 4;
+        BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
+        BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
+        UINT32 x = 0; /* shared by the vector loop and the scalar tail loop */
+
+        for (; x < quarterWidth - quarterPad; x += 16)
+        {
+            {
+                uint8x16x4_t u = vld4q_u8(&pU[4 * x]);
+                u.val[0] = vld1q_u8(&pUaU[x]);
+                u.val[2] = vld1q_u8(&pVaU[x]);
+                vst4q_u8(&pU[4 * x], u);
+            }
+            {
+                uint8x16x4_t v = vld4q_u8(&pV[4 * x]);
+                v.val[0] = vld1q_u8(&pUaV[x]);
+                v.val[2] = vld1q_u8(&pVaV[x]);
+                vst4q_u8(&pV[4 * x], v);
+            }
+        }
+
+        for (; x < quarterWidth; x++)
+        {
+            pU[4 * x + 0] = pUaU[x];
+            pV[4 * x + 0] = pUaV[x];
+            pU[4 * x + 2] = pVaU[x];
+            pV[4 * x + 2] = pVaV[x];
+        }
+    }
+
+    return neon_ChromaFilter(pDst, dstStep, roi);
+}
+
+static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
+                                            const BYTE* const WINPR_RESTRICT pSrc[3],
+                                            const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
+                                            BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
+                                            const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+    if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
+        return -1;
+
+    if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
+        return -1;
+
+    if (!roi)
+        return -1;
+
+    switch (type)
+    {
+        case AVC444_LUMA:
+            return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+        case AVC444_CHROMAv1:
+            return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+        case AVC444_CHROMAv2:
+            return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
+
+        default:
+            return -1;
+    }
+}
+
+void primitives_init_YUV_opt(primitives_t* prims)
+{
+    generic = primitives_get_generic();
+    primitives_init_YUV(prims);
+
+    if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+    {
+        prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R;
+        prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
+        prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
+    }
+}
diff --git a/libfreerdp/primitives/prim_YUV_opencl.c b/libfreerdp/primitives/prim_YUV_opencl.c
new file mode 100644
index 0000000..2ca1b31
--- /dev/null
+++ b/libfreerdp/primitives/prim_YUV_opencl.c
@@ -0,0 +1,500 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Optimized YUV/RGB conversion operations using openCL
+ *
+ * Copyright 2019 David Fort <contact@hardening-consulting.com>
+ * Copyright 2019 Rangee Gmbh
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include "prim_internal.h" + +#if defined(WITH_OPENCL) +#ifdef __APPLE__ +#include "OpenCL/opencl.h" +#else +#include <CL/cl.h> +#endif +#endif + +#include <freerdp/log.h> +#define TAG FREERDP_TAG("primitives") + +typedef struct +{ + BOOL support; + cl_platform_id platformId; + cl_device_id deviceId; + cl_context context; + cl_command_queue commandQueue; + cl_program program; +} primitives_opencl_context; + +typedef struct +{ + primitives_opencl_context* cl; + cl_kernel kernel; + cl_mem srcObjs[3]; + cl_mem dstObj; + prim_size_t roi; + size_t dstStep; +} primitives_cl_kernel; + +static primitives_opencl_context* primitives_get_opencl_context(void); + +static void cl_kernel_free(primitives_cl_kernel* kernel) +{ + if (!kernel) + return; + + if (kernel->dstObj) + clReleaseMemObject(kernel->dstObj); + + for (size_t i = 0; i < ARRAYSIZE(kernel->srcObjs); i++) + { + cl_mem obj = kernel->srcObjs[i]; + kernel->srcObjs[i] = NULL; + if (obj) + clReleaseMemObject(obj); + } + + if (kernel->kernel) + clReleaseKernel(kernel->kernel); + + free(kernel); +} + +static primitives_cl_kernel* cl_kernel_new(const char* kernelName, const prim_size_t* roi) +{ + WINPR_ASSERT(kernelName); + WINPR_ASSERT(roi); + + primitives_cl_kernel* kernel = calloc(1, sizeof(primitives_cl_kernel)); + if (!kernel) + goto fail; + + kernel->roi = *roi; + kernel->cl = primitives_get_opencl_context(); + if (!kernel->cl) + goto fail; + + cl_int ret = CL_INVALID_VALUE; + kernel->kernel = clCreateKernel(kernel->cl->program, kernelName, &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable to create kernel %s", kernelName); + goto fail; + } + + return kernel; +fail: + cl_kernel_free(kernel); + return NULL; +} + +static BOOL cl_kernel_set_sources(primitives_cl_kernel* ctx, + const BYTE* const WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3]) +{ + const char* sourceNames[] = { "Y", "U", "V" }; + + WINPR_ASSERT(ctx); + WINPR_ASSERT(pSrc); + WINPR_ASSERT(srcStep); + + for (cl_uint i = 0; i < ARRAYSIZE(ctx->srcObjs); i++) + { + cl_int ret = CL_INVALID_VALUE; + ctx->srcObjs[i] = clCreateBuffer(ctx->cl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, + 1ull * srcStep[i] * ctx->roi.height, pSrc[i], &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to create %sobj", sourceNames[i]); + return FALSE; + } + + ret = clSetKernelArg(ctx->kernel, i * 2, sizeof(cl_mem), &ctx->srcObjs[i]); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to set arg for %sobj", sourceNames[i]); + return FALSE; + } + + ret = clSetKernelArg(ctx->kernel, i * 2 + 1, sizeof(cl_uint), &srcStep[i]); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to set arg stride for %sobj", sourceNames[i]); + return FALSE; + } + } + + return TRUE; +} + +static BOOL cl_kernel_set_destination(primitives_cl_kernel* ctx, UINT32 dstStep) +{ + + WINPR_ASSERT(ctx); + + ctx->dstStep = dstStep; + cl_int ret = CL_INVALID_VALUE; + ctx->dstObj = clCreateBuffer(ctx->cl->context, CL_MEM_WRITE_ONLY, + 1ull * dstStep * ctx->roi.height, NULL, &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to create dest obj"); + return FALSE; + } + + ret = clSetKernelArg(ctx->kernel, 6, sizeof(cl_mem), &ctx->dstObj); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to set arg destObj"); + return FALSE; + } + + ret = clSetKernelArg(ctx->kernel, 7, sizeof(cl_uint), &dstStep); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to set arg dstStep"); + return FALSE; + } + + return 
TRUE; +} + +static BOOL cl_kernel_process(primitives_cl_kernel* ctx, BYTE* pDst) +{ + WINPR_ASSERT(ctx); + WINPR_ASSERT(pDst); + + size_t indexes[2] = { 0 }; + indexes[0] = ctx->roi.width; + indexes[1] = ctx->roi.height; + + cl_int ret = clEnqueueNDRangeKernel(ctx->cl->commandQueue, ctx->kernel, 2, NULL, indexes, NULL, + 0, NULL, NULL); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to enqueue call kernel"); + return FALSE; + } + + /* Transfer result to host */ + ret = clEnqueueReadBuffer(ctx->cl->commandQueue, ctx->dstObj, CL_TRUE, 0, + ctx->roi.height * ctx->dstStep, pDst, 0, NULL, NULL); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to read back buffer"); + return FALSE; + } + + return TRUE; +} + +static pstatus_t opencl_YUVToRGB(const char* kernelName, const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, + const prim_size_t* WINPR_RESTRICT roi) +{ + pstatus_t res = -1; + + primitives_cl_kernel* ctx = cl_kernel_new(kernelName, roi); + if (!ctx) + goto fail; + + if (!cl_kernel_set_sources(ctx, pSrc, srcStep)) + goto fail; + + if (!cl_kernel_set_destination(ctx, dstStep)) + goto fail; + + if (!cl_kernel_process(ctx, pDst)) + goto fail; + + res = PRIMITIVES_SUCCESS; + +fail: + cl_kernel_free(ctx); + return res; +} + +static primitives_opencl_context openclContext = { 0 }; + +static primitives_opencl_context* primitives_get_opencl_context(void) +{ + return &openclContext; +} + +static void cl_context_free(primitives_opencl_context* ctx) +{ + if (!ctx) + return; + clReleaseProgram(ctx->program); + clReleaseCommandQueue(ctx->commandQueue); + clReleaseContext(ctx->context); + clReleaseDevice(ctx->deviceId); + ctx->support = FALSE; +} + +static pstatus_t primitives_uninit_opencl(void) +{ + if (!openclContext.support) + return PRIMITIVES_SUCCESS; + + cl_context_free(&openclContext); + return PRIMITIVES_SUCCESS; +} + +static const char openclProgram[] = +#include "primitives.cl" + ; + +static BOOL primitives_init_opencl_context(primitives_opencl_context* cl) +{ + cl_platform_id* platform_ids = NULL; + cl_uint ndevices = 0; + cl_uint nplatforms = 0; + cl_kernel kernel = NULL; + cl_int ret = 0; + + BOOL gotGPU = FALSE; + size_t programLen = 0; + + ret = clGetPlatformIDs(0, NULL, &nplatforms); + if (ret != CL_SUCCESS || nplatforms < 1) + return FALSE; + + platform_ids = calloc(nplatforms, sizeof(*platform_ids)); + if (!platform_ids) + return FALSE; + + ret = clGetPlatformIDs(nplatforms, platform_ids, &nplatforms); + if (ret != CL_SUCCESS) + { + free(platform_ids); + return FALSE; + } + + for (cl_uint i = 0; (i < nplatforms) && !gotGPU; i++) + { + cl_device_id device_id = NULL; + cl_context context = NULL; + char platformName[1000] = { 0 }; + char deviceName[1000] = { 0 }; + + ret = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, sizeof(platformName), + platformName, NULL); + if (ret != CL_SUCCESS) + continue; + + ret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &device_id, &ndevices); + if (ret != CL_SUCCESS) + continue; + + ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable get device name for platform %s", platformName); + clReleaseDevice(device_id); + continue; + } + + context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable to create context for platform %s, device %s", + platformName, deviceName); + clReleaseDevice(device_id); + continue; + } 
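+        /* Note: clCreateCommandQueue is the pre-OpenCL-2.0 entry point;
+         * OpenCL 2.0+ deprecates it in favour of
+         * clCreateCommandQueueWithProperties, but the older call still works
+         * there and keeps OpenCL 1.x platforms supported. */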
+ + cl->commandQueue = clCreateCommandQueue(context, device_id, 0, &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable to create command queue"); + clReleaseContext(context); + clReleaseDevice(device_id); + continue; + } + + WLog_INFO(TAG, "openCL: using platform=%s device=%s", platformName, deviceName); + + cl->platformId = platform_ids[i]; + cl->deviceId = device_id; + cl->context = context; + gotGPU = TRUE; + } + + free(platform_ids); + + if (!gotGPU) + { + WLog_ERR(TAG, "openCL: no GPU found"); + return FALSE; + } + + programLen = strnlen(openclProgram, sizeof(openclProgram)); + const char* ptr = openclProgram; + cl->program = clCreateProgramWithSource(cl->context, 1, &ptr, &programLen, &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable to create program"); + goto fail; + } + + ret = clBuildProgram(cl->program, 1, &cl->deviceId, NULL, NULL, NULL); + if (ret != CL_SUCCESS) + { + size_t length = 0; + char buffer[2048]; + ret = clGetProgramBuildInfo(cl->program, cl->deviceId, CL_PROGRAM_BUILD_LOG, sizeof(buffer), + buffer, &length); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, + "openCL: building program failed but unable to retrieve buildLog, error=%d", + ret); + } + else + { + WLog_ERR(TAG, "openCL: unable to build program, errorLog=%s", buffer); + } + goto fail; + } + + kernel = clCreateKernel(cl->program, "yuv420_to_bgra_1b", &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable to create yuv420_to_bgra_1b kernel"); + goto fail; + } + clReleaseKernel(kernel); + + cl->support = TRUE; + return TRUE; + +fail: + cl_context_free(cl); + return FALSE; +} + +static pstatus_t opencl_YUV420ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + const char* kernel_name = NULL; + + switch (DstFormat) + { + case PIXEL_FORMAT_ABGR32: + kernel_name = "yuv420_to_abgr_1b"; + break; + case PIXEL_FORMAT_XBGR32: + kernel_name = "yuv420_to_xbgr_1b"; + break; + case PIXEL_FORMAT_RGBX32: + kernel_name = "yuv420_to_rgba_1b"; + break; + case PIXEL_FORMAT_RGBA32: + kernel_name = "yuv420_to_rgbx_1b"; + break; + case PIXEL_FORMAT_BGRA32: + kernel_name = "yuv420_to_bgra_1b"; + break; + case PIXEL_FORMAT_BGRX32: + kernel_name = "yuv420_to_bgrx_1b"; + break; + case PIXEL_FORMAT_XRGB32: + kernel_name = "yuv420_to_xrgb_1b"; + break; + case PIXEL_FORMAT_ARGB32: + kernel_name = "yuv420_to_argb_1b"; + break; + default: + { + primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU); + if (!p) + return -1; + return p->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } + } + + return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi); +} + +static pstatus_t opencl_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + const char* kernel_name = NULL; + + switch (DstFormat) + { + case PIXEL_FORMAT_ABGR32: + kernel_name = "yuv444_to_abgr_1b"; + break; + case PIXEL_FORMAT_XBGR32: + kernel_name = "yuv444_to_xbgr_1b"; + break; + case PIXEL_FORMAT_RGBX32: + kernel_name = "yuv444_to_rgba_1b"; + break; + case PIXEL_FORMAT_RGBA32: + kernel_name = "yuv444_to_rgbx_1b"; + break; + case PIXEL_FORMAT_BGRA32: + kernel_name = "yuv444_to_bgra_1b"; + break; + case PIXEL_FORMAT_BGRX32: + kernel_name = "yuv444_to_bgrx_1b"; + break; + case PIXEL_FORMAT_XRGB32: + kernel_name = 
"yuv444_to_xrgb_1b"; + break; + case PIXEL_FORMAT_ARGB32: + kernel_name = "yuv444_to_argb_1b"; + break; + default: + { + primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU); + if (!p) + return -1; + return p->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } + } + + return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi); +} + +BOOL primitives_init_opencl(primitives_t* prims) +{ + primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU); + if (!prims || !p) + return FALSE; + *prims = *p; + + if (!primitives_init_opencl_context(&openclContext)) + return FALSE; + + prims->YUV420ToRGB_8u_P3AC4R = opencl_YUV420ToRGB_8u_P3AC4R; + prims->YUV444ToRGB_8u_P3AC4R = opencl_YUV444ToRGB_8u_P3AC4R; + prims->flags |= PRIM_FLAGS_HAVE_EXTGPU; + prims->uninit = primitives_uninit_opencl; + return TRUE; +} diff --git a/libfreerdp/primitives/prim_YUV_ssse3.c b/libfreerdp/primitives/prim_YUV_ssse3.c new file mode 100644 index 0000000..2fbef3e --- /dev/null +++ b/libfreerdp/primitives/prim_YUV_ssse3.c @@ -0,0 +1,1515 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Optimized YUV/RGB conversion operations + * + * Copyright 2014 Thomas Erbesdobler + * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com> + * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com> + * Copyright 2016-2017 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <winpr/wtypes.h> +#include <freerdp/config.h> + +#include <winpr/sysinfo.h> +#include <winpr/crt.h> +#include <freerdp/types.h> +#include <freerdp/primitives.h> + +#include "prim_internal.h" + +#include <emmintrin.h> +#include <tmmintrin.h> + +#if !defined(WITH_SSE2) +#error "This file needs WITH_SSE2 enabled!" 
+#endif + +static primitives_t* generic = NULL; + +/****************************************************************************/ +/* SSSE3 YUV420 -> RGB conversion */ +/****************************************************************************/ +static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw, + __m128i Vraw, UINT8 pos) +{ + /* Visual Studio 2010 doesn't like _mm_set_epi32 in array initializer list */ + /* Note: This also applies to Visual Studio 2013 before Update 4 */ +#if !defined(_MSC_VER) || (_MSC_VER > 1600) + const __m128i mapY[] = { _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080), + _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480), + _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880), + _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) }; + const __m128i mapUV[] = { _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080), + _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080), + _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080), + _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) }; + const __m128i mask[] = { _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080), + _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080), + _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) }; +#else + /* Note: must be in little-endian format ! */ + const __m128i mapY[] = { { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, + 0x80, 0x80, 0x03, 0x80, 0x80 }, + { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80, + 0x80, 0x80, 0x07, 0x80, 0x80 }, + { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80, + 0x80, 0x80, 0x0b, 0x80, 0x80 }, + { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80, + 0x80, 0x80, 0x0f, 0x80, 0x80 } + + }; + const __m128i mapUV[] = { { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01, + 0x80, 0x02, 0x80, 0x03, 0x80 }, + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05, + 0x80, 0x06, 0x80, 0x07, 0x80 }, + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09, + 0x80, 0x0a, 0x80, 0x0b, 0x80 }, + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d, + 0x80, 0x0e, 0x80, 0x0f, 0x80 } }; + const __m128i mask[] = { { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, + 0x80, 0x80, 0x80, 0x03, 0x80 }, + { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, + 0x80, 0x80, 0x03, 0x80, 0x80 }, + { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, + 0x80, 0x03, 0x80, 0x80, 0x80 } }; +#endif + const __m128i c128 = _mm_set1_epi16(128); + __m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst), + _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000)); + { + __m128i C; + __m128i D; + __m128i E; + /* Load Y values and expand to 32 bit */ + { + C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */ + } + /* Load U values and expand to 32 bit */ + { + const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */ + D = _mm_sub_epi16(U, c128); /* D = U - 128 */ + } + /* Load V values and expand to 32 bit */ + { + const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */ + E = _mm_sub_epi16(V, c128); /* E = V - 128 */ + } + /* Get the R value */ + { + const __m128i c403 = _mm_set1_epi16(403); + const __m128i e403 = + _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403)); + const __m128i Rs = _mm_add_epi32(C, e403); + 
const __m128i R32 = _mm_srai_epi32(Rs, 8); + const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128()); + const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128()); + const __m128i packed = _mm_shuffle_epi8(R, mask[0]); + BGRX = _mm_or_si128(BGRX, packed); + } + /* Get the G value */ + { + const __m128i c48 = _mm_set1_epi16(48); + const __m128i d48 = + _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48)); + const __m128i c120 = _mm_set1_epi16(120); + const __m128i e120 = + _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120)); + const __m128i de = _mm_add_epi32(d48, e120); + const __m128i Gs = _mm_sub_epi32(C, de); + const __m128i G32 = _mm_srai_epi32(Gs, 8); + const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128()); + const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128()); + const __m128i packed = _mm_shuffle_epi8(G, mask[1]); + BGRX = _mm_or_si128(BGRX, packed); + } + /* Get the B value */ + { + const __m128i c475 = _mm_set1_epi16(475); + const __m128i d475 = + _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475)); + const __m128i Bs = _mm_add_epi32(C, d475); + const __m128i B32 = _mm_srai_epi32(Bs, 8); + const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128()); + const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128()); + const __m128i packed = _mm_shuffle_epi8(B, mask[2]); + BGRX = _mm_or_si128(BGRX, packed); + } + } + _mm_storeu_si128(dst++, BGRX); + return dst; +} + +static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* const WINPR_RESTRICT pSrc[], + const UINT32* WINPR_RESTRICT srcStep, + BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, + const prim_size_t* WINPR_RESTRICT roi) +{ + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + const UINT32 pad = roi->width % 16; + const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + + for (UINT32 y = 0; y < nHeight; y++) + { + __m128i* dst = (__m128i*)(pDst + dstStep * y); + const BYTE* YData = pSrc[0] + y * srcStep[0]; + const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1]; + const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2]; + + for (UINT32 x = 0; x < nWidth - pad; x += 16) + { + const __m128i Y = _mm_loadu_si128((const __m128i*)YData); + const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData); + const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData); + const __m128i U = _mm_shuffle_epi8(uRaw, duplicate); + const __m128i V = _mm_shuffle_epi8(vRaw, duplicate); + YData += 16; + UData += 8; + VData += 8; + dst = ssse3_YUV444Pixel(dst, Y, U, V, 0); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 1); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 2); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 3); + } + + for (UINT32 x = 0; x < pad; x++) + { + const BYTE Y = *YData++; + const BYTE U = *UData; + const BYTE V = *VData; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0); + + if (x % 2) + { + UData++; + VData++; + } + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t ssse3_YUV420ToRGB(const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi); + + default: + return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, 
roi); + } +} + +static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* const WINPR_RESTRICT pSrc[], + const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, + const prim_size_t* WINPR_RESTRICT roi) +{ + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + const UINT32 pad = roi->width % 16; + + for (UINT32 y = 0; y < nHeight; y++) + { + __m128i* dst = (__m128i*)(pDst + dstStep * y); + const BYTE* YData = pSrc[0] + y * srcStep[0]; + const BYTE* UData = pSrc[1] + y * srcStep[1]; + const BYTE* VData = pSrc[2] + y * srcStep[2]; + + for (UINT32 x = 0; x < nWidth - pad; x += 16) + { + __m128i Y = _mm_load_si128((const __m128i*)YData); + __m128i U = _mm_load_si128((const __m128i*)UData); + __m128i V = _mm_load_si128((const __m128i*)VData); + YData += 16; + UData += 16; + VData += 16; + dst = ssse3_YUV444Pixel(dst, Y, U, V, 0); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 1); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 2); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 3); + } + + for (UINT32 x = 0; x < pad; x++) + { + const BYTE Y = *YData++; + const BYTE U = *UData++; + const BYTE V = *VData++; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0); + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[], + const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 || + srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16) + return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + + switch (DstFormat) + { + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi); + + default: + return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +/****************************************************************************/ +/* SSSE3 RGB -> YUV420 conversion **/ +/****************************************************************************/ + +/** + * Note (nfedera): + * The used forward transformation factors from RGB to YUV are based on the + * values specified in [Rec. 
ITU-R BT.709-6] Section 3: + * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en + * + * Y = 0.21260 * R + 0.71520 * G + 0.07220 * B + 0; + * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128; + * V = 0.50000 * R - 0.45415 * G - 0.04585 * B + 128; + * + * The most accurate integer arithmetic approximation when using 8-bit signed + * integer factors with 16-bit signed integer intermediate results is: + * + * Y = ( ( 27 * R + 92 * G + 9 * B) >> 7 ); + * U = ( (-29 * R - 99 * G + 128 * B) >> 8 ) + 128; + * V = ( ( 128 * R - 116 * G - 12 * B) >> 8 ) + 128; + * + * Due to signed 8bit range being [-128,127] the U and V constants of 128 are + * rounded to 127 + */ + +#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9) +#define BGRX_U_FACTORS \ + _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127) +#define BGRX_V_FACTORS \ + _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12) +#define CONST128_FACTORS _mm_set1_epi8(-128) + +#define Y_SHIFT 7 +#define U_SHIFT 8 +#define V_SHIFT 8 + +/* +TODO: +RGB[AX] can simply be supported using the following factors. And instead of loading the +globals directly the functions below could be passed pointers to the correct vectors +depending on the source picture format. + +PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = { + 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0 +}; +PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = { + -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0 +}; +PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = { + 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0 +}; +*/ + +/* compute the luma (Y) component from a single rgb source line */ + +static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width) +{ + __m128i x0; + __m128i x1; + __m128i x2; + __m128i x3; + const __m128i y_factors = BGRX_Y_FACTORS; + const __m128i* argb = (const __m128i*)src; + __m128i* ydst = (__m128i*)dst; + + for (UINT32 x = 0; x < width; x += 16) + { + /* store 16 rgba pixels in 4 128 bit registers */ + x0 = _mm_load_si128(argb++); // 1st 4 pixels + x1 = _mm_load_si128(argb++); // 2nd 4 pixels + x2 = _mm_load_si128(argb++); // 3rd 4 pixels + x3 = _mm_load_si128(argb++); // 4th 4 pixels + /* multiplications and subtotals */ + x0 = _mm_maddubs_epi16(x0, y_factors); + x1 = _mm_maddubs_epi16(x1, y_factors); + x2 = _mm_maddubs_epi16(x2, y_factors); + x3 = _mm_maddubs_epi16(x3, y_factors); + /* the total sums */ + x0 = _mm_hadd_epi16(x0, x1); + x2 = _mm_hadd_epi16(x2, x3); + /* shift the results */ + x0 = _mm_srli_epi16(x0, Y_SHIFT); + x2 = _mm_srli_epi16(x2, Y_SHIFT); + /* pack the 16 words into bytes */ + x0 = _mm_packus_epi16(x0, x2); + /* save to y plane */ + _mm_storeu_si128(ydst++, x0); + } +} + +/* compute the chrominance (UV) components from two rgb source lines */ + +static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1, + const BYTE* WINPR_RESTRICT src2, + BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2, + UINT32 width) +{ + const __m128i u_factors = BGRX_U_FACTORS; + const __m128i v_factors = BGRX_V_FACTORS; + const __m128i vector128 = CONST128_FACTORS; + __m128i x0; + __m128i x1; + __m128i x2; + __m128i x3; + __m128i x4; + __m128i x5; + const __m128i* rgb1 = (const __m128i*)src1; + const __m128i* rgb2 = (const __m128i*)src2; + __m64* udst = (__m64*)dst1; + __m64* vdst = (__m64*)dst2; + + for (UINT32 x = 0; x < width; x += 16) + { + /* 
subsample 16x2 pixels into 16x1 pixels */ + x0 = _mm_load_si128(rgb1++); + x4 = _mm_load_si128(rgb2++); + x0 = _mm_avg_epu8(x0, x4); + x1 = _mm_load_si128(rgb1++); + x4 = _mm_load_si128(rgb2++); + x1 = _mm_avg_epu8(x1, x4); + x2 = _mm_load_si128(rgb1++); + x4 = _mm_load_si128(rgb2++); + x2 = _mm_avg_epu8(x2, x4); + x3 = _mm_load_si128(rgb1++); + x4 = _mm_load_si128(rgb2++); + x3 = _mm_avg_epu8(x3, x4); + /* subsample these 16x1 pixels into 8x1 pixels */ + /** + * shuffle controls + * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88 + * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd + */ + x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88)); + x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd)); + x0 = _mm_avg_epu8(x0, x4); + x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88)); + x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd)); + x1 = _mm_avg_epu8(x1, x4); + /* multiplications and subtotals */ + x2 = _mm_maddubs_epi16(x0, u_factors); + x3 = _mm_maddubs_epi16(x1, u_factors); + x4 = _mm_maddubs_epi16(x0, v_factors); + x5 = _mm_maddubs_epi16(x1, v_factors); + /* the total sums */ + x0 = _mm_hadd_epi16(x2, x3); + x1 = _mm_hadd_epi16(x4, x5); + /* shift the results */ + x0 = _mm_srai_epi16(x0, U_SHIFT); + x1 = _mm_srai_epi16(x1, V_SHIFT); + /* pack the 16 words into bytes */ + x0 = _mm_packs_epi16(x0, x1); + /* add 128 */ + x0 = _mm_sub_epi8(x0, vector128); + /* the lower 8 bytes go to the u plane */ + _mm_storel_pi(udst++, _mm_castsi128_ps(x0)); + /* the upper 8 bytes go to the v plane */ + _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0)); + } +} + +static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[], + const UINT32 dstStep[], + const prim_size_t* WINPR_RESTRICT roi) +{ + const BYTE* argb = pSrc; + BYTE* ydst = pDst[0]; + BYTE* udst = pDst[1]; + BYTE* vdst = pDst[2]; + + if (roi->height < 1 || roi->width < 1) + { + return !PRIMITIVES_SUCCESS; + } + + if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16) + { + return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); + } + + for (UINT32 y = 0; y < roi->height - 1; y += 2) + { + const BYTE* line1 = argb; + const BYTE* line2 = argb + srcStep; + ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width); + ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width); + ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width); + argb += 2 * srcStep; + ydst += 2 * dstStep[0]; + udst += 1 * dstStep[1]; + vdst += 1 * dstStep[2]; + } + + if (roi->height & 1) + { + /* pass the same last line of an odd height twice for UV */ + ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width); + ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width); + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[], + const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi) +{ + switch (srcFormat) + { + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi); + + default: + return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); + } +} + +/****************************************************************************/ +/* SSSE3 RGB -> AVC444-YUV conversion **/ 
+/****************************************************************************/ + +static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW( + const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, + BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2, + BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5, + BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width) +{ + const __m128i* argbEven = (const __m128i*)srcEven; + const __m128i* argbOdd = (const __m128i*)srcOdd; + const __m128i y_factors = BGRX_Y_FACTORS; + const __m128i u_factors = BGRX_U_FACTORS; + const __m128i v_factors = BGRX_V_FACTORS; + const __m128i vector128 = CONST128_FACTORS; + + for (UINT32 x = 0; x < width; x += 16) + { + /* store 16 rgba pixels in 4 128 bit registers */ + const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels + const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels + const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels + const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels + const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels + const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels + const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels + const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels + { + /* Y: multiplications with subtotals and horizontal sums */ + const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors), + _mm_maddubs_epi16(xe2, y_factors)), + Y_SHIFT); + const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors), + _mm_maddubs_epi16(xe4, y_factors)), + Y_SHIFT); + const __m128i ye = _mm_packus_epi16(ye1, ye2); + const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors), + _mm_maddubs_epi16(xo2, y_factors)), + Y_SHIFT); + const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors), + _mm_maddubs_epi16(xo4, y_factors)), + Y_SHIFT); + const __m128i yo = _mm_packus_epi16(yo1, yo2); + /* store y [b1] */ + _mm_storeu_si128((__m128i*)b1Even, ye); + b1Even += 16; + + if (b1Odd) + { + _mm_storeu_si128((__m128i*)b1Odd, yo); + b1Odd += 16; + } + } + { + /* We have now + * 16 even U values in ue + * 16 odd U values in uo + * + * We need to split these according to + * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */ + __m128i ue; + __m128i uo = { 0 }; + { + const __m128i ue1 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors), + _mm_maddubs_epi16(xe2, u_factors)), + U_SHIFT); + const __m128i ue2 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors), + _mm_maddubs_epi16(xe4, u_factors)), + U_SHIFT); + ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128); + } + + if (b1Odd) + { + const __m128i uo1 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors), + _mm_maddubs_epi16(xo2, u_factors)), + U_SHIFT); + const __m128i uo2 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors), + _mm_maddubs_epi16(xo4, u_factors)), + U_SHIFT); + uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128); + } + + /* Now we need the following storage distribution: + * 2x 2y -> b2 + * x 2y+1 -> b4 + * 2x+1 2y -> b6 */ + if (b1Odd) /* b2 */ + { + const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128()); + const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128()); + const __m128i hi = _mm_add_epi16(ueh, uoh); + const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128()); + const __m128i uol = 
_mm_unpacklo_epi8(uo, _mm_setzero_si128()); + const __m128i lo = _mm_add_epi16(uel, uol); + const __m128i added = _mm_hadd_epi16(lo, hi); + const __m128i avg16 = _mm_srai_epi16(added, 2); + const __m128i avg = _mm_packus_epi16(avg16, avg16); + _mm_storel_epi64((__m128i*)b2, avg); + } + else + { + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); + const __m128i ud = _mm_shuffle_epi8(ue, mask); + _mm_storel_epi64((__m128i*)b2, ud); + } + + b2 += 8; + + if (b1Odd) /* b4 */ + { + _mm_store_si128((__m128i*)b4, uo); + b4 += 16; + } + + { + /* b6 */ + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); + const __m128i ude = _mm_shuffle_epi8(ue, mask); + _mm_storel_epi64((__m128i*)b6, ude); + b6 += 8; + } + } + { + /* We have now + * 16 even V values in ue + * 16 odd V values in uo + * + * We need to split these according to + * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */ + __m128i ve; + __m128i vo = { 0 }; + { + const __m128i ve1 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors), + _mm_maddubs_epi16(xe2, v_factors)), + V_SHIFT); + const __m128i ve2 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors), + _mm_maddubs_epi16(xe4, v_factors)), + V_SHIFT); + ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128); + } + + if (b1Odd) + { + const __m128i vo1 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors), + _mm_maddubs_epi16(xo2, v_factors)), + V_SHIFT); + const __m128i vo2 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors), + _mm_maddubs_epi16(xo4, v_factors)), + V_SHIFT); + vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128); + } + + /* Now we need the following storage distribution: + * 2x 2y -> b3 + * x 2y+1 -> b5 + * 2x+1 2y -> b7 */ + if (b1Odd) /* b3 */ + { + const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128()); + const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128()); + const __m128i hi = _mm_add_epi16(veh, voh); + const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128()); + const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128()); + const __m128i lo = _mm_add_epi16(vel, vol); + const __m128i added = _mm_hadd_epi16(lo, hi); + const __m128i avg16 = _mm_srai_epi16(added, 2); + const __m128i avg = _mm_packus_epi16(avg16, avg16); + _mm_storel_epi64((__m128i*)b3, avg); + } + else + { + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); + const __m128i vd = _mm_shuffle_epi8(ve, mask); + _mm_storel_epi64((__m128i*)b3, vd); + } + + b3 += 8; + + if (b1Odd) /* b5 */ + { + _mm_store_si128((__m128i*)b5, vo); + b5 += 16; + } + + { + /* b7 */ + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); + const __m128i vde = _mm_shuffle_epi8(ve, mask); + _mm_storel_epi64((__m128i*)b7, vde); + b7 += 8; + } + } + } +} + +static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], + const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], + const UINT32 dst2Step[], + const prim_size_t* WINPR_RESTRICT roi) +{ + const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep; + + if (roi->height < 1 || 
roi->width < 1) + return !PRIMITIVES_SUCCESS; + + if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16) + return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, + roi); + + for (UINT32 y = 0; y < roi->height; y += 2) + { + const BOOL last = (y >= (roi->height - 1)); + const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc; + const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc; + const UINT32 i = y >> 1; + const UINT32 n = (i & ~7) + i; + BYTE* b1Even = pDst1[0] + y * dst1Step[0]; + BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL; + BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1]; + BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2]; + BYTE* b4 = pDst2[0] + dst2Step[0] * n; + BYTE* b5 = b4 + 8 * dst2Step[0]; + BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1]; + BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2]; + ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7, + roi->width); + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], + const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], + const UINT32 dst2Step[], + const prim_size_t* WINPR_RESTRICT roi) +{ + switch (srcFormat) + { + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, + dst2Step, roi); + + default: + return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, + dst2Step, roi); + } +} + +/* Mapping of arguments: + * + * b1 [even lines] -> yLumaDstEven + * b1 [odd lines] -> yLumaDstOdd + * b2 -> uLumaDst + * b3 -> vLumaDst + * b4 -> yChromaDst1 + * b5 -> yChromaDst2 + * b6 -> uChromaDst1 + * b7 -> uChromaDst2 + * b8 -> vChromaDst1 + * b9 -> vChromaDst2 + */ +static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW( + const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, + BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd, + BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst, + BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2, + BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2, + BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2, + BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width) +{ + const __m128i vector128 = CONST128_FACTORS; + const __m128i* argbEven = (const __m128i*)srcEven; + const __m128i* argbOdd = (const __m128i*)srcOdd; + + for (UINT32 x = 0; x < width; x += 16) + { + /* store 16 rgba pixels in 4 128 bit registers + * for even and odd rows. 
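+         * (16 BGRX pixels = 64 bytes = four 16-byte loads per row; both the
+         * even and the odd source rows are read here so that one pass emits
+         * both luma rows plus the subsampled chroma for the row pair.)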
+ */ + const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */ + const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */ + const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */ + const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */ + const __m128i xo1 = _mm_load_si128(argbOdd++); /* 1st 4 pixels */ + const __m128i xo2 = _mm_load_si128(argbOdd++); /* 2nd 4 pixels */ + const __m128i xo3 = _mm_load_si128(argbOdd++); /* 3rd 4 pixels */ + const __m128i xo4 = _mm_load_si128(argbOdd++); /* 4th 4 pixels */ + { + /* Y: multiplications with subtotals and horizontal sums */ + const __m128i y_factors = BGRX_Y_FACTORS; + const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors), + _mm_maddubs_epi16(xe2, y_factors)), + Y_SHIFT); + const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors), + _mm_maddubs_epi16(xe4, y_factors)), + Y_SHIFT); + const __m128i ye = _mm_packus_epi16(ye1, ye2); + /* store y [b1] */ + _mm_storeu_si128((__m128i*)yLumaDstEven, ye); + yLumaDstEven += 16; + } + + if (yLumaDstOdd) + { + const __m128i y_factors = BGRX_Y_FACTORS; + const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors), + _mm_maddubs_epi16(xo2, y_factors)), + Y_SHIFT); + const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors), + _mm_maddubs_epi16(xo4, y_factors)), + Y_SHIFT); + const __m128i yo = _mm_packus_epi16(yo1, yo2); + _mm_storeu_si128((__m128i*)yLumaDstOdd, yo); + yLumaDstOdd += 16; + } + + { + /* We have now + * 16 even U values in ue + * 16 odd U values in uo + * + * We need to split these according to + * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */ + /* U: multiplications with subtotals and horizontal sums */ + __m128i ue; + __m128i uo; + __m128i uavg; + { + const __m128i u_factors = BGRX_U_FACTORS; + const __m128i ue1 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors), + _mm_maddubs_epi16(xe2, u_factors)), + U_SHIFT); + const __m128i ue2 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors), + _mm_maddubs_epi16(xe4, u_factors)), + U_SHIFT); + const __m128i ueavg = _mm_hadd_epi16(ue1, ue2); + ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128); + uavg = ueavg; + } + { + const __m128i u_factors = BGRX_U_FACTORS; + const __m128i uo1 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors), + _mm_maddubs_epi16(xo2, u_factors)), + U_SHIFT); + const __m128i uo2 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors), + _mm_maddubs_epi16(xo4, u_factors)), + U_SHIFT); + const __m128i uoavg = _mm_hadd_epi16(uo1, uo2); + uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128); + uavg = _mm_add_epi16(uavg, uoavg); + uavg = _mm_srai_epi16(uavg, 2); + uavg = _mm_packs_epi16(uavg, uoavg); + uavg = _mm_sub_epi8(uavg, vector128); + } + /* Now we need the following storage distribution: + * 2x 2y -> uLumaDst + * 2x+1 y -> yChromaDst1 + * 4x 2y+1 -> uChromaDst1 + * 4x+2 2y+1 -> vChromaDst1 */ + { + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); + const __m128i ude = _mm_shuffle_epi8(ue, mask); + _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude); + yEvenChromaDst1 += 8; + } + + if (yLumaDstOdd) + { + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); + const __m128i udo = 
_mm_shuffle_epi8(uo, mask); + _mm_storel_epi64((__m128i*)yOddChromaDst1, udo); + yOddChromaDst1 += 8; + } + + if (yLumaDstOdd) + { + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0); + const __m128i ud = _mm_shuffle_epi8(uo, mask); + int* uDst1 = (int*)uChromaDst1; + int* vDst1 = (int*)vChromaDst1; + const int* src = (const int*)&ud; + _mm_stream_si32(uDst1, src[0]); + _mm_stream_si32(vDst1, src[1]); + uChromaDst1 += 4; + vChromaDst1 += 4; + } + + if (yLumaDstOdd) + { + _mm_storel_epi64((__m128i*)uLumaDst, uavg); + uLumaDst += 8; + } + else + { + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); + const __m128i ud = _mm_shuffle_epi8(ue, mask); + _mm_storel_epi64((__m128i*)uLumaDst, ud); + uLumaDst += 8; + } + } + + { + /* V: multiplications with subtotals and horizontal sums */ + __m128i ve; + __m128i vo; + __m128i vavg; + { + const __m128i v_factors = BGRX_V_FACTORS; + const __m128i ve1 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors), + _mm_maddubs_epi16(xe2, v_factors)), + V_SHIFT); + const __m128i ve2 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors), + _mm_maddubs_epi16(xe4, v_factors)), + V_SHIFT); + const __m128i veavg = _mm_hadd_epi16(ve1, ve2); + ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128); + vavg = veavg; + } + { + const __m128i v_factors = BGRX_V_FACTORS; + const __m128i vo1 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors), + _mm_maddubs_epi16(xo2, v_factors)), + V_SHIFT); + const __m128i vo2 = + _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors), + _mm_maddubs_epi16(xo4, v_factors)), + V_SHIFT); + const __m128i voavg = _mm_hadd_epi16(vo1, vo2); + vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128); + vavg = _mm_add_epi16(vavg, voavg); + vavg = _mm_srai_epi16(vavg, 2); + vavg = _mm_packs_epi16(vavg, voavg); + vavg = _mm_sub_epi8(vavg, vector128); + } + /* Now we need the following storage distribution: + * 2x 2y -> vLumaDst + * 2x+1 y -> yChromaDst2 + * 4x 2y+1 -> uChromaDst2 + * 4x+2 2y+1 -> vChromaDst2 */ + { + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i vde = _mm_shuffle_epi8(ve, mask); + _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde); + yEvenChromaDst2 += 8; + } + + if (yLumaDstOdd) + { + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i vdo = _mm_shuffle_epi8(vo, mask); + _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo); + yOddChromaDst2 += 8; + } + + if (yLumaDstOdd) + { + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0); + const __m128i vd = _mm_shuffle_epi8(vo, mask); + int* uDst2 = (int*)uChromaDst2; + int* vDst2 = (int*)vChromaDst2; + const int* src = (const int*)&vd; + _mm_stream_si32(uDst2, src[0]); + _mm_stream_si32(vDst2, src[1]); + uChromaDst2 += 4; + vChromaDst2 += 4; + } + + if (yLumaDstOdd) + { + _mm_storel_epi64((__m128i*)vLumaDst, vavg); + vLumaDst += 8; + } + else + { + const __m128i mask = + _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, 
(char)0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i vd = _mm_shuffle_epi8(ve, mask); + _mm_storel_epi64((__m128i*)vLumaDst, vd); + vLumaDst += 8; + } + } + } +} + +static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], + const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], + const UINT32 dst2Step[], + const prim_size_t* WINPR_RESTRICT roi) +{ + if (roi->height < 1 || roi->width < 1) + return !PRIMITIVES_SUCCESS; + + if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16) + return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, + roi); + + for (UINT32 y = 0; y < roi->height; y += 2) + { + const BYTE* srcEven = (pSrc + y * srcStep); + const BYTE* srcOdd = (srcEven + srcStep); + BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]); + BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL; + BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]); + BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]); + BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]); + BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2; + BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0]; + BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0]; + BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]); + BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]); + BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4; + BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4; + ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU, + dstLumaV, dstEvenChromaY1, dstEvenChromaY2, + dstOddChromaY1, dstOddChromaY2, dstChromaU1, + dstChromaU2, dstChromaV1, dstChromaV2, roi->width); + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[], + const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[], + const UINT32 dst2Step[], + const prim_size_t* WINPR_RESTRICT roi) +{ + switch (srcFormat) + { + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, + dst2Step, roi); + + default: + return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, + dst2Step, roi); + } +} + +static pstatus_t ssse3_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[], + const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDstRaw[], + const UINT32 dstStep[], const RECTANGLE_16* WINPR_RESTRICT roi) +{ + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 oddX = 1; + const UINT32 evenX = 0; + const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left, + pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, + pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 }; + BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left, + pDstRaw[1] + roi->top * dstStep[1] + roi->left, + pDstRaw[2] + roi->top * dstStep[2] + roi->left }; + + /* Y data is already here... */ + /* B1 */ + for (UINT32 y = 0; y < nHeight; y++) + { + const BYTE* Ym = pSrc[0] + srcStep[0] * y; + BYTE* pY = pDst[0] + dstStep[0] * y; + memcpy(pY, Ym, nWidth); + } + + /* The first half of U, V are already here part of this frame. 
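+     * As a sketch, this pass replicates each half-resolution chroma sample
+     * into a 2x2 block of the 444 output, e.g. for U:
+     *
+     *     U444[2y][2x]   = U444[2y][2x+1] =
+     *     U444[2y+1][2x] = U444[2y+1][2x+1] = U420[y][x]
+     *
+     * The true odd-position samples arrive later with the chroma frames
+     * (B4-B7) and overwrite these copies.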
*/ + /* B2 and B3 */ + for (UINT32 y = 0; y < halfHeight; y++) + { + const UINT32 val2y = (2 * y + evenY); + const UINT32 val2y1 = val2y + oddY; + const BYTE* Um = pSrc[1] + srcStep[1] * y; + const BYTE* Vm = pSrc[2] + srcStep[2] * y; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; + BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; + + UINT32 x = 0; + for (; x < halfWidth - halfPad; x += 16) + { + const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i unpackLow = + _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8); + { + const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]); + const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); + const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); + _mm_storeu_si128((__m128i*)&pU[2 * x], uHigh); + _mm_storeu_si128((__m128i*)&pU[2 * x + 16], uLow); + _mm_storeu_si128((__m128i*)&pU1[2 * x], uHigh); + _mm_storeu_si128((__m128i*)&pU1[2 * x + 16], uLow); + } + { + const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]); + const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); + const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); + _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh); + _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow); + _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh); + _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow); + } + } + + for (; x < halfWidth; x++) + { + const UINT32 val2x = 2 * x + evenX; + const UINT32 val2x1 = val2x + oddX; + pU[val2x] = Um[x]; + pV[val2x] = Vm[x]; + pU[val2x1] = Um[x]; + pV[val2x1] = Vm[x]; + pU1[val2x] = Um[x]; + pV1[val2x] = Vm[x]; + pU1[val2x1] = Um[x]; + pV1[val2x1] = Vm[x]; + } + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2) +{ + const __m128i even = _mm_set_epi8((char)0x80, 14, (char)0x80, 12, (char)0x80, 10, (char)0x80, 8, + (char)0x80, 6, (char)0x80, 4, (char)0x80, 2, (char)0x80, 0); + const __m128i odd = _mm_set_epi8((char)0x80, 15, (char)0x80, 13, (char)0x80, 11, (char)0x80, 9, + (char)0x80, 7, (char)0x80, 5, (char)0x80, 3, (char)0x80, 1); + const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0); + const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst); + const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2); + const __m128i uEven = _mm_shuffle_epi8(u, even); + const __m128i uEven4 = _mm_slli_epi16(uEven, 2); + const __m128i uOdd = _mm_shuffle_epi8(u, odd); + const __m128i u1Even = _mm_shuffle_epi8(u1, even); + const __m128i u1Odd = _mm_shuffle_epi8(u1, odd); + const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even); + const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd); + const __m128i result = _mm_sub_epi16(uEven4, tmp2); + const __m128i packed = _mm_packus_epi16(result, uOdd); + const __m128i interleaved = _mm_shuffle_epi8(packed, interleave); + _mm_storeu_si128((__m128i*)pSrcDst, interleaved); +} + +static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[], + const RECTANGLE_16* WINPR_RESTRICT roi) +{ + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + + /* Filter */ + for (UINT32 y = roi->top; y < halfHeight + roi->top; y++) + { + UINT32 x = roi->left; + const UINT32 
val2y = (y * 2 + evenY);
+        const UINT32 val2y1 = val2y + oddY;
+        BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
+        BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
+        BYTE* pU = pDst[1] + dstStep[1] * val2y;
+        BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+        if (val2y1 > nHeight)
+            continue;
+
+        for (; x < halfWidth + roi->left - halfPad; x += 16)
+        {
+            ssse3_filter(&pU[2 * x], &pU1[2 * x]);
+            ssse3_filter(&pV[2 * x], &pV1[2 * x]);
+        }
+
+        for (; x < halfWidth + roi->left; x++)
+        {
+            const UINT32 val2x = (x * 2);
+            const UINT32 val2x1 = val2x + 1;
+            const BYTE inU = pU[val2x];
+            const BYTE inV = pV[val2x];
+            const INT32 up = inU * 4;
+            const INT32 vp = inV * 4;
+            INT32 u2020 = 0;
+            INT32 v2020 = 0;
+
+            if (val2x1 > nWidth)
+                continue;
+
+            u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
+            v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
+            pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
+            pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
+        }
+    }
+
+    return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
+                                        const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
+                                        const UINT32 dstStep[3],
+                                        const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+    const UINT32 mod = 16;
+    UINT32 uY = 0;
+    UINT32 vY = 0;
+    const UINT32 nWidth = roi->right - roi->left;
+    const UINT32 nHeight = roi->bottom - roi->top;
+    const UINT32 halfWidth = (nWidth + 1) / 2;
+    const UINT32 halfPad = halfWidth % 16;
+    const UINT32 halfHeight = (nHeight + 1) / 2;
+    const UINT32 oddY = 1;
+    const UINT32 evenY = 0;
+    const UINT32 oddX = 1;
+    /* The auxiliary frame is aligned to multiples of 16x16.
+     * We need the padded height for B4 and B5 conversion. */
+    const UINT32 padHeigth = nHeight + 16 - nHeight % 16;
+    const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+                            pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+                            pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+    BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+                      pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+                      pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
+                                      (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
+
+    /* The second half of U and V is a bit more tricky...
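+     * As a sketch of the packing (B4/B5): the odd chroma rows travel in the
+     * luma plane of the auxiliary frame, in bands of 16 rows where the first
+     * 8 rows of each band carry U lines and the last 8 carry V lines; that
+     * is what the (y % mod) test below selects. Line n of each plane is
+     * copied to the odd destination row 2*n + 1.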
*/ + /* B4 and B5 */ + for (UINT32 y = 0; y < padHeigth; y++) + { + const BYTE* Ya = pSrc[0] + srcStep[0] * y; + BYTE* pX = NULL; + + if ((y) % mod < (mod + 1) / 2) + { + const UINT32 pos = (2 * uY++ + oddY); + + if (pos >= nHeight) + continue; + + pX = pDst[1] + dstStep[1] * pos; + } + else + { + const UINT32 pos = (2 * vY++ + oddY); + + if (pos >= nHeight) + continue; + + pX = pDst[2] + dstStep[2] * pos; + } + + memcpy(pX, Ya, nWidth); + } + + /* B6 and B7 */ + for (UINT32 y = 0; y < halfHeight; y++) + { + const UINT32 val2y = (y * 2 + evenY); + const BYTE* Ua = pSrc[1] + srcStep[1] * y; + const BYTE* Va = pSrc[2] + srcStep[2] * y; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + + UINT32 x = 0; + for (; x < halfWidth - halfPad; x += 16) + { + { + const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]); + const __m128i u2 = _mm_unpackhi_epi8(u, zero); + const __m128i u1 = _mm_unpacklo_epi8(u, zero); + _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); + _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); + } + { + const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]); + const __m128i u2 = _mm_unpackhi_epi8(u, zero); + const __m128i u1 = _mm_unpacklo_epi8(u, zero); + _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]); + _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]); + } + } + + for (; x < halfWidth; x++) + { + const UINT32 val2x1 = (x * 2 + oddX); + pU[val2x1] = Ua[x]; + pV[val2x1] = Va[x]; + } + } + + /* Filter */ + return ssse3_ChromaFilter(pDst, dstStep, roi); +} + +static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], UINT32 nTotalWidth, + UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3], + const UINT32 dstStep[3], + const RECTANGLE_16* WINPR_RESTRICT roi) +{ + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 quaterWidth = (nWidth + 3) / 4; + const UINT32 quaterPad = quaterWidth % 16; + const __m128i zero = _mm_setzero_si128(); + const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, + (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0); + const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, + 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80); + const __m128i shuffle1 = + _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11, + (char)0x80, 10, (char)0x80, 9, (char)0x80, 8); + const __m128i shuffle2 = + _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3, + (char)0x80, 2, (char)0x80, 1, (char)0x80, 0); + + /* B4 and B5: odd UV values for width/2, height */ + for (UINT32 y = 0; y < nHeight; y++) + { + const UINT32 yTop = y + roi->top; + const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2; + const BYTE* pYaV = pYaU + nTotalWidth / 2; + BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left; + BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left; + + UINT32 x = 0; + for (; x < halfWidth - halfPad; x += 16) + { + { + const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]); + const __m128i u2 = _mm_unpackhi_epi8(zero, u); + const __m128i u1 = _mm_unpacklo_epi8(zero, u); + _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); + _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); + } + { + const __m128i v = 
_mm_loadu_si128((const __m128i*)&pYaV[x]); + const __m128i v2 = _mm_unpackhi_epi8(zero, v); + const __m128i v1 = _mm_unpacklo_epi8(zero, v); + _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]); + _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]); + } + } + + for (; x < halfWidth; x++) + { + const UINT32 odd = 2 * x + 1; + pU[odd] = pYaU[x]; + pV[odd] = pYaV[x]; + } + } + + /* B6 - B9 */ + for (UINT32 y = 0; y < halfHeight; y++) + { + const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4; + const BYTE* pUaV = pUaU + nTotalWidth / 4; + const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4; + const BYTE* pVaV = pVaU + nTotalWidth / 4; + BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left; + BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left; + + UINT32 x = 0; + for (; x < quaterWidth - quaterPad; x += 16) + { + { + const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]); + const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]); + const __m128i uHigh = _mm_unpackhi_epi8(uU, uV); + const __m128i uLow = _mm_unpacklo_epi8(uU, uV); + const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2); + const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1); + const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2); + const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1); + _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]); + _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]); + _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]); + _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]); + } + { + const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]); + const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]); + const __m128i vHigh = _mm_unpackhi_epi8(vU, vV); + const __m128i vLow = _mm_unpacklo_epi8(vU, vV); + const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2); + const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1); + const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2); + const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1); + _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]); + _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]); + _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]); + _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]); + } + } + + for (; x < quaterWidth; x++) + { + pU[4 * x + 0] = pUaU[x]; + pV[4 * x + 0] = pUaV[x]; + pU[4 * x + 2] = pVaU[x]; + pV[4 * x + 2] = pVaV[x]; + } + } + + return ssse3_ChromaFilter(pDst, dstStep, roi); +} + +static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type, + const BYTE* const WINPR_RESTRICT pSrc[3], + const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight, + BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* WINPR_RESTRICT roi) +{ + if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2]) + return -1; + + if (!pDst || !pDst[0] || !pDst[1] || !pDst[2]) + return -1; + + if (!roi) + return -1; + + switch (type) + { + case AVC444_LUMA: + return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi); + + case AVC444_CHROMAv1: + return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi); + + case AVC444_CHROMAv2: + return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi); + + default: + return -1; + } +} + +void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims) +{ + generic = primitives_get_generic(); + primitives_init_YUV(prims); + + if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && + IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) + { + prims->RGBToYUV420_8u_P3AC4R = 
ssse3_RGBToYUV420; + prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV; + prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2; + prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB; + prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R; + prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444; + } +} diff --git a/libfreerdp/primitives/prim_add.c b/libfreerdp/primitives/prim_add.c new file mode 100644 index 0000000..674e04f --- /dev/null +++ b/libfreerdp/primitives/prim_add.c @@ -0,0 +1,48 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Add operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> + +#include "prim_internal.h" + +/* ---------------------------------------------------------------------------- + * 16-bit signed add with saturation (under and over). + */ +static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16* pDst, UINT32 len) +{ + while (len--) + { + INT32 k = (INT32)(*pSrc1++) + (INT32)(*pSrc2++); + + if (k > 32767) + *pDst++ = ((INT16)32767); + else if (k < -32768) + *pDst++ = ((INT16)-32768); + else + *pDst++ = (INT16)k; + } + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +void primitives_init_add(primitives_t* prims) +{ + prims->add_16s = general_add_16s; +} diff --git a/libfreerdp/primitives/prim_add_opt.c b/libfreerdp/primitives/prim_add_opt.c new file mode 100644 index 0000000..88c8b66 --- /dev/null +++ b/libfreerdp/primitives/prim_add_opt.c @@ -0,0 +1,61 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized add operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ * + */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include <winpr/sysinfo.h> + +#ifdef WITH_SSE2 +#include <emmintrin.h> +#include <pmmintrin.h> +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +#include <ipps.h> +#endif /* WITH_IPP */ + +#include "prim_internal.h" +#include "prim_templates.h" + +static primitives_t* generic = NULL; + +#ifdef WITH_SSE2 +#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +/* ------------------------------------------------------------------------- */ +SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16, + generic->add_16s(sptr1++, sptr2++, dptr++, 1)) +#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims) +{ + generic = primitives_get_generic(); + primitives_init_add(prims); +#ifdef WITH_IPP + prims->add_16s = (__add_16s_t)ippsAdd_16s; +#elif defined(WITH_SSE2) + + if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && + IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */ + { + prims->add_16s = sse3_add_16s; + } + +#endif +} diff --git a/libfreerdp/primitives/prim_alphaComp.c b/libfreerdp/primitives/prim_alphaComp.c new file mode 100644 index 0000000..fe4f8dc --- /dev/null +++ b/libfreerdp/primitives/prim_alphaComp.c @@ -0,0 +1,94 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Alpha blending routines. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * Note: this code assumes the second operand is fully opaque, + * e.g. + * newval = alpha1*val1 + (1-alpha1)*val2 + * rather than + * newval = alpha1*val1 + (1-alpha1)*alpha2*val2 + * The IPP gives other options. + */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> + +#include "prim_internal.h" + +#define ALPHA(_k_) (((_k_)&0xFF000000U) >> 24) +#define RED(_k_) (((_k_)&0x00FF0000U) >> 16) +#define GRN(_k_) (((_k_)&0x0000FF00U) >> 8) +#define BLU(_k_) (((_k_)&0x000000FFU)) + +/* ------------------------------------------------------------------------- */ +static pstatus_t general_alphaComp_argb(const BYTE* pSrc1, UINT32 src1Step, const BYTE* pSrc2, + UINT32 src2Step, BYTE* pDst, UINT32 dstStep, UINT32 width, + UINT32 height) +{ + for (UINT32 y = 0; y < height; y++) + { + const UINT32* sptr1 = (const UINT32*)(pSrc1 + y * src1Step); + const UINT32* sptr2 = (const UINT32*)(pSrc2 + y * src2Step); + UINT32* dptr = (UINT32*)(pDst + y * dstStep); + + for (UINT32 x = 0; x < width; x++) + { + const UINT32 src1 = *sptr1++; + const UINT32 src2 = *sptr2++; + UINT32 alpha = ALPHA(src1) + 1; + + if (alpha == 256) + { + /* If alpha is 255+1, just copy src1. */ + *dptr++ = src1; + } + else if (alpha <= 1) + { + /* If alpha is 0+1, just copy src2. 
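+                 * (alpha == 1 corresponds to a source alpha byte of 0:
+                 * src1 is fully transparent, so the opaque src2 wins.)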
+                 */
+                *dptr++ = src2;
+            }
+            else
+            {
+                /* A perfectly accurate blend would do (a*src + (255-a)*dst)/255
+                 * rather than adding one to alpha and dividing by 256, but this
+                 * is much faster, and the result differs from the exact value
+                 * (by at most one) only about 16% of the time.
+                 * I'm not sure who first designed the double-ops trick of
+                 * processing the Red/Blue and Alpha/Green channel pairs
+                 * together in single 32-bit operations.
+                 */
+                UINT32 rb = 0;
+                UINT32 ag = 0;
+                UINT32 s2rb = src2 & 0x00FF00FFU;
+                UINT32 s2ag = (src2 >> 8) & 0x00FF00FFU;
+                UINT32 s1rb = src1 & 0x00FF00FFU;
+                UINT32 s1ag = (src1 >> 8) & 0x00FF00FFU;
+                UINT32 drb = s1rb - s2rb;
+                UINT32 dag = s1ag - s2ag;
+                drb *= alpha;
+                dag *= alpha;
+                rb = ((drb >> 8) + s2rb) & 0x00FF00FFU;
+                ag = (((dag >> 8) + s2ag) << 8) & 0xFF00FF00U;
+                *dptr++ = rb | ag;
+            }
+        }
+    }
+
+    return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_alphaComp(primitives_t* prims)
+{
+    prims->alphaComp_argb = general_alphaComp_argb;
+}
diff --git a/libfreerdp/primitives/prim_alphaComp_opt.c b/libfreerdp/primitives/prim_alphaComp_opt.c
new file mode 100644
index 0000000..2c675a4
--- /dev/null
+++ b/libfreerdp/primitives/prim_alphaComp_opt.c
@@ -0,0 +1,245 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized alpha blending routines.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ * Note: this code assumes the second operand is fully opaque,
+ * e.g.
+ * newval = alpha1*val1 + (1-alpha1)*val2
+ * rather than
+ * newval = alpha1*val1 + (1-alpha1)*alpha2*val2
+ * The IPP gives other options.
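+ *
+ * A worked example with byte values (purely illustrative): blending
+ * val1 = 200 over val2 = 100 at alpha1 = 128 (~50%) gives
+ *     (128*200 + 127*100) / 255 = 150 (rounded),
+ * and the shift-based approximation used in this file,
+ *     (((200 - 100) * (128 + 1)) >> 8) + 100 = 150,
+ * agrees to within one.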
+ */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include <winpr/sysinfo.h> + +#ifdef WITH_SSE2 +#include <emmintrin.h> +#include <pmmintrin.h> +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +#include <ippi.h> +#endif /* WITH_IPP */ + +#include "prim_internal.h" + +static primitives_t* generic = NULL; + +/* ------------------------------------------------------------------------- */ +#ifdef WITH_SSE2 +#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) + +static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step, + const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step, + BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width, + UINT32 height) +{ + const UINT32* sptr1 = (const UINT32*)pSrc1; + const UINT32* sptr2 = (const UINT32*)pSrc2; + UINT32* dptr = NULL; + int linebytes = 0; + int src1Jump = 0; + int src2Jump = 0; + int dstJump = 0; + __m128i xmm0; + __m128i xmm1; + + if ((width <= 0) || (height <= 0)) + return PRIMITIVES_SUCCESS; + + if (width < 4) /* pointless if too small */ + { + return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width, + height); + } + + dptr = (UINT32*)pDst; + linebytes = width * sizeof(UINT32); + src1Jump = (src1Step - linebytes) / sizeof(UINT32); + src2Jump = (src2Step - linebytes) / sizeof(UINT32); + dstJump = (dstStep - linebytes) / sizeof(UINT32); + xmm0 = _mm_set1_epi32(0); + xmm1 = _mm_set1_epi16(1); + + for (UINT32 y = 0; y < height; ++y) + { + int pixels = width; + int count = 0; + /* Get to the 16-byte boundary now. */ + int leadIn = 0; + + switch ((ULONG_PTR)dptr & 0x0f) + { + case 0: + leadIn = 0; + break; + + case 4: + leadIn = 3; + break; + + case 8: + leadIn = 2; + break; + + case 12: + leadIn = 1; + break; + + default: + /* We'll never hit a 16-byte boundary, so do the whole + * thing the slow way. + */ + leadIn = width; + break; + } + + if (leadIn) + { + pstatus_t status = 0; + status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2, + src2Step, (BYTE*)dptr, dstStep, leadIn, 1); + if (status != PRIMITIVES_SUCCESS) + return status; + + sptr1 += leadIn; + sptr2 += leadIn; + dptr += leadIn; + pixels -= leadIn; + } + + /* Use SSE registers to do 4 pixels at a time. 
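+         * Per channel the vector code below computes, with a = alpha + 1,
+         *     dst = (((src1 - src2) * a) >> 8) + src2
+         * which is the same shift-based blend as the generic C fallback,
+         * just applied to four pixels per iteration.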
*/ + count = pixels >> 2; + pixels -= count << 2; + + while (count--) + { + __m128i xmm2; + __m128i xmm3; + __m128i xmm4; + __m128i xmm5; + __m128i xmm6; + __m128i xmm7; + /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */ + xmm2 = LOAD_SI128(sptr1); + sptr1 += 4; + /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */ + xmm3 = LOAD_SI128(sptr2); + sptr2 += 4; + /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */ + xmm4 = _mm_unpackhi_epi8(xmm2, xmm0); + /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */ + xmm5 = _mm_unpackhi_epi8(xmm3, xmm0); + /* subtract */ + xmm6 = _mm_subs_epi16(xmm4, xmm5); + /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */ + xmm4 = _mm_shufflelo_epi16(xmm4, 0xff); + /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */ + xmm4 = _mm_shufflehi_epi16(xmm4, 0xff); + /* Add one to alphas */ + xmm4 = _mm_adds_epi16(xmm4, xmm1); + /* Multiply and take low word */ + xmm4 = _mm_mullo_epi16(xmm4, xmm6); + /* Shift 8 right */ + xmm4 = _mm_srai_epi16(xmm4, 8); + /* Add xmm5 */ + xmm4 = _mm_adds_epi16(xmm4, xmm5); + /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */ + /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */ + xmm5 = _mm_unpacklo_epi8(xmm2, xmm0); + /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */ + xmm6 = _mm_unpacklo_epi8(xmm3, xmm0); + /* subtract */ + xmm7 = _mm_subs_epi16(xmm5, xmm6); + /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */ + xmm5 = _mm_shufflelo_epi16(xmm5, 0xff); + /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */ + xmm5 = _mm_shufflehi_epi16(xmm5, 0xff); + /* Add one to alphas */ + xmm5 = _mm_adds_epi16(xmm5, xmm1); + /* Multiply and take low word */ + xmm5 = _mm_mullo_epi16(xmm5, xmm7); + /* Shift 8 right */ + xmm5 = _mm_srai_epi16(xmm5, 8); + /* Add xmm6 */ + xmm5 = _mm_adds_epi16(xmm5, xmm6); + /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */ + /* Must mask off remainders or pack gets confused */ + xmm3 = _mm_set1_epi16(0x00ffU); + xmm4 = _mm_and_si128(xmm4, xmm3); + xmm5 = _mm_and_si128(xmm5, xmm3); + /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */ + xmm5 = _mm_packus_epi16(xmm5, xmm4); + _mm_store_si128((__m128i*)dptr, xmm5); + dptr += 4; + } + + /* Finish off the remainder. */ + if (pixels) + { + pstatus_t status = 0; + status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2, + src2Step, (BYTE*)dptr, dstStep, pixels, 1); + if (status != PRIMITIVES_SUCCESS) + return status; + + sptr1 += pixels; + sptr2 += pixels; + dptr += pixels; + } + + /* Jump to next row. 
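+         * (The *Jump values are counted in 32-bit pixels, not bytes, since
+         * sptr1/sptr2/dptr are UINT32 pointers; they skip any row padding.)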
*/ + sptr1 += src1Jump; + sptr2 += src2Jump; + dptr += dstJump; + } + + return PRIMITIVES_SUCCESS; +} +#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +#ifdef WITH_IPP +/* ------------------------------------------------------------------------- */ +static pstatus_t ipp_alphaComp_argb(const BYTE* pSrc1, INT32 src1Step, const BYTE* pSrc2, + INT32 src2Step, BYTE* pDst, INT32 dstStep, INT32 width, + INT32 height) +{ + IppiSize sz; + sz.width = width; + sz.height = height; + return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, sz, ippAlphaOver); +} +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims) +{ + generic = primitives_get_generic(); + primitives_init_alphaComp(prims); +#ifdef WITH_IPP + prims->alphaComp_argb = ipp_alphaComp_argb; +#elif defined(WITH_SSE2) + + if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && + IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */ + { + prims->alphaComp_argb = sse2_alphaComp_argb; + } + +#endif +} diff --git a/libfreerdp/primitives/prim_andor.c b/libfreerdp/primitives/prim_andor.c new file mode 100644 index 0000000..9216546 --- /dev/null +++ b/libfreerdp/primitives/prim_andor.c @@ -0,0 +1,57 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Logical operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> + +#include "prim_internal.h" + +/* ---------------------------------------------------------------------------- + * 32-bit AND with a constant. + */ +static pstatus_t general_andC_32u(const UINT32* pSrc, UINT32 val, UINT32* pDst, INT32 len) +{ + if (val == 0) + return PRIMITIVES_SUCCESS; + + while (len--) + *pDst++ = *pSrc++ & val; + + return PRIMITIVES_SUCCESS; +} + +/* ---------------------------------------------------------------------------- + * 32-bit OR with a constant. + */ +static pstatus_t general_orC_32u(const UINT32* pSrc, UINT32 val, UINT32* pDst, INT32 len) +{ + if (val == 0) + return PRIMITIVES_SUCCESS; + + while (len--) + *pDst++ = *pSrc++ | val; + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +void primitives_init_andor(primitives_t* prims) +{ + /* Start with the default. */ + prims->andC_32u = general_andC_32u; + prims->orC_32u = general_orC_32u; +} diff --git a/libfreerdp/primitives/prim_andor_opt.c b/libfreerdp/primitives/prim_andor_opt.c new file mode 100644 index 0000000..bc51f1c --- /dev/null +++ b/libfreerdp/primitives/prim_andor_opt.c @@ -0,0 +1,63 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized Logical operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include <winpr/sysinfo.h> + +#ifdef WITH_SSE2 +#include <emmintrin.h> +#include <pmmintrin.h> +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +#include <ipps.h> +#endif /* WITH_IPP */ + +#include "prim_internal.h" +#include "prim_templates.h" + +static primitives_t* generic = NULL; + +#ifdef WITH_SSE2 +#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128, + *dptr++ = *sptr++ & val) +SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr++ = *sptr++ | val) +#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims) +{ + generic = primitives_get_generic(); + primitives_init_andor(prims); +#if defined(WITH_IPP) + prims->andC_32u = (__andC_32u_t)ippsAndC_32u; + prims->orC_32u = (__orC_32u_t)ippsOrC_32u; +#elif defined(WITH_SSE2) + + if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && + IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) + { + prims->andC_32u = sse3_andC_32u; + prims->orC_32u = sse3_orC_32u; + } + +#endif +} diff --git a/libfreerdp/primitives/prim_colors.c b/libfreerdp/primitives/prim_colors.c new file mode 100644 index 0000000..4a23129 --- /dev/null +++ b/libfreerdp/primitives/prim_colors.c @@ -0,0 +1,509 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Color conversion operations. + * vi:ts=4 sw=4: + * + * Copyright 2011 Stephen Erisman + * Copyright 2011 Norbert Federa <norbert.federa@thincast.com> + * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com> + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include <freerdp/codec/color.h> + +#include "prim_internal.h" + +#ifndef MINMAX +#define MINMAX(_v_, _l_, _h_) ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? 
(_h_) : (_v_))) +#endif /* !MINMAX */ +/* ------------------------------------------------------------------------- */ +static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* const WINPR_RESTRICT pSrc[3], + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + BYTE* pRGB = pDst; + const INT16* pY = pSrc[0]; + const INT16* pCb = pSrc[1]; + const INT16* pCr = pSrc[2]; + const size_t srcPad = (srcStep - (roi->width * 2)) / 2; + const size_t dstPad = (dstStep - (roi->width * 4)); + const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); + + for (UINT32 y = 0; y < roi->height; y++) + { + for (UINT32 x = 0; x < roi->width; x++) + { + INT16 R = 0; + INT16 G = 0; + INT16 B = 0; + const INT32 divisor = 16; + const INT32 Y = (INT32)((UINT32)((*pY++) + 4096) << divisor); + const INT32 Cb = (*pCb++); + const INT32 Cr = (*pCr++); + const INT64 CrR = Cr * (INT64)(1.402525f * (1 << divisor)) * 1LL; + const INT64 CrG = Cr * (INT64)(0.714401f * (1 << divisor)) * 1LL; + const INT64 CbG = Cb * (INT64)(0.343730f * (1 << divisor)) * 1LL; + const INT64 CbB = Cb * (INT64)(1.769905f * (1 << divisor)) * 1LL; + R = ((INT16)((CrR + Y) >> divisor) >> 5); + G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5); + B = ((INT16)((CbB + Y) >> divisor) >> 5); + pRGB = writePixelBGRX(pRGB, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), 0); + } + + pY += srcPad; + pCb += srcPad; + pCr += srcPad; + pRGB += dstPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R_general(const INT16* const WINPR_RESTRICT pSrc[3], + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + BYTE* pRGB = pDst; + const INT16* pY = pSrc[0]; + const INT16* pCb = pSrc[1]; + const INT16* pCr = pSrc[2]; + const size_t srcPad = (srcStep - (roi->width * 2)) / 2; + const size_t dstPad = (dstStep - (roi->width * 4)); + const fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE); + const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); + + for (UINT32 y = 0; y < roi->height; y++) + { + for (UINT32 x = 0; x < roi->width; x++) + { + INT64 R = 0; + INT64 G = 0; + INT64 B = 0; + const INT32 divisor = 16; + const INT32 Y = (INT32)((UINT32)((*pY++) + 4096) << divisor); + const INT32 Cb = (*pCb++); + const INT32 Cr = (*pCr++); + const INT64 CrR = Cr * (INT64)(1.402525f * (1 << divisor)) * 1LL; + const INT64 CrG = Cr * (INT64)(0.714401f * (1 << divisor)) * 1LL; + const INT64 CbG = Cb * (INT64)(0.343730f * (1 << divisor)) * 1LL; + const INT64 CbB = Cb * (INT64)(1.769905f * (1 << divisor)) * 1LL; + R = (INT64)((CrR + Y) >> (divisor + 5)); + G = (INT64)((Y - CbG - CrG) >> (divisor + 5)); + B = (INT64)((CbB + Y) >> (divisor + 5)); + pRGB = writePixel(pRGB, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), 0); + } + + pY += srcPad; + pCb += srcPad; + pCr += srcPad; + pRGB += dstPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return general_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, + roi); + + default: + return general_yCbCrToRGB_16s8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat, + roi); + } +} + +/* 
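* A spot check of the fixed-point math above: a zero YCbCr triple gives + * Y = (0 + 4096) << 16, so R = G = B = (4096 >> 5) = 128; the +4096 level + * shift (128 << 5) decodes a zero input to mid-gray, as expected. */ + +/* 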
------------------------------------------------------------------------- */ + +static pstatus_t +general_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep, + INT16* WINPR_RESTRICT pDst[3], INT32 dstStep, + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + /** + * The decoded YCbCr coefficients are represented as 11.5 fixed-point + * numbers: + * + * 1 sign bit + 10 integer bits + 5 fractional bits + * + * However, only 7 integer bits will actually be used since the value range + * is [-128.0, 127.0]. In other words, the decoded coefficients are scaled + * by << 5 when interpreted as INT16. + * The values were scaled up in the quantization phase, so we must scale them back here. + */ + const INT16* yptr = pSrc[0]; + const INT16* cbptr = pSrc[1]; + const INT16* crptr = pSrc[2]; + INT16* rptr = pDst[0]; + INT16* gptr = pDst[1]; + INT16* bptr = pDst[2]; + UINT32 srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + UINT32 dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + + for (UINT32 y = 0; y < roi->height; y++) + { + for (UINT32 x = 0; x < roi->width; ++x) + { + /* INT32 is used intentionally because we calculate + * with shifted factors! + */ + INT32 cy = (INT32)(*yptr++); + INT32 cb = (INT32)(*cbptr++); + INT32 cr = (INT32)(*crptr++); + INT64 r = 0; + INT64 g = 0; + INT64 b = 0; + /* + * This is the slow floating point version kept here for reference. + * y = y + 4096; // 128<<5=4096 so that we can scale the sum by >> 5 + * r = y + cr*1.403f; + * g = y - cb*0.344f - cr*0.714f; + * b = y + cb*1.770f; + * y_r_buf[i] = CLIP(r>>5); + * cb_g_buf[i] = CLIP(g>>5); + * cr_b_buf[i] = CLIP(b>>5); + */ + /* + * We scale the factors by << 16 into 32-bit integers in order to + * avoid slower floating point multiplications. Since the final + * result needs to be scaled by >> 5 we will extract only the + * upper 11 bits (>> 21) from the final sum. + * Hence we also have to scale the other terms of the sum by << 16. + * R: 1.403 << 16 = 91947 + * G: 0.344 << 16 = 22544, 0.714 << 16 = 46792 + * B: 1.770 << 16 = 115998 + */ + cy = (INT32)((UINT32)(cy + 4096) << 16); + r = cy + cr * 91947LL; + g = cy - cb * 22544LL - cr * 46792LL; + b = cy + cb * 115998LL; + *rptr++ = CLIP(r >> 21); + *gptr++ = CLIP(g >> 21); + *bptr++ = CLIP(b >> 21); + } + + yptr += srcbump; + cbptr += srcbump; + crptr += srcbump; + rptr += dstbump; + gptr += dstbump; + bptr += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static pstatus_t +general_RGBToYCbCr_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep, + INT16* WINPR_RESTRICT pDst[3], INT32 dstStep, + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + /* The encoded YCbCr coefficients are represented as 11.5 fixed-point + * numbers: + * + * 1 sign bit + 10 integer bits + 5 fractional bits + * + * However, only 7 integer bits will actually be used since the value + * range is [-128.0, 127.0]. In other words, the encoded coefficients + * are scaled by << 5 when interpreted as INT16. + * They will be scaled back down during the quantization phase. 
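+ * For example, with the integer factors used below, a full-scale white input + * (r = g = b = 255) gives y = (255 * (9798 + 19235 + 3735)) >> 10 = 8160 and + * 8160 - 4096 = 4064, i.e. exactly +127.0 in 11.5 fixed point; black maps to + * -4096 = -128.0. 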
+ */ + const INT16* rptr = pSrc[0]; + const INT16* gptr = pSrc[1]; + const INT16* bptr = pSrc[2]; + INT16* yptr = pDst[0]; + INT16* cbptr = pDst[1]; + INT16* crptr = pDst[2]; + UINT32 srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + UINT32 dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + + for (UINT32 y = 0; y < roi->height; y++) + { + for (UINT32 x = 0; x < roi->width; ++x) + { + /* INT32 is used intentionally because we calculate with + * shifted factors! + */ + INT32 r = (INT32)(*rptr++); + INT32 g = (INT32)(*gptr++); + INT32 b = (INT32)(*bptr++); + /* We scale the factors by << 15 into 32-bit integers in order + * to avoid slower floating point multiplications. Since the + * terms need to be scaled by << 5 we simply scale the final + * sum by >> 10 + * + * Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235, + * 0.114000 << 15 = 3735 + * Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868, + * 0.500590 << 15 = 16403 + * Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714, + * 0.081282 << 15 = 2663 + */ + INT32 cy = (r * 9798 + g * 19235 + b * 3735) >> 10; + INT32 cb = (r * -5535 + g * -10868 + b * 16403) >> 10; + INT32 cr = (r * 16377 + g * -13714 + b * -2663) >> 10; + *yptr++ = (INT16)MINMAX(cy - 4096, -4096, 4095); + *cbptr++ = (INT16)MINMAX(cb, -4096, 4095); + *crptr++ = (INT16)MINMAX(cr, -4096, 4095); + } + + yptr += srcbump; + cbptr += srcbump; + crptr += srcbump; + rptr += dstbump; + gptr += dstbump; + bptr += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE void writeScanlineGeneric(BYTE* dst, DWORD formatSize, UINT32 DstFormat, + const INT16* r, const INT16* g, const INT16* b, DWORD width) +{ + fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE); + + for (UINT32 x = 0; x < width; x++) + dst = writePixel(dst, formatSize, DstFormat, *r++, *g++, *b++, 0); +} + +static INLINE void writeScanlineRGB(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r, + const INT16* g, const INT16* b, DWORD width) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(DstFormat); + + for (UINT32 x = 0; x < width; x++) + { + const BYTE R = CLIP(*r++); + const BYTE G = CLIP(*g++); + const BYTE B = CLIP(*b++); + *dst++ = R; + *dst++ = G; + *dst++ = B; + } +} + +static INLINE void writeScanlineBGR(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r, + const INT16* g, const INT16* b, DWORD width) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(DstFormat); + + for (UINT32 x = 0; x < width; x++) + { + const BYTE R = CLIP(*r++); + const BYTE G = CLIP(*g++); + const BYTE B = CLIP(*b++); + *dst++ = B; + *dst++ = G; + *dst++ = R; + } +} + +static INLINE void writeScanlineBGRX(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r, + const INT16* g, const INT16* b, DWORD width) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(DstFormat); + + for (UINT32 x = 0; x < width; x++) + { + const BYTE R = CLIP(*r++); + const BYTE G = CLIP(*g++); + const BYTE B = CLIP(*b++); + *dst++ = B; + *dst++ = G; + *dst++ = R; + *dst++ = 0xFF; + } +} + +static INLINE void writeScanlineRGBX(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r, + const INT16* g, const INT16* b, DWORD width) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(DstFormat); + + for (UINT32 x = 0; x < width; x++) + { + const BYTE R = CLIP(*r++); + const BYTE G = CLIP(*g++); + const BYTE B = CLIP(*b++); + *dst++ = R; + *dst++ = G; + *dst++ = B; + *dst++ = 0xFF; + } +} + +static INLINE void writeScanlineXBGR(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r, + 
const INT16* g, const INT16* b, DWORD width) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(DstFormat); + + for (UINT32 x = 0; x < width; x++) + { + const BYTE R = CLIP(*r++); + const BYTE G = CLIP(*g++); + const BYTE B = CLIP(*b++); + *dst++ = 0xFF; + *dst++ = B; + *dst++ = G; + *dst++ = R; + } +} + +static INLINE void writeScanlineXRGB(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r, + const INT16* g, const INT16* b, DWORD width) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(DstFormat); + + for (UINT32 x = 0; x < width; x++) + { + const BYTE R = CLIP(*r++); + const BYTE G = CLIP(*g++); + const BYTE B = CLIP(*b++); + *dst++ = 0xFF; + *dst++ = R; + *dst++ = G; + *dst++ = B; + } +} + +typedef void (*fkt_writeScanline)(BYTE*, DWORD, UINT32, const INT16*, const INT16*, const INT16*, + DWORD); + +static INLINE fkt_writeScanline getScanlineWriteFunction(DWORD format) +{ + switch (format) + { + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return writeScanlineXRGB; + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return writeScanlineXBGR; + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return writeScanlineRGBX; + + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return writeScanlineBGRX; + + case PIXEL_FORMAT_BGR24: + return writeScanlineBGR; + + case PIXEL_FORMAT_RGB24: + return writeScanlineRGB; + + default: + return writeScanlineGeneric; + } +} + +/* ------------------------------------------------------------------------- */ +static pstatus_t general_RGBToRGB_16s8u_P3AC4R_general( + const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ + UINT32 dstStep, /* bytes between rows in dest data */ + UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + const INT16* r = pSrc[0]; + const INT16* g = pSrc[1]; + const INT16* b = pSrc[2]; + const DWORD srcAdd = srcStep / sizeof(INT16); + fkt_writeScanline writeScanline = getScanlineWriteFunction(DstFormat); + const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); + + for (UINT32 y = 0; y < roi->height; ++y) + { + (*writeScanline)(pDst, formatSize, DstFormat, r, g, b, roi->width); + pDst += dstStep; + r += srcAdd; + g += srcAdd; + b += srcAdd; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t general_RGBToRGB_16s8u_P3AC4R_BGRX( + const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ + UINT32 dstStep, /* bytes between rows in dest data */ + UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + const INT16* r = pSrc[0]; + const INT16* g = pSrc[1]; + const INT16* b = pSrc[2]; + const DWORD srcAdd = srcStep / sizeof(INT16); + const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); + + for (UINT32 y = 0; y < roi->height; ++y) + { + writeScanlineBGRX(pDst, formatSize, DstFormat, r, g, b, roi->width); + pDst += dstStep; + r += srcAdd; + g += srcAdd; + b += srcAdd; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t general_RGBToRGB_16s8u_P3AC4R( + const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) 
data */ + UINT32 dstStep, /* bytes between rows in dest data */ + UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return general_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + + default: + return general_RGBToRGB_16s8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat, + roi); + } +} +/* ------------------------------------------------------------------------- */ +void primitives_init_colors(primitives_t* prims) +{ + prims->yCbCrToRGB_16s8u_P3AC4R = general_yCbCrToRGB_16s8u_P3AC4R; + prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3; + prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3; + prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R; +} diff --git a/libfreerdp/primitives/prim_colors_opt.c b/libfreerdp/primitives/prim_colors_opt.c new file mode 100644 index 0000000..60debc3 --- /dev/null +++ b/libfreerdp/primitives/prim_colors_opt.c @@ -0,0 +1,1591 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized Color conversion operations. + * vi:ts=4 sw=4: + * + * Copyright 2011 Stephen Erisman + * Copyright 2011 Norbert Federa <norbert.federa@thincast.com> + * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com> + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include <winpr/sysinfo.h> + +#ifdef WITH_SSE2 +#include <emmintrin.h> +#elif defined(WITH_NEON) +#include <arm_neon.h> +#endif /* WITH_SSE2 else WITH_NEON */ + +#include "prim_internal.h" +#include "prim_templates.h" + +static primitives_t* generic = NULL; + +#ifdef WITH_SSE2 + +#ifdef __GNUC__ +#define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +#else +#define GNU_INLINE +#endif + +#define CACHE_LINE_BYTES 64 + +#define _mm_between_epi16(_val, _min, _max) \ + do \ + { \ + _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); \ + } while (0) + +#ifdef DO_PREFETCH +/*---------------------------------------------------------------------------*/ +static inline void GNU_INLINE _mm_prefetch_buffer(char* WINPR_RESTRICT buffer, int num_bytes) +{ + __m128i* buf = (__m128i*)buffer; + + for (unsigned int i = 0; i < (num_bytes / sizeof(__m128i)); + i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA); + } +} +#endif /* DO_PREFETCH */ + +/*---------------------------------------------------------------------------*/ +static pstatus_t +sse2_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], int srcStep, + INT16* WINPR_RESTRICT pDst[3], int dstStep, + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + __m128i zero; + __m128i max; + __m128i r_cr; + __m128i g_cb; + __m128i g_cr; + __m128i b_cb; + __m128i c4096; + const __m128i* y_buf = NULL; + const __m128i* cb_buf = NULL; + const __m128i* cr_buf = NULL; + __m128i* r_buf = NULL; + __m128i* g_buf = NULL; + __m128i* b_buf = NULL; + int srcbump = 0; + int dstbump = 0; + int imax = 0; + + if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) || + ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) || + ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) || + (srcStep & 127) || (dstStep & 127)) + { + /* We can't maintain 16-byte alignment. */ + return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi); + } + + zero = _mm_setzero_si128(); + max = _mm_set1_epi16(255); + y_buf = (const __m128i*)(pSrc[0]); + cb_buf = (const __m128i*)(pSrc[1]); + cr_buf = (const __m128i*)(pSrc[2]); + r_buf = (__m128i*)(pDst[0]); + g_buf = (__m128i*)(pDst[1]); + b_buf = (__m128i*)(pDst[2]); + r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */ + g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */ + g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */ + b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */ + c4096 = _mm_set1_epi16(4096); + srcbump = srcStep / sizeof(__m128i); + dstbump = dstStep / sizeof(__m128i); +#ifdef DO_PREFETCH + + /* Prefetch Y's, Cb's, and Cr's. 
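*/ + + /* One _MM_HINT_NTA prefetch covers a full 64-byte cache line, hence the loop + * stride of CACHE_LINE_BYTES / sizeof(__m128i) = 4 vectors per step; the + * non-temporal hint keeps these single-use source rows from evicting more + * useful cache lines. 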
*/ + for (UINT32 yp = 0; yp < roi->height; yp++) + { + for (int i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i); + i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA); + } + + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + } + + y_buf = (__m128i*)(pSrc[0]); + cb_buf = (__m128i*)(pSrc[1]); + cr_buf = (__m128i*)(pSrc[2]); +#endif /* DO_PREFETCH */ + imax = roi->width * sizeof(INT16) / sizeof(__m128i); + + for (UINT32 yp = 0; yp < roi->height; ++yp) + { + for (int i = 0; i < imax; i++) + { + /* In order to use SSE2 signed 16-bit integer multiplication + * we need to convert the floating point factors to signed int + * without losing information. + * The result of this multiplication is 32 bit and we have two + * SSE instructions that return either the hi or lo word. + * Thus we will multiply the factors by the highest possible 2^n, + * take the upper 16 bits of the signed 32-bit result + * (_mm_mulhi_epi16) and correct this result by multiplying + * it by 2^(16-n). + * + * For the given factors in the conversion matrix the best + * possible n is 14. + * + * Example for calculating r: + * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula + * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above + * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification + * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 + */ + /* y = (y_r_buf[i] + 4096) >> 2 */ + __m128i y; + __m128i cb; + __m128i cr; + __m128i r; + __m128i g; + __m128i b; + y = _mm_load_si128(y_buf + i); + y = _mm_add_epi16(y, c4096); + y = _mm_srai_epi16(y, 2); + /* cb = cb_g_buf[i]; */ + cb = _mm_load_si128(cb_buf + i); + /* cr = cr_b_buf[i]; */ + cr = _mm_load_si128(cr_buf + i); + /* (y + HIWORD(cr*22986)) >> 3 */ + r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr)); + r = _mm_srai_epi16(r, 3); + /* r_buf[i] = CLIP(r); */ + _mm_between_epi16(r, zero, max); + _mm_store_si128(r_buf + i, r); + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb)); + g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr)); + g = _mm_srai_epi16(g, 3); + /* g_buf[i] = CLIP(g); */ + _mm_between_epi16(g, zero, max); + _mm_store_si128(g_buf + i, g); + /* (y + HIWORD(cb*28999)) >> 3 */ + b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb)); + b = _mm_srai_epi16(b, 3); + /* b_buf[i] = CLIP(b); */ + _mm_between_epi16(b, zero, max); + _mm_store_si128(b_buf + i, b); + } + + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + r_buf += dstbump; + g_buf += dstbump; + b_buf += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static pstatus_t +sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* const WINPR_RESTRICT pSrc[3], UINT32 srcStep, + BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + const __m128i zero = _mm_setzero_si128(); + const __m128i max = _mm_set1_epi16(255); + const __m128i r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */ + const __m128i g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */ + const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */ + const __m128i b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */ + const __m128i c4096 = _mm_set1_epi16(4096); + const INT16* y_buf = (const INT16*)pSrc[0]; + const INT16* cb_buf = (const INT16*)pSrc[1]; + const INT16* cr_buf = (const INT16*)pSrc[2]; + 
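/* Each pass of the main loop below handles two 8-lane vectors, i.e. 16 pixels, + * so the width % 16 residue is finished off by the scalar tail that follows. */ +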
const UINT32 pad = roi->width % 16; + const UINT32 step = sizeof(__m128i) / sizeof(INT16); + const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i); + BYTE* d_buf = pDst; + const size_t dstPad = (dstStep - roi->width * 4); +#ifdef DO_PREFETCH + + /* Prefetch Y's, Cb's, and Cr's. */ + for (UINT32 yp = 0; yp < roi->height; yp++) + { + for (int i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA); + } + + y_buf += srcStep / sizeof(INT16); + cb_buf += srcStep / sizeof(INT16); + cr_buf += srcStep / sizeof(INT16); + } + + y_buf = (INT16*)pSrc[0]; + cb_buf = (INT16*)pSrc[1]; + cr_buf = (INT16*)pSrc[2]; +#endif /* DO_PREFETCH */ + + for (UINT32 yp = 0; yp < roi->height; ++yp) + { + for (UINT32 i = 0; i < imax; i += 2) + { + /* In order to use SSE2 signed 16-bit integer multiplication + * we need to convert the floating point factors to signed int + * without losing information. + * The result of this multiplication is 32 bit and we have two + * SSE instructions that return either the hi or lo word. + * Thus we will multiply the factors by the highest possible 2^n, + * take the upper 16 bits of the signed 32-bit result + * (_mm_mulhi_epi16) and correct this result by multiplying + * it by 2^(16-n). + * + * For the given factors in the conversion matrix the best + * possible n is 14. + * + * Example for calculating r: + * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula + * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above + * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification + * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 + */ + /* y = (y_r_buf[i] + 4096) >> 2 */ + __m128i y1; + __m128i y2; + __m128i cb1; + __m128i cb2; + __m128i cr1; + __m128i cr2; + __m128i r1; + __m128i r2; + __m128i g1; + __m128i g2; + __m128i b1; + __m128i b2; + y1 = _mm_load_si128((const __m128i*)y_buf); + y_buf += step; + y1 = _mm_add_epi16(y1, c4096); + y1 = _mm_srai_epi16(y1, 2); + /* cb = cb_g_buf[i]; */ + cb1 = _mm_load_si128((const __m128i*)cb_buf); + cb_buf += step; + /* cr = cr_b_buf[i]; */ + cr1 = _mm_load_si128((const __m128i*)cr_buf); + cr_buf += step; + /* (y + HIWORD(cr*22986)) >> 3 */ + r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr)); + r1 = _mm_srai_epi16(r1, 3); + /* r_buf[i] = CLIP(r); */ + _mm_between_epi16(r1, zero, max); + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb)); + g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr)); + g1 = _mm_srai_epi16(g1, 3); + /* g_buf[i] = CLIP(g); */ + _mm_between_epi16(g1, zero, max); + /* (y + HIWORD(cb*28999)) >> 3 */ + b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb)); + b1 = _mm_srai_epi16(b1, 3); + /* b_buf[i] = CLIP(b); */ + _mm_between_epi16(b1, zero, max); + y2 = _mm_load_si128((const __m128i*)y_buf); + y_buf += step; + y2 = _mm_add_epi16(y2, c4096); + y2 = _mm_srai_epi16(y2, 2); + /* cb = cb_g_buf[i]; */ + cb2 = _mm_load_si128((const __m128i*)cb_buf); + cb_buf += step; + /* cr = cr_b_buf[i]; */ + cr2 = _mm_load_si128((const __m128i*)cr_buf); + cr_buf += step; + /* (y + HIWORD(cr*22986)) >> 3 */ + r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr)); + r2 = _mm_srai_epi16(r2, 3); + /* r_buf[i] = CLIP(r); */ + _mm_between_epi16(r2, zero, max); + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb)); + g2 = 
_mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr)); + g2 = _mm_srai_epi16(g2, 3); + /* g_buf[i] = CLIP(g); */ + _mm_between_epi16(g2, zero, max); + /* (y + HIWORD(cb*28999)) >> 3 */ + b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb)); + b2 = _mm_srai_epi16(b2, 3); + /* b_buf[i] = CLIP(b); */ + _mm_between_epi16(b2, zero, max); + { + __m128i R0; + __m128i R1; + __m128i R2; + __m128i R3; + __m128i R4; + /* The comments below pretend these are 8-byte registers + * rather than 16-byte, for readability. + */ + R0 = b1; /* R0 = 00B300B200B100B0 */ + R1 = b2; /* R1 = 00B700B600B500B4 */ + R0 = _mm_packus_epi16(R0, R1); /* R0 = B7B6B5B4B3B2B1B0 */ + R1 = g1; /* R1 = 00G300G200G100G0 */ + R2 = g2; /* R2 = 00G700G600G500G4 */ + R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */ + R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */ + R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = B3G3B2G2B1G1B0G0 */ + R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = B7G7B6G6B5G5B4G4 */ + R0 = r1; /* R0 = 00R300R200R100R0 */ + R3 = r2; /* R3 = 00R700R600R500R4 */ + R0 = _mm_packus_epi16(R0, R3); /* R0 = R7R6R5R4R3R2R1R0 */ + R3 = _mm_set1_epi32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */ + R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */ + R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = R3FFR2FFR1FFR0FF */ + R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = R7FFR6FFR5FFR4FF */ + R0 = R4; /* R0 = R4 */ + R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = B1G1R1FFB0G0R0FF */ + R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = B3G3R3FFB2G2R2FF */ + R2 = R3; /* R2 = R3 */ + R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */ + R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */ + _mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF */ + d_buf += sizeof(__m128i); + _mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF */ + d_buf += sizeof(__m128i); + _mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF */ + d_buf += sizeof(__m128i); + _mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF */ + d_buf += sizeof(__m128i); + } + } + + for (UINT32 i = 0; i < pad; i++) + { + const INT32 divisor = 16; + const INT32 Y = ((*y_buf++) + 4096) << divisor; + const INT32 Cb = (*cb_buf++); + const INT32 Cr = (*cr_buf++); + const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor)); + const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor)); + const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor)); + const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor)); + const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5); + const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5); + const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5); + *d_buf++ = CLIP(B); + *d_buf++ = CLIP(G); + *d_buf++ = CLIP(R); + *d_buf++ = 0xFF; + } + + d_buf += dstPad; + } + + return PRIMITIVES_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static pstatus_t +sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* const WINPR_RESTRICT pSrc[3], UINT32 srcStep, + BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + const __m128i zero = _mm_setzero_si128(); + const __m128i max = _mm_set1_epi16(255); + const __m128i r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */ + const __m128i g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */ + const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */ + const __m128i b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */ + const __m128i c4096 = _mm_set1_epi16(4096); + const INT16* y_buf = (const INT16*)pSrc[0]; + const INT16* cb_buf = (const INT16*)pSrc[1]; + const INT16* cr_buf 
= (const INT16*)pSrc[2]; + const UINT32 pad = roi->width % 16; + const UINT32 step = sizeof(__m128i) / sizeof(INT16); + const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i); + BYTE* d_buf = pDst; + const size_t dstPad = (dstStep - roi->width * 4); +#ifdef DO_PREFETCH + + /* Prefetch Y's, Cb's, and Cr's. */ + for (UINT32 yp = 0; yp < roi->height; yp++) + { + for (int i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA); + } + + y_buf += srcStep / sizeof(INT16); + cb_buf += srcStep / sizeof(INT16); + cr_buf += srcStep / sizeof(INT16); + } + + y_buf = (INT16*)(pSrc[0]); + cb_buf = (INT16*)(pSrc[1]); + cr_buf = (INT16*)(pSrc[2]); +#endif /* DO_PREFETCH */ + + for (UINT32 yp = 0; yp < roi->height; ++yp) + { + for (UINT32 i = 0; i < imax; i += 2) + { + /* In order to use SSE2 signed 16-bit integer multiplication + * we need to convert the floating point factors to signed int + * without losing information. + * The result of this multiplication is 32 bit and we have two + * SSE instructions that return either the hi or lo word. + * Thus we will multiply the factors by the highest possible 2^n, + * take the upper 16 bits of the signed 32-bit result + * (_mm_mulhi_epi16) and correct this result by multiplying + * it by 2^(16-n). + * + * For the given factors in the conversion matrix the best + * possible n is 14. + * + * Example for calculating r: + * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula + * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above + * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification + * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 + */ + /* y = (y_r_buf[i] + 4096) >> 2 */ + __m128i y1; + __m128i y2; + __m128i cb1; + __m128i cb2; + __m128i cr1; + __m128i cr2; + __m128i r1; + __m128i r2; + __m128i g1; + __m128i g2; + __m128i b1; + __m128i b2; + y1 = _mm_load_si128((const __m128i*)y_buf); + y_buf += step; + y1 = _mm_add_epi16(y1, c4096); + y1 = _mm_srai_epi16(y1, 2); + /* cb = cb_g_buf[i]; */ + cb1 = _mm_load_si128((const __m128i*)cb_buf); + cb_buf += step; + /* cr = cr_b_buf[i]; */ + cr1 = _mm_load_si128((const __m128i*)cr_buf); + cr_buf += step; + /* (y + HIWORD(cr*22986)) >> 3 */ + r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr)); + r1 = _mm_srai_epi16(r1, 3); + /* r_buf[i] = CLIP(r); */ + _mm_between_epi16(r1, zero, max); + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb)); + g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr)); + g1 = _mm_srai_epi16(g1, 3); + /* g_buf[i] = CLIP(g); */ + _mm_between_epi16(g1, zero, max); + /* (y + HIWORD(cb*28999)) >> 3 */ + b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb)); + b1 = _mm_srai_epi16(b1, 3); + /* b_buf[i] = CLIP(b); */ + _mm_between_epi16(b1, zero, max); + y2 = _mm_load_si128((const __m128i*)y_buf); + y_buf += step; + y2 = _mm_add_epi16(y2, c4096); + y2 = _mm_srai_epi16(y2, 2); + /* cb = cb_g_buf[i]; */ + cb2 = _mm_load_si128((const __m128i*)cb_buf); + cb_buf += step; + /* cr = cr_b_buf[i]; */ + cr2 = _mm_load_si128((const __m128i*)cr_buf); + cr_buf += step; + /* (y + HIWORD(cr*22986)) >> 3 */ + r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr)); + r2 = _mm_srai_epi16(r2, 3); + /* r_buf[i] = CLIP(r); */ + _mm_between_epi16(r2, zero, max); + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + g2 = _mm_add_epi16(y2, 
_mm_mulhi_epi16(cb2, g_cb)); + g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr)); + g2 = _mm_srai_epi16(g2, 3); + /* g_buf[i] = CLIP(g); */ + _mm_between_epi16(g2, zero, max); + /* (y + HIWORD(cb*28999)) >> 3 */ + b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb)); + b2 = _mm_srai_epi16(b2, 3); + /* b_buf[i] = CLIP(b); */ + _mm_between_epi16(b2, zero, max); + { + __m128i R0; + __m128i R1; + __m128i R2; + __m128i R3; + __m128i R4; + /* The comments below pretend these are 8-byte registers + * rather than 16-byte, for readability. + */ + R0 = r1; /* R0 = 00R300R200R100R0 */ + R1 = r2; /* R1 = 00R700R600R500R4 */ + R0 = _mm_packus_epi16(R0, R1); /* R0 = R7R6R5R4R3R2R1R0 */ + R1 = g1; /* R1 = 00G300G200G100G0 */ + R2 = g2; /* R2 = 00G700G600G500G4 */ + R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */ + R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */ + R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = R3G3R2G2R1G1R0G0 */ + R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = R7G7R6G6R5G5R4G4 */ + R0 = b1; /* R0 = 00B300B200B100B0 */ + R3 = b2; /* R3 = 00B700B600B500B4 */ + R0 = _mm_packus_epi16(R0, R3); /* R0 = B7B6B5B4B3B2B1B0 */ + R3 = _mm_set1_epi32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */ + R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */ + R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = B3FFB2FFB1FFB0FF */ + R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = B7FFB6FFB5FFB4FF */ + R0 = R4; /* R0 = R4 */ + R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = R1G1B1FFR0G0B0FF */ + R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = R3G3B3FFR2G2B2FF */ + R2 = R3; /* R2 = R3 */ + R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */ + R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */ + _mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF */ + d_buf += sizeof(__m128i); + _mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF */ + d_buf += sizeof(__m128i); + _mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF */ + d_buf += sizeof(__m128i); + _mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF */ + d_buf += sizeof(__m128i); + } + } + + for (UINT32 i = 0; i < pad; i++) + { + const INT32 divisor = 16; + const INT32 Y = ((*y_buf++) + 4096) << divisor; + const INT32 Cb = (*cb_buf++); + const INT32 Cr = (*cr_buf++); + const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor)); + const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor)); + const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor)); + const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor)); + const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5); + const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5); + const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5); + *d_buf++ = CLIP(R); + *d_buf++ = CLIP(G); + *d_buf++ = CLIP(B); + *d_buf++ = 0xFF; + } + + d_buf += dstPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t +sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], UINT32 srcStep, + BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) || + ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) || + (dstStep & 0x0f)) + { + /* We can't maintain 16-byte alignment. 
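*/ + + /* _mm_load_si128 and _mm_store_si128 fault on unaligned addresses, so any + * source or destination pointer or step that is not a multiple of 16 bytes + * is routed to the generic C implementation instead. 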
*/ + return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } + + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi); + + default: + return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} +/* The encoded YCbCr coefficients are represented as 11.5 fixed-point + * numbers. See the general code above. + */ +static pstatus_t +sse2_RGBToYCbCr_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], int srcStep, + INT16* WINPR_RESTRICT pDst[3], int dstStep, + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + __m128i min; + __m128i max; + __m128i y_r; + __m128i y_g; + __m128i y_b; + __m128i cb_r; + __m128i cb_g; + __m128i cb_b; + __m128i cr_r; + __m128i cr_g; + __m128i cr_b; + const __m128i* r_buf = (const __m128i*)(pSrc[0]); + const __m128i* g_buf = (const __m128i*)(pSrc[1]); + const __m128i* b_buf = (const __m128i*)(pSrc[2]); + __m128i* y_buf = (__m128i*)(pDst[0]); + __m128i* cb_buf = (__m128i*)(pDst[1]); + __m128i* cr_buf = (__m128i*)(pDst[2]); + int srcbump = 0; + int dstbump = 0; + int imax = 0; + + if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) || + ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) || + ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) || + (srcStep & 127) || (dstStep & 127)) + { + /* We can't maintain 16-byte alignment. */ + return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi); + } + + min = _mm_set1_epi16(-128 * 32); + max = _mm_set1_epi16(127 * 32); + + y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */ + y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */ + y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */ + cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */ + cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */ + cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */ + cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */ + cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */ + cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */ + srcbump = srcStep / sizeof(__m128i); + dstbump = dstStep / sizeof(__m128i); +#ifdef DO_PREFETCH + + /* Prefetch RGB's. */ + for (UINT32 yp = 0; yp < roi->height; yp++) + { + for (int i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i); + i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA); + } + + r_buf += srcbump; + g_buf += srcbump; + b_buf += srcbump; + } + + r_buf = (__m128i*)(pSrc[0]); + g_buf = (__m128i*)(pSrc[1]); + b_buf = (__m128i*)(pSrc[2]); +#endif /* DO_PREFETCH */ + imax = roi->width * sizeof(INT16) / sizeof(__m128i); + + for (UINT32 yp = 0; yp < roi->height; ++yp) + { + for (int i = 0; i < imax; i++) + { + /* In order to use SSE2 signed 16-bit integer multiplication we + * need to convert the floating point factors to signed int + * without losing information. The result of this multiplication + * is 32 bit and using SSE2 we get either the product's hi or lo + * word. Thus we will multiply the factors by the highest + * possible 2^n and take the upper 16 bits of the signed 32-bit + * result (_mm_mulhi_epi16). 
Since the final result needs to + * be scaled by << 5, and in order to keep the precision + * within the upper 16 bits, we will also have to scale the RGB + * values used in the multiplication by << 5+(16-n). + */ + __m128i r; + __m128i g; + __m128i b; + __m128i y; + __m128i cb; + __m128i cr; + r = _mm_load_si128(r_buf + i); + g = _mm_load_si128(g_buf + i); + b = _mm_load_si128(b_buf + i); + /* r<<6; g<<6; b<<6 */ + r = _mm_slli_epi16(r, 6); + g = _mm_slli_epi16(g, 6); + b = _mm_slli_epi16(b, 6); + /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */ + y = _mm_mulhi_epi16(r, y_r); + y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g)); + y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b)); + y = _mm_add_epi16(y, min); + /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */ + _mm_between_epi16(y, min, max); + _mm_store_si128(y_buf + i, y); + /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */ + cb = _mm_mulhi_epi16(r, cb_r); + cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g)); + cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b)); + /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */ + _mm_between_epi16(cb, min, max); + _mm_store_si128(cb_buf + i, cb); + /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */ + cr = _mm_mulhi_epi16(r, cr_r); + cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g)); + cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b)); + /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */ + _mm_between_epi16(cr, min, max); + _mm_store_si128(cr_buf + i, cr); + } + + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + r_buf += dstbump; + g_buf += dstbump; + b_buf += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX( + const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ + UINT32 dstStep, /* bytes between rows in dest data */ + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + const UINT16* pr = (const UINT16*)(pSrc[0]); + const UINT16* pg = (const UINT16*)(pSrc[1]); + const UINT16* pb = (const UINT16*)(pSrc[2]); + const UINT32 pad = roi->width % 16; + const __m128i a = _mm_set1_epi32(0xFFFFFFFFU); + BYTE* out = NULL; + UINT32 srcbump = 0; + UINT32 dstbump = 0; + out = (BYTE*)pDst; + srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + dstbump = (dstStep - (roi->width * sizeof(UINT32))); + + for (UINT32 y = 0; y < roi->height; ++y) + { + for (UINT32 x = 0; x < roi->width - pad; x += 16) + { + __m128i r; + __m128i g; + __m128i b; + /* The comments below pretend these are 8-byte registers + * rather than 16-byte, for readability. 
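+ * Each iteration emits 16 pixels: the three 16-bit planes are saturated to + * bytes with _mm_packus_epi16, the byte interleaves pair B with G and R with + * the constant 0xFF alpha, and the final 16-bit interleaves produce four + * BGRX-ordered stores. 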
+ */ + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pb); + pb += 8; /* R0 = 00B300B200B100B0 */ + R1 = _mm_load_si128((const __m128i*)pb); + pb += 8; /* R1 = 00B700B600B500B4 */ + b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */ + } + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pg); + pg += 8; /* R1 = 00G300G200G100G0 */ + R1 = _mm_load_si128((const __m128i*)pg); + pg += 8; /* R2 = 00G700G600G500G4 */ + g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */ + } + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pr); + pr += 8; /* R0 = 00R300R200R100R0 */ + R1 = _mm_load_si128((const __m128i*)pr); + pr += 8; /* R3 = 00R700R600R500R4 */ + r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */ + } + { + __m128i gbHi; + __m128i gbLo; + __m128i arHi; + __m128i arLo; + { + gbLo = _mm_unpacklo_epi8(b, g); /* R0 = G7G6G5G4G3G2G1G0 */ + gbHi = _mm_unpackhi_epi8(b, g); /* R1 = G7B7G6B7G5B5G4B4 */ + arLo = _mm_unpacklo_epi8(r, a); /* R4 = FFR3FFR2FFR1FFR0 */ + arHi = _mm_unpackhi_epi8(r, a); /* R3 = FFR7FFR6FFR5FFR4 */ + } + { + const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR1G1B1FFR0G0B0 */ + } + { + const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR3G3B3FFR2G2B2 */ + } + { + const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR5G5B5FFR4G4B4 */ + } + { + const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR7G7B7FFR6G6B6 */ + } + } + } + + for (UINT32 x = 0; x < pad; x++) + { + const BYTE R = CLIP(*pr++); + const BYTE G = CLIP(*pg++); + const BYTE B = CLIP(*pb++); + *out++ = B; + *out++ = G; + *out++ = R; + *out++ = 0xFF; + } + + /* Jump to next row. */ + pr += srcbump; + pg += srcbump; + pb += srcbump; + out += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX( + const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ + UINT32 dstStep, /* bytes between rows in dest data */ + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + const UINT16* pr = (const UINT16*)(pSrc[0]); + const UINT16* pg = (const UINT16*)(pSrc[1]); + const UINT16* pb = (const UINT16*)(pSrc[2]); + const UINT32 pad = roi->width % 16; + const __m128i a = _mm_set1_epi32(0xFFFFFFFFU); + BYTE* out = NULL; + UINT32 srcbump = 0; + UINT32 dstbump = 0; + out = (BYTE*)pDst; + srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + dstbump = (dstStep - (roi->width * sizeof(UINT32))); + + for (UINT32 y = 0; y < roi->height; ++y) + { + for (UINT32 x = 0; x < roi->width - pad; x += 16) + { + __m128i r; + __m128i g; + __m128i b; + /* The comments below pretend these are 8-byte registers + * rather than 16-byte, for readability. 
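+ * Same pipeline as the BGRX variant above; only the operands of the byte + * interleaves below change, so each pixel is stored as R, G, B, 0xFF. 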
+ */ + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pb); + pb += 8; /* R0 = 00B300B200B100B0 */ + R1 = _mm_load_si128((const __m128i*)pb); + pb += 8; /* R1 = 00B700B600B500B4 */ + b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */ + } + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pg); + pg += 8; /* R1 = 00G300G200G100G0 */ + R1 = _mm_load_si128((const __m128i*)pg); + pg += 8; /* R2 = 00G700G600G500G4 */ + g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */ + } + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pr); + pr += 8; /* R0 = 00R300R200R100R0 */ + R1 = _mm_load_si128((const __m128i*)pr); + pr += 8; /* R3 = 00R700R600R500R4 */ + r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */ + } + { + __m128i gbHi; + __m128i gbLo; + __m128i arHi; + __m128i arLo; + { + gbLo = _mm_unpacklo_epi8(r, g); /* R0 = G7G6G5G4G3G2G1G0 */ + gbHi = _mm_unpackhi_epi8(r, g); /* R1 = G7B7G6B7G5B5G4B4 */ + arLo = _mm_unpacklo_epi8(b, a); /* R4 = FFR3FFR2FFR1FFR0 */ + arHi = _mm_unpackhi_epi8(b, a); /* R3 = FFR7FFR6FFR5FFR4 */ + } + { + const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR1G1B1FFR0G0B0 */ + } + { + const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR3G3B3FFR2G2B2 */ + } + { + const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR5G5B5FFR4G4B4 */ + } + { + const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR7G7B7FFR6G6B6 */ + } + } + } + + for (UINT32 x = 0; x < pad; x++) + { + const BYTE R = CLIP(*pr++); + const BYTE G = CLIP(*pg++); + const BYTE B = CLIP(*pb++); + *out++ = R; + *out++ = G; + *out++ = B; + *out++ = 0xFF; + } + + /* Jump to next row. */ + pr += srcbump; + pg += srcbump; + pb += srcbump; + out += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR( + const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ + UINT32 dstStep, /* bytes between rows in dest data */ + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + const UINT16* pr = (const UINT16*)(pSrc[0]); + const UINT16* pg = (const UINT16*)(pSrc[1]); + const UINT16* pb = (const UINT16*)(pSrc[2]); + const UINT32 pad = roi->width % 16; + const __m128i a = _mm_set1_epi32(0xFFFFFFFFU); + BYTE* out = NULL; + UINT32 srcbump = 0; + UINT32 dstbump = 0; + out = (BYTE*)pDst; + srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + dstbump = (dstStep - (roi->width * sizeof(UINT32))); + + for (UINT32 y = 0; y < roi->height; ++y) + { + for (UINT32 x = 0; x < roi->width - pad; x += 16) + { + __m128i r; + __m128i g; + __m128i b; + /* The comments below pretend these are 8-byte registers + * rather than 16-byte, for readability. 
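+ * Here the 0xFF alpha vector is the low operand of the byte interleaves below, + * so it lands in byte 0 of every pixel: 0xFF, B, G, R. 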
+ */ + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pb); + pb += 8; /* R0 = 00B300B200B100B0 */ + R1 = _mm_load_si128((const __m128i*)pb); + pb += 8; /* R1 = 00B700B600B500B4 */ + b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */ + } + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pg); + pg += 8; /* R1 = 00G300G200G100G0 */ + R1 = _mm_load_si128((const __m128i*)pg); + pg += 8; /* R2 = 00G700G600G500G4 */ + g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */ + } + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pr); + pr += 8; /* R0 = 00R300R200R100R0 */ + R1 = _mm_load_si128((const __m128i*)pr); + pr += 8; /* R3 = 00R700R600R500R4 */ + r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */ + } + { + __m128i gbHi; + __m128i gbLo; + __m128i arHi; + __m128i arLo; + { + gbLo = _mm_unpacklo_epi8(a, b); /* R0 = G7G6G5G4G3G2G1G0 */ + gbHi = _mm_unpackhi_epi8(a, b); /* R1 = G7B7G6B7G5B5G4B4 */ + arLo = _mm_unpacklo_epi8(g, r); /* R4 = FFR3FFR2FFR1FFR0 */ + arHi = _mm_unpackhi_epi8(g, r); /* R3 = FFR7FFR6FFR5FFR4 */ + } + { + const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR1G1B1FFR0G0B0 */ + } + { + const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR3G3B3FFR2G2B2 */ + } + { + const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR5G5B5FFR4G4B4 */ + } + { + const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR7G7B7FFR6G6B6 */ + } + } + } + + for (UINT32 x = 0; x < pad; x++) + { + const BYTE R = CLIP(*pr++); + const BYTE G = CLIP(*pg++); + const BYTE B = CLIP(*pb++); + *out++ = 0xFF; + *out++ = B; + *out++ = G; + *out++ = R; + } + + /* Jump to next row. */ + pr += srcbump; + pg += srcbump; + pb += srcbump; + out += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB( + const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ + UINT32 dstStep, /* bytes between rows in dest data */ + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + const UINT16* pr = (const UINT16*)(pSrc[0]); + const UINT16* pg = (const UINT16*)(pSrc[1]); + const UINT16* pb = (const UINT16*)(pSrc[2]); + const __m128i a = _mm_set1_epi32(0xFFFFFFFFU); + const UINT32 pad = roi->width % 16; + BYTE* out = NULL; + UINT32 srcbump = 0; + UINT32 dstbump = 0; + out = (BYTE*)pDst; + srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + dstbump = (dstStep - (roi->width * sizeof(UINT32))); + + for (UINT32 y = 0; y < roi->height; ++y) + { + for (UINT32 x = 0; x < roi->width - pad; x += 16) + { + __m128i r; + __m128i g; + __m128i b; + /* The comments below pretend these are 8-byte registers + * rather than 16-byte, for readability. 
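+ * As in the XBGR variant, the 0xFF byte leads each pixel below, here followed + * by R, G, B. 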
+ */ + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pb); + pb += 8; /* R0 = 00B300B200B100B0 */ + R1 = _mm_load_si128((const __m128i*)pb); + pb += 8; /* R1 = 00B700B600B500B4 */ + b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */ + } + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pg); + pg += 8; /* R1 = 00G300G200G100G0 */ + R1 = _mm_load_si128((const __m128i*)pg); + pg += 8; /* R2 = 00G700G600G500G4 */ + g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */ + } + { + __m128i R0; + __m128i R1; + R0 = _mm_load_si128((const __m128i*)pr); + pr += 8; /* R0 = 00R300R200R100R0 */ + R1 = _mm_load_si128((const __m128i*)pr); + pr += 8; /* R3 = 00R700R600R500R4 */ + r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */ + } + { + __m128i gbHi; + __m128i gbLo; + __m128i arHi; + __m128i arLo; + { + gbLo = _mm_unpacklo_epi8(a, r); /* R0 = G7G6G5G4G3G2G1G0 */ + gbHi = _mm_unpackhi_epi8(a, r); /* R1 = G7B7G6B7G5B5G4B4 */ + arLo = _mm_unpacklo_epi8(g, b); /* R4 = FFR3FFR2FFR1FFR0 */ + arHi = _mm_unpackhi_epi8(g, b); /* R3 = FFR7FFR6FFR5FFR4 */ + } + { + const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR1G1B1FFR0G0B0 */ + } + { + const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR3G3B3FFR2G2B2 */ + } + { + const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR5G5B5FFR4G4B4 */ + } + { + const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi); + _mm_store_si128((__m128i*)out, bgrx); + out += 16; /* FFR7G7B7FFR6G6B6 */ + } + } + } + + for (UINT32 x = 0; x < pad; x++) + { + const BYTE R = CLIP(*pr++); + const BYTE G = CLIP(*pg++); + const BYTE B = CLIP(*pb++); + *out++ = 0xFF; + *out++ = R; + *out++ = G; + *out++ = B; + } + + /* Jump to next row. */ + pr += srcbump; + pg += srcbump; + pb += srcbump; + out += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t +sse2_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) 
+                                                         destination data; byte layout per DstFormat */
+                           UINT32 dstStep,            /* bytes between rows in dest data */
+                           UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi)
+{
+    if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
+        (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
+        return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+
+    switch (DstFormat)
+    {
+        case PIXEL_FORMAT_BGRA32:
+        case PIXEL_FORMAT_BGRX32:
+            return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
+
+        case PIXEL_FORMAT_RGBA32:
+        case PIXEL_FORMAT_RGBX32:
+            return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
+
+        case PIXEL_FORMAT_ABGR32:
+        case PIXEL_FORMAT_XBGR32:
+            return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
+
+        case PIXEL_FORMAT_ARGB32:
+        case PIXEL_FORMAT_XRGB32:
+            return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
+
+        default:
+            return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+    }
+}
+#endif /* WITH_SSE2 */
+
+/*---------------------------------------------------------------------------*/
+#ifdef WITH_NEON
+static pstatus_t
+neon_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep,
+                            INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
+                            const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+    /* TODO: If necessary, check alignments and call the general version. */
+    int16x8_t zero = vdupq_n_s16(0);
+    int16x8_t max = vdupq_n_s16(255);
+    int16x8_t r_cr = vdupq_n_s16(22986);  // 1.403 << 14
+    int16x8_t g_cb = vdupq_n_s16(-5636);  // -0.344 << 14
+    int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
+    int16x8_t b_cb = vdupq_n_s16(28999);  // 1.770 << 14
+    int16x8_t c4096 = vdupq_n_s16(4096);
+    int16x8_t* y_buf = (int16x8_t*)pSrc[0];
+    int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
+    int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
+    int16x8_t* r_buf = (int16x8_t*)pDst[0];
+    int16x8_t* g_buf = (int16x8_t*)pDst[1];
+    int16x8_t* b_buf = (int16x8_t*)pDst[2];
+    int srcbump = srcStep / sizeof(int16x8_t);
+    int dstbump = dstStep / sizeof(int16x8_t);
+    int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
+
+    for (int yp = 0; yp < roi->height; ++yp)
+    {
+        for (int i = 0; i < imax; i++)
+        {
+            /*
+                In order to use NEON signed 16-bit integer multiplication we need to convert
+                the floating point factors to signed int without losing information.
+                The result of this multiplication is 32 bit and we have a NEON instruction
+                that returns the high word of the saturated, doubled product.
+                Thus we will multiply the factors by the highest possible 2^n, take the
+                upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
+                shift by 1 to reverse the doubling) and correct this result by multiplying it
+                by 2^(16-n).
+                For the given factors in the conversion matrix the best possible n is 14.
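+
+                As a quick sanity check with illustrative values (not taken
+                from this code): for cr = 1024, HIWORD(cr * 22986) =
+                23537664 >> 16 = 359, while the exact cr * 1.403 / 4 = 359.17,
+                so the fixed-point path tracks the floating-point math to
+                within one unit.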
+ + Example for calculating r: + r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula + r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above + r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification + r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 + */ + /* y = (y_buf[i] + 4096) >> 2 */ + int16x8_t y = vld1q_s16((INT16*)&y_buf[i]); + y = vaddq_s16(y, c4096); + y = vshrq_n_s16(y, 2); + /* cb = cb_buf[i]; */ + int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]); + /* cr = cr_buf[i]; */ + int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]); + /* (y + HIWORD(cr*22986)) >> 3 */ + int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1)); + r = vshrq_n_s16(r, 3); + /* r_buf[i] = CLIP(r); */ + r = vminq_s16(vmaxq_s16(r, zero), max); + vst1q_s16((INT16*)&r_buf[i], r); + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1)); + g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1)); + g = vshrq_n_s16(g, 3); + /* g_buf[i] = CLIP(g); */ + g = vminq_s16(vmaxq_s16(g, zero), max); + vst1q_s16((INT16*)&g_buf[i], g); + /* (y + HIWORD(cb*28999)) >> 3 */ + int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1)); + b = vshrq_n_s16(b, 3); + /* b_buf[i] = CLIP(b); */ + b = vminq_s16(vmaxq_s16(b, zero), max); + vst1q_s16((INT16*)&b_buf[i], b); + } + + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + r_buf += dstbump; + g_buf += dstbump; + b_buf += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const WINPR_RESTRICT pSrc[3], + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, + const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos, + uint8_t gPos, uint8_t bPos, uint8_t aPos) +{ + BYTE* pRGB = pDst; + const INT16* pY = pSrc[0]; + const INT16* pCb = pSrc[1]; + const INT16* pCr = pSrc[2]; + const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16); + const size_t dstPad = (dstStep - (roi->width * 4)) / 4; + const size_t pad = roi->width % 8; + const int16x4_t c4096 = vdup_n_s16(4096); + + for (UINT32 y = 0; y < roi->height; y++) + { + for (UINT32 x = 0; x < roi->width - pad; x += 8) + { + const int16x8_t Y = vld1q_s16(pY); + const int16x4_t Yh = vget_high_s16(Y); + const int16x4_t Yl = vget_low_s16(Y); + const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */ + const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */ + const int32x4_t YhW = vshlq_n_s32(YhAdd, 16); + const int32x4_t YlW = vshlq_n_s32(YlAdd, 16); + const int16x8_t Cr = vld1q_s16(pCr); + const int16x4_t Crh = vget_high_s16(Cr); + const int16x4_t Crl = vget_low_s16(Cr); + const int16x8_t Cb = vld1q_s16(pCb); + const int16x4_t Cbh = vget_high_s16(Cb); + const int16x4_t Cbl = vget_low_s16(Cb); + uint8x8x4_t bgrx; + { + /* R */ + const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */ + const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */ + const int32x4_t CrhRa = vaddq_s32(CrhR, YhW); + const int32x4_t CrlRa = vaddq_s32(CrlR, YlW); + const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21)); + const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21)); + const int16x8_t Rs = vcombine_s16(Rsl, Rsh); + bgrx.val[rPos] = vqmovun_s16(Rs); + } + { + /* G */ + const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */ + const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */ + const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */ + const int32x4_t CrGl = 
vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */ + const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh); + const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl); + const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh); + const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl); + const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21)); + const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21)); + const int16x8_t Gs = vcombine_s16(Gsl, Gsh); + const uint8x8_t G = vqmovun_s16(Gs); + bgrx.val[gPos] = G; + } + { + /* B */ + const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */ + const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */ + const int32x4_t YCbBh = vaddq_s32(CbBh, YhW); + const int32x4_t YCbBl = vaddq_s32(CbBl, YlW); + const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21)); + const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21)); + const int16x8_t Bs = vcombine_s16(Bsl, Bsh); + const uint8x8_t B = vqmovun_s16(Bs); + bgrx.val[bPos] = B; + } + /* A */ + { + bgrx.val[aPos] = vdup_n_u8(0xFF); + } + vst4_u8(pRGB, bgrx); + pY += 8; + pCb += 8; + pCr += 8; + pRGB += 32; + } + + for (UINT32 x = 0; x < pad; x++) + { + const INT32 divisor = 16; + const INT32 Y = ((*pY++) + 4096) << divisor; + const INT32 Cb = (*pCb++); + const INT32 Cr = (*pCr++); + const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor)); + const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor)); + const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor)); + const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor)); + INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5); + INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5); + INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5); + BYTE bgrx[4]; + bgrx[bPos] = CLIP(B); + bgrx[gPos] = CLIP(G); + bgrx[rPos] = CLIP(R); + bgrx[aPos] = 0xFF; + *pRGB++ = bgrx[0]; + *pRGB++ = bgrx[1]; + *pRGB++ = bgrx[2]; + *pRGB++ = bgrx[3]; + } + + pY += srcPad; + pCb += srcPad; + pCr += srcPad; + pRGB += dstPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); + + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); + + default: + return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X( + const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) 
data */ + UINT32 dstStep, /* bytes between rows in dest data */ + const prim_size_t* WINPR_RESTRICT roi, /* region of interest */ + uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos) +{ + UINT32 pad = roi->width % 8; + + for (UINT32 y = 0; y < roi->height; y++) + { + const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep); + const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep); + const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep); + BYTE* dst = pDst + y * dstStep; + + for (UINT32 x = 0; x < roi->width - pad; x += 8) + { + int16x8_t r = vld1q_s16(pr); + int16x8_t g = vld1q_s16(pg); + int16x8_t b = vld1q_s16(pb); + uint8x8x4_t bgrx; + bgrx.val[aPos] = vdup_n_u8(0xFF); + bgrx.val[rPos] = vqmovun_s16(r); + bgrx.val[gPos] = vqmovun_s16(g); + bgrx.val[bPos] = vqmovun_s16(b); + vst4_u8(dst, bgrx); + pr += 8; + pg += 8; + pb += 8; + dst += 32; + } + + for (UINT32 x = 0; x < pad; x++) + { + BYTE bgrx[4]; + bgrx[bPos] = *pb++; + bgrx[gPos] = *pg++; + bgrx[rPos] = *pr++; + bgrx[aPos] = 0xFF; + *dst++ = bgrx[0]; + *dst++ = bgrx[1]; + *dst++ = bgrx[2]; + *dst++ = bgrx[3]; + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t +neon_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ + UINT32 dstStep, /* bytes between rows in dest data */ + UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); + + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); + + default: + return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} +#endif /* WITH_NEON */ +/* I don't see a direct IPP version of this, since the input is INT16 + * YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>. + * But that would likely be slower. 
+ */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_colors_opt(primitives_t* prims) +{ + generic = primitives_get_generic(); + primitives_init_colors(prims); +#if defined(WITH_SSE2) + + if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) + { + prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R; + prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3; + prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R; + prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3; + } + +#elif defined(WITH_NEON) + + if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) + { + prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R; + prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R; + prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3; + } + +#endif /* WITH_SSE2 */ +} diff --git a/libfreerdp/primitives/prim_copy.c b/libfreerdp/primitives/prim_copy.c new file mode 100644 index 0000000..f140c20 --- /dev/null +++ b/libfreerdp/primitives/prim_copy.c @@ -0,0 +1,178 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Copy operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +#include <freerdp/config.h> + +#include <string.h> +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#ifdef WITH_IPP +#include <ipps.h> +#include <ippi.h> +#endif /* WITH_IPP */ +#include "prim_internal.h" + +static primitives_t* generic = NULL; + +/* ------------------------------------------------------------------------- */ +/*static inline BOOL memory_regions_overlap_1d(*/ +static BOOL memory_regions_overlap_1d(const BYTE* p1, const BYTE* p2, size_t bytes) +{ + const ULONG_PTR p1m = (const ULONG_PTR)p1; + const ULONG_PTR p2m = (const ULONG_PTR)p2; + + if (p1m <= p2m) + { + if (p1m + bytes > p2m) + return TRUE; + } + else + { + if (p2m + bytes > p1m) + return TRUE; + } + + /* else */ + return FALSE; +} + +/* ------------------------------------------------------------------------- */ +/*static inline BOOL memory_regions_overlap_2d( */ +static BOOL memory_regions_overlap_2d(const BYTE* p1, int p1Step, int p1Size, const BYTE* p2, + int p2Step, int p2Size, int width, int height) +{ + ULONG_PTR p1m = (ULONG_PTR)p1; + ULONG_PTR p2m = (ULONG_PTR)p2; + + if (p1m <= p2m) + { + ULONG_PTR p1mEnd = p1m + 1ull * (height - 1) * p1Step + 1ull * width * p1Size; + + if (p1mEnd > p2m) + return TRUE; + } + else + { + ULONG_PTR p2mEnd = p2m + 1ull * (height - 1) * p2Step + 1ull * width * p2Size; + + if (p2mEnd > p1m) + return TRUE; + } + + /* else */ + return FALSE; +} + +/* ------------------------------------------------------------------------- */ +static pstatus_t general_copy_8u(const BYTE* pSrc, BYTE* pDst, INT32 len) +{ + if (memory_regions_overlap_1d(pSrc, pDst, (size_t)len)) + { + memmove((void*)pDst, (const void*)pSrc, (size_t)len); + } + else + { + memcpy((void*)pDst, (const void*)pSrc, (size_t)len); + } + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +/* Copy a block of pixels from one buffer to another. + * The addresses are assumed to have been already offset to the upper-left + * corners of the source and destination region of interest. + */ +static pstatus_t general_copy_8u_AC4r(const BYTE* pSrc, INT32 srcStep, BYTE* pDst, INT32 dstStep, + INT32 width, INT32 height) +{ + const BYTE* src = (const BYTE*)pSrc; + BYTE* dst = (BYTE*)pDst; + int rowbytes = width * sizeof(UINT32); + + if ((width == 0) || (height == 0)) + return PRIMITIVES_SUCCESS; + + if (memory_regions_overlap_2d(pSrc, srcStep, sizeof(UINT32), pDst, dstStep, sizeof(UINT32), + width, height)) + { + do + { + generic->copy(src, dst, rowbytes); + src += srcStep; + dst += dstStep; + } while (--height); + } + else + { + /* TODO: do it in one operation when the rowdata is adjacent. */ + do + { + /* If we find a replacement for memcpy that is consistently + * faster, this could be replaced with that. + */ + memcpy(dst, src, rowbytes); + src += srcStep; + dst += dstStep; + } while (--height); + } + + return PRIMITIVES_SUCCESS; +} + +#ifdef WITH_IPP +/* ------------------------------------------------------------------------- */ +/* This is just ippiCopy_8u_AC4R without the IppiSize structure parameter. */ +static pstatus_t ippiCopy_8u_AC4r(const BYTE* pSrc, INT32 srcStep, BYTE* pDst, INT32 dstStep, + INT32 width, INT32 height) +{ + IppiSize roi; + roi.width = width; + roi.height = height; + return (pstatus_t)ippiCopy_8u_AC4R(pSrc, srcStep, pDst, dstStep, roi); +} +#endif /* WITH_IPP */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_copy(primitives_t* prims) +{ + /* Start with the default. 
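+     * These generic C versions always work; primitives_init_copy_opt()
+     * below may replace them with tuned IPP or SIMD variants.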
+     */
+    prims->copy_8u = general_copy_8u;
+    prims->copy_8u_AC4r = general_copy_8u_AC4r;
+    /* This is just an alias with void* parameters */
+    prims->copy = (__copy_t)(prims->copy_8u);
+}
+
+#if defined(WITH_SSE2) || defined(WITH_NEON)
+void primitives_init_copy_opt(primitives_t* prims)
+{
+    generic = primitives_get_generic();
+    primitives_init_copy(prims);
+    /* Pick tuned versions if possible. */
+#ifdef WITH_IPP
+    prims->copy_8u = (__copy_8u_t)ippsCopy_8u;
+    prims->copy_8u_AC4r = (__copy_8u_AC4r_t)ippiCopy_8u_AC4r;
+#endif
+    /* Performance with an SSE2 version with no prefetch seemed to be
+     * all over the map vs. memcpy.
+     * Sometimes it was significantly faster, sometimes dreadfully slower,
+     * and it seemed to vary a lot depending on block size and processor.
+     * Hence, no SSE version is used here unless one can be written that
+     * is consistently faster than memcpy.
+     */
+    /* This is just an alias with void* parameters */
+    prims->copy = (__copy_t)(prims->copy_8u);
+}
+#endif
diff --git a/libfreerdp/primitives/prim_internal.h b/libfreerdp/primitives/prim_internal.h
new file mode 100644
index 0000000..cf5c124
--- /dev/null
+++ b/libfreerdp/primitives/prim_internal.h
@@ -0,0 +1,297 @@
+/* prim_internal.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License. Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifndef FREERDP_LIB_PRIM_INTERNAL_H
+#define FREERDP_LIB_PRIM_INTERNAL_H
+
+#include <freerdp/config.h>
+
+#include <freerdp/primitives.h>
+#include <freerdp/api.h>
+
+#ifdef __GNUC__
+#define PRIM_ALIGN_128 __attribute__((aligned(16)))
+#else
+#ifdef _WIN32
+#define PRIM_ALIGN_128 __declspec(align(16))
+#endif
+#endif
+
+#if defined(WITH_SSE2) || defined(WITH_NEON) || defined(WITH_OPENCL)
+#define HAVE_OPTIMIZED_PRIMITIVES 1
+#endif
+
+#if defined(WITH_SSE2) || defined(WITH_NEON)
+#define HAVE_CPU_OPTIMIZED_PRIMITIVES 1
+#endif
+
+#if defined(WITH_SSE2)
+/* Use lddqu for unaligned; load for 16-byte aligned. */
+#define LOAD_SI128(_ptr_) \
+    (((const ULONG_PTR)(_ptr_)&0x0f) ?
_mm_lddqu_si128((const __m128i*)(_ptr_)) \ + : _mm_load_si128((const __m128i*)(_ptr_))) +#endif + +static INLINE BYTE* writePixelBGRA(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G, + BYTE B, BYTE A) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(format); + + *dst++ = B; + *dst++ = G; + *dst++ = R; + *dst++ = A; + return dst; +} + +static INLINE BYTE* writePixelBGRX(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G, + BYTE B, BYTE A) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(format); + WINPR_UNUSED(A); + + *dst++ = B; + *dst++ = G; + *dst++ = R; + dst++; /* Do not touch alpha */ + + return dst; +} + +static INLINE BYTE* writePixelRGBA(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G, + BYTE B, BYTE A) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(format); + + *dst++ = R; + *dst++ = G; + *dst++ = B; + *dst++ = A; + return dst; +} + +static INLINE BYTE* writePixelRGBX(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G, + BYTE B, BYTE A) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(format); + WINPR_UNUSED(A); + + *dst++ = R; + *dst++ = G; + *dst++ = B; + dst++; /* Do not touch alpha */ + + return dst; +} + +static INLINE BYTE* writePixelABGR(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G, + BYTE B, BYTE A) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(format); + + *dst++ = A; + *dst++ = B; + *dst++ = G; + *dst++ = R; + return dst; +} + +static INLINE BYTE* writePixelXBGR(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G, + BYTE B, BYTE A) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(format); + WINPR_UNUSED(A); + + dst++; /* Do not touch alpha */ + *dst++ = B; + *dst++ = G; + *dst++ = R; + return dst; +} + +static INLINE BYTE* writePixelARGB(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G, + BYTE B, BYTE A) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(format); + + *dst++ = A; + *dst++ = R; + *dst++ = G; + *dst++ = B; + return dst; +} + +static INLINE BYTE* writePixelXRGB(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G, + BYTE B, BYTE A) +{ + WINPR_UNUSED(formatSize); + WINPR_UNUSED(format); + WINPR_UNUSED(A); + + dst++; /* Do not touch alpha */ + *dst++ = R; + *dst++ = G; + *dst++ = B; + return dst; +} + +static INLINE BYTE* writePixelGenericAlpha(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, + BYTE G, BYTE B, BYTE A) +{ + UINT32 color = FreeRDPGetColor(format, R, G, B, A); + FreeRDPWriteColor(dst, format, color); + return dst + formatSize; +} + +static INLINE BYTE* writePixelGeneric(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G, + BYTE B, BYTE A) +{ + UINT32 color = FreeRDPGetColor(format, R, G, B, A); + FreeRDPWriteColorIgnoreAlpha(dst, format, color); + return dst + formatSize; +} + +typedef BYTE* (*fkt_writePixel)(BYTE*, DWORD, UINT32, BYTE, BYTE, BYTE, BYTE); + +static INLINE fkt_writePixel getPixelWriteFunction(DWORD format, BOOL useAlpha) +{ + switch (format) + { + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return useAlpha ? writePixelARGB : writePixelXRGB; + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return useAlpha ? writePixelABGR : writePixelXBGR; + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return useAlpha ? writePixelRGBA : writePixelRGBX; + + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return useAlpha ? writePixelBGRA : writePixelBGRX; + + default: + return useAlpha ? 
writePixelGenericAlpha : writePixelGeneric; + } +} + +static INLINE BYTE CLIP(INT64 X) +{ + if (X > 255L) + return 255L; + + if (X < 0L) + return 0L; + + return (BYTE)X; +} + +static INLINE BYTE CONDITIONAL_CLIP(INT32 in, BYTE original) +{ + BYTE out = CLIP(in); + BYTE diff; + if (out > original) + diff = out - original; + else + diff = original - out; + if (diff < 30) + return original; + return out; +} + +/** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ +static INLINE INT32 C(INT32 Y) +{ + return (Y)-0L; +} + +static INLINE INT32 D(INT32 U) +{ + return (U)-128L; +} + +static INLINE INT32 E(INT32 V) +{ + return (V)-128L; +} + +static INLINE BYTE YUV2R(INT32 Y, INT32 U, INT32 V) +{ + const INT32 r = (256L * C(Y) + 0L * D(U) + 403L * E(V)); + const INT32 r8 = r >> 8L; + return CLIP(r8); +} + +static INLINE BYTE YUV2G(INT32 Y, INT32 U, INT32 V) +{ + const INT32 g = (256L * C(Y) - 48L * D(U) - 120L * E(V)); + const INT32 g8 = g >> 8L; + return CLIP(g8); +} + +static INLINE BYTE YUV2B(INT32 Y, INT32 U, INT32 V) +{ + const INT32 b = (256L * C(Y) + 475L * D(U) + 0L * E(V)); + const INT32 b8 = b >> 8L; + return CLIP(b8); +} + +/* Function prototypes for all the init/deinit routines. */ +FREERDP_LOCAL void primitives_init_copy(primitives_t* prims); +FREERDP_LOCAL void primitives_init_set(primitives_t* prims); +FREERDP_LOCAL void primitives_init_add(primitives_t* prims); +FREERDP_LOCAL void primitives_init_andor(primitives_t* prims); +FREERDP_LOCAL void primitives_init_shift(primitives_t* prims); +FREERDP_LOCAL void primitives_init_sign(primitives_t* prims); +FREERDP_LOCAL void primitives_init_alphaComp(primitives_t* prims); +FREERDP_LOCAL void primitives_init_colors(primitives_t* prims); +FREERDP_LOCAL void primitives_init_YCoCg(primitives_t* prims); +FREERDP_LOCAL void primitives_init_YUV(primitives_t* prims); + +#if defined(WITH_SSE2) || defined(WITH_NEON) +FREERDP_LOCAL void primitives_init_copy_opt(primitives_t* prims); +FREERDP_LOCAL void primitives_init_set_opt(primitives_t* prims); +FREERDP_LOCAL void primitives_init_add_opt(primitives_t* prims); +FREERDP_LOCAL void primitives_init_andor_opt(primitives_t* prims); +FREERDP_LOCAL void primitives_init_shift_opt(primitives_t* prims); +FREERDP_LOCAL void primitives_init_sign_opt(primitives_t* prims); +FREERDP_LOCAL void primitives_init_alphaComp_opt(primitives_t* prims); +FREERDP_LOCAL void primitives_init_colors_opt(primitives_t* prims); +FREERDP_LOCAL void primitives_init_YCoCg_opt(primitives_t* prims); +FREERDP_LOCAL void primitives_init_YUV_opt(primitives_t* prims); +#endif + +#if defined(WITH_OPENCL) +FREERDP_LOCAL BOOL primitives_init_opencl(primitives_t* prims); +#endif + +FREERDP_LOCAL primitives_t* primitives_get_by_type(DWORD type); + +#endif /* FREERDP_LIB_PRIM_INTERNAL_H */ diff --git a/libfreerdp/primitives/prim_set.c b/libfreerdp/primitives/prim_set.c new file mode 100644 index 0000000..c4012e6 --- /dev/null +++ b/libfreerdp/primitives/prim_set.c @@ -0,0 +1,122 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Routines to set a chunk of memory to a constant. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. 
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + +#include <freerdp/config.h> + +#include <string.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> + +#include "prim_internal.h" + +/* ========================================================================= */ +static pstatus_t general_set_8u(BYTE val, BYTE* pDst, UINT32 len) +{ + memset((void*)pDst, (int)val, (size_t)len); + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static pstatus_t general_zero(void* pDst, size_t len) +{ + memset(pDst, 0, len); + return PRIMITIVES_SUCCESS; +} + +/* ========================================================================= */ +static pstatus_t general_set_32s(INT32 val, INT32* pDst, UINT32 len) +{ + INT32* dptr = (INT32*)pDst; + size_t span = 0; + size_t remaining = 0; + primitives_t* prims = NULL; + + if (len < 256) + { + while (len--) + *dptr++ = val; + + return PRIMITIVES_SUCCESS; + } + + /* else quadratic growth memcpy algorithm */ + span = 1; + *dptr = val; + remaining = len - 1; + prims = primitives_get(); + + while (remaining) + { + size_t thiswidth = span; + + if (thiswidth > remaining) + thiswidth = remaining; + + prims->copy_8u((BYTE*)dptr, (BYTE*)(dptr + span), thiswidth << 2); + remaining -= thiswidth; + span <<= 1; + } + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static pstatus_t general_set_32u(UINT32 val, UINT32* pDst, UINT32 len) +{ + UINT32* dptr = (UINT32*)pDst; + size_t span = 0; + size_t remaining = 0; + primitives_t* prims = NULL; + + if (len < 256) + { + while (len--) + *dptr++ = val; + + return PRIMITIVES_SUCCESS; + } + + /* else quadratic growth memcpy algorithm */ + span = 1; + *dptr = val; + remaining = len - 1; + prims = primitives_get(); + + while (remaining) + { + size_t thiswidth = span; + + if (thiswidth > remaining) + thiswidth = remaining; + + prims->copy_8u((BYTE*)dptr, (BYTE*)(dptr + span), thiswidth << 2); + remaining -= thiswidth; + span <<= 1; + } + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +void primitives_init_set(primitives_t* prims) +{ + /* Start with the default. */ + prims->set_8u = general_set_8u; + prims->set_32s = general_set_32s; + prims->set_32u = general_set_32u; + prims->zero = general_zero; +} diff --git a/libfreerdp/primitives/prim_set_opt.c b/libfreerdp/primitives/prim_set_opt.c new file mode 100644 index 0000000..546d1ac --- /dev/null +++ b/libfreerdp/primitives/prim_set_opt.c @@ -0,0 +1,256 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized routines to set a chunk of memory to a constant. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing + * permissions and limitations under the License. + * + */ + +#include <freerdp/config.h> + +#include <string.h> +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include <winpr/sysinfo.h> + +#ifdef WITH_SSE2 +#include <emmintrin.h> +#endif /* WITH_SSE2 */ +#ifdef WITH_IPP +#include <ipps.h> +#endif /* WITH_IPP */ + +#include "prim_internal.h" + +static primitives_t* generic = NULL; + +/* ========================================================================= */ +#ifdef WITH_SSE2 +#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len) +{ + BYTE byte = 0; + BYTE* dptr = NULL; + __m128i xmm0; + size_t count = 0; + + if (len < 16) + return generic->set_8u(val, pDst, len); + + byte = val; + dptr = (BYTE*)pDst; + + /* Seek 16-byte alignment. */ + while ((ULONG_PTR)dptr & 0x0f) + { + *dptr++ = byte; + + if (--len == 0) + return PRIMITIVES_SUCCESS; + } + + xmm0 = _mm_set1_epi8(byte); + /* Cover 256-byte chunks via SSE register stores. */ + count = len >> 8; + len -= count << 8; + + /* Do 256-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + } + + /* Cover 16-byte chunks via SSE register stores. */ + count = len >> 4; + len -= count << 4; + + /* Do 16-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 16; + } + + /* Do leftover bytes. */ + while (len--) + *dptr++ = byte; + + return PRIMITIVES_SUCCESS; +} +#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif /* WITH_SSE2 */ + +/* ------------------------------------------------------------------------- */ +#ifdef WITH_SSE2 +#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 len) +{ + const primitives_t* prim = primitives_get_generic(); + UINT32* dptr = (UINT32*)pDst; + __m128i xmm0; + size_t count = 0; + + /* If really short, just do it here. */ + if (len < 32) + { + while (len--) + *dptr++ = val; + + return PRIMITIVES_SUCCESS; + } + + /* Assure we can reach 16-byte alignment. */ + if (((ULONG_PTR)dptr & 0x03) != 0) + { + return prim->set_32u(val, pDst, len); + } + + /* Seek 16-byte alignment. */ + while ((ULONG_PTR)dptr & 0x0f) + { + *dptr++ = val; + + if (--len == 0) + return PRIMITIVES_SUCCESS; + } + + xmm0 = _mm_set1_epi32(val); + /* Cover 256-byte chunks via SSE register stores. */ + count = len >> 6; + len -= count << 6; + + /* Do 256-byte chunks using one XMM register. 
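+     * (16 stores of 16 bytes per pass, i.e. 64 UINT32 values,
+     * which is why count = len >> 6 above.)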
*/ + while (count--) + { + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + } + + /* Cover 16-byte chunks via SSE register stores. */ + count = len >> 2; + len -= count << 2; + + /* Do 16-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 4; + } + + /* Do leftover bytes. */ + while (len--) + *dptr++ = val; + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len) +{ + UINT32 uval = *((UINT32*)&val); + return sse2_set_32u(uval, (UINT32*)pDst, len); +} +#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +/* ------------------------------------------------------------------------- */ +static pstatus_t ipp_wrapper_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, INT32 len) +{ + /* A little type conversion, then use the signed version. */ + INT32 sval = *((INT32*)&val); + return ippsSet_32s(sval, (INT32*)pDst, len); +} +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims) +{ + generic = primitives_get_generic(); + primitives_init_set(prims); + /* Pick tuned versions if possible. */ +#ifdef WITH_IPP + prims->set_8u = (__set_8u_t)ippsSet_8u; + prims->set_32s = (__set_32s_t)ippsSet_32s; + prims->set_32u = (__set_32u_t)ipp_wrapper_set_32u; + prims->zero = (__zero_t)ippsZero_8u; +#elif defined(WITH_SSE2) + + if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) + { + prims->set_8u = sse2_set_8u; + prims->set_32s = sse2_set_32s; + prims->set_32u = sse2_set_32u; + } + +#endif +} diff --git a/libfreerdp/primitives/prim_shift.c b/libfreerdp/primitives/prim_shift.c new file mode 100644 index 0000000..3729266 --- /dev/null +++ b/libfreerdp/primitives/prim_shift.c @@ -0,0 +1,115 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Shift operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> + +#include "prim_internal.h" +/* ------------------------------------------------------------------------- */ +static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16* pDst, UINT32 len) +{ + if (val == 0) + return PRIMITIVES_SUCCESS; + if (val >= 16) + return -1; + + while (len--) + *pDst++ = (INT16)((UINT16)*pSrc++ << val); + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static INLINE pstatus_t general_rShiftC_16s(const INT16* pSrc, UINT32 val, INT16* pDst, UINT32 len) +{ + if (val == 0) + return PRIMITIVES_SUCCESS; + if (val >= 16) + return -1; + + while (len--) + *pDst++ = *pSrc++ >> val; + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static INLINE pstatus_t general_lShiftC_16u(const UINT16* pSrc, UINT32 val, UINT16* pDst, + UINT32 len) +{ + if (val == 0) + return PRIMITIVES_SUCCESS; + if (val >= 16) + return -1; + + while (len--) + *pDst++ = (INT16)((UINT16)*pSrc++ << val); + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static INLINE pstatus_t general_rShiftC_16u(const UINT16* pSrc, UINT32 val, UINT16* pDst, + UINT32 len) +{ + if (val == 0) + return PRIMITIVES_SUCCESS; + if (val >= 16) + return -1; + + while (len--) + *pDst++ = *pSrc++ >> val; + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static INLINE pstatus_t general_shiftC_16s(const INT16* pSrc, INT32 val, INT16* pDst, UINT32 len) +{ + if (val == 0) + return PRIMITIVES_SUCCESS; + + if (val < 0) + return general_rShiftC_16s(pSrc, -val, pDst, len); + else + return general_lShiftC_16s(pSrc, val, pDst, len); +} + +/* ------------------------------------------------------------------------- */ +static INLINE pstatus_t general_shiftC_16u(const UINT16* pSrc, INT32 val, UINT16* pDst, UINT32 len) +{ + if (val == 0) + return PRIMITIVES_SUCCESS; + + if (val < 0) + return general_rShiftC_16u(pSrc, -val, pDst, len); + else + return general_lShiftC_16u(pSrc, val, pDst, len); +} + +/* ------------------------------------------------------------------------- */ +void primitives_init_shift(primitives_t* prims) +{ + /* Start with the default. */ + prims->lShiftC_16s = general_lShiftC_16s; + prims->rShiftC_16s = general_rShiftC_16s; + prims->lShiftC_16u = general_lShiftC_16u; + prims->rShiftC_16u = general_rShiftC_16u; + /* Wrappers */ + prims->shiftC_16s = general_shiftC_16s; + prims->shiftC_16u = general_shiftC_16u; +} diff --git a/libfreerdp/primitives/prim_shift_opt.c b/libfreerdp/primitives/prim_shift_opt.c new file mode 100644 index 0000000..9ac9533 --- /dev/null +++ b/libfreerdp/primitives/prim_shift_opt.c @@ -0,0 +1,80 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Shift operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include <winpr/sysinfo.h> + +#ifdef WITH_SSE2 +#include <emmintrin.h> +#include <pmmintrin.h> +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +#include <ipps.h> +#endif /* WITH_IPP */ + +#include "prim_internal.h" +#include "prim_templates.h" + +static primitives_t* generic = NULL; + +#ifdef WITH_SSE2 +#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, + *dptr++ = (INT16)((UINT16)*sptr++ << val)) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, + *dptr++ = *sptr++ >> val) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, + *dptr++ = (INT16)((UINT16)*sptr++ << val)) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, + *dptr++ = *sptr++ >> val) +#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s + * depending on the sign of val. To avoid using the deprecated inplace + * routines, a wrapper can use the src for the dest. + */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims) +{ + generic = primitives_get_generic(); + primitives_init_shift(prims); +#if defined(WITH_IPP) + prims->lShiftC_16s = ippsLShiftC_16s; + prims->rShiftC_16s = ippsRShiftC_16s; + prims->lShiftC_16u = ippsLShiftC_16u; + prims->rShiftC_16u = ippsRShiftC_16u; +#elif defined(WITH_SSE2) + + if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && + IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) + { + prims->lShiftC_16s = sse2_lShiftC_16s; + prims->rShiftC_16s = sse2_rShiftC_16s; + prims->lShiftC_16u = sse2_lShiftC_16u; + prims->rShiftC_16u = sse2_rShiftC_16u; + } + +#endif +} diff --git a/libfreerdp/primitives/prim_sign.c b/libfreerdp/primitives/prim_sign.c new file mode 100644 index 0000000..d89dc47 --- /dev/null +++ b/libfreerdp/primitives/prim_sign.c @@ -0,0 +1,42 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Sign operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> + +#include "prim_internal.h" + +/* ---------------------------------------------------------------------------- + * Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1). + */ +static pstatus_t general_sign_16s(const INT16* pSrc, INT16* pDst, UINT32 len) +{ + while (len--) + { + INT16 src = *pSrc++; + *pDst++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0); + } + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +void primitives_init_sign(primitives_t* prims) +{ + /* Start with the default. */ + prims->sign_16s = general_sign_16s; +} diff --git a/libfreerdp/primitives/prim_sign_opt.c b/libfreerdp/primitives/prim_sign_opt.c new file mode 100644 index 0000000..dae76a6 --- /dev/null +++ b/libfreerdp/primitives/prim_sign_opt.c @@ -0,0 +1,185 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized sign operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include <freerdp/config.h> + +#include <freerdp/types.h> +#include <freerdp/primitives.h> +#include <winpr/sysinfo.h> + +#ifdef WITH_SSE2 +#include <emmintrin.h> +#include <tmmintrin.h> +#endif /* WITH_SSE2 */ + +#include "prim_internal.h" + +static primitives_t* generic = NULL; + +#ifdef WITH_SSE2 +/* ------------------------------------------------------------------------- */ +static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst, + UINT32 len) +{ + const INT16* sptr = (const INT16*)pSrc; + INT16* dptr = (INT16*)pDst; + size_t count = 0; + + if (len < 16) + { + return generic->sign_16s(pSrc, pDst, len); + } + + /* Check for 16-byte alignment (eventually). */ + if ((ULONG_PTR)pDst & 0x01) + { + return generic->sign_16s(pSrc, pDst, len); + } + + /* Seek 16-byte alignment. */ + while ((ULONG_PTR)dptr & 0x0f) + { + INT16 src = *sptr++; + *dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0); + + if (--len == 0) + return PRIMITIVES_SUCCESS; + } + + /* Do 32-short chunks using 8 XMM registers. 
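+     * _mm_sign_epi16(a, b) negates, zeroes, or keeps each lane of a
+     * according to the sign of the matching lane in b, so applying it
+     * to a register of 0x0001 constants yields -1, 0, or 1 per lane.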
*/ + count = len >> 5; /* / 32 */ + len -= count << 5; /* * 32 */ + + if ((ULONG_PTR)sptr & 0x0f) + { + /* Unaligned */ + while (count--) + { + __m128i xmm0; + __m128i xmm1; + __m128i xmm2; + __m128i xmm3; + __m128i xmm4; + __m128i xmm5; + __m128i xmm6; + __m128i xmm7; + xmm0 = _mm_set1_epi16(0x0001U); + xmm1 = _mm_set1_epi16(0x0001U); + xmm2 = _mm_set1_epi16(0x0001U); + xmm3 = _mm_set1_epi16(0x0001U); + xmm4 = _mm_lddqu_si128((const __m128i*)sptr); + sptr += 8; + xmm5 = _mm_lddqu_si128((const __m128i*)sptr); + sptr += 8; + xmm6 = _mm_lddqu_si128((const __m128i*)sptr); + sptr += 8; + xmm7 = _mm_lddqu_si128((const __m128i*)sptr); + sptr += 8; + xmm0 = _mm_sign_epi16(xmm0, xmm4); + xmm1 = _mm_sign_epi16(xmm1, xmm5); + xmm2 = _mm_sign_epi16(xmm2, xmm6); + xmm3 = _mm_sign_epi16(xmm3, xmm7); + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 8; + _mm_store_si128((__m128i*)dptr, xmm1); + dptr += 8; + _mm_store_si128((__m128i*)dptr, xmm2); + dptr += 8; + _mm_store_si128((__m128i*)dptr, xmm3); + dptr += 8; + } + } + else + { + /* Aligned */ + while (count--) + { + __m128i xmm0; + __m128i xmm1; + __m128i xmm2; + __m128i xmm3; + __m128i xmm4; + __m128i xmm5; + __m128i xmm6; + __m128i xmm7; + xmm0 = _mm_set1_epi16(0x0001U); + xmm1 = _mm_set1_epi16(0x0001U); + xmm2 = _mm_set1_epi16(0x0001U); + xmm3 = _mm_set1_epi16(0x0001U); + xmm4 = _mm_load_si128((const __m128i*)sptr); + sptr += 8; + xmm5 = _mm_load_si128((const __m128i*)sptr); + sptr += 8; + xmm6 = _mm_load_si128((const __m128i*)sptr); + sptr += 8; + xmm7 = _mm_load_si128((const __m128i*)sptr); + sptr += 8; + xmm0 = _mm_sign_epi16(xmm0, xmm4); + xmm1 = _mm_sign_epi16(xmm1, xmm5); + xmm2 = _mm_sign_epi16(xmm2, xmm6); + xmm3 = _mm_sign_epi16(xmm3, xmm7); + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 8; + _mm_store_si128((__m128i*)dptr, xmm1); + dptr += 8; + _mm_store_si128((__m128i*)dptr, xmm2); + dptr += 8; + _mm_store_si128((__m128i*)dptr, xmm3); + dptr += 8; + } + } + + /* Do 8-short chunks using two XMM registers. */ + count = len >> 3; + len -= count << 3; + + while (count--) + { + __m128i xmm0 = _mm_set1_epi16(0x0001U); + __m128i xmm1 = LOAD_SI128(sptr); + sptr += 8; + xmm0 = _mm_sign_epi16(xmm0, xmm1); + _mm_store_si128((__m128i*)dptr, xmm0); + dptr += 8; + } + + /* Do leftovers. */ + while (len--) + { + INT16 src = *sptr++; + *dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0); + } + + return PRIMITIVES_SUCCESS; +} +#endif /* WITH_SSE2 */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims) +{ + generic = primitives_get_generic(); + primitives_init_sign(prims); + /* Pick tuned versions if possible. */ + /* I didn't spot an IPP version of this. */ +#if defined(WITH_SSE2) + + if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && + IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) + { + prims->sign_16s = ssse3_sign_16s; + } + +#endif +} diff --git a/libfreerdp/primitives/prim_templates.h b/libfreerdp/primitives/prim_templates.h new file mode 100644 index 0000000..5ab85a8 --- /dev/null +++ b/libfreerdp/primitives/prim_templates.h @@ -0,0 +1,444 @@ +/* prim_templates.h + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. 
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + */ + +#ifdef __GNUC__ +#pragma once +#endif + +#ifndef FREERDP_LIB_PRIM_TEMPLATES_H +#define FREERDP_LIB_PRIM_TEMPLATES_H + +/* These are prototypes for SSE (potentially NEON) routines that do a + * simple SSE operation over an array of data. Since so much of this + * code is shared except for the operation itself, these prototypes are + * used rather than duplicating code. The naming convention depends on + * the parameters: S=Source param; C=Constant; D=Destination. + * All the macros have parameters for a fallback procedure if the data + * is too small and an operation "the slow way" for use at 16-byte edges. + */ + +/* SSE3 note: If someone needs to support an SSE2 version of these without + * SSE3 support, an alternative version could be added that merely checks + * that 16-byte alignment on both destination and source(s) can be + * achieved, rather than use LDDQU for unaligned reads. + */ + +/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant. + * It easily can't do that if the value is stored in a variable. + * So don't save it as an intermediate value. + */ + +/* ---------------------------------------------------------------------------- + * SCD = Source, Constant, Destination + */ +#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ + static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \ + { \ + INT32 shifts = 0; \ + UINT32 offBeatMask; \ + const _type_* sptr = pSrc; \ + _type_* dptr = pDst; \ + int count; \ + if (val == 0) \ + return PRIMITIVES_SUCCESS; \ + if (val >= 16) \ + return -1; \ + if (len < 16) /* pointless if too small */ \ + { \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + if (sizeof(_type_) == 1) \ + shifts = 1; \ + else if (sizeof(_type_) == 2) \ + shifts = 2; \ + else if (sizeof(_type_) == 4) \ + shifts = 3; \ + else if (sizeof(_type_) == 8) \ + shifts = 4; \ + offBeatMask = (1 << (shifts - 1)) - 1; \ + if ((ULONG_PTR)pDst & offBeatMask) \ + { \ + /* Incrementing the pointer skips over 16-byte boundary. */ \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + /* Get to the 16-byte boundary now. */ \ + while ((ULONG_PTR)dptr & 0x0f) \ + { \ + _slowWay_; \ + if (--len == 0) \ + return PRIMITIVES_SUCCESS; \ + } \ + /* Use 8 128-bit SSE registers. 
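+         * (8 registers x 16 bytes = 128 bytes per pass, which is     \
+         *  2^(8 - shifts) elements of _type_, hence the shift counts \
+         *  just below.)                                               \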
*/ \ + count = len >> (8 - shifts); \ + len -= count << (8 - shifts); \ + if ((const ULONG_PTR)sptr & 0x0f) \ + { \ + while (count--) \ + { \ + __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + xmm0 = _op_(xmm0, val); \ + xmm1 = _op_(xmm1, val); \ + xmm2 = _op_(xmm2, val); \ + xmm3 = _op_(xmm3, val); \ + xmm4 = _op_(xmm4, val); \ + xmm5 = _op_(xmm5, val); \ + xmm6 = _op_(xmm6, val); \ + xmm7 = _op_(xmm7, val); \ + _mm_store_si128((__m128i*)dptr, xmm0); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm1); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm2); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm3); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm4); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm5); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm6); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm7); \ + dptr += (16 / sizeof(_type_)); \ + } \ + } \ + else \ + { \ + while (count--) \ + { \ + __m128i xmm0 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm5 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm6 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm7 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + xmm0 = _op_(xmm0, val); \ + xmm1 = _op_(xmm1, val); \ + xmm2 = _op_(xmm2, val); \ + xmm3 = _op_(xmm3, val); \ + xmm4 = _op_(xmm4, val); \ + xmm5 = _op_(xmm5, val); \ + xmm6 = _op_(xmm6, val); \ + xmm7 = _op_(xmm7, val); \ + _mm_store_si128((__m128i*)dptr, xmm0); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm1); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm2); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm3); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm4); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm5); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm6); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm7); \ + dptr += (16 / sizeof(_type_)); \ + } \ + } \ + /* Use a single 128-bit SSE register. 
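+         * (one 16-byte store per pass, i.e. 2^(5 - shifts) elements  \
+         *  of _type_.)                                                \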
*/ \ + count = len >> (5 - shifts); \ + len -= count << (5 - shifts); \ + while (count--) \ + { \ + __m128i xmm0 = LOAD_SI128(sptr); \ + sptr += (16 / sizeof(_type_)); \ + xmm0 = _op_(xmm0, val); \ + _mm_store_si128((__m128i*)dptr, xmm0); \ + dptr += (16 / sizeof(_type_)); \ + } \ + /* Finish off the remainder. */ \ + while (len--) \ + { \ + _slowWay_; \ + } \ + return PRIMITIVES_SUCCESS; \ + } + +/* ---------------------------------------------------------------------------- + * SCD = Source, Constant, Destination + * PRE = preload xmm0 with the constant. + */ +#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ + static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 len) \ + { \ + int shifts = 0; \ + UINT32 offBeatMask; \ + const _type_* sptr = pSrc; \ + _type_* dptr = pDst; \ + size_t count; \ + __m128i xmm0; \ + if (len < 16) /* pointless if too small */ \ + { \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + if (sizeof(_type_) == 1) \ + shifts = 1; \ + else if (sizeof(_type_) == 2) \ + shifts = 2; \ + else if (sizeof(_type_) == 4) \ + shifts = 3; \ + else if (sizeof(_type_) == 8) \ + shifts = 4; \ + offBeatMask = (1 << (shifts - 1)) - 1; \ + if ((ULONG_PTR)pDst & offBeatMask) \ + { \ + /* Incrementing the pointer skips over 16-byte boundary. */ \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + /* Get to the 16-byte boundary now. */ \ + while ((ULONG_PTR)dptr & 0x0f) \ + { \ + _slowWay_; \ + if (--len == 0) \ + return PRIMITIVES_SUCCESS; \ + } \ + /* Use 4 128-bit SSE registers. */ \ + count = len >> (7 - shifts); \ + len -= count << (7 - shifts); \ + xmm0 = _mm_set1_epi32(val); \ + if ((const ULONG_PTR)sptr & 0x0f) \ + { \ + while (count--) \ + { \ + __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + xmm1 = _op_(xmm1, xmm0); \ + xmm2 = _op_(xmm2, xmm0); \ + xmm3 = _op_(xmm3, xmm0); \ + xmm4 = _op_(xmm4, xmm0); \ + _mm_store_si128((__m128i*)dptr, xmm1); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm2); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm3); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm4); \ + dptr += (16 / sizeof(_type_)); \ + } \ + } \ + else \ + { \ + while (count--) \ + { \ + __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \ + sptr += (16 / sizeof(_type_)); \ + xmm1 = _op_(xmm1, xmm0); \ + xmm2 = _op_(xmm2, xmm0); \ + xmm3 = _op_(xmm3, xmm0); \ + xmm4 = _op_(xmm4, xmm0); \ + _mm_store_si128((__m128i*)dptr, xmm1); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm2); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm3); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm4); \ + dptr += (16 / sizeof(_type_)); \ + } \ + } \ + /* Use a single 128-bit SSE register. 
*/ \ + count = len >> (5 - shifts); \ + len -= count << (5 - shifts); \ + while (count--) \ + { \ + __m128i xmm1 = LOAD_SI128(sptr); \ + sptr += (16 / sizeof(_type_)); \ + xmm1 = _op_(xmm1, xmm0); \ + _mm_store_si128((__m128i*)dptr, xmm1); \ + dptr += (16 / sizeof(_type_)); \ + } \ + /* Finish off the remainder. */ \ + while (len--) \ + { \ + _slowWay_; \ + } \ + return PRIMITIVES_SUCCESS; \ + } + +/* ---------------------------------------------------------------------------- + * SSD = Source1, Source2, Destination + */ +#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ + static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \ + { \ + int shifts = 0; \ + UINT32 offBeatMask; \ + const _type_* sptr1 = pSrc1; \ + const _type_* sptr2 = pSrc2; \ + _type_* dptr = pDst; \ + size_t count; \ + if (len < 16) /* pointless if too small */ \ + { \ + return _fallback_(pSrc1, pSrc2, pDst, len); \ + } \ + if (sizeof(_type_) == 1) \ + shifts = 1; \ + else if (sizeof(_type_) == 2) \ + shifts = 2; \ + else if (sizeof(_type_) == 4) \ + shifts = 3; \ + else if (sizeof(_type_) == 8) \ + shifts = 4; \ + offBeatMask = (1 << (shifts - 1)) - 1; \ + if ((ULONG_PTR)pDst & offBeatMask) \ + { \ + /* Incrementing the pointer skips over 16-byte boundary. */ \ + return _fallback_(pSrc1, pSrc2, pDst, len); \ + } \ + /* Get to the 16-byte boundary now. */ \ + while ((ULONG_PTR)dptr & 0x0f) \ + { \ + pstatus_t status; \ + status = _slowWay_; \ + if (status != PRIMITIVES_SUCCESS) \ + return status; \ + if (--len == 0) \ + return PRIMITIVES_SUCCESS; \ + } \ + /* Use 4 128-bit SSE registers. */ \ + count = len >> (7 - shifts); \ + len -= count << (7 - shifts); \ + if (((const ULONG_PTR)sptr1 & 0x0f) || ((const ULONG_PTR)sptr2 & 0x0f)) \ + { \ + /* Unaligned loads */ \ + while (count--) \ + { \ + __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr1); \ + sptr1 += (16 / sizeof(_type_)); \ + __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr1); \ + sptr1 += (16 / sizeof(_type_)); \ + __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr1); \ + sptr1 += (16 / sizeof(_type_)); \ + __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr1); \ + sptr1 += (16 / sizeof(_type_)); \ + __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr2); \ + sptr2 += (16 / sizeof(_type_)); \ + __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr2); \ + sptr2 += (16 / sizeof(_type_)); \ + __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr2); \ + sptr2 += (16 / sizeof(_type_)); \ + __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr2); \ + sptr2 += (16 / sizeof(_type_)); \ + xmm0 = _op_(xmm0, xmm4); \ + xmm1 = _op_(xmm1, xmm5); \ + xmm2 = _op_(xmm2, xmm6); \ + xmm3 = _op_(xmm3, xmm7); \ + _mm_store_si128((__m128i*)dptr, xmm0); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm1); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm2); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm3); \ + dptr += (16 / sizeof(_type_)); \ + } \ + } \ + else \ + { \ + /* Aligned loads */ \ + while (count--) \ + { \ + __m128i xmm0 = _mm_load_si128((const __m128i*)sptr1); \ + sptr1 += (16 / sizeof(_type_)); \ + __m128i xmm1 = _mm_load_si128((const __m128i*)sptr1); \ + sptr1 += (16 / sizeof(_type_)); \ + __m128i xmm2 = _mm_load_si128((const __m128i*)sptr1); \ + sptr1 += (16 / sizeof(_type_)); \ + __m128i xmm3 = _mm_load_si128((const __m128i*)sptr1); \ + sptr1 += (16 / sizeof(_type_)); \ + __m128i xmm4 = _mm_load_si128((const __m128i*)sptr2); \ + sptr2 
+= (16 / sizeof(_type_)); \ + __m128i xmm5 = _mm_load_si128((const __m128i*)sptr2); \ + sptr2 += (16 / sizeof(_type_)); \ + __m128i xmm6 = _mm_load_si128((const __m128i*)sptr2); \ + sptr2 += (16 / sizeof(_type_)); \ + __m128i xmm7 = _mm_load_si128((const __m128i*)sptr2); \ + sptr2 += (16 / sizeof(_type_)); \ + xmm0 = _op_(xmm0, xmm4); \ + xmm1 = _op_(xmm1, xmm5); \ + xmm2 = _op_(xmm2, xmm6); \ + xmm3 = _op_(xmm3, xmm7); \ + _mm_store_si128((__m128i*)dptr, xmm0); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm1); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm2); \ + dptr += (16 / sizeof(_type_)); \ + _mm_store_si128((__m128i*)dptr, xmm3); \ + dptr += (16 / sizeof(_type_)); \ + } \ + } \ + /* Use a single 128-bit SSE register. */ \ + count = len >> (5 - shifts); \ + len -= count << (5 - shifts); \ + while (count--) \ + { \ + __m128i xmm0 = LOAD_SI128(sptr1); \ + sptr1 += (16 / sizeof(_type_)); \ + __m128i xmm1 = LOAD_SI128(sptr2); \ + sptr2 += (16 / sizeof(_type_)); \ + xmm0 = _op_(xmm0, xmm1); \ + _mm_store_si128((__m128i*)dptr, xmm0); \ + dptr += (16 / sizeof(_type_)); \ + } \ + /* Finish off the remainder. */ \ + while (len--) \ + { \ + _slowWay_; \ + } \ + return PRIMITIVES_SUCCESS; \ + } + +#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */ diff --git a/libfreerdp/primitives/primitives.c b/libfreerdp/primitives/primitives.c new file mode 100644 index 0000000..da8bd40 --- /dev/null +++ b/libfreerdp/primitives/primitives.c @@ -0,0 +1,412 @@ +/* primitives.c + * This code queries processor features and calls the init/deinit routines. + * vi:ts=4 sw=4 + * + * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com> + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Copyright 2019 David Fort <contact@hardening-consulting.com> + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include <freerdp/config.h> + +#include <string.h> +#include <stdlib.h> + +#include <winpr/synch.h> +#include <winpr/sysinfo.h> +#include <winpr/crypto.h> +#include <freerdp/primitives.h> + +#include "prim_internal.h" + +#include <freerdp/log.h> +#define TAG FREERDP_TAG("primitives") + +/* hints to know which kind of primitives to use */ +static primitive_hints primitivesHints = PRIMITIVES_AUTODETECT; +static BOOL primitives_init_optimized(primitives_t* prims); + +void primitives_set_hints(primitive_hints hints) +{ + primitivesHints = hints; +} + +primitive_hints primitives_get_hints(void) +{ + return primitivesHints; +} + +/* Singleton pointer used throughout the program when requested. 
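primitives_get_generic() initializes it once via InitOnce and returns it; the CPU-optimized, GPU and autodetected variants below follow the same pattern.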
*/ +static primitives_t pPrimitivesGeneric = { 0 }; +static INIT_ONCE generic_primitives_InitOnce = INIT_ONCE_STATIC_INIT; + +#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) +static primitives_t pPrimitivesCpu = { 0 }; +static INIT_ONCE cpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT; + +#endif +#if defined(WITH_OPENCL) +static primitives_t pPrimitivesGpu = { 0 }; +static INIT_ONCE gpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT; + +#endif + +static INIT_ONCE auto_primitives_InitOnce = INIT_ONCE_STATIC_INIT; + +static primitives_t pPrimitives = { 0 }; + +/* ------------------------------------------------------------------------- */ +static BOOL primitives_init_generic(primitives_t* prims) +{ + primitives_init_add(prims); + primitives_init_andor(prims); + primitives_init_alphaComp(prims); + primitives_init_copy(prims); + primitives_init_set(prims); + primitives_init_shift(prims); + primitives_init_sign(prims); + primitives_init_colors(prims); + primitives_init_YCoCg(prims); + primitives_init_YUV(prims); + prims->uninit = NULL; + return TRUE; +} + +static BOOL CALLBACK primitives_init_generic_cb(PINIT_ONCE once, PVOID param, PVOID* context) +{ + WINPR_UNUSED(once); + WINPR_UNUSED(param); + WINPR_UNUSED(context); + return primitives_init_generic(&pPrimitivesGeneric); +} + +static BOOL primitives_init_optimized(primitives_t* prims) +{ + primitives_init_generic(prims); + +#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) + primitives_init_add_opt(prims); + primitives_init_andor_opt(prims); + primitives_init_alphaComp_opt(prims); + primitives_init_copy_opt(prims); + primitives_init_set_opt(prims); + primitives_init_shift_opt(prims); + primitives_init_sign_opt(prims); + primitives_init_colors_opt(prims); + primitives_init_YCoCg_opt(prims); + primitives_init_YUV_opt(prims); + prims->flags |= PRIM_FLAGS_HAVE_EXTCPU; +#endif + return TRUE; +} + +typedef struct +{ + BYTE* channels[3]; + UINT32 steps[3]; + prim_size_t roi; + BYTE* outputBuffer; + UINT32 outputStride; + UINT32 testedFormat; +} primitives_YUV_benchmark; + +static void primitives_YUV_benchmark_free(primitives_YUV_benchmark* bench) +{ + if (!bench) + return; + + free(bench->outputBuffer); + + for (int i = 0; i < 3; i++) + free(bench->channels[i]); + memset(bench, 0, sizeof(primitives_YUV_benchmark)); +} + +static primitives_YUV_benchmark* primitives_YUV_benchmark_init(primitives_YUV_benchmark* ret) +{ + prim_size_t* roi = NULL; + if (!ret) + return NULL; + + memset(ret, 0, sizeof(primitives_YUV_benchmark)); + roi = &ret->roi; + roi->width = 1024; + roi->height = 768; + ret->outputStride = roi->width * 4; + ret->testedFormat = PIXEL_FORMAT_BGRA32; + + ret->outputBuffer = calloc(ret->outputStride, roi->height); + if (!ret->outputBuffer) + goto fail; + + for (int i = 0; i < 3; i++) + { + BYTE* buf = ret->channels[i] = calloc(roi->width, roi->height); + if (!buf) + goto fail; + + winpr_RAND(buf, 1ull * roi->width * roi->height); + ret->steps[i] = roi->width; + } + + return ret; + +fail: + primitives_YUV_benchmark_free(ret); + return ret; +} + +static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims, + UINT64 runTime, UINT32* computations) +{ + ULONGLONG dueDate = 0; + const BYTE* channels[3] = { 0 }; + pstatus_t status = 0; + + *computations = 0; + + for (size_t i = 0; i < 3; i++) + channels[i] = bench->channels[i]; + + /* do a first dry run to initialize cache and such */ + status = prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer, + bench->outputStride, bench->testedFormat, &bench->roi); + 
if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* let's run the benchmark */ + dueDate = GetTickCount64() + runTime; + while (GetTickCount64() < dueDate) + { + pstatus_t cstatus = + prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer, + bench->outputStride, bench->testedFormat, &bench->roi); + if (cstatus != PRIMITIVES_SUCCESS) + return FALSE; + *computations = *computations + 1; + } + return TRUE; +} + +static BOOL primitives_autodetect_best(primitives_t* prims) +{ + BOOL ret = FALSE; + struct prim_benchmark + { + const char* name; + primitives_t* prims; + UINT32 flags; + UINT32 count; + }; + + struct prim_benchmark testcases[] = + { + { "generic", NULL, PRIMITIVES_PURE_SOFT, 0 }, +#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) + { "optimized", NULL, PRIMITIVES_ONLY_CPU, 0 }, +#endif +#if defined(WITH_OPENCL) + { "opencl", NULL, PRIMITIVES_ONLY_GPU, 0 }, +#endif + }; + const struct prim_benchmark* best = NULL; + +#if !defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) && !defined(WITH_OPENCL) + { + struct prim_benchmark* cur = &testcases[0]; + cur->prims = primitives_get_by_type(cur->flags); + if (!cur->prims) + { + WLog_WARN(TAG, "Failed to initialize %s primitives", cur->name); + return FALSE; + } + WLog_DBG(TAG, "primitives benchmark: only one backend, skipping..."); + best = cur; + } +#else + { + UINT64 benchDuration = 150; /* 150 ms */ + primitives_YUV_benchmark bench = { 0 }; + primitives_YUV_benchmark* yuvBench = primitives_YUV_benchmark_init(&bench); + if (!yuvBench) + return FALSE; + + WLog_DBG(TAG, "primitives benchmark result:"); + for (size_t x = 0; x < ARRAYSIZE(testcases); x++) + { + struct prim_benchmark* cur = &testcases[x]; + cur->prims = primitives_get_by_type(cur->flags); + if (!cur->prims) + { + WLog_WARN(TAG, "Failed to initialize %s primitives", cur->name); + continue; + } + if (!primitives_YUV_benchmark_run(yuvBench, cur->prims, benchDuration, &cur->count)) + { + WLog_WARN(TAG, "error running %s YUV bench", cur->name); + continue; + } + + WLog_DBG(TAG, " * %s= %" PRIu32, cur->name, cur->count); + if (!best || (best->count < cur->count)) + best = cur; + } + primitives_YUV_benchmark_free(yuvBench); + } +#endif + + if (!best) + { + WLog_ERR(TAG, "No primitives to test, aborting."); + goto out; + } + /* finally compute the results */ + *prims = *best->prims; + + WLog_DBG(TAG, "primitives autodetect, using %s", best->name); + ret = TRUE; +out: + if (!ret) + *prims = pPrimitivesGeneric; + + return ret; +} + +#if defined(WITH_OPENCL) +static BOOL CALLBACK primitives_init_gpu_cb(PINIT_ONCE once, PVOID param, PVOID* context) +{ + WINPR_UNUSED(once); + WINPR_UNUSED(param); + WINPR_UNUSED(context); + + if (!primitives_init_opencl(&pPrimitivesGpu)) + return FALSE; + + return TRUE; +} +#endif + +#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) +static BOOL CALLBACK primitives_init_cpu_cb(PINIT_ONCE once, PVOID param, PVOID* context) +{ + WINPR_UNUSED(once); + WINPR_UNUSED(param); + WINPR_UNUSED(context); + + if (!primitives_init_optimized(&pPrimitivesCpu)) + return FALSE; + + return TRUE; +} +#endif + +static BOOL CALLBACK primitives_auto_init_cb(PINIT_ONCE once, PVOID param, PVOID* context) +{ + WINPR_UNUSED(once); + WINPR_UNUSED(param); + WINPR_UNUSED(context); + + return primitives_init(&pPrimitives, primitivesHints); +} + +BOOL primitives_init(primitives_t* p, primitive_hints hints) +{ + switch (hints) + { + case PRIMITIVES_AUTODETECT: + return primitives_autodetect_best(p); + case PRIMITIVES_PURE_SOFT: + *p = pPrimitivesGeneric; + return TRUE; + case 
PRIMITIVES_ONLY_CPU: +#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) + *p = pPrimitivesCpu; + return TRUE; +#endif + case PRIMITIVES_ONLY_GPU: +#if defined(WITH_OPENCL) + *p = pPrimitivesGpu; + return TRUE; +#endif + default: + WLog_ERR(TAG, "unknown hint %d", hints); + return FALSE; + } +} + +void primitives_uninit(void) +{ +#if defined(WITH_OPENCL) + if (pPrimitivesGpu.uninit) + pPrimitivesGpu.uninit(); +#endif +#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) + if (pPrimitivesCpu.uninit) + pPrimitivesCpu.uninit(); +#endif + if (pPrimitivesGeneric.uninit) + pPrimitivesGeneric.uninit(); +} + +/* ------------------------------------------------------------------------- */ +static void setup(void) +{ + InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, NULL, NULL); +#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) + InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL); +#endif +#if defined(WITH_OPENCL) + InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, NULL, NULL); +#endif + InitOnceExecuteOnce(&auto_primitives_InitOnce, primitives_auto_init_cb, NULL, NULL); +} + +primitives_t* primitives_get(void) +{ + setup(); + return &pPrimitives; +} + +primitives_t* primitives_get_generic(void) +{ + InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, NULL, NULL); + return &pPrimitivesGeneric; +} + +primitives_t* primitives_get_by_type(DWORD type) +{ + InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, NULL, NULL); + + switch (type) + { + case PRIMITIVES_ONLY_GPU: +#if defined(WITH_OPENCL) + if (!InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, NULL, NULL)) + return NULL; + return &pPrimitivesGpu; +#endif + case PRIMITIVES_ONLY_CPU: +#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) + if (!InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL)) + return NULL; + return &pPrimitivesCpu; +#endif + case PRIMITIVES_PURE_SOFT: + default: + return &pPrimitivesGeneric; + } +} + +DWORD primitives_flags(primitives_t* p) +{ + return p->flags; +} diff --git a/libfreerdp/primitives/primitives.cl b/libfreerdp/primitives/primitives.cl new file mode 100644 index 0000000..5e094df --- /dev/null +++ b/libfreerdp/primitives/primitives.cl @@ -0,0 +1,463 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Optimized operations using openCL + * vi:ts=4 sw=4 + * + * Copyright 2019 David Fort <contact@hardening-consulting.com> + * Copyright 2019 Rangee Gmbh + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +#define STRINGIFY(x) #x + +STRINGIFY( +uchar clamp_uc(int v, short l, short h) +{ + if (v > h) + v = h; + if (v < l) + v = l; + return (uchar)v; +} + +__kernel void yuv420_to_rgba_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short Udim = bufU[(y / 2) * strideU + (x / 2)] - 128; + short Vdim = bufV[(y / 2) * strideV + (x / 2)] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = clamp_uc((y256 + (403 * Vdim)) >> 8, 0, 255); /* R */ + destPtr[1] = clamp_uc((y256 - (48 * Udim) - (120 * Vdim)) >> 8 , 0, 255); /* G */ + destPtr[2] = clamp_uc((y256 + (475 * Udim)) >> 8, 0, 255); /* B */ + /* A */ +} + +__kernel void yuv420_to_abgr_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[(y / 2) * strideU + (x / 2)] - 128; + short V = bufV[(y / 2) * strideV + (x / 2)] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + /* A */ + destPtr[1] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[3] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ +} + +__kernel void yuv444_to_abgr_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[y * strideU + x] - 128; + short V = bufV[y * strideV + x] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + /* A */ + destPtr[1] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[3] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ +} + +__kernel void yuv444_to_rgba_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[y * strideU + x] - 128; + short V = bufV[y * strideV + x] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ + destPtr[1] = 
clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[2] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + /* A */ +} + +__kernel void yuv420_to_rgbx_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short Udim = bufU[(y / 2) * strideU + (x / 2)] - 128; + short Vdim = bufV[(y / 2) * strideV + (x / 2)] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = clamp_uc((y256 + (403 * Vdim)) >> 8, 0, 255); /* R */ + destPtr[1] = clamp_uc((y256 - (48 * Udim) - (120 * Vdim)) >> 8 , 0, 255); /* G */ + destPtr[2] = clamp_uc((y256 + (475 * Udim)) >> 8, 0, 255); /* B */ + destPtr[3] = 0xff; /* A */ +} + +__kernel void yuv420_to_xbgr_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[(y / 2) * strideU + (x / 2)] - 128; + short V = bufV[(y / 2) * strideV + (x / 2)] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = 0xff; /* A */ + destPtr[1] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[3] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ +} + +__kernel void yuv444_to_xbgr_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[y * strideU + x] - 128; + short V = bufV[y * strideV + x] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = 0xff; /* A */ + destPtr[1] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[3] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ +} + +__kernel void yuv444_to_rgbx_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[y * strideU + x] - 128; + short V = bufV[y * strideV + x] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = clamp_uc((y256 + (403 
* V)) >> 8, 0, 255); /* R */ + destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[2] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[3] = 0xff; /* A */ +} + + +__kernel void yuv420_to_argb_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short Udim = bufU[(y / 2) * strideU + (x / 2)] - 128; + short Vdim = bufV[(y / 2) * strideV + (x / 2)] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + /* A */ + destPtr[1] = clamp_uc((y256 + (403 * Vdim)) >> 8, 0, 255); /* R */ + destPtr[2] = clamp_uc((y256 - (48 * Udim) - (120 * Vdim)) >> 8 , 0, 255); /* G */ + destPtr[3] = clamp_uc((y256 + (475 * Udim)) >> 8, 0, 255); /* B */ +} + +__kernel void yuv420_to_bgra_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[(y / 2) * strideU + (x / 2)] - 128; + short V = bufV[(y / 2) * strideV + (x / 2)] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ + /* A */ +} + +__kernel void yuv444_to_bgra_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[y * strideU + x] - 128; + short V = bufV[y * strideV + x] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ + /* A */ +} + +__kernel void yuv444_to_argb_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[y * strideU + x] - 128; + short V = bufV[y * strideV + x] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[3] = clamp_uc((y256 
+ (475 * U)) >> 8, 0, 255); /* B */ + destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[1] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ + /* A */ +} + +__kernel void yuv420_to_xrgb_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short Udim = bufU[(y / 2) * strideU + (x / 2)] - 128; + short Vdim = bufV[(y / 2) * strideV + (x / 2)] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = 0xff; /* A */ + destPtr[1] = clamp_uc((y256 + (403 * Vdim)) >> 8, 0, 255); /* R */ + destPtr[2] = clamp_uc((y256 - (48 * Udim) - (120 * Vdim)) >> 8 , 0, 255); /* G */ + destPtr[3] = clamp_uc((y256 + (475 * Udim)) >> 8, 0, 255); /* B */ +} + +__kernel void yuv420_to_bgrx_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[(y / 2) * strideU + (x / 2)] - 128; + short V = bufV[(y / 2) * strideV + (x / 2)] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ + destPtr[3] = 0xff; /* A */ +} + +__kernel void yuv444_to_bgrx_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[y * strideU + x] - 128; + short V = bufV[y * strideV + x] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ + destPtr[3] = 0xff; /* A */ +} + +__kernel void yuv444_to_xrgb_1b( + __global const uchar *bufY, unsigned strideY, + __global const uchar *bufU, unsigned strideU, + __global const uchar *bufV, unsigned strideV, + __global uchar *dest, unsigned strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + short Y = bufY[y * strideY + x]; + short U = bufU[y * strideU + x] - 128; + short V = bufV[y * strideV + x] - 128; + + __global uchar *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int 
y256 = 256 * Y; + destPtr[3] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[1] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ + destPtr[0] = 0xff; /* A */ +} +) diff --git a/libfreerdp/primitives/test/CMakeLists.txt b/libfreerdp/primitives/test/CMakeLists.txt new file mode 100644 index 0000000..f3b7b72 --- /dev/null +++ b/libfreerdp/primitives/test/CMakeLists.txt @@ -0,0 +1,45 @@ + +set(MODULE_NAME "TestPrimitives") +set(MODULE_PREFIX "TEST_FREERDP_PRIMITIVES") + +set(${MODULE_PREFIX}_DRIVER ${MODULE_NAME}.c) + +set(${MODULE_PREFIX}_TESTS + TestPrimitivesAdd.c + TestPrimitivesAlphaComp.c + TestPrimitivesAndOr.c + TestPrimitivesColors.c + TestPrimitivesCopy.c + TestPrimitivesSet.c + TestPrimitivesShift.c + TestPrimitivesSign.c + TestPrimitivesYUV.c + TestPrimitivesYCbCr.c + TestPrimitivesYCoCg.c) + +create_test_sourcelist(${MODULE_PREFIX}_SRCS + ${${MODULE_PREFIX}_DRIVER} + ${${MODULE_PREFIX}_TESTS}) + +set(${MODULE_PREFIX}_EXTRA_SRCS + prim_test.c + prim_test.h + measure.h) + +add_executable(${MODULE_NAME} ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_EXTRA_SRCS}) + +set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} winpr freerdp) + +target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS}) + +add_definitions(-DPRIM_STATIC=auto -DALL_PRIMITIVES_VERSIONS) + +set_target_properties(${MODULE_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${TESTING_OUTPUT_DIRECTORY}") + +foreach(test ${${MODULE_PREFIX}_TESTS}) + get_filename_component(TestName ${test} NAME_WE) + add_test(${TestName} ${TESTING_OUTPUT_DIRECTORY}/${MODULE_NAME} ${TestName}) +endforeach() + +set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/Test") + diff --git a/libfreerdp/primitives/test/TestPrimitivesAdd.c b/libfreerdp/primitives/test/TestPrimitivesAdd.c new file mode 100644 index 0000000..9edbae9 --- /dev/null +++ b/libfreerdp/primitives/test/TestPrimitivesAdd.c @@ -0,0 +1,82 @@ +/* test_add.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +#include <freerdp/config.h> + +#include <winpr/sysinfo.h> +#include "prim_test.h" + +#define FUNC_TEST_SIZE 65536 +/* ========================================================================= */ +static BOOL test_add16s_func(void) +{ + pstatus_t status = 0; + + INT16 ALIGN(src1[FUNC_TEST_SIZE + 3]) = { 0 }; + INT16 ALIGN(src2[FUNC_TEST_SIZE + 3]) = { 0 }; + INT16 ALIGN(d1[FUNC_TEST_SIZE + 3]) = { 0 }; + INT16 ALIGN(d2[FUNC_TEST_SIZE + 3]) = { 0 }; + + winpr_RAND(src1, sizeof(src1)); + winpr_RAND(src2, sizeof(src2)); + status = generic->add_16s(src1 + 1, src2 + 1, d1 + 1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = optimized->add_16s(src1 + 1, src2 + 1, d2 + 2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +/* ------------------------------------------------------------------------- */ +static BOOL test_add16s_speed(void) +{ + BYTE ALIGN(src1[MAX_TEST_SIZE + 3]); + BYTE ALIGN(src2[MAX_TEST_SIZE + 3]); + BYTE ALIGN(dst[MAX_TEST_SIZE + 3]); + + if (!g_TestPrimitivesPerformance) + return TRUE; + + winpr_RAND(src1, sizeof(src1)); + winpr_RAND(src2, sizeof(src2)); + + if (!speed_test("add16s", "aligned", g_Iterations, (speed_test_fkt)generic->add_16s, + (speed_test_fkt)optimized->add_16s, src1, src2, dst, FUNC_TEST_SIZE)) + return FALSE; + + return TRUE; +} + +int TestPrimitivesAdd(int argc, char* argv[]) +{ + + WINPR_UNUSED(argc); + WINPR_UNUSED(argv); + + prim_test_setup(FALSE); + if (!test_add16s_func()) + return -1; + + if (g_TestPrimitivesPerformance) + { + if (!test_add16s_speed()) + return -1; + } + + return 0; +} diff --git a/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c b/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c new file mode 100644 index 0000000..5aecc2e --- /dev/null +++ b/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c @@ -0,0 +1,202 @@ +/* test_alphaComp.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +#include <freerdp/config.h> + +#include <winpr/sysinfo.h> + +#include "prim_test.h" + +#define MAX_BLOCK_SIZE 256 +#define SIZE_SQUARED (MAX_BLOCK_SIZE * MAX_BLOCK_SIZE) + +/* ========================================================================= */ +#define ALF(_c_) (((_c_)&0xFF000000U) >> 24) +#define RED(_c_) (((_c_)&0x00FF0000U) >> 16) +#define GRN(_c_) (((_c_)&0x0000FF00U) >> 8) +#define BLU(_c_) ((_c_)&0x000000FFU) +#define TOLERANCE 1 +static inline const UINT32* PIXEL(const BYTE* _addr_, UINT32 _bytes_, UINT32 _x_, UINT32 _y_) +{ + const BYTE* addr = _addr_ + _x_ * sizeof(UINT32) + _y_ * _bytes_; + return (const UINT32*)addr; +} + +#define SRC1_WIDTH 6 +#define SRC1_HEIGHT 6 +#define SRC2_WIDTH 7 +#define SRC2_HEIGHT 7 +#define DST_WIDTH 9 +#define DST_HEIGHT 9 +#define TEST_WIDTH 4 +#define TEST_HEIGHT 5 + +/* ------------------------------------------------------------------------- */ +static UINT32 alpha_add(UINT32 c1, UINT32 c2) +{ + UINT32 a1 = ALF(c1); + UINT32 r1 = RED(c1); + UINT32 g1 = GRN(c1); + UINT32 b1 = BLU(c1); + UINT32 a2 = ALF(c2); + UINT32 r2 = RED(c2); + UINT32 g2 = GRN(c2); + UINT32 b2 = BLU(c2); + UINT32 a3 = ((a1 * a1 + (255 - a1) * a2) / 255) & 0xff; + UINT32 r3 = ((a1 * r1 + (255 - a1) * r2) / 255) & 0xff; + UINT32 g3 = ((a1 * g1 + (255 - a1) * g2) / 255) & 0xff; + UINT32 b3 = ((a1 * b1 + (255 - a1) * b2) / 255) & 0xff; + return (a3 << 24) | (r3 << 16) | (g3 << 8) | b3; +} + +/* ------------------------------------------------------------------------- */ +static UINT32 colordist(UINT32 c1, UINT32 c2) +{ + int d = 0; + int maxd = 0; + d = ABS((INT32)(ALF(c1) - ALF(c2))); + + if (d > maxd) + maxd = d; + + d = ABS((INT32)(RED(c1) - RED(c2))); + + if (d > maxd) + maxd = d; + + d = ABS((INT32)(GRN(c1) - GRN(c2))); + + if (d > maxd) + maxd = d; + + d = ABS((INT32)(BLU(c1) - BLU(c2))); + + if (d > maxd) + maxd = d; + + return maxd; +} + +/* ------------------------------------------------------------------------- */ +static BOOL check(const BYTE* pSrc1, UINT32 src1Step, const BYTE* pSrc2, UINT32 src2Step, + BYTE* pDst, UINT32 dstStep, UINT32 width, UINT32 height) +{ + for (UINT32 y = 0; y < height; ++y) + { + for (UINT32 x = 0; x < width; ++x) + { + UINT32 s1 = *PIXEL(pSrc1, src1Step, x, y); + UINT32 s2 = *PIXEL(pSrc2, src2Step, x, y); + UINT32 c0 = alpha_add(s1, s2); + UINT32 c1 = *PIXEL(pDst, dstStep, x, y); + + if (colordist(c0, c1) > TOLERANCE) + { + printf("alphaComp-general: [%" PRIu32 ",%" PRIu32 "] 0x%08" PRIx32 "+0x%08" PRIx32 + "=0x%08" PRIx32 ", got 0x%08" PRIx32 "\n", + x, y, s1, s2, c0, c1); + return FALSE; + } + } + } + + return TRUE; +} + +static BOOL test_alphaComp_func(void) +{ + pstatus_t status = 0; + BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT * 4]) = { 0 }; + BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT * 4]) = { 0 }; + BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT * 4]) = { 0 }; + UINT32* ptr = NULL; + winpr_RAND(src1, sizeof(src1)); + /* Special-case the first two values */ + src1[0] &= 0x00FFFFFFU; + src1[1] |= 0xFF000000U; + winpr_RAND(src2, sizeof(src2)); + /* Set the second operand to fully-opaque. 
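With src2 alpha forced to 0xFF, alpha_add() reduces to blending src1 over an opaque background, so results are easy to verify by hand: e.g. 0x80FF0000 over 0xFF0000FF gives 0xBF80007F.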
*/ + ptr = (UINT32*)src2; + + for (UINT32 i = 0; i < sizeof(src2) / 4; ++i) + *ptr++ |= 0xFF000000U; + + status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1, + 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + if (!check(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH, + TEST_HEIGHT)) + return FALSE; + + status = optimized->alphaComp_argb((const BYTE*)src1, 4 * SRC1_WIDTH, (const BYTE*)src2, + 4 * SRC2_WIDTH, (BYTE*)dst1, 4 * DST_WIDTH, TEST_WIDTH, + TEST_HEIGHT); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + if (!check(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH, + TEST_HEIGHT)) + return FALSE; + + return TRUE; +} + +static int test_alphaComp_speed(void) +{ + BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT]) = { 0 }; + BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT]) = { 0 }; + BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT]) = { 0 }; + UINT32* ptr = NULL; + + winpr_RAND(src1, sizeof(src1)); + /* Special-case the first two values */ + src1[0] &= 0x00FFFFFFU; + src1[1] |= 0xFF000000U; + winpr_RAND(src2, sizeof(src2)); + /* Set the second operand to fully-opaque. */ + ptr = (UINT32*)src2; + + for (UINT32 i = 0; i < sizeof(src2) / 4; ++i) + *ptr++ |= 0xFF000000U; + + if (!speed_test("add16s", "aligned", g_Iterations, (speed_test_fkt)generic->alphaComp_argb, + (speed_test_fkt)optimized->alphaComp_argb, src1, 4 * SRC1_WIDTH, src2, + 4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT)) + return FALSE; + + return TRUE; +} + +int TestPrimitivesAlphaComp(int argc, char* argv[]) +{ + WINPR_UNUSED(argc); + WINPR_UNUSED(argv); + + prim_test_setup(FALSE); + + if (!test_alphaComp_func()) + return -1; + + if (g_TestPrimitivesPerformance) + { + if (!test_alphaComp_speed()) + return -1; + } + + return 0; +} diff --git a/libfreerdp/primitives/test/TestPrimitivesAndOr.c b/libfreerdp/primitives/test/TestPrimitivesAndOr.c new file mode 100644 index 0000000..b3e52f6 --- /dev/null +++ b/libfreerdp/primitives/test/TestPrimitivesAndOr.c @@ -0,0 +1,169 @@ +/* test_andor.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +#include <freerdp/config.h> + +#include <winpr/sysinfo.h> + +#include "prim_test.h" + +#define FUNC_TEST_SIZE 65536 + +#define VALUE (0xA5A5A5A5U) + +/* ========================================================================= */ +static BOOL test_and_32u_impl(const char* name, __andC_32u_t fkt, const UINT32* src, + const UINT32 val, UINT32* dst, size_t size) +{ + pstatus_t status = fkt(src, val, dst, size); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + for (size_t i = 0; i < size; ++i) + { + if (dst[i] != (src[i] & val)) + { + + printf("AND %s FAIL[%" PRIuz "] 0x%08" PRIx32 "&0x%08" PRIx32 "=0x%08" PRIx32 + ", got 0x%08" PRIx32 "\n", + name, i, src[i], val, (src[i] & val), dst[i]); + + return FALSE; + } + } + + return TRUE; +} + +static BOOL test_and_32u_func(void) +{ + UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]) = { 0 }; + UINT32 ALIGN(dst[FUNC_TEST_SIZE + 3]) = { 0 }; + + winpr_RAND(src, sizeof(src)); + + if (!test_and_32u_impl("generic->andC_32u aligned", generic->andC_32u, src + 1, VALUE, dst + 1, + FUNC_TEST_SIZE)) + return FALSE; + if (!test_and_32u_impl("generic->andC_32u unaligned", generic->andC_32u, src + 1, VALUE, + dst + 2, FUNC_TEST_SIZE)) + return FALSE; + if (!test_and_32u_impl("optimized->andC_32u aligned", optimized->andC_32u, src + 1, VALUE, + dst + 1, FUNC_TEST_SIZE)) + return FALSE; + if (!test_and_32u_impl("optimized->andC_32u unaligned", optimized->andC_32u, src + 1, VALUE, + dst + 2, FUNC_TEST_SIZE)) + return FALSE; + + return TRUE; +} + +/* ------------------------------------------------------------------------- */ +static BOOL test_and_32u_speed(void) +{ + UINT32 ALIGN(src[MAX_TEST_SIZE + 3]) = { 0 }; + UINT32 ALIGN(dst[MAX_TEST_SIZE + 3]) = { 0 }; + + winpr_RAND(src, sizeof(src)); + + if (!speed_test("andC_32u", "aligned", g_Iterations, (speed_test_fkt)generic->andC_32u, + (speed_test_fkt)optimized->andC_32u, src + 1, VALUE, dst + 1, MAX_TEST_SIZE)) + return FALSE; + if (!speed_test("andC_32u", "unaligned", g_Iterations, (speed_test_fkt)generic->andC_32u, + (speed_test_fkt)optimized->andC_32u, src + 1, VALUE, dst + 2, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; +} + +/* ========================================================================= */ +static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value) +{ + for (UINT32 i = 0; i < size; ++i) + { + if (dst[i] != (src[i] | value)) + { + printf("OR-general general FAIL[%" PRIu32 "] 0x%08" PRIx32 "&0x%08" PRIx32 + "=0x%08" PRIx32 ", got 0x%08" PRIx32 "\n", + i, src[i], value, src[i] | value, dst[i]); + return FALSE; + } + } + + return TRUE; +} + +static BOOL test_or_32u_func(void) +{ + pstatus_t status = 0; + UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]) = { 0 }; + UINT32 ALIGN(dst[FUNC_TEST_SIZE + 3]) = { 0 }; + + winpr_RAND(src, sizeof(src)); + + status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + if (!check(src + 1, dst + 1, FUNC_TEST_SIZE, VALUE)) + return FALSE; + + status = optimized->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + if (!check(src + 1, dst + 1, FUNC_TEST_SIZE, VALUE)) + return FALSE; + + return TRUE; +} + +/* ------------------------------------------------------------------------- */ +static BOOL test_or_32u_speed(void) +{ + UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]) = { 0 }; + UINT32 ALIGN(dst[FUNC_TEST_SIZE + 3]) = { 0 }; + + winpr_RAND(src, sizeof(src)); + + if (!speed_test("add16s", "aligned", g_Iterations, 
(speed_test_fkt)generic->orC_32u, + (speed_test_fkt)optimized->orC_32u, src + 1, VALUE, dst + 1, FUNC_TEST_SIZE)) + return FALSE; + + return TRUE; +} + +int TestPrimitivesAndOr(int argc, char* argv[]) +{ + WINPR_UNUSED(argc); + WINPR_UNUSED(argv); + + prim_test_setup(FALSE); + + if (!test_and_32u_func()) + return -1; + + if (!test_or_32u_func()) + return -1; + + if (g_TestPrimitivesPerformance) + { + if (!test_and_32u_speed()) + return -1; + if (!test_or_32u_speed()) + return -1; + } + + return 0; +} diff --git a/libfreerdp/primitives/test/TestPrimitivesColors.c b/libfreerdp/primitives/test/TestPrimitivesColors.c new file mode 100644 index 0000000..c297b4f --- /dev/null +++ b/libfreerdp/primitives/test/TestPrimitivesColors.c @@ -0,0 +1,298 @@ +/* test_colors.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include <freerdp/config.h> + +#include <winpr/sysinfo.h> +#include <freerdp/utils/profiler.h> + +#include "prim_test.h" + +/* ------------------------------------------------------------------------- */ +static BOOL test_RGBToRGB_16s8u_P3AC4R_func(prim_size_t roi, DWORD DstFormat) +{ + INT16* r = NULL; + INT16* g = NULL; + INT16* b = NULL; + BYTE* out1 = NULL; + BYTE* out2 = NULL; + BOOL failed = FALSE; + const INT16* ptrs[3]; + const UINT32 rgbStride = roi.width * 2; + const UINT32 dstStride = roi.width * 4; + PROFILER_DEFINE(genericProf) + PROFILER_DEFINE(optProf) + PROFILER_CREATE(genericProf, "RGBToRGB_16s8u_P3AC4R-GENERIC") + PROFILER_CREATE(optProf, "RGBToRGB_16s8u_P3AC4R-OPTIMIZED") + r = winpr_aligned_calloc(1, rgbStride * roi.height, 16); + g = winpr_aligned_calloc(1, rgbStride * roi.height, 16); + b = winpr_aligned_calloc(1, rgbStride * roi.height, 16); + out1 = winpr_aligned_calloc(1, dstStride * roi.height, 16); + out2 = winpr_aligned_calloc(1, dstStride * roi.height, 16); + + if (!r || !g || !b || !out1 || !out2) + goto fail; + +#if 0 + { + for (UINT32 y = 0; y < roi.height; y++) + { + for (UINT32 x = 0; x < roi.width; x++) + { + r[y * roi.width + x] = 0x01; + g[y * roi.width + x] = 0x02; + b[y * roi.width + x] = 0x04; + } + } + } +#else + winpr_RAND(r, rgbStride * roi.height); + winpr_RAND(g, rgbStride * roi.height); + winpr_RAND(b, rgbStride * roi.height); +#endif + ptrs[0] = r; + ptrs[1] = g; + ptrs[2] = b; + PROFILER_ENTER(genericProf) + + if (generic->RGBToRGB_16s8u_P3AC4R(ptrs, rgbStride, out1, dstStride, DstFormat, &roi) != + PRIMITIVES_SUCCESS) + goto fail; + + PROFILER_EXIT(genericProf) + PROFILER_ENTER(optProf) + + if (optimized->RGBToRGB_16s8u_P3AC4R(ptrs, rgbStride, out2, dstStride, DstFormat, &roi) != + PRIMITIVES_SUCCESS) + goto fail; + + PROFILER_EXIT(optProf) + + if (memcmp(out1, out2, dstStride * roi.height) != 0) + { + for (UINT64 i = 0; i < 1ull * roi.width * roi.height; ++i) + { + const UINT32 o1 = FreeRDPReadColor(out1 + 4 * i, DstFormat); + const UINT32 o2 = FreeRDPReadColor(out2 + 4 * i, DstFormat); + + if (o1 != o2) + { + printf("RGBToRGB_16s8u_P3AC4R FAIL: 
out1[%" PRIu64 "]=0x%08" PRIx8 " out2[%" PRIu64 + "]=0x%08" PRIx8 "\n", + i, out1[i], i, out2[i]); + failed = TRUE; + } + } + } + + printf("Results for %" PRIu32 "x%" PRIu32 " [%s]", roi.width, roi.height, + FreeRDPGetColorFormatName(DstFormat)); + PROFILER_PRINT_HEADER + PROFILER_PRINT(genericProf) + PROFILER_PRINT(optProf) + PROFILER_PRINT_FOOTER +fail: + PROFILER_FREE(genericProf) + PROFILER_FREE(optProf) + winpr_aligned_free(r); + winpr_aligned_free(g); + winpr_aligned_free(b); + winpr_aligned_free(out1); + winpr_aligned_free(out2); + return !failed; +} + +/* ------------------------------------------------------------------------- */ +static BOOL test_RGBToRGB_16s8u_P3AC4R_speed(void) +{ + union + { + const INT16** cpv; + INT16** pv; + } cnv; + const prim_size_t roi64x64 = { 64, 64 }; + INT16 ALIGN(r[4096 + 1]); + INT16 ALIGN(g[4096 + 1]); + INT16 ALIGN(b[4096 + 1]); + UINT32 ALIGN(dst[4096 + 1]); + INT16* ptrs[3]; + winpr_RAND(r, sizeof(r)); + winpr_RAND(g, sizeof(g)); + winpr_RAND(b, sizeof(b)); + + /* clear upper bytes */ + for (int i = 0; i < 4096; ++i) + { + r[i] &= 0x00FFU; + g[i] &= 0x00FFU; + b[i] &= 0x00FFU; + } + + ptrs[0] = r + 1; + ptrs[1] = g + 1; + ptrs[2] = b + 1; + + cnv.pv = ptrs; + if (!speed_test("RGBToRGB_16s8u_P3AC4R", "aligned", g_Iterations, + generic->RGBToRGB_16s8u_P3AC4R, optimized->RGBToRGB_16s8u_P3AC4R, cnv.cpv, + 64 * 2, (BYTE*)dst, 64 * 4, &roi64x64)) + return FALSE; + + if (!speed_test("RGBToRGB_16s8u_P3AC4R", "unaligned", g_Iterations, + generic->RGBToRGB_16s8u_P3AC4R, optimized->RGBToRGB_16s8u_P3AC4R, cnv.cpv, + 64 * 2, ((BYTE*)dst) + 1, 64 * 4, &roi64x64)) + return FALSE; + + return TRUE; +} + +/* ========================================================================= */ +static BOOL test_yCbCrToRGB_16s16s_P3P3_func(void) +{ + pstatus_t status = 0; + INT16 ALIGN(y[4096]) = { 0 }; + INT16 ALIGN(cb[4096]) = { 0 }; + INT16 ALIGN(cr[4096]) = { 0 }; + INT16 ALIGN(r1[4096]) = { 0 }; + INT16 ALIGN(g1[4096]) = { 0 }; + INT16 ALIGN(b1[4096]) = { 0 }; + INT16 ALIGN(r2[4096]) = { 0 }; + INT16 ALIGN(g2[4096]) = { 0 }; + INT16 ALIGN(b2[4096]) = { 0 }; + const INT16* in[3]; + INT16* out1[3]; + INT16* out2[3]; + prim_size_t roi = { 64, 64 }; + winpr_RAND(y, sizeof(y)); + winpr_RAND(cb, sizeof(cb)); + winpr_RAND(cr, sizeof(cr)); + + /* Normalize to 11.5 fixed radix */ + for (int i = 0; i < 4096; ++i) + { + y[i] &= 0x1FE0U; + cb[i] &= 0x1FE0U; + cr[i] &= 0x1FE0U; + } + + in[0] = y; + in[1] = cb; + in[2] = cr; + out1[0] = r1; + out1[1] = g1; + out1[2] = b1; + out2[0] = r2; + out2[1] = g2; + out2[2] = b2; + status = generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + for (int i = 0; i < 4096; ++i) + { + if ((ABS(r1[i] - r2[i]) > 1) || (ABS(g1[i] - g2[i]) > 1) || (ABS(b1[i] - b2[i]) > 1)) + { + printf("YCbCrToRGB-SSE FAIL[%d]: %" PRId16 ",%" PRId16 ",%" PRId16 " vs %" PRId16 + ",%" PRId16 ",%" PRId16 "\n", + i, r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]); + return FALSE; + } + } + + return TRUE; +} + +/* ------------------------------------------------------------------------- */ +static int test_yCbCrToRGB_16s16s_P3P3_speed(void) +{ + prim_size_t roi = { 64, 64 }; + INT16 ALIGN(y[4096]); + INT16 ALIGN(cb[4096]); + INT16 ALIGN(cr[4096]); + INT16 ALIGN(r[4096]); + INT16 ALIGN(g[4096]); + INT16 ALIGN(b[4096]); + const INT16* input[3]; + INT16* output[3]; + winpr_RAND(y, 
sizeof(y)); + winpr_RAND(cb, sizeof(cb)); + winpr_RAND(cr, sizeof(cr)); + + /* Normalize to 11.5 fixed radix */ + for (int i = 0; i < 4096; ++i) + { + y[i] &= 0x1FE0U; + cb[i] &= 0x1FE0U; + cr[i] &= 0x1FE0U; + } + + input[0] = y; + input[1] = cb; + input[2] = cr; + output[0] = r; + output[1] = g; + output[2] = b; + + if (!speed_test("yCbCrToRGB_16s16s_P3P3", "aligned", g_Iterations, + (speed_test_fkt)generic->yCbCrToRGB_16s16s_P3P3, + (speed_test_fkt)optimized->yCbCrToRGB_16s16s_P3P3, input, 64 * 2, output, + 64 * 2, &roi)) + return FALSE; + + return TRUE; +} + +int TestPrimitivesColors(int argc, char* argv[]) +{ + const DWORD formats[] = { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_ABGR32, + PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32, + PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 }; + prim_size_t roi = { 1920 / 4, 1080 / 4 }; + WINPR_UNUSED(argc); + WINPR_UNUSED(argv); + prim_test_setup(FALSE); + + for (UINT32 x = 0; x < sizeof(formats) / sizeof(formats[0]); x++) + { + if (!test_RGBToRGB_16s8u_P3AC4R_func(roi, formats[x])) + return 1; + +#if 0 + + if (g_TestPrimitivesPerformance) + { + if (!test_RGBToRGB_16s8u_P3AC4R_speed()) + return 1; + } + + if (!test_yCbCrToRGB_16s16s_P3P3_func()) + return 1; + + if (g_TestPrimitivesPerformance) + { + if (!test_yCbCrToRGB_16s16s_P3P3_speed()) + return 1; + } + +#endif + } + + return 0; +} diff --git a/libfreerdp/primitives/test/TestPrimitivesCopy.c b/libfreerdp/primitives/test/TestPrimitivesCopy.c new file mode 100644 index 0000000..8c681f2 --- /dev/null +++ b/libfreerdp/primitives/test/TestPrimitivesCopy.c @@ -0,0 +1,90 @@ +/* test_copy.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +#include <freerdp/config.h> + +#include <winpr/sysinfo.h> +#include "prim_test.h" + +#define COPY_TESTSIZE (256 * 2 + 16 * 2 + 15 + 15) + +/* ------------------------------------------------------------------------- */ +static BOOL test_copy8u_func(void) +{ + primitives_t* prims = primitives_get(); + BYTE ALIGN(data[COPY_TESTSIZE + 15]) = { 0 }; + winpr_RAND(data, sizeof(data)); + + for (int soff = 0; soff < 16; ++soff) + { + for (int doff = 0; doff < 16; ++doff) + { + for (int length = 1; length <= COPY_TESTSIZE - doff; ++length) + { + BYTE ALIGN(dest[COPY_TESTSIZE + 15]) = { 0 }; + + if (prims->copy_8u(data + soff, dest + doff, length) != PRIMITIVES_SUCCESS) + return FALSE; + + for (int i = 0; i < length; ++i) + { + if (dest[i + doff] != data[i + soff]) + { + printf("COPY8U FAIL: off=%d len=%d, dest[%d]=0x%02" PRIx8 "" + "data[%d]=0x%02" PRIx8 "\n", + doff, length, i + doff, dest[i + doff], i + soff, data[i + soff]); + return FALSE; + } + } + } + } + } + + return TRUE; +} + +/* ------------------------------------------------------------------------- */ +static BOOL test_copy8u_speed(void) +{ + BYTE ALIGN(src[MAX_TEST_SIZE + 4]); + BYTE ALIGN(dst[MAX_TEST_SIZE + 4]); + + if (!speed_test("copy_8u", "aligned", g_Iterations, (speed_test_fkt)generic->copy_8u, + (speed_test_fkt)optimized->copy_8u, src, dst, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("copy_8u", "unaligned", g_Iterations, (speed_test_fkt)generic->copy_8u, + (speed_test_fkt)optimized->copy_8u, src + 1, dst + 1, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; +} + +int TestPrimitivesCopy(int argc, char* argv[]) +{ + WINPR_UNUSED(argc); + WINPR_UNUSED(argv); + prim_test_setup(FALSE); + + if (!test_copy8u_func()) + return 1; + + if (g_TestPrimitivesPerformance) + { + if (!test_copy8u_speed()) + return 1; + } + + return 0; +} diff --git a/libfreerdp/primitives/test/TestPrimitivesSet.c b/libfreerdp/primitives/test/TestPrimitivesSet.c new file mode 100644 index 0000000..c6cefcc --- /dev/null +++ b/libfreerdp/primitives/test/TestPrimitivesSet.c @@ -0,0 +1,274 @@ +/* test_set.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. 
diff --git a/libfreerdp/primitives/test/TestPrimitivesSet.c b/libfreerdp/primitives/test/TestPrimitivesSet.c
new file mode 100644
index 0000000..c6cefcc
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesSet.c
@@ -0,0 +1,274 @@
+/* test_set.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+/* ------------------------------------------------------------------------- */
+static BOOL check8(const BYTE* src, UINT32 length, UINT32 offset, BYTE value)
+{
+	for (UINT32 i = 0; i < length; ++i)
+	{
+		if (src[offset + i] != value)
+		{
+			printf("SET8U FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%02" PRIx8
+			       "\n",
+			       offset, length, i + offset, src[i + offset]);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+static BOOL test_set8u_func(void)
+{
+	pstatus_t status = 0;
+
+	for (UINT32 off = 0; off < 16; ++off)
+	{
+		BYTE dest[1024];
+
+		memset(dest, 3, sizeof(dest));
+		for (UINT32 len = 1; len < 48 - off; ++len)
+		{
+			status = generic->set_8u(0xa5, dest + off, len);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return FALSE;
+
+			if (!check8(dest, len, off, 0xa5))
+				return FALSE;
+		}
+	}
+
+	for (UINT32 off = 0; off < 16; ++off)
+	{
+		BYTE dest[1024];
+
+		memset(dest, 3, sizeof(dest));
+		for (UINT32 len = 1; len < 48 - off; ++len)
+		{
+			status = optimized->set_8u(0xa5, dest + off, len);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return FALSE;
+
+			if (!check8(dest, len, off, 0xa5))
+				return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_set8u_speed(void)
+{
+	BYTE dest[1024];
+	BYTE value = 0;
+
+	for (UINT32 x = 0; x < 16; x++)
+	{
+		winpr_RAND(&value, sizeof(value));
+
+		if (!speed_test("set_8u", "", g_Iterations, (speed_test_fkt)generic->set_8u,
+		                (speed_test_fkt)optimized->set_8u, value, dest + x, x))
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+static BOOL check32s(const INT32* src, UINT32 length, UINT32 offset, INT32 value)
+{
+	for (UINT32 i = 0; i < length; ++i)
+	{
+		if (src[offset + i] != value)
+		{
+			printf("SET32S FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%08" PRIx32
+			       "\n",
+			       offset, length, i + offset, src[i + offset]);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_set32s_func(void)
+{
+	pstatus_t status = 0;
+	const INT32 value = -0x12345678;
+
+	for (UINT32 off = 0; off < 16; ++off)
+	{
+		INT32 dest[1024] = { 0 };
+
+		for (UINT32 len = 1; len < 48 - off; ++len)
+		{
+			status = generic->set_32s(value, dest + off, len);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return FALSE;
+
+			if (!check32s(dest, len, off, value))
+				return FALSE;
+		}
+	}
+
+	for (UINT32 off = 0; off < 16; ++off)
+	{
+		INT32 dest[1024] = { 0 };
+
+		for (UINT32 len = 1; len < 48 - off; ++len)
+		{
+			status = optimized->set_32s(value, dest + off, len);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return FALSE;
+
+			if (!check32s(dest, len, off, value))
+				return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+static BOOL check32u(const UINT32* src, UINT32 length, UINT32 offset, UINT32 value)
+{
+	for (UINT32 i = 0; i < length; ++i)
+	{
+		if (src[offset + i] != value)
+		{
+			printf("SET32U FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%08" PRIx32
+			       "\n",
+			       offset, length, i + offset, src[i + offset]);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_set32u_func(void)
+{
+	pstatus_t status = 0;
+	const UINT32 value = 0xABCDEF12;
+
+	for (UINT32 off = 0; off < 16; ++off)
+	{
+		UINT32 dest[1024] = { 0 };
+
+		for (UINT32 len = 1; len < 48 - off; ++len)
+		{
+			status = generic->set_32u(value, dest + off, len);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return FALSE;
+
+			if (!check32u(dest, len, off, value))
+				return FALSE;
+		}
+	}
+
+	for (UINT32 off = 0; off < 16; ++off)
+	{
+		UINT32 dest[1024] = { 0 };
+
+		for (UINT32 len = 1; len < 48 - off; ++len)
+		{
+			status = optimized->set_32u(value, dest + off, len);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return FALSE;
+
+			if (!check32u(dest, len, off, value))
+				return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_set32u_speed(void)
+{
+	UINT32 dest[1024];
+	UINT32 value = 0;
+
+	for (UINT32 x = 0; x < 16; x++)
+	{
+		winpr_RAND(&value, sizeof(value));
+
+		if (!speed_test("set_32u", "", g_Iterations, (speed_test_fkt)generic->set_32u,
+		                (speed_test_fkt)optimized->set_32u, value, dest + x, x))
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_set32s_speed(void)
+{
+	INT32 dest[1024];
+	INT32 value = 0;
+
+	for (UINT32 x = 0; x < 16; x++)
+	{
+		winpr_RAND(&value, sizeof(value));
+
+		if (!speed_test("set_32s", "", g_Iterations, (speed_test_fkt)generic->set_32s,
+		                (speed_test_fkt)optimized->set_32s, value, dest + x, x))
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+int TestPrimitivesSet(int argc, char* argv[])
+{
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+	prim_test_setup(FALSE);
+
+	if (!test_set8u_func())
+		return -1;
+
+	if (!test_set32s_func())
+		return -1;
+
+	if (!test_set32u_func())
+		return -1;
+
+	if (g_TestPrimitivesPerformance)
+	{
+		if (!test_set8u_speed())
+			return -1;
+
+		if (!test_set32s_speed())
+			return -1;
+
+		if (!test_set32u_speed())
+			return -1;
+	}
+
+	return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesShift.c b/libfreerdp/primitives/test/TestPrimitivesShift.c
new file mode 100644
index 0000000..8845838
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesShift.c
@@ -0,0 +1,477 @@
+/* test_shift.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */ + +#include <freerdp/config.h> + +#include <winpr/sysinfo.h> +#include "prim_test.h" + +#define FUNC_TEST_SIZE 65536 + +static BOOL test_lShift_16s_func(void) +{ + pstatus_t status = 0; + INT16 ALIGN(src[FUNC_TEST_SIZE + 3]); + INT16 ALIGN(d1[FUNC_TEST_SIZE + 3]); + UINT32 val = 0; + winpr_RAND(&val, sizeof(val)); + winpr_RAND(src, sizeof(src)); + val = val % 16; + /* Negative tests */ + status = generic->lShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->lShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + /* Aligned */ + status = generic->lShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->lShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->lShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->lShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +static BOOL test_lShift_16u_func(void) +{ + pstatus_t status = 0; + UINT16 ALIGN(src[FUNC_TEST_SIZE + 3]); + UINT16 ALIGN(d1[FUNC_TEST_SIZE + 3]); + UINT32 val = 0; + winpr_RAND(&val, sizeof(val)); + winpr_RAND(src, sizeof(src)); + val = val % 16; + + /* Negative tests */ + status = generic->lShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->lShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + /* Aligned */ + status = generic->lShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->lShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->lShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->lShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +static BOOL test_rShift_16s_func(void) +{ + pstatus_t status = 0; + INT16 ALIGN(src[FUNC_TEST_SIZE + 3]); + INT16 ALIGN(d1[FUNC_TEST_SIZE + 3]); + UINT32 val = 0; + winpr_RAND(&val, sizeof(val)); + winpr_RAND(src, sizeof(src)); + val = val % 16; + + /* Negative Tests */ + status = generic->rShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->rShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + /* Aligned */ + status = generic->rShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->rShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->rShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->rShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +static BOOL test_rShift_16u_func(void) +{ + pstatus_t status = 0; + UINT16 ALIGN(src[FUNC_TEST_SIZE + 3]); + UINT16 ALIGN(d1[FUNC_TEST_SIZE + 3]); + 
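+	/* Same pattern as the other shift tests in this file: a shift count of
+	 * 16 is out of range for 16-bit samples and both implementations must
+	 * reject it, then a valid random count is checked against aligned and
+	 * unaligned destination pointers. */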
UINT32 val = 0; + winpr_RAND(&val, sizeof(val)); + winpr_RAND(src, sizeof(src)); + val = val % 16; + /* Negative tests */ + status = generic->rShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->rShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + /* Aligned */ + status = generic->rShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->rShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->rShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->rShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +static BOOL test_ShiftWrapper_16s_func(void) +{ + pstatus_t status = 0; + INT16 ALIGN(src[FUNC_TEST_SIZE + 3]); + INT16 ALIGN(d1[FUNC_TEST_SIZE + 3]); + UINT32 tmp = 0; + INT32 val = 0; + winpr_RAND(&tmp, sizeof(tmp)); + winpr_RAND(src, sizeof(src)); + val = tmp % 16; + + /* Negative tests */ + status = generic->shiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->shiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + /* Aligned */ + status = generic->shiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->shiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = generic->shiftC_16s(src + 1, -val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->shiftC_16s(src + 1, -val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->shiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->shiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = generic->shiftC_16s(src + 1, -val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->shiftC_16s(src + 1, -val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +static BOOL test_ShiftWrapper_16u_func(void) +{ + pstatus_t status = 0; + UINT16 ALIGN(src[FUNC_TEST_SIZE + 3]); + UINT16 ALIGN(d1[FUNC_TEST_SIZE + 3]); + UINT32 tmp = 0; + INT32 val = 0; + winpr_RAND(&tmp, sizeof(tmp)); + winpr_RAND(src, sizeof(src)); + val = tmp % 16; + + /* Negative */ + status = generic->shiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->shiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE); + + if (status == PRIMITIVES_SUCCESS) + return FALSE; + + /* Aligned */ + status = generic->shiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->shiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = generic->shiftC_16u(src + 1, -val, d1 + 1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->shiftC_16u(src + 1, -val, d1 + 
1, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->shiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->shiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = generic->shiftC_16u(src + 1, -val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->shiftC_16u(src + 1, -val, d1 + 2, FUNC_TEST_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +/* ------------------------------------------------------------------------- */ +static BOOL test_lShift_16s_speed(void) +{ + UINT32 val = 0; + INT16 ALIGN(src[MAX_TEST_SIZE + 1]); + INT16 ALIGN(dst[MAX_TEST_SIZE + 1]); + winpr_RAND(src, sizeof(src)); + winpr_RAND(&val, sizeof(val)); + + val = val % 16; + if (!speed_test("lShift_16s", "aligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16s, + (speed_test_fkt)optimized->lShiftC_16s, src, val, dst, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("lShift_16s", "unaligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16s, + (speed_test_fkt)optimized->lShiftC_16s, src + 1, val, dst, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; +} + +/* ------------------------------------------------------------------------- */ +static BOOL test_lShift_16u_speed(void) +{ + UINT32 val = 0; + UINT16 ALIGN(src[MAX_TEST_SIZE + 1]); + UINT16 ALIGN(dst[MAX_TEST_SIZE + 1]); + winpr_RAND(&val, sizeof(val)); + winpr_RAND(src, sizeof(src)); + + val = val % 16; + if (!speed_test("lShift_16u", "aligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16u, + (speed_test_fkt)optimized->lShiftC_16u, src, val, dst, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("lShift_16u", "unaligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16u, + (speed_test_fkt)optimized->lShiftC_16u, src + 1, val, dst, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; +} + +/* ------------------------------------------------------------------------- */ +static BOOL test_rShift_16s_speed(void) +{ + UINT32 val = 0; + INT16 ALIGN(src[MAX_TEST_SIZE + 1]); + INT16 ALIGN(dst[MAX_TEST_SIZE + 1]); + winpr_RAND(src, sizeof(src)); + winpr_RAND(&val, sizeof(val)); + + val = val % 16; + if (!speed_test("rShift_16s", "aligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16s, + (speed_test_fkt)optimized->rShiftC_16s, src, val, dst, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("rShift_16s", "unaligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16s, + (speed_test_fkt)optimized->rShiftC_16s, src + 1, val, dst, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; +} + +/* ------------------------------------------------------------------------- */ +static BOOL test_rShift_16u_speed(void) +{ + UINT32 val = 0; + UINT16 ALIGN(src[MAX_TEST_SIZE + 1]); + UINT16 ALIGN(dst[MAX_TEST_SIZE + 1]); + winpr_RAND(&val, sizeof(val)); + winpr_RAND(src, sizeof(src)); + + val = val % 16; + if (!speed_test("rShift_16u", "aligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16u, + (speed_test_fkt)optimized->rShiftC_16u, src, val, dst, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("rShift_16u", "unaligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16u, + (speed_test_fkt)optimized->rShiftC_16u, src + 1, val, dst, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; +} + +int TestPrimitivesShift(int argc, char* argv[]) +{ + WINPR_UNUSED(argc); + WINPR_UNUSED(argv); + 
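+	/* prim_test_setup() populates the generic and optimized entry-point
+	 * tables used by every test below; the speed comparisons run only when
+	 * g_TestPrimitivesPerformance is set. */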
prim_test_setup(FALSE); + + if (!test_lShift_16s_func()) + return 1; + + if (g_TestPrimitivesPerformance) + { + if (!test_lShift_16s_speed()) + return 1; + } + + if (!test_lShift_16u_func()) + return 1; + + if (g_TestPrimitivesPerformance) + { + if (!test_lShift_16u_speed()) + return 1; + } + + if (!test_rShift_16s_func()) + return 1; + + if (g_TestPrimitivesPerformance) + { + if (!test_rShift_16s_speed()) + return 1; + } + + if (!test_rShift_16u_func()) + return 1; + + if (g_TestPrimitivesPerformance) + { + if (!test_rShift_16u_speed()) + return 1; + } + + if (!test_ShiftWrapper_16s_func()) + return 1; + + if (!test_ShiftWrapper_16u_func()) + return 1; + + return 0; +} diff --git a/libfreerdp/primitives/test/TestPrimitivesSign.c b/libfreerdp/primitives/test/TestPrimitivesSign.c new file mode 100644 index 0000000..fb9549a --- /dev/null +++ b/libfreerdp/primitives/test/TestPrimitivesSign.c @@ -0,0 +1,93 @@ +/* test_sign.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include <freerdp/config.h> + +#include <winpr/sysinfo.h> +#include "prim_test.h" + +#define TEST_BUFFER_SIZE 65535 + +/* ------------------------------------------------------------------------- */ +static BOOL test_sign16s_func(void) +{ + pstatus_t status = 0; + INT16 ALIGN(src[TEST_BUFFER_SIZE + 16]) = { 0 }; + INT16 ALIGN(d1[TEST_BUFFER_SIZE + 16]) = { 0 }; + INT16 ALIGN(d2[TEST_BUFFER_SIZE + 16]) = { 0 }; + winpr_RAND(src, sizeof(src)); + status = generic->sign_16s(src + 1, d1 + 1, TEST_BUFFER_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->sign_16s(src + 1, d2 + 1, TEST_BUFFER_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + if (memcmp(d1, d2, sizeof(d1)) != 0) + return FALSE; + + status = generic->sign_16s(src + 1, d1 + 2, TEST_BUFFER_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->sign_16s(src + 1, d2 + 2, TEST_BUFFER_SIZE); + + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + if (memcmp(d1, d2, sizeof(d1)) != 0) + return FALSE; + + return TRUE; +} + +static int test_sign16s_speed(void) +{ + INT16 ALIGN(src[MAX_TEST_SIZE + 3]) = { 0 }; + INT16 ALIGN(dst[MAX_TEST_SIZE + 3]) = { 0 }; + winpr_RAND(src, sizeof(src)); + + if (!speed_test("sign16s", "aligned", g_Iterations, (speed_test_fkt)generic->sign_16s, + (speed_test_fkt)optimized->sign_16s, src + 1, dst + 1, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("sign16s", "unaligned", g_Iterations, (speed_test_fkt)generic->sign_16s, + (speed_test_fkt)optimized->sign_16s, src + 1, dst + 2, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; +} + +int TestPrimitivesSign(int argc, char* argv[]) +{ + WINPR_UNUSED(argc); + WINPR_UNUSED(argv); + + prim_test_setup(FALSE); + + if (!test_sign16s_func()) + return 1; + + if (g_TestPrimitivesPerformance) + { + if (!test_sign16s_speed()) + return 1; + } + + return 0; +} diff --git a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c 
b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c new file mode 100644 index 0000000..64e7f91 --- /dev/null +++ b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c @@ -0,0 +1,1835 @@ + +#include "prim_test.h" + +#include <winpr/print.h> +#include <freerdp/codec/color.h> +#include <winpr/wlog.h> +#include <freerdp/utils/profiler.h> + +#include <freerdp/config.h> + +#define TAG __FILE__ + +static const INT16 TEST_Y_COMPONENT[4096] = { + -32, +16, +64, +272, -32, -16, +0, -16, -32, -24, -16, -8, +0, + -24, -48, -72, -96, -90, -84, -78, -72, -98, -124, -150, -176, -192, + -208, -224, -240, -256, -272, -288, -304, -304, -304, -304, -304, -336, -368, + -400, -432, -450, -468, -486, -504, -522, -540, -558, -576, -598, -620, -642, + -664, -686, -708, -730, -752, -768, -784, -800, -816, -816, -816, -816, +68, + +120, +172, +240, +53, +55, +57, +43, +30, +32, +34, +36, +38, +20, + +2, -16, -34, -36, -38, -40, -42, -68, -94, -120, -146, -148, -151, + -186, -220, -227, -233, -240, -247, -254, -261, -268, -275, -302, -329, -356, + -384, -403, -423, -443, -463, -484, -506, -528, -550, -572, -594, -616, -639, + -673, -707, -709, -712, -733, -754, -775, -796, -796, -796, -796, +168, +224, + +281, +209, +138, +126, +115, +103, +92, +88, +84, +80, +76, +64, +52, + +40, +28, +18, +8, -2, -12, -38, -64, -90, -116, -105, -95, -148, + -201, -198, -195, -192, -190, -204, -218, -232, -247, -269, -291, -313, -336, + -357, -379, -400, -422, -447, -473, -498, -524, -546, -569, -591, -614, -660, + -707, -689, -672, -698, -724, -750, -776, -776, -776, -776, +268, +312, +357, + +273, +191, +181, +172, +162, +154, +144, +134, +124, +114, +108, +102, +80, + +58, +56, +54, +52, +50, +24, -2, -44, -86, -61, -38, -93, -149, + -137, -124, -144, -165, -170, -175, -196, -218, -235, -252, -269, -288, -310, + -334, -357, -381, -409, -439, -468, -498, -520, -543, -565, -589, -647, -706, + -668, -632, -663, -694, -725, -756, -756, -756, -756, +368, +401, +434, +339, + +244, +237, +230, +223, +216, +200, +184, +168, +152, +152, +152, +120, +88, + +94, +100, +106, +112, +86, +60, +2, -56, -18, +19, -39, -98, -76, + -55, -97, -140, -136, -133, -161, -190, -202, -215, -227, -240, -265, -290, + -315, -340, -373, -406, -439, -472, -495, -518, -541, -564, -635, -706, -649, + -592, -628, -664, -700, -736, -736, -736, -736, +404, +556, +454, +383, +313, + +531, +239, +282, +326, +304, +282, +260, +238, +246, +254, +118, +238, +196, + +154, +32, -90, -88, -86, +76, +238, +243, +247, +29, -191, -232, -272, + -121, +29, -62, -153, -149, -145, -162, -180, -197, -216, -240, -265, -289, + -315, -345, -376, -406, -438, -446, -456, -497, -539, -595, -653, -502, -608, + -625, -642, -675, -708, -708, -708, -708, +440, +713, +475, +428, +382, +827, + +249, +342, +436, +408, +380, +352, +324, +340, +356, -140, -124, +42, +208, + +214, +220, +250, +280, +406, +532, +504, +476, +352, +229, +125, +22, -146, + -314, -244, -175, -138, -101, -123, -146, -169, -192, -216, -241, -265, -290, + -318, -347, -375, -404, -399, -395, -454, -514, -557, -601, -356, -624, -622, + -620, -650, -680, -680, -680, -680, +604, +677, +495, +457, +419, +770, +354, + +386, +418, +416, +414, +380, +346, +258, -342, -302, -6, +288, +582, +604, + +626, +588, +550, +688, +826, +829, +833, +724, +616, +481, +348, +181, +15, + -139, -292, -175, -56, -83, -112, -139, -168, -192, -216, -240, -265, -291, + -317, -343, -370, -351, -333, -411, -489, -486, -484, -402, -576, -587, -598, + -625, -652, -652, -652, -652, +1280, +1154, +1028, +998, +968, +970, +460, +430, + +400, +424, +448, 
+408, +368, +432, -528, -208, +112, +534, +956, +994, +1032, + +926, +820, +970, +1120, +1155, +1190, +1097, +1004, +839, +674, +509, +344, +223, + +102, +45, -12, -45, -78, -111, -144, -168, -192, -216, -240, -264, -288, + -312, -336, -304, -272, -368, -464, -416, -368, -448, -528, -552, -576, -600, + -624, -624, -624, -624, +770, +671, +573, +554, +536, +629, +467, +464, +462, + +492, +523, +490, +457, +281, -405, -101, +204, +599, +995, +1310, +1370, +1297, + +1225, +1296, +1368, +1432, +1498, +1402, +1308, +1184, +1062, +874, +688, +586, +485, + +303, +123, -82, -32, -76, -122, -174, -226, -199, -171, -193, -216, -238, + -261, -314, -368, -325, -283, -360, -438, -451, -465, -515, -565, -583, -601, + -617, -633, -633, +772, +701, +630, +623, +616, +545, +474, +499, +524, +561, + +599, +572, +546, +131, -283, +6, +296, +665, +1034, +1627, +1708, +1669, +1630, + +1623, +1616, +1711, +1806, +1709, +1612, +1531, +1450, +1241, +1032, +950, +869, +563, + +258, -120, +15, -42, -100, -180, -261, -182, -103, -123, -144, -165, -186, + -325, -464, -283, -102, -305, -508, -455, -402, -478, -554, -566, -578, -610, + -642, -642, +774, +730, +687, +675, +664, +620, +577, +581, +586, +597, +610, + +590, +571, -147, -96, +209, +516, +794, +1073, +1575, +1822, +1976, +1875, +1869, + +1864, +1988, +2114, +2014, +1916, +1876, +1838, +1606, +1376, +1266, +1156, +902, +137, + -61, -3, -120, -238, -122, -7, -69, -130, -164, -200, -219, -239, -271, + -304, -128, -209, -297, -386, -426, -467, -937, -895, -549, -459, -667, -619, + -619, +776, +760, +744, +728, +712, +696, +680, +664, +648, +635, +622, +609, + +596, -425, +90, +413, +736, +924, +1112, +1524, +1936, +2284, +2120, +2116, +2112, + +2267, +2422, +2321, +2220, +2223, +2226, +1973, +1720, +1582, +1444, +1242, +16, -2, + -20, +58, +136, -65, -267, -212, -158, -207, -257, -274, -292, -218, -144, + +26, -316, -290, -264, -142, -20, +2956, +2860, -788, -852, -980, -596, -596, + +826, +807, +789, +770, +752, +749, +747, +744, +742, +677, +613, +516, +421, + -285, +288, +573, +860, +1081, +1303, +1668, +2034, +2313, +2337, +2344, +2352, +2452, + +2554, +2574, +2596, +2506, +2418, +2248, +2080, +1961, +1843, +925, +7, +40, +74, + +748, +654, +453, +251, +48, -154, -107, -61, -111, -161, -28, +104, +45, + -271, -274, -278, -842, +1411, +3007, +3323, +327, -1389, -1197, -493, -493, +876, + +855, +834, +813, +792, +803, +814, +825, +836, +720, +605, +681, +758, +110, + +487, +735, +984, +1239, +1494, +1813, +2132, +2343, +2554, +2573, +2592, +2639, +2686, + +2829, +2972, +2791, +2610, +2525, +2440, +2341, +2243, +608, -2, +83, +169, +1438, + +1172, +970, +768, +565, +363, +249, +135, +52, -30, -95, -160, -193, -226, + -259, -292, +763, -742, +2290, +1738, -1118, -902, -902, -390, -390, +926, +902, + +879, +855, +832, +824, +817, +809, +802, +763, +724, +397, +2375, +970, +589, + +848, +1108, +1396, +1685, +1941, +2198, +2468, +2739, +2785, +2832, +2888, +2946, +3178, + +2900, +3058, +2962, +2848, +2736, +2896, +2546, -364, +309, +205, +871, +1760, +1626, + +1471, +1317, +1145, +975, +844, +714, +599, +485, +351, +216, +146, +75, -355, + +750, +2687, +529, -1067, -615, -835, -799, -847, -383, -383, +976, +950, +924, + +898, +872, +846, +820, +794, +768, +806, +844, +882, +1432, +2598, +692, +962, + +1232, +1554, +1876, +2070, +2264, +2594, +2924, +2998, +3072, +3139, +3206, +3273, +2316, + +3071, +3314, +3173, +3032, +2941, +1826, -57, +108, +73, +1574, +2083, +2080, +1973, + +1866, +1727, +1588, +1441, +1294, +1147, +1000, +796, +592, +484, +376, +828, +256, + 
+772, -248, -72, -408, +984, -184, -536, -376, -376, +1026, +997, +969, +941, + +913, +888, +864, +840, +816, +762, +709, +768, +1339, +2269, +2176, +1411, +1414, + +1677, +1941, +2188, +2436, +2730, +3023, +3157, +3291, +3349, +3409, +3420, +2152, +3000, + +3594, +3403, +3213, +3233, +951, +12, +97, -303, +2883, +2755, +2373, +2312, +2252, + +2143, +2036, +1861, +1687, +1544, +1403, +1254, +1106, +974, +842, +1229, +1105, +21, + +217, +46, -381, +1912, +3181, +2765, +301, -723, +1076, +1045, +1015, +984, +954, + +931, +909, +886, +864, +719, +575, +654, +1246, +1685, +3149, +1604, +1596, +1801, + +2006, +2307, +2609, +2866, +3123, +3316, +3510, +3561, +3613, +3568, +1988, +2931, +3875, + +3634, +3394, +3527, +76, +81, +86, +859, +3168, +2917, +2666, +2652, +2639, +2561, + +2484, +2282, +2081, +1943, +1806, +1713, +1621, +1464, +1308, +1119, +931, +550, +170, + -92, -354, +1560, +3986, +1970, -558, -558, +1126, +1092, +1060, +1027, +995, +973, + +953, +932, +912, +899, +888, -340, +1249, +1756, +2521, +2421, +1810, +2036, +2263, + +2521, +2781, +3066, +3350, +3443, +3537, +3612, +3688, +3476, +2496, +3021, +3803, +3833, + +3863, +2843, +33, +133, -21, +2099, +3197, +3061, +2927, +2944, +2961, +2882, +2804, + +2607, +2410, +2309, +2209, +2139, +2071, +1842, +1614, +1328, +1044, +663, +283, +10, + -263, -488, -201, -201, -457, -457, +1176, +1141, +1106, +1071, +1036, +1017, +998, + +979, +960, +825, +690, +203, +740, +1573, +1894, +3239, +2024, +2272, +2521, +2737, + +2954, +3010, +3067, +3315, +3564, +3664, +3764, +3384, +3004, +3112, +3732, +3776, +3820, + +1905, -10, +187, -128, +3341, +3226, +3207, +3188, +3236, +3284, +3204, +3124, +2932, + +2740, +2676, +2612, +2567, +2522, +2221, +1920, +1539, +1158, +777, +396, +112, -172, + -488, -292, -324, -356, -356, +1194, +1162, +1131, +1099, +1069, +1047, +1026, +972, + +920, +969, +507, +380, +767, +1428, +1834, +2799, +2486, +2347, +2721, +2919, +3118, + +3290, +3462, +3266, +3071, +3157, +3243, +3521, +3800, +3674, +3548, +3710, +3873, +874, + +179, +91, +517, +3439, +3291, +3333, +3377, +3403, +3430, +3361, +3292, +3174, +3057, + +3004, +2951, +2761, +2572, +2222, +1874, +1554, +1235, +883, +533, +220, -93, -470, + -335, -319, -303, -303, +1212, +1184, +1157, +1129, +1102, +1078, +1055, +967, +880, + +1114, +325, +559, +794, +1284, +1775, +2361, +2948, +2423, +2923, +3103, +3283, +3314, + +3346, +3474, +3602, +3674, +3747, +3659, +3572, +3980, +3877, +3901, +3926, -157, +368, + +253, +1674, +3795, +3356, +3461, +3566, +3571, +3577, +3518, +3460, +3417, +3375, +3332, + +3290, +2956, +2623, +2225, +1828, +1570, +1313, +991, +670, +328, -14, -452, -378, + -314, -250, -250, +1230, +1206, +1182, +1158, +1135, +1109, +1083, +1025, +968, +779, + +78, +481, +885, +1284, +1939, +2466, +3250, +2626, +2772, +3157, +3543, +3514, +3486, + +3729, +3717, +3775, +3834, +3780, +3728, +3934, +3885, +3915, +2667, +92, +333, +173, + +2831, +3701, +3549, +3587, +3627, +3642, +3659, +3643, +3628, +3675, +3724, +3436, +3149, + +2847, +2545, +2275, +2006, +1730, +1454, +1114, +775, +388, +1, -402, -293, -309, + -325, -325, +1248, +1228, +1208, +1188, +1168, +1140, +1112, +1084, +1056, +700, +344, + +660, +976, +1284, +2104, +2316, +3040, +2319, +2110, +2189, +2268, +2691, +3114, +3729, + +3832, +3877, +3922, +3903, +3884, +3889, +3894, +3931, +1408, +341, +298, +95, +3988, + +3609, +3742, +3715, +3688, +3715, +3742, +3769, +3796, +3679, +3562, +3285, +3008, +2738, + +2468, +2326, +2184, +1890, +1596, +1238, +880, +448, +16, -352, -208, -304, -400, + -400, +1296, 
+1284, +1272, +1260, +1249, +1165, +1081, +1093, +1106, +232, +382, +677, + +971, +973, +1232, +834, +693, +537, +639, +564, +490, +563, +637, -106, +944, + +2358, +3773, +3795, +4074, +3964, +3855, +4337, +212, +204, +197, +1341, +4023, +3813, + +3860, +3810, +3762, +3766, +3771, +3776, +3781, +3603, +3427, +3201, +2977, +2838, +2699, + +2400, +2101, +1982, +1607, +1280, +954, +545, -120, -321, -266, -314, -362, -362, + +1344, +1340, +1337, +1333, +1330, +1190, +1051, +1103, +1156, +20, +933, +950, +967, + +919, +872, +889, +906, +805, +705, +733, +761, +740, +720, +668, +616, +328, + +40, +1640, +3752, +3784, +3816, +3208, +40, +581, +97, +2589, +4058, +4018, +3979, + +3907, +3836, +3818, +3801, +3784, +3767, +3529, +3292, +3375, +3458, +3706, +3954, +3754, + +3555, +2843, +1619, +1067, +516, +386, -256, -290, -324, -324, -324, -324, +1392, + +1364, +1337, +1309, +1283, +1247, +1212, +968, +982, +1424, +1099, +1079, +1058, +1072, + +1088, +815, +799, +1056, +802, +772, +743, +645, +547, +769, +736, +649, +563, + +332, +102, +1939, +4033, +1982, +444, +332, -36, +4076, +4093, +4047, +4001, +3955, + +3910, +3870, +3830, +3791, +3752, +3806, +3861, +3835, +3811, +3678, +3545, +3380, +3216, + +3639, +3806, +2341, +1134, +1091, +24, -387, -286, -286, -286, -286, +1440, +1389, + +1338, +1287, +1236, +1305, +1374, +1091, +1320, +1037, +1267, +1208, +1150, +715, +281, + +486, +1204, +1564, +901, +1325, +1750, +1830, +1911, +1383, +344, +459, +574, +817, + +548, +351, +666, +757, +336, +340, +856, +4028, +4128, +4076, +4024, +4004, +3984, + +3922, +3861, +3799, +3738, +3828, +3919, +3785, +3652, +3394, +3137, +3007, +2878, +2900, + +2923, +3105, +3800, +1284, +1328, +28, -248, -248, -248, -248, +1456, +1406, +1358, + +1309, +1261, +1209, +1159, +1444, +1218, +1265, +33, -654, -1342, -977, -356, +394, + +1401, +1753, +1338, +1738, +2140, +2575, +3009, +3524, +3784, +2536, +1033, +265, +522, + +440, +615, +629, +388, +403, +2211, +4051, +4099, +4078, +4058, +3990, +3922, +3910, + +3898, +3886, +3875, +3805, +3735, +3553, +3373, +3126, +2879, +2585, +2291, +2026, +1762, + +2649, +3026, +2303, +2092, +665, -250, -250, -250, -250, +1472, +1425, +1379, +1332, + +1286, +1371, +1457, +1030, -932, -1834, -1712, -1237, -763, -621, +33, +815, +1598, + +1943, +1776, +2153, +2531, +2808, +3085, +3362, +3640, +4102, +4052, +3042, +496, +530, + +564, +502, +440, +211, +3055, +3818, +4070, +4081, +4093, +3976, +3860, +3898, +3936, + +3974, +4013, +3783, +3553, +3323, +3094, +2858, +2623, +2420, +2217, +1921, +1626, +915, + +2764, +250, +296, +22, -252, -252, -252, -252, +1488, +1443, +1399, +1371, +1343, + +1308, +1530, -408, -1834, -1589, -1089, -811, -535, -281, +485, +1171, +1859, +2132, + +2150, +2503, +2857, +3105, +3352, +3536, +3720, +3875, +3775, +4298, +4054, +2123, +449, + +502, +556, +546, +26, +2113, +3945, +4115, +4031, +3946, +3862, +3838, +3814, +3982, + +3894, +3488, +3338, +3140, +2943, +2622, +2302, +2030, +1758, +1495, +1234, +1259, +774, + -347, -188, -189, -190, -222, -254, -254, +1504, +1462, +1420, +1410, +1400, +1246, + +1604, -1334, -1712, -1089, -978, -643, -308, +59, +938, +1529, +2120, +2322, +2524, + +2854, +3184, +3402, +3620, +3710, +3800, +3905, +4010, +4019, +4028, +3973, +334, +503, + +672, +627, +582, +409, +236, +2359, +3970, +3917, +3864, +3778, +3692, +3990, +3776, + +3194, +3124, +2958, +2792, +2387, +1982, +1641, +1300, +1071, +842, +69, -192, -176, + -160, -144, -128, -192, -256, -256, +1546, +1496, +1447, +1430, +1413, +1627, +1330, + -2102, -1184, -819, -712, -395, -80, 
+405, +1148, +1713, +2280, +2486, +2692, +2995, + +3297, +3467, +3638, +3712, +3787, +3915, +4045, +3917, +4047, +3097, +357, +655, +699, + +198, +466, +381, +297, +376, +200, +1815, +3431, +3568, +3961, +4114, +3755, +3310, + +3121, +2804, +2487, +2208, +1931, +1189, +447, +37, -116, -254, -136, -111, -86, + -109, -132, -196, -260, -260, +1588, +1531, +1475, +1450, +1426, +1497, +33, -1591, + -1168, -807, -446, -149, +148, +753, +1358, +1899, +2440, +2650, +2861, +3136, +3411, + +3533, +3656, +3715, +3774, +3927, +4080, +3817, +4066, +2223, +380, +553, +214, +3610, + +350, +354, +358, +442, +526, +226, -74, +286, +1158, +1678, +1686, +1634, +1582, + +1114, +646, +239, -168, -31, +107, -228, -51, -65, -80, -46, -12, -74, + -136, -200, -264, -264, +1630, +1565, +1502, +1470, +1439, +1590, -817, -1399, -960, + -633, -308, -14, +280, +875, +1472, +1971, +2472, +2718, +2965, +3229, +3492, +3582, + +3674, +3701, +3729, +3793, +3859, +4147, +4181, +707, +563, +417, +1297, +3917, +4234, + +2198, +163, +267, +372, +348, +325, +108, +147, +186, -31, +38, +107, +96, + +85, +61, +38, -162, -106, -126, +111, +876, -152, -93, -34, -87, -140, + -204, -268, -268, +1672, +1601, +1530, +1491, +1452, +1685, -1666, -1209, -752, -461, + -170, +121, +412, +999, +1586, +2045, +2504, +2787, +3071, +3322, +3574, +3633, +3693, + +3688, +3684, +3661, +3638, +3711, +2760, +473, +746, +283, +2380, +4225, +4022, +4043, + +4064, +2141, +218, +215, +212, +186, +160, +230, +300, +234, +168, +102, +36, + -117, -269, +218, +1218, +2025, +2833, +1048, -224, -140, -56, -100, -144, -208, + -272, -272, +1626, +1607, +1589, +1458, +1585, +692, -1479, -1107, -736, -451, -168, + +115, +400, +805, +1468, +1937, +2408, +2703, +2999, +3327, +3655, +3568, +3482, +3620, + +3759, +3439, +3121, +1601, +851, +819, +533, +437, +3415, +4252, +4066, +4055, +4045, + +4084, +4124, +2995, +1867, +1068, +269, +62, -145, -38, +69, +704, +1339, +2183, + +3028, +2816, +2861, +2953, +2790, -349, +96, -19, -134, -137, -140, -204, -268, + -268, +1580, +1614, +1649, +1427, +1718, -300, -1293, -1006, -720, -443, -166, +111, + +388, +613, +1350, +1831, +2312, +2620, +2928, +3076, +3225, +3249, +3273, +3297, +3322, + +3475, +3628, +3333, +1502, +655, +832, +593, +3938, +4024, +4110, +4068, +4026, +3980, + +3934, +3984, +4034, +3998, +3962, +3990, +4018, +3786, +3554, +3610, +3666, +3459, +3253, + +3111, +2969, +2858, +2236, -210, -96, -154, -212, -174, -136, -200, -264, -264, + +1662, +1653, +1644, +1619, +1851, -988, -1266, -985, -704, -401, -100, +9, +120, + +403, +944, +1579, +2216, +2504, +2793, +2873, +2954, +2976, +2999, +3085, +3173, +3237, + +3303, +3575, +521, +553, +587, +1771, +3981, +4019, +4058, +4032, +4007, +3971, +3936, + +3948, +3961, +3920, +3879, +3806, +3989, +3866, +3743, +3636, +3529, +3375, +3222, +3069, + +2916, +2907, +1362, -119, -64, -113, -162, -147, -132, -196, -260, -260, +1744, + +1692, +1640, +1556, +1472, -1932, -1240, -964, -688, -361, -34, +165, +364, +707, + +1050, +1585, +2120, +2389, +2658, +2671, +2684, +2705, +2726, +2875, +3024, +3001, +2978, + +2283, +564, +965, +342, +2951, +4024, +4015, +4006, +3997, +3988, +3963, +3938, +3913, + +3888, +3842, +3796, +3622, +3960, +3946, +3932, +3662, +3392, +3292, +3192, +3028, +2864, + +2956, +488, -28, -32, -72, -112, -120, -128, -192, -256, -256, +1834, +1635, + +1692, +1718, +208, -1663, -1229, -924, -619, -283, +50, +256, +719, +705, +948, + +1126, +1562, +1845, +2129, +2236, +2344, +2447, +2551, +2654, +2759, +2738, +2719, +1562, + +663, +623, +327, +4207, +3992, +4012, 
+4034, +3990, +3948, +3922, +3898, +3872, +3848, + +3774, +3701, +3484, +3523, +3726, +3929, +3812, +3695, +3604, +3513, +3407, +3300, +3350, + -440, -231, -22, -48, -74, -100, -126, -174, -222, -222, +1924, +1578, +1745, + +1880, -1057, -1394, -1219, -884, -550, -207, +135, +93, +563, +449, +847, +669, + +1004, +1302, +1600, +1802, +2005, +2191, +2377, +2435, +2494, +2477, +2460, +843, +763, + +794, +1337, +3928, +3960, +4011, +4062, +3985, +3908, +3883, +3858, +3833, +3808, +3707, + +3607, +3603, +3599, +3506, +3414, +3706, +3998, +3916, +3835, +3786, +3737, +2208, -345, + +78, -12, -24, -36, -80, -124, -156, -188, -188, +1598, +1585, +1829, +2154, + -1873, -1413, -1208, -556, -417, -514, -102, +440, +214, +191, +681, +435, +702, + +870, +1039, +1224, +1409, +1709, +2010, +2039, +2069, +2086, +1849, +795, +766, +596, + +2474, +3953, +3896, +3928, +3962, +3914, +3868, +3842, +3818, +3792, +3768, +3687, +3608, + +3577, +3546, +3462, +3379, +3312, +3245, +3364, +3484, +3189, +2893, +858, -154, +35, + -34, -48, -62, -108, -154, -154, -154, -154, +1784, +1849, +1915, +892, -1666, + -1176, -1711, -741, -796, -822, +175, -748, +378, +191, +517, +202, +400, +439, + +479, +646, +814, +1229, +1645, +1644, +1644, +1697, +1239, +748, +770, +399, +3613, + +3978, +3832, +3847, +3862, +3845, +3828, +3803, +3778, +3753, +3728, +3669, +3611, +3552, + +3494, +3419, +3345, +3174, +3004, +2813, +2623, +2592, +2562, -237, +37, -9, -56, + -72, -88, -136, -184, -152, -120, -120, +1802, +1900, +2255, -286, -1290, -1129, + -712, -391, -327, -385, -445, +201, -178, +436, +27, -45, -118, +204, +270, + +384, +498, +685, +874, +998, +1123, +1252, +1127, +794, +717, +1161, +3654, +3843, + +3776, +3788, +3802, +3782, +3764, +3616, +3726, +3690, +3656, +3595, +3536, +3476, +3417, + +3341, +3265, +3078, +2891, +2687, +2484, +2617, +1982, -28, +8, +14, +18, -18, + -54, +6, +66, -30, -126, -126, +1820, +1696, +2084, -2232, -1939, -570, -1762, + -1834, -1394, -461, -552, -387, -223, -1110, -462, -37, -124, -31, -451, -134, + +183, +143, +104, +353, +602, +809, +1017, +841, +665, +1924, +3696, +3708, +3720, + +3731, +3742, +3721, +3700, +3431, +3674, +3629, +3584, +3523, +3462, +3401, +3341, +3264, + +3187, +2982, +2778, +2562, +2346, +2386, +891, -77, -20, +36, +92, +36, -20, + -108, -196, -164, -132, -132, +1710, +1955, +1177, -2833, -955, -2075, -2172, -364, + -1885, -1352, -820, -1599, -843, -1249, -887, -652, -674, -554, -435, -636, -325, + -304, -282, -101, -175, +493, +906, +871, +580, +2767, +3674, +3653, +3632, +3656, + +3682, +3626, +3572, +3436, +3558, +3534, +3512, +3449, +3388, +3325, +3264, +3186, +3108, + +2902, +2697, +2500, +2304, +2219, +343, +179, +271, +154, +38, -6, -50, -110, + -170, -154, -138, -138, +1600, +1959, -242, -2667, -2020, -2557, -2582, -1455, +696, + +316, +960, +2052, +2120, +1940, +1760, +1292, +824, -310, -932, -1394, -832, -750, + -668, -298, -440, +434, +796, +902, +496, +3610, +3652, +3598, +3544, +3583, +3622, + +3533, +3444, +3443, +3442, +3441, +3440, +3377, +3314, +3251, +3188, +3109, +3030, +2823, + +2616, +2439, +2262, +2053, -204, +179, +50, +17, -16, -48, -80, -112, -144, + -144, -144, -144, +1956, +1852, -2091, -3025, -1145, +322, +2045, +1672, +1555, +1328, + +1614, +1916, +1706, +1622, +1282, +1502, +1466, +1301, +1393, +940, -792, -1548, -768, + -820, -617, +926, +934, +909, +1397, +3323, +3456, +3446, +3436, +3393, +3351, +3388, + +3426, +3373, +3321, +3444, +3313, +3264, +3217, +3153, +3090, +2997, +2906, +2686, +2467, + +2290, +2115, +1282, -61, +136, +79, +36, -5, 
-37, -69, -101, -133, -133, + -133, -133, +1800, +1746, +669, +1992, +1779, +1665, +1552, +1727, +1390, +1317, +1245, + +1269, +1293, +1560, +1316, +1456, +1084, +1121, +1158, +971, +1297, +726, -869, -1343, + -794, +1419, +1072, +917, +2299, +3036, +3261, +3294, +3328, +3204, +3080, +3244, +3409, + +3305, +3201, +3449, +3186, +3153, +3121, +3056, +2992, +2887, +2783, +2550, +2318, +2143, + +1968, +513, +82, +95, +108, +57, +6, -26, -58, -90, -122, -122, -122, + -122, +1516, +1832, +1636, +1905, +1406, +1344, +1283, +1589, +1641, +1465, +1291, +1277, + +1263, +1386, +1254, +1314, +1118, +1116, +1115, +905, +953, +1160, +1111, +118, -363, + +807, +698, +700, +2240, +3325, +2361, +2934, +3252, +2998, +2745, +2924, +3103, +3155, + +2952, +3277, +3091, +3057, +3024, +2959, +2894, +2776, +2659, +2414, +2169, +2074, +1981, + +255, +65, +68, +73, +44, +17, -15, -47, -79, -111, -111, -111, -111, + +1744, +1662, +1581, +1563, +1546, +1536, +1527, +1453, +1380, +1359, +1339, +1286, +1234, + +1213, +1193, +1172, +1152, +1112, +1073, +1097, +1122, +826, +1043, +1067, +1092, +964, + +837, +741, +2182, +2078, +2487, +2831, +2664, +2793, +2923, +2860, +2798, +3007, +2705, + +3106, +2996, +2962, +2928, +2862, +2796, +2666, +2536, +2278, +2020, +1751, +1482, -259, + +48, +43, +38, +33, +28, -4, -36, -68, -100, -100, -100, -100, +1684, + +1640, +1596, +1584, +1573, +1543, +1513, +1451, +1391, +1359, +1329, +1282, +1236, +1213, + +1190, +1168, +1146, +1107, +1069, +1063, +1058, +920, +1038, +996, +955, +924, +894, + +880, +1635, +1679, +2235, +2439, +2132, +2451, +2771, +2580, +2644, +2713, +2528, +2742, + +2701, +2828, +2699, +2570, +2442, +2383, +2324, +2105, +1887, +1732, +811, -79, +55, + +62, +71, +46, +23, -7, -37, -67, -97, -113, -129, -129, +1624, +1618, + +1612, +1606, +1601, +1551, +1501, +1451, +1402, +1361, +1320, +1279, +1239, +1214, +1189, + +1164, +1140, +1103, +1067, +1031, +995, +1014, +1034, +926, +818, +885, +953, +1021, + +1089, +1024, +1472, +2048, +2112, +2110, +2109, +2044, +2491, +2421, +2352, +2379, +2406, + +2694, +2471, +2279, +2088, +2100, +2113, +1933, +1754, +1715, +140, +101, +62, +83, + +104, +61, +18, -10, -38, -66, -94, -126, -158, -158, +1724, +1788, +1852, + +1692, +1532, +1494, +1456, +1418, +1381, +1345, +1311, +1275, +1241, +1214, +1187, +1160, + +1134, +1098, +1064, +1029, +995, +996, +998, +935, +873, +877, +883, +792, +702, + +657, +1125, +1832, +2284, +1193, +1638, +1796, +2209, +2320, +2176, +2239, +2047, +2560, + +2562, +1891, +1734, +1673, +1613, +1744, +1621, +1152, -83, -8, +69, +70, +73, + +42, +13, -13, -39, -65, -91, -139, -187, -187, +1824, +1702, +1580, +1522, + +1464, +1438, +1412, +1386, +1360, +1331, +1302, +1273, +1244, +1215, +1186, +1157, +1128, + +1095, +1062, +1029, +996, +979, +962, +945, +928, +871, +814, +821, +828, +803, + +1290, +1617, +1944, +2068, +1168, +1292, +1416, +1708, +1488, +1844, +1688, +2171, +2142, + +1249, +1380, +1503, +1626, +1045, -48, +79, +206, +141, +76, +59, +42, +25, + +8, -16, -40, -64, -88, -152, -216, -216, +1688, +1615, +1542, +1501, +1460, + +1429, +1398, +1367, +1336, +1309, +1284, +1257, +1232, +1205, +1180, +1153, +1128, +1092, + +1058, +1022, +988, +968, +950, +930, +912, +861, +812, +793, +776, +595, +672, + +971, +1272, +330, +924, +1038, +1152, +1298, +1444, +1910, +1608, +1531, +1200, +515, + +344, +259, +176, +251, +72, +122, +174, +128, +84, +64, +46, +26, +8, + -18, -44, -70, -96, -144, -192, -192, +1552, +1528, +1504, +1480, +1456, +1420, + +1384, +1348, +1312, +1289, +1266, +1243, +1220, +1197, +1174, 
+1151, +1128, +1091, +1054, + +1017, +980, +959, +938, +917, +896, +853, +810, +767, +724, +645, +566, +583, + +600, +640, +680, +528, +376, +376, +888, +1464, +1016, +637, +258, +295, +332, + +297, +262, +227, +192, +167, +142, +117, +92, +71, +50, +29, +8, -20, + -48, -76, -104, -136, -168, -168, +1544, +1521, +1498, +1475, +1452, +1411, +1370, + +1329, +1288, +1267, +1248, +1227, +1208, +1187, +1168, +1147, +1128, +1088, +1050, +1010, + +972, +948, +926, +902, +880, +843, +808, +771, +736, +677, +620, +609, +600, + +614, +628, +546, +464, +238, +2060, +1690, +1576, +1709, +308, +313, +320, +285, + +252, +217, +184, +162, +142, +120, +100, +76, +54, +30, +8, -22, -52, + -82, -112, -128, -144, -144, +1536, +1514, +1492, +1470, +1448, +1402, +1356, +1310, + +1264, +1247, +1230, +1213, +1196, +1179, +1162, +1145, +1128, +1087, +1046, +1005, +964, + +939, +914, +889, +864, +835, +806, +777, +748, +711, +674, +637, +600, +588, + +576, +564, +552, +612, +160, +1916, +1112, +223, +358, +333, +308, +275, +242, + +209, +176, +159, +142, +125, +108, +83, +58, +33, +8, -24, -56, -88, + -120, -120, -120, -120, +1536, +1514, +1492, +1470, +1448, +1402, +1356, +1310, +1264, + +1246, +1230, +1212, +1196, +1178, +1162, +1144, +1128, +1086, +1046, +1004, +964, +938, + +914, +888, +864, +834, +806, +776, +748, +710, +674, +636, +600, +588, +576, + +564, +552, +644, +480, +108, +504, +158, +326, +316, +308, +274, +242, +208, + +176, +158, +142, +124, +108, +82, +58, +32, +8, -24, -56, -88, -120, + -120, -120, -120, +1536, +1514, +1492, +1470, +1448, +1402, +1356, +1310, +1264, +1247, + +1230, +1213, +1196, +1179, +1162, +1145, +1128, +1087, +1046, +1005, +964, +939, +914, + +889, +864, +835, +806, +777, +748, +711, +674, +637, +600, +588, +576, +564, + +552, +420, +288, +348, +408, +351, +294, +301, +308, +275, +242, +209, +176, + +159, +142, +125, +108, +83, +58, +33, +8, -24, -56, -88, -120, -120, + -120, -120, +1536, +1514, +1492, +1470, +1448, +1402, +1356, +1310, +1264, +1246, +1230, + +1212, +1196, +1178, +1162, +1144, +1128, +1086, +1046, +1004, +964, +938, +914, +888, + +864, +834, +806, +776, +748, +710, +674, +636, +600, +588, +576, +564, +552, + +420, +288, +348, +408, +350, +294, +300, +308, +274, +242, +208, +176, +158, + +142, +124, +108, +82, +58, +32, +8, -24, -56, -88, -120, -120, -120, + -120 +}; + +static const INT16 TEST_CB_COMPONENT[4096] = { + +1728, +1730, +1732, +1734, +1736, +1738, +1740, +1742, +1744, +1740, +1736, +1732, +1728, + +1796, +1864, +1804, +1744, +1754, +1764, +1774, +1784, +1794, +1804, +1814, +1824, +1774, + +1724, +1802, +1880, +1814, +1748, +1810, +1872, +1878, +1884, +1890, +1896, +1910, +1924, + +1938, +1952, +1938, +1924, +1910, +1896, +1914, +1932, +1950, +1968, +1974, +1980, +1986, + +1992, +1998, +2004, +2010, +2016, +2016, +2016, +2016, +2016, +2016, +2016, +2016, +1710, + +1697, +1684, +1704, +1723, +1726, +1730, +1733, +1737, +1738, +1740, +1741, +1743, +1758, + +1774, +1757, +1741, +1762, +1783, +1788, +1793, +1774, +1755, +1784, +1813, +1817, +1821, + +1825, +1829, +1857, +1885, +1881, +1877, +1849, +1821, +1857, +1894, +1904, +1914, +1924, + +1935, +1928, +1922, +1915, +1909, +1922, +1936, +1949, +1963, +1974, +1985, +1997, +2008, + +2009, +2011, +2012, +2014, +2017, +2020, +2023, +2026, +2026, +2026, +2026, +1692, +1664, + +1637, +1674, +1711, +1715, +1720, +1725, +1730, +1737, +1744, +1751, +1758, +1721, +1684, + +1711, +1738, +1770, +1802, +1802, +1802, +1754, +1706, +1754, +1802, +1860, +1918, +1848, + +1778, +1900, +2022, +1952, +1882, +1820, 
+1759, +1825, +1892, +1898, +1905, +1911, +1918, + +1919, +1920, +1921, +1922, +1931, +1940, +1949, +1958, +1974, +1991, +2008, +2025, +2021, + +2018, +2015, +2012, +2018, +2024, +2030, +2036, +2036, +2036, +2036, +1674, +1631, +1589, + +1644, +1698, +1703, +1710, +1716, +1723, +1735, +1748, +1760, +1773, +1763, +1754, +1760, + +1767, +1794, +1821, +1800, +1779, +1830, +1881, +1900, +1919, +2047, +2175, +2015, +1855, + +1879, +1903, +1927, +1951, +1759, +1824, +1856, +1890, +1892, +1895, +1897, +1901, +1909, + +1918, +1926, +1935, +1939, +1944, +1948, +1953, +1974, +1996, +2019, +2041, +2032, +2025, + +2017, +2010, +2019, +2028, +2037, +2046, +2046, +2046, +2046, +1656, +1599, +1543, +1614, + +1686, +1693, +1701, +1708, +1716, +1734, +1752, +1770, +1788, +1806, +1824, +1810, +1796, + +1818, +1840, +2054, +2268, +1650, +1032, +510, -12, -70, -128, +390, +908, +1602, + +2296, +2158, +2020, +1699, +1890, +1889, +1888, +1887, +1886, +1885, +1884, +1900, +1916, + +1932, +1948, +1948, +1948, +1948, +1948, +1975, +2003, +2030, +2058, +2045, +2033, +2020, + +2008, +2020, +2032, +2044, +2056, +2056, +2056, +2056, +1590, +1570, +1551, +1612, +1673, + +1579, +1742, +1713, +1685, +1672, +1660, +1711, +1763, +1694, +1626, +1941, +2001, +2060, + +583, -654, -1891, -2046, -2201, -2084, -1967, -2049, -2131, -2053, -1975, -1751, -1527, + +41, +1609, +2374, +1859, +2000, +1886, +1898, +1912, +1909, +1907, +1900, +1894, +1919, + +1945, +1944, +1944, +1943, +1943, +1967, +1992, +2017, +2042, +2032, +2023, +2014, +2006, + +2017, +2028, +2039, +2050, +2050, +2050, +2050, +1524, +1542, +1560, +1610, +1661, +1467, + +1785, +1719, +1654, +1611, +1568, +1653, +1738, +1839, +1940, +793, -866, -2050, -2210, + -2082, -1954, -1902, -1850, -1862, -1874, -1980, -2086, -1936, -1786, -1776, -1766, -1820, + -1874, -534, +1829, +2112, +1884, +1911, +1939, +1934, +1930, +1901, +1872, +1907, +1942, + +1941, +1940, +1939, +1938, +1960, +1982, +2004, +2027, +2021, +2015, +2009, +2004, +2014, + +2024, +2034, +2044, +2044, +2044, +2044, +1586, +1641, +1697, +1704, +1712, +1577, +1699, + +1660, +1623, +1613, +1604, +1642, +1681, +1791, -402, -2036, -1877, -2144, -1899, -1942, + -1985, -1918, -1851, -1880, -1909, -1959, -2009, -1931, -1853, -1801, -1749, -1617, -1485, + -1939, -1882, +96, +2074, +1971, +1869, +1895, +1921, +1885, +1850, +1894, +1939, +1937, + +1936, +1934, +1933, +1952, +1972, +1991, +2011, +2008, +2006, +2003, +2002, +2011, +2020, + +2029, +2038, +2038, +2038, +2038, +1136, +1229, +1322, +1287, +1252, +1433, +1614, +1603, + +1592, +1616, +1640, +1632, +1624, +2256, -1720, -1792, -1864, -1982, -2100, -2058, -2016, + -1934, -1852, -1898, -1944, -1938, -1932, -1926, -1920, -1826, -1732, -1670, -1608, -1552, + -1496, -1664, -1320, +2288, +1800, +1856, +1912, +1870, +1828, +1882, +1936, +1934, +1932, + +1930, +1928, +1945, +1962, +1979, +1996, +1997, +1998, +1999, +2000, +2008, +2016, +2024, + +2032, +2032, +2032, +2032, +1552, +1624, +1698, +1674, +1652, +1644, +1638, +1614, +1592, + +1611, +1630, +1681, +1733, +1146, -2000, -1787, -1830, -1924, -2019, -2049, -2080, -1986, + -1893, -1895, -1898, -1896, -1894, -1860, -1827, -1779, -1731, -1667, -1604, -1615, -1626, + -1878, -594, +2063, +1903, +2016, +1873, +2132, +1880, +1884, +1888, +1921, +1955, +1941, + +1927, +1925, +1925, +1955, +1987, +2005, +2025, +2043, +2063, +1995, +1927, +2099, +2015, + +2095, +2175, +2175, +1456, +1509, +1562, +1551, +1540, +1601, +1662, +1627, +1592, +1606, + +1621, +1731, +1842, +37, -2281, -1782, -1796, -1867, -1938, -2041, -2144, -2039, 
-1934, + -1893, -1852, -1854, -1857, -1795, -1734, -1732, -1731, -1665, -1600, -1678, -1757, -1836, + +645, +2094, +2007, +1920, +1322, +2139, +1933, +1886, +1840, +1909, +1979, +1952, +1926, + +1907, +1888, +1933, +1978, +2015, +2052, +2089, +2126, +1982, +1838, +2174, +1998, +2158, + +2318, +2318, +1488, +1520, +1554, +1554, +1556, +1588, +1622, +1606, +1592, +1569, +1547, + +1700, +1855, -993, -2049, -1825, -1858, -1905, -1953, -2016, -2080, -1995, -1911, -1858, + -1806, -1812, -1819, -1729, -1641, -1685, -1730, -1678, -1628, -1677, -1727, -2194, +1947, + +2125, +2046, +945, -2205, +114, +2177, +2144, +1856, +1912, +1970, +1963, +1957, +1935, + +1915, +1925, +1937, +1991, +2047, +2181, +2061, +2337, +2613, +1817, +2301, +2157, +2269, + +2397, +1520, +1533, +1546, +1559, +1572, +1577, +1582, +1587, +1592, +1533, +1474, +1671, + +1868, -2023, -1818, -1869, -1920, -1944, -1968, -1992, -2016, -1952, -1888, -1824, -1760, + -1771, -1782, -1665, -1548, -1639, -1730, -1693, -1656, -1677, -1699, -1017, +2226, +1644, + +2087, -286, -2148, -2167, -1674, +611, +2384, +2173, +1962, +1975, +1988, +1965, +1942, + +1919, +1896, +1969, +2042, +2019, +1484, -1916, -1220, +2484, +1068, -916, +1708, +1964, + +1504, +1514, +1526, +1536, +1548, +1550, +1554, +1556, +1560, +1581, +1604, +1786, +689, + -2138, -1894, -1905, -1918, -1926, -1935, -1943, -1952, -1878, -1805, -1731, -1658, -1626, + -1596, -1549, -1503, -1507, -1513, -1518, -1524, -1526, -1785, +148, +2080, +1995, +2422, + -2094, -2003, -2033, -1809, -1665, -1776, -189, +1398, +2536, +2139, +2122, +2105, +2327, + +2295, +2204, +2113, +2870, -213, -1669, -1077, -1237, -1653, -1589, +2059, +1931, +1488, + +1497, +1506, +1515, +1524, +1525, +1526, +1527, +1528, +1631, +1735, +1902, -490, -2254, + -1971, -1943, -1916, -1909, -1902, -1895, -1888, -1805, -1722, -1639, -1556, -1483, -1411, + -1434, -1458, -1377, -1297, -1344, -1392, -1376, -1872, +1312, +1935, +1834, +1734, -2622, + -2370, -2157, -1945, -1892, -1840, -2039, -2239, -2022, -782, -281, +220, +433, +134, + -377, -888, -1655, -1398, -1166, -934, -1374, -1302, -726, +2410, +1898, +1472, +1478, + +1486, +1492, +1500, +1498, +1498, +1496, +1496, +1600, +1705, +1666, -933, -1474, -2015, + -1964, -1914, -1891, -1869, -1846, -1824, -1731, -1639, -1546, -1454, -1387, -1321, -1191, + -1317, -1150, -1240, -1250, -1260, -1545, -1575, +2459, +1885, +2057, +182, -2429, -2225, + -2088, -1952, -1928, -1904, -1905, -1907, -2149, -1879, -1835, -1793, -1670, -1803, -1645, + -1489, -1491, -1239, -1335, -1431, -1335, -1495, +681, +2345, +2089, +1456, +1461, +1466, + +1471, +1476, +1473, +1470, +1467, +1464, +1570, +1676, +1174, -1888, -950, -2060, -1986, + -1912, -1874, -1836, -1798, -1760, -1658, -1556, -1454, -1352, -1292, -1232, -1204, -1688, + -1180, -1184, -1156, -1128, -1203, -254, +2071, +1836, +2281, -1370, -2237, -2080, -2020, + -1960, -1964, -1968, -2028, -2088, -2020, -1952, -1855, -1758, -1725, -1692, -1635, -1578, + -1329, -1592, -1504, -1416, -1040, -1688, +2088, +2280, +2280, +1428, +1438, +1450, +1460, + +1472, +1463, +1454, +1493, +1533, +1512, +1748, -160, -2068, -1346, -1137, -1775, -1902, + -1848, -1794, -1708, -1622, -1544, -1466, -1356, -1247, -1198, -1149, -1196, -1755, -1246, + -993, -1012, -1032, -1202, +930, +2023, +1837, +2238, -2480, -2286, -1838, -1799, -1761, + -1835, -1909, -1954, -2000, -1982, -1964, -1908, -1853, -1829, -1807, -1749, -1692, -1538, + -1642, -1526, -1410, -638, -122, +774, +1926, +1926, +1400, +1417, +1434, +1451, +1469, + +1454, +1439, +1520, +1602, +1455, +1820, 
-1239, -1737, -1743, -726, -1821, -1892, -1822, + -1752, -1618, -1485, -1431, -1377, -1259, -1142, -1104, -1066, -1188, -1823, -1313, -803, + -869, -936, -1203, +2115, +1976, +1838, +916, -2055, -1569, -1596, -1579, -1563, -1706, + -1850, -1881, -1913, -1944, -1976, -1962, -1949, -1935, -1922, -1864, -1807, -1749, -1692, + -1548, -1404, -1004, -92, +996, +2084, +2084, +1372, +1394, +1418, +1441, +1465, +1444, + +1423, +1483, +1543, +1765, +1732, -2204, -1533, -1611, -1179, -1274, -1882, -1764, -1646, + -1560, -1475, -1301, -1127, -1113, -1101, -994, -887, -1052, -1730, -1395, -804, -709, + -872, -306, +2051, +1929, +2063, -151, -1597, -1347, -1354, -1326, -1300, -1417, -1535, + -1599, -1665, -1730, -1796, -1824, -1852, -1880, -1909, -1883, -1857, -1767, -1678, -1570, + -1462, -1434, +1154, +2402, +1858, +1858, +1344, +1373, +1403, +1432, +1462, +1435, +1409, + +1446, +1484, +1564, +621, -1890, -1842, -1737, -1633, -728, -1872, -1706, -1541, -1503, + -1466, -1428, -1391, -1225, -1060, -884, -709, -917, -1638, -1478, -807, -551, -808, + +590, +1988, +1882, +2288, -1218, -1140, -1126, -1112, -1075, -1038, -1129, -1220, -1319, + -1418, -1517, -1616, -1686, -1756, -1826, -1896, -1902, -1908, -1786, -1664, -1592, -1520, + -1864, +2400, +2016, +2144, +2144, +1348, +1372, +1398, +1424, +1450, +1463, +1477, +1491, + +1505, +1729, -607, -1838, -1790, -1735, -1681, -1003, -1350, -1710, -1558, -1519, -1480, + -1382, -1285, -1379, -1475, -1208, -941, -611, -793, -796, -800, -611, -680, +1364, + +1872, +1932, +1481, -1150, -966, -926, -886, -868, -851, -929, -1009, -1061, -1114, + -1230, -1348, -1521, -1695, -1805, -1915, -1900, -1886, -1792, -1698, -1604, -1766, -744, + +2326, +2134, +2198, +2198, +1352, +1373, +1395, +1417, +1439, +1492, +1546, +1536, +1526, + +1894, -1835, -1787, -1739, -1735, -1731, -1279, -828, -1714, -1577, -1536, -1495, -1337, + -1180, -1023, -866, -764, -663, -562, -973, -371, -282, -417, -552, +2138, +1757, + +1983, +674, -1083, -793, -726, -660, -662, -665, -731, -798, -804, -811, -945, + -1080, -1357, -1635, -1784, -1934, -1899, -1865, -1798, -1732, -1616, -2012, +376, +2252, + +2252, +2252, +2252, +1356, +1373, +1391, +1409, +1427, +1425, +1423, +1501, +1579, +907, + -1814, -1702, -1847, -1909, -1716, -1634, -786, -1686, -1819, -1712, -1605, -1371, -1139, + -921, -705, -656, -608, -384, -416, -233, -308, -477, +376, +1968, +1769, +2033, + -5, -839, -651, -606, -562, -584, -606, -660, -715, -739, -763, -963, -1164, + -1432, -1702, -1843, -1985, -1977, -1971, -1884, -1798, -2012, -2226, +2152, +2178, +2194, + +2210, +2210, +1360, +1374, +1388, +1402, +1416, +1358, +1300, +1466, +1632, -81, -1794, + -1619, -1956, -2085, -1702, -1991, -744, -891, -526, -353, -180, -383, -586, -821, + -1056, -805, -554, -463, -372, -353, -334, -539, +1304, +1799, +1782, +2085, -684, + -597, -510, -487, -464, -506, -548, -590, -632, -674, -716, -982, -1248, -1509, + -1770, -1903, -2036, -2057, -2078, -1971, -1864, -1896, -1416, +2392, +2104, +2136, +2168, + +2168, +1346, +1358, +1371, +1383, +1396, +1395, +1393, +1552, +1711, -1177, -1762, -2203, + -1364, -465, +690, +1942, +1913, +1747, +1837, +1816, +1794, +1889, +1983, +1774, +1564, + +548, -468, -299, -386, -391, -398, -147, +1895, +1920, +1946, +1284, -401, -397, + -393, -421, -450, -478, -507, -568, -629, -722, -815, -1068, -1321, -1697, -2074, + -2082, -2091, -2129, -2168, -2030, -1894, -2028, +142, +2280, +2114, +2082, +2050, +2050, + +1332, +1343, +1354, +1365, +1377, +1432, +1487, +1382, +1278, -1763, -195, +1308, +1788, + +1667, +1547, 
+1522, +1498, +1569, +1641, +1681, +1721, +1600, +1480, +1552, +1624, +1901, + +2179, +1145, -401, -431, -462, -12, +1974, +1786, +2111, +484, -119, -198, -277, + -356, -436, -451, -467, -547, -627, -770, -914, -898, -882, -606, -330, -470, + -611, -1435, -2259, -2091, -1924, -2160, +1700, +2168, +2124, +2028, +1932, +1932, +1318, + +1327, +1337, +1346, +1357, +1405, +1452, +1420, +1389, +1381, +1629, +1748, +1356, +1495, + +1635, +1631, +1627, +1551, +1732, +1689, +1647, +1728, +1809, +1730, +1652, +1686, +1721, + +1948, +1921, +874, -430, +363, +1925, +1764, +1859, +148, -28, -95, -160, -291, + -422, -423, -426, -557, -688, -370, -309, -280, -251, -570, -890, -858, -826, + -563, -301, -1079, -1858, -1636, +2170, +2296, +2166, +2118, +2070, +2070, +1304, +1312, + +1321, +1329, +1338, +1378, +1419, +1459, +1500, +1452, +1404, +1420, +1436, +1580, +1724, + +1484, +1244, +1022, +1313, +1187, +1062, +1088, +1115, +1397, +1680, +1728, +1777, +1729, + +1682, +1922, +1651, +1763, +1876, +1742, +1609, -189, +62, +8, -45, -226, -408, + -397, -387, -568, -750, -227, -217, -430, -644, -1047, -1451, -1502, -1554, -1229, + -905, -580, -256, -856, +1616, +1912, +2208, +2208, +2208, +2208, +1290, +1304, +1319, + +1334, +1350, +1377, +1404, +1271, +1395, +1525, +1655, +1769, +1884, +1802, +1720, +1430, + +1141, +1026, +1168, +1037, +908, +700, +491, +331, +172, +873, +1575, +1524, +1731, + +1991, +1738, +1774, +1811, +1914, +993, -119, +48, -74, -196, -271, -346, -407, + -470, -324, -179, -213, -503, -810, -1117, -1273, -1430, -1636, -1841, -1823, -1551, + -1246, -686, +1194, +1026, +1610, +2194, +2194, +2194, +2194, +1276, +1297, +1319, +1341, + +1363, +1376, +1390, +1340, +1802, +1854, +1907, +1863, +1820, +1768, +1717, +1377, +1038, + +1031, +1024, +889, +755, +568, +381, +290, +200, +19, -162, +553, +1781, +2060, + +1827, +1786, +1746, +2086, +378, -50, +35, -156, -348, -316, -284, -419, -554, + -337, -121, -456, -791, -934, -1078, -1244, -1411, -1514, -1617, -1907, -1686, -1657, + -1116, +1964, +1972, +2076, +2180, +2180, +2180, +2180, +1262, +1289, +1318, +1346, +1375, + +1359, +1344, +1632, +1921, +1927, +1934, +1876, +1820, +1702, +1585, +1259, +935, +907, + +880, +724, +569, +436, +302, +217, +132, +44, -43, -99, +102, +801, +2011, + +1878, +1745, +1426, +2131, +916, -43, -191, -340, -393, -446, -461, -478, -237, + -254, -522, -790, -962, -1135, -1519, -1647, -1760, -1872, -1446, -2045, -1827, -1354, + +2254, +2278, +2222, +2166, +2166, +2166, +2166, +1248, +1283, +1318, +1353, +1388, +1343, + +1298, +1925, +2040, +2001, +1962, +1891, +1820, +1637, +1454, +1143, +832, +784, +736, + +560, +384, +304, +224, +144, +64, +70, +76, +18, -40, +54, +1684, +1714, + +1744, +1790, +1836, +1882, +1928, +798, -332, -470, -608, -505, -402, -139, -388, + -589, -790, -991, -1192, -1794, -1884, -2006, -2128, -2266, -868, +818, +2504, +2288, + +2072, +2112, +2152, +2152, +2152, +2152, +1238, +1263, +1290, +1332, +1375, +1301, +1484, + +2002, +2009, +1973, +1939, +1871, +1805, +1608, +1411, +1118, +826, +751, +676, +505, + +334, +273, +212, +151, +91, +69, +48, +11, -26, +482, +1758, +1771, +1784, + +2033, +1771, +1860, +1950, +1989, +2029, +884, -260, -1156, -261, -309, -614, -922, + -975, -1411, -1848, -2062, -2019, -697, +626, +2060, +2471, +2273, +2076, +2051, +2026, + +2081, +2136, +2136, +2136, +2136, +1228, +1245, +1263, +1313, +1363, +1260, +1670, +2080, + +1978, +1947, +1916, +1853, +1791, +1580, +1369, +1094, +820, +718, +616, +450, +285, + +243, +201, +159, +118, +69, +20, +4, -13, +910, +1833, +1828, 
+1824, +229, + +1706, +1839, +1972, +1901, +1830, +1983, +2136, +2032, +1416, +1056, +696, +280, +376, + +728, +1080, +1767, +2454, +2405, +2356, +2035, +2226, +2193, +2160, +2070, +1980, +2050, + +2120, +2120, +2120, +2120, +1218, +1226, +1235, +1292, +1350, +1235, +1888, +2061, +1979, + +1935, +1893, +1834, +1776, +1551, +1326, +1070, +814, +685, +556, +395, +235, +212, + +189, +166, +145, +116, +88, -68, +33, +1306, +1811, +1949, +1576, -200, -183, + +905, +1994, +1956, +1919, +1881, +1844, +2004, +1909, +2005, +2102, +2042, +2239, +2195, + +2152, +2043, +1935, +2370, +2038, +2697, +1821, +368, +2244, +2121, +1998, +2051, +2104, + +2104, +2104, +2104, +1208, +1208, +1209, +1273, +1338, +1210, +2107, +2043, +1980, +1925, + +1871, +1816, +1762, +1523, +1285, +1046, +808, +652, +497, +341, +186, +182, +179, + +175, +172, +164, +157, +117, +590, +1958, +1791, +1815, +816, +140, -24, -28, + -32, +988, +2008, +2036, +2064, +1977, +1890, +1931, +1972, +2013, +2054, +2127, +2200, + +2320, +2440, +2080, +184, -1760, -3192, +336, +2328, +2172, +2016, +2052, +2088, +2088, + +2088, +2088, +1222, +1215, +1209, +1266, +1325, +1459, +2104, +2046, +1989, +1945, +1903, + +1861, +1819, +1612, +1406, +1136, +866, +715, +564, +446, +328, +295, +263, +230, + +199, +481, +764, +711, +1427, +2086, +1721, +1692, +128, -37, +55, -14, -82, + -108, -135, +335, +804, +1293, +1783, +2272, +2250, +2197, +1889, +1356, +568, -763, + -2095, -3010, -2646, -2931, -2705, +2305, +2196, +2159, +2122, +2117, +2112, +2112, +2112, + +2112, +1236, +1223, +1210, +1261, +1313, +1708, +2103, +2050, +1998, +1967, +1937, +1907, + +1877, +1702, +1528, +1226, +924, +778, +633, +552, +471, +409, +348, +287, +226, + +287, +349, +283, +1241, +1702, +1652, +1826, -48, +43, +134, +1, -132, -181, + -230, -343, -456, -670, -884, -202, -544, -946, -1860, -1718, -2088, -2311, -2534, + -2469, -2404, -2311, -1706, +2483, +2064, +2146, +2228, +2182, +2136, +2136, +2136, +2136, + +1250, +1230, +1211, +1255, +1300, +1957, +2101, +2054, +2007, +1956, +1906, +1856, +1806, + +1696, +1586, +1284, +982, +841, +701, +657, +613, +554, +497, +438, +381, +412, + +445, +717, +1758, +1782, +1807, +1095, -128, -70, -11, -97, -182, -253, -325, + -428, -532, -761, -991, -580, -170, -1033, -873, -1976, -1800, -2018, -2237, -2343, + -2450, -2650, -35, +2308, +2092, +2117, +2142, +2151, +2160, +2160, +2160, +2160, +1264, + +1238, +1212, +1250, +1288, +2206, +2100, +2058, +2016, +1946, +1876, +1806, +1736, +1690, + +1644, +1342, +1040, +905, +770, +763, +756, +701, +646, +591, +536, +539, +542, + +897, +1764, +1607, +1962, +365, -208, -182, -156, -194, -232, -326, -420, -514, + -608, -853, -1098, -1471, -820, -97, -910, -955, -2024, -2238, -2452, -2474, -2496, + -2990, +1636, +2134, +2120, +2088, +2056, +2120, +2184, +2184, +2184, +2184, +1198, +1191, + +1185, +1227, +1525, +2065, +2093, +2009, +1925, +1887, +1850, +1781, +1712, +1682, +1653, + +1464, +1275, +1130, +986, +937, +889, +840, +792, +743, +696, +684, +674, +1335, + +1741, +1839, +1939, +54, -294, -295, -297, -298, -300, -414, -527, -641, -755, + -947, -1140, -1732, -1813, -733, -166, -1038, -887, -1234, -1581, -1609, -1636, -1158, + +2392, +2279, +2166, +2119, +2072, +2121, +2170, +2170, +2170, +2170, +1132, +1145, +1159, + +1205, +1763, +1924, +2086, +1960, +1834, +1829, +1825, +1756, +1688, +1675, +1663, +1586, + +1510, +1356, +1202, +1112, +1023, +981, +939, +897, +856, +831, +807, +1774, +1718, + +1817, +1405, -512, -380, -409, -438, -403, -369, -502, -635, -768, -902, -1042, + -1182, -1482, -1782, 
-2138, -1982, -610, -262, -486, -711, -744, -777, +162, +2125, + +1912, +2212, +2150, +2088, +2122, +2156, +2156, +2156, +2156, +1194, +1146, +1100, +1182, + +1776, +1927, +2079, +1863, +1903, +1978, +1799, +1843, +1632, +1619, +1608, +1612, +1617, + +1517, +1418, +1351, +1284, +1216, +1149, +1098, +1048, +945, +1099, +1781, +1695, +1954, + +422, -566, -530, -554, -579, -571, -565, -686, -806, -927, -1049, -1232, -1416, + -1679, -1943, -2342, -2486, -2501, -2773, -2074, -1376, -1671, -2221, +458, +2369, +2137, + +2162, +2133, +2104, +2123, +2142, +2142, +2142, +2142, +1256, +1149, +1043, +1160, +1790, + +1931, +2073, +1766, +1972, +2129, +1774, +1931, +1576, +1565, +1554, +1639, +1724, +1679, + +1635, +1590, +1546, +1453, +1361, +1300, +1240, +1060, +1392, +1788, +1672, +2092, -560, + -620, -680, -700, -721, -741, -762, -870, -979, -1087, -1196, -1423, -1650, -1877, + -2104, -2291, -2478, -2857, -2724, -2895, -3067, -3110, -3666, +2547, +2103, +2107, +2112, + +2116, +2120, +2124, +2128, +2128, +2128, +2128, +1214, +1170, +1128, +1453, +1779, +1692, + +1861, +1807, +1753, +1732, +1712, +1803, +1640, +1759, +1623, +1710, +1799, +1666, +1790, + +1755, +1719, +1628, +1539, +1497, +1456, +1352, +1504, +1752, +1745, +1445, -902, -898, + -894, -907, -921, -935, -950, -1070, -1190, -1310, -1431, -1641, -1852, -2062, -2273, + -2431, -2590, -2812, -2779, -2929, -3080, -3279, -2198, +2298, +2187, +2124, +2062, +2081, + +2100, +2119, +2138, +2138, +2138, +2138, +1172, +1193, +1214, +1747, +1769, +1710, +2163, + +2360, +2046, +1592, +1651, +1677, +1704, +1954, +1693, +1783, +1874, +1654, +1947, +1920, + +1893, +1805, +1718, +1695, +1672, +1644, +1617, +1717, +1818, +798, -1245, -1176, -1108, + -1115, -1123, -1131, -1139, -1270, -1402, -1534, -1666, -1860, -2054, -2248, -2442, -2572, + -2702, -2768, -2834, -2964, -3094, -3192, -219, +2306, +2272, +2142, +2012, +2046, +2080, + +2114, +2148, +2148, +2148, +2148, +1194, +1150, +1364, +1784, +1694, +1983, +2272, +1441, + +2147, +1980, +1813, +1838, +1864, +1909, +1698, +1823, +1949, +1818, +1943, +1989, +2034, + +1933, +1833, +1812, +1792, +1712, +1633, +1649, +1923, -536, -1459, -1390, -1322, -1354, + -1388, -1421, -1455, -1566, -1678, -1789, -1901, -2078, -2256, -2433, -2611, -2744, -2878, + -2915, -2953, -2998, -3044, -3777, +1633, +2298, +1941, +2015, +2090, +2107, +2124, +2141, + +2158, +2158, +2158, +2158, +1216, +1109, +1514, +1823, +1620, +2001, +1870, +1803, +1224, + +1600, +1464, +1232, +1000, +1096, +1192, +1352, +1512, +1726, +1940, +2058, +2176, +2062, + +1948, +1930, +1912, +1781, +1650, +1583, +2028, -1871, -1674, -1605, -1536, -1595, -1654, + -1713, -1772, -1863, -1954, -2045, -2136, -2297, -2458, -2619, -2780, -2917, -3054, -3063, + -3072, -3033, -2994, -2827, +2460, +2035, +2122, +2145, +2168, +2168, +2168, +2168, +2168, + +2168, +2168, +2168, +1190, +1271, +1610, +1756, +1647, +1523, +1144, +1324, +1249, +1364, + +1224, +1211, +1199, +1255, +1566, +1430, +1294, +1404, +1514, +1800, +2087, +2075, +2063, + +2003, +1944, +1654, +1621, +1811, +979, -1997, -1903, -1888, -1874, -1927, -1982, -2036, + -2091, -2163, -2236, -2308, -2381, -2513, -2646, -2778, -2911, -3005, -3100, -3114, -3129, + -3039, -3206, -1084, +2317, +2104, +2148, +2159, +2171, +2175, +2179, +2183, +2187, +2187, + +2187, +2187, +1164, +1179, +1195, +1179, +1163, +1302, +1442, +1358, +1274, +1385, +1496, + +1447, +1399, +1158, +1429, +1508, +1588, +1594, +1601, +1543, +1486, +1832, +2179, +2077, + +1976, +1528, +1593, +1785, -582, -2381, -2133, -2172, -2212, -2261, -2311, 
-2361, -2411, + -2464, -2518, -2572, -2626, -2730, -2834, -2938, -3042, -3094, -3146, -3166, -3186, -3046, + -3418, +658, +2174, +2174, +2174, +2174, +2174, +2182, +2190, +2198, +2206, +2206, +2206, + +2206, +1202, +1230, +1259, +1272, +1286, +1321, +1356, +1343, +1331, +1405, +1480, +1474, + +1470, +1349, +1483, +1522, +1562, +1576, +1591, +1573, +1557, +1589, +1622, +1718, +1816, + +1690, +1820, +1694, -2015, -2556, -2330, -2376, -2422, -2610, -2799, -2700, -2602, -2669, + -2736, -2803, -2871, -2946, -3022, -3097, -3173, -3182, -3192, -3153, -3115, -3324, -3278, + +2256, +2159, +2147, +2136, +2156, +2177, +2189, +2201, +2213, +2225, +2225, +2225, +2225, + +1240, +1282, +1325, +1367, +1410, +1340, +1271, +1329, +1388, +1426, +1465, +1503, +1542, + +1540, +1539, +1537, +1536, +1559, +1582, +1605, +1628, +1603, +1578, +1617, +1656, +1596, + +1536, +1604, -2936, -2476, -2528, -2580, -2632, -2704, -2777, -2785, -2794, -2874, -2955, + -3035, -3116, -3163, -3210, -3257, -3304, -3271, -3238, -3141, -3044, -3091, -2114, +2319, + +2144, +2121, +2098, +2139, +2180, +2196, +2212, +2228, +2244, +2244, +2244, +2244, +1230, + +1255, +1281, +1306, +1333, +1303, +1272, +1338, +1405, +1436, +1468, +1500, +1533, +1535, + +1537, +1539, +1542, +1562, +1584, +1605, +1627, +1601, +1577, +1616, +1656, +1807, +1959, + -417, -2793, -2797, -2545, -2581, -2618, -2687, -2757, -2794, -2833, -2901, -2968, -3036, + -3105, -3145, -3186, -3178, -3171, -3149, -3128, -3058, -2989, -3221, -126, +2281, +2129, + +2084, +2040, +2107, +2175, +2189, +2203, +2217, +2231, +2231, +2231, +2231, +1220, +1229, + +1238, +1247, +1257, +1266, +1275, +1348, +1422, +1447, +1473, +1499, +1525, +1530, +1536, + +1542, +1548, +1567, +1587, +1606, +1626, +1601, +1577, +1616, +1656, +1763, +1871, +1658, + -2138, -2862, -2563, -2583, -2604, -2671, -2738, -2805, -2873, -2928, -2983, -3038, -3094, + -3128, -3162, -3100, -3038, -3028, -3018, -2976, -2934, -3352, +1862, +2244, +2114, +2048, + +1982, +2076, +2170, +2182, +2194, +2206, +2218, +2218, +2218, +2218, +1210, +1234, +1259, + +1283, +1308, +1325, +1341, +1390, +1439, +1457, +1477, +1496, +1516, +1525, +1535, +1544, + +1554, +1571, +1589, +1607, +1625, +1616, +1608, +1632, +1656, +1718, +1782, +1685, +1845, + +528, -2836, -2728, -2622, -2654, -2687, -2719, -2752, -2763, -2773, -2992, -2955, -3030, + -3106, -2813, -2777, -3226, -2908, -3134, -3359, -971, +2186, +2270, +2099, +2075, +2052, + +2108, +2165, +2175, +2185, +2195, +2205, +2205, +2205, +2205, +1200, +1240, +1280, +1320, + +1360, +1384, +1408, +1432, +1456, +1469, +1482, +1495, +1508, +1521, +1534, +1547, +1560, + +1576, +1592, +1608, +1624, +1632, +1640, +1648, +1656, +1675, +1694, +1713, +1732, +1871, + +986, -827, -2640, -2638, -2636, -2634, -2632, -2598, -2564, -2946, -2816, -2933, -3050, + -2783, -3028, -3169, -1774, +293, +2360, +2179, +1998, +2041, +2084, +2103, +2122, +2141, + +2160, +2168, +2176, +2184, +2192, +2192, +2192, +2192, +1232, +1266, +1300, +1334, +1368, + +1390, +1412, +1434, +1456, +1468, +1482, +1494, +1508, +1520, +1534, +1546, +1560, +1578, + +1596, +1614, +1632, +1640, +1648, +1656, +1664, +1645, +1628, +1705, +1784, +2101, +1908, + +1298, +688, +1071, -594, -1587, -2580, -2891, -3202, -2281, -2640, -2058, -1476, -94, + +1032, +2278, +2244, +2209, +2176, +2131, +2088, +2091, +2096, +2111, +2128, +2143, +2160, + +2168, +2176, +2184, +2192, +2192, +2192, +2192, +1264, +1292, +1320, +1348, +1376, +1396, + +1416, +1436, +1456, +1469, +1482, +1495, +1508, +1521, +1534, +1547, +1560, +1580, +1600, + +1620, +1640, 
+1648, +1656, +1664, +1672, +1617, +1562, +1699, +1836, +1821, +1806, +1887, + +1968, +1964, +1960, +2020, +2080, +1936, +1792, +1200, +1632, +1889, +2146, +2083, +2020, + +2093, +2166, +2079, +1992, +2085, +2178, +2143, +2108, +2121, +2134, +2147, +2160, +2168, + +2176, +2184, +2192, +2192, +2192, +2192, +1296, +1318, +1340, +1362, +1384, +1402, +1420, + +1438, +1456, +1468, +1482, +1494, +1508, +1520, +1534, +1546, +1560, +1582, +1604, +1626, + +1648, +1656, +1664, +1672, +1680, +1667, +1656, +1739, +1824, +1811, +1800, +1835, +1872, + +1881, +1890, +1819, +1748, +1995, +450, +937, +912, +715, +2056, +2019, +1984, +2035, + +2088, +2059, +2032, +2085, +2140, +2129, +2120, +2129, +2140, +2149, +2160, +2168, +2176, + +2184, +2192, +2192, +2192, +2192, +1328, +1344, +1360, +1376, +1392, +1408, +1424, +1440, + +1456, +1469, +1482, +1495, +1508, +1521, +1534, +1547, +1560, +1584, +1608, +1632, +1656, + +1664, +1672, +1680, +1688, +1719, +1750, +1781, +1812, +1803, +1794, +1785, +1776, +1798, + +1820, +1874, +1928, +1798, +2180, +674, +1216, +2103, +1966, +1957, +1948, +1979, +2010, + +2041, +2072, +2087, +2102, +2117, +2132, +2139, +2146, +2153, +2160, +2168, +2176, +2184, + +2192, +2192, +2192, +2192, +1328, +1344, +1360, +1376, +1392, +1408, +1424, +1440, +1456, + +1468, +1482, +1494, +1508, +1520, +1534, +1546, +1560, +1584, +1608, +1632, +1656, +1664, + +1672, +1680, +1688, +1718, +1750, +1780, +1812, +1802, +1794, +1784, +1776, +1798, +1820, + +1858, +1896, +1750, +1860, +2338, +1792, +2134, +1966, +1956, +1948, +1978, +2010, +2040, + +2072, +2086, +2102, +2116, +2132, +2138, +2146, +2152, +2160, +2168, +2176, +2184, +2192, + +2192, +2192, +2192, +1328, +1344, +1360, +1376, +1392, +1408, +1424, +1440, +1456, +1469, + +1482, +1495, +1508, +1521, +1534, +1547, +1560, +1584, +1608, +1632, +1656, +1664, +1672, + +1680, +1688, +1719, +1750, +1781, +1812, +1803, +1794, +1785, +1776, +1798, +1820, +1842, + +1864, +1958, +2052, +1954, +1856, +1911, +1966, +1957, +1948, +1979, +2010, +2041, +2072, + +2087, +2102, +2117, +2132, +2139, +2146, +2153, +2160, +2168, +2176, +2184, +2192, +2192, + +2192, +2192, +1328, +1344, +1360, +1376, +1392, +1408, +1424, +1440, +1456, +1468, +1482, + +1494, +1508, +1520, +1534, +1546, +1560, +1584, +1608, +1632, +1656, +1664, +1672, +1680, + +1688, +1718, +1750, +1780, +1812, +1802, +1794, +1784, +1776, +1798, +1820, +1842, +1864, + +1958, +2052, +1954, +1856, +1910, +1966, +1956, +1948, +1978, +2010, +2040, +2072, +2086, + +2102, +2116, +2132, +2138, +2146, +2152, +2160, +2168, +2176, +2184, +2192, +2192, +2192, + +2192 +}; + +static const INT16 TEST_CR_COMPONENT[4096] = { + -2112, -2114, -2116, -2118, -2120, -2122, -2124, -2126, -2128, -2118, -2108, -2098, -2088, + -2150, -2212, -2146, -2080, -2100, -2120, -2140, -2160, -2164, -2168, -2172, -2176, -2092, + -2008, -2052, -2096, -2132, -2168, -2076, -1984, -2088, -2192, -2168, -2144, -2136, -2128, + -2120, -2112, -2126, -2140, -2154, -2168, -2150, -2132, -2114, -2096, -2096, -2096, -2096, + -2096, -2096, -2096, -2096, -2096, -2080, -2064, -2048, -2032, -2032, -2032, -2032, -2128, + -2113, -2098, -2115, -2132, -2133, -2134, -2135, -2137, -2127, -2117, -2107, -2097, -2117, + -2137, -2125, -2114, -2134, -2154, -2159, -2163, -2135, -2108, -2128, -2149, -2132, -2116, + -2116, -2115, -2115, -2114, -2098, -2082, -2112, -2142, -2141, -2139, -2133, -2128, -2122, + -2117, -2127, -2137, -2147, -2158, -2146, -2134, -2122, -2111, -2108, -2106, -2104, -2102, + -2101, -2101, -2101, -2101, -2087, -2073, -2059, -2045, -2045, -2045, 
-2045, -2144, -2112, + -2080, -2112, -2145, -2145, -2145, -2145, -2146, -2136, -2126, -2116, -2107, -2085, -2063, + -2105, -2148, -2168, -2189, -2178, -2167, -2107, -2048, -2085, -2122, -2173, -2225, -2180, + -2135, -2098, -2061, -2120, -2180, -2136, -2093, -2114, -2135, -2131, -2128, -2125, -2122, + -2128, -2135, -2141, -2148, -2142, -2137, -2131, -2126, -2121, -2117, -2112, -2108, -2107, + -2107, -2106, -2106, -2094, -2082, -2070, -2058, -2058, -2058, -2058, -2160, -2111, -2062, + -2109, -2157, -2156, -2155, -2154, -2155, -2145, -2135, -2125, -2116, -2132, -2148, -2132, + -2118, -2154, -2191, -2181, -2170, -2494, -2308, -2393, -2479, -2470, -2461, -2243, -2282, + -2353, -2167, -2174, -2182, -2160, -2139, -2135, -2130, -2128, -2128, -2127, -2127, -2129, + -2132, -2134, -2138, -2138, -2139, -2139, -2141, -2133, -2127, -2120, -2114, -2112, -2112, + -2111, -2111, -2101, -2091, -2081, -2071, -2071, -2071, -2071, -2176, -2110, -2045, -2107, + -2170, -2168, -2167, -2165, -2164, -2154, -2145, -2135, -2126, -2180, -2235, -2161, -2088, + -2141, -2195, -2440, -2686, -2371, -1033, -398, +236, +305, +375, -3, -894, -2096, + -2787, -2485, -2184, -2185, -2187, -2156, -2126, -2127, -2129, -2130, -2132, -2131, -2130, + -2129, -2128, -2135, -2142, -2149, -2156, -2147, -2138, -2129, -2120, -2119, -2118, -2117, + -2116, -2108, -2100, -2092, -2084, -2084, -2084, -2084, -2112, -2085, -2058, -2112, -2166, + -2067, -2225, -2190, -2157, -2107, -2057, -2104, -2151, -2119, -2088, -2632, -2666, -2263, + -837, +844, +2526, +3327, +2847, +2847, +2847, +2726, +2606, +2967, +3070, +2968, +2867, + +397, -2074, -2745, -2137, -2281, -2169, -2202, -2236, -2190, -2145, -2145, -2147, -2148, + -2150, -2152, -2156, -2159, -2163, -2159, -2156, -2152, -2150, -2130, -2111, -2123, -2137, + -2127, -2117, -2107, -2097, -2097, -2097, -2097, -2048, -2060, -2073, -2118, -2163, -1967, + -2284, -2217, -2150, -2060, -1971, -2074, -2177, -2315, -2454, -1057, +1364, +2990, +2568, + +2593, +2619, +2369, +2631, +2508, +2386, +2332, +2278, +2352, +2427, +2913, +2888, +3022, + +3156, +1302, -2088, -2406, -2213, -2279, -2345, -2251, -2158, -2161, -2165, -2168, -2172, + -2171, -2171, -2170, -2170, -2172, -2175, -2177, -2180, -2142, -2105, -2131, -2158, -2146, + -2134, -2122, -2110, -2110, -2110, -2110, -2112, -2163, -2215, -2235, -2255, -1994, -2247, + -2194, -2143, -2109, -2076, -2123, -2170, -2270, +700, +3527, +2770, +2035, +2325, +2293, + +2263, +2178, +2350, +2265, +2181, +2129, +2078, +2154, +2231, +2521, +2557, +2559, +2562, + +3221, +3113, +140, -2832, -2034, -2261, -2199, -2139, -2160, -2182, -2188, -2194, -2189, + -2185, -2181, -2177, -2185, -2193, -2201, -2210, -2154, -2098, -2138, -2179, -2165, -2151, + -2137, -2123, -2123, -2123, -2123, -1664, -1755, -1846, -1841, -1836, -1767, -2210, -2173, + -2136, -2159, -2182, -2173, -2164, -2739, +2830, +2735, +2640, +2361, +2082, +1995, +1908, + +1989, +2070, +2023, +1976, +1927, +1878, +1957, +2036, +2131, +2226, +2353, +2480, +2581, + +2682, +2943, +2692, -2815, -2178, -2149, -2120, -2160, -2200, -2208, -2216, -2208, -2200, + -2192, -2184, -2198, -2212, -2226, -2240, -2166, -2092, -2146, -2200, -2184, -2168, -2152, + -2136, -2136, -2136, -2136, -2096, -2166, -2238, -2228, -2220, -2087, -2210, -2173, -2137, + -2189, -2243, -2152, -2318, -2031, +3375, +2861, +2605, +2305, +2007, +1851, +1697, +1756, + +1815, +1810, +1806, +1756, +1707, +1754, +1801, +1911, +2023, +2149, +2277, +2299, +2323, + +2729, +1345, -2439, -2129, -2217, -2307, -2349, -2136, -2179, -2222, -2223, -2224, -2193, + -2162, 
-2171, -2180, -2190, -2199, -2198, -2198, -2213, -2229, -2172, -2115, -2170, -2225, + -2113, -2257, -2257, -2016, -2067, -2118, -2105, -2093, -2152, -2211, -2174, -2138, -2221, + -2305, -2132, -2472, +212, +2897, +2477, +2570, +2251, +1932, +1709, +1487, +1524, +1561, + +1598, +1636, +1586, +1537, +1552, +1567, +1693, +1820, +1947, +2074, +2019, +1964, +2261, + -514, -2321, -2080, -2031, -1982, -2283, -2073, -2151, -2229, -2238, -2248, -2194, -2140, + -2144, -2149, -2154, -2159, -2231, -2304, -2281, -2258, -2160, -2062, -2188, -2314, -2090, + -2378, -2378, -2064, -2094, -2126, -2125, -2125, -2152, -2179, -2159, -2139, -2204, -2270, + -2144, -2530, +1688, +2834, +2460, +2343, +2147, +1953, +1678, +1404, +1387, +1370, +1418, + +1466, +1416, +1366, +1349, +1332, +1442, +1553, +1663, +1775, +1817, +1861, +2415, -2405, + -2457, -1999, -2035, -281, -1464, -2393, -2378, -2363, -2301, -2240, -2195, -2150, -2165, + -2181, -2182, -2182, -2199, -2218, -2188, -2159, -2756, -2329, -1934, -2307, -2627, -2179, + -2307, -2112, -2123, -2135, -2146, -2158, -2153, -2149, -2144, -2140, -2188, -2236, -2156, + -2588, +3164, +2772, +2444, +2116, +2045, +1975, +1648, +1322, +1251, +1181, +1238, +1296, + +1246, +1197, +1147, +1098, +1192, +1287, +1381, +1476, +1617, +1758, +1291, -2760, -2083, + -2430, -1273, -628, -647, -667, -1582, -2498, -2365, -2233, -2196, -2160, -2187, -2215, + -2210, -2206, -2169, -2133, -2096, -2060, -280, -548, -2448, -1788, -860, -1980, -2236, + -2112, -2120, -2130, -2140, -2150, -2145, -2141, -2137, -2133, -2147, -2161, -2079, -718, + +3207, +2525, +2291, +2057, +1941, +1827, +1553, +1279, +1174, +1070, +1094, +1118, +1044, + +970, +976, +983, +1001, +1019, +1165, +1313, +1305, +1555, -212, -2491, -2189, -2401, + -867, -615, -642, -671, -603, -536, -1354, -2172, -2271, -2370, -2340, -2311, -2330, + -2349, -2315, -2282, -2697, -1321, -420, -543, -394, -757, -741, -2261, -2261, -2112, + -2119, -2127, -2135, -2143, -2138, -2134, -2130, -2126, -2106, -2087, -2259, +640, +2995, + +2279, +2138, +1998, +1839, +1681, +1459, +1237, +1098, +960, +950, +940, +842, +744, + +806, +869, +811, +753, +951, +1150, +995, +1352, -1715, -2222, -2297, -2372, -463, + -602, -639, -676, -649, -623, -600, -577, -810, -1044, -1214, -1384, -1426, -1469, + -1183, -897, -483, -582, -560, -538, -900, -750, -1134, -2542, -2286, -2112, -2117, + -2123, -2129, -2135, -2131, -2127, -2123, -2119, -2017, -1916, -2886, +1262, +2014, +2256, + +2097, +1939, +1736, +1534, +1364, +1194, +1022, +850, +806, +762, +736, +710, +508, + +818, +604, +646, +752, +859, +1131, +1149, -2865, -2273, -2339, -1639, -425, -493, + -522, -553, -566, -581, -677, -773, -661, -550, -567, -585, -586, -588, -657, + -727, -572, -675, -668, -661, -798, -679, -1799, -2407, -2151, -2112, -2116, -2120, + -2124, -2128, -2124, -2120, -2116, -2112, -2185, -2258, -1723, +1884, +1035, +2234, +2057, + +1880, +1634, +1388, +1270, +1152, +946, +740, +662, +584, +630, +676, +466, +1280, + +654, +540, +554, +568, +757, -78, -2481, -2324, -2383, -906, -389, -384, -407, + -430, -485, -540, -499, -458, -513, -568, -689, -810, -771, -732, -645, -558, + -663, -768, -776, -784, -696, -608, -2464, -2272, -2016, -2104, -2110, -2116, -2122, + -2129, -2105, -2081, -2105, -2130, -2204, -2536, -84, +1856, +1148, +1209, +1701, +1683, + +1507, +1332, +1188, +1045, +837, +630, +518, +407, +489, +572, +398, +1249, +662, + +330, +383, +436, +589, -1304, -2350, -2117, -2615, +213, -12, -239, -265, -293, + -320, -348, -377, -407, -484, -562, -626, -691, -675, -661, -625, -590, -682, 
+ -776, -804, -832, -540, -248, -664, -1848, -2616, -2096, -2104, -2113, -2121, -2130, + -2086, -2043, -2095, -2148, -2225, -2815, +1555, +1829, +1519, +697, +1603, +1486, +1381, + +1276, +1107, +938, +729, +520, +375, +230, +349, +468, +331, +1219, +670, +121, + +212, +304, +423, -2531, -2477, -2423, -1569, +309, -149, -94, -125, -157, -157, + -157, -256, -356, -456, -556, -564, -573, -581, -590, -606, -623, -703, -784, + -832, -880, -384, +112, -1424, -2448, -2192, -2088, -2098, -2109, -2119, -2131, -2099, + -2068, -2100, -2134, -2485, -2325, +2921, +2025, +1536, +1048, +1088, +1385, +1270, +1156, + +993, +831, +700, +570, +407, +245, +256, +268, +343, +932, +662, +135, +185, + +236, -337, -2445, -2346, -2504, -793, +149, -75, -45, -64, -84, -88, -93, + -183, -273, -363, -454, -454, -454, -518, -583, -619, -655, -723, -792, -796, + -800, -868, -1960, -2296, -2376, -2248, -2080, -2093, -2106, -2119, -2132, -2113, -2094, + -2107, -2120, -2234, -813, +2752, +2222, +1555, +1401, +574, +1284, +1160, +1036, +880, + +724, +672, +620, +440, +260, +164, +69, +357, +646, +654, +151, +159, +168, + -1096, -2361, -2217, -2586, -18, -11, -3, +4, -4, -13, -21, -30, -110, + -191, -271, -352, -344, -336, -456, -576, -632, -688, -744, -800, -760, -720, + -584, -2496, -2400, -2304, -2304, -2072, -2086, -2102, -2117, -2133, -2171, -2211, -2170, + -2130, -2462, +1045, +2615, +2138, +1656, +1432, +807, +951, +1193, +924, +734, +545, + +397, +250, +486, +723, +569, +416, +311, +207, +384, +305, +242, +180, -1825, + -2295, -2348, -1891, +69, -19, -10, -3, -7, -12, -16, -22, -65, -107, + -182, -258, -309, -361, -477, -593, -640, -688, -736, -784, -752, -720, -1200, + -2448, -2384, -2320, -2320, -2064, -2081, -2099, -2116, -2134, -2231, -2329, -2234, -2140, + -2691, +2902, +2478, +2055, +1759, +1464, +1041, +618, +1227, +812, +589, +366, +379, + +392, +277, +162, +207, +253, +267, +281, +114, -52, +70, +192, -2555, -2230, + -2481, -1197, +156, -28, -19, -10, -11, -12, -13, -15, -20, -25, -94, + -164, -275, -387, -498, -610, -649, -689, -728, -768, -744, -720, -1816, -2400, + -2368, -2336, -2336, -2056, -2075, -2095, -2115, -2135, -2178, -2222, -2138, -2310, -1319, + +2743, +2293, +2099, +1893, +1432, +1242, +541, +1036, +1020, +699, +379, +376, +374, + +275, +177, +196, +217, +189, +162, +100, +39, +153, -756, -2420, -2293, -2549, + -502, +131, -4, -10, -17, -14, -12, -9, -7, -7, -6, -102, -198, + -320, -444, -519, -595, -641, -689, -720, -752, -768, -784, -2192, -2320, -2336, + -2352, -2352, -2048, -2070, -2092, -2114, -2136, -2126, -2116, -2042, -2480, +52, +2584, + +2108, +2144, +2028, +1400, +1444, +464, +78, -308, -470, -632, -394, -156, +18, + +192, +187, +182, +113, +44, +87, +130, +237, -1704, -2286, -2356, -2618, +192, + +106, +20, -2, -24, -18, -12, -6, +0, +6, +12, -110, -232, -367, + -502, -541, -580, -635, -690, -713, -736, -792, -848, -2568, -2240, -2304, -2368, + -2368, -2046, -2068, -2091, -2113, -2136, -2121, -2105, -2186, -2523, +1999, +2681, +2740, + +1518, +117, -1541, -2639, -2457, -2465, -2474, -2466, -2459, -2498, -2536, -2303, -2070, + -995, +81, -76, +24, +35, +47, -150, -2394, -2422, -2450, -1806, +117, +85, + +53, +21, -11, -11, -11, -11, -11, -11, -11, -107, -203, -404, -606, + -615, -625, -610, -596, -693, -791, -757, -1491, -2401, -2287, -2303, -2319, -2319, + -2044, -2067, -2090, -2113, -2137, -2116, -2095, -2074, -2054, +2923, +219, -1748, -2692, + -2563, -2435, -2114, -2306, -2193, -2080, -2159, -2239, -2298, -2357, -2320, -2284, -2432, + -2580, -1544, +4, -16, -36, -280, -2572, 
-2302, -2544, -994, +43, +64, +86, + +44, +2, -4, -10, -16, -22, -28, -34, -104, -174, -186, -198, -178, + -158, -330, -502, -674, -846, -722, -2134, -2234, -2334, -2302, -2270, -2270, -2042, + -2065, -2089, -2112, -2137, -2159, -2180, -2154, -2129, -2458, -2532, -2604, -2166, -2218, + -2272, -2293, -2315, -2000, -2198, -2219, -2242, -2322, -2401, -2385, -2370, -2285, -2201, + -2452, -2704, -1411, +137, -1402, -2174, -2502, -2830, +250, +0, +28, +55, +35, + +15, +3, -9, -21, -33, -45, -57, -101, -145, -175, -206, -220, -235, + -177, -120, -414, -709, -191, -2489, -2547, -2349, -2349, -2349, -2349, -2040, -2064, + -2089, -2113, -2138, -2202, -2267, -2235, -2204, -2207, -2210, -2181, -2152, -2131, -2110, + -2217, -1812, -1552, -2317, -2025, -1734, -1578, -1423, -1939, -2456, -2395, -2334, -2081, + -2340, -2551, -2250, -2013, -2288, -2446, -2093, -43, -42, -8, +25, +26, +28, + +10, -8, -26, -44, -62, -80, -98, -116, -165, -214, -263, -312, -281, + -250, -155, -60, -940, -1820, -2348, -2364, -2396, -2428, -2428, -2038, -2058, -2079, + -2100, -2122, -2123, -2124, -2285, -2191, -2065, -1940, -1910, -1882, -2232, -2327, -2149, + -1717, -1485, -2022, -1759, -1497, -1242, -987, -716, -446, -1226, -2007, -2723, -2160, + -2330, -2245, -2175, -2362, -2338, -1034, +109, -28, -19, -10, +15, +41, +19, + -3, -25, -47, -89, -131, -141, -151, -208, -266, -355, -445, -458, -472, + -405, -83, -1135, -1163, -1895, -2371, -2387, -2403, -2403, -2036, -2053, -2071, -2089, + -2107, -2044, -1982, -2080, -1666, -1668, -1671, -1897, -2124, -2590, -2545, -2083, -1622, + -1419, -1729, -1495, -1261, -1162, -1064, -774, -484, -314, -144, -806, -2492, -2366, + -2240, -2338, -2436, -2486, -489, +4, -15, -30, -45, +4, +54, +28, +2, + -24, -50, -116, -182, -184, -186, -252, -318, -448, -578, -636, -694, -656, + -106, -2098, -2042, -2210, -2378, -2378, -2378, -2378, -2034, -2047, -2062, -2076, -2091, + -2093, -2096, -1650, -1461, -1687, -1913, -2155, -2398, -2676, -2442, -2016, -1591, -1448, + -1563, -1341, -1120, -986, -853, -623, -394, -265, -137, +200, +24, -1554, -2363, + -2324, -2286, -2122, -2727, -1220, +31, +136, -15, +25, +67, +37, +7, -7, + -21, -111, -201, -211, -221, -295, -370, -460, -551, -509, -468, -634, -545, + -2805, -2249, -2301, -2353, -2353, -2353, -2353, -2032, -2043, -2054, -2065, -2076, -2143, + -2210, -1477, -1768, -1962, -2156, -2414, -2672, -2762, -2340, -1950, -1560, -1479, -1398, + -1189, -980, -811, -642, -473, -304, -217, -130, -75, -20, +27, -2486, -2311, + -2136, -2527, -2406, -2445, -2484, -979, +14, +47, +80, +46, +12, +10, +8, + -106, -220, -238, -256, -339, -422, -473, -524, -639, -754, -1637, -2520, -2232, + -2456, -2392, -2328, -2328, -2328, -2328, -2012, -2030, -2049, -2052, -2055, -2191, -2073, + -1585, -1867, -2081, -2296, -2526, -2757, -2653, -2294, -1886, -1479, -1380, -1282, -1087, + -893, -748, -604, -491, -379, -243, -109, -181, +1, -606, -2493, -2283, -2331, + -2481, -2376, -2413, -2452, -2308, -2421, -1350, -278, -124, +30, +88, +145, +127, + +109, +27, -56, -278, -501, -1107, -1714, -2162, -2612, -2532, -2453, -2297, -2397, + -2369, -2341, -2341, -2341, -2341, -1992, -2018, -2045, -2040, -2035, -2241, -1936, -1695, + -1966, -2201, -2436, -2639, -2842, -2545, -2248, -1823, -1398, -1282, -1166, -986, -806, + -686, -566, -510, -454, -271, -88, -289, +22, -1239, -2500, -2257, -2526, -388, + -2346, -2383, -2421, -2358, -2296, -2490, -2684, -2342, -2001, -1627, -1254, -1176, -1099, + -1501, -1904, -2266, -2628, -2510, -2393, -2407, -2422, -2404, -2386, -2362, -2338, -2346, + -2354, 
-2354, -2354, -2354, -1972, -2006, -2040, -2043, -2046, -2194, -1831, -1835, -2097, + -2336, -2576, -2735, -2895, -2564, -2234, -1839, -1445, -1279, -1114, -916, -719, -623, + -528, -528, -529, -425, -323, -59, -53, -2527, -2443, -2517, -2081, +170, -140, + -1312, -2485, -2440, -2395, -2382, -2370, -2400, -2431, -2509, -2589, -2559, -2530, -2500, + -2472, -2429, -2387, -2489, -2335, -2939, -2008, -1331, -2447, -2395, -2343, -2355, -2367, + -2367, -2367, -2367, -1952, -1994, -2037, -2047, -2058, -2148, -1727, -1977, -2228, -2472, + -2716, -2832, -2948, -2584, -2220, -1856, -1492, -1277, -1062, -847, -632, -561, -490, + -547, -604, -581, -558, -343, -1152, -2281, -2386, -2523, -1124, -40, +19, +15, + +10, -1242, -2495, -2531, -2568, -2459, -2350, -2369, -2388, -2407, -2426, -2477, -2528, + -2593, -2659, -2212, -1254, +369, +967, -1026, -2508, -2428, -2348, -2364, -2380, -2380, + -2380, -2380, -1948, -1996, -2044, -2060, -2077, -1957, -1837, -2069, -2303, -2545, -2788, + -2918, -3049, -2873, -2442, -2026, -1611, -1374, -1138, -965, -793, -732, -672, -707, + -743, -847, -953, -2017, -2059, -2441, -2313, -2327, -295, +99, -19, +23, +65, + +26, -13, -629, -1246, -1795, -2345, -2509, -2675, -2540, -2406, -1887, -1368, -467, + +434, +439, +699, +1162, +856, -2695, -2409, -2413, -2417, -2389, -2361, -2361, -2361, + -2361, -1944, -1998, -2052, -2074, -2097, -1767, -1949, -2163, -2378, -2619, -2860, -3005, + -3150, -3163, -2664, -2197, -1730, -1472, -1214, -1084, -954, -904, -854, -868, -882, + -859, -836, -877, -1942, -2091, -2240, -2389, +22, -18, -57, +32, +121, +14, + -93, -9, +76, +149, +221, +166, +110, +143, +175, +239, +304, +379, +455, + +530, +605, +676, +235, -2573, -2310, -2398, -2486, -2414, -2342, -2342, -2342, -2342, + -1940, -2000, -2060, -2072, -2084, -1640, -1964, -2144, -2325, -2532, -2740, -2899, -3059, + -3052, -2790, -2319, -1849, -1569, -1290, -1202, -1115, -1075, -1036, -1028, -1021, -1077, + -1135, -503, -2689, -2395, -2359, -1553, +19, -6, -30, +25, +80, +34, -12, + +37, +86, +124, +162, +137, +111, +137, +163, +237, +312, +393, +475, +525, + +574, +654, -803, -2466, -2339, -2383, -2427, -2375, -2323, -2323, -2323, -2323, -1936, + -2002, -2068, -2070, -2072, -1514, -1980, -2126, -2272, -2446, -2620, -2794, -2968, -2942, + -2916, -2442, -1968, -1667, -1366, -1321, -1276, -1247, -1218, -1189, -1160, -1041, -922, + -1411, -2412, -2189, -2478, -719, +16, +6, -4, +18, +40, +54, +68, +82, + +96, +100, +104, +108, +112, +132, +152, +236, +320, +408, +496, +520, +544, + +632, -1840, -2360, -2368, -2368, -2368, -2336, -2304, -2304, -2304, -2304, -1898, -1921, + -1944, -2111, -1766, -1551, -1848, -1985, -2122, -2318, -2515, -2664, -2813, -3074, -3079, + -2828, -2321, -2024, -1729, -1608, -1489, -1457, -1425, -1393, -1362, -1246, -1131, -1879, + -2372, -2532, -2693, +331, +25, +40, +55, +54, +54, +71, +88, +105, +123, + +151, +180, +208, +237, +83, -70, +48, +167, +248, +329, +346, +363, +733, + -2738, -2577, -2416, -2395, -2374, -2353, -2332, -2332, -2332, -2332, -1860, -1840, -1820, + -2152, -1460, -1588, -1716, -1844, -1972, -2191, -2411, -2535, -2659, -2950, -2730, -2958, + -2674, -2383, -2092, -1897, -1703, -1668, -1633, -1598, -1564, -1452, -1340, -2348, -2333, + -2365, -1885, -157, +34, +74, +115, +91, +68, +88, +109, +129, +150, +203, + +256, +309, +362, +291, +220, +117, +14, +88, +162, +172, +183, -702, -2612, + -2282, -2464, -2422, -2380, -2370, -2360, -2360, -2360, -2360, -2110, -1967, -1824, -1953, + -1314, -1513, -1712, -1815, -1918, -2207, -2242, -2453, -2408, -2602, 
-2541, -2752, -2707, + -2692, -2679, -2409, -2140, -2054, -1968, -1867, -1766, -1721, -1677, -2369, -2293, -2516, + -948, -53, +75, +92, +110, +95, +82, +105, +129, +152, +177, +222, +268, + +313, +359, +354, +350, +441, +533, +472, +411, +414, +674, -1689, -2518, -2339, + -2416, -2401, -2386, -2387, -2388, -2388, -2388, -2388, -1848, -1838, -1828, -1754, -1168, + -1438, -1708, -1786, -1864, -2225, -2075, -2372, -2158, -2255, -2353, -2546, -2740, -2747, + -2755, -2666, -2578, -2441, -2305, -2136, -1968, -1991, -2015, -2390, -2254, -2669, -13, + +51, +116, +111, +106, +101, +96, +123, +150, +177, +204, +242, +280, +318, + +356, +418, +480, +510, +540, +600, +661, +657, +1166, -2677, -2425, -2396, -2368, + -2380, -2392, -2404, -2416, -2416, -2416, -2416, -1882, -1711, -1796, -1369, -1198, -1419, + -1640, -1749, -1858, -1977, -1842, -2058, -2019, -2113, -2207, -2366, -2525, -2478, -2689, + -2836, -2983, -2759, -2536, -2393, -2250, -2194, -2139, -2357, -2318, -2018, +72, +113, + +157, +150, +145, +139, +134, +159, +186, +212, +239, +273, +308, +342, +377, + +439, +502, +548, +595, +632, +669, +931, +170, -2666, -2430, -2403, -2376, -2385, + -2394, -2403, -2412, -2412, -2412, -2412, -1916, -1840, -2276, -1240, -1228, -1400, -1572, + -1712, -1852, -1731, -1610, -1745, -1881, -1972, -2063, -2186, -2310, -2211, -2625, -2751, + -2877, -2822, -2768, -2650, -2532, -2398, -2265, -2324, -2383, -1369, +156, +177, +198, + +191, +185, +178, +172, +197, +223, +248, +274, +305, +336, +367, +398, +461, + +524, +587, +650, +664, +679, +1206, -827, -2656, -2437, -2410, -2384, -2390, -2396, + -2402, -2408, -2408, -2408, -2408, -1950, -1953, -1956, -1063, -1194, -1317, -1440, -1435, + -1430, -1499, -1314, -1431, -1550, -1638, -1726, -1798, -1871, -1927, -2240, -2409, -2578, + -2597, -2616, -2731, -2846, -2554, -2262, -2259, -2511, -527, +176, +207, +239, +231, + +224, +217, +210, +234, +259, +284, +309, +336, +364, +391, +419, +482, +546, + +609, +673, +744, +816, +936, -2015, -2485, -2187, -2289, -2392, -2395, -2398, -2401, + -2404, -2404, -2404, -2404, -1984, -2066, -1636, -886, -1160, -1234, -1308, -1414, -1520, + -2037, -2042, -1887, -1732, -1817, -1902, -1923, -1944, -1900, -1856, -2068, -2280, -2372, + -2464, -2556, -2648, -2454, -2260, -2194, -2640, +314, +196, +238, +280, +272, +264, + +256, +248, +272, +296, +320, +344, +368, +392, +416, +440, +504, +568, +632, + +696, +825, +954, +923, -2692, -2315, -2450, -2425, -2400, -2400, -2400, -2400, -2400, + -2400, -2400, -2400, -2252, -1953, -1142, -1035, -1441, -1826, -2211, -2244, -2278, -2220, + -1908, -1914, -1922, -2001, -2336, -2095, -2111, -2171, -2231, -2131, -2031, -2143, -2255, + -2303, -2352, -2306, -2260, -2359, -1689, +442, +269, +305, +341, +333, +325, +317, + +309, +329, +349, +369, +389, +415, +441, +468, +494, +536, +579, +669, +760, + +797, +1091, -248, -2610, -2406, -2459, -2431, -2404, -2400, -2396, -2392, -2388, -2388, + -2388, -2388, -2008, -2096, -1673, -1953, -2234, -2162, -2091, -2051, -2012, -2149, -2286, + -2199, -2113, -1930, -2259, -2012, -2278, -2186, -2094, -2194, -2295, -2171, -2047, -2051, + -2056, -2158, -2261, -2524, -739, +570, +343, +372, +402, +394, +386, +378, +370, + +386, +402, +418, +434, +462, +491, +520, +549, +569, +590, +707, +824, +770, + +1228, -1418, -2528, -2498, -2468, -2438, -2408, -2400, -2392, -2384, -2376, -2376, -2376, + -2376, -1988, -2191, -2139, -2150, -2163, -2130, -2098, -2081, -2066, -2140, -2216, -2179, + -2143, -2066, -2245, -2137, -2285, -2233, -2181, -2225, -2270, -2326, -2382, -2166, -1952, + 
-2250, -2549, -2465, +180, +394, +352, +407, +463, +455, +447, +423, +399, +523, + +391, +547, +447, +493, +540, +572, +603, +633, +665, +792, +920, +1094, +1269, + -2764, -2446, -2429, -2413, -2412, -2412, -2400, -2388, -2376, -2364, -2364, -2364, -2364, + -1968, -2031, -2094, -2093, -2092, -2099, -2106, -2113, -2120, -2133, -2147, -2160, -2174, + -2203, -2233, -2262, -2292, -2280, -2269, -2257, -2246, -2226, -2207, -2283, -2360, -2343, + -2327, -2406, +586, -38, +363, +443, +524, +516, +508, +468, +428, +660, +380, + +676, +460, +525, +591, +624, +658, +699, +741, +878, +1016, +907, +286, -2575, + -2364, -2361, -2358, -2387, -2416, -2400, -2384, -2368, -2352, -2352, -2352, -2352, -2020, + -2071, -2124, -2080, -2037, -2062, -2089, -2115, -2142, -2152, -2164, -2176, -2188, -2211, + -2235, -2259, -2283, -2275, -2267, -2260, -2253, -2249, -2246, -2290, -2336, -2337, -2339, + -1205, -71, -16, +296, +496, +441, +469, +497, +381, +521, +635, +493, +735, + +465, +544, +624, +640, +656, +747, +839, +899, +960, +1115, -1033, -2493, -2418, + -2378, -2339, -2379, -2420, -2408, -2396, -2384, -2372, -2372, -2372, -2372, -2072, -2113, + -2155, -2068, -1982, -2027, -2073, -2118, -2164, -2173, -2183, -2193, -2203, -2220, -2238, + -2256, -2274, -2270, -2267, -2264, -2261, -2273, -2286, -2299, -2312, -2332, -2352, -2052, + -729, +7, +230, +550, +358, +422, +486, +294, +614, +610, +606, +794, +470, + +564, +658, +656, +655, +797, +939, +921, +904, +1324, -2352, -2412, -2472, -2396, + -2320, -2372, -2424, -2416, -2408, -2400, -2392, -2392, -2392, -2392, -1996, -1930, -1865, + -1960, -2055, -2087, -2120, -2153, -2186, -2193, -2201, -2209, -2217, -2229, -2241, -2253, + -2265, -2265, -2266, -2267, -2268, -2280, -2294, -2306, -2320, -2342, -2365, -2707, -2538, + -1491, -188, +172, +275, +327, +379, +287, +451, +505, +559, +773, +475, +551, + +628, +512, +653, +909, +654, +1007, +1104, -739, -2583, -2506, -2430, -2397, -2365, + -2396, -2428, -2424, -2420, -2416, -2412, -2412, -2412, -2412, -1920, -2004, -2088, -2108, + -2128, -2148, -2168, -2188, -2208, -2214, -2220, -2226, -2232, -2238, -2244, -2250, -2256, + -2261, -2266, -2271, -2276, -2289, -2302, -2315, -2328, -2353, -2378, -2339, -2300, -2477, + -1630, -719, +192, +232, +272, +280, +288, +400, +512, +752, +480, +539, +598, + +369, +652, +767, -142, -1211, -2792, -2547, -2302, -2345, -2388, -2399, -2410, -2421, + -2432, -2432, -2432, -2432, -2432, -2432, -2432, -2432, -2024, -2070, -2116, -2130, -2144, + -2164, -2184, -2204, -2224, -2228, -2232, -2236, -2240, -2244, -2248, -2252, -2256, -2262, + -2270, -2276, -2284, -2296, -2310, -2322, -2336, -2319, -2304, -2287, -2272, -2559, -2336, + -1855, -1376, -2264, -1104, -520, +64, +384, +704, +704, +192, -44, -280, -1236, + -1936, -3018, -2564, -2349, -2392, -2390, -2390, -2388, -2388, -2398, -2410, -2420, -2432, + -2432, -2432, -2432, -2432, -2432, -2432, -2432, -2128, -2136, -2144, -2152, -2160, -2180, + -2200, -2220, -2240, -2242, -2244, -2246, -2248, -2250, -2252, -2254, -2256, -2265, -2274, + -2283, -2292, -2305, -2318, -2331, -2344, -2287, -2230, -2237, -2244, -2387, -2530, -2481, + -2432, -2456, -2480, -2600, -2720, -2448, -2176, -1904, -2144, -2419, -2694, -2585, -2476, + -2451, -2426, -2465, -2504, -2491, -2478, -2433, -2388, -2399, -2410, -2421, -2432, -2432, + -2432, -2432, -2432, -2432, -2432, -2432, -2104, -2122, -2140, -2158, -2176, -2196, -2216, + -2236, -2256, -2256, -2256, -2256, -2256, -2256, -2256, -2256, -2256, -2266, -2278, -2288, + -2300, -2312, -2326, -2338, -2352, -2317, -2284, -2281, 
-2280, -2357, -2436, -2417, -2400, + -2408, -2416, -2360, -2304, -2480, -864, -1648, -1408, -1225, -2580, -2509, -2440, -2427, + -2416, -2435, -2456, -2446, -2438, -2412, -2388, -2398, -2410, -2420, -2432, -2432, -2432, + -2432, -2432, -2432, -2432, -2432, -2080, -2108, -2136, -2164, -2192, -2212, -2232, -2252, + -2272, -2270, -2268, -2266, -2264, -2262, -2260, -2258, -2256, -2269, -2282, -2295, -2308, + -2321, -2334, -2347, -2360, -2349, -2338, -2327, -2316, -2329, -2342, -2355, -2368, -2360, + -2352, -2376, -2400, -2256, -2624, -1392, -1696, -2593, -2466, -2435, -2404, -2405, -2406, + -2407, -2408, -2403, -2398, -2393, -2388, -2399, -2410, -2421, -2432, -2432, -2432, -2432, + -2432, -2432, -2432, -2432, -2080, -2108, -2136, -2164, -2192, -2212, -2232, -2252, -2272, + -2270, -2268, -2266, -2264, -2262, -2260, -2258, -2256, -2268, -2282, -2294, -2308, -2320, + -2334, -2346, -2360, -2348, -2338, -2326, -2316, -2328, -2342, -2354, -2368, -2360, -2352, + -2360, -2368, -2352, -2592, -2192, -2560, -2768, -2466, -2434, -2404, -2404, -2406, -2406, + -2408, -2402, -2398, -2392, -2388, -2398, -2410, -2420, -2432, -2432, -2432, -2432, -2432, + -2432, -2432, -2432, -2080, -2108, -2136, -2164, -2192, -2212, -2232, -2252, -2272, -2270, + -2268, -2266, -2264, -2262, -2260, -2258, -2256, -2269, -2282, -2295, -2308, -2321, -2334, + -2347, -2360, -2349, -2338, -2327, -2316, -2329, -2342, -2355, -2368, -2360, -2352, -2344, + -2336, -2448, -2560, -2480, -2400, -2433, -2466, -2435, -2404, -2405, -2406, -2407, -2408, + -2403, -2398, -2393, -2388, -2399, -2410, -2421, -2432, -2432, -2432, -2432, -2432, -2432, + -2432, -2432, -2080, -2108, -2136, -2164, -2192, -2212, -2232, -2252, -2272, -2270, -2268, + -2266, -2264, -2262, -2260, -2258, -2256, -2268, -2282, -2294, -2308, -2320, -2334, -2346, + -2360, -2348, -2338, -2326, -2316, -2328, -2342, -2354, -2368, -2360, -2352, -2344, -2336, + -2448, -2560, -2480, -2400, -2432, -2466, -2434, -2404, -2404, -2406, -2406, -2408, -2402, + -2398, -2392, -2388, -2398, -2410, -2420, -2432, -2432, -2432, -2432, -2432, -2432, -2432, + -2432 +}; + +/** + * 64x64 XRGB Image + */ + +static const UINT32 TEST_XRGB_IMAGE[4096] = { + 0xFF229cdf, 0xFF249de0, 0xFF259fe2, 0xFF2ca5e8, 0xFF229cdf, 0xFF229ce0, 0xFF239de0, 0xFF229ce0, + 0xFF229cdf, 0xFF229cdf, 0xFF239ce0, 0xFF249ce0, 0xFF249ce0, 0xFF219ce3, 0xFF1e9ce6, 0xFF209ae2, + 0xFF2299dd, 0xFF2199de, 0xFF209adf, 0xFF209ae0, 0xFF1f9be0, 0xFF1e9ae0, 0xFF1d99e0, 0xFF1c98e0, + 0xFF1b97df, 0xFF1e96dc, 0xFF2194d9, 0xFF1f93dd, 0xFF1d93e0, 0xFF1b94dc, 0xFF1895d8, 0xFF1c92db, + 0xFF208fde, 0xFF1b91de, 0xFF1693df, 0xFF1793df, 0xFF1992df, 0xFF1891df, 0xFF178fdf, 0xFF178edf, + 0xFF168dde, 0xFF158cdd, 0xFF148cdc, 0xFF128cda, 0xFF118cd9, 0xFF118bd9, 0xFF128ada, 0xFF1289da, + 0xFF1288db, 0xFF1187da, 0xFF1186da, 0xFF1085da, 0xFF0f85d9, 0xFF0f84d9, 0xFF0e83d9, 0xFF0d82d8, + 0xFF0d82d8, 0xFF0d81d8, 0xFF0d80d7, 0xFF0d7fd7, 0xFF0d7ed6, 0xFF0d7ed6, 0xFF0d7ed6, 0xFF0d7ed6, + 0xFF259fe1, 0xFF27a1e2, 0xFF29a2e3, 0xFF2ba4e6, 0xFF249fe1, 0xFF249fe1, 0xFF249fe1, 0xFF249ee1, + 0xFF239ee1, 0xFF249ee1, 0xFF249ee1, 0xFF259de1, 0xFF259de2, 0xFF249de2, 0xFF229de2, 0xFF229ce1, + 0xFF229bdf, 0xFF219ce0, 0xFF209ce1, 0xFF209ce2, 0xFF209ce2, 0xFF209ae0, 0xFF2199de, 0xFF1f99df, + 0xFF1d98e0, 0xFF1e97e0, 0xFF1f97e0, 0xFF1d96df, 0xFF1c95de, 0xFF1c94e0, 0xFF1c94e1, 0xFF1d93e1, + 0xFF1d92e0, 0xFF1b93de, 0xFF1a94dc, 0xFF1a93de, 0xFF1a93e0, 0xFF1992e0, 0xFF1891df, 0xFF188fdf, + 0xFF178edf, 0xFF168ede, 0xFF158edd, 0xFF148ddc, 0xFF138ddb, 0xFF138cdb, 0xFF138bdb, 0xFF128adb, + 
0xFF1289db, 0xFF1288db, 0xFF1187db, 0xFF1186db, 0xFF1085db, 0xFF0f84da, 0xFF0e83d9, 0xFF0e83d9, + 0xFF0e83d9, 0xFF0e82d9, 0xFF0e81d8, 0xFF0e80d8, 0xFF0d7fd7, 0xFF0d7fd7, 0xFF0d7fd7, 0xFF0d7fd7, + 0xFF27a3e3, 0xFF2aa4e3, 0xFF2ea6e3, 0xFF2aa4e3, 0xFF26a2e3, 0xFF26a1e3, 0xFF25a1e3, 0xFF25a0e3, + 0xFF25a0e3, 0xFF25a0e3, 0xFF259fe3, 0xFF269fe3, 0xFF269ee4, 0xFF279ee1, 0xFF279edf, 0xFF259ee0, + 0xFF239ee1, 0xFF219ee2, 0xFF209ee4, 0xFF209de4, 0xFF219de3, 0xFF229be0, 0xFF2499dc, 0xFF2299de, + 0xFF1f98e0, 0xFF1d99e4, 0xFF1b9ae7, 0xFF1c98e2, 0xFF1c96dc, 0xFF1e94e3, 0xFF2092ea, 0xFF1d94e6, + 0xFF1a96e2, 0xFF1c96de, 0xFF1d95da, 0xFF1c94de, 0xFF1b94e1, 0xFF1a93e0, 0xFF1a92e0, 0xFF1991e0, + 0xFF1890e0, 0xFF1790df, 0xFF178fde, 0xFF168fde, 0xFF158edd, 0xFF148ddd, 0xFF138cdc, 0xFF138bdc, + 0xFF128adc, 0xFF1289dc, 0xFF1188dc, 0xFF1187dd, 0xFF1086dd, 0xFF0f85db, 0xFF0e83d9, 0xFF0e84da, + 0xFF0f84da, 0xFF0e83da, 0xFF0e82d9, 0xFF0e81d9, 0xFF0e80d8, 0xFF0e80d8, 0xFF0e80d8, 0xFF0e80d8, + 0xFF2aa7e5, 0xFF2da7e4, 0xFF31a8e3, 0xFF2ca6e3, 0xFF27a4e4, 0xFF27a3e4, 0xFF27a3e4, 0xFF27a3e4, + 0xFF26a2e4, 0xFF26a2e4, 0xFF27a1e5, 0xFF27a0e5, 0xFF27a0e6, 0xFF26a0e5, 0xFF25a0e4, 0xFF259fe4, + 0xFF259ee3, 0xFF239ee5, 0xFF229fe6, 0xFF229fe5, 0xFF229fe4, 0xFF13a5e6, 0xFF1b9fe8, 0xFF16a0e8, + 0xFF11a0e7, 0xFF129fef, 0xFF139ef7, 0xFF1b99ec, 0xFF179ae2, 0xFF149ce4, 0xFF1d98e5, 0xFF1c97e6, + 0xFF1b96e7, 0xFF1c98dc, 0xFF1d97df, 0xFF1c96e1, 0xFF1c94e2, 0xFF1b94e1, 0xFF1b93e1, 0xFF1a93e0, + 0xFF1a92e0, 0xFF1991e0, 0xFF1890e0, 0xFF1790df, 0xFF168fdf, 0xFF158ede, 0xFF158dde, 0xFF148cdd, + 0xFF138bdc, 0xFF128add, 0xFF1289dd, 0xFF1188de, 0xFF1187de, 0xFF0f85dc, 0xFF0d83da, 0xFF0f85db, + 0xFF1086db, 0xFF0f84db, 0xFF0f83da, 0xFF0e82da, 0xFF0e81da, 0xFF0e81da, 0xFF0e81da, 0xFF0e81da, + 0xFF2caae7, 0xFF30aae5, 0xFF34abe3, 0xFF2ea8e4, 0xFF29a6e5, 0xFF28a6e5, 0xFF28a5e5, 0xFF28a5e5, + 0xFF28a5e6, 0xFF28a4e6, 0xFF28a3e7, 0xFF28a2e7, 0xFF28a1e8, 0xFF25a2e9, 0xFF23a3ea, 0xFF25a0e8, + 0xFF279ee6, 0xFF259fe7, 0xFF23a0e9, 0xFF18a4f5, 0xFF0ea7ff, 0xFF1ba6de, 0xFF558ebb, 0xFF6f839c, + 0xFF89797e, 0xFF8d797c, 0xFF917979, 0xFF7f7b94, 0xFF5687af, 0xFF229bd6, 0xFF04a4fd, 0xFF109df4, + 0xFF1c97eb, 0xFF1c9ada, 0xFF1c98e4, 0xFF1c97e3, 0xFF1d95e2, 0xFF1c95e2, 0xFF1c94e2, 0xFF1c94e1, + 0xFF1b94e1, 0xFF1a93e1, 0xFF1a92e1, 0xFF1991e1, 0xFF1890e1, 0xFF178fe0, 0xFF158edf, 0xFF148dde, + 0xFF138cdd, 0xFF128bde, 0xFF128adf, 0xFF1289df, 0xFF1188e0, 0xFF0f85dd, 0xFF0d83da, 0xFF0f85db, + 0xFF1187dd, 0xFF1086dc, 0xFF0f84dc, 0xFF0e83db, 0xFF0e81db, 0xFF0e81db, 0xFF0e81db, 0xFF0e81db, + 0xFF30abe5, 0xFF36afe8, 0xFF34abe4, 0xFF2faae5, 0xFF2ba8e6, 0xFF36aee8, 0xFF26a6e8, 0xFF29a7e7, + 0xFF2ca8e7, 0xFF2da7e6, 0xFF2fa5e5, 0xFF2ca5e7, 0xFF29a4e9, 0xFF2ba5e5, 0xFF2ca5e2, 0xFF10aaef, + 0xFF13adf6, 0xFF23a3f8, 0xFF6091a5, 0xFFa6755d, 0xFFec5915, 0xFFff490c, 0xFFfa5504, 0xFFff590f, + 0xFFff5d1b, 0xFFff6116, 0xFFfa6412, 0xFFff550f, 0xFFff4b0d, 0xFFfb4918, 0xFFf54823, 0xFF8e737e, + 0xFF269eda, 0xFF06a2ff, 0xFF1d97e2, 0xFF1799ea, 0xFF1c97e4, 0xFF1a98e4, 0xFF1898e4, 0xFF1a96e3, + 0xFF1b95e3, 0xFF1a94e2, 0xFF1a93e0, 0xFF1992e1, 0xFF1891e2, 0xFF1790e1, 0xFF168fe0, 0xFF158fdf, + 0xFF138ede, 0xFF138ddf, 0xFF138ce0, 0xFF128be0, 0xFF1189e0, 0xFF1087de, 0xFF0f85db, 0xFF138ae0, + 0xFF0f87dc, 0xFF0f86dc, 0xFF0f85dc, 0xFF0f84dc, 0xFF0e83db, 0xFF0e83db, 0xFF0e83db, 0xFF0e83db, + 0xFF34abe2, 0xFF3cb4ec, 0xFF34ace5, 0xFF31abe6, 0xFF2daae8, 0xFF44b6eb, 0xFF24a7ea, 0xFF29aaea, + 0xFF2face9, 0xFF32a9e6, 0xFF35a7e3, 0xFF30a7e6, 0xFF2ba8ea, 0xFF25aaf0, 0xFF20adf6, 0xFF4d8ba7, + 0xFFb8674c, 0xFFff5510, 
0xFFf7650c, 0xFFf86313, 0xFFfa611b, 0xFFf0671f, 0xFFfc6222, 0xFFfb6926, + 0xFFf96f29, 0xFFf67122, 0xFFf3721b, 0xFFf26b20, 0xFFf16424, 0xFFff5622, 0xFFff531f, 0xFFff4b17, + 0xFFff440e, 0xFFb1615b, 0xFF1f95e0, 0xFF129bf0, 0xFF1c9ae5, 0xFF189ae6, 0xFF159be7, 0xFF1898e6, + 0xFF1b95e5, 0xFF1b95e2, 0xFF1995e0, 0xFF1994e1, 0xFF1892e2, 0xFF1792e1, 0xFF1691e0, 0xFF1590df, + 0xFF148fdf, 0xFF148fe0, 0xFF148fe1, 0xFF128de1, 0xFF108be0, 0xFF1189de, 0xFF1186dd, 0xFF178fe4, + 0xFF0e87db, 0xFF0e87dc, 0xFF0f87dd, 0xFF0f85dc, 0xFF0e84dc, 0xFF0e84dc, 0xFF0e84dc, 0xFF0e84dc, + 0xFF36b1eb, 0xFF36b4f0, 0xFF2eafed, 0xFF2caeec, 0xFF2aadec, 0xFF41b4ef, 0xFF29abe9, 0xFF2cabe8, + 0xFF2fabe7, 0xFF31abe6, 0xFF32aae6, 0xFF2faae7, 0xFF2ca9e8, 0xFF25a7eb, 0xFF946a5f, 0xFFff3e06, + 0xFFf95618, 0xFFe27312, 0xFFf87329, 0xFFf77427, 0xFFf77626, 0xFFf27628, 0xFFf8712b, 0xFFf9772e, + 0xFFf97e30, 0xFFf77f2e, 0xFFf5812b, 0xFFf57b2c, 0xFFf5752d, 0xFFfd6a2b, 0xFFfb652a, 0xFFf65e2c, + 0xFFf1572e, 0xFFff4810, 0xFFff460f, 0xFF817680, 0xFF02a7f1, 0xFF2496ea, 0xFF199be4, 0xFF1b98e4, + 0xFF1d96e5, 0xFF1b96e2, 0xFF1a96e0, 0xFF1995e1, 0xFF1794e3, 0xFF1793e2, 0xFF1692e1, 0xFF1691e0, + 0xFF1590df, 0xFF1591e1, 0xFF1591e3, 0xFF138fe1, 0xFF108ce0, 0xFF128be0, 0xFF158ae0, 0xFF168de2, + 0xFF0f89dd, 0xFF0f88dd, 0xFF0f88dd, 0xFF0f86dd, 0xFF0f85dc, 0xFF0f85dc, 0xFF0f85dc, 0xFF0f85dc, + 0xFF5fc1e7, 0xFF57bee8, 0xFF4fbbe9, 0xFF4ebae6, 0xFF4ebae3, 0xFF51b6ee, 0xFF2eaee8, 0xFF2eade6, + 0xFF2fabe5, 0xFF2face7, 0xFF2eade9, 0xFF2eace7, 0xFF2daae5, 0xFF15b2ff, 0xFFec4310, 0xFFf15016, + 0xFFf75d1c, 0xFFf87123, 0xFFf9862a, 0xFFf6882d, 0xFFf48b31, 0xFFf48532, 0xFFf47f33, 0xFFf78535, + 0xFFfa8c37, 0xFFf88e39, 0xFFf7903a, 0xFFf88b38, 0xFFf98635, 0xFFf87e35, 0xFFf77635, 0xFFf76d34, + 0xFFf76532, 0xFFf85e31, 0xFFf95730, 0xFFff5125, 0xFFf65237, 0xFF03a5fd, 0xFF1e9be1, 0xFF1e98e3, + 0xFF1f96e5, 0xFF1c97e2, 0xFF1a97df, 0xFF1896e1, 0xFF1795e4, 0xFF1794e3, 0xFF1793e2, 0xFF1692e1, + 0xFF1692e0, 0xFF1693e2, 0xFF1794e4, 0xFF1391e2, 0xFF0f8ee0, 0xFF148ee1, 0xFF198ee3, 0xFF148ce1, + 0xFF0f8bde, 0xFF0f8ade, 0xFF0f89de, 0xFF0f88dd, 0xFF0f86dd, 0xFF0f86dd, 0xFF0f86dd, 0xFF0f86dd, + 0xFF3cb6ee, 0xFF36b4ef, 0xFF30b2f0, 0xFF30b1ee, 0xFF2fb1ec, 0xFF38b0ef, 0xFF2eaee9, 0xFF2faee8, + 0xFF31ade6, 0xFF2fafe8, 0xFF2eb1ea, 0xFF31adec, 0xFF29afee, 0xFF30aac8, 0xFFff3d05, 0xFFfa501a, + 0xFFf96021, 0xFFf87428, 0xFFf7882f, 0xFFfa9638, 0xFFf59b38, 0xFFf5973b, 0xFFf6923e, 0xFFf89440, + 0xFFfa9742, 0xFFfa9a44, 0xFFfa9d46, 0xFFf99845, 0xFFf89444, 0xFFf98d43, 0xFFfa8641, 0xFFf97d3f, + 0xFFf9743d, 0xFFf77039, 0xFFf56d35, 0xFFff6122, 0xFFbf6c63, 0xFF129eef, 0xFF229ae8, 0xFF1c99ed, + 0xFF179ce4, 0xFF1498f0, 0xFF1b94e1, 0xFF1a96e2, 0xFF1998e3, 0xFF1897e4, 0xFF1896e5, 0xFF1895e4, + 0xFF1993e2, 0xFF1792e1, 0xFF1590df, 0xFF1692e2, 0xFF1793e5, 0xFF1490e4, 0xFF128ee2, 0xFF118de3, + 0xFF108de3, 0xFF118bde, 0xFF1289d9, 0xFF0f88e2, 0xFF0c89dd, 0xFF1085e0, 0xFF0987e4, 0xFF0987e4, + 0xFF40b5e9, 0xFF3bb4e9, 0xFF37b2ea, 0xFF37b2e9, 0xFF38b1e8, 0xFF33b0ea, 0xFF2eaeeb, 0xFF30afe9, + 0xFF33afe8, 0xFF30b2ea, 0xFF2eb5ec, 0xFF34aff2, 0xFF25b4f7, 0xFF8d7f86, 0xFFf64f00, 0xFFed5c1e, + 0xFFfa6326, 0xFFf7762d, 0xFFf58a35, 0xFFfea242, 0xFFf7ab3f, 0xFFf7a843, 0xFFf7a548, 0xFFf9a34a, + 0xFFfaa24c, 0xFFfba64f, 0xFFfcaa52, 0xFFf9a652, 0xFFf7a252, 0xFFfa9c50, 0xFFfd974e, 0xFFfc8d4b, + 0xFFfb8348, 0xFFf68341, 0xFFf1823a, 0xFFf5732c, 0xFF718cac, 0xFF179af0, 0xFF2599ef, 0xFF2697e9, + 0xFF269bc6, 0xFF1696f1, 0xFF1d91e3, 0xFF1c96e3, 0xFF1b9be3, 0xFF1a99e6, 0xFF1998e9, 0xFF1b97e7, + 0xFF1c95e5, 0xFF1891df, 0xFF138dda, 0xFF1992e2, 
0xFF1e98ea, 0xFF1592e6, 0xFF0b8de2, 0xFF0e8ee5, + 0xFF108fe9, 0xFF128cdf, 0xFF1489d4, 0xFF0e88e6, 0xFF088cdc, 0xFF1184e4, 0xFF0488ec, 0xFF0488ec, + 0xFF3eb6ea, 0xFF3bb5eb, 0xFF38b4eb, 0xFF38b4eb, 0xFF38b3eb, 0xFF35b2eb, 0xFF33b1ec, 0xFF34b1eb, + 0xFF35b1ea, 0xFF32b3e9, 0xFF30b5e9, 0xFF34b0f0, 0xFF23b6f8, 0xFFc56044, 0xFFf9540c, 0xFFf26322, + 0xFFf77029, 0xFFf77d2f, 0xFFf78b35, 0xFFfba142, 0xFFf6b046, 0xFFfbb44f, 0xFFf7b051, 0xFFf9af54, + 0xFFfbad56, 0xFFfcb25a, 0xFFfeb75d, 0xFFfab35f, 0xFFf6b061, 0xFFfaac5d, 0xFFfda95a, 0xFFfb9f55, + 0xFFf99551, 0xFFf7914b, 0xFFf68d45, 0xFFff7e23, 0xFF1ba5f0, 0xFF129ef4, 0xFF2896f1, 0xFF239fb1, + 0xFF6c9600, 0xFF3c9c82, 0xFF179ef8, 0xFF169cf4, 0xFF149de3, 0xFF169ae5, 0xFF1897e7, 0xFF1995e6, + 0xFF1a93e5, 0xFF1993e3, 0xFF1793e0, 0xFF1c98e6, 0xFF1a95e5, 0xFF1692e5, 0xFF138fe5, 0xFF138ceb, + 0xFF138be3, 0xFF0087e4, 0xFF007cf5, 0xFF1a86d3, 0xFF0d8cf1, 0xFF008fe2, 0xFF0d85ea, 0xFF0886f1, + 0xFF3cb7ec, 0xFF3bb7ed, 0xFF3ab6ed, 0xFF39b6ed, 0xFF38b5ed, 0xFF37b5ed, 0xFF37b4ed, 0xFF37b3ed, + 0xFF36b3ec, 0xFF34b4e9, 0xFF31b5e5, 0xFF35b1ef, 0xFF21b8fa, 0xFFfd4203, 0xFFfc581e, 0xFFf86a26, + 0xFFf47c2d, 0xFFf78431, 0xFFf98c36, 0xFFf8a041, 0xFFf6b54d, 0xFFfec05b, 0xFFf6bc5a, 0xFFf8ba5d, + 0xFFfbb861, 0xFFfdbe65, 0xFFffc469, 0xFFfbc16c, 0xFFf5bd70, 0xFFfabc6b, 0xFFfebb66, 0xFFfab160, + 0xFFf6a75a, 0xFFf89f55, 0xFFfa984f, 0xFFdf956f, 0xFF08a6fc, 0xFF259ddb, 0xFF159ff3, 0xFF4aa172, + 0xFF69a90d, 0xFF62a406, 0xFF5a981b, 0xFF34969b, 0xFF0e99ff, 0xFF1297f2, 0xFF1695e4, 0xFF1793e5, + 0xFF1892e5, 0xFF1995e6, 0xFF1a98e7, 0xFF209deb, 0xFF1593df, 0xFF1892e4, 0xFF1a91e9, 0xFF2095eb, + 0xFF259dd1, 0xFFd0f772, 0xFFc1f396, 0xFF0083f1, 0xFF1782a0, 0xFF3c7e2f, 0xFF1787cc, 0xFF0b8ada, + 0xFF3db9ed, 0xFF3cb8ed, 0xFF3bb8ed, 0xFF3ab7ed, 0xFF39b7ed, 0xFF39b7ed, 0xFF39b6ed, 0xFF3ab6ed, + 0xFF3ab6ed, 0xFF37b4ed, 0xFF34b2ec, 0xFF35abf3, 0xFF6e96b3, 0xFFff4601, 0xFFf86520, 0xFFf67329, + 0xFFf58131, 0xFFf78b37, 0xFFf9953e, 0xFFf8a649, 0xFFf8b854, 0xFFfcc260, 0xFFf8c465, 0xFFf9c36a, + 0xFFfac26e, 0xFFfac773, 0xFFfacb77, 0xFFfbcb7b, 0xFFfccb7e, 0xFFfac87b, 0xFFf8c578, 0xFFf9bc72, + 0xFFfbb46d, 0xFFf6b069, 0xFFfeaa57, 0xFF94a0a5, 0xFF13a1f3, 0xFF219df0, 0xFF199eff, 0xFF71c124, + 0xFF79b826, 0xFF72b21e, 0xFF6aaa24, 0xFF67a125, 0xFF649a19, 0xFF419d72, 0xFF1f9fcb, 0xFF1994ff, + 0xFF1399f1, 0xFF199cf4, 0xFF1ea0f8, 0xFF1b9cff, 0xFF1193f6, 0xFF1293f1, 0xFF1393ec, 0xFF0083ff, + 0xFF72cca0, 0xFFcbf982, 0xFFd0ffac, 0xFF79a046, 0xFF337700, 0xFF3a7c03, 0xFF0d8de2, 0xFF0d8edb, + 0xFF3fbbee, 0xFF3ebaed, 0xFF3db9ed, 0xFF3cb9ed, 0xFF3bb8ed, 0xFF3bb8ed, 0xFF3cb9ee, 0xFF3cb9ee, + 0xFF3db9ef, 0xFF3ab4f1, 0xFF37aff3, 0xFF32b3fe, 0xFFb48f7d, 0xFFff5907, 0xFFf37122, 0xFFf57c2b, + 0xFFf68735, 0xFFf7923d, 0xFFf89d45, 0xFFf9ac50, 0xFFf9bb5a, 0xFFf9c465, 0xFFfacd71, 0xFFfacd76, + 0xFFfacd7b, 0xFFf7cf80, 0xFFf4d286, 0xFFfcd689, 0xFFffd98c, 0xFFfbd48b, 0xFFf3cf8a, 0xFFf9c885, + 0xFFffc17f, 0xFFf5c27d, 0xFFffbc5e, 0xFF48abdc, 0xFF1e9deb, 0xFF1ea2e8, 0xFF1da8e5, 0xFF99d31c, + 0xFF8acb22, 0xFF82c427, 0xFF7abc2c, 0xFF75b429, 0xFF70ad25, 0xFF6dab17, 0xFF6ba908, 0xFF5ea912, + 0xFF519f54, 0xFF489b6d, 0xFF3e9887, 0xFF3b9592, 0xFF389880, 0xFF449663, 0xFF509446, 0xFF83b43c, + 0xFF4f851b, 0xFFafe187, 0xFF9fcc83, 0xFF368011, 0xFF43821c, 0xFF32853c, 0xFF0492f9, 0xFF1092dd, + 0xFF40bcee, 0xFF3fbcee, 0xFF3ebbee, 0xFF3dbaed, 0xFF3cbaed, 0xFF3cb9ed, 0xFF3cb9ec, 0xFF3cb9ec, + 0xFF3cb8ec, 0xFF3fb4f0, 0xFF43aff5, 0xFF0ebbe9, 0xFFffb897, 0xFFf7814d, 0xFFf57623, 0xFFf6812e, + 0xFFf88c39, 0xFFf89943, 0xFFf8a64d, 0xFFf8b257, 0xFFf9bd60, 0xFFfac96d, 
0xFFfbd47b, 0xFFfad681, + 0xFFfad788, 0xFFfbd98e, 0xFFfbda93, 0xFFfae5a1, 0xFFfed692, 0xFFfadea0, 0xFFf9db98, 0xFFfad694, + 0xFFfbd090, 0xFFffd285, 0xFFffc778, 0xFF009afd, 0xFF26a8f2, 0xFF20a4f8, 0xFF53bea5, 0xFFa4da31, + 0xFF9dd638, 0xFF97d03a, 0xFF91ca3d, 0xFF8bc539, 0xFF85c035, 0xFF7dbe31, 0xFF74bc2d, 0xFF76b81c, + 0xFF77b027, 0xFF72ab25, 0xFF6da724, 0xFF6ba328, 0xFF68a31f, 0xFF58951a, 0xFF78b745, 0xFFbbf181, + 0xFF73ad4c, 0xFF417c15, 0xFF508b1e, 0xFF43861c, 0xFF498614, 0xFF17868b, 0xFF0b90f6, 0xFF168ee8, + 0xFF42beef, 0xFF41bdee, 0xFF40bcee, 0xFF3fbced, 0xFF3ebbed, 0xFF3dbaec, 0xFF3db9eb, 0xFF3cb8ea, + 0xFF3bb7e9, 0xFF39b9f0, 0xFF37bbf7, 0xFF50b5dc, 0xFFff9744, 0xFFfec49d, 0xFFf87a24, 0xFFf88530, + 0xFFf9913d, 0xFFf8a049, 0xFFf7af55, 0xFFf8b85d, 0xFFf9c065, 0xFFface75, 0xFFfcdb85, 0xFFfbde8d, + 0xFFfae195, 0xFFfee29b, 0xFFffe2a0, 0xFFfbe9a4, 0xFFffbe6b, 0xFFfdde9f, 0xFFffe8a6, 0xFFfbe3a3, + 0xFFf8dea0, 0xFFfdd899, 0xFFb6bdab, 0xFF119ff1, 0xFF1ea4e9, 0xFF1a9fff, 0xFF89d465, 0xFFb0e245, + 0xFFb0e04e, 0xFFacdc4e, 0xFFa7d94e, 0xFFa1d649, 0xFF9ad345, 0xFF97ce3d, 0xFF94c935, 0xFF8dc534, + 0xFF86c133, 0xFF7bbc32, 0xFF6fb731, 0xFF6db330, 0xFF6cae2e, 0xFF7eba3f, 0xFF70a531, 0xFF7bb54f, + 0xFF579a20, 0xFF5c9f2b, 0xFF519425, 0xFF80b965, 0xFF609a1d, 0xFF0390e3, 0xFF118ef2, 0xFF1c89f2, + 0xFF44c0ef, 0xFF43bfef, 0xFF42beee, 0xFF40bdee, 0xFF3fbcee, 0xFF3fbbed, 0xFF40baeb, 0xFF3eb9ed, + 0xFF3cb9ee, 0xFF37b9eb, 0xFF27bcf7, 0xFF949c8f, 0xFFfb9637, 0xFFf9bc7c, 0xFFf9b585, 0xFFf7994a, + 0xFFf69b43, 0xFFf6a64e, 0xFFf7b259, 0xFFf8bc66, 0xFFfac672, 0xFFfad380, 0xFFfae08d, 0xFFf9e698, + 0xFFf9eba2, 0xFFfeeaa6, 0xFFffeaab, 0xFFfcefa9, 0xFFfaba62, 0xFFfbdc99, 0xFFfff4b9, 0xFFfbecb2, + 0xFFf7e6ab, 0xFFffe5a3, 0xFF64b1d1, 0xFF199ff0, 0xFF269fe9, 0xFF0499f2, 0xFFe3f051, 0xFFd5ef58, + 0xFFc0e364, 0xFFbde165, 0xFFbae065, 0xFFb5de5d, 0xFFb0dc56, 0xFFaad74e, 0xFFa3d346, 0xFF9bd043, + 0xFF93cd3f, 0xFF8cc93e, 0xFF84c63c, 0xFF81c139, 0xFF7dbc36, 0xFF8bc746, 0xFF89c245, 0xFF63a02c, + 0xFF65aa2c, 0xFF5ea42d, 0xFF509626, 0xFFa4cf98, 0xFFd9eadd, 0xFFb9ddff, 0xFF389ef4, 0xFF008fd4, + 0xFF46c1ef, 0xFF44c0ef, 0xFF43bfef, 0xFF42beef, 0xFF40bdef, 0xFF42bced, 0xFF43baec, 0xFF40baf0, + 0xFF3dbaf4, 0xFF35b8e7, 0xFF17bdf7, 0xFFd97f50, 0xFFf79147, 0xFFf7a554, 0xFFffdbba, 0xFFf8a24d, + 0xFFf3a549, 0xFFf5ad53, 0xFFf7b55e, 0xFFf9c16f, 0xFFfbcc7f, 0xFFf9d88a, 0xFFf8e595, 0xFFf8eda2, + 0xFFf8f5ae, 0xFFfff3b2, 0xFFfff2b6, 0xFFfef5ae, 0xFFf4b659, 0xFFf9db93, 0xFFfeffcd, 0xFFfbf6c1, + 0xFFf7edb6, 0xFFfff2ac, 0xFF13a4f7, 0xFF16a5f0, 0xFF18a5e8, 0xFF56b4cd, 0xFFf1f271, 0xFFd5ef84, + 0xFFcfe67b, 0xFFcde77c, 0xFFcbe77c, 0xFFc9e672, 0xFFc7e567, 0xFFbce15f, 0xFFb1dd57, 0xFFa9dc51, + 0xFFa0da4b, 0xFF9dd749, 0xFF9ad447, 0xFF94cf43, 0xFF8fcb3f, 0xFF88c43c, 0xFF82be39, 0xFF72b430, + 0xFF63a928, 0xFF59a028, 0xFF4e9827, 0xFFa0c479, 0xFFfffbf7, 0xFF7fd3f5, 0xFF038fe2, 0xFF0e89e2, + 0xFF48c3ef, 0xFF46c2ef, 0xFF45c1f0, 0xFF43c0f0, 0xFF42bff0, 0xFF42beee, 0xFF43bdec, 0xFF41bcef, + 0xFF3fbcf2, 0xFF2fc0fe, 0xFF36bdfc, 0xFFf54c00, 0xFFff8a52, 0xFFfaa65e, 0xFFfdc48e, 0xFFfbc185, + 0xFFf5ae50, 0xFFf7b65e, 0xFFf9be6c, 0xFFfac978, 0xFFfbd485, 0xFFfede98, 0xFFffe8aa, 0xFFfdeeae, + 0xFFf9f5b2, 0xFFfcf6ba, 0xFFfff7c2, 0xFFfcf0b2, 0xFFf7cc6e, 0xFFfbde91, 0xFFfdfcca, 0xFFfffbd1, + 0xFFfffdc8, 0xFFcae4c8, 0xFF16a1f2, 0xFF1da4ef, 0xFF12a1f1, 0xFF9fd5b9, 0xFFeaf28c, 0xFFdcf095, + 0xFFd9eb90, 0xFFd9ec93, 0xFFd9ec95, 0xFFd6eb8c, 0xFFd4ea83, 0xFFc9e779, 0xFFbfe36f, 0xFFb8e368, + 0xFFb1e262, 0xFFafe05e, 0xFFaddf5a, 0xFFa3d952, 0xFF99d449, 0xFF8ecb41, 0xFF84c33a, 0xFF75b833, + 
0xFF66ac2c, 0xFF5da329, 0xFF559927, 0xFF4b9421, 0xFF2499b9, 0xFF1593fe, 0xFF0993d8, 0xFF0f90d8, + 0xFF4ac5ef, 0xFF48c4f0, 0xFF46c2f0, 0xFF45c1f1, 0xFF43c0f1, 0xFF43bfef, 0xFF43bfed, 0xFF42beee, + 0xFF41bdf0, 0xFF38bbf0, 0xFF72a1b8, 0xFFff5d1e, 0xFFf97931, 0xFFf5a151, 0xFFf9ad61, 0xFFfee0bd, + 0xFFf8b758, 0xFFfabf69, 0xFFfcc87a, 0xFFfcd282, 0xFFfcdc8b, 0xFFfbde8f, 0xFFfbe193, 0xFFfbeba4, + 0xFFfbf5b5, 0xFFfaf8c2, 0xFFf9fcce, 0xFFf9ecb7, 0xFFfae183, 0xFFfee290, 0xFFfbfac8, 0xFFfdf8d8, + 0xFFfffccb, 0xFF8bcedc, 0xFF189fee, 0xFF25a3ee, 0xFF0b9dfb, 0xFFe8f6a5, 0xFFe4f1a6, 0xFFe4f0a6, + 0xFFe4efa6, 0xFFe5f1aa, 0xFFe6f2ad, 0xFFe3f1a6, 0xFFe0ef9e, 0xFFd7ec93, 0xFFcde987, 0xFFc8ea80, + 0xFFc2eb78, 0xFFc1ea73, 0xFFc0e96e, 0xFFb1e360, 0xFFa3dd53, 0xFF94d247, 0xFF86c83b, 0xFF78bc35, + 0xFF69b030, 0xFF62a52b, 0xFF5b9b27, 0xFF57920a, 0xFF0995fc, 0xFF0d96e5, 0xFF1091eb, 0xFF1091eb, + 0xFF4ac5f0, 0xFF49c4f0, 0xFF47c3f1, 0xFF45c2f1, 0xFF44c1f2, 0xFF41c1f2, 0xFF3fc1f2, 0xFF3fbff1, + 0xFF3fbcf0, 0xFF32c3fe, 0xFFbe7f6e, 0xFFfe6526, 0xFFf67b35, 0xFFf59a4d, 0xFFf8ab5c, 0xFFfbd0a0, + 0xFFf7c783, 0xFFfec16b, 0xFFfdd17f, 0xFFfbdb87, 0xFFf9e590, 0xFFf8ed9a, 0xFFf7f4a5, 0xFFfbea9a, + 0xFFffdf8e, 0xFFfce3a0, 0xFFf7e6b1, 0xFFfceecc, 0xFFfffbcb, 0xFFfff3c7, 0xFFfcf1c3, 0xFFfef5d2, + 0xFFfffcd3, 0xFF4bb5e7, 0xFF21a5ed, 0xFF1ca2ee, 0xFF3daae2, 0xFFeef6ac, 0xFFe6f2b1, 0xFFe8f2b5, + 0xFFe9f3b8, 0xFFeaf4ba, 0xFFebf5bc, 0xFFe8f3b6, 0xFFe6f2af, 0xFFe0f0a8, 0xFFdbeea2, 0xFFd6ef9a, + 0xFFd1f092, 0xFFc9ed82, 0xFFc1eb73, 0xFFb0e362, 0xFFa1dc51, 0xFF94d347, 0xFF88ca3e, 0xFF7bbf38, + 0xFF6eb433, 0xFF66a92e, 0xFF5da01b, 0xFF3d9448, 0xFF0a93f6, 0xFF0e94ec, 0xFF1193f0, 0xFF1193f0, + 0xFF4bc5f1, 0xFF4ac5f1, 0xFF48c4f1, 0xFF47c3f2, 0xFF45c3f2, 0xFF40c3f4, 0xFF3bc4f6, 0xFF3cbff3, + 0xFF3ebbf0, 0xFF2dcaff, 0xFFff5d25, 0xFFfe6d2f, 0xFFf37d39, 0xFFf59348, 0xFFf8a958, 0xFFf7c083, + 0xFFf7d7ae, 0xFFffc36d, 0xFFffda84, 0xFFfbe48c, 0xFFf7ee94, 0xFFf8ed9e, 0xFFfaeca7, 0xFFf9f1b4, + 0xFFf8f6c1, 0xFFfcf6c8, 0xFFfff6d0, 0xFFfef2d3, 0xFFfcf4ba, 0xFFfffee8, 0xFFf7fdea, 0xFFfdfde3, + 0xFFfffcdc, 0xFF0b9df1, 0xFF2aaaed, 0xFF1baaf6, 0xFF80c8da, 0xFFfdffbb, 0xFFe8f2bd, 0xFFebf4c4, + 0xFFeff7cb, 0xFFeff7cb, 0xFFeff7cb, 0xFFedf6c5, 0xFFebf5c0, 0xFFeaf4be, 0xFFe8f3bd, 0xFFe4f4b4, + 0xFFe0f6ab, 0xFFd0f191, 0xFFc1ec77, 0xFFb0e463, 0xFF9edb4e, 0xFF95d448, 0xFF8bcc42, 0xFF7fc23b, + 0xFF73b935, 0xFF6aac31, 0xFF60a510, 0xFF229687, 0xFF0b91f1, 0xFF0e93f3, 0xFF1294f5, 0xFF1294f5, + 0xFF4cc6f1, 0xFF4bc5f2, 0xFF49c5f2, 0xFF47c4f2, 0xFF46c4f2, 0xFF43c4f1, 0xFF40c4f0, 0xFF42c0f3, + 0xFF39c1f6, 0xFF5eacca, 0xFFfb591e, 0xFFf36e31, 0xFFf88135, 0xFFfb923f, 0xFFfbaf5e, 0xFFffc373, + 0xFFfde2ba, 0xFFffcd75, 0xFFffd372, 0xFFffe584, 0xFFfff796, 0xFFfef4a2, 0xFFfdf1ae, 0xFFfff8c2, + 0xFFfcf8cd, 0xFFfef8d2, 0xFFfff9d6, 0xFFfef6e1, 0xFFfcf5dd, 0xFFfffbee, 0xFFfbfce8, 0xFFfffce0, + 0xFFb2e0e8, 0xFF19a4f0, 0xFF26abec, 0xFF16a8f6, 0xFFc2e4d8, 0xFFf9fac5, 0xFFeff6cb, 0xFFf0f7ce, + 0xFFf1f8d2, 0xFFf1f8d1, 0xFFf2f9d1, 0xFFf1f9cd, 0xFFf1f9ca, 0xFFf2fbca, 0xFFf4fdca, 0xFFe7f8b6, + 0xFFdaf3a2, 0xFFcbef8a, 0xFFbcec71, 0xFFb0e661, 0xFFa5e151, 0xFF9ad949, 0xFF8fd240, 0xFF83c73b, + 0xFF77bc35, 0xFF6ab31d, 0xFF5ea905, 0xFF138dea, 0xFF1193ef, 0xFF1093f0, 0xFF0f93f0, 0xFF0f93f0, + 0xFF4dc6f2, 0xFF4cc6f2, 0xFF4ac5f3, 0xFF48c5f3, 0xFF47c5f3, 0xFF46c4ef, 0xFF46c4eb, 0xFF48c0f3, + 0xFF34c7fb, 0xFF989591, 0xFFfc6428, 0xFFf1773b, 0xFFfc8432, 0xFFff9135, 0xFFffb564, 0xFFffbe5a, + 0xFFf3ddb6, 0xFFccd097, 0xFFb4cea5, 0xFFb0d3b1, 0xFFabd7bd, 0xFFc3e1bf, 0xFFdaebc1, 0xFFf5fdc7, + 0xFFffffbd, 0xFFfffecd, 
0xFFfffcdc, 0xFFfffce0, 0xFFfbfce5, 0xFFfdfbe6, 0xFFfffae7, 0xFFfffbdd, + 0xFF61c4f4, 0xFF26aaee, 0xFF22abec, 0xFF10a7f6, 0xFFffffd7, 0xFFf5f5d0, 0xFFf6fad9, 0xFFf4f9d9, + 0xFFf2f9da, 0xFFf3fad8, 0xFFf4fbd7, 0xFFf5fcd5, 0xFFf7fdd4, 0xFFf3face, 0xFFf0f7c8, 0xFFe2f4b0, + 0xFFd4f199, 0xFFc5ee82, 0xFFb7eb6b, 0xFFb1e95f, 0xFFabe754, 0xFF9fdf49, 0xFF94d83f, 0xFF87cc3a, + 0xFF7bc034, 0xFF6bb425, 0xFF5ba332, 0xFF0495f9, 0xFF1795ee, 0xFF1293ed, 0xFF0c91eb, 0xFF0c91eb, + 0xFF4fc8f3, 0xFF4dc8f3, 0xFF4cc8f4, 0xFF4bc8f4, 0xFF49c8f4, 0xFF47c5f2, 0xFF45c2ef, 0xFF42c2f8, + 0xFF34c8ff, 0xFFdf6746, 0xFFff632a, 0xFFff701b, 0xFFe18b53, 0xFFa4a185, 0xFF63c1cd, 0xFF26c0ff, + 0xFF2ab8ff, 0xFF25b5f1, 0xFF27b7f9, 0xFF26b5f6, 0xFF23b3f2, 0xFF24b5fa, 0xFF25b7ff, 0xFF189ddf, + 0xFF43bbf4, 0xFF9edae8, 0xFFf9f9dc, 0xFFf3fbe6, 0xFFffffea, 0xFFfdffe6, 0xFFfafce2, 0xFFffffff, + 0xFF1ea8ef, 0xFF1ca8f1, 0xFF1ba8f2, 0xFF5bc4f1, 0xFFffffe7, 0xFFfbf9e1, 0xFFfbfce3, 0xFFf8fbe0, + 0xFFf5fadd, 0xFFf5fbdb, 0xFFf5fbda, 0xFFf6fcd7, 0xFFf6fdd3, 0xFFf0f8c9, 0xFFebf4be, 0xFFdff2a9, + 0xFFd4f094, 0xFFc7f47b, 0xFFbaf862, 0xFFb0ef58, 0xFFa6e64e, 0xFFa3e248, 0xFF98d73a, 0xFF8acd38, + 0xFF7bc435, 0xFF70b821, 0xFF3b9c84, 0xFF0d93f4, 0xFF1394ed, 0xFF1193e9, 0xFF0f92e6, 0xFF0f92e6, + 0xFF50c9f4, 0xFF4fcaf4, 0xFF4ecaf5, 0xFF4dcaf5, 0xFF4ccaf6, 0xFF48c5f4, 0xFF45c0f3, 0xFF47c2ef, + 0xFF4ac4eb, 0xFFff521f, 0xFFa79a92, 0xFF51b7e6, 0xFF28c7ff, 0xFF2cc4f9, 0xFF31c1f1, 0xFF3fbbf0, + 0xFF37c0ef, 0xFF39b9f0, 0xFF3bb3f1, 0xFF38b5f4, 0xFF36b7f7, 0xFF32b9f0, 0xFF2fbbe8, 0xFF2fb8eb, + 0xFF2fb5ed, 0xFF20acf3, 0xFF10a3fa, 0xFF70c9f3, 0xFFf5f9df, 0xFFf6fbde, 0xFFf6fdde, 0xFFd8ebe4, + 0xFF11a5ee, 0xFF2db2f5, 0xFF14a5f8, 0xFFa5e2ec, 0xFFfffff8, 0xFFfffef3, 0xFFfffded, 0xFFfcfde6, + 0xFFf8fce0, 0xFFf7fcde, 0xFFf6fcdd, 0xFFf6fcd8, 0xFFf5fdd3, 0xFFedf7c4, 0xFFe5f1b4, 0xFFe5f5b8, + 0xFFe4f9bb, 0xFFecfed2, 0xFFf3ffe9, 0xFFedfedb, 0xFFe8f9cd, 0xFFcaef89, 0xFF9cd636, 0xFF84c72e, + 0xFF6bb826, 0xFF6cb315, 0xFF1a95d6, 0xFF1591ef, 0xFF1093eb, 0xFF1193e6, 0xFF1294e1, 0xFF1294e1, + 0xFF52cbf4, 0xFF50caf4, 0xFF4ecaf4, 0xFF4ccaf3, 0xFF4ac9f3, 0xFF48c8f5, 0xFF46c7f6, 0xFF40bfed, + 0xFF41bfeb, 0xFF41d4f9, 0xFF33c9fc, 0xFF2fc9ff, 0xFF42c3ec, 0xFF40c3f4, 0xFF3ec3fc, 0xFF35bbf4, + 0xFF33bbf3, 0xFF49bdf7, 0xFF39b7f9, 0xFF37b7f6, 0xFF35b7f2, 0xFF2eb5f4, 0xFF28b3f5, 0xFF2fbbf8, + 0xFF2fbaf2, 0xFF30b5f2, 0xFF31b0f1, 0xFF1facf6, 0xFF0dabed, 0xFF7fd2ed, 0xFFffffe6, 0xFF80d9d2, + 0xFF2faaf8, 0xFF1dafec, 0xFF03aae6, 0xFFfff8ff, 0xFFfffffe, 0xFFfffff9, 0xFFfffdf4, 0xFFfdfeeb, + 0xFFfbfee3, 0xFFf9fde1, 0xFFf7fce0, 0xFFf5fdd8, 0xFFf4fdcf, 0xFFf5fce2, 0xFFf6fde8, 0xFFf3fde8, + 0xFFf1fde9, 0xFFebfdd3, 0xFFe6fdbe, 0xFFe0f8ba, 0xFFdaf2b7, 0xFFeafcd2, 0xFFf2fde6, 0xFFb7de8d, + 0xFF84c73d, 0xFF9ab848, 0xFF14a1f9, 0xFF0494f3, 0xFF1094ef, 0xFF1095ec, 0xFF1095e9, 0xFF1095e9, + 0xFF54ccf5, 0xFF51cbf4, 0xFF4ecaf3, 0xFF4cc9f2, 0xFF49c8f1, 0xFF48cbf5, 0xFF48cef9, 0xFF40c4f3, + 0xFF49cafc, 0xFF40c2f1, 0xFF47caf5, 0xFF46c7f4, 0xFF46c4f3, 0xFF39b5ee, 0xFF2ca5e8, 0xFF2eb1e1, + 0xFF56c1ea, 0xFF6dc9e9, 0xFF37c2e5, 0xFF51caeb, 0xFF6bd2f1, 0xFF74d1f5, 0xFF7dcff9, 0xFF56c7f8, + 0xFF1fafe8, 0xFF25b1ee, 0xFF2cb3f4, 0xFF3eb5f9, 0xFF2bb3ee, 0xFF1baff5, 0xFF32b5f0, 0xFF3fb2f9, + 0xFF26a9f2, 0xFF1faeeb, 0xFF3fb8f4, 0xFFfcfff3, 0xFFffffff, 0xFFffffff, 0xFFfffefb, 0xFFfefff1, + 0xFFfeffe6, 0xFFfbffe5, 0xFFf8fde3, 0xFFf5fdd7, 0xFFf3fecb, 0xFFf5fbeb, 0xFFf7feee, 0xFFf2fdde, + 0xFFedfccf, 0xFFe3f9b0, 0xFFd9f692, 0xFFd2f48b, 0xFFccf184, 0xFFceee97, 0xFFd0eaa9, 0xFFdaebc1, + 0xFFf4fbe9, 0xFF7fc679, 0xFF5ac1ff, 0xFF1aa1eb, 
0xFF1195f2, 0xFF0f96f2, 0xFF0e97f2, 0xFF0e97f2, + 0xFF54cdf5, 0xFF52ccf4, 0xFF4fcbf3, 0xFF4dc9f3, 0xFF4ac8f2, 0xFF49c6f2, 0xFF47c4f2, 0xFF49d2f3, + 0xFF46c8f3, 0xFF4dc5fc, 0xFF2c9add, 0xFF1883cd, 0xFF046cbe, 0xFF0080c5, 0xFF0f96d4, 0xFF2eaddb, + 0xFF60c6eb, 0xFF76cdef, 0xFF51caea, 0xFF69d2f0, 0xFF81daf5, 0xFF9ae4f7, 0xFFb3eff9, 0xFFcffaff, + 0xFFe3feff, 0xFF9ae1ff, 0xFF48bcf7, 0xFF11b5dd, 0xFF32aef0, 0xFF28acfc, 0xFF31b2f3, 0xFF34b1f6, + 0xFF25adf0, 0xFF26acf6, 0xFF98d1fc, 0xFFfffdf8, 0xFFffffff, 0xFFfffffb, 0xFFfefff4, 0xFFfdffee, + 0xFFfcfde7, 0xFFfbfee4, 0xFFfaffe0, 0xFFf8fde7, 0xFFf7fcef, 0xFFf3fbeb, 0xFFeffdd9, 0xFFe9fbc2, + 0xFFe3f9ac, 0xFFd9f49b, 0xFFceef8b, 0xFFc1ea76, 0xFFb4e562, 0xFFabdd5a, 0xFFa2d261, 0xFFc1e98e, + 0xFFdbe8b9, 0xFF96d4ff, 0xFF8ed0fa, 0xFF42aeee, 0xFF1095f1, 0xFF1096f1, 0xFF0f96f1, 0xFF0f96f1, + 0xFF55cef5, 0xFF53ccf4, 0xFF50cbf4, 0xFF4ecaf4, 0xFF4cc8f4, 0xFF51caf7, 0xFF57cbfa, 0xFF45c0ea, + 0xFF1a75c7, 0xFF0058ad, 0xFF015bb4, 0xFF066fc0, 0xFF0b84cd, 0xFF0093ce, 0xFF11a7e0, 0xFF3eb9e6, + 0xFF6bcbeb, 0xFF7ed1f6, 0xFF6cd3f0, 0xFF82dbf4, 0xFF98e3f9, 0xFFa5ecf7, 0xFFb2f4f5, 0xFFc7f7f9, + 0xFFddfafd, 0xFFf2ffff, 0xFFf8fff6, 0xFFbcebfe, 0xFF22b4f2, 0xFF29afff, 0xFF2fb0f7, 0xFF29b1f2, + 0xFF23b1ee, 0xFF1aa7fa, 0xFFcae6f4, 0xFFf7f8f4, 0xFFfeffff, 0xFFfefff7, 0xFFfeffed, 0xFFfcffeb, + 0xFFfbfae9, 0xFFfbfee3, 0xFFfbffdc, 0xFFfbffe9, 0xFFfbfff7, 0xFFf1fedd, 0xFFe7fbc3, 0xFFe0f6b4, + 0xFFd8f0a5, 0xFFceec94, 0xFFc4e884, 0xFFb8e678, 0xFFace36c, 0xFFa0df53, 0xFF94d455, 0xFF80bd41, + 0xFFd2e599, 0xFF2ca1f4, 0xFF30a2f6, 0xFF209cf3, 0xFF1096f1, 0xFF1096f1, 0xFF1096f1, 0xFF1096f1, + 0xFF55cef4, 0xFF53cdf4, 0xFF51cbf5, 0xFF50cbf5, 0xFF4ecaf6, 0xFF4dc9f4, 0xFF54d0fa, 0xFF2b86ce, + 0xFF0752b1, 0xFF045fb9, 0xFF0a74c9, 0xFF0882ce, 0xFF0691d4, 0xFF02a0d5, 0xFF24b5e7, 0xFF4cc4ea, + 0xFF74d3ee, 0xFF83d9f5, 0xFF7fddf4, 0xFF93e4f6, 0xFFa8ecf9, 0xFFb6f2f9, 0xFFc3f9f9, 0xFFd3fafb, + 0xFFe3fcfc, 0xFFedfefb, 0xFFf0f9f3, 0xFFffffff, 0xFFfffdff, 0xFF7edcef, 0xFF26adfd, 0xFF2aaff7, + 0xFF2db2f2, 0xFF34b1e0, 0xFF09a7f7, 0xFF8dd3f5, 0xFFfdfbf9, 0xFFfffff6, 0xFFfdffeb, 0xFFfcffe6, + 0xFFfcfce0, 0xFFf9fcde, 0xFFf7fcdd, 0xFFfcffef, 0xFFf9fdec, 0xFFe8f5d0, 0xFFdff5bd, 0xFFd9f1ad, + 0xFFd2ed9d, 0xFFc5e97e, 0xFFb8e26d, 0xFFabdd5e, 0xFF9fd74f, 0xFF98c95f, 0xFF92c735, 0xFF8bc942, + 0xFF80b34d, 0xFF009bf2, 0xFF1894f8, 0xFF1595f5, 0xFF1397f2, 0xFF1296f1, 0xFF1195f0, 0xFF1195f0, + 0xFF56cff4, 0xFF54cdf5, 0xFF52ccf5, 0xFF51cbf7, 0xFF51cbf9, 0xFF49c8f1, 0xFF51d5fa, 0xFF1662c1, + 0xFF005cbb, 0xFF0874cd, 0xFF037cce, 0xFF028dd4, 0xFF019edb, 0xFF09aedc, 0xFF37c2ee, 0xFF5acfef, + 0xFF7edcf0, 0xFF88e1f4, 0xFF92e6f8, 0xFFa5eef8, 0xFFb9f5f9, 0xFFc7f9fb, 0xFFd5fdfe, 0xFFdffdfc, + 0xFFe9fdfa, 0xFFf0fefe, 0xFFf8ffff, 0xFFfafffe, 0xFFfdfffc, 0xFFfdfbff, 0xFF1db0e8, 0xFF2ab1ee, + 0xFF37b2f5, 0xFF25b9f7, 0xFF29b4f8, 0xFF22aff5, 0xFF1baaf2, 0xFF9fd7f6, 0xFFfdffea, 0xFFfcfee0, + 0xFFfcfdd7, 0xFFf8fada, 0xFFf4f7dd, 0xFFfdfef5, 0xFFf6fae1, 0xFFdfecc3, 0xFFd8efb6, 0xFFd2eca6, + 0xFFccea95, 0xFFbce567, 0xFFabdb56, 0xFF9fd344, 0xFF92cb33, 0xFF85c824, 0xFF79b46a, 0xFF3a9eaf, + 0xFF0c97ff, 0xFF1994f9, 0xFF0f9bee, 0xFF139af0, 0xFF1699f3, 0xFF1497f1, 0xFF1295ef, 0xFF1295ef, + 0xFF58d0f5, 0xFF56cef5, 0xFF53cdf4, 0xFF53ccf6, 0xFF52cbf8, 0xFF53d6fb, 0xFF4fc8fc, 0xFF004cad, + 0xFF096fca, 0xFF0b80d4, 0xFF0588d5, 0xFF0598db, 0xFF05a8e1, 0xFF18b6e6, 0xFF3fc8f2, 0xFF63d3f3, + 0xFF86dff5, 0xFF91e4f7, 0xFF9ce9fa, 0xFFaef0f9, 0xFFc0f7f9, 0xFFcbfafb, 0xFFd7fdfd, 0xFFdefdfc, + 0xFFe6fefb, 0xFFf0fffe, 0xFFfaffff, 0xFFf2fefb, 0xFFfefffd, 0xFFc6e9fb, 
0xFF1eb0ec, 0xFF30b4f6, + 0xFF30b7f8, 0xFF19a8f7, 0xFF26b0f0, 0xFF22aef3, 0xFF1eabf5, 0xFF27aafa, 0xFF1ca6f6, 0xFF7dcdea, + 0xFFdff4dd, 0xFFeaffb0, 0xFFfdfeed, 0xFFffffef, 0xFFfcf9d3, 0xFFedeeb4, 0xFFe6e9ac, 0xFFd9e68a, + 0xFFcbe367, 0xFFb9e153, 0xFFa6dd4d, 0xFF75c57f, 0xFF43adb0, 0xFF229bf3, 0xFF0a9cff, 0xFF0998f6, + 0xFF109cef, 0xFF189aee, 0xFF149ded, 0xFF159bf0, 0xFF1599f2, 0xFF1397f0, 0xFF1195ee, 0xFF1195ee, + 0xFF5ad1f6, 0xFF57cff5, 0xFF54cef4, 0xFF54cdf6, 0xFF53cbf8, 0xFF4dd3f4, 0xFF2c9add, 0xFF045ec1, + 0xFF0572c9, 0xFF0683d2, 0xFF0794dc, 0xFF08a2e2, 0xFF08b1e8, 0xFF28bfef, 0xFF48cef6, 0xFF6bd8f8, + 0xFF8fe3fa, 0xFF9be8fa, 0xFFa6edfb, 0xFFb7f3fb, 0xFFc7f9fa, 0xFFd0fbfc, 0xFFd9fdfd, 0xFFdefefd, + 0xFFe2fffc, 0xFFeffffe, 0xFFfcffff, 0xFFebfef7, 0xFFfffffe, 0xFF8fd7f8, 0xFF1eb0f1, 0xFF2eb0f6, + 0xFF18abec, 0xFFe0f7fd, 0xFF24ade9, 0xFF23acf1, 0xFF21acf8, 0xFF26aef7, 0xFF2cb0f6, 0xFF1aa9f5, + 0xFF08a3f4, 0xFF22a7f9, 0xFF4cc2f2, 0xFF6dcdef, 0xFF7ec9db, 0xFF7fcac2, 0xFF81c6c6, 0xFF61bccb, + 0xFF41b3d0, 0xFF24a7e9, 0xFF089bff, 0xFF119dff, 0xFF1a9fff, 0xFF0f99e9, 0xFF149cf9, 0xFF159cf7, + 0xFF159cf5, 0xFF179df1, 0xFF199eed, 0xFF179cef, 0xFF1599f1, 0xFF1397ef, 0xFF1195ed, 0xFF1195ed, + 0xFF5cd2f6, 0xFF59d0f5, 0xFF55cff3, 0xFF54cdf5, 0xFF53ccf8, 0xFF51d5f6, 0xFF167bcf, 0xFF0467c6, + 0xFF067bcf, 0xFF068bd7, 0xFF059cdf, 0xFF08a9e5, 0xFF0ab6eb, 0xFF2bc4f1, 0xFF4cd2f7, 0xFF6ddbf9, + 0xFF8ee5fa, 0xFF9deafb, 0xFFaceffb, 0xFFbdf5fb, 0xFFcefbfa, 0xFFd5fbfc, 0xFFdcfcfd, 0xFFdcfefd, + 0xFFddfffd, 0xFFe4fffd, 0xFFeafffd, 0xFFfffffe, 0xFFffffff, 0xFF27c0de, 0xFF26b5f6, 0xFF1fb0f9, + 0xFF4dc6ff, 0xFFfff9ef, 0xFFfefffa, 0xFF8bd8f7, 0xFF18a7f3, 0xFF1daaf4, 0xFF23acf6, 0xFF22acf3, + 0xFF22abf0, 0xFF1aa3f2, 0xFF1aa6ee, 0xFF18a8f5, 0xFF0ea2f3, 0xFF11a4f2, 0xFF14a4ff, 0xFF15a3fc, + 0xFF16a3fa, 0xFF17a2f3, 0xFF19a2ec, 0xFF0e99fe, 0xFF169bed, 0xFF00a1ff, 0xFF2b9de8, 0xFF61b5b0, + 0xFF109af7, 0xFF149cf2, 0xFF189eed, 0xFF169cef, 0xFF149af0, 0xFF1298ee, 0xFF1096ec, 0xFF1096ec, + 0xFF5fd3f7, 0xFF5bd2f5, 0xFF56d0f3, 0xFF55cef5, 0xFF53cdf7, 0xFF56d8f8, 0xFF005cc0, 0xFF0370cb, + 0xFF0785d6, 0xFF0594dc, 0xFF04a3e2, 0xFF08afe8, 0xFF0cbcee, 0xFF2ec8f3, 0xFF50d5f9, 0xFF6fdefa, + 0xFF8de7fb, 0xFF9fecfb, 0xFFb1f2fb, 0xFFc3f7fb, 0xFFd4fcfa, 0xFFd9fcfc, 0xFFdefcfd, 0xFFdbfdfd, + 0xFFd9fffd, 0xFFd9fdfb, 0xFFd9fcfa, 0xFFe5fafa, 0xFFa4eaf7, 0xFF2badfb, 0xFF2fb9fa, 0xFF1aaeed, + 0xFF99dbf8, 0xFFffffff, 0xFFfefdfc, 0xFFfffefd, 0xFFfffffd, 0xFF8cd4fa, 0xFF19a9f6, 0xFF18a9f7, + 0xFF16aaf9, 0xFF1aa7f3, 0xFF1ea5ee, 0xFF1fa7f2, 0xFF21a9f6, 0xFF1ea7f7, 0xFF1ba5f7, 0xFF17a4f9, + 0xFF12a2fb, 0xFF0b9dfd, 0xFF0399fe, 0xFF26a2fa, 0xFF6fc0b0, 0xFFcfca5e, 0xFFffe528, 0xFF74b4b3, + 0xFF0b98fa, 0xFF119af4, 0xFF179dee, 0xFF159cee, 0xFF139aef, 0xFF1198ed, 0xFF0f96eb, 0xFF0f96eb, + 0xFF5dd1f6, 0xFF5bd2f5, 0xFF58d2f4, 0xFF53cef4, 0xFF56d2fb, 0xFF40b2e6, 0xFF0164c6, 0xFF0376cf, + 0xFF0487d7, 0xFF0296dd, 0xFF01a4e4, 0xFF04b1ea, 0xFF07bdf1, 0xFF1bc8f2, 0xFF43d5fc, 0xFF64ddfb, + 0xFF85e6fb, 0xFF98ebfc, 0xFFacf1fd, 0xFFbef9ff, 0xFFcfffff, 0xFFcffdff, 0xFFcff9fb, 0xFFd2fefe, + 0xFFd5ffff, 0xFFc6f9ff, 0xFFb8efff, 0xFF5ad7d9, 0xFF40b9e9, 0xFF2fb9ff, 0xFF2bb2f0, 0xFF28afeb, + 0xFFdef0f2, 0xFFffffff, 0xFFfeffff, 0xFFfffefe, 0xFFfffefa, 0xFFfffffa, 0xFFfffff9, 0xFFc2e8f0, + 0xFF84cde7, 0xFF53bbe9, 0xFF22a9eb, 0xFF14a1ff, 0xFF069ff8, 0xFF0fa0f8, 0xFF19a3eb, 0xFF43b1e1, + 0xFF6ec2c9, 0xFFb0d79a, 0xFFf2eb6b, 0xFFebee32, 0xFFf8e647, 0xFFffe23a, 0xFFfde142, 0xFF0098f4, + 0xFF19a1fc, 0xFF169ef7, 0xFF129bf1, 0xFF139af1, 0xFF149af0, 0xFF1298ee, 0xFF1096ec, 0xFF1096ec, + 
0xFF5ccff6, 0xFF5bd2f6, 0xFF5ad4f6, 0xFF52cdf2, 0xFF5ad6fe, 0xFF298cd5, 0xFF026ccc, 0xFF027bd2, + 0xFF0189d8, 0xFF0097df, 0xFF00a6e6, 0xFF00b2ed, 0xFF02bef4, 0xFF09c7f1, 0xFF35d5ff, 0xFF59ddfd, + 0xFF7ce5fb, 0xFF91eafd, 0xFFa6f0ff, 0xFFb1f2ff, 0xFFbbf5ff, 0xFFbef5fc, 0xFFc1f6f9, 0xFFc1f7f7, + 0xFFc1f9f4, 0xFFc7fdfc, 0xFFcdffff, 0xFFc2f9f8, 0xFF5acdf4, 0xFF39b1f3, 0xFF38baf5, 0xFF2ab4f7, + 0xFFfcfbf8, 0xFFfdfeff, 0xFFfeffff, 0xFFfffeff, 0xFFfffcf6, 0xFFfdfef2, 0xFFf7ffee, 0xFFfcffea, + 0xFFffffe5, 0xFFffffd8, 0xFFffffcb, 0xFFfffbf1, 0xFFffffdf, 0xFFfdfdc2, 0xFFf7ff88, 0xFFfbfe92, + 0xFFffff7f, 0xFFfdfc6c, 0xFFfaf759, 0xFFf8f059, 0xFFf7e958, 0xFFf7e359, 0xFFd0d368, 0xFF0998ff, + 0xFF189aef, 0xFF129af2, 0xFF0c99f5, 0xFF1199f3, 0xFF1599f2, 0xFF1397f0, 0xFF1195ee, 0xFF1195ee, + 0xFF5fd2f9, 0xFF5cd3f8, 0xFF59d4f6, 0xFF58d3f8, 0xFF5edaff, 0xFF1971cd, 0xFF026ecd, 0xFF037bd3, + 0xFF0488d9, 0xFF0497e0, 0xFF05a6e6, 0xFF01ade7, 0xFF00b5e8, 0xFF07beea, 0xFF23cbf5, 0xFF4cd7f8, + 0xFF74e4fc, 0xFF89e8fd, 0xFF9fecfe, 0xFFa5edfe, 0xFFabeffe, 0xFFaeeffc, 0xFFb0eff9, 0xFFb3f3f9, + 0xFFb6f6f8, 0xFFb6f9fc, 0xFFb5fcff, 0xFFdaf3ff, 0xFF1ab9f1, 0xFF28b3f4, 0xFF2bb3f6, 0xFF73cef4, + 0xFFfdfdf5, 0xFFfdfefa, 0xFFfdfffe, 0xFFfffef9, 0xFFfffdf3, 0xFFfdfeee, 0xFFfaffe9, 0xFFfdffe4, + 0xFFffffde, 0xFFffffd0, 0xFFffffc2, 0xFFfdfad7, 0xFFfffcf3, 0xFFffffc0, 0xFFfcfbc5, 0xFFfcff84, + 0xFFfcfb8b, 0xFFfbf67a, 0xFFf9f269, 0xFFf7ed5e, 0xFFf4e954, 0xFFf7e948, 0xFF87bda9, 0xFF109afc, + 0xFF179cf2, 0xFF149bf1, 0xFF119af1, 0xFF1399f2, 0xFF1698f3, 0xFF1496f1, 0xFF1294ef, 0xFF1294ef, + 0xFF62d4fc, 0xFF5dd4f9, 0xFF59d4f6, 0xFF56d1f6, 0xFF53cef5, 0xFF014ebe, 0xFF026fcd, 0xFF057bd4, + 0xFF0787da, 0xFF0996e0, 0xFF0ca5e7, 0xFF0bb0e9, 0xFF09bbeb, 0xFF15c5f3, 0xFF21d0fc, 0xFF46dafc, + 0xFF6ce3fc, 0xFF82e6fd, 0xFF97e9fe, 0xFF99e9fe, 0xFF9ce8fe, 0xFF9ee9fb, 0xFFa0e9f9, 0xFFa6eefa, + 0xFFacf3fc, 0xFFb0effc, 0xFFb5ecfb, 0xFF89ddf9, 0xFF28b4f3, 0xFF3ebef7, 0xFF1eadf7, 0xFFbde8f0, + 0xFFfefff2, 0xFFfefff3, 0xFFfdfff4, 0xFFfefef2, 0xFFfefef0, 0xFFfefeea, 0xFFfefee4, 0xFFfefede, + 0xFFfefed8, 0xFFfcffc9, 0xFFfbffba, 0xFFf6fea0, 0xFFffffce, 0xFFfff9f6, 0xFFffffc9, 0xFFfdf7be, + 0xFFf8f87a, 0xFFf9f66b, 0xFFf9f35c, 0xFFf5ee56, 0xFFf1e84f, 0xFFf8ee37, 0xFF3fa7ea, 0xFF189df5, + 0xFF179df4, 0xFF169cf1, 0xFF159bee, 0xFF169af2, 0xFF1798f5, 0xFF1596f3, 0xFF1394f1, 0xFF1394f1, + 0xFF66d7fc, 0xFF5fd1f5, 0xFF60d4f6, 0xFF59d8f9, 0xFF399ddb, 0xFF0858be, 0xFF096ccd, 0xFF0c7ad2, + 0xFF1087d7, 0xFF1296df, 0xFF13a6e8, 0xFF13b0eb, 0xFF1bc3f5, 0xFF0fc8f3, 0xFF17d0f9, 0xFF27d3f4, + 0xFF4bd7f7, 0xFF61dbf8, 0xFF77def9, 0xFF7fe0fa, 0xFF88e1fa, 0xFF8de4fb, 0xFF91e7fb, 0xFF96eafc, + 0xFF9aedfd, 0xFF9feafb, 0xFFa3e7fa, 0xFF5eccfb, 0xFF2db7f5, 0xFF24b8f9, 0xFF14b1f5, 0xFFfffbff, + 0xFFfeffec, 0xFFffffed, 0xFFffffee, 0xFFffffec, 0xFFfefdeb, 0xFFfefde4, 0xFFfefddd, 0xFFfefed6, + 0xFFfefece, 0xFFfcfdc1, 0xFFfcfcb5, 0xFFf6fb8d, 0xFFf8fc8a, 0xFFf8facc, 0xFFf8fef2, 0xFFf9ffbe, + 0xFFfbf9c2, 0xFFfbf8ac, 0xFFfcf796, 0xFFfaf491, 0xFFf7f18d, 0xFFffe5a9, 0xFF0096f7, 0xFF089af7, + 0xFF159ef7, 0xFF169df4, 0xFF169cf0, 0xFF169bf2, 0xFF1699f4, 0xFF1497f3, 0xFF1396f1, 0xFF1396f1, + 0xFF6bd9fb, 0xFF61cef1, 0xFF67d3f7, 0xFF5cdefd, 0xFF1f6cc0, 0xFF0f63bf, 0xFF0f6acd, 0xFF1478d1, + 0xFF1887d4, 0xFF1997df, 0xFF1aa6e9, 0xFF14a9e4, 0xFF1dbbef, 0xFF0dbeeb, 0xFF23c5f6, 0xFF13c6ed, + 0xFF2acbf3, 0xFF40cff4, 0xFF56d4f4, 0xFF65d7f6, 0xFF74daf7, 0xFF7bdffb, 0xFF83e5fe, 0xFF86e6fe, + 0xFF89e8fd, 0xFF8ee5fb, 0xFF92e2fa, 0xFF33bcfc, 0xFF32b9f7, 0xFF31bafd, 0xFF57c5f7, 0xFFf4ffde, + 0xFFfdffe7, 0xFFffffe7, 
0xFFffffe7, 0xFFffffe6, 0xFFfdfce6, 0xFFfdfddd, 0xFFfdfdd5, 0xFFfdfdcd, + 0xFFfefdc5, 0xFFfdfaba, 0xFFfcf8af, 0xFFfef99f, 0xFFfffb8e, 0xFFfafe77, 0xFFf4fb7d, 0xFFf9f8d2, + 0xFFfdffee, 0xFFfefedf, 0xFFfffcd0, 0xFFfefacd, 0xFFfdf9ca, 0xFFa6d3ce, 0xFF0399eb, 0xFF1ea1ec, + 0xFF149ffa, 0xFF159ef6, 0xFF179ef2, 0xFF169cf3, 0xFF159af3, 0xFF1499f2, 0xFF1398f1, 0xFF1398f1, + 0xFF55d4f4, 0xFF5bd1f1, 0xFF69d6f6, 0xFF6ee2ff, 0xFF0c50a8, 0xFF1161be, 0xFF0f6acd, 0xFF1f83d6, + 0xFF1f89dc, 0xFF0f8cdd, 0xFF1a9be0, 0xFF22b1f4, 0xFF1dabe1, 0xFF14aedf, 0xFF26bdee, 0xFF15bae7, + 0xFF1fc1ef, 0xFF25c7ef, 0xFF2bcdef, 0xFF3dcdf1, 0xFF4ecef3, 0xFF5bd6f9, 0xFF68defe, 0xFF6eddfc, + 0xFF73ddfb, 0xFF76ddf5, 0xFF70d3f7, 0xFF31bafb, 0xFF33b9f6, 0xFF24b6ff, 0xFFa4dee5, 0xFFf9ffdc, + 0xFFfdfedc, 0xFFffffdc, 0xFFffffdc, 0xFFfefedb, 0xFFfcfdda, 0xFFfdfdd2, 0xFFfdfdcb, 0xFFfdfdc3, + 0xFFfefdbc, 0xFFfdfbaf, 0xFFfcfaa2, 0xFFfdfb93, 0xFFfefb83, 0xFFfcfd6b, 0xFFf9fc60, 0xFFfbf85d, + 0xFFfdf74c, 0xFFfef576, 0xFFfff2a1, 0xFFf6ec87, 0xFFf8e360, 0xFF51bbb4, 0xFF0d9afe, 0xFF1a9ef7, + 0xFF159ef6, 0xFF159df4, 0xFF159df2, 0xFF149bf2, 0xFF1299f2, 0xFF1299f2, 0xFF1299f2, 0xFF1299f2, + 0xFF67d4fd, 0xFF69d6f9, 0xFF6cd9f5, 0xFF4fb7dc, 0xFF1953af, 0xFF1c67c6, 0xFF005abd, 0xFF1a7eca, + 0xFF157bd4, 0xFF0581dc, 0xFF2aa1e7, 0xFF0189d3, 0xFF2dabe3, 0xFF23a7dc, 0xFF29b4e6, 0xFF17ade1, + 0xFF14b7ec, 0xFF15b9ea, 0xFF16bbe9, 0xFF1fbfec, 0xFF28c2ef, 0xFF3bcdf7, 0xFF4ed8ff, 0xFF56d5fb, + 0xFF5dd2f8, 0xFF5ed6f0, 0xFF4ec5f4, 0xFF2fb9fa, 0xFF35b8f4, 0xFF17b1ff, 0xFFf0f7d2, 0xFFfeffda, + 0xFFfdfcd2, 0xFFfdfdd1, 0xFFfdfed1, 0xFFfdfecf, 0xFFfcfecd, 0xFFfcfdc7, 0xFFfdfdc0, 0xFFfdfdb9, + 0xFFfdfdb2, 0xFFfdfca4, 0xFFfdfc95, 0xFFfdfc87, 0xFFfdfc79, 0xFFfdfa6c, 0xFFfef85f, 0xFFf9f645, + 0xFFf6ef47, 0xFFf2e938, 0xFFefe428, 0xFFeee425, 0xFFffdd05, 0xFF0399ff, 0xFF17a1f5, 0xFF179ef4, + 0xFF169cf3, 0xFF159cf3, 0xFF149cf3, 0xFF129bf1, 0xFF1099f0, 0xFF119af1, 0xFF129bf2, 0xFF129bf2, + 0xFF66d5fb, 0xFF70d5fc, 0xFF78e2ff, 0xFF3b86c7, 0xFF235fba, 0xFF1e6aba, 0xFF227ad1, 0xFF2787d8, + 0xFF248cd7, 0xFF1d8dd4, 0xFF2189d1, 0xFF2ca1ea, 0xFF2296d5, 0xFF31aaef, 0xFF20a1db, 0xFF17a1dd, + 0xFF0ea1e0, 0xFF1aace3, 0xFF13b1eb, 0xFF10b8ed, 0xFF0dc0ef, 0xFF1cc1ef, 0xFF2cc3f0, 0xFF36c4f2, + 0xFF40c5f4, 0xFF47c9f2, 0xFF45c3f6, 0xFF31bafa, 0xFF31b7f7, 0xFF4cc2f4, 0xFFf5fac0, 0xFFfdffc6, + 0xFFfdfcc5, 0xFFfdfdc4, 0xFFfdfdc4, 0xFFfcfdc2, 0xFFfbfdc1, 0xFFf8f9b6, 0xFFfdfdb3, 0xFFfdfdab, + 0xFFfdfca3, 0xFFfcfc95, 0xFFfcfb88, 0xFFfcfb7b, 0xFFfbfb6d, 0xFFfcf962, 0xFFfcf757, 0xFFf8f245, + 0xFFf4eb41, 0xFFf0e532, 0xFFebe023, 0xFFfbe01c, 0xFFc5d244, 0xFF0aa2fe, 0xFF169ff9, 0xFF179ff6, + 0xFF189ff3, 0xFF179ef2, 0xFF159df2, 0xFF179ff5, 0xFF18a1f8, 0xFF159ef5, 0xFF129bf2, 0xFF129bf2, + 0xFF65d7fa, 0xFF64d1f7, 0xFF5de7ff, 0xFF04439b, 0xFF0e4ca5, 0xFF317bcd, 0xFF0455c1, 0xFF0053c9, + 0xFF0368c6, 0xFF2687ca, 0xFF2881ca, 0xFF2789d1, 0xFF2791d7, 0xFF0774c9, 0xFF178dcf, 0xFF1f9ce1, + 0xFF179be4, 0xFF1e9eda, 0xFF0097de, 0xFF03a5e6, 0xFF08b1ee, 0xFF09b0e8, 0xFF0aafe2, 0xFF17b4e9, + 0xFF24b9ef, 0xFF30bdf4, 0xFF3cc1f9, 0xFF34bcf9, 0xFF2cb6f9, 0xFF80d2e8, 0xFFfafdaf, 0xFFfcfdb3, + 0xFFfdfcb7, 0xFFfdfcb7, 0xFFfdfdb7, 0xFFfcfcb6, 0xFFfbfcb5, 0xFFf4f4a5, 0xFFfdfda5, 0xFFfcfc9d, + 0xFFfcfc94, 0xFFfbfb87, 0xFFfbfb7b, 0xFFfafa6e, 0xFFfafa61, 0xFFfaf758, 0xFFfaf54e, 0xFFf7ee44, + 0xFFf3e73a, 0xFFede12c, 0xFFe7db1e, 0xFFffd21a, 0xFF78b090, 0xFF09a0fd, 0xFF159dfd, 0xFF18a0f8, + 0xFF1aa2f2, 0xFF18a0f2, 0xFF169ef2, 0xFF139bf2, 0xFF1099f1, 0xFF119af2, 0xFF129bf3, 0xFF129bf3, + 0xFF60d4f7, 0xFF67dcfd, 0xFF4fc2f0, 0xFF002c8a, 
0xFF2e6bc0, 0xFF0547ad, 0xFF0044ba, 0xFF3685c4, + 0xFF064ebc, 0xFF1462c3, 0xFF2d70cb, 0xFF0f5ab4, 0xFF2274cd, 0xFF1169c2, 0xFF1979c2, 0xFF1d80d0, + 0xFF1980d7, 0xFF1a86d3, 0xFF1090de, 0xFF038dda, 0xFF0599e6, 0xFF059ce1, 0xFF049edd, 0xFF05a6e1, + 0xFF00a7de, 0xFF1fb6ee, 0xFF39bdf7, 0xFF38bcf6, 0xFF24b5fc, 0xFFbfe8b9, 0xFFfafea2, 0xFFfbfca5, + 0xFFfcfaa8, 0xFFfcfca7, 0xFFfdfda6, 0xFFfbfca3, 0xFFf9fb9f, 0xFFf6f795, 0xFFfafb92, 0xFFfbfb8b, + 0xFFfbfb85, 0xFFfafa79, 0xFFfafa6d, 0xFFf9f961, 0xFFf8f956, 0xFFf9f64c, 0xFFf9f442, 0xFFf5ec39, + 0xFFf2e531, 0xFFefde28, 0xFFecd620, 0xFFeed900, 0xFF32a6e5, 0xFF19a4ff, 0xFF29a4f4, 0xFF20a2f4, + 0xFF18a0f5, 0xFF179ef4, 0xFF159df4, 0xFF139bf3, 0xFF1199f2, 0xFF129af2, 0xFF129af3, 0xFF129af3, + 0xFF5bd1f5, 0xFF63dffa, 0xFF318dcc, 0xFF062d91, 0xFF0e499a, 0xFF00369f, 0xFF003897, 0xFF155fb6, + 0xFF53aad9, 0xFF31a6e2, 0xFF45bcef, 0xFF6dddff, 0xFF76defa, 0xFF6dd9f9, 0xFF64d5f9, 0xFF54c5f3, + 0xFF45b5ed, 0xFF238ed6, 0xFF1277ce, 0xFF006cc6, 0xFF0282de, 0xFF0187db, 0xFF008dd7, 0xFF079be1, + 0xFF0099dc, 0xFF22b1f0, 0xFF36baf4, 0xFF3cbcf4, 0xFF1cb5ff, 0xFFfffe89, 0xFFfbff96, 0xFFfbfc98, + 0xFFfbf99a, 0xFFfcfb98, 0xFFfdfd96, 0xFFfafb90, 0xFFf6f98a, 0xFFf7f984, 0xFFf8fa7f, 0xFFfafa7a, + 0xFFfbfb75, 0xFFfafa6a, 0xFFf9f960, 0xFFf8f855, 0xFFf7f84a, 0xFFf7f540, 0xFFf8f336, 0xFFf4eb2f, + 0xFFf0e328, 0xFFf0da24, 0xFFf0d121, 0xFFe9ca24, 0xFF049bff, 0xFF20a3f6, 0xFF16a1f7, 0xFF16a0f7, + 0xFF169ef7, 0xFF159df6, 0xFF149cf5, 0xFF139bf4, 0xFF129af3, 0xFF129af3, 0xFF129af3, 0xFF129af3, + 0xFF5ae3ff, 0xFF64d8ff, 0xFF0d4798, 0xFF002682, 0xFF1d6bb7, 0xFF3aa2de, 0xFF5fe5ff, 0xFF52d8fd, + 0xFF4dd6f6, 0xFF48ccf5, 0xFF5fd0f6, 0xFF68d9ff, 0xFF61d3f8, 0xFF5bd2f8, 0xFF42cbff, 0xFF53cefe, + 0xFF51cff5, 0xFF49caf6, 0xFF4acdff, 0xFF40baff, 0xFF0e7edb, 0xFF0069c2, 0xFF0584da, 0xFF0184d5, + 0xFF068cd8, 0xFF38bef8, 0xFF3abef7, 0xFF35beff, 0xFF62c7e2, 0xFFfbf379, 0xFFf8fa83, 0xFFf9f983, + 0xFFfaf884, 0xFFf9f77f, 0xFFf7f77b, 0xFFf8f979, 0xFFf9fa77, 0xFFf8f972, 0xFFf7f86c, 0xFFfcfc6c, + 0xFFf9f864, 0xFFf8f85b, 0xFFf8f752, 0xFFf7f649, 0xFFf6f53f, 0xFFf5f237, 0xFFf4ef2f, 0xFFf1e628, + 0xFFeede20, 0xFFead61f, 0xFFf2cc11, 0xFF9db96c, 0xFF0c9ffe, 0xFF1ba3f9, 0xFF17a2f9, 0xFF17a0f9, + 0xFF169ef8, 0xFF169df7, 0xFF159cf6, 0xFF149bf5, 0xFF139af5, 0xFF139af5, 0xFF139af5, 0xFF139af5, + 0xFF60d8f9, 0xFF5bd9f8, 0xFF4cadd7, 0xFF69ddff, 0xFF56ddf8, 0xFF55d6fc, 0xFF55d0ff, 0xFF5cd5ff, + 0xFF53cbf2, 0xFF4bcaf6, 0xFF43cafa, 0xFF47c9f8, 0xFF4cc8f6, 0xFF5ccff1, 0xFF46ccf8, 0xFF55caff, + 0xFF3ec4fa, 0xFF43c3fb, 0xFF48c2fd, 0xFF3ebff4, 0xFF44ccfb, 0xFF37b3fc, 0xFF0b7bdd, 0xFF006dc9, + 0xFF0d80d4, 0xFF4eccff, 0xFF3ec3fa, 0xFF2ec2ff, 0xFFa7dea8, 0xFFf8ec5b, 0xFFf5f570, 0xFFf7f66f, + 0xFFfaf76e, 0xFFf5f467, 0xFFf1f060, 0xFFf6f663, 0xFFfbfc65, 0xFFf8f95f, 0xFFf6f659, 0xFFfefe5d, + 0xFFf7f652, 0xFFf7f54c, 0xFFf7f545, 0xFFf6f33d, 0xFFf6f235, 0xFFf3ef2f, 0xFFf1eb29, 0xFFefe221, + 0xFFecd818, 0xFFe5d21a, 0xFFf3c700, 0xFF52a9b4, 0xFF14a4fb, 0xFF15a3fb, 0xFF17a3fc, 0xFF17a1fa, + 0xFF179ff8, 0xFF169df8, 0xFF159cf7, 0xFF159bf7, 0xFF1499f6, 0xFF1499f6, 0xFF1499f6, 0xFF1499f6, + 0xFF58cff2, 0xFF59ddfd, 0xFF55d5f9, 0xFF5ddeff, 0xFF4dcef3, 0xFF4dcbf3, 0xFF4cc8f3, 0xFF56d2fc, + 0xFF59d3fd, 0xFF50cefb, 0xFF47cafa, 0xFF48c9f9, 0xFF49c7f9, 0xFF51cbf6, 0xFF45c9f9, 0xFF4bc8fd, + 0xFF3fc5f9, 0xFF41c4fa, 0xFF43c2fb, 0xFF3bbdf3, 0xFF3ac0f4, 0xFF3ec7fc, 0xFF3ac6fc, 0xFF25a1e3, + 0xFF1f8dd9, 0xFF37b9f7, 0xFF26bbfa, 0xFF2abbf4, 0xFFced857, 0xFFf9fa5b, 0xFFd9db49, 0xFFedec58, + 0xFFfaf560, 0xFFf2ef4d, 0xFFe9ea3b, 0xFFeeef46, 0xFFf2f451, 0xFFf9f34f, 
0xFFedf145, 0xFFfef84b, + 0xFFf4f542, 0xFFf5f43d, 0xFFf6f337, 0xFFf5f131, 0xFFf5ef2b, 0xFFf2eb27, 0xFFf0e622, 0xFFeedb1d, + 0xFFecd117, 0xFFf1cc09, 0xFFf5c509, 0xFF0fadff, 0xFF17a1f9, 0xFF18a1f9, 0xFF18a1f8, 0xFF18a0f9, + 0xFF179ff9, 0xFF169df9, 0xFF169cf8, 0xFF159bf8, 0xFF1599f8, 0xFF1599f8, 0xFF1599f8, 0xFF1599f8, + 0xFF60d5fb, 0xFF5bd3fb, 0xFF56d2fb, 0xFF55d1fc, 0xFF55d0fe, 0xFF54d0fa, 0xFF53d1f6, 0xFF51cef7, + 0xFF4ecbf8, 0xFF4dcbf9, 0xFF4ccafb, 0xFF49c8fb, 0xFF47c6fc, 0xFF45c6fb, 0xFF43c6fa, 0xFF41c6fa, + 0xFF40c7f9, 0xFF3fc5f9, 0xFF3ec3f9, 0xFF3fc3fb, 0xFF41c4fd, 0xFF38baf2, 0xFF40c1f8, 0xFF3dc3fb, + 0xFF3bc5fe, 0xFF37c1f6, 0xFF34beef, 0xFF2ebcf0, 0xFFded722, 0xFFbfdc38, 0xFFdee142, 0xFFecea4a, + 0xFFeae442, 0xFFeee942, 0xFFf2ee42, 0xFFeeed3f, 0xFFeaec3d, 0xFFfbee3f, 0xFFe5ec31, 0xFFfff239, + 0xFFf2f531, 0xFFf4f32e, 0xFFf5f12a, 0xFFf5ee25, 0xFFf4ec21, 0xFFf2e71e, 0xFFf0e11c, 0xFFeed519, + 0xFFecc917, 0xFFdec40c, 0xFFbbbe39, 0xFF0798f8, 0xFF1a9ff8, 0xFF1a9ff7, 0xFF1a9ff5, 0xFF189ff7, + 0xFF179ff9, 0xFF179ef9, 0xFF169cf9, 0xFF169bf9, 0xFF1699f9, 0xFF1699f9, 0xFF1699f9, 0xFF1699f9, + 0xFF5cd4f9, 0xFF58d4f9, 0xFF55d3f9, 0xFF56d2fa, 0xFF58d0fb, 0xFF56d0f8, 0xFF54d0f6, 0xFF51cef7, + 0xFF4dccf9, 0xFF4ccbfa, 0xFF4bcafb, 0xFF49c8fb, 0xFF47c7fb, 0xFF45c7fb, 0xFF43c6fa, 0xFF41c6fa, + 0xFF40c6f9, 0xFF3fc4f9, 0xFF3ec3f9, 0xFF3ec2fa, 0xFF3ec2fb, 0xFF3abef5, 0xFF3ec2f8, 0xFF3bc1f9, + 0xFF37c0f9, 0xFF36beff, 0xFF35bbff, 0xFF67bb84, 0xFFb0d219, 0xFFb4d31a, 0xFFd3da39, 0xFFe2dd3d, + 0xFFd6d532, 0xFFe1df38, 0xFFece93e, 0xFFe1e636, 0xFFe9e536, 0xFFf1e634, 0xFFe5e42b, 0xFFf6e62e, + 0xFFe9eb29, 0xFFf0ee2a, 0xFFf0e824, 0xFFece420, 0xFFe9e01d, 0xFFebdb1c, 0xFFedd71c, 0xFFe9ce19, + 0xFFe5c516, 0xFFe7c004, 0xFF6cb292, 0xFF109dfc, 0xFF18a1f7, 0xFF1aa0f5, 0xFF1ca0f3, 0xFF19a0f6, + 0xFF179ff9, 0xFF169ef9, 0xFF169cf9, 0xFF159bf8, 0xFF159af8, 0xFF1499f8, 0xFF1499f7, 0xFF1499f7, + 0xFF58d4f6, 0xFF56d4f6, 0xFF54d5f7, 0xFF57d3f7, 0xFF5bd1f8, 0xFF58d0f6, 0xFF54cff5, 0xFF50cef8, + 0xFF4dcdfa, 0xFF4bcbfb, 0xFF4acafb, 0xFF48c9fb, 0xFF46c7fb, 0xFF45c7fa, 0xFF43c7fa, 0xFF42c6fa, + 0xFF40c6f9, 0xFF3fc4f9, 0xFF3ec3f9, 0xFF3dc1f9, 0xFF3cc0f9, 0xFF3cc1f8, 0xFF3cc2f7, 0xFF38bff6, + 0xFF34bbf5, 0xFF35bdfd, 0xFF37beff, 0xFF46bcfc, 0xFF82c92c, 0xFFa0be02, 0xFFb8c420, 0xFFd8cf31, + 0xFFd2d632, 0xFFd4d52e, 0xFFd7d42a, 0xFFcdd725, 0xFFe9df2f, 0xFFe6dd2a, 0xFFe4dc25, 0xFFedd922, + 0xFFe0e220, 0xFFede927, 0xFFeae01e, 0xFFe4da1c, 0xFFded319, 0xFFe5d01a, 0xFFebcd1b, 0xFFe5c818, + 0xFFdec214, 0xFFf0bc00, 0xFF1da5eb, 0xFF19a1ff, 0xFF16a2f7, 0xFF19a2f4, 0xFF1ea2f1, 0xFF1aa0f5, + 0xFF169ff9, 0xFF169ef8, 0xFF159df8, 0xFF159cf8, 0xFF149bf8, 0xFF139af7, 0xFF1299f6, 0xFF1299f6, + 0xFF5ed5f9, 0xFF63d6fc, 0xFF68d6ff, 0xFF5fd3fc, 0xFF56d0f8, 0xFF53cff8, 0xFF51cef8, 0xFF4ecdf9, + 0xFF4bccfb, 0xFF4acbfb, 0xFF48cafb, 0xFF47c9fa, 0xFF46c8fb, 0xFF44c7fa, 0xFF43c7fa, 0xFF42c6fa, + 0xFF40c5f9, 0xFF3fc4f9, 0xFF3ec3f9, 0xFF3dc1f9, 0xFF3cc0f9, 0xFF3bc1f9, 0xFF3bc1f8, 0xFF38bff7, + 0xFF36bdf7, 0xFF35bdfa, 0xFF34bdfe, 0xFF22c3f6, 0xFF27bbfc, 0xFF53b0b2, 0xFF9bc606, 0xFFc1d322, + 0xFFd3dd36, 0xFFb4ba12, 0xFFc4c71f, 0xFFc5cf22, 0xFFd9d82d, 0xFFdfdb30, 0xFFdcd52b, 0xFFe8d520, + 0xFFd5d51c, 0xFFe8e428, 0xFFece324, 0xFFd1ce1f, 0xFFd3c51d, 0xFFdcc302, 0xFFcfc312, 0xFFe3c209, + 0xFFe3be00, 0xFF84bf6e, 0xFF0ca0f6, 0xFF129ffd, 0xFF18a2f6, 0xFF19a1f5, 0xFF1ba1f4, 0xFF18a0f6, + 0xFF169ff8, 0xFF159ef8, 0xFF159df8, 0xFF149cf7, 0xFF139bf7, 0xFF129af6, 0xFF1098f4, 0xFF1098f4, + 0xFF65d7fb, 0xFF5dd4fa, 0xFF56d2f8, 0xFF53d0f9, 0xFF50cff9, 0xFF4fcef9, 0xFF4dcdfa, 0xFF4bcdfa, + 
0xFF4accfb, 0xFF48cbfb, 0xFF47cafb, 0xFF46c9fa, 0xFF45c8fa, 0xFF44c7fa, 0xFF43c7fa, 0xFF42c6fa, + 0xFF40c5fa, 0xFF3fc4f9, 0xFF3ec3f9, 0xFF3dc1f9, 0xFF3bc0f9, 0xFF3ac0f9, 0xFF39c0f9, 0xFF38bff9, + 0xFF37bff9, 0xFF34bef8, 0xFF31bcf7, 0xFF33bbf8, 0xFF35bbfa, 0xFF2cbcff, 0xFF61c2df, 0xFF93cb85, + 0xFFc5d52b, 0xFFcbd82f, 0xFFb0bb13, 0xFFb5be17, 0xFFb9c21b, 0xFFc7c826, 0xFFc5bf21, 0xFFdbc817, + 0xFFcac819, 0xFFdbd722, 0xFFddd61a, 0xFFb7bd0d, 0xFFc8bd04, 0xFFd0c000, 0xFFadc951, 0xFF6cb8b1, + 0xFF04a3ff, 0xFF13a4fb, 0xFF21a4f5, 0xFF1ea3f5, 0xFF1aa1f6, 0xFF19a1f6, 0xFF18a0f7, 0xFF17a0f7, + 0xFF169ff8, 0xFF159ef7, 0xFF149ef7, 0xFF139df7, 0xFF139cf6, 0xFF119af4, 0xFF0f98f2, 0xFF0f98f2, + 0xFF5cd5f9, 0xFF58d3f8, 0xFF53d1f8, 0xFF52d0f9, 0xFF50cff9, 0xFF4ecefa, 0xFF4ccdfa, 0xFF4accfa, + 0xFF48ccfa, 0xFF47cbfa, 0xFF46cafa, 0xFF45c9fa, 0xFF44c8fa, 0xFF43c7fa, 0xFF42c7fa, 0xFF41c6fa, + 0xFF40c5fa, 0xFF3fc4f9, 0xFF3ec2f9, 0xFF3cc1f9, 0xFF3bc0f9, 0xFF3ac0f9, 0xFF38bff9, 0xFF37bff9, + 0xFF36bff9, 0xFF35bdf6, 0xFF34bbf3, 0xFF35b9f7, 0xFF35b8fb, 0xFF22b5ff, 0xFF2fb5ff, 0xFF4dbae6, + 0xFF6bbfce, 0xFF27b1c5, 0xFF6cbc7c, 0xFF8abd49, 0xFFa7be15, 0xFFb9bf09, 0xFFccc000, 0xFFdac43d, + 0xFFbbca20, 0xFFaec73e, 0xFF99bc54, 0xFF5aad8b, 0xFF36abc4, 0xFF04b3ff, 0xFF15a7ff, 0xFF21a4ff, + 0xFF19a0fb, 0xFF1ba2fa, 0xFF1da4f9, 0xFF1ba3f8, 0xFF1aa1f7, 0xFF19a1f7, 0xFF18a0f7, 0xFF17a0f7, + 0xFF169ff8, 0xFF159ef7, 0xFF149ef7, 0xFF139df7, 0xFF129cf6, 0xFF119af5, 0xFF0f99f3, 0xFF0f99f3, + 0xFF53d2f6, 0xFF52d1f7, 0xFF51d1f8, 0xFF50d0f9, 0xFF4fcffa, 0xFF4dcefa, 0xFF4bcdfa, 0xFF49ccfa, + 0xFF47cbfa, 0xFF46caf9, 0xFF45caf9, 0xFF44c9f9, 0xFF44c8fa, 0xFF43c7fa, 0xFF42c6f9, 0xFF41c6f9, + 0xFF40c5fa, 0xFF3fc4f9, 0xFF3dc2f9, 0xFF3cc1f9, 0xFF3ac0f9, 0xFF39c0f9, 0xFF38bff9, 0xFF36bff9, + 0xFF35bef8, 0xFF36bcf4, 0xFF38baf0, 0xFF36b8f6, 0xFF34b5fc, 0xFF2cb6f9, 0xFF23b7f6, 0xFF25b5fa, + 0xFF28b4ff, 0xFF28b6ff, 0xFF29b7ff, 0xFF1fb5ff, 0xFF15b2ff, 0xFF20aef7, 0xFF3cb9ff, 0xFF5acbf0, + 0xFF42befa, 0xFF2ab6fc, 0xFF12adff, 0xFF18acfc, 0xFF1eacfa, 0xFF1ea9fd, 0xFF1ea7ff, 0xFF1ba8fa, + 0xFF18a8f4, 0xFF18a6f8, 0xFF18a4fd, 0xFF19a3fa, 0xFF1aa1f7, 0xFF19a1f7, 0xFF18a0f8, 0xFF17a0f8, + 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf6, 0xFF119af5, 0xFF1099f4, 0xFF1099f4, + 0xFF54d1f8, 0xFF52d1f8, 0xFF51d0f9, 0xFF4fcff9, 0xFF4ecffa, 0xFF4ccefa, 0xFF4acdf9, 0xFF48ccf9, + 0xFF45cbf9, 0xFF45caf9, 0xFF44c9f9, 0xFF43c8f9, 0xFF43c8f9, 0xFF42c7f9, 0xFF42c6f9, 0xFF41c5f9, + 0xFF40c5fa, 0xFF3fc4f9, 0xFF3dc2f9, 0xFF3bc1f9, 0xFF3ac0fa, 0xFF38bff9, 0xFF37bff9, 0xFF36bef9, + 0xFF34bef8, 0xFF35bcf6, 0xFF35baf5, 0xFF34b8f8, 0xFF33b6fc, 0xFF2eb6f9, 0xFF29b6f7, 0xFF29b5f8, + 0xFF2ab4fa, 0xFF2ab5fb, 0xFF2ab5fc, 0xFF2ab2f6, 0xFF2aafef, 0xFF1ba9f6, 0xFF9bcfd9, 0xFF6dcfe9, + 0xFF74c7e4, 0xFF80c9dd, 0xFF19adfb, 0xFF1cacf9, 0xFF1fabf8, 0xFF1fa9f9, 0xFF1ea7fb, 0xFF1ca7f9, + 0xFF1aa7f6, 0xFF1aa5f8, 0xFF1aa4fb, 0xFF1aa3fa, 0xFF1aa2f8, 0xFF19a1f8, 0xFF18a0f8, 0xFF17a0f8, + 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf6, 0xFF119bf5, 0xFF119af5, 0xFF119af5, + 0xFF55d0f9, 0xFF53d0fa, 0xFF51d0fa, 0xFF4fcffa, 0xFF4dcffa, 0xFF4bcefa, 0xFF49cdf9, 0xFF46ccf9, + 0xFF44caf8, 0xFF43caf8, 0xFF43c9f8, 0xFF43c8f9, 0xFF42c8f9, 0xFF42c7f9, 0xFF41c6f9, 0xFF41c6f9, + 0xFF40c5fa, 0xFF3ec3f9, 0xFF3dc2fa, 0xFF3bc1fa, 0xFF39c0fa, 0xFF38bff9, 0xFF36bff9, 0xFF35bef9, + 0xFF34bdf8, 0xFF33bcf9, 0xFF33bafa, 0xFF32b9fb, 0xFF32b8fc, 0xFF30b7fa, 0xFF2eb6f8, 0xFF2db5f7, + 0xFF2bb4f5, 0xFF2bb4f6, 0xFF2bb3f7, 0xFF29b2f9, 0xFF28b2fc, 0xFF30b2f7, 0xFF12a8fe, 0xFF7fd4e1, + 0xFF58bbe6, 0xFF15aafb, 
0xFF1fadf8, 0xFF20acf7, 0xFF20aaf5, 0xFF1fa9f6, 0xFF1ea8f7, 0xFF1da6f7, + 0xFF1ca5f8, 0xFF1ca4f8, 0xFF1ba3f9, 0xFF1ba3f9, 0xFF1ba2f9, 0xFF19a1f9, 0xFF18a0f8, 0xFF17a0f8, + 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, + 0xFF55d0f9, 0xFF53d0fa, 0xFF51d0fa, 0xFF4fcffa, 0xFF4dcffa, 0xFF4bcefa, 0xFF49cdf9, 0xFF46ccf9, + 0xFF44caf8, 0xFF43caf8, 0xFF43c9f8, 0xFF43c8f9, 0xFF42c8f9, 0xFF42c7f9, 0xFF41c6f9, 0xFF41c6f9, + 0xFF40c5fa, 0xFF3ec3f9, 0xFF3dc2fa, 0xFF3bc1fa, 0xFF39c0fa, 0xFF38bff9, 0xFF36bff9, 0xFF35bef9, + 0xFF34bdf8, 0xFF33bcf9, 0xFF33bafa, 0xFF32b9fb, 0xFF32b8fc, 0xFF30b7fa, 0xFF2eb6f8, 0xFF2db5f7, + 0xFF2bb4f5, 0xFF2bb4f6, 0xFF2bb3f7, 0xFF2ab2f8, 0xFF29b2fa, 0xFF2db6f5, 0xFF1db5f6, 0xFF239bff, + 0xFF20b6f3, 0xFF0cacfb, 0xFF1eacf7, 0xFF1fabf6, 0xFF20aaf5, 0xFF1fa9f6, 0xFF1ea8f7, 0xFF1da6f7, + 0xFF1ca5f8, 0xFF1ca4f8, 0xFF1ba3f9, 0xFF1ba3f9, 0xFF1ba2f9, 0xFF19a1f9, 0xFF18a0f8, 0xFF17a0f8, + 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, + 0xFF55d0f9, 0xFF53d0fa, 0xFF51d0fa, 0xFF4fcffa, 0xFF4dcffa, 0xFF4bcefa, 0xFF49cdf9, 0xFF46ccf9, + 0xFF44caf8, 0xFF43caf8, 0xFF43c9f8, 0xFF43c8f9, 0xFF42c8f9, 0xFF42c7f9, 0xFF41c6f9, 0xFF41c6f9, + 0xFF40c5fa, 0xFF3ec3f9, 0xFF3dc2fa, 0xFF3bc1fa, 0xFF39c0fa, 0xFF38bff9, 0xFF36bff9, 0xFF35bef9, + 0xFF34bdf8, 0xFF33bcf9, 0xFF33bafa, 0xFF32b9fb, 0xFF32b8fc, 0xFF30b7fa, 0xFF2eb6f8, 0xFF2db5f7, + 0xFF2bb4f5, 0xFF2bb4f6, 0xFF2bb3f7, 0xFF2bb2f8, 0xFF2bb1f8, 0xFF22aff9, 0xFF19acfa, 0xFF1eadf7, + 0xFF24aef3, 0xFF20adf5, 0xFF1dabf6, 0xFF1fabf6, 0xFF20aaf5, 0xFF1fa9f6, 0xFF1ea8f7, 0xFF1da6f7, + 0xFF1ca5f8, 0xFF1ca4f8, 0xFF1ba3f9, 0xFF1ba3f9, 0xFF1ba2f9, 0xFF19a1f9, 0xFF18a0f8, 0xFF17a0f8, + 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, + 0xFF55d0f9, 0xFF53d0fa, 0xFF51d0fa, 0xFF4fcffa, 0xFF4dcffa, 0xFF4bcefa, 0xFF49cdf9, 0xFF46ccf9, + 0xFF44caf8, 0xFF43caf8, 0xFF43c9f8, 0xFF43c8f9, 0xFF42c8f9, 0xFF42c7f9, 0xFF41c6f9, 0xFF41c6f9, + 0xFF40c5fa, 0xFF3ec3f9, 0xFF3dc2fa, 0xFF3bc1fa, 0xFF39c0fa, 0xFF38bff9, 0xFF36bff9, 0xFF35bef9, + 0xFF34bdf8, 0xFF33bcf9, 0xFF33bafa, 0xFF32b9fb, 0xFF32b8fc, 0xFF30b7fa, 0xFF2eb6f8, 0xFF2db5f7, + 0xFF2bb4f5, 0xFF2bb4f6, 0xFF2bb3f7, 0xFF2bb2f8, 0xFF2bb1f8, 0xFF22aff9, 0xFF19acfa, 0xFF1eadf7, + 0xFF24aef3, 0xFF20adf5, 0xFF1dabf6, 0xFF1fabf6, 0xFF20aaf5, 0xFF1fa9f6, 0xFF1ea8f7, 0xFF1da6f7, + 0xFF1ca5f8, 0xFF1ca4f8, 0xFF1ba3f9, 0xFF1ba3f9, 0xFF1ba2f9, 0xFF19a1f9, 0xFF18a0f8, 0xFF17a0f8, + 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5 +}; + +static int test_bmp_cmp_count(const BYTE* mem1, const BYTE* mem2, int size, int channel, int margin) +{ + int error = 0; + int count = 0; + size /= 4; + mem1 += channel; + mem2 += channel; + + for (int index = 0; index < size; index++) + { + if (*mem1 != *mem2) + { + error = (*mem1 > *mem2) ? 
*mem1 - *mem2 : *mem2 - *mem1; + + if (error > margin) + count++; + } + + mem1 += 4; + mem2 += 4; + } + + return count; +} + +static int test_bmp_cmp_dump(const BYTE* actual, const BYTE* expected, int size, int channel, + int margin) +{ + int error[3]; + int count = 0; + size /= 4; + actual += channel; + expected += channel; + + for (int index = 0; index < size; index++) + { + if (*actual != *expected) + { + const UINT32 pixel = *((const UINT32*)&actual[-channel]); + const UINT32 ePixel = *((const UINT32*)&expected[-channel]); + const INT16 Y = TEST_Y_COMPONENT[index]; + const INT16 Cb = TEST_CB_COMPONENT[index]; + const INT16 Cr = TEST_CR_COMPONENT[index]; + const int x = index % 64; + const int y = (index - x) / 64; + BYTE R = 0; + BYTE G = 0; + BYTE B = 0; + BYTE eR = 0; + BYTE eG = 0; + BYTE eB = 0; + + FreeRDPSplitColor(pixel, PIXEL_FORMAT_XRGB32, &R, &G, &B, NULL, NULL); + FreeRDPSplitColor(ePixel, PIXEL_FORMAT_XRGB32, &eR, &eG, &eB, NULL, NULL); + error[0] = (R > eR) ? R - eR : eR - R; + error[1] = (G > eG) ? G - eG : eG - G; + error[2] = (B > eB) ? B - eB : eB - B; + + if ((error[0] > margin) || (error[1] > margin) || (error[2] > margin)) + { + printf("(%2d,%2d) Y: %+5" PRId16 " Cb: %+5" PRId16 " Cr: %+5" PRId16 + " R: %03" PRIu8 "/%03" PRIu8 " G: %03" PRIu8 "/%03" PRIu8 " B: %03" PRIu8 + "/%03" PRIu8 " %d %d %d\n", + x, y, Y, Cb, Cr, R, eR, G, eG, B, eB, R - eR, G - eG, B - eB); + count++; + } + } + + actual += 4; + expected += 4; + } + + return count; +} + +static int test_PrimitivesYCbCr(const primitives_t* prims, UINT32 format, prim_size_t roi, + BOOL compare) +{ + union + { + const INT16** cpi; + INT16** pi; + const UINT16** cpv; + UINT16** pv; + } cnv; + pstatus_t status = -1; + int cnt[3]; + float err[3]; + BYTE* actual = NULL; + BYTE* actual1 = NULL; + const BYTE* expected = (const BYTE*)TEST_XRGB_IMAGE; + int margin = 1; + INT16* pYCbCr[3] = { NULL, NULL, NULL }; + const UINT32 srcStride = roi.width * 2; + const UINT32 dstStride = roi.width * FreeRDPGetBytesPerPixel(format); + const UINT32 srcSize = srcStride * roi.height; + const UINT32 dstSize = dstStride * roi.height; + PROFILER_DEFINE(prof) + PROFILER_DEFINE(prof1) + PROFILER_DEFINE(prof2) + // return test_YCbCr_pixels(); + + actual = winpr_aligned_malloc(dstSize, 16); + actual1 = winpr_aligned_malloc(dstSize, 16); + PROFILER_CREATE(prof, "yCbCrToRGB_16s8u") + PROFILER_CREATE(prof1, "yCbCrToRGB16s16s") + PROFILER_CREATE(prof2, "RGBToRGB_16s8u") + + if (!actual || !actual1) + goto fail; + + ZeroMemory(actual, dstSize); + ZeroMemory(actual1, dstSize); + pYCbCr[0] = winpr_aligned_malloc(srcSize, 16); + pYCbCr[1] = winpr_aligned_malloc(srcSize, 16); + pYCbCr[2] = winpr_aligned_malloc(srcSize, 16); + + if (!pYCbCr[0] || !pYCbCr[1] || !pYCbCr[2]) + goto fail; + + winpr_RAND(pYCbCr[0], srcSize); + winpr_RAND(pYCbCr[1], srcSize); + winpr_RAND(pYCbCr[2], srcSize); + + if (compare) + { + memcpy(pYCbCr[0], TEST_Y_COMPONENT, srcSize); + memcpy(pYCbCr[1], TEST_CB_COMPONENT, srcSize); + memcpy(pYCbCr[2], TEST_CR_COMPONENT, srcSize); + } + + { + PROFILER_ENTER(prof) + cnv.pi = pYCbCr; + status = + prims->yCbCrToRGB_16s8u_P3AC4R(cnv.cpi, srcStride, actual, dstStride, format, &roi); + if (status != PRIMITIVES_SUCCESS) + goto fail; + + PROFILER_EXIT(prof) + } + + { + INT16* pSrcDst[3]; + pSrcDst[0] = winpr_aligned_malloc(srcSize, 16); + pSrcDst[1] = winpr_aligned_malloc(srcSize, 16); + pSrcDst[2] = winpr_aligned_malloc(srcSize, 16); + CopyMemory(pSrcDst[0], pYCbCr[0], srcSize); + CopyMemory(pSrcDst[1], pYCbCr[1], srcSize); + 
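/* Working copies: the yCbCrToRGB_16s16s_P3P3 call below converts in
+		 * place (pSrcDst is passed as both source and destination), so the
+		 * original pYCbCr planes stay intact. */
+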
CopyMemory(pSrcDst[2], pYCbCr[2], srcSize); + PROFILER_ENTER(prof1) + cnv.pi = pSrcDst; + status = prims->yCbCrToRGB_16s16s_P3P3(cnv.cpi, srcStride, pSrcDst, srcStride, &roi); + PROFILER_EXIT(prof1) + + if (status != PRIMITIVES_SUCCESS) + goto fail2; + + PROFILER_ENTER(prof2) + status = prims->RGBToRGB_16s8u_P3AC4R(cnv.cpi, srcStride, actual1, dstStride, format, &roi); + PROFILER_EXIT(prof2) + fail2: + winpr_aligned_free(pSrcDst[0]); + winpr_aligned_free(pSrcDst[1]); + winpr_aligned_free(pSrcDst[2]); + + if (status != PRIMITIVES_SUCCESS) + goto fail; + } + + if (compare) + { + cnt[2] = test_bmp_cmp_count(actual, expected, dstSize, 2, margin); /* red */ + err[2] = ((float)cnt[2]) / ((float)dstSize / 4.0f) * 100.0f; + cnt[1] = test_bmp_cmp_count(actual, expected, dstSize, 1, margin); /* green */ + err[1] = ((float)cnt[1]) / ((float)dstSize / 4.0f) * 100.0f; + cnt[0] = test_bmp_cmp_count(actual, expected, dstSize, 0, margin); /* blue */ + err[0] = ((float)cnt[0]) / ((float)dstSize / 4.0f) * 100.0f; + + if (cnt[0] || cnt[1] || cnt[2]) + { + printf("Summary information yCbCrToRGB_16s8u_P3AC4R\n"); + printf("Red Error Dump:\n"); + test_bmp_cmp_dump(actual, expected, dstSize, 2, margin); /* red */ + printf("Green Error Dump:\n"); + test_bmp_cmp_dump(actual, expected, dstSize, 1, margin); /* green */ + printf("Blue Error Dump:\n"); + test_bmp_cmp_dump(actual, expected, dstSize, 0, margin); /* blue */ + printf("R: diff: %d (%f%%)\n", cnt[2], err[2]); + printf("G: diff: %d (%f%%)\n", cnt[1], err[1]); + printf("B: diff: %d (%f%%)\n", cnt[0], err[0]); + } + + cnt[2] = test_bmp_cmp_count(actual1, expected, dstSize, 2, margin); /* red */ + err[2] = ((float)cnt[2]) / ((float)dstSize / 4.0f) * 100.0f; + cnt[1] = test_bmp_cmp_count(actual1, expected, dstSize, 1, margin); /* green */ + err[1] = ((float)cnt[1]) / ((float)dstSize / 4.0f) * 100.0f; + cnt[0] = test_bmp_cmp_count(actual1, expected, dstSize, 0, margin); /* blue */ + err[0] = ((float)cnt[0]) / ((float)dstSize / 4.0f) * 100.0f; + + if (cnt[0] || cnt[1] || cnt[2]) + { + printf("Summary information yCbCrToRGB_16s16s_P3P3 & RGBToRGB_16s8u_P3AC4R\n"); + printf("Red Error Dump:\n"); + test_bmp_cmp_dump(actual1, expected, dstSize, 2, margin); /* red */ + printf("Green Error Dump:\n"); + test_bmp_cmp_dump(actual1, expected, dstSize, 1, margin); /* green */ + printf("Blue Error Dump:\n"); + test_bmp_cmp_dump(actual1, expected, dstSize, 0, margin); /* blue */ + printf("R: diff: %d (%f%%)\n", cnt[2], err[2]); + printf("G: diff: %d (%f%%)\n", cnt[1], err[1]); + printf("B: diff: %d (%f%%)\n", cnt[0], err[0]); + } + } + + PROFILER_PRINT_HEADER + PROFILER_PRINT(prof) + PROFILER_PRINT(prof1) + PROFILER_PRINT(prof2) + PROFILER_PRINT_FOOTER +fail: + winpr_aligned_free((BYTE*)pYCbCr[0]); + winpr_aligned_free((BYTE*)pYCbCr[1]); + winpr_aligned_free((BYTE*)pYCbCr[2]); + winpr_aligned_free(actual); + winpr_aligned_free(actual1); + PROFILER_FREE(prof) + PROFILER_FREE(prof1) + PROFILER_FREE(prof2) + return status; +} + +int TestPrimitivesYCbCr(int argc, char* argv[]) +{ + const UINT32 formats[] = { PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_ARGB32, + PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32, + PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 }; + const primitives_t* prims = primitives_get(); + const primitives_t* generics = primitives_get_generic(); + + WINPR_UNUSED(argv); + + if (argc < 2) + { + { + /* Do content comparison. 
*/ + for (UINT32 x = 0; x < sizeof(formats) / sizeof(formats[0]); x++) + { + prim_size_t roi = { 64, 64 }; + int rc = 0; + printf("----------------------- GENERIC %s [%" PRIu32 "x%" PRIu32 + "] COMPARE CONTENT ----\n", + FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height); + rc = test_PrimitivesYCbCr(generics, formats[x], roi, TRUE); + + if (rc != PRIMITIVES_SUCCESS) + return rc; + + printf("------------------------- END %s ----------------------\n", + FreeRDPGetColorFormatName(formats[x])); + printf("---------------------- OPTIMIZED %s [%" PRIu32 "x%" PRIu32 + "] COMPARE CONTENT ----\n", + FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height); + rc = test_PrimitivesYCbCr(prims, formats[x], roi, TRUE); + + if (rc != PRIMITIVES_SUCCESS) + return rc; + + printf("------------------------- END %s ----------------------\n", + FreeRDPGetColorFormatName(formats[x])); + } + } + /* Do random data conversion with random sizes */ + { + prim_size_t roi; + + do + { + winpr_RAND(&roi.width, sizeof(roi.width)); + roi.width %= 2048 / 4; + } while (roi.width < 16); + + do + { + winpr_RAND(&roi.height, sizeof(roi.height)); + roi.height %= 2048 / 4; + } while (roi.height < 16); + + for (size_t x = 0; x < sizeof(formats) / sizeof(formats[0]); x++) + { + int rc = 0; + printf("----------------------- GENERIC %s [%" PRIu32 "x%" PRIu32 + "] COMPARE CONTENT ----\n", + FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height); + rc = test_PrimitivesYCbCr(generics, formats[x], roi, FALSE); + + if (rc != PRIMITIVES_SUCCESS) + return rc; + + printf("------------------------- END %s ----------------------\n", + FreeRDPGetColorFormatName(formats[x])); + printf("---------------------- OPTIMIZED %s [%" PRIu32 "x%" PRIu32 + "] COMPARE CONTENT ----\n", + FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height); + rc = test_PrimitivesYCbCr(prims, formats[x], roi, FALSE); + + if (rc != PRIMITIVES_SUCCESS) + return rc; + + printf("------------------------- END %s ----------------------\n", + FreeRDPGetColorFormatName(formats[x])); + } + } + } + /* Do a performance run with full HD */ + else + { + prim_size_t roi = { 1928 / 8, 1080 / 8 }; + + for (size_t x = 0; x < sizeof(formats) / sizeof(formats[0]); x++) + { + int rc = 0; + printf("----------------------- GENERIC %s [%" PRIu32 "x%" PRIu32 + "] COMPARE CONTENT ----\n", + FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height); + rc = test_PrimitivesYCbCr(generics, formats[x], roi, FALSE); + + if (rc != PRIMITIVES_SUCCESS) + return rc; + + printf("------------------------- END %s ----------------------\n", + FreeRDPGetColorFormatName(formats[x])); + printf("---------------------- OPTIMIZED %s [%" PRIu32 "x%" PRIu32 + "] COMPARE CONTENT ----\n", + FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height); + rc = test_PrimitivesYCbCr(prims, formats[x], roi, FALSE); + + if (rc != PRIMITIVES_SUCCESS) + return rc; + + printf("------------------------- END %s ----------------------\n", + FreeRDPGetColorFormatName(formats[x])); + } + } + + return 0; +} diff --git a/libfreerdp/primitives/test/TestPrimitivesYCoCg.c b/libfreerdp/primitives/test/TestPrimitivesYCoCg.c new file mode 100644 index 0000000..318aec6 --- /dev/null +++ b/libfreerdp/primitives/test/TestPrimitivesYCoCg.c @@ -0,0 +1,145 @@ +/* test_YCoCg.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2014 Hewlett-Packard Development Company, L.P. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+#include <freerdp/utils/profiler.h>
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_YCoCgRToRGB_8u_AC4R_func(UINT32 width, UINT32 height)
+{
+	pstatus_t status = -1;
+	BYTE* out_sse = NULL;
+	BYTE* in = NULL;
+	BYTE* out_c = NULL;
+	const UINT32 srcStride = width * 4;
+	const UINT32 size = srcStride * height;
+	const UINT32 formats[] = { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32,
+		                       PIXEL_FORMAT_RGBX32, PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
+	PROFILER_DEFINE(genericProf)
+	PROFILER_DEFINE(optProf)
+	in = winpr_aligned_calloc(1, size, 16);
+	out_c = winpr_aligned_calloc(1, size, 16);
+	out_sse = winpr_aligned_calloc(1, size, 16);
+
+	if (!in || !out_c || !out_sse)
+		goto fail;
+
+	winpr_RAND(in, size);
+
+	for (size_t x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
+	{
+		const UINT32 format = formats[x];
+		const UINT32 dstStride = width * FreeRDPGetBytesPerPixel(format);
+		const char* formatName = FreeRDPGetColorFormatName(format);
+		PROFILER_CREATE(genericProf, "YCoCgRToRGB_8u_AC4R-GENERIC")
+		PROFILER_CREATE(optProf, "YCoCgRToRGB_8u_AC4R-OPT")
+		PROFILER_ENTER(genericProf)
+		status = generic->YCoCgToRGB_8u_AC4R(in, srcStride, out_c, format, dstStride, width, height,
+		                                     2, TRUE);
+		PROFILER_EXIT(genericProf)
+
+		if (status != PRIMITIVES_SUCCESS)
+			goto loop_fail;
+
+		PROFILER_ENTER(optProf)
+		status = optimized->YCoCgToRGB_8u_AC4R(in, srcStride, out_sse, format, dstStride, width,
+		                                       height, 2, TRUE);
+		PROFILER_EXIT(optProf)
+
+		if (status != PRIMITIVES_SUCCESS)
+			goto loop_fail;
+
+		if (memcmp(out_c, out_sse, dstStride * height) != 0)
+		{
+			for (size_t i = 0; i < 1ull * width * height; ++i)
+			{
+				const UINT32 c = FreeRDPReadColor(out_c + 4 * i, format);
+				const UINT32 sse = FreeRDPReadColor(out_sse + 4 * i, format);
+
+				if (c != sse)
+				{
+					printf("optimized->YCoCgRToRGB FAIL[%s] [%" PRIuz "]: 0x%08" PRIx32
+					       " -> C 0x%08" PRIx32 " vs optimized 0x%08" PRIx32 "\n",
+					       formatName, i, in[i + 1], c, sse);
+					status = -1;
+				}
+			}
+		}
+
+		printf("--------------------------- [%s] [%" PRIu32 "x%" PRIu32
+		       "] ---------------------------\n",
+		       formatName, width, height);
+		PROFILER_PRINT_HEADER
+		PROFILER_PRINT(genericProf)
+		PROFILER_PRINT(optProf)
+		PROFILER_PRINT_FOOTER
+	loop_fail:
+		PROFILER_FREE(genericProf)
+		PROFILER_FREE(optProf)
+
+		if (status != PRIMITIVES_SUCCESS)
+			goto fail;
+	}
+
+fail:
+	winpr_aligned_free(in);
+	winpr_aligned_free(out_c);
+	winpr_aligned_free(out_sse);
+	return status == PRIMITIVES_SUCCESS;
+}
+
+int TestPrimitivesYCoCg(int argc, char* argv[])
+{
+	WINPR_UNUSED(argc);
+	WINPR_UNUSED(argv);
+	prim_test_setup(FALSE);
+
+	/* Random resolution tests */
+	if (argc < 2)
+	{
+		for (UINT32 x = 0; x < 10; x++)
+		{
+			UINT32 w = 0;
+			UINT32 h = 0;
+
+			do
+			{
+				winpr_RAND(&w, sizeof(w));
+				w %= 2048 / 4;
+			} while (w < 16);
+
+			do
+			{
+				winpr_RAND(&h, sizeof(h));
+				h %= 2048 / 4;
+			} while (h < 16);
+
+			if (!test_YCoCgRToRGB_8u_AC4R_func(w, h))
+				return 1;
+		}
+	}
+
+	/* Test once with full HD/4 */
+	if (!test_YCoCgRToRGB_8u_AC4R_func(1920 / 4, 1080 / 4))
+		return 1;
+
+	return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesYUV.c b/libfreerdp/primitives/test/TestPrimitivesYUV.c
new file mode 100644
index 0000000..f679c07
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesYUV.c
@@ -0,0 +1,979 @@
+
+#include <freerdp/config.h>
+
+#include <math.h>
+
+#include "prim_test.h"
+
+#include <winpr/wlog.h>
+#include <winpr/crypto.h>
+#include <freerdp/primitives.h>
+#include <freerdp/utils/profiler.h>
+
+#define TAG __FILE__
+
+#define PADDING_FILL_VALUE 0x37
+
+/* YUV to RGB conversion is lossy, so consider two values equal
+ * if they differ by at most 4. */
+static BOOL similar(const BYTE* src, const BYTE* dst, size_t size)
+{
+	for (size_t x = 0; x < size; x++)
+	{
+		int diff = src[x] - dst[x];
+
+		if (abs(diff) > 4)
+		{
+			fprintf(stderr, "%" PRIuz " %02" PRIX8 " : %02" PRIX8 " diff=%d\n", x, src[x], dst[x],
+			        abs(diff));
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+static BOOL similarRGB(const BYTE* src, const BYTE* dst, size_t size, UINT32 format, BOOL use444)
+{
+	const UINT32 bpp = FreeRDPGetBytesPerPixel(format);
+	BYTE fill = PADDING_FILL_VALUE;
+	if (!FreeRDPColorHasAlpha(format))
+		fill = 0xFF;
+
+	for (size_t x = 0; x < size; x++)
+	{
+		const LONG maxDiff = 4;
+		UINT32 sColor = 0;
+		UINT32 dColor = 0;
+		BYTE sR = 0;
+		BYTE sG = 0;
+		BYTE sB = 0;
+		BYTE sA = 0;
+		BYTE dR = 0;
+		BYTE dG = 0;
+		BYTE dB = 0;
+		BYTE dA = 0;
+		sColor = FreeRDPReadColor(src, format);
+		dColor = FreeRDPReadColor(dst, format);
+		src += bpp;
+		dst += bpp;
+		FreeRDPSplitColor(sColor, format, &sR, &sG, &sB, &sA, NULL);
+		FreeRDPSplitColor(dColor, format, &dR, &dG, &dB, &dA, NULL);
+
+		if ((labs(sR - dR) > maxDiff) || (labs(sG - dG) > maxDiff) || (labs(sB - dB) > maxDiff))
+		{
+			fprintf(
+			    stderr,
+			    "Color value mismatch R[%02X %02X], G[%02X %02X], B[%02X %02X] at position %" PRIuz
+			    "\n",
+			    sR, dR, sG, dG, sB, dB, x);
+			return FALSE;
+		}
+
+		if (dA != fill)
+		{
+			fprintf(
+			    stderr,
+			    "[%s] Invalid destination alpha value 0x%02X [expected 0x%02X] at position %" PRIuz
+			    "\n",
+			    use444 ? "AVC444" : "AVC420", dA, fill, x);
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+static void get_size(BOOL large, UINT32* width, UINT32* height)
+{
+	UINT32 shift = large ? 8 : 1;
+	winpr_RAND(width, sizeof(*width));
+	winpr_RAND(height, sizeof(*height));
+	// TODO: Algorithm only works on even resolutions...
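+	// The shift below is always >= 1, so both dimensions come out even:
+	// (value % 64 + 1) lies in 1..64, giving 2..128 for small frames
+	// and 256..16384 for large ones.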
+ *width = (*width % 64 + 1) << shift; + *height = (*height % 64 + 1) << shift; +} + +static BOOL check_padding(const BYTE* psrc, size_t size, size_t padding, const char* buffer) +{ + BOOL rc = TRUE; + const BYTE* src = NULL; + const BYTE* esrc = NULL; + size_t halfPad = (padding + 1) / 2; + + if (!psrc) + return FALSE; + + src = psrc - halfPad; + esrc = src + size + halfPad; + + for (size_t x = 0; x < halfPad; x++) + { + const BYTE s = *src++; + const BYTE d = *esrc++; + + if (s != 'A') + { + size_t start = x; + + while ((x < halfPad) && (*esrc++ != 'A')) + x++; + + fprintf(stderr, + "Buffer underflow detected %02" PRIx8 " != %02X %s [%" PRIuz "-%" PRIuz "]\n", + d, 'A', buffer, start, x); + return FALSE; + } + + if (d != 'A') + { + size_t start = x; + + while ((x < halfPad) && (*esrc++ != 'A')) + x++; + + fprintf(stderr, + "Buffer overflow detected %02" PRIx8 " != %02X %s [%" PRIuz "-%" PRIuz "]\n", d, + 'A', buffer, start, x); + return FALSE; + } + } + + return rc; +} + +static void* set_padding(size_t size, size_t padding) +{ + size_t halfPad = (padding + 1) / 2; + BYTE* psrc = NULL; + BYTE* src = winpr_aligned_malloc(size + 2 * halfPad, 16); + + if (!src) + return NULL; + + memset(&src[0], 'A', halfPad); + memset(&src[halfPad], PADDING_FILL_VALUE, size); + memset(&src[halfPad + size], 'A', halfPad); + psrc = &src[halfPad]; + + if (!check_padding(psrc, size, padding, "init")) + { + winpr_aligned_free(src); + return NULL; + } + + return psrc; +} + +static void free_padding(void* src, size_t padding) +{ + BYTE* ptr = NULL; + + if (!src) + return; + + ptr = ((BYTE*)src) - (padding + 1) / 2; + winpr_aligned_free(ptr); +} + +/* Create 2 pseudo YUV420 frames of same size. + * Combine them and check, if the data is at the expected position. */ +static BOOL TestPrimitiveYUVCombine(primitives_t* prims, prim_size_t roi) +{ + union + { + const BYTE** cpv; + BYTE** pv; + } cnv; + UINT32 awidth = 0; + UINT32 aheight = 0; + BOOL rc = FALSE; + BYTE* luma[3] = { 0 }; + BYTE* chroma[3] = { 0 }; + BYTE* yuv[3] = { 0 }; + BYTE* pmain[3] = { 0 }; + BYTE* paux[3] = { 0 }; + UINT32 lumaStride[3]; + UINT32 chromaStride[3]; + UINT32 yuvStride[3]; + const size_t padding = 10000; + RECTANGLE_16 rect; + PROFILER_DEFINE(yuvCombine) + PROFILER_DEFINE(yuvSplit) + awidth = roi.width + 16 - roi.width % 16; + aheight = roi.height + 16 - roi.height % 16; + fprintf(stderr, + "Running YUVCombine on frame size %" PRIu32 "x%" PRIu32 " [%" PRIu32 "x%" PRIu32 "]\n", + roi.width, roi.height, awidth, aheight); + PROFILER_CREATE(yuvCombine, "YUV420CombineToYUV444") + PROFILER_CREATE(yuvSplit, "YUV444SplitToYUV420") + rect.left = 0; + rect.top = 0; + rect.right = roi.width; + rect.bottom = roi.height; + + if (!prims || !prims->YUV420CombineToYUV444) + goto fail; + + for (UINT32 x = 0; x < 3; x++) + { + size_t halfStride = ((x > 0) ? awidth / 2 : awidth); + size_t size = aheight * awidth; + size_t halfSize = ((x > 0) ? 
halfStride * aheight / 2 : awidth * aheight); + yuvStride[x] = awidth; + + if (!(yuv[x] = set_padding(size, padding))) + goto fail; + + lumaStride[x] = halfStride; + + if (!(luma[x] = set_padding(halfSize, padding))) + goto fail; + + if (!(pmain[x] = set_padding(halfSize, padding))) + goto fail; + + chromaStride[x] = halfStride; + + if (!(chroma[x] = set_padding(halfSize, padding))) + goto fail; + + if (!(paux[x] = set_padding(halfSize, padding))) + goto fail; + + memset(luma[x], 0xAB + 3 * x, halfSize); + memset(chroma[x], 0x80 + 2 * x, halfSize); + + if (!check_padding(luma[x], halfSize, padding, "luma")) + goto fail; + + if (!check_padding(chroma[x], halfSize, padding, "chroma")) + goto fail; + + if (!check_padding(pmain[x], halfSize, padding, "main")) + goto fail; + + if (!check_padding(paux[x], halfSize, padding, "aux")) + goto fail; + + if (!check_padding(yuv[x], size, padding, "yuv")) + goto fail; + } + + PROFILER_ENTER(yuvCombine) + + cnv.pv = luma; + if (prims->YUV420CombineToYUV444(AVC444_LUMA, cnv.cpv, lumaStride, roi.width, roi.height, yuv, + yuvStride, &rect) != PRIMITIVES_SUCCESS) + { + PROFILER_EXIT(yuvCombine) + goto fail; + } + + cnv.pv = chroma; + if (prims->YUV420CombineToYUV444(AVC444_CHROMAv1, cnv.cpv, chromaStride, roi.width, roi.height, + yuv, yuvStride, &rect) != PRIMITIVES_SUCCESS) + { + PROFILER_EXIT(yuvCombine) + goto fail; + } + + PROFILER_EXIT(yuvCombine) + + for (UINT32 x = 0; x < 3; x++) + { + size_t halfStride = ((x > 0) ? awidth / 2 : awidth); + size_t size = aheight * awidth; + size_t halfSize = ((x > 0) ? halfStride * aheight / 2 : awidth * aheight); + + if (!check_padding(luma[x], halfSize, padding, "luma")) + goto fail; + + if (!check_padding(chroma[x], halfSize, padding, "chroma")) + goto fail; + + if (!check_padding(yuv[x], size, padding, "yuv")) + goto fail; + } + + PROFILER_ENTER(yuvSplit) + + cnv.pv = yuv; + if (prims->YUV444SplitToYUV420(cnv.cpv, yuvStride, pmain, lumaStride, paux, chromaStride, + &roi) != PRIMITIVES_SUCCESS) + { + PROFILER_EXIT(yuvSplit) + goto fail; + } + + PROFILER_EXIT(yuvSplit) + + for (UINT32 x = 0; x < 3; x++) + { + size_t halfStride = ((x > 0) ? awidth / 2 : awidth); + size_t size = aheight * awidth; + size_t halfSize = ((x > 0) ? halfStride * aheight / 2 : awidth * aheight); + + if (!check_padding(pmain[x], halfSize, padding, "main")) + goto fail; + + if (!check_padding(paux[x], halfSize, padding, "aux")) + goto fail; + + if (!check_padding(yuv[x], size, padding, "yuv")) + goto fail; + } + + for (UINT32 i = 0; i < 3; i++) + { + for (UINT32 y = 0; y < roi.height; y++) + { + UINT32 w = roi.width; + UINT32 lstride = lumaStride[i]; + UINT32 cstride = chromaStride[i]; + + if (i > 0) + { + w = (roi.width + 3) / 4; + + if (roi.height > (roi.height + 1) / 2) + continue; + } + + if (!similar(luma[i] + y * lstride, pmain[i] + y * lstride, w)) + goto fail; + + /* Need to ignore lines of destination Y plane, + * if the lines are not a multiple of 16 + * as the UV planes are packed in 8 line stripes. */ + if (i == 0) + { + /* TODO: This check is not perfect, it does not + * include the last V lines packed to the Y + * frame. 
+static BOOL TestPrimitiveYUV(primitives_t* prims, prim_size_t roi, BOOL use444)
+{
+    union
+    {
+        const BYTE** cpv;
+        BYTE** pv;
+    } cnv;
+    BOOL res = FALSE;
+    UINT32 awidth = 0;
+    UINT32 aheight = 0;
+    BYTE* yuv[3] = { 0 };
+    UINT32 yuv_step[3];
+    BYTE* rgb = NULL;
+    BYTE* rgb_dst = NULL;
+    size_t size = 0;
+    size_t uvsize = 0;
+    size_t uvwidth = 0;
+    size_t padding = 100 * 16;
+    UINT32 stride = 0;
+    const UINT32 formats[] = { PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_ARGB32,
+                               PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32,
+                               PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
+    PROFILER_DEFINE(rgbToYUV420)
+    PROFILER_DEFINE(rgbToYUV444)
+    PROFILER_DEFINE(yuv420ToRGB)
+    PROFILER_DEFINE(yuv444ToRGB)
+    /* Buffers need to be 16x16 aligned. */
+    awidth = roi.width + 16 - roi.width % 16;
+    aheight = roi.height + 16 - roi.height % 16;
+    stride = awidth * sizeof(UINT32);
+    size = awidth * aheight;
+
+    if (use444)
+    {
+        uvwidth = awidth;
+        uvsize = size;
+
+        if (!prims || !prims->RGBToYUV444_8u_P3AC4R || !prims->YUV444ToRGB_8u_P3AC4R)
+            return FALSE;
+    }
+    else
+    {
+        uvwidth = (awidth + 1) / 2;
+        uvsize = (aheight + 1) / 2 * uvwidth;
+
+        if (!prims || !prims->RGBToYUV420_8u_P3AC4R || !prims->YUV420ToRGB_8u_P3AC4R)
+            return FALSE;
+    }
+
+    fprintf(stderr, "Running AVC%s on frame size %" PRIu32 "x%" PRIu32 "\n",
+            use444 ? "444" : "420", roi.width, roi.height);
+
"444" : "420", + roi.width, roi.height); + + /* Test RGB to YUV444 conversion and vice versa */ + if (!(rgb = set_padding(size * sizeof(UINT32), padding))) + goto fail; + + if (!(rgb_dst = set_padding(size * sizeof(UINT32), padding))) + goto fail; + + if (!(yuv[0] = set_padding(size, padding))) + goto fail; + + if (!(yuv[1] = set_padding(uvsize, padding))) + goto fail; + + if (!(yuv[2] = set_padding(uvsize, padding))) + goto fail; + + for (UINT32 y = 0; y < roi.height; y++) + { + BYTE* line = &rgb[y * stride]; + + for (UINT32 x = 0; x < roi.width; x++) + { + line[x * 4 + 0] = 0x81; + line[x * 4 + 1] = 0x33; + line[x * 4 + 2] = 0xAB; + line[x * 4 + 3] = 0xFF; + } + } + + yuv_step[0] = awidth; + yuv_step[1] = uvwidth; + yuv_step[2] = uvwidth; + + for (UINT32 x = 0; x < ARRAYSIZE(formats); x++) + { + pstatus_t rc = 0; + const UINT32 DstFormat = formats[x]; + printf("Testing destination color format %s\n", FreeRDPGetColorFormatName(DstFormat)); + memset(rgb_dst, PADDING_FILL_VALUE, size * sizeof(UINT32)); + + PROFILER_CREATE(rgbToYUV420, "RGBToYUV420") + PROFILER_CREATE(rgbToYUV444, "RGBToYUV444") + PROFILER_CREATE(yuv420ToRGB, "YUV420ToRGB") + PROFILER_CREATE(yuv444ToRGB, "YUV444ToRGB") + + if (use444) + { + PROFILER_ENTER(rgbToYUV444) + rc = prims->RGBToYUV444_8u_P3AC4R(rgb, DstFormat, stride, yuv, yuv_step, &roi); + PROFILER_EXIT(rgbToYUV444) + + if (rc != PRIMITIVES_SUCCESS) + goto loop_fail; + + PROFILER_PRINT_HEADER + PROFILER_PRINT(rgbToYUV444) + PROFILER_PRINT_FOOTER + } + else + { + PROFILER_ENTER(rgbToYUV420) + rc = prims->RGBToYUV420_8u_P3AC4R(rgb, DstFormat, stride, yuv, yuv_step, &roi); + PROFILER_EXIT(rgbToYUV420) + + if (rc != PRIMITIVES_SUCCESS) + goto loop_fail; + + PROFILER_PRINT_HEADER + PROFILER_PRINT(rgbToYUV420) + PROFILER_PRINT_FOOTER + } + + if (!check_padding(rgb, size * sizeof(UINT32), padding, "rgb")) + { + rc = -1; + goto loop_fail; + } + + if ((!check_padding(yuv[0], size, padding, "Y")) || + (!check_padding(yuv[1], uvsize, padding, "U")) || + (!check_padding(yuv[2], uvsize, padding, "V"))) + { + rc = -1; + goto loop_fail; + } + + cnv.pv = yuv; + if (use444) + { + PROFILER_ENTER(yuv444ToRGB) + rc = prims->YUV444ToRGB_8u_P3AC4R(cnv.cpv, yuv_step, rgb_dst, stride, DstFormat, &roi); + PROFILER_EXIT(yuv444ToRGB) + + if (rc != PRIMITIVES_SUCCESS) + goto loop_fail; + + loop_fail: + PROFILER_EXIT(yuv444ToRGB) + PROFILER_PRINT_HEADER + PROFILER_PRINT(yuv444ToRGB) + PROFILER_PRINT_FOOTER + + if (rc != PRIMITIVES_SUCCESS) + goto fail; + } + else + { + PROFILER_ENTER(yuv420ToRGB) + + if (prims->YUV420ToRGB_8u_P3AC4R(cnv.cpv, yuv_step, rgb_dst, stride, DstFormat, &roi) != + PRIMITIVES_SUCCESS) + { + PROFILER_EXIT(yuv420ToRGB) + goto fail; + } + + PROFILER_EXIT(yuv420ToRGB) + PROFILER_PRINT_HEADER + PROFILER_PRINT(yuv420ToRGB) + PROFILER_PRINT_FOOTER + } + + if (!check_padding(rgb_dst, size * sizeof(UINT32), padding, "rgb dst")) + goto fail; + + if ((!check_padding(yuv[0], size, padding, "Y")) || + (!check_padding(yuv[1], uvsize, padding, "U")) || + (!check_padding(yuv[2], uvsize, padding, "V"))) + goto fail; + + for (UINT32 y = 0; y < roi.height; y++) + { + BYTE* srgb = &rgb[y * stride]; + BYTE* drgb = &rgb_dst[y * stride]; + + if (!similarRGB(srgb, drgb, roi.width, DstFormat, use444)) + goto fail; + } + + PROFILER_FREE(rgbToYUV420) + PROFILER_FREE(rgbToYUV444) + PROFILER_FREE(yuv420ToRGB) + PROFILER_FREE(yuv444ToRGB) + } + + res = TRUE; +fail: + free_padding(rgb, padding); + free_padding(rgb_dst, padding); + free_padding(yuv[0], padding); + free_padding(yuv[1], padding); + 
+static BOOL allocate_yuv420(BYTE** planes, UINT32 width, UINT32 height, UINT32 padding)
+{
+    const size_t size = width * height;
+    const size_t uvwidth = (width + 1) / 2;
+    const size_t uvsize = (height + 1) / 2 * uvwidth;
+
+    if (!(planes[0] = set_padding(size, padding)))
+        goto fail;
+
+    if (!(planes[1] = set_padding(uvsize, padding)))
+        goto fail;
+
+    if (!(planes[2] = set_padding(uvsize, padding)))
+        goto fail;
+
+    return TRUE;
+fail:
+    free_padding(planes[0], padding);
+    free_padding(planes[1], padding);
+    free_padding(planes[2], padding);
+    return FALSE;
+}
+
+static void free_yuv420(BYTE** planes, UINT32 padding)
+{
+    if (!planes)
+        return;
+
+    free_padding(planes[0], padding);
+    free_padding(planes[1], padding);
+    free_padding(planes[2], padding);
+    planes[0] = NULL;
+    planes[1] = NULL;
+    planes[2] = NULL;
+}
+
+static BOOL check_yuv420(BYTE** planes, UINT32 width, UINT32 height, UINT32 padding)
+{
+    const size_t size = width * height;
+    const size_t uvwidth = (width + 1) / 2;
+    const size_t uvsize = (height + 1) / 2 * uvwidth;
+    const BOOL yOk = check_padding(planes[0], size, padding, "Y");
+    const BOOL uOk = check_padding(planes[1], uvsize, padding, "U");
+    const BOOL vOk = check_padding(planes[2], uvsize, padding, "V");
+    return (yOk && uOk && vOk);
+}
+
+static BOOL check_for_mismatches(const BYTE* planeA, const BYTE* planeB, UINT32 size)
+{
+    BOOL rc = FALSE;
+
+    for (UINT32 x = 0; x < size; x++)
+    {
+        const BYTE a = planeA[x];
+        const BYTE b = planeB[x];
+
+        if (fabsf((float)a - (float)b) > 2.0f)
+        {
+            rc = TRUE;
+            fprintf(stderr, "[%08x] %02x != %02x\n", x, a, b);
+        }
+    }
+
+    return rc;
+}
+
+static BOOL compare_yuv420(BYTE** planesA, BYTE** planesB, UINT32 width, UINT32 height,
+                           UINT32 padding)
+{
+    BOOL rc = TRUE;
+    const size_t size = width * height;
+    const size_t uvwidth = (width + 1) / 2;
+    const size_t uvsize = (height + 1) / 2 * uvwidth;
+
+    if (check_for_mismatches(planesA[0], planesB[0], size))
+    {
+        fprintf(stderr, "Mismatch in Y planes!\n");
+        rc = FALSE;
+    }
+
+    if (check_for_mismatches(planesA[1], planesB[1], uvsize))
+    {
+        fprintf(stderr, "Mismatch in U planes!\n");
+        rc = FALSE;
+    }
+
+    if (check_for_mismatches(planesA[2], planesB[2], uvsize))
+    {
+        fprintf(stderr, "Mismatch in V planes!\n");
+        rc = FALSE;
+    }
+
+    return rc;
+}
+
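+/* The +/-2 tolerance in check_for_mismatches() above absorbs rounding
+ * differences between the generic C path and the SIMD paths: the two may
+ * round intermediate fixed-point results differently, so e.g. a sample of
+ * 0x80 from the C code and 0x81 from the SSE code still counts as a match. */
+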
+static BOOL TestPrimitiveRgbToLumaChroma(primitives_t* prims, prim_size_t roi, UINT32 version)
+{
+    BOOL res = FALSE;
+    UINT32 awidth = 0;
+    UINT32 aheight = 0;
+    BYTE* luma[3] = { 0 };
+    BYTE* chroma[3] = { 0 };
+    BYTE* lumaGeneric[3] = { 0 };
+    BYTE* chromaGeneric[3] = { 0 };
+    UINT32 yuv_step[3];
+    BYTE* rgb = NULL;
+    size_t size = 0;
+    size_t uvwidth = 0;
+    const size_t padding = 0x1000;
+    UINT32 stride = 0;
+    __RGBToAVC444YUV_t fkt = NULL;
+    __RGBToAVC444YUV_t gen = NULL;
+    const UINT32 formats[] = { PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_ARGB32,
+                               PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32,
+                               PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
+    PROFILER_DEFINE(rgbToYUV444)
+    PROFILER_DEFINE(rgbToYUV444opt)
+    /* Buffers need to be 16x16 aligned. */
+    awidth = roi.width;
+
+    if (awidth % 16 != 0)
+        awidth += 16 - roi.width % 16;
+
+    aheight = roi.height;
+
+    if (aheight % 16 != 0)
+        aheight += 16 - roi.height % 16;
+
+    stride = awidth * sizeof(UINT32);
+    size = awidth * aheight;
+    uvwidth = (awidth + 1) / 2;
+
+    if (!prims || !generic)
+        return FALSE;
+
+    switch (version)
+    {
+        case 1:
+            fkt = prims->RGBToAVC444YUV;
+            gen = generic->RGBToAVC444YUV;
+            break;
+
+        case 2:
+            fkt = prims->RGBToAVC444YUVv2;
+            gen = generic->RGBToAVC444YUVv2;
+            break;
+
+        default:
+            return FALSE;
+    }
+
+    if (!fkt || !gen)
+        return FALSE;
+
+    fprintf(stderr, "Running AVC444 on frame size %" PRIu32 "x%" PRIu32 "\n", roi.width,
+            roi.height);
+
+    /* Test RGB to YUV444 conversion and vice versa */
+    if (!(rgb = set_padding(size * sizeof(UINT32), padding)))
+        goto fail;
+
+    if (!allocate_yuv420(luma, awidth, aheight, padding))
+        goto fail;
+
+    if (!allocate_yuv420(chroma, awidth, aheight, padding))
+        goto fail;
+
+    if (!allocate_yuv420(lumaGeneric, awidth, aheight, padding))
+        goto fail;
+
+    if (!allocate_yuv420(chromaGeneric, awidth, aheight, padding))
+        goto fail;
+
+    for (UINT32 y = 0; y < roi.height; y++)
+    {
+        BYTE* line = &rgb[y * stride];
+
+        for (UINT32 x = 0; x < roi.width; x++)
+        {
+#if 1
+            /* Random input exercises more value combinations; the #else branch
+             * provides a deterministic pattern for debugging. */
+            line[x * 4 + 0] = rand();
+            line[x * 4 + 1] = rand();
+            line[x * 4 + 2] = rand();
+            line[x * 4 + 3] = rand();
+#else
+            line[x * 4 + 0] = (y * roi.width + x) * 16 + 5;
+            line[x * 4 + 1] = (y * roi.width + x) * 16 + 7;
+            line[x * 4 + 2] = (y * roi.width + x) * 16 + 11;
+            line[x * 4 + 3] = (y * roi.width + x) * 16 + 0;
+#endif
+        }
+    }
+
+    yuv_step[0] = awidth;
+    yuv_step[1] = uvwidth;
+    yuv_step[2] = uvwidth;
+
+    for (UINT32 x = 0; x < ARRAYSIZE(formats); x++)
+    {
+        pstatus_t rc = -1;
+        const UINT32 DstFormat = formats[x];
+        printf("Testing destination color format %s\n", FreeRDPGetColorFormatName(DstFormat));
+        PROFILER_CREATE(rgbToYUV444, "RGBToYUV444-generic")
+        PROFILER_CREATE(rgbToYUV444opt, "RGBToYUV444-optimized")
+
+        for (UINT32 cnt = 0; cnt < 10; cnt++)
+        {
+            PROFILER_ENTER(rgbToYUV444opt)
+            rc = fkt(rgb, DstFormat, stride, luma, yuv_step, chroma, yuv_step, &roi);
+            PROFILER_EXIT(rgbToYUV444opt)
+
+            if (rc != PRIMITIVES_SUCCESS)
+                goto loop_fail;
+        }
+
+        PROFILER_PRINT_HEADER
+        PROFILER_PRINT(rgbToYUV444opt)
+        PROFILER_PRINT_FOOTER
+
+        if (!check_padding(rgb, size * sizeof(UINT32), padding, "rgb"))
+        {
+            rc = -1;
+            goto loop_fail;
+        }
+
+        if (!check_yuv420(luma, awidth, aheight, padding) ||
+            !check_yuv420(chroma, awidth, aheight, padding))
+        {
+            rc = -1;
+            goto loop_fail;
+        }
+
+        for (UINT32 cnt = 0; cnt < 10; cnt++)
+        {
+            PROFILER_ENTER(rgbToYUV444)
+            rc = gen(rgb, DstFormat, stride, lumaGeneric, yuv_step, chromaGeneric, yuv_step, &roi);
+            PROFILER_EXIT(rgbToYUV444)
+
+            if (rc != PRIMITIVES_SUCCESS)
+                goto loop_fail;
+        }
+
+        PROFILER_PRINT_HEADER
+        PROFILER_PRINT(rgbToYUV444)
+        PROFILER_PRINT_FOOTER
+
+        if (!check_padding(rgb, size * sizeof(UINT32), padding, "rgb"))
+        {
+            rc = -1;
+            goto loop_fail;
+        }
+
+        if (!check_yuv420(lumaGeneric, awidth, aheight, padding) ||
+            !check_yuv420(chromaGeneric, awidth, aheight, padding))
+        {
+            rc = -1;
+            goto loop_fail;
+        }
+
+        if (!compare_yuv420(luma, lumaGeneric, awidth, aheight, padding) ||
+            !compare_yuv420(chroma, chromaGeneric, awidth, aheight, padding))
+        {
+            rc = -1;
+            goto loop_fail;
+        }
+
+    loop_fail:
+        PROFILER_FREE(rgbToYUV444)
+        PROFILER_FREE(rgbToYUV444opt)
+
+        if (rc != PRIMITIVES_SUCCESS)
+            goto fail;
+    }
+
+    res = TRUE;
+fail:
+    free_padding(rgb, padding);
+    free_yuv420(luma, padding);
+    free_yuv420(chroma, padding);
+    free_yuv420(lumaGeneric, padding);
+    free_yuv420(chromaGeneric, padding);
+    return res;
+}
+
+int TestPrimitivesYUV(int argc, char* argv[])
+{
+    BOOL large = (argc > 1);
+    int rc = -1;
+    prim_test_setup(FALSE);
+    primitives_t* prims = primitives_get();
+
+    for (UINT32 x = 0; x < 5; x++)
+    {
+        prim_size_t roi;
+
+        if (argc > 1)
+        {
+            int crc = sscanf(argv[1], "%" SCNu32 "x%" SCNu32, &roi.width, &roi.height);
+
+            if (crc != 2)
+            {
+                roi.width = 1920;
+                roi.height = 1080;
+            }
+        }
+        else
+            get_size(large, &roi.width, &roi.height);
+
+        printf("-------------------- GENERIC ------------------------\n");
+
+        if (!TestPrimitiveYUV(generic, roi, TRUE))
+        {
+            printf("TestPrimitiveYUV (444) failed.\n");
+            goto end;
+        }
+
+        printf("---------------------- END --------------------------\n");
+        printf("------------------- OPTIMIZED -----------------------\n");
+
+        if (!TestPrimitiveYUV(prims, roi, TRUE))
+        {
+            printf("TestPrimitiveYUV (444) failed.\n");
+            goto end;
+        }
+
+        printf("---------------------- END --------------------------\n");
+        printf("-------------------- GENERIC ------------------------\n");
+
+        if (!TestPrimitiveYUV(generic, roi, FALSE))
+        {
+            printf("TestPrimitiveYUV (420) failed.\n");
+            goto end;
+        }
+
+        printf("---------------------- END --------------------------\n");
+        printf("------------------- OPTIMIZED -----------------------\n");
+
+        if (!TestPrimitiveYUV(prims, roi, FALSE))
+        {
+            printf("TestPrimitiveYUV (420) failed.\n");
+            goto end;
+        }
+
+        printf("---------------------- END --------------------------\n");
+        printf("-------------------- GENERIC ------------------------\n");
+
+        if (!TestPrimitiveYUVCombine(generic, roi))
+        {
+            printf("TestPrimitiveYUVCombine failed.\n");
+            goto end;
+        }
+
+        printf("---------------------- END --------------------------\n");
+        printf("------------------- OPTIMIZED -----------------------\n");
+
+        if (!TestPrimitiveYUVCombine(prims, roi))
+        {
+            printf("TestPrimitiveYUVCombine failed.\n");
+            goto end;
+        }
+
+        printf("---------------------- END --------------------------\n");
+        printf("------------------- OPTIMIZED -----------------------\n");
+
+        if (!TestPrimitiveRgbToLumaChroma(prims, roi, 1))
+        {
+            printf("TestPrimitiveRgbToLumaChroma (v1) failed.\n");
+            goto end;
+        }
+
+        printf("---------------------- END --------------------------\n");
+        printf("------------------- OPTIMIZED -----------------------\n");
+
+        if (!TestPrimitiveRgbToLumaChroma(prims, roi, 2))
+        {
+            printf("TestPrimitiveRgbToLumaChroma (v2) failed.\n");
+            goto end;
+        }
+
+        printf("---------------------- END --------------------------\n");
+    }
+
+    rc = 0;
+end:
+    return rc;
+}
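+/* Usage note: the test accepts an optional WxH geometry argument, e.g.
+ * "TestPrimitivesYUV 1920x1080"; malformed arguments fall back to 1920x1080,
+ * and without an argument each iteration uses a size chosen by get_size(). */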
diff --git a/libfreerdp/primitives/test/measure.h b/libfreerdp/primitives/test/measure.h
new file mode 100644
index 0000000..ee04abd
--- /dev/null
+++ b/libfreerdp/primitives/test/measure.h
@@ -0,0 +1,145 @@
+/* measure.h
+ * Macros to help with performance measurement.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License. Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ * MEASURE_LOOP_START("measurement", 2000)
+ *     code to be measured
+ * MEASURE_LOOP_STOP
+ *     buffer flush and such
+ * MEASURE_SHOW_RESULTS
+ *
+ * Define GOOGLE_PROFILER if you want gperftools included.
+ */
+
+#ifndef TEST_MEASURE_H_INCLUDED
+#define TEST_MEASURE_H_INCLUDED
+
+#include <freerdp/config.h>
+
+#include <time.h>
+#include <winpr/string.h>
+
+#ifndef _WIN32
+#include <sys/param.h>
+#endif
+
+#include <winpr/crt.h>
+
+#ifdef _WIN32
+
+#define PROFILER_START(_prefix_)
+#define PROFILER_STOP
+
+#define MEASURE_LOOP_START(_prefix_, _count_)
+#define MEASURE_LOOP_STOP
+#define MEASURE_GET_RESULTS(_result_)
+#define MEASURE_SHOW_RESULTS(_result_)
+#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_)
+#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_)
+
+#else
+
+#ifdef GOOGLE_PROFILER
+#include <gperftools/profiler.h>
+/* do/while without a trailing semicolon, so the macros behave like single
+ * statements at the call site (e.g. inside if/else). */
+#define PROFILER_START(_prefix_) \
+    do \
+    { \
+        char _path[PATH_MAX]; \
+        sprintf_s(_path, sizeof(_path), "./%s.prof", (_prefix_)); \
+        ProfilerStart(_path); \
+    } while (0)
+#define PROFILER_STOP \
+    do \
+    { \
+        ProfilerStop(); \
+    } while (0)
+#else
+#define PROFILER_START(_prefix_)
+#define PROFILER_STOP
+#endif // GOOGLE_PROFILER
+
+extern float _delta_time(const struct timespec* t0, const struct timespec* t1);
+extern void _floatprint(float t, char* output);
+
+#ifndef CLOCK_MONOTONIC_RAW
+#define CLOCK_MONOTONIC_RAW 4
+#endif // !CLOCK_MONOTONIC_RAW
+
+#define MEASURE_LOOP_START(_prefix_, _count_) \
+    { \
+        struct timespec _start, _stop; \
+        char* _prefix; \
+        int _count = (_count_); \
+        int _loop; \
+        float _delta; \
+        char _str1[32], _str2[32]; \
+        _prefix = _strdup(_prefix_); \
+        _str1[0] = '\0'; \
+        _str2[0] = '\0'; \
+        clock_gettime(CLOCK_MONOTONIC_RAW, &_start); \
+        PROFILER_START(_prefix); \
+        _loop = (_count); \
+        do \
+        {
+
+#define MEASURE_LOOP_STOP \
+    } \
+    while (--_loop) \
+        ;
+
+#define MEASURE_GET_RESULTS(_result_) \
+    PROFILER_STOP; \
+    clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
+    _delta = _delta_time(&_start, &_stop); \
+    (_result_) = (float)_count / _delta; \
+    free(_prefix); \
+    }
+
+#define MEASURE_SHOW_RESULTS(_result_) \
+    PROFILER_STOP; \
+    clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
+    _delta = _delta_time(&_start, &_stop); \
+    (_result_) = (float)_count / _delta; \
+    _floatprint((float)_count / _delta, _str1); \
+    printf("%s: %9d iterations in %5.1f seconds = %s/s \n", _prefix, _count, _delta, _str1); \
+    free(_prefix); \
+    }
+
+#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_) \
+    PROFILER_STOP; \
+    clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
+    _delta = _delta_time(&_start, &_stop); \
+    _floatprint((float)_count / _delta, _str1); \
+    _floatprint((float)_count / _delta * (_scale_), _str2); \
+    printf("%s: %9d iterations in %5.1f seconds = %s/s = %s%s \n", _prefix, _count, _delta, _str1, \
+           _str2, _label_); \
+    free(_prefix); \
+    }
+
+#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_) \
+    { \
+        float _r; \
+        MEASURE_LOOP_START(_label_, _init_iter_); \
+        _call_; \
+        MEASURE_LOOP_STOP; \
+        MEASURE_GET_RESULTS(_r); \
+        MEASURE_LOOP_START(_label_, _r * _test_time_); \
+        _call_; \
+        MEASURE_LOOP_STOP; \
+        MEASURE_SHOW_RESULTS(_result_); \
+    }
+
+#endif
+
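+/* Usage sketch for MEASURE_TIMED (hypothetical operation "op"; the macro is
+ * a no-op on _WIN32): run a 1000-iteration calibration loop, then re-run for
+ * roughly 5 seconds and report the achieved rate:
+ *
+ *     float rate = 0.0f;
+ *     MEASURE_TIMED("op", 1000, 5.0f, rate, op(buffer, 4096));
+ *     printf("%f calls/s\n", rate);
+ */
+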
+#endif // TEST_MEASURE_H_INCLUDED
diff --git a/libfreerdp/primitives/test/prim_test.c b/libfreerdp/primitives/test/prim_test.c
new file mode 100644
index 0000000..ede8316
--- /dev/null
+++ b/libfreerdp/primitives/test/prim_test.c
@@ -0,0 +1,109 @@
+/* prim_test.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include "prim_test.h"
+
+#ifndef _WIN32
+#include <fcntl.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif
+
+#include <winpr/sysinfo.h>
+#include <winpr/platform.h>
+#include <winpr/crypto.h>
+
+primitives_t* generic = NULL;
+primitives_t* optimized = NULL;
+BOOL g_TestPrimitivesPerformance = FALSE;
+UINT32 g_Iterations = 1000;
+
+int test_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 };
+
+/* ------------------------------------------------------------------------- */
+
+#ifdef _WIN32
+float _delta_time(const struct timespec* t0, const struct timespec* t1)
+{
+    return 0.0f;
+}
+#else
+float _delta_time(const struct timespec* t0, const struct timespec* t1)
+{
+    INT64 secs = (INT64)(t1->tv_sec) - (INT64)(t0->tv_sec);
+    long nsecs = t1->tv_nsec - t0->tv_nsec;
+    double retval = NAN;
+
+    if (nsecs < 0)
+    {
+        --secs;
+        nsecs += 1000000000;
+    }
+
+    retval = (double)secs + (double)nsecs / (double)1000000000.0;
+    return (retval < 0.0) ? 0.0f : (float)retval;
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+void _floatprint(float t, char* output)
+{
+    /* I don't want to link against -lm, so avoid log,exp,... */
+    float f = 10.0f;
+    int i = 0;
+
+    while (t > f)
+        f *= 10.0f;
+
+    f /= 1000.0f;
+    i = ((int)(t / f + 0.5f)) * (int)f;
+
+    if (t < 0.0f)
+        sprintf(output, "%f", t);
+    else if (i == 0)
+        sprintf(output, "%d", (int)(t + 0.5f));
+    else if (t < 1e+3f)
+        sprintf(output, "%3d", i);
+    else if (t < 1e+6f)
+        sprintf(output, "%3d,%03d", i / 1000, i % 1000);
+    else if (t < 1e+9f)
+        sprintf(output, "%3d,%03d,000", i / 1000000, (i % 1000000) / 1000);
+    else if (t < 1e+12f)
+        sprintf(output, "%3d,%03d,000,000", i / 1000000000, (i % 1000000000) / 1000000);
+    else
+        sprintf(output, "%f", t);
+}
+
+void prim_test_setup(BOOL performance)
+{
+    generic = primitives_get_generic();
+    optimized = primitives_get();
+    g_TestPrimitivesPerformance = performance;
+}
+
+BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, pstatus_t (*fkt_generic)(),
+                pstatus_t (*optimised)(), ...)
+{
+    if (!name || !fkt_generic || !optimised || (iterations == 0))
+        return FALSE;
+
+    /* TODO: the timing harness is not implemented yet; this currently only
+     * validates its arguments. */
+    for (UINT32 i = 0; i < iterations; i++)
+    {
+    }
+
+    return TRUE;
+}
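+/* Intended usage sketch for the speed_test() harness above (hypothetical
+ * call; the measurement loop is still a stub):
+ *
+ *     prim_test_setup(TRUE);
+ *     speed_test("add16s", "generic vs optimized", g_Iterations,
+ *                (speed_test_fkt)generic->add_16s,
+ *                (speed_test_fkt)optimized->add_16s);
+ */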
You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + */ + +#ifndef FREERDP_LIB_PRIMTEST_H +#define FREERDP_LIB_PRIMTEST_H + +#include <winpr/crt.h> +#include <winpr/spec.h> +#include <winpr/wtypes.h> +#include <winpr/platform.h> +#include <winpr/crypto.h> + +#include <freerdp/primitives.h> + +#include "measure.h" + +#ifdef WITH_IPP +#include <ipps.h> +#include <ippi.h> +#endif + +#ifdef _WIN32 +#define ALIGN(x) x +#else +#define ALIGN(x) x DECLSPEC_ALIGN(MEMORY_ALLOCATION_ALIGNMENT) +#endif + +#define ABS(_x_) ((_x_) < 0 ? (-(_x_)) : (_x_)) +#define MAX_TEST_SIZE 4096 + +extern int test_sizes[]; +#define NUM_TEST_SIZES 10 + +extern BOOL g_TestPrimitivesPerformance; +extern UINT32 g_Iterations; + +extern primitives_t* generic; +extern primitives_t* optimized; + +void prim_test_setup(BOOL performance); + +typedef pstatus_t (*speed_test_fkt)(); + +BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, speed_test_fkt generic, + speed_test_fkt optimized, ...); + +#endif /* FREERDP_LIB_PRIMTEST_H */ |