1 files changed, 589 insertions, 0 deletions
diff --git a/libfreerdp/primitives/prim_YCoCg_opt.c b/libfreerdp/primitives/prim_YCoCg_opt.c
new file mode 100644
index 0000000..bba13fa
--- /dev/null
+++ b/libfreerdp/primitives/prim_YCoCg_opt.c
@@ -0,0 +1,589 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized YCoCg<->RGB conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#elif defined(WITH_NEON)
+#include <arm_neon.h>
+#endif /* WITH_SSE2 else WITH_NEON */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+static primitives_t* generic = NULL;
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
+                                                  BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
+                                                  UINT32 dstStep, UINT32 width, UINT32 height,
+                                                  UINT8 shift, BOOL withAlpha)
+{
+	const BYTE* sptr = pSrc;
+	BYTE* dptr = (BYTE*)pDst;
+	int sRowBump = srcStep - width * sizeof(UINT32);
+	int dRowBump = dstStep - width * sizeof(UINT32);
+	/* Shift left by "shift" and divide by two is the same as shift
+	 * left by "shift-1".
+	 */
+	int dataShift = shift - 1;
+	BYTE mask = (BYTE)(0xFFU << dataShift);
+
+	/* Let's say the data is of the form:
+	 * y0y0o0g0 a1y1o1g1 a2y2o2g2...
+	 * Apply:
+	 * |R|   | 1  1/2 -1/2 |   |y|
+	 * |G| = | 1  0    1/2 | * |o|
+	 * |B|   | 1 -1/2 -1/2 |   |g|
+	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
+	 */
+
+	if ((width < 8) || (ULONG_PTR)dptr & 0x03)
+	{
+		/* Too small, or we'll never hit a 16-byte boundary.  Punt. */
+		return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+		                                   shift, withAlpha);
+	}
+
+	for (UINT32 h = 0; h < height; h++)
+	{
+		UINT32 w = width;
+		BOOL onStride = 0;
+
+		/* Get to a 16-byte destination boundary. */
+		if ((ULONG_PTR)dptr & 0x0f)
+		{
+			pstatus_t status = 0;
+			UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
+
+			if (startup > width)
+				startup = width;
+
+			status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
+			                                     1, shift, withAlpha);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return status;
+
+			sptr += startup * sizeof(UINT32);
+			dptr += startup * sizeof(UINT32);
+			w -= startup;
+		}
+
+		/* Each loop handles eight pixels at a time. */
+		onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
+
+		while (w >= 8)
+		{
+			__m128i R0;
+			__m128i R1;
+			__m128i R2;
+			__m128i R3;
+			__m128i R4;
+			__m128i R5;
+			__m128i R6;
+			__m128i R7;
+
+			if (onStride)
+			{
+				/* The faster path, 16-byte aligned load. */
+				R0 = _mm_load_si128((const __m128i*)sptr);
+				sptr += (128 / 8);
+				R1 = _mm_load_si128((const __m128i*)sptr);
+				sptr += (128 / 8);
+			}
+			else
+			{
+				/* Off-stride, slower LDDQU load. */
+				R0 = _mm_lddqu_si128((const __m128i*)sptr);
+				sptr += (128 / 8);
+				R1 = _mm_lddqu_si128((const __m128i*)sptr);
+				sptr += (128 / 8);
+			}
+
+			/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
+			/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
+			/* Shuffle to pack all the like types together. */
+			R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
+			R3 = _mm_shuffle_epi8(R0, R2);
+			R4 = _mm_shuffle_epi8(R1, R2);
+			/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
+			/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
+			R5 = _mm_unpackhi_epi32(R3, R4);
+			R6 = _mm_unpacklo_epi32(R3, R4);
+
+			/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
+			/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+			/* Save alphas aside */
+			if (withAlpha)
+				R7 = _mm_unpackhi_epi64(R5, R5);
+			else
+				R7 = _mm_set1_epi32(0xFFFFFFFFU);
+
+			/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
+			/* Expand Y's from 8-bit unsigned to 16-bit signed. */
+			R1 = _mm_set1_epi32(0);
+			R0 = _mm_unpacklo_epi8(R5, R1);
+			/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
+			/* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
+			 * Note: this must be done before sign-conversion.
+			 * Note also there is no slli_epi8, so we have to use a 16-bit
+			 * version and then mask.
+			 */
+			R6 = _mm_slli_epi16(R6, dataShift);
+			R1 = _mm_set1_epi8(mask);
+			R6 = _mm_and_si128(R6, R1);
+			/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+			/* Expand Co's from 8-bit signed to 16-bit signed */
+			R1 = _mm_unpackhi_epi8(R6, R6);
+			R1 = _mm_srai_epi16(R1, 8);
+			/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
+			/* Expand Cg's form 8-bit signed to 16-bit signed */
+			R2 = _mm_unpacklo_epi8(R6, R6);
+			R2 = _mm_srai_epi16(R2, 8);
+			/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
+			/* Get Y - halfCg and save */
+			R6 = _mm_subs_epi16(R0, R2);
+			/* R = (Y-halfCg) + halfCo */
+			R3 = _mm_adds_epi16(R6, R1);
+			/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
+			/* G = Y + Cg(/2) */
+			R4 = _mm_adds_epi16(R0, R2);
+			/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
+			/* B = (Y-halfCg) - Co(/2) */
+			R5 = _mm_subs_epi16(R6, R1);
+			/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
+			/* Repack R's & B's.  */
+			R0 = _mm_packus_epi16(R3, R5);
+			/* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
+			/* Repack G's. */
+			R1 = _mm_packus_epi16(R4, R4);
+			/* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
+			/* And add the A's. */
+			R1 = _mm_unpackhi_epi64(R1, R7);
+			/* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
+			/* Now do interleaving again. */
+			R2 = _mm_unpacklo_epi8(R0, R1);
+			/* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
+			R3 = _mm_unpackhi_epi8(R0, R1);
+			/* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
+			R4 = _mm_unpacklo_epi16(R2, R3);
+			/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
+			R5 = _mm_unpackhi_epi16(R2, R3);
+			/* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
+			_mm_store_si128((__m128i*)dptr, R4);
+			dptr += (128 / 8);
+			_mm_store_si128((__m128i*)dptr, R5);
+			dptr += (128 / 8);
+			w -= 8;
+		}
+
+		/* Handle any remainder pixels. */
+		if (w > 0)
+		{
+			pstatus_t status = 0;
+			status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
+			                                     shift, withAlpha);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return status;
+
+			sptr += w * sizeof(UINT32);
+			dptr += w * sizeof(UINT32);
+		}
+
+		sptr += sRowBump;
+		dptr += dRowBump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
+                                                     UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+                                                     UINT32 DstFormat, UINT32 dstStep, UINT32 width,
+                                                     UINT32 height, UINT8 shift, BOOL withAlpha)
+{
+	const BYTE* sptr = pSrc;
+	BYTE* dptr = (BYTE*)pDst;
+	int sRowBump = srcStep - width * sizeof(UINT32);
+	int dRowBump = dstStep - width * sizeof(UINT32);
+	/* Shift left by "shift" and divide by two is the same as shift
+	 * left by "shift-1".
+	 */
+	int dataShift = shift - 1;
+	BYTE mask = (BYTE)(0xFFU << dataShift);
+
+	/* Let's say the data is of the form:
+	 * y0y0o0g0 a1y1o1g1 a2y2o2g2...
+	 * Apply:
+	 * |R|   | 1  1/2 -1/2 |   |y|
+	 * |G| = | 1  0    1/2 | * |o|
+	 * |B|   | 1 -1/2 -1/2 |   |g|
+	 * where Y is 8-bit unsigned and o & g are 8-bit signed.
+	 */
+
+	if ((width < 8) || (ULONG_PTR)dptr & 0x03)
+	{
+		/* Too small, or we'll never hit a 16-byte boundary.  Punt. */
+		return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+		                                   shift, withAlpha);
+	}
+
+	for (UINT32 h = 0; h < height; h++)
+	{
+		int w = width;
+		BOOL onStride = 0;
+
+		/* Get to a 16-byte destination boundary. */
+		if ((ULONG_PTR)dptr & 0x0f)
+		{
+			pstatus_t status = 0;
+			UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
+
+			if (startup > width)
+				startup = width;
+
+			status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
+			                                     1, shift, withAlpha);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return status;
+
+			sptr += startup * sizeof(UINT32);
+			dptr += startup * sizeof(UINT32);
+			w -= startup;
+		}
+
+		/* Each loop handles eight pixels at a time. */
+		onStride = (((const ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
+
+		while (w >= 8)
+		{
+			__m128i R0;
+			__m128i R1;
+			__m128i R2;
+			__m128i R3;
+			__m128i R4;
+			__m128i R5;
+			__m128i R6;
+			__m128i R7;
+
+			if (onStride)
+			{
+				/* The faster path, 16-byte aligned load. */
+				R0 = _mm_load_si128((const __m128i*)sptr);
+				sptr += (128 / 8);
+				R1 = _mm_load_si128((const __m128i*)sptr);
+				sptr += (128 / 8);
+			}
+			else
+			{
+				/* Off-stride, slower LDDQU load. */
+				R0 = _mm_lddqu_si128((const __m128i*)sptr);
+				sptr += (128 / 8);
+				R1 = _mm_lddqu_si128((const __m128i*)sptr);
+				sptr += (128 / 8);
+			}
+
+			/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
+			/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
+			/* Shuffle to pack all the like types together. */
+			R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
+			R3 = _mm_shuffle_epi8(R0, R2);
+			R4 = _mm_shuffle_epi8(R1, R2);
+			/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
+			/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
+			R5 = _mm_unpackhi_epi32(R3, R4);
+			R6 = _mm_unpacklo_epi32(R3, R4);
+
+			/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
+			/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+			/* Save alphas aside */
+			if (withAlpha)
+				R7 = _mm_unpackhi_epi64(R5, R5);
+			else
+				R7 = _mm_set1_epi32(0xFFFFFFFFU);
+
+			/* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
+			/* Expand Y's from 8-bit unsigned to 16-bit signed. */
+			R1 = _mm_set1_epi32(0);
+			R0 = _mm_unpacklo_epi8(R5, R1);
+			/* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
+			/* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
+			 * Note: this must be done before sign-conversion.
+			 * Note also there is no slli_epi8, so we have to use a 16-bit
+			 * version and then mask.
+			 */
+			R6 = _mm_slli_epi16(R6, dataShift);
+			R1 = _mm_set1_epi8(mask);
+			R6 = _mm_and_si128(R6, R1);
+			/* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+			/* Expand Co's from 8-bit signed to 16-bit signed */
+			R1 = _mm_unpackhi_epi8(R6, R6);
+			R1 = _mm_srai_epi16(R1, 8);
+			/* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
+			/* Expand Cg's form 8-bit signed to 16-bit signed */
+			R2 = _mm_unpacklo_epi8(R6, R6);
+			R2 = _mm_srai_epi16(R2, 8);
+			/* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
+			/* Get Y - halfCg and save */
+			R6 = _mm_subs_epi16(R0, R2);
+			/* R = (Y-halfCg) + halfCo */
+			R3 = _mm_adds_epi16(R6, R1);
+			/* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
+			/* G = Y + Cg(/2) */
+			R4 = _mm_adds_epi16(R0, R2);
+			/* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
+			/* B = (Y-halfCg) - Co(/2) */
+			R5 = _mm_subs_epi16(R6, R1);
+			/* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
+			/* Repack R's & B's.  */
+			/* This line is the only diff between inverted and non-inverted.
+			 * Unfortunately, it would be expensive to check "inverted"
+			 * every time through this loop.
+			 */
+			R0 = _mm_packus_epi16(R5, R3);
+			/* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
+			/* Repack G's. */
+			R1 = _mm_packus_epi16(R4, R4);
+			/* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
+			/* And add the A's. */
+			R1 = _mm_unpackhi_epi64(R1, R7);
+			/* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
+			/* Now do interleaving again. */
+			R2 = _mm_unpacklo_epi8(R0, R1);
+			/* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
+			R3 = _mm_unpackhi_epi8(R0, R1);
+			/* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
+			R4 = _mm_unpacklo_epi16(R2, R3);
+			/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
+			R5 = _mm_unpackhi_epi16(R2, R3);
+			/* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
+			_mm_store_si128((__m128i*)dptr, R4);
+			dptr += (128 / 8);
+			_mm_store_si128((__m128i*)dptr, R5);
+			dptr += (128 / 8);
+			w -= 8;
+		}
+
+		/* Handle any remainder pixels. */
+		if (w > 0)
+		{
+			pstatus_t status = 0;
+			status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
+			                                     shift, withAlpha);
+
+			if (status != PRIMITIVES_SUCCESS)
+				return status;
+
+			sptr += w * sizeof(UINT32);
+			dptr += w * sizeof(UINT32);
+		}
+
+		sptr += sRowBump;
+		dptr += dRowBump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+                                           BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
+                                           INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
+                                           BOOL withAlpha)
+{
+	switch (DstFormat)
+	{
+		case PIXEL_FORMAT_BGRX32:
+		case PIXEL_FORMAT_BGRA32:
+			return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width,
+			                                        height, shift, withAlpha);
+
+		case PIXEL_FORMAT_RGBX32:
+		case PIXEL_FORMAT_RGBA32:
+			return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, DstFormat, dstStep,
+			                                           width, height, shift, withAlpha);
+
+		default:
+			return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
+			                                   height, shift, withAlpha);
+	}
+}
+#elif defined(WITH_NEON)
+
+static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+                                      BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
+                                      UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
+                                      BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
+{
+	BYTE* dptr = pDst;
+	const BYTE* sptr = pSrc;
+	const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+	const int8_t cll = shift - 1; /* -1 builds in the /2's */
+	const UINT32 srcPad = srcStep - (width * 4);
+	const UINT32 dstPad = dstStep - (width * formatSize);
+	const UINT32 pad = width % 8;
+	const uint8x8_t aVal = vdup_n_u8(0xFF);
+	const int8x8_t cllv = vdup_n_s8(cll);
+
+	for (UINT32 y = 0; y < height; y++)
+	{
+		for (UINT32 x = 0; x < width - pad; x += 8)
+		{
+			/* Note: shifts must be done before sign-conversion. */
+			const uint8x8x4_t raw = vld4_u8(sptr);
+			const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
+			const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
+			const int16x8_t Cg = vmovl_s8(CgRaw);
+			const int16x8_t Co = vmovl_s8(CoRaw);
+			const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
+			const int16x8_t T = vsubq_s16(Y, Cg);
+			const int16x8_t R = vaddq_s16(T, Co);
+			const int16x8_t G = vaddq_s16(Y, Cg);
+			const int16x8_t B = vsubq_s16(T, Co);
+			uint8x8x4_t bgrx;
+			bgrx.val[bPos] = vqmovun_s16(B);
+			bgrx.val[gPos] = vqmovun_s16(G);
+			bgrx.val[rPos] = vqmovun_s16(R);
+
+			if (alpha)
+				bgrx.val[aPos] = raw.val[3];
+			else
+				bgrx.val[aPos] = aVal;
+
+			vst4_u8(dptr, bgrx);
+			sptr += sizeof(raw);
+			dptr += sizeof(bgrx);
+		}
+
+		for (UINT32 x = 0; x < pad; x++)
+		{
+			/* Note: shifts must be done before sign-conversion. */
+			const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
+			const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
+			const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
+			const INT16 T = Y - Cg;
+			const INT16 R = T + Co;
+			const INT16 G = Y + Cg;
+			const INT16 B = T - Co;
+			BYTE bgra[4];
+			bgra[bPos] = CLIP(B);
+			bgra[gPos] = CLIP(G);
+			bgra[rPos] = CLIP(R);
+			bgra[aPos] = *sptr++;
+
+			if (!alpha)
+				bgra[aPos] = 0xFF;
+
+			*dptr++ = bgra[0];
+			*dptr++ = bgra[1];
+			*dptr++ = bgra[2];
+			*dptr++ = bgra[3];
+		}
+
+		sptr += srcPad;
+		dptr += dstPad;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+                                         BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
+                                         UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
+{
+	switch (DstFormat)
+	{
+		case PIXEL_FORMAT_BGRA32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 2, 1, 0, 3, withAlpha);
+
+		case PIXEL_FORMAT_BGRX32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 2, 1, 0, 3, withAlpha);
+
+		case PIXEL_FORMAT_RGBA32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 0, 1, 2, 3, withAlpha);
+
+		case PIXEL_FORMAT_RGBX32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 0, 1, 2, 3, withAlpha);
+
+		case PIXEL_FORMAT_ARGB32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 1, 2, 3, 0, withAlpha);
+
+		case PIXEL_FORMAT_XRGB32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 1, 2, 3, 0, withAlpha);
+
+		case PIXEL_FORMAT_ABGR32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 3, 2, 1, 0, withAlpha);
+
+		case PIXEL_FORMAT_XBGR32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 3, 2, 1, 0, withAlpha);
+
+		default:
+			return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
+			                                   height, shift, withAlpha);
+	}
+}
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	generic = primitives_get_generic();
+	primitives_init_YCoCg(prims);
+	/* While IPP acknowledges the existence of YCoCg-R, it doesn't currently
+	 * include any routines to work with it, especially with variable shift
+	 * width.
+	 */
+#if defined(WITH_SSE2)
+
+	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
+	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+	{
+		prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
+	}
+
+#elif defined(WITH_NEON)
+
+	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+	{
+		prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
+	}
+
+#endif /* WITH_SSE2 */
+}