author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 01:24:41 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 01:24:41 +0000
commit     a9bcc81f821d7c66f623779fa5147e728eb3c388 (patch)
tree       98676963bcdd537ae5908a067a8eb110b93486a6 /libfreerdp/primitives
parent     Initial commit. (diff)
download   freerdp3-a9bcc81f821d7c66f623779fa5147e728eb3c388.tar.xz
           freerdp3-a9bcc81f821d7c66f623779fa5147e728eb3c388.zip
Adding upstream version 3.3.0+dfsg1. (tag: upstream/3.3.0+dfsg1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'libfreerdp/primitives')
-rw-r--r--  libfreerdp/primitives/README.txt                        101
-rw-r--r--  libfreerdp/primitives/prim_YCoCg.c                       73
-rw-r--r--  libfreerdp/primitives/prim_YCoCg_opt.c                  589
-rw-r--r--  libfreerdp/primitives/prim_YUV.c                       1877
-rw-r--r--  libfreerdp/primitives/prim_YUV_neon.c                   751
-rw-r--r--  libfreerdp/primitives/prim_YUV_opencl.c                 500
-rw-r--r--  libfreerdp/primitives/prim_YUV_ssse3.c                 1515
-rw-r--r--  libfreerdp/primitives/prim_add.c                         48
-rw-r--r--  libfreerdp/primitives/prim_add_opt.c                     61
-rw-r--r--  libfreerdp/primitives/prim_alphaComp.c                   94
-rw-r--r--  libfreerdp/primitives/prim_alphaComp_opt.c              245
-rw-r--r--  libfreerdp/primitives/prim_andor.c                       57
-rw-r--r--  libfreerdp/primitives/prim_andor_opt.c                   63
-rw-r--r--  libfreerdp/primitives/prim_colors.c                     509
-rw-r--r--  libfreerdp/primitives/prim_colors_opt.c                1591
-rw-r--r--  libfreerdp/primitives/prim_copy.c                       178
-rw-r--r--  libfreerdp/primitives/prim_internal.h                   297
-rw-r--r--  libfreerdp/primitives/prim_set.c                        122
-rw-r--r--  libfreerdp/primitives/prim_set_opt.c                    256
-rw-r--r--  libfreerdp/primitives/prim_shift.c                      115
-rw-r--r--  libfreerdp/primitives/prim_shift_opt.c                   80
-rw-r--r--  libfreerdp/primitives/prim_sign.c                        42
-rw-r--r--  libfreerdp/primitives/prim_sign_opt.c                   185
-rw-r--r--  libfreerdp/primitives/prim_templates.h                  444
-rw-r--r--  libfreerdp/primitives/primitives.c                      412
-rw-r--r--  libfreerdp/primitives/primitives.cl                     463
-rw-r--r--  libfreerdp/primitives/test/CMakeLists.txt                45
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesAdd.c           82
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesAlphaComp.c    202
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesAndOr.c        169
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesColors.c       298
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesCopy.c          90
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesSet.c          274
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesShift.c        470
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesSign.c          93
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesYCbCr.c       1835
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesYCoCg.c        145
-rw-r--r--  libfreerdp/primitives/test/TestPrimitivesYUV.c          979
-rw-r--r--  libfreerdp/primitives/test/measure.h                    145
-rw-r--r--  libfreerdp/primitives/test/prim_test.c                  109
-rw-r--r--  libfreerdp/primitives/test/prim_test.h                   59
41 files changed, 15663 insertions, 0 deletions
diff --git a/libfreerdp/primitives/README.txt b/libfreerdp/primitives/README.txt
new file mode 100644
index 0000000..81c7e97
--- /dev/null
+++ b/libfreerdp/primitives/README.txt
@@ -0,0 +1,101 @@
+The Primitives Library
+
+Introduction
+------------
+The purpose of the primitives library is to give the freerdp code easy
+access to *run-time* optimization via SIMD operations. When the library
+is initialized, dynamic checks of processor features are run (such as
+the support of SSE3 or Neon), and entrypoints are linked to through
+function pointers to provide the fastest possible operations. All
+routines offer generic C alternatives as fallbacks.
+
+Run-time optimization has the advantage of allowing a single executable
+to run fast on multiple platforms with different SIMD capabilities.
+
+
+Use In Code
+-----------
+A singleton pointing to a structure containing the function pointers
+is accessed through primitives_get(). The function pointers can then
+be used from that structure, e.g.
+
+ primitives_t *prims = primitives_get();
+ prims->shiftC_16s(buffer, shifts, buffer, 256);
+
+Of course, there is some overhead in calling through the function pointer
+and setting up the SIMD operations, so it would be counterproductive to
+call the primitives library for very small operations, e.g. initializing an
+array of eight values to a constant. The primitives library is intended
+for larger-scale operations, e.g. arrays of size 64 and larger.
+
+
+Initialization and Cleanup
+--------------------------
+Library initialization is done the first time primitives_init() is called
+or the first time primitives_get() is used. Cleanup (if any) is done by
+primitives_deinit().
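+
+A minimal lifecycle sketch (add_16s is one of the existing 16-bit
+entrypoints; error checking omitted):
+
+    #include <freerdp/primitives.h>
+
+    INT16 src1[4096], src2[4096], dst[4096];
+    /* ... fill src1/src2 ... */
+
+    primitives_init();               /* optional; get() initializes too */
+    primitives_t *prims = primitives_get();
+    prims->add_16s(src1, src2, dst, 4096);
+    primitives_deinit();             /* cleanup at shutdown */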
+
+
+Intel Integrated Performance Primitives (IPP)
+---------------------------------------------
+If freerdp is compiled with IPP support (-DWITH_IPP=ON), the IPP function
+calls will be used (where available) to fill the function pointers.
+Where possible, function names and parameter lists match IPP format so
+that the IPP functions can be plugged into the function pointers without
+a wrapper layer. Use of IPP is completely optional, and in many cases
+the SSE operations in the primitives library itself are faster or similar
+in performance.
+
+
+Coverage
+--------
+The primitives library is not meant to be comprehensive, with an
+entrypoint for every operation and operand type. Instead, the coverage
+is focused on operations known to be performance bottlenecks in the code.
+For instance, 16-bit signed operations are used widely in the RemoteFX
+software, so you'll find 16s versions of several operations, but there
+is no attempt to provide (unused) copies of the same code for 8u, 16u,
+32s, etc.
+
+
+New Optimizations
+-----------------
+As the need arises, new optimizations can be added to the library,
+including NEON, AVX, and perhaps OpenCL or other SIMD implementations.
+The CPU feature detection is done in winpr/sysinfo.
+
+
+Adding Entrypoints
+------------------
+As the need for new operations or operands arises, new entrypoints can
+be added; a sketch of the full pattern follows this list.
+ 1) Function prototypes and pointers are added to
+ include/freerdp/primitives.h
+ 2) New module initialization and cleanup function prototypes are added
+ to prim_internal.h and called in primitives.c (primitives_init()
+ and primitives_deinit()).
+ 3) Operation names and parameter lists should be compatible with the IPP.
+ IPP manuals are available online at software.intel.com.
+ 4) A generic C entrypoint must be available as a fallback.
+ 5) prim_templates.h contains macro-based templates for simple operations,
+ such as applying a single SSE operation to arrays of data.
+ The template functions can frequently be used to extend the
+ operations without writing a lot of new code.
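+
+For illustration, a hypothetical "negate_16s" entrypoint might look like
+this (sketch only; these names are examples, not existing API):
+
+    /* include/freerdp/primitives.h */
+    typedef pstatus_t (*__negate_16s_t)(const INT16 *pSrc, INT16 *pDst,
+                                        UINT32 len);
+    /* ...plus a "__negate_16s_t negate_16s;" member in primitives_t. */
+
+    /* prim_negate.c: the required generic C fallback */
+    static pstatus_t general_negate_16s(const INT16 *pSrc, INT16 *pDst,
+                                        UINT32 len)
+    {
+        while (len--)
+            *pDst++ = -(*pSrc++);
+
+        return PRIMITIVES_SUCCESS;
+    }
+
+    void primitives_init_negate(primitives_t *prims)
+    {
+        prims->negate_16s = general_negate_16s;
+    }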
+
+Cache Management
+----------------
+I haven't found much speed improvement from attempting prefetch, and in
+fact it seems to have a negative impact in some cases. Done correctly,
+the routines could perhaps be further accelerated by proper use of
+prefetch, fences, etc.
+
+
+Testing
+-------
+In the test subdirectory is an executable (prim_test) that tests both
+functionality and speed of primitives library operations. Any new
+modules should be added to that test, following the conventions already
+established in that directory. The program can be executed on various
+target hardware to compare generic C, optimized, and IPP performance
+with various array sizes.
+
diff --git a/libfreerdp/primitives/prim_YCoCg.c b/libfreerdp/primitives/prim_YCoCg.c
new file mode 100644
index 0000000..7c1a429
--- /dev/null
+++ b/libfreerdp/primitives/prim_YCoCg.c
@@ -0,0 +1,73 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * YCoCg<->RGB Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+/* Helper function to convert raw 8-bit values to signed 16-bit values.
+ */
+static INT16 convert(UINT8 raw, int shift)
+{
+ const int cll = shift - 1; /* -1 builds in the /2's */
+ return (INT16)((INT8)(raw << cll));
+}
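+
+/* Worked example (illustrative): with shift = 2, cll = 1, so
+ * convert(0x70, 2) computes 0x70 << 1 = 0xE0, reinterprets that as
+ * INT8 (-32) and widens it to INT16. The single shift thus scales the
+ * raw value, positions the sign bit, and builds in the /2.
+ */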
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t general_YCoCgToRGB_8u_AC4R(const BYTE* pSrc, INT32 srcStep, BYTE* pDst,
+ UINT32 DstFormat, INT32 dstStep, UINT32 width,
+ UINT32 height, UINT8 shift, BOOL withAlpha)
+{
+ const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+ fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, TRUE);
+
+ for (UINT32 y = 0; y < height; y++)
+ {
+ const BYTE* sptr = &pSrc[srcStep * y];
+ BYTE* dptr = &pDst[dstStep * y];
+ for (UINT32 x = 0; x < width; x++)
+ {
+ /* Note: shifts must be done before sign-conversion. */
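+			/* YCoCg-R inverse (Co and Cg already halved by convert()):
+			 * R = Y + Co - Cg, G = Y + Cg, B = Y - Co - Cg,
+			 * computed via the shared term T = Y - Cg. */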
+ const INT16 Cg = convert(*sptr++, shift);
+ const INT16 Co = convert(*sptr++, shift);
+ const INT16 Y = *sptr++; /* UINT8->INT16 */
+			const INT16 T = Y - Cg;
+			const INT16 R = T + Co;
+			const INT16 G = Y + Cg;
+			const INT16 B = T - Co;
+ BYTE A = *sptr++;
+
+ if (!withAlpha)
+ A = 0xFFU;
+
+ dptr = writePixel(dptr, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), A);
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_YCoCg(primitives_t* prims)
+{
+ prims->YCoCgToRGB_8u_AC4R = general_YCoCgToRGB_8u_AC4R;
+}
diff --git a/libfreerdp/primitives/prim_YCoCg_opt.c b/libfreerdp/primitives/prim_YCoCg_opt.c
new file mode 100644
index 0000000..bba13fa
--- /dev/null
+++ b/libfreerdp/primitives/prim_YCoCg_opt.c
@@ -0,0 +1,589 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized YCoCg<->RGB conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#elif defined(WITH_NEON)
+#include <arm_neon.h>
+#endif /* WITH_SSE2 else WITH_NEON */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+static primitives_t* generic = NULL;
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
+ UINT32 dstStep, UINT32 width, UINT32 height,
+ UINT8 shift, BOOL withAlpha)
+{
+ const BYTE* sptr = pSrc;
+ BYTE* dptr = (BYTE*)pDst;
+ int sRowBump = srcStep - width * sizeof(UINT32);
+ int dRowBump = dstStep - width * sizeof(UINT32);
+ /* Shift left by "shift" and divide by two is the same as shift
+ * left by "shift-1".
+ */
+ int dataShift = shift - 1;
+ BYTE mask = (BYTE)(0xFFU << dataShift);
+
+ /* Let's say the data is of the form:
+	 * a0y0o0g0 a1y1o1g1 a2y2o2g2...
+ * Apply:
+ * |R| | 1 1/2 -1/2 | |y|
+ * |G| = | 1 0 1/2 | * |o|
+ * |B| | 1 -1/2 -1/2 | |g|
+ * where Y is 8-bit unsigned and o & g are 8-bit signed.
+ */
+
+ if ((width < 8) || (ULONG_PTR)dptr & 0x03)
+ {
+ /* Too small, or we'll never hit a 16-byte boundary. Punt. */
+ return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+ shift, withAlpha);
+ }
+
+ for (UINT32 h = 0; h < height; h++)
+ {
+ UINT32 w = width;
+ BOOL onStride = 0;
+
+ /* Get to a 16-byte destination boundary. */
+ if ((ULONG_PTR)dptr & 0x0f)
+ {
+ pstatus_t status = 0;
+ UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
+
+ if (startup > width)
+ startup = width;
+
+ status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
+ 1, shift, withAlpha);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return status;
+
+ sptr += startup * sizeof(UINT32);
+ dptr += startup * sizeof(UINT32);
+ w -= startup;
+ }
+
+ /* Each loop handles eight pixels at a time. */
+ onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
+
+ while (w >= 8)
+ {
+ __m128i R0;
+ __m128i R1;
+ __m128i R2;
+ __m128i R3;
+ __m128i R4;
+ __m128i R5;
+ __m128i R6;
+ __m128i R7;
+
+ if (onStride)
+ {
+ /* The faster path, 16-byte aligned load. */
+ R0 = _mm_load_si128((const __m128i*)sptr);
+ sptr += (128 / 8);
+ R1 = _mm_load_si128((const __m128i*)sptr);
+ sptr += (128 / 8);
+ }
+ else
+ {
+ /* Off-stride, slower LDDQU load. */
+ R0 = _mm_lddqu_si128((const __m128i*)sptr);
+ sptr += (128 / 8);
+ R1 = _mm_lddqu_si128((const __m128i*)sptr);
+ sptr += (128 / 8);
+ }
+
+ /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
+ /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
+ /* Shuffle to pack all the like types together. */
+ R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
+ R3 = _mm_shuffle_epi8(R0, R2);
+ R4 = _mm_shuffle_epi8(R1, R2);
+ /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
+ /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
+ R5 = _mm_unpackhi_epi32(R3, R4);
+ R6 = _mm_unpacklo_epi32(R3, R4);
+
+ /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
+ /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+ /* Save alphas aside */
+ if (withAlpha)
+ R7 = _mm_unpackhi_epi64(R5, R5);
+ else
+ R7 = _mm_set1_epi32(0xFFFFFFFFU);
+
+ /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
+ /* Expand Y's from 8-bit unsigned to 16-bit signed. */
+ R1 = _mm_set1_epi32(0);
+ R0 = _mm_unpacklo_epi8(R5, R1);
+ /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
+ /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
+ * Note: this must be done before sign-conversion.
+ * Note also there is no slli_epi8, so we have to use a 16-bit
+ * version and then mask.
+ */
+ R6 = _mm_slli_epi16(R6, dataShift);
+ R1 = _mm_set1_epi8(mask);
+ R6 = _mm_and_si128(R6, R1);
+ /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+ /* Expand Co's from 8-bit signed to 16-bit signed */
+ R1 = _mm_unpackhi_epi8(R6, R6);
+ R1 = _mm_srai_epi16(R1, 8);
+ /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
+			/* Expand Cg's from 8-bit signed to 16-bit signed */
+ R2 = _mm_unpacklo_epi8(R6, R6);
+ R2 = _mm_srai_epi16(R2, 8);
+ /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
+ /* Get Y - halfCg and save */
+ R6 = _mm_subs_epi16(R0, R2);
+ /* R = (Y-halfCg) + halfCo */
+ R3 = _mm_adds_epi16(R6, R1);
+ /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
+ /* G = Y + Cg(/2) */
+ R4 = _mm_adds_epi16(R0, R2);
+ /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
+ /* B = (Y-halfCg) - Co(/2) */
+ R5 = _mm_subs_epi16(R6, R1);
+ /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
+ /* Repack R's & B's. */
+ R0 = _mm_packus_epi16(R3, R5);
+ /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
+ /* Repack G's. */
+ R1 = _mm_packus_epi16(R4, R4);
+			/* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
+ /* And add the A's. */
+ R1 = _mm_unpackhi_epi64(R1, R7);
+			/* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
+ /* Now do interleaving again. */
+ R2 = _mm_unpacklo_epi8(R0, R1);
+ /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
+ R3 = _mm_unpackhi_epi8(R0, R1);
+ /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
+ R4 = _mm_unpacklo_epi16(R2, R3);
+ /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
+ R5 = _mm_unpackhi_epi16(R2, R3);
+			/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
+ _mm_store_si128((__m128i*)dptr, R4);
+ dptr += (128 / 8);
+ _mm_store_si128((__m128i*)dptr, R5);
+ dptr += (128 / 8);
+ w -= 8;
+ }
+
+ /* Handle any remainder pixels. */
+ if (w > 0)
+ {
+ pstatus_t status = 0;
+ status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
+ shift, withAlpha);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return status;
+
+ sptr += w * sizeof(UINT32);
+ dptr += w * sizeof(UINT32);
+ }
+
+ sptr += sRowBump;
+ dptr += dRowBump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT pSrc,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+ UINT32 DstFormat, UINT32 dstStep, UINT32 width,
+ UINT32 height, UINT8 shift, BOOL withAlpha)
+{
+ const BYTE* sptr = pSrc;
+ BYTE* dptr = (BYTE*)pDst;
+ int sRowBump = srcStep - width * sizeof(UINT32);
+ int dRowBump = dstStep - width * sizeof(UINT32);
+ /* Shift left by "shift" and divide by two is the same as shift
+ * left by "shift-1".
+ */
+ int dataShift = shift - 1;
+ BYTE mask = (BYTE)(0xFFU << dataShift);
+
+ /* Let's say the data is of the form:
+	 * a0y0o0g0 a1y1o1g1 a2y2o2g2...
+ * Apply:
+ * |R| | 1 1/2 -1/2 | |y|
+ * |G| = | 1 0 1/2 | * |o|
+ * |B| | 1 -1/2 -1/2 | |g|
+ * where Y is 8-bit unsigned and o & g are 8-bit signed.
+ */
+
+ if ((width < 8) || (ULONG_PTR)dptr & 0x03)
+ {
+ /* Too small, or we'll never hit a 16-byte boundary. Punt. */
+ return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+ shift, withAlpha);
+ }
+
+ for (UINT32 h = 0; h < height; h++)
+ {
+ int w = width;
+ BOOL onStride = 0;
+
+ /* Get to a 16-byte destination boundary. */
+ if ((ULONG_PTR)dptr & 0x0f)
+ {
+ pstatus_t status = 0;
+ UINT32 startup = (16 - ((ULONG_PTR)dptr & 0x0f)) / 4;
+
+ if (startup > width)
+ startup = width;
+
+ status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, startup,
+ 1, shift, withAlpha);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return status;
+
+ sptr += startup * sizeof(UINT32);
+ dptr += startup * sizeof(UINT32);
+ w -= startup;
+ }
+
+ /* Each loop handles eight pixels at a time. */
+		onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
+
+ while (w >= 8)
+ {
+ __m128i R0;
+ __m128i R1;
+ __m128i R2;
+ __m128i R3;
+ __m128i R4;
+ __m128i R5;
+ __m128i R6;
+ __m128i R7;
+
+ if (onStride)
+ {
+ /* The faster path, 16-byte aligned load. */
+ R0 = _mm_load_si128((const __m128i*)sptr);
+ sptr += (128 / 8);
+ R1 = _mm_load_si128((const __m128i*)sptr);
+ sptr += (128 / 8);
+ }
+ else
+ {
+ /* Off-stride, slower LDDQU load. */
+ R0 = _mm_lddqu_si128((const __m128i*)sptr);
+ sptr += (128 / 8);
+ R1 = _mm_lddqu_si128((const __m128i*)sptr);
+ sptr += (128 / 8);
+ }
+
+ /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
+ /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
+ /* Shuffle to pack all the like types together. */
+ R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
+ R3 = _mm_shuffle_epi8(R0, R2);
+ R4 = _mm_shuffle_epi8(R1, R2);
+ /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
+ /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
+ R5 = _mm_unpackhi_epi32(R3, R4);
+ R6 = _mm_unpacklo_epi32(R3, R4);
+
+ /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
+ /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+ /* Save alphas aside */
+ if (withAlpha)
+ R7 = _mm_unpackhi_epi64(R5, R5);
+ else
+ R7 = _mm_set1_epi32(0xFFFFFFFFU);
+
+ /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
+ /* Expand Y's from 8-bit unsigned to 16-bit signed. */
+ R1 = _mm_set1_epi32(0);
+ R0 = _mm_unpacklo_epi8(R5, R1);
+ /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
+ /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
+ * Note: this must be done before sign-conversion.
+ * Note also there is no slli_epi8, so we have to use a 16-bit
+ * version and then mask.
+ */
+ R6 = _mm_slli_epi16(R6, dataShift);
+ R1 = _mm_set1_epi8(mask);
+ R6 = _mm_and_si128(R6, R1);
+ /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+ /* Expand Co's from 8-bit signed to 16-bit signed */
+ R1 = _mm_unpackhi_epi8(R6, R6);
+ R1 = _mm_srai_epi16(R1, 8);
+ /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
+			/* Expand Cg's from 8-bit signed to 16-bit signed */
+ R2 = _mm_unpacklo_epi8(R6, R6);
+ R2 = _mm_srai_epi16(R2, 8);
+ /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
+ /* Get Y - halfCg and save */
+ R6 = _mm_subs_epi16(R0, R2);
+ /* R = (Y-halfCg) + halfCo */
+ R3 = _mm_adds_epi16(R6, R1);
+ /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
+ /* G = Y + Cg(/2) */
+ R4 = _mm_adds_epi16(R0, R2);
+ /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
+ /* B = (Y-halfCg) - Co(/2) */
+ R5 = _mm_subs_epi16(R6, R1);
+ /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
+ /* Repack R's & B's. */
+ /* This line is the only diff between inverted and non-inverted.
+ * Unfortunately, it would be expensive to check "inverted"
+ * every time through this loop.
+ */
+ R0 = _mm_packus_epi16(R5, R3);
+ /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
+ /* Repack G's. */
+ R1 = _mm_packus_epi16(R4, R4);
+			/* R1 = G7G6G5G4 G3G2G1G0 G7G6G5G4 G3G2G1G0 */
+ /* And add the A's. */
+ R1 = _mm_unpackhi_epi64(R1, R7);
+			/* R1 = A7A6A5A4 A3A2A1A0 G7G6G5G4 G3G2G1G0 */
+ /* Now do interleaving again. */
+ R2 = _mm_unpacklo_epi8(R0, R1);
+ /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
+ R3 = _mm_unpackhi_epi8(R0, R1);
+ /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
+ R4 = _mm_unpacklo_epi16(R2, R3);
+ /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
+ R5 = _mm_unpackhi_epi16(R2, R3);
+			/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
+ _mm_store_si128((__m128i*)dptr, R4);
+ dptr += (128 / 8);
+ _mm_store_si128((__m128i*)dptr, R5);
+ dptr += (128 / 8);
+ w -= 8;
+ }
+
+ /* Handle any remainder pixels. */
+ if (w > 0)
+ {
+ pstatus_t status = 0;
+ status = generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, DstFormat, dstStep, w, 1,
+ shift, withAlpha);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return status;
+
+ sptr += w * sizeof(UINT32);
+ dptr += w * sizeof(UINT32);
+ }
+
+ sptr += sRowBump;
+ dptr += dRowBump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
+ INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift,
+ BOOL withAlpha)
+{
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRX32:
+ case PIXEL_FORMAT_BGRA32:
+ return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, DstFormat, dstStep, width,
+ height, shift, withAlpha);
+
+ case PIXEL_FORMAT_RGBX32:
+ case PIXEL_FORMAT_RGBA32:
+ return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, DstFormat, dstStep,
+ width, height, shift, withAlpha);
+
+ default:
+ return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
+ height, shift, withAlpha);
+ }
+}
+#elif defined(WITH_NEON)
+
+static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
+                                      UINT32 width, UINT32 height, UINT8 shift, BYTE rPos,
+                                      BYTE gPos, BYTE bPos, BYTE aPos, BOOL alpha)
+{
+ BYTE* dptr = pDst;
+ const BYTE* sptr = pSrc;
+ const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+ const int8_t cll = shift - 1; /* -1 builds in the /2's */
+ const UINT32 srcPad = srcStep - (width * 4);
+ const UINT32 dstPad = dstStep - (width * formatSize);
+ const UINT32 pad = width % 8;
+ const uint8x8_t aVal = vdup_n_u8(0xFF);
+ const int8x8_t cllv = vdup_n_s8(cll);
+
+ for (UINT32 y = 0; y < height; y++)
+ {
+ for (UINT32 x = 0; x < width - pad; x += 8)
+ {
+ /* Note: shifts must be done before sign-conversion. */
+ const uint8x8x4_t raw = vld4_u8(sptr);
+ const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
+ const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
+ const int16x8_t Cg = vmovl_s8(CgRaw);
+ const int16x8_t Co = vmovl_s8(CoRaw);
+ const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
+ const int16x8_t T = vsubq_s16(Y, Cg);
+ const int16x8_t R = vaddq_s16(T, Co);
+ const int16x8_t G = vaddq_s16(Y, Cg);
+ const int16x8_t B = vsubq_s16(T, Co);
+ uint8x8x4_t bgrx;
+ bgrx.val[bPos] = vqmovun_s16(B);
+ bgrx.val[gPos] = vqmovun_s16(G);
+ bgrx.val[rPos] = vqmovun_s16(R);
+
+ if (alpha)
+ bgrx.val[aPos] = raw.val[3];
+ else
+ bgrx.val[aPos] = aVal;
+
+ vst4_u8(dptr, bgrx);
+ sptr += sizeof(raw);
+ dptr += sizeof(bgrx);
+ }
+
+ for (UINT32 x = 0; x < pad; x++)
+ {
+ /* Note: shifts must be done before sign-conversion. */
+ const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
+ const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
+ const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
+ const INT16 T = Y - Cg;
+ const INT16 R = T + Co;
+ const INT16 G = Y + Cg;
+ const INT16 B = T - Co;
+ BYTE bgra[4];
+ bgra[bPos] = CLIP(B);
+ bgra[gPos] = CLIP(G);
+ bgra[rPos] = CLIP(R);
+ bgra[aPos] = *sptr++;
+
+ if (!alpha)
+ bgra[aPos] = 0xFF;
+
+ *dptr++ = bgra[0];
+ *dptr++ = bgra[1];
+ *dptr++ = bgra[2];
+ *dptr++ = bgra[3];
+ }
+
+ sptr += srcPad;
+ dptr += dstPad;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
+ UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
+{
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+ shift, 2, 1, 0, 3, withAlpha);
+
+ case PIXEL_FORMAT_BGRX32:
+ return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+ shift, 2, 1, 0, 3, withAlpha);
+
+ case PIXEL_FORMAT_RGBA32:
+ return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+ shift, 0, 1, 2, 3, withAlpha);
+
+ case PIXEL_FORMAT_RGBX32:
+ return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+ shift, 0, 1, 2, 3, withAlpha);
+
+ case PIXEL_FORMAT_ARGB32:
+ return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+ shift, 1, 2, 3, 0, withAlpha);
+
+ case PIXEL_FORMAT_XRGB32:
+ return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+ shift, 1, 2, 3, 0, withAlpha);
+
+ case PIXEL_FORMAT_ABGR32:
+ return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+ shift, 3, 2, 1, 0, withAlpha);
+
+ case PIXEL_FORMAT_XBGR32:
+ return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+ shift, 3, 2, 1, 0, withAlpha);
+
+ default:
+ return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
+ height, shift, withAlpha);
+ }
+}
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_YCoCg(prims);
+ /* While IPP acknowledges the existence of YCoCg-R, it doesn't currently
+ * include any routines to work with it, especially with variable shift
+ * width.
+ */
+#if defined(WITH_SSE2)
+
+ if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
+ IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
+ }
+
+#elif defined(WITH_NEON)
+
+ if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
+ }
+
+#endif /* WITH_SSE2 */
+}
diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c
new file mode 100644
index 0000000..ec02139
--- /dev/null
+++ b/libfreerdp/primitives/prim_YUV.c
@@ -0,0 +1,1877 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Generic YUV/RGB conversion operations
+ *
+ * Copyright 2014 Marc-Andre Moreau <marcandre.moreau@gmail.com>
+ * Copyright 2015-2017 Armin Novak <armin.novak@thincast.com>
+ * Copyright 2015-2017 Norbert Federa <norbert.federa@thincast.com>
+ * Copyright 2015-2017 Vic Lee
+ * Copyright 2015-2017 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <winpr/wtypes.h>
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <freerdp/codec/color.h>
+#include "prim_internal.h"
+
+static pstatus_t general_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
+ const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfWidth = (nWidth + 1) / 2;
+ const UINT32 halfHeight = (nHeight + 1) / 2;
+ const UINT32 oddY = 1;
+ const UINT32 evenY = 0;
+ const UINT32 oddX = 1;
+ const UINT32 evenX = 0;
+ const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+ pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+ pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+ BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+ pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+ pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+
+ /* Y data is already here... */
+ /* B1 */
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ const BYTE* Ym = pSrc[0] + srcStep[0] * y;
+ BYTE* pY = pDst[0] + dstStep[0] * y;
+ memcpy(pY, Ym, nWidth);
+ }
+
+ /* The first half of U, V are already here part of this frame. */
+ /* B2 and B3 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const UINT32 val2y = (2 * y + evenY);
+ const UINT32 val2y1 = val2y + oddY;
+ const BYTE* Um = pSrc[1] + srcStep[1] * y;
+ const BYTE* Vm = pSrc[2] + srcStep[2] * y;
+ BYTE* pU = pDst[1] + dstStep[1] * val2y;
+ BYTE* pV = pDst[2] + dstStep[2] * val2y;
+ BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
+ BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
+
+ for (UINT32 x = 0; x < halfWidth; x++)
+ {
+ const UINT32 val2x = 2 * x + evenX;
+ const UINT32 val2x1 = val2x + oddX;
+ pU[val2x] = Um[x];
+ pV[val2x] = Vm[x];
+ pU[val2x1] = Um[x];
+ pV[val2x1] = Vm[x];
+ pU1[val2x] = Um[x];
+ pV1[val2x] = Vm[x];
+ pU1[val2x1] = Um[x];
+ pV1[val2x1] = Vm[x];
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t general_ChromaFilter(BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 oddY = 1;
+ const UINT32 evenY = 0;
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfHeight = (nHeight + 1) / 2;
+ const UINT32 halfWidth = (nWidth + 1) / 2;
+
+ /* Filter */
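+	/* The even (2x,2y) samples still hold the 2x2 block average from the
+	 * main YUV420 view, while the other three samples of each block are
+	 * exact, so 4*avg - right - below - diagonal recovers the original
+	 * even sample (CONDITIONAL_CLIP keeps the average on overflow). */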
+ for (UINT32 y = roi->top; y < halfHeight + roi->top; y++)
+ {
+ const UINT32 val2y = (y * 2 + evenY);
+ const UINT32 val2y1 = val2y + oddY;
+ BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
+ BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
+ BYTE* pU = pDst[1] + dstStep[1] * val2y;
+ BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+ if (val2y1 > nHeight)
+ continue;
+
+ for (UINT32 x = roi->left; x < halfWidth + roi->left; x++)
+ {
+ const UINT32 val2x = (x * 2);
+ const UINT32 val2x1 = val2x + 1;
+ const BYTE inU = pU[val2x];
+ const BYTE inV = pV[val2x];
+ const INT32 up = inU * 4;
+ const INT32 vp = inV * 4;
+ INT32 u2020 = 0;
+ INT32 v2020 = 0;
+
+ if (val2x1 > nWidth)
+ continue;
+
+ u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
+ v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
+
+ pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
+ pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t general_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
+ const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 mod = 16;
+ UINT32 uY = 0;
+ UINT32 vY = 0;
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfWidth = (nWidth) / 2;
+ const UINT32 halfHeight = (nHeight) / 2;
+ const UINT32 oddY = 1;
+ const UINT32 evenY = 0;
+ const UINT32 oddX = 1;
+	/* The auxiliary frame is aligned to multiples of 16x16.
+	 * We need the padded height for B4 and B5 conversion. */
+	const UINT32 padHeight = nHeight + 16 - nHeight % 16;
+ const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+ pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+ pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+ BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+ pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+ pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+
+ /* The second half of U and V is a bit more tricky... */
+ /* B4 and B5 */
+	for (UINT32 y = 0; y < padHeight; y++)
+ {
+ const BYTE* Ya = pSrc[0] + srcStep[0] * y;
+ BYTE* pX = NULL;
+
+ if ((y) % mod < (mod + 1) / 2)
+ {
+ const UINT32 pos = (2 * uY++ + oddY);
+
+ if (pos >= nHeight)
+ continue;
+
+ pX = pDst[1] + dstStep[1] * pos;
+ }
+ else
+ {
+ const UINT32 pos = (2 * vY++ + oddY);
+
+ if (pos >= nHeight)
+ continue;
+
+ pX = pDst[2] + dstStep[2] * pos;
+ }
+
+ memcpy(pX, Ya, nWidth);
+ }
+
+ /* B6 and B7 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const UINT32 val2y = (y * 2 + evenY);
+ const BYTE* Ua = pSrc[1] + srcStep[1] * y;
+ const BYTE* Va = pSrc[2] + srcStep[2] * y;
+ BYTE* pU = pDst[1] + dstStep[1] * val2y;
+ BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+ for (UINT32 x = 0; x < halfWidth; x++)
+ {
+ const UINT32 val2x1 = (x * 2 + oddX);
+ pU[val2x1] = Ua[x];
+ pV[val2x1] = Va[x];
+ }
+ }
+
+ /* Filter */
+ return general_ChromaFilter(pDst, dstStep, roi);
+}
+
+static pstatus_t general_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], UINT32 nTotalWidth,
+ UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3],
+ const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfWidth = (nWidth + 1) / 2;
+ const UINT32 halfHeight = (nHeight + 1) / 2;
+	const UINT32 quarterWidth = (nWidth + 3) / 4;
+
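+	/* Assumed ChromaV2 auxiliary layout: the aux luma plane carries the
+	 * odd-column U (left half) and V (right half) samples at full height;
+	 * the aux chroma planes carry the remaining quarter-resolution
+	 * samples, again split into U and V halves. */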
+ /* B4 and B5: odd UV values for width/2, height */
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ const UINT32 yTop = y + roi->top;
+ const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
+ const BYTE* pYaV = pYaU + nTotalWidth / 2;
+ BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
+ BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
+
+ for (UINT32 x = 0; x < halfWidth; x++)
+ {
+ const UINT32 odd = 2 * x + 1;
+ pU[odd] = *pYaU++;
+ pV[odd] = *pYaV++;
+ }
+ }
+
+ /* B6 - B9 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
+ const BYTE* pUaV = pUaU + nTotalWidth / 4;
+ const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
+ const BYTE* pVaV = pVaU + nTotalWidth / 4;
+ BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
+ BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
+
+		for (UINT32 x = 0; x < quarterWidth; x++)
+ {
+ pU[4 * x + 0] = *pUaU++;
+ pV[4 * x + 0] = *pUaV++;
+ pU[4 * x + 2] = *pVaU++;
+ pV[4 * x + 2] = *pVaV++;
+ }
+ }
+
+ return general_ChromaFilter(pDst, dstStep, roi);
+}
+
+static pstatus_t general_YUV420CombineToYUV444(avc444_frame_type type,
+ const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], UINT32 nWidth,
+ UINT32 nHeight, BYTE* WINPR_RESTRICT pDst[3],
+ const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
+ return -1;
+
+ if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
+ return -1;
+
+ if (!roi)
+ return -1;
+
+ switch (type)
+ {
+ case AVC444_LUMA:
+ return general_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+ case AVC444_CHROMAv1:
+ return general_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+ case AVC444_CHROMAv2:
+ return general_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
+
+ default:
+ return -1;
+ }
+}
+
+static pstatus_t
+general_YUV444SplitToYUV420(const BYTE* const WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3],
+ BYTE* WINPR_RESTRICT pMainDst[3], const UINT32 dstMainStep[3],
+ BYTE* WINPR_RESTRICT pAuxDst[3], const UINT32 dstAuxStep[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ UINT32 uY = 0;
+ UINT32 vY = 0;
+ UINT32 halfWidth = 0;
+ UINT32 halfHeight = 0;
+	/* The auxiliary frame is aligned to multiples of 16x16.
+	 * We need the padded height for B4 and B5 conversion. */
+	const UINT32 padHeight = roi->height + 16 - roi->height % 16;
+ halfWidth = (roi->width + 1) / 2;
+ halfHeight = (roi->height + 1) / 2;
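+
+	/* AVC444v1 split: the main YUV420 view keeps Y (B1) plus 2x2-averaged
+	 * chroma (B2/B3); the auxiliary view packs the odd chroma rows into
+	 * its luma plane (B4/B5) and the remaining odd-column samples into
+	 * its chroma planes (B6/B7). */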
+
+ /* B1 */
+ for (UINT32 y = 0; y < roi->height; y++)
+ {
+ const BYTE* pSrcY = pSrc[0] + y * srcStep[0];
+ BYTE* pY = pMainDst[0] + y * dstMainStep[0];
+ memcpy(pY, pSrcY, roi->width);
+ }
+
+ /* B2 and B3 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const BYTE* pSrcU = pSrc[1] + 2 * y * srcStep[1];
+ const BYTE* pSrcV = pSrc[2] + 2 * y * srcStep[2];
+ const BYTE* pSrcU1 = pSrc[1] + (2 * y + 1) * srcStep[1];
+ const BYTE* pSrcV1 = pSrc[2] + (2 * y + 1) * srcStep[2];
+ BYTE* pU = pMainDst[1] + y * dstMainStep[1];
+ BYTE* pV = pMainDst[2] + y * dstMainStep[2];
+
+ for (UINT32 x = 0; x < halfWidth; x++)
+ {
+ /* Filter */
+ const INT32 u = pSrcU[2 * x] + pSrcU[2 * x + 1] + pSrcU1[2 * x] + pSrcU1[2 * x + 1];
+ const INT32 v = pSrcV[2 * x] + pSrcV[2 * x + 1] + pSrcV1[2 * x] + pSrcV1[2 * x + 1];
+ pU[x] = CLIP(u / 4L);
+ pV[x] = CLIP(v / 4L);
+ }
+ }
+
+ /* B4 and B5 */
+	for (UINT32 y = 0; y < padHeight; y++)
+ {
+ BYTE* pY = pAuxDst[0] + y * dstAuxStep[0];
+
+ if (y % 16 < 8)
+ {
+ const UINT32 pos = (2 * uY++ + 1);
+ const BYTE* pSrcU = pSrc[1] + pos * srcStep[1];
+
+ if (pos >= roi->height)
+ continue;
+
+ memcpy(pY, pSrcU, roi->width);
+ }
+ else
+ {
+ const UINT32 pos = (2 * vY++ + 1);
+ const BYTE* pSrcV = pSrc[2] + pos * srcStep[2];
+
+ if (pos >= roi->height)
+ continue;
+
+ memcpy(pY, pSrcV, roi->width);
+ }
+ }
+
+ /* B6 and B7 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const BYTE* pSrcU = pSrc[1] + 2 * y * srcStep[1];
+ const BYTE* pSrcV = pSrc[2] + 2 * y * srcStep[2];
+ BYTE* pU = pAuxDst[1] + y * dstAuxStep[1];
+ BYTE* pV = pAuxDst[2] + y * dstAuxStep[2];
+
+ for (UINT32 x = 0; x < halfWidth; x++)
+ {
+ pU[x] = pSrcU[2 * x + 1];
+ pV[x] = pSrcV[2 * x + 1];
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t general_YUV444ToRGB_8u_P3AC4R_general(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3],
+ BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+ UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+ fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);
+
+ WINPR_ASSERT(pSrc);
+ WINPR_ASSERT(pDst);
+ WINPR_ASSERT(roi);
+
+ const UINT32 nWidth = roi->width;
+ const UINT32 nHeight = roi->height;
+
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ const BYTE* pY = pSrc[0] + y * srcStep[0];
+ const BYTE* pU = pSrc[1] + y * srcStep[1];
+ const BYTE* pV = pSrc[2] + y * srcStep[2];
+ BYTE* pRGB = pDst + y * dstStep;
+
+ for (UINT32 x = 0; x < nWidth; x++)
+ {
+ const BYTE Y = pY[x];
+ const BYTE U = pU[x];
+ const BYTE V = pV[x];
+ const BYTE r = YUV2R(Y, U, V);
+ const BYTE g = YUV2G(Y, U, V);
+ const BYTE b = YUV2B(Y, U, V);
+ pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0);
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t general_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3],
+ BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+ UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+
+ WINPR_ASSERT(pSrc);
+ WINPR_ASSERT(pDst);
+ WINPR_ASSERT(roi);
+
+ const UINT32 nWidth = roi->width;
+ const UINT32 nHeight = roi->height;
+
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ const BYTE* pY = pSrc[0] + y * srcStep[0];
+ const BYTE* pU = pSrc[1] + y * srcStep[1];
+ const BYTE* pV = pSrc[2] + y * srcStep[2];
+ BYTE* pRGB = pDst + y * dstStep;
+
+ for (UINT32 x = 0; x < nWidth; x++)
+ {
+ const BYTE Y = pY[x];
+ const BYTE U = pU[x];
+ const BYTE V = pV[x];
+ const BYTE r = YUV2R(Y, U, V);
+ const BYTE g = YUV2G(Y, U, V);
+ const BYTE b = YUV2B(Y, U, V);
+ pRGB = writePixelBGRX(pRGB, formatSize, DstFormat, r, g, b, 0);
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t general_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return general_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+
+ default:
+ return general_YUV444ToRGB_8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat,
+ roi);
+ }
+}
+/**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
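+/* Sanity check of the fixed-point matrix: for U = V = 128 the chroma terms
+ * vanish and R = G = B = (256 * Y) >> 8 = Y, so neutral gray is preserved.
+ */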
+static pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ UINT32 dstPad = 0;
+ UINT32 srcPad[3];
+ BYTE Y = 0;
+ BYTE U = 0;
+ BYTE V = 0;
+ UINT32 halfWidth = 0;
+ UINT32 halfHeight = 0;
+ const BYTE* pY = NULL;
+ const BYTE* pU = NULL;
+ const BYTE* pV = NULL;
+ BYTE* pRGB = pDst;
+ UINT32 nWidth = 0;
+ UINT32 nHeight = 0;
+ UINT32 lastRow = 0;
+ UINT32 lastCol = 0;
+ const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+ fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);
+ pY = pSrc[0];
+ pU = pSrc[1];
+ pV = pSrc[2];
+ lastCol = roi->width & 0x01;
+ lastRow = roi->height & 0x01;
+ nWidth = (roi->width + 1) & ~0x0001;
+ nHeight = (roi->height + 1) & ~0x0001;
+ halfWidth = nWidth / 2;
+ halfHeight = nHeight / 2;
+ srcPad[0] = (srcStep[0] - nWidth);
+ srcPad[1] = (srcStep[1] - halfWidth);
+ srcPad[2] = (srcStep[2] - halfWidth);
+ dstPad = (dstStep - (nWidth * 4));
+
+ for (UINT32 y = 0; y < halfHeight;)
+ {
+ if (++y == halfHeight)
+ lastRow <<= 1;
+
+ for (UINT32 x = 0; x < halfWidth;)
+ {
+ BYTE r = 0;
+ BYTE g = 0;
+ BYTE b = 0;
+
+ if (++x == halfWidth)
+ lastCol <<= 1;
+
+ U = *pU++;
+ V = *pV++;
+ /* 1st pixel */
+ Y = *pY++;
+ r = YUV2R(Y, U, V);
+ g = YUV2G(Y, U, V);
+ b = YUV2B(Y, U, V);
+ pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0);
+
+ /* 2nd pixel */
+ if (!(lastCol & 0x02))
+ {
+ Y = *pY++;
+ r = YUV2R(Y, U, V);
+ g = YUV2G(Y, U, V);
+ b = YUV2B(Y, U, V);
+ pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0);
+ }
+ else
+ {
+ pY++;
+ pRGB += formatSize;
+ lastCol >>= 1;
+ }
+ }
+
+ pY += srcPad[0];
+ pU -= halfWidth;
+ pV -= halfWidth;
+ pRGB += dstPad;
+
+ if (lastRow & 0x02)
+ break;
+
+ for (UINT32 x = 0; x < halfWidth;)
+ {
+ BYTE r = 0;
+ BYTE g = 0;
+ BYTE b = 0;
+
+ if (++x == halfWidth)
+ lastCol <<= 1;
+
+ U = *pU++;
+ V = *pV++;
+ /* 3rd pixel */
+ Y = *pY++;
+ r = YUV2R(Y, U, V);
+ g = YUV2G(Y, U, V);
+ b = YUV2B(Y, U, V);
+ pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0);
+
+ /* 4th pixel */
+ if (!(lastCol & 0x02))
+ {
+ Y = *pY++;
+ r = YUV2R(Y, U, V);
+ g = YUV2G(Y, U, V);
+ b = YUV2B(Y, U, V);
+ pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0);
+ }
+ else
+ {
+ pY++;
+ pRGB += formatSize;
+ lastCol >>= 1;
+ }
+ }
+
+ pY += srcPad[0];
+ pU += srcPad[1];
+ pV += srcPad[2];
+ pRGB += dstPad;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/**
+ * | Y | ( | 54 183 18 | | R | ) | 0 |
+ * | U | = ( | -29 -99 128 | | G | ) >> 8 + | 128 |
+ * | V | ( | 128 -116 -12 | | B | ) | 128 |
+ */
+static INLINE BYTE RGB2Y(BYTE R, BYTE G, BYTE B)
+{
+ return (54 * R + 183 * G + 18 * B) >> 8;
+}
+
+static INLINE BYTE RGB2U(BYTE R, BYTE G, BYTE B)
+{
+ return ((-29 * R - 99 * G + 128 * B) >> 8) + 128;
+}
+
+static INLINE BYTE RGB2V(INT32 R, INT32 G, INT32 B)
+{
+ return ((128 * R - 116 * G - 12 * B) >> 8) + 128;
+}
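+
+/* Note: the RGB2Y coefficients sum to 255, so pure white (255, 255, 255)
+ * maps to (255 * 255) >> 8 = 254; the truncating shift can cost one level.
+ */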
+
+static pstatus_t general_RGBToYUV444_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc, UINT32 SrcFormat,
+ const UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[3],
+ UINT32 dstStep[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ const UINT32 bpp = FreeRDPGetBytesPerPixel(SrcFormat);
+ UINT32 nWidth = 0;
+ UINT32 nHeight = 0;
+ nWidth = roi->width;
+ nHeight = roi->height;
+
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ const BYTE* pRGB = pSrc + y * srcStep;
+ BYTE* pY = pDst[0] + y * dstStep[0];
+ BYTE* pU = pDst[1] + y * dstStep[1];
+ BYTE* pV = pDst[2] + y * dstStep[2];
+
+ for (UINT32 x = 0; x < nWidth; x++)
+ {
+ BYTE B = 0;
+ BYTE G = 0;
+ BYTE R = 0;
+ const UINT32 color = FreeRDPReadColor(&pRGB[x * bpp], SrcFormat);
+ FreeRDPSplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
+ pY[x] = RGB2Y(R, G, B);
+ pU[x] = RGB2U(R, G, B);
+ pV[x] = RGB2V(R, G, B);
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static INLINE pstatus_t general_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst[3],
+ const UINT32 dstStep[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ size_t x1 = 0;
+ size_t x2 = 4;
+ size_t x3 = srcStep;
+ size_t x4 = srcStep + 4;
+ size_t y1 = 0;
+ size_t y2 = 1;
+ size_t y3 = dstStep[0];
+ size_t y4 = dstStep[0] + 1;
+ UINT32 max_x = roi->width - 1;
+ UINT32 max_y = roi->height - 1;
+
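+	/* Each 2x2 block yields four Y samples and a single U/V pair computed
+	 * from the block sum; note the sum is divided by 4 even at odd right/
+	 * bottom edges where fewer pixels contribute. */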
+	for (UINT32 y = 0, i = 0; y < roi->height; y += 2, i++)
+ {
+ const BYTE* src = pSrc + y * srcStep;
+ BYTE* ydst = pDst[0] + y * dstStep[0];
+ BYTE* udst = pDst[1] + i * dstStep[1];
+ BYTE* vdst = pDst[2] + i * dstStep[2];
+
+ for (UINT32 x = 0; x < roi->width; x += 2)
+ {
+ BYTE R = 0;
+ BYTE G = 0;
+ BYTE B = 0;
+ INT32 Ra = 0;
+ INT32 Ga = 0;
+ INT32 Ba = 0;
+ /* row 1, pixel 1 */
+ Ba = B = *(src + x1 + 0);
+ Ga = G = *(src + x1 + 1);
+ Ra = R = *(src + x1 + 2);
+ ydst[y1] = RGB2Y(R, G, B);
+
+ if (x < max_x)
+ {
+ /* row 1, pixel 2 */
+ Ba += B = *(src + x2 + 0);
+ Ga += G = *(src + x2 + 1);
+ Ra += R = *(src + x2 + 2);
+ ydst[y2] = RGB2Y(R, G, B);
+ }
+
+ if (y < max_y)
+ {
+ /* row 2, pixel 1 */
+ Ba += B = *(src + x3 + 0);
+ Ga += G = *(src + x3 + 1);
+ Ra += R = *(src + x3 + 2);
+ ydst[y3] = RGB2Y(R, G, B);
+
+ if (x < max_x)
+ {
+ /* row 2, pixel 2 */
+ Ba += B = *(src + x4 + 0);
+ Ga += G = *(src + x4 + 1);
+ Ra += R = *(src + x4 + 2);
+ ydst[y4] = RGB2Y(R, G, B);
+ }
+ }
+
+ Ba >>= 2;
+ Ga >>= 2;
+ Ra >>= 2;
+ *udst++ = RGB2U(Ra, Ga, Ba);
+ *vdst++ = RGB2V(Ra, Ga, Ba);
+ ydst += 2;
+ src += 8;
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static INLINE pstatus_t general_RGBToYUV420_RGBX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst[3],
+ const UINT32 dstStep[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ size_t x1 = 0;
+ size_t x2 = 4;
+ size_t x3 = srcStep;
+ size_t x4 = srcStep + 4;
+ size_t y1 = 0;
+ size_t y2 = 1;
+ size_t y3 = dstStep[0];
+ size_t y4 = dstStep[0] + 1;
+ UINT32 max_x = roi->width - 1;
+ UINT32 max_y = roi->height - 1;
+
+ for (UINT32 y = 0, i = 0; y < roi->height; y += 2, i++)
+ {
+ const BYTE* src = pSrc + y * srcStep;
+ BYTE* ydst = pDst[0] + y * dstStep[0];
+ BYTE* udst = pDst[1] + i * dstStep[1];
+ BYTE* vdst = pDst[2] + i * dstStep[2];
+
+ for (UINT32 x = 0; x < roi->width; x += 2)
+ {
+ BYTE R = 0;
+ BYTE G = 0;
+ BYTE B = 0;
+ INT32 Ra = 0;
+ INT32 Ga = 0;
+ INT32 Ba = 0;
+ /* row 1, pixel 1 */
+ Ra = R = *(src + x1 + 0);
+ Ga = G = *(src + x1 + 1);
+ Ba = B = *(src + x1 + 2);
+ ydst[y1] = RGB2Y(R, G, B);
+
+ if (x < max_x)
+ {
+ /* row 1, pixel 2 */
+ Ra += R = *(src + x2 + 0);
+ Ga += G = *(src + x2 + 1);
+ Ba += B = *(src + x2 + 2);
+ ydst[y2] = RGB2Y(R, G, B);
+ }
+
+ if (y < max_y)
+ {
+ /* row 2, pixel 1 */
+ Ra += R = *(src + x3 + 0);
+ Ga += G = *(src + x3 + 1);
+ Ba += B = *(src + x3 + 2);
+ ydst[y3] = RGB2Y(R, G, B);
+
+ if (x < max_x)
+ {
+ /* row 2, pixel 2 */
+ Ra += R = *(src + x4 + 0);
+ Ga += G = *(src + x4 + 1);
+ Ba += B = *(src + x4 + 2);
+ ydst[y4] = RGB2Y(R, G, B);
+ }
+ }
+
+ Ba >>= 2;
+ Ga >>= 2;
+ Ra >>= 2;
+ *udst++ = RGB2U(Ra, Ga, Ba);
+ *vdst++ = RGB2V(Ra, Ga, Ba);
+ ydst += 2;
+ src += 8;
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static INLINE pstatus_t general_RGBToYUV420_ANY(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[3],
+ const UINT32 dstStep[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ const UINT32 bpp = FreeRDPGetBytesPerPixel(srcFormat);
+ size_t x1 = 0;
+ size_t x2 = bpp;
+ size_t x3 = srcStep;
+ size_t x4 = srcStep + bpp;
+ size_t y1 = 0;
+ size_t y2 = 1;
+ size_t y3 = dstStep[0];
+ size_t y4 = dstStep[0] + 1;
+ UINT32 max_x = roi->width - 1;
+ UINT32 max_y = roi->height - 1;
+
+ for (UINT32 y = 0, i = 0; y < roi->height; y += 2, i++)
+ {
+ const BYTE* src = pSrc + y * srcStep;
+ BYTE* ydst = pDst[0] + y * dstStep[0];
+ BYTE* udst = pDst[1] + i * dstStep[1];
+ BYTE* vdst = pDst[2] + i * dstStep[2];
+
+ for (UINT32 x = 0; x < roi->width; x += 2)
+ {
+ BYTE R = 0;
+ BYTE G = 0;
+ BYTE B = 0;
+ INT32 Ra = 0;
+ INT32 Ga = 0;
+ INT32 Ba = 0;
+ UINT32 color = 0;
+ /* row 1, pixel 1 */
+ color = FreeRDPReadColor(src + x1, srcFormat);
+ FreeRDPSplitColor(color, srcFormat, &R, &G, &B, NULL, NULL);
+ Ra = R;
+ Ga = G;
+ Ba = B;
+ ydst[y1] = RGB2Y(R, G, B);
+
+ if (x < max_x)
+ {
+ /* row 1, pixel 2 */
+ color = FreeRDPReadColor(src + x2, srcFormat);
+ FreeRDPSplitColor(color, srcFormat, &R, &G, &B, NULL, NULL);
+ Ra += R;
+ Ga += G;
+ Ba += B;
+ ydst[y2] = RGB2Y(R, G, B);
+ }
+
+ if (y < max_y)
+ {
+ /* row 2, pixel 1 */
+ color = FreeRDPReadColor(src + x3, srcFormat);
+ FreeRDPSplitColor(color, srcFormat, &R, &G, &B, NULL, NULL);
+ Ra += R;
+ Ga += G;
+ Ba += B;
+ ydst[y3] = RGB2Y(R, G, B);
+
+ if (x < max_x)
+ {
+ /* row 2, pixel 2 */
+ color = FreeRDPReadColor(src + x4, srcFormat);
+ FreeRDPSplitColor(color, srcFormat, &R, &G, &B, NULL, NULL);
+ Ra += R;
+ Ga += G;
+ Ba += B;
+ ydst[y4] = RGB2Y(R, G, B);
+ }
+ }
+
+ Ra >>= 2;
+ Ga >>= 2;
+ Ba >>= 2;
+ *udst++ = RGB2U(Ra, Ga, Ba);
+ *vdst++ = RGB2V(Ra, Ga, Ba);
+ ydst += 2;
+ src += 2 * bpp;
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t general_RGBToYUV420_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[3],
+ const UINT32 dstStep[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (srcFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return general_RGBToYUV420_BGRX(pSrc, srcStep, pDst, dstStep, roi);
+
+ case PIXEL_FORMAT_RGBA32:
+ case PIXEL_FORMAT_RGBX32:
+ return general_RGBToYUV420_RGBX(pSrc, srcStep, pDst, dstStep, roi);
+
+ default:
+ return general_RGBToYUV420_ANY(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
+ }
+}
+
+static INLINE void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
+ const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
+ BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
+ BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
+ BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
+{
+ for (UINT32 x = 0; x < width; x += 2)
+ {
+ const BOOL lastX = (x + 1) >= width;
+ BYTE Y1e = 0;
+ BYTE Y2e = 0;
+ BYTE U1e = 0;
+ BYTE V1e = 0;
+ BYTE U2e = 0;
+ BYTE V2e = 0;
+ BYTE Y1o = 0;
+ BYTE Y2o = 0;
+ BYTE U1o = 0;
+ BYTE V1o = 0;
+ BYTE U2o = 0;
+ BYTE V2o = 0;
+ /* Read 4 pixels, 2 from even, 2 from odd lines */
+ {
+ const BYTE b = *srcEven++;
+ const BYTE g = *srcEven++;
+ const BYTE r = *srcEven++;
+ srcEven++;
+ Y1e = Y2e = Y1o = Y2o = RGB2Y(r, g, b);
+ U1e = U2e = U1o = U2o = RGB2U(r, g, b);
+ V1e = V2e = V1o = V2o = RGB2V(r, g, b);
+ }
+
+ if (!lastX)
+ {
+ const BYTE b = *srcEven++;
+ const BYTE g = *srcEven++;
+ const BYTE r = *srcEven++;
+ srcEven++;
+ Y2e = RGB2Y(r, g, b);
+ U2e = RGB2U(r, g, b);
+ V2e = RGB2V(r, g, b);
+ }
+
+ if (b1Odd)
+ {
+ const BYTE b = *srcOdd++;
+ const BYTE g = *srcOdd++;
+ const BYTE r = *srcOdd++;
+ srcOdd++;
+ Y1o = Y2o = RGB2Y(r, g, b);
+ U1o = U2o = RGB2U(r, g, b);
+ V1o = V2o = RGB2V(r, g, b);
+ }
+
+ if (b1Odd && !lastX)
+ {
+ const BYTE b = *srcOdd++;
+ const BYTE g = *srcOdd++;
+ const BYTE r = *srcOdd++;
+ srcOdd++;
+ Y2o = RGB2Y(r, g, b);
+ U2o = RGB2U(r, g, b);
+ V2o = RGB2V(r, g, b);
+ }
+
+ /* We have 4 Y pixels, so store them. */
+ *b1Even++ = Y1e;
+ *b1Even++ = Y2e;
+
+ if (b1Odd)
+ {
+ *b1Odd++ = Y1o;
+ *b1Odd++ = Y2o;
+ }
+
+		/* The (2x, 2y) sample of the main view's U/V planes gets the
+		 * average of the 2x2 block. */
+ {
+ const BYTE Uavg = ((UINT16)U1e + (UINT16)U2e + (UINT16)U1o + (UINT16)U2o) / 4;
+ const BYTE Vavg = ((UINT16)V1e + (UINT16)V2e + (UINT16)V1o + (UINT16)V2o) / 4;
+ *b2++ = Uavg;
+ *b3++ = Vavg;
+ }
+
+ /* UV from 2x, 2y+1 */
+ if (b1Odd)
+ {
+ *b4++ = U1o;
+ *b5++ = V1o;
+
+ if (!lastX)
+ {
+ *b4++ = U2o;
+ *b5++ = V2o;
+ }
+ }
+
+ /* UV from 2x+1, 2y */
+ if (!lastX)
+ {
+ *b6++ = U2e;
+ *b7++ = V2e;
+ }
+ }
+}
+
+static INLINE pstatus_t general_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst1[3],
+ const UINT32 dst1Step[3],
+ BYTE* WINPR_RESTRICT pDst2[3],
+ const UINT32 dst2Step[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ /**
+ * Note:
+	 * Read information in function general_RGBToAVC444YUV_ANY below!
+ */
+ const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
+
+ for (UINT32 y = 0; y < roi->height; y += 2)
+ {
+ const BOOL last = (y >= (roi->height - 1));
+ const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
+ const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
+ const UINT32 i = y >> 1;
+ const UINT32 n = (i & ~7) + i;
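+		/* The aux view's luma plane stores the odd chroma rows in 16-row
+		 * blocks: U rows fill rows 0-7 of each block, V rows fill rows
+		 * 8-15; hence n = (i & ~7) + i and b5 = b4 + 8 rows. */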
+ BYTE* b1Even = pDst1[0] + y * dst1Step[0];
+ BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
+ BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
+ BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
+ BYTE* b4 = pDst2[0] + dst2Step[0] * n;
+ BYTE* b5 = b4 + 8 * dst2Step[0];
+ BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
+ BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
+ general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6,
+ b7, roi->width);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static INLINE void general_RGBToAVC444YUV_RGBX_DOUBLE_ROW(
+ const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
+ BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
+ BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
+ BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
+{
+ for (UINT32 x = 0; x < width; x += 2)
+ {
+ const BOOL lastX = (x + 1) >= width;
+ BYTE Y1e = 0;
+ BYTE Y2e = 0;
+ BYTE U1e = 0;
+ BYTE V1e = 0;
+ BYTE U2e = 0;
+ BYTE V2e = 0;
+ BYTE Y1o = 0;
+ BYTE Y2o = 0;
+ BYTE U1o = 0;
+ BYTE V1o = 0;
+ BYTE U2o = 0;
+ BYTE V2o = 0;
+ /* Read 4 pixels, 2 from even, 2 from odd lines */
+ {
+ const BYTE r = *srcEven++;
+ const BYTE g = *srcEven++;
+ const BYTE b = *srcEven++;
+ srcEven++;
+ Y1e = Y2e = Y1o = Y2o = RGB2Y(r, g, b);
+ U1e = U2e = U1o = U2o = RGB2U(r, g, b);
+ V1e = V2e = V1o = V2o = RGB2V(r, g, b);
+ }
+
+ if (!lastX)
+ {
+ const BYTE r = *srcEven++;
+ const BYTE g = *srcEven++;
+ const BYTE b = *srcEven++;
+ srcEven++;
+ Y2e = RGB2Y(r, g, b);
+ U2e = RGB2U(r, g, b);
+ V2e = RGB2V(r, g, b);
+ }
+
+ if (b1Odd)
+ {
+ const BYTE r = *srcOdd++;
+ const BYTE g = *srcOdd++;
+ const BYTE b = *srcOdd++;
+ srcOdd++;
+ Y1o = Y2o = RGB2Y(r, g, b);
+ U1o = U2o = RGB2U(r, g, b);
+ V1o = V2o = RGB2V(r, g, b);
+ }
+
+ if (b1Odd && !lastX)
+ {
+ const BYTE r = *srcOdd++;
+ const BYTE g = *srcOdd++;
+ const BYTE b = *srcOdd++;
+ srcOdd++;
+ Y2o = RGB2Y(r, g, b);
+ U2o = RGB2U(r, g, b);
+ V2o = RGB2V(r, g, b);
+ }
+
+ /* We have 4 Y pixels, so store them. */
+ *b1Even++ = Y1e;
+ *b1Even++ = Y2e;
+
+ if (b1Odd)
+ {
+ *b1Odd++ = Y1o;
+ *b1Odd++ = Y2o;
+ }
+
+		/* The (2x, 2y) sample of the main view's U/V planes gets the
+		 * average of the 2x2 block. */
+ {
+ const BYTE Uavg = ((UINT16)U1e + (UINT16)U2e + (UINT16)U1o + (UINT16)U2o) / 4;
+ const BYTE Vavg = ((UINT16)V1e + (UINT16)V2e + (UINT16)V1o + (UINT16)V2o) / 4;
+ *b2++ = Uavg;
+ *b3++ = Vavg;
+ }
+
+ /* UV from 2x, 2y+1 */
+ if (b1Odd)
+ {
+ *b4++ = U1o;
+ *b5++ = V1o;
+
+ if (!lastX)
+ {
+ *b4++ = U2o;
+ *b5++ = V2o;
+ }
+ }
+
+ /* UV from 2x+1, 2y */
+ if (!lastX)
+ {
+ *b6++ = U2e;
+ *b7++ = V2e;
+ }
+ }
+}
+
+static INLINE pstatus_t general_RGBToAVC444YUV_RGBX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst1[3],
+ const UINT32 dst1Step[3],
+ BYTE* WINPR_RESTRICT pDst2[3],
+ const UINT32 dst2Step[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ /**
+ * Note:
+ * See the description in general_RGBToAVC444YUV_ANY below!
+ */
+ const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
+
+ for (UINT32 y = 0; y < roi->height; y += 2)
+ {
+ const BOOL last = (y >= (roi->height - 1));
+ const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
+ const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
+ const UINT32 i = y >> 1;
+ const UINT32 n = (i & ~7) + i;
+ BYTE* b1Even = pDst1[0] + y * dst1Step[0];
+ BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
+ BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
+ BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
+ BYTE* b4 = pDst2[0] + dst2Step[0] * n;
+ BYTE* b5 = b4 + 8 * dst2Step[0];
+ BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
+ BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
+ general_RGBToAVC444YUV_RGBX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6,
+ b7, roi->width);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static INLINE void general_RGBToAVC444YUV_ANY_DOUBLE_ROW(
+ const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, UINT32 srcFormat,
+ BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
+ BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
+ BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
+{
+ const UINT32 bpp = FreeRDPGetBytesPerPixel(srcFormat);
+ for (UINT32 x = 0; x < width; x += 2)
+ {
+ const BOOL lastX = (x + 1) >= width;
+ BYTE Y1e = 0;
+ BYTE Y2e = 0;
+ BYTE U1e = 0;
+ BYTE V1e = 0;
+ BYTE U2e = 0;
+ BYTE V2e = 0;
+ BYTE Y1o = 0;
+ BYTE Y2o = 0;
+ BYTE U1o = 0;
+ BYTE V1o = 0;
+ BYTE U2o = 0;
+ BYTE V2o = 0;
+ /* Read 4 pixels, 2 from even, 2 from odd lines */
+ {
+ BYTE r = 0;
+ BYTE g = 0;
+ BYTE b = 0;
+ const UINT32 color = FreeRDPReadColor(srcEven, srcFormat);
+ srcEven += bpp;
+ FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL);
+ Y1e = Y2e = Y1o = Y2o = RGB2Y(r, g, b);
+ U1e = U2e = U1o = U2o = RGB2U(r, g, b);
+ V1e = V2e = V1o = V2o = RGB2V(r, g, b);
+ }
+
+ if (!lastX)
+ {
+ BYTE r = 0;
+ BYTE g = 0;
+ BYTE b = 0;
+ const UINT32 color = FreeRDPReadColor(srcEven, srcFormat);
+ srcEven += bpp;
+ FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL);
+ Y2e = RGB2Y(r, g, b);
+ U2e = RGB2U(r, g, b);
+ V2e = RGB2V(r, g, b);
+ }
+
+ if (b1Odd)
+ {
+ BYTE r = 0;
+ BYTE g = 0;
+ BYTE b = 0;
+ const UINT32 color = FreeRDPReadColor(srcOdd, srcFormat);
+ srcOdd += bpp;
+ FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL);
+ Y1o = Y2o = RGB2Y(r, g, b);
+ U1o = U2o = RGB2U(r, g, b);
+ V1o = V2o = RGB2V(r, g, b);
+ }
+
+ if (b1Odd && !lastX)
+ {
+ BYTE r = 0;
+ BYTE g = 0;
+ BYTE b = 0;
+ const UINT32 color = FreeRDPReadColor(srcOdd, srcFormat);
+ srcOdd += bpp;
+ FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL);
+ Y2o = RGB2Y(r, g, b);
+ U2o = RGB2U(r, g, b);
+ V2o = RGB2V(r, g, b);
+ }
+
+ /* We have 4 Y pixels, so store them. */
+ *b1Even++ = Y1e;
+ *b1Even++ = Y2e;
+
+ if (b1Odd)
+ {
+ *b1Odd++ = Y1o;
+ *b1Odd++ = Y2o;
+ }
+
+ /* The 2x,2y samples of the luma frame's U/V planes store the average of the four chroma samples */
+ {
+ const BYTE Uavg = ((UINT16)U1e + (UINT16)U2e + (UINT16)U1o + (UINT16)U2o) / 4;
+ const BYTE Vavg = ((UINT16)V1e + (UINT16)V2e + (UINT16)V1o + (UINT16)V2o) / 4;
+ *b2++ = Uavg;
+ *b3++ = Vavg;
+ }
+
+ /* UV from 2x, 2y+1 */
+ if (b1Odd)
+ {
+ *b4++ = U1o;
+ *b5++ = V1o;
+
+ if (!lastX)
+ {
+ *b4++ = U2o;
+ *b5++ = V2o;
+ }
+ }
+
+ /* UV from 2x+1, 2y */
+ if (!lastX)
+ {
+ *b6++ = U2e;
+ *b7++ = V2e;
+ }
+ }
+}
+
+static INLINE pstatus_t general_RGBToAVC444YUV_ANY(
+ const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, UINT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst1[3], const UINT32 dst1Step[3], BYTE* WINPR_RESTRICT pDst2[3],
+ const UINT32 dst2Step[3], const prim_size_t* WINPR_RESTRICT roi)
+{
+ /**
+ * Note: According to [MS-RDPEGFX 2.2.4.4 RFX_AVC420_BITMAP_STREAM] the
+ * width and height of the MPEG-4 AVC/H.264 codec bitstream MUST be aligned
+ * to a multiple of 16.
+ * Hence the passed destination YUV420/CHROMA420 buffers must have been
+ * allocated accordingly !!
+ */
+ /**
+ * [MS-RDPEGFX 3.3.8.3.2 YUV420p Stream Combination] defines the following "Bx areas":
+ *
+ * YUV420 frame (main view):
+ * B1: From Y444 all pixels
+ * B2: From U444 all pixels in even rows with even columns
+ * B3: From V444 all pixels in even rows with even columns
+ *
+ * Chroma420 frame (auxiliary view):
+ * B45: From U444 and V444 all pixels from all odd rows
+ * (The odd U444 and V444 rows must be interleaved in 8-line blocks in B45 !!!)
+ * B6: From U444 all pixels in even rows with odd columns
+ * B7: From V444 all pixels in even rows with odd columns
+ *
+ * Microsoft's horribly unclear description in MS-RDPEGFX, translated to pseudo code,
+ * looks like this:
+ *
+ * for (y = 0; y < fullHeight; y++)
+ * {
+ * for (x = 0; x < fullWidth; x++)
+ * {
+ * B1[x,y] = Y444[x,y];
+ * }
+ * }
+ *
+ * for (y = 0; y < halfHeight; y++)
+ * {
+ * for (x = 0; x < halfWidth; x++)
+ * {
+ * B2[x,y] = U444[2 * x, 2 * y];
+ * B3[x,y] = V444[2 * x, 2 * y];
+ * B6[x,y] = U444[2 * x + 1, 2 * y];
+ * B7[x,y] = V444[2 * x + 1, 2 * y];
+ * }
+ * }
+ *
+ * for (y = 0; y < halfHeight; y++)
+ * {
+ * yU = (y / 8) * 16; // identify first row of correct 8-line U block in B45
+ * yU += (y % 8); // add offset rows in destination block
+ * yV = yU + 8; // the corresponding v line is always 8 rows ahead
+ *
+ * for (x = 0; x < fullWidth; x++)
+ * {
+ * B45[x,yU] = U444[x, 2 * y + 1];
+ * B45[x,yV] = V444[x, 2 * y + 1];
+ * }
+ * }
+ *
+ */
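+ /* In the loop below, the yU/yV computation from the pseudo code is collapsed into
+ * n = (i & ~7) + i with i = y / 2: writing i = 8 * q + r (with r < 8) gives
+ * (i & ~7) + i = 8q + (8q + r) = 16q + r, which is exactly yU = (i / 8) * 16 + (i % 8).
+ * yV = yU + 8 then turns into b5 = b4 + 8 * dst2Step[0]. */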
+ const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
+
+ for (UINT32 y = 0; y < roi->height; y += 2)
+ {
+ const BOOL last = (y >= (roi->height - 1));
+ const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
+ const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
+ const UINT32 i = y >> 1;
+ const UINT32 n = (i & ~7) + i;
+ BYTE* b1Even = pDst1[0] + y * dst1Step[0];
+ BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
+ BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
+ BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
+ BYTE* b4 = pDst2[0] + dst2Step[0] * n;
+ BYTE* b5 = b4 + 8 * dst2Step[0];
+ BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
+ BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
+ general_RGBToAVC444YUV_ANY_DOUBLE_ROW(srcEven, srcOdd, srcFormat, b1Even, b1Odd, b2, b3, b4,
+ b5, b6, b7, roi->width);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static INLINE pstatus_t general_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[3],
+ const UINT32 dst1Step[3],
+ BYTE* WINPR_RESTRICT pDst2[3],
+ const UINT32 dst2Step[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ if (!pSrc || !pDst1 || !dst1Step || !pDst2 || !dst2Step)
+ return -1;
+
+ if (!pDst1[0] || !pDst1[1] || !pDst1[2])
+ return -1;
+
+ if (!dst1Step[0] || !dst1Step[1] || !dst1Step[2])
+ return -1;
+
+ if (!pDst2[0] || !pDst2[1] || !pDst2[2])
+ return -1;
+
+ if (!dst2Step[0] || !dst2Step[1] || !dst2Step[2])
+ return -1;
+
+ switch (srcFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return general_RGBToAVC444YUV_BGRX(pSrc, srcStep, pDst1, dst1Step, pDst2, dst2Step,
+ roi);
+
+ case PIXEL_FORMAT_RGBA32:
+ case PIXEL_FORMAT_RGBX32:
+ return general_RGBToAVC444YUV_RGBX(pSrc, srcStep, pDst1, dst1Step, pDst2, dst2Step,
+ roi);
+
+ default:
+ return general_RGBToAVC444YUV_ANY(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
+ dst2Step, roi);
+ }
+
+ return !PRIMITIVES_SUCCESS;
+}
+
+static INLINE void general_RGBToAVC444YUVv2_ANY_DOUBLE_ROW(
+ const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd, UINT32 srcFormat,
+ BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
+ BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
+ BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
+ BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
+ BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
+ BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
+{
+ const UINT32 bpp = FreeRDPGetBytesPerPixel(srcFormat);
+
+ for (UINT32 x = 0; x < width; x += 2)
+ {
+ BYTE Ya = 0;
+ BYTE Ua = 0;
+ BYTE Va = 0;
+ BYTE Yb = 0;
+ BYTE Ub = 0;
+ BYTE Vb = 0;
+ BYTE Yc = 0;
+ BYTE Uc = 0;
+ BYTE Vc = 0;
+ BYTE Yd = 0;
+ BYTE Ud = 0;
+ BYTE Vd = 0;
+ {
+ BYTE b = 0;
+ BYTE g = 0;
+ BYTE r = 0;
+ const UINT32 color = FreeRDPReadColor(srcEven, srcFormat);
+ srcEven += bpp;
+ FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL);
+ Ya = RGB2Y(r, g, b);
+ Ua = RGB2U(r, g, b);
+ Va = RGB2V(r, g, b);
+ }
+
+ if (x < width - 1)
+ {
+ BYTE b = 0;
+ BYTE g = 0;
+ BYTE r = 0;
+ const UINT32 color = FreeRDPReadColor(srcEven, srcFormat);
+ srcEven += bpp;
+ FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL);
+ Yb = RGB2Y(r, g, b);
+ Ub = RGB2U(r, g, b);
+ Vb = RGB2V(r, g, b);
+ }
+ else
+ {
+ Yb = Ya;
+ Ub = Ua;
+ Vb = Va;
+ }
+
+ if (srcOdd)
+ {
+ BYTE b = 0;
+ BYTE g = 0;
+ BYTE r = 0;
+ const UINT32 color = FreeRDPReadColor(srcOdd, srcFormat);
+ srcOdd += bpp;
+ FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL);
+ Yc = RGB2Y(r, g, b);
+ Uc = RGB2U(r, g, b);
+ Vc = RGB2V(r, g, b);
+ }
+ else
+ {
+ Yc = Ya;
+ Uc = Ua;
+ Vc = Va;
+ }
+
+ if (srcOdd && (x < width - 1))
+ {
+ BYTE b = 0;
+ BYTE g = 0;
+ BYTE r = 0;
+ const UINT32 color = FreeRDPReadColor(srcOdd, srcFormat);
+ srcOdd += bpp;
+ FreeRDPSplitColor(color, srcFormat, &r, &g, &b, NULL, NULL);
+ Yd = RGB2Y(r, g, b);
+ Ud = RGB2U(r, g, b);
+ Vd = RGB2V(r, g, b);
+ }
+ else
+ {
+ Yd = Ya;
+ Ud = Ua;
+ Vd = Va;
+ }
+
+ /* Y [b1] */
+ *yLumaDstEven++ = Ya;
+
+ if (x < width - 1)
+ *yLumaDstEven++ = Yb;
+
+ if (srcOdd)
+ *yLumaDstOdd++ = Yc;
+
+ if (srcOdd && (x < width - 1))
+ *yLumaDstOdd++ = Yd;
+
+ /* 2x 2y [b2,b3] */
+ *uLumaDst++ = (Ua + Ub + Uc + Ud) / 4;
+ *vLumaDst++ = (Va + Vb + Vc + Vd) / 4;
+
+ /* 2x+1, y [b4,b5] even */
+ if (x < width - 1)
+ {
+ *yEvenChromaDst1++ = Ub;
+ *yEvenChromaDst2++ = Vb;
+ }
+
+ if (srcOdd)
+ {
+ /* 2x+1, y [b4,b5] odd */
+ if (x < width - 1)
+ {
+ *yOddChromaDst1++ = Ud;
+ *yOddChromaDst2++ = Vd;
+ }
+
+ /* 4x 2y+1 [b6, b7] */
+ if (x % 4 == 0)
+ {
+ *uChromaDst1++ = Uc;
+ *uChromaDst2++ = Vc;
+ }
+ /* 4x+2 2y+1 [b8, b9] */
+ else
+ {
+ *vChromaDst1++ = Uc;
+ *vChromaDst2++ = Vc;
+ }
+ }
+ }
+}
+
+static INLINE pstatus_t general_RGBToAVC444YUVv2_ANY(
+ const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat, UINT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst1[3], const UINT32 dst1Step[3], BYTE* WINPR_RESTRICT pDst2[3],
+ const UINT32 dst2Step[3], const prim_size_t* WINPR_RESTRICT roi)
+{
+ /**
+ * Note: According to [MS-RDPEGFX 2.2.4.4 RFX_AVC420_BITMAP_STREAM] the
+ * width and height of the MPEG-4 AVC/H.264 codec bitstream MUST be aligned
+ * to a multiple of 16.
+ * Hence the passed destination YUV420/CHROMA420 buffers must have been
+ * allocated accordingly !!
+ */
+ /**
+ * [MS-RDPEGFX 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode] defines the following "Bx
+ * areas":
+ *
+ * YUV420 frame (main view):
+ * B1: From Y444 all pixels
+ * B2: From U444 all pixels in even rows with even columns
+ * B3: From V444 all pixels in even rows with even columns
+ *
+ * Chroma420 frame (auxiliary view):
+ * B45: From U444 and V444 all pixels from all odd columns
+ * B67: From U444 and V444 every 4th pixel in odd rows
+ * B89: From U444 and V444 every 4th pixel (initial offset of 2) in odd rows
+ *
+ * Chroma Bxy areas correspond to the left and right half of the YUV420 plane.
+ * for (y = 0; y < fullHeight; y++)
+ * {
+ * for (x = 0; x < fullWidth; x++)
+ * {
+ * B1[x,y] = Y444[x,y];
+ * }
+ *
+ * for (x = 0; x < halfWidth; x++)
+ * {
+ * B4[x,y] = U444[2 * x + 1, y];
+ * B5[x,y] = V444[2 * x + 1, y];
+ * }
+ * }
+ *
+ * for (y = 0; y < halfHeight; y++)
+ * {
+ * for (x = 0; x < halfWidth; x++)
+ * {
+ * B2[x,y] = U444[2 * x, 2 * y];
+ * B3[x,y] = V444[2 * x, 2 * y];
+ * B6[x,y] = U444[4 * x, 2 * y + 1];
+ * B7[x,y] = V444[4 * x, 2 * y + 1];
+ * B8[x,y] = U444[4 * x + 2, 2 * y + 1];
+ * B9[x,y] = V444[4 * x + 2, 2 * y + 1];
+ * }
+ * }
+ *
+ */
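+ /* Layout note for the code below: each row of the auxiliary luma plane stores the
+ * odd-column U samples in its left half and the odd-column V samples in its right
+ * half (hence dstEvenChromaY2 = dstEvenChromaY1 + width / 2). The auxiliary U plane
+ * holds B6 in its left quarter and B7 beside it; the auxiliary V plane holds B8 and
+ * B9 the same way. */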
+ if (roi->height < 1 || roi->width < 1)
+ return !PRIMITIVES_SUCCESS;
+
+ for (UINT32 y = 0; y < roi->height; y += 2)
+ {
+ const BYTE* srcEven = (pSrc + y * srcStep);
+ const BYTE* srcOdd = (y < roi->height - 1) ? (srcEven + srcStep) : NULL;
+ BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
+ BYTE* dstLumaYOdd = (dstLumaYEven + dst1Step[0]);
+ BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
+ BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
+ BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
+ BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
+ BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
+ BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
+ BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
+ BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
+ BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
+ BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
+ general_RGBToAVC444YUVv2_ANY_DOUBLE_ROW(
+ srcEven, srcOdd, srcFormat, dstLumaYEven, dstLumaYOdd, dstLumaU, dstLumaV,
+ dstEvenChromaY1, dstEvenChromaY2, dstOddChromaY1, dstOddChromaY2, dstChromaU1,
+ dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static INLINE void general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
+ const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
+ BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
+ BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
+ BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
+ BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
+ BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
+ BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
+{
+ for (UINT32 x = 0; x < width; x += 2)
+ {
+ BYTE Ya = 0;
+ BYTE Ua = 0;
+ BYTE Va = 0;
+ BYTE Yb = 0;
+ BYTE Ub = 0;
+ BYTE Vb = 0;
+ BYTE Yc = 0;
+ BYTE Uc = 0;
+ BYTE Vc = 0;
+ BYTE Yd = 0;
+ BYTE Ud = 0;
+ BYTE Vd = 0;
+ {
+ const BYTE b = *srcEven++;
+ const BYTE g = *srcEven++;
+ const BYTE r = *srcEven++;
+ srcEven++;
+ Ya = RGB2Y(r, g, b);
+ Ua = RGB2U(r, g, b);
+ Va = RGB2V(r, g, b);
+ }
+
+ if (x < width - 1)
+ {
+ const BYTE b = *srcEven++;
+ const BYTE g = *srcEven++;
+ const BYTE r = *srcEven++;
+ srcEven++;
+ Yb = RGB2Y(r, g, b);
+ Ub = RGB2U(r, g, b);
+ Vb = RGB2V(r, g, b);
+ }
+ else
+ {
+ Yb = Ya;
+ Ub = Ua;
+ Vb = Va;
+ }
+
+ if (srcOdd)
+ {
+ const BYTE b = *srcOdd++;
+ const BYTE g = *srcOdd++;
+ const BYTE r = *srcOdd++;
+ srcOdd++;
+ Yc = RGB2Y(r, g, b);
+ Uc = RGB2U(r, g, b);
+ Vc = RGB2V(r, g, b);
+ }
+ else
+ {
+ Yc = Ya;
+ Uc = Ua;
+ Vc = Va;
+ }
+
+ if (srcOdd && (x < width - 1))
+ {
+ const BYTE b = *srcOdd++;
+ const BYTE g = *srcOdd++;
+ const BYTE r = *srcOdd++;
+ srcOdd++;
+ Yd = RGB2Y(r, g, b);
+ Ud = RGB2U(r, g, b);
+ Vd = RGB2V(r, g, b);
+ }
+ else
+ {
+ Yd = Ya;
+ Ud = Ua;
+ Vd = Va;
+ }
+
+ /* Y [b1] */
+ *yLumaDstEven++ = Ya;
+
+ if (x < width - 1)
+ *yLumaDstEven++ = Yb;
+
+ if (srcOdd)
+ *yLumaDstOdd++ = Yc;
+
+ if (srcOdd && (x < width - 1))
+ *yLumaDstOdd++ = Yd;
+
+ /* 2x 2y [b2,b3] */
+ *uLumaDst++ = (Ua + Ub + Uc + Ud) / 4;
+ *vLumaDst++ = (Va + Vb + Vc + Vd) / 4;
+
+ /* 2x+1, y [b4,b5] even */
+ if (x < width - 1)
+ {
+ *yEvenChromaDst1++ = Ub;
+ *yEvenChromaDst2++ = Vb;
+ }
+
+ if (srcOdd)
+ {
+ /* 2x+1, y [b4,b5] odd */
+ if (x < width - 1)
+ {
+ *yOddChromaDst1++ = Ud;
+ *yOddChromaDst2++ = Vd;
+ }
+
+ /* 4x 2y+1 [b6, b7] */
+ if (x % 4 == 0)
+ {
+ *uChromaDst1++ = Uc;
+ *uChromaDst2++ = Vc;
+ }
+ /* 4x+2 2y+1 [b8, b9] */
+ else
+ {
+ *vChromaDst1++ = Uc;
+ *vChromaDst2++ = Vc;
+ }
+ }
+ }
+}
+
+static INLINE pstatus_t general_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[3],
+ const UINT32 dst1Step[3],
+ BYTE* WINPR_RESTRICT pDst2[3],
+ const UINT32 dst2Step[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ if (roi->height < 1 || roi->width < 1)
+ return !PRIMITIVES_SUCCESS;
+
+ for (UINT32 y = 0; y < roi->height; y += 2)
+ {
+ const BYTE* srcEven = (pSrc + y * srcStep);
+ const BYTE* srcOdd = (y < roi->height - 1) ? (srcEven + srcStep) : NULL;
+ BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
+ BYTE* dstLumaYOdd = (dstLumaYEven + dst1Step[0]);
+ BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
+ BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
+ BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
+ BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
+ BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
+ BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
+ BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
+ BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
+ BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
+ BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
+ general_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
+ srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU, dstLumaV, dstEvenChromaY1,
+ dstEvenChromaY2, dstOddChromaY1, dstOddChromaY2, dstChromaU1, dstChromaU2, dstChromaV1,
+ dstChromaV2, roi->width);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static INLINE pstatus_t general_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[3],
+ const UINT32 dst1Step[3],
+ BYTE* WINPR_RESTRICT pDst2[3],
+ const UINT32 dst2Step[3],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (srcFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return general_RGBToAVC444YUVv2_BGRX(pSrc, srcStep, pDst1, dst1Step, pDst2, dst2Step,
+ roi);
+
+ default:
+ return general_RGBToAVC444YUVv2_ANY(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
+ dst2Step, roi);
+ }
+
+ return !PRIMITIVES_SUCCESS;
+}
+
+void primitives_init_YUV(primitives_t* WINPR_RESTRICT prims)
+{
+ prims->YUV420ToRGB_8u_P3AC4R = general_YUV420ToRGB_8u_P3AC4R;
+ prims->YUV444ToRGB_8u_P3AC4R = general_YUV444ToRGB_8u_P3AC4R;
+ prims->RGBToYUV420_8u_P3AC4R = general_RGBToYUV420_8u_P3AC4R;
+ prims->RGBToYUV444_8u_P3AC4R = general_RGBToYUV444_8u_P3AC4R;
+ prims->YUV420CombineToYUV444 = general_YUV420CombineToYUV444;
+ prims->YUV444SplitToYUV420 = general_YUV444SplitToYUV420;
+ prims->RGBToAVC444YUV = general_RGBToAVC444YUV;
+ prims->RGBToAVC444YUVv2 = general_RGBToAVC444YUVv2;
+}
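+
+/* Illustrative usage sketch (not part of the library; names as declared in
+ * freerdp/primitives.h):
+ *
+ * const primitives_t* prims = primitives_get();
+ * prim_size_t roi = { width, height };
+ * pstatus_t rc = prims->RGBToAVC444YUV(rgb, PIXEL_FORMAT_BGRX32, rgbStride,
+ * lumaPlanes, lumaSteps, chromaPlanes, chromaSteps, &roi);
+ *
+ * where lumaPlanes/chromaPlanes are BYTE*[3] arrays whose buffers were allocated
+ * for 16-aligned dimensions as required above. */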
diff --git a/libfreerdp/primitives/prim_YUV_neon.c b/libfreerdp/primitives/prim_YUV_neon.c
new file mode 100644
index 0000000..5e2039e
--- /dev/null
+++ b/libfreerdp/primitives/prim_YUV_neon.c
@@ -0,0 +1,751 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Optimized YUV/RGB conversion operations
+ *
+ * Copyright 2014 Thomas Erbesdobler
+ * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
+ * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
+ * Copyright 2016-2017 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include <winpr/crt.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+#if !defined(WITH_NEON)
+#error "This file must only be included if WITH_NEON is active!"
+#endif
+
+#include <arm_neon.h>
+
+static primitives_t* generic = NULL;
+
+static INLINE uint8x8_t neon_YUV2R(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
+ int16x4_t Eh, int16x4_t El)
+{
+ /* R = (256 * Y + 403 * (V - 128)) >> 8 */
+ const int16x4_t c403 = vdup_n_s16(403);
+ const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
+ const int32x4_t CEl = vmlal_s16(Cl, El, c403);
+ const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
+ const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
+ const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
+ return vqmovun_s16(R);
+}
+
+static INLINE uint8x8_t neon_YUV2G(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
+ int16x4_t Eh, int16x4_t El)
+{
+ /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
+ const int16x4_t c48 = vdup_n_s16(48);
+ const int16x4_t c120 = vdup_n_s16(120);
+ const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
+ const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
+ const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
+ const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
+ const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
+ const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
+ const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
+ return vqmovun_s16(G);
+}
+
+static INLINE uint8x8_t neon_YUV2B(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,
+ int16x4_t Eh, int16x4_t El)
+{
+ /* B = (256L * Y + 475 * (U - 128)) >> 8*/
+ const int16x4_t c475 = vdup_n_s16(475);
+ const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
+ const int32x4_t CDl = vmlal_s16(Cl, Dl, c475);
+ const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
+ const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
+ const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
+ return vqmovun_s16(B);
+}
+
+static INLINE BYTE* neon_YuvToRgbPixel(BYTE* pRGB, int16x8_t Y, int16x8_t D, int16x8_t E,
+ const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,
+ const uint8_t aPos)
+{
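+ /* The constants used below correspond to the BT.709 conversion factors in 8.8
+ * fixed point: 403 ~ 1.5748 * 256, 475 ~ 1.8556 * 256, 48 ~ 0.1873 * 256 and
+ * 120 ~ 0.4681 * 256; vrshrq_n_s32(x, 8) performs the rounding shift back. */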
+ uint8x8x4_t bgrx;
+ const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y)), 256); /* Y * 256 */
+ const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y)), 256); /* Y * 256 */
+ const int16x4_t Dh = vget_high_s16(D);
+ const int16x4_t Dl = vget_low_s16(D);
+ const int16x4_t Eh = vget_high_s16(E);
+ const int16x4_t El = vget_low_s16(E);
+ {
+ /* B = (256L * Y + 475 * (U - 128)) >> 8*/
+ const int16x4_t c475 = vdup_n_s16(475);
+ const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
+ const int32x4_t CDl = vmlal_s16(Cl, Dl, c475);
+ const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
+ const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
+ const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
+ bgrx.val[bPos] = vqmovun_s16(B);
+ }
+ {
+ /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
+ const int16x4_t c48 = vdup_n_s16(48);
+ const int16x4_t c120 = vdup_n_s16(120);
+ const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
+ const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
+ const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
+ const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
+ const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
+ const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
+ const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
+ bgrx.val[gPos] = vqmovun_s16(G);
+ }
+ {
+ /* R = (256 * Y + 403 * (V - 128)) >> 8 */
+ const int16x4_t c403 = vdup_n_s16(403);
+ const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
+ const int32x4_t CEl = vmlal_s16(Cl, El, c403);
+ const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
+ const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
+ const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
+ bgrx.val[rPos] = vqmovun_s16(R);
+ }
+ {
+ /* A */
+ bgrx.val[aPos] = vdup_n_u8(0xFF);
+ }
+ vst4_u8(pRGB, bgrx);
+ pRGB += 32;
+ return pRGB;
+}
+
+static INLINE pstatus_t neon_YUV420ToX(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, const prim_size_t* WINPR_RESTRICT roi,
+ const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,
+ const uint8_t aPos)
+{
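+ /* Strategy: each outer iteration converts two source rows; the inner loop loads
+ * 8 U and 8 V samples, doubles them horizontally via vzip and reuses the resulting
+ * D/E terms for the even and the odd luma row, i.e. nearest-neighbour 4:2:0
+ * upsampling fused with the colour conversion (16 pixels per row and iteration). */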
+ const UINT32 nWidth = roi->width;
+ const UINT32 nHeight = roi->height;
+ const DWORD pad = nWidth % 16;
+ const UINT32 yPad = srcStep[0] - roi->width;
+ const UINT32 uPad = srcStep[1] - roi->width / 2;
+ const UINT32 vPad = srcStep[2] - roi->width / 2;
+ const UINT32 dPad = dstStep - roi->width * 4;
+ const int16x8_t c128 = vdupq_n_s16(128);
+
+ for (UINT32 y = 0; y < nHeight; y += 2)
+ {
+ const uint8_t* pY1 = pSrc[0] + y * srcStep[0];
+ const uint8_t* pY2 = pY1 + srcStep[0];
+ const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1];
+ const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2];
+ uint8_t* pRGB1 = pDst + y * dstStep;
+ uint8_t* pRGB2 = pRGB1 + dstStep;
+ const BOOL lastY = y >= nHeight - 1;
+
+ UINT32 x = 0;
+ for (; x < nWidth - pad;)
+ {
+ const uint8x8_t Uraw = vld1_u8(pU);
+ const uint8x8x2_t Uu = vzip_u8(Uraw, Uraw);
+ const int16x8_t U1 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[0]));
+ const int16x8_t U2 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[1]));
+ const uint8x8_t Vraw = vld1_u8(pV);
+ const uint8x8x2_t Vu = vzip_u8(Vraw, Vraw);
+ const int16x8_t V1 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[0]));
+ const int16x8_t V2 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[1]));
+ const int16x8_t D1 = vsubq_s16(U1, c128);
+ const int16x8_t E1 = vsubq_s16(V1, c128);
+ const int16x8_t D2 = vsubq_s16(U2, c128);
+ const int16x8_t E2 = vsubq_s16(V2, c128);
+ {
+ const uint8x8_t Y1u = vld1_u8(pY1);
+ const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
+ pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D1, E1, rPos, gPos, bPos, aPos);
+ pY1 += 8;
+ x += 8;
+ }
+ {
+ const uint8x8_t Y1u = vld1_u8(pY1);
+ const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
+ pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D2, E2, rPos, gPos, bPos, aPos);
+ pY1 += 8;
+ x += 8;
+ }
+
+ if (!lastY)
+ {
+ {
+ const uint8x8_t Y2u = vld1_u8(pY2);
+ const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
+ pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D1, E1, rPos, gPos, bPos, aPos);
+ pY2 += 8;
+ }
+ {
+ const uint8x8_t Y2u = vld1_u8(pY2);
+ const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
+ pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D2, E2, rPos, gPos, bPos, aPos);
+ pY2 += 8;
+ }
+ }
+
+ pU += 8;
+ pV += 8;
+ }
+
+ for (; x < nWidth; x++)
+ {
+ const BYTE U = *pU;
+ const BYTE V = *pV;
+ {
+ const BYTE Y = *pY1++;
+ const BYTE r = YUV2R(Y, U, V);
+ const BYTE g = YUV2G(Y, U, V);
+ const BYTE b = YUV2B(Y, U, V);
+ pRGB1[aPos] = 0xFF;
+ pRGB1[rPos] = r;
+ pRGB1[gPos] = g;
+ pRGB1[bPos] = b;
+ pRGB1 += 4;
+ }
+
+ if (!lastY)
+ {
+ const BYTE Y = *pY2++;
+ const BYTE r = YUV2R(Y, U, V);
+ const BYTE g = YUV2G(Y, U, V);
+ const BYTE b = YUV2B(Y, U, V);
+ pRGB2[aPos] = 0xFF;
+ pRGB2[rPos] = r;
+ pRGB2[gPos] = g;
+ pRGB2[bPos] = b;
+ pRGB2 += 4;
+ }
+
+ if (x % 2)
+ {
+ pU++;
+ pV++;
+ }
+ }
+
+ pRGB1 += dPad;
+ pRGB2 += dPad;
+ pY1 += yPad;
+ pY2 += yPad;
+ pU += uPad;
+ pV += vPad;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT const pSrc[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
+
+ case PIXEL_FORMAT_RGBA32:
+ case PIXEL_FORMAT_RGBX32:
+ return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
+
+ case PIXEL_FORMAT_ARGB32:
+ case PIXEL_FORMAT_XRGB32:
+ return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
+
+ case PIXEL_FORMAT_ABGR32:
+ case PIXEL_FORMAT_XBGR32:
+ return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
+
+ default:
+ return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+}
+
+static INLINE pstatus_t neon_YUV444ToX(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, const prim_size_t* WINPR_RESTRICT roi,
+ const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,
+ const uint8_t aPos)
+{
+ const UINT32 nWidth = roi->width;
+ const UINT32 nHeight = roi->height;
+ const UINT32 yPad = srcStep[0] - roi->width;
+ const UINT32 uPad = srcStep[1] - roi->width;
+ const UINT32 vPad = srcStep[2] - roi->width;
+ const UINT32 dPad = dstStep - roi->width * 4;
+ const uint8_t* pY = pSrc[0];
+ const uint8_t* pU = pSrc[1];
+ const uint8_t* pV = pSrc[2];
+ uint8_t* pRGB = pDst;
+ const int16x8_t c128 = vdupq_n_s16(128);
+ const DWORD pad = nWidth % 8;
+
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ for (UINT32 x = 0; x < nWidth - pad; x += 8)
+ {
+ const uint8x8_t Yu = vld1_u8(pY);
+ const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(Yu));
+ const uint8x8_t Uu = vld1_u8(pU);
+ const int16x8_t U = vreinterpretq_s16_u16(vmovl_u8(Uu));
+ const uint8x8_t Vu = vld1_u8(pV);
+ const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vu));
+ /* Do the calculations on Y in 32 bit width, since the result of 255 * 256
+ * does not fit into a signed 16 bit value. */
+ const int16x8_t D = vsubq_s16(U, c128);
+ const int16x8_t E = vsubq_s16(V, c128);
+ pRGB = neon_YuvToRgbPixel(pRGB, Y, D, E, rPos, gPos, bPos, aPos);
+ pY += 8;
+ pU += 8;
+ pV += 8;
+ }
+
+ for (UINT32 x = 0; x < pad; x++)
+ {
+ const BYTE Y = *pY++;
+ const BYTE U = *pU++;
+ const BYTE V = *pV++;
+ const BYTE r = YUV2R(Y, U, V);
+ const BYTE g = YUV2G(Y, U, V);
+ const BYTE b = YUV2B(Y, U, V);
+ pRGB[aPos] = 0xFF;
+ pRGB[rPos] = r;
+ pRGB[gPos] = g;
+ pRGB[bPos] = b;
+ pRGB += 4;
+ }
+
+ pRGB += dPad;
+ pY += yPad;
+ pU += uPad;
+ pV += vPad;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT const pSrc[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
+
+ case PIXEL_FORMAT_RGBA32:
+ case PIXEL_FORMAT_RGBX32:
+ return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
+
+ case PIXEL_FORMAT_ARGB32:
+ case PIXEL_FORMAT_XRGB32:
+ return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
+
+ case PIXEL_FORMAT_ABGR32:
+ case PIXEL_FORMAT_XBGR32:
+ return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
+
+ default:
+ return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+}
+
+static pstatus_t neon_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
+ const UINT32 dstStep[3], const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfWidth = (nWidth + 1) / 2;
+ const UINT32 halfHeight = (nHeight + 1) / 2;
+ const UINT32 evenY = 0;
+ const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+ pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+ pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+ BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+ pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+ pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+
+ /* Y data is already here... */
+ /* B1 */
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ const BYTE* Ym = pSrc[0] + srcStep[0] * y;
+ BYTE* pY = pDst[0] + dstStep[0] * y;
+ memcpy(pY, Ym, nWidth);
+ }
+
+ /* The first half of U and V is already part of this frame. */
+ /* B2 and B3 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const UINT32 val2y = (2 * y + evenY);
+ const BYTE* Um = pSrc[1] + srcStep[1] * y;
+ const BYTE* Vm = pSrc[2] + srcStep[2] * y;
+ BYTE* pU = pDst[1] + dstStep[1] * val2y;
+ BYTE* pV = pDst[2] + dstStep[2] * val2y;
+ BYTE* pU1 = pU + dstStep[1];
+ BYTE* pV1 = pV + dstStep[2];
+
+ UINT32 x = 0;
+ for (; x + 16 < halfWidth; x += 16)
+ {
+ {
+ const uint8x16_t u = vld1q_u8(Um);
+ uint8x16x2_t u2x;
+ u2x.val[0] = u;
+ u2x.val[1] = u;
+ vst2q_u8(pU, u2x);
+ vst2q_u8(pU1, u2x);
+ Um += 16;
+ pU += 32;
+ pU1 += 32;
+ }
+ {
+ const uint8x16_t v = vld1q_u8(Vm);
+ uint8x16x2_t v2x;
+ v2x.val[0] = v;
+ v2x.val[1] = v;
+ vst2q_u8(pV, v2x);
+ vst2q_u8(pV1, v2x);
+ Vm += 16;
+ pV += 32;
+ pV1 += 32;
+ }
+ }
+
+ for (; x < halfWidth; x++)
+ {
+ const BYTE u = *Um++;
+ const BYTE v = *Vm++;
+ *pU++ = u;
+ *pU++ = u;
+ *pU1++ = u;
+ *pU1++ = u;
+ *pV++ = v;
+ *pV++ = v;
+ *pV1++ = v;
+ *pV1++ = v;
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_ChromaFilter(BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 oddY = 1;
+ const UINT32 evenY = 0;
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfHeight = (nHeight + 1) / 2;
+ const UINT32 halfWidth = (nWidth + 1) / 2;
+ const UINT32 halfPad = halfWidth % 16;
+
+ /* Filter */
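+ /* The encoder stored avg = (U(2x,2y) + U(2x+1,2y) + U(2x,2y+1) + U(2x+1,2y+1)) / 4
+ * at the even positions (see the RGBToAVC444YUV routines), while the auxiliary
+ * views carry the three remaining original samples. 4 * avg minus those three
+ * samples therefore reconstructs the original U(2x,2y); V is handled alike. */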
+ for (UINT32 y = roi->top; y < halfHeight + roi->top; y++)
+ {
+ const UINT32 val2y = (y * 2 + evenY);
+ const UINT32 val2y1 = val2y + oddY;
+ BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
+ BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
+ BYTE* pU = pDst[1] + dstStep[1] * val2y;
+ BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+ if (val2y1 > nHeight)
+ continue;
+
+ UINT32 x = roi->left / 2;
+ for (; x < halfWidth + roi->left / 2 - halfPad; x += 16)
+ {
+ {
+ /* U = (U2x,2y << 2) - U2x1,2y - U2x,2y1 - U2x1,2y1 */
+ uint8x8x2_t u = vld2_u8(&pU[2 * x]);
+ const int16x8_t up =
+ vreinterpretq_s16_u16(vshll_n_u8(u.val[0], 2)); /* Ux2,2y << 2 */
+ const uint8x8x2_t u1 = vld2_u8(&pU1[2 * x]);
+ const uint16x8_t usub = vaddl_u8(u1.val[1], u1.val[0]); /* U2x,2y1 + U2x1,2y1 */
+ const int16x8_t us = vreinterpretq_s16_u16(
+ vaddw_u8(usub, u.val[1])); /* U2x1,2y + U2x,2y1 + U2x1,2y1 */
+ const int16x8_t un = vsubq_s16(up, us);
+ const uint8x8_t u8 = vqmovun_s16(un); /* CLIP(un) */
+ u.val[0] = u8;
+ vst2_u8(&pU[2 * x], u);
+ }
+ {
+ /* V = (V2x,2y << 2) - V2x1,2y - V2x,2y1 - V2x1,2y1 */
+ uint8x8x2_t v = vld2_u8(&pV[2 * x]);
+ const int16x8_t vp =
+ vreinterpretq_s16_u16(vshll_n_u8(v.val[0], 2)); /* Vx2,2y << 2 */
+ const uint8x8x2_t v1 = vld2_u8(&pV1[2 * x]);
+ const uint16x8_t vsub = vaddl_u8(v1.val[1], v1.val[0]); /* V2x,2y1 + V2x1,2y1 */
+ const int16x8_t vs = vreinterpretq_s16_u16(
+ vaddw_u8(vsub, v.val[1])); /* V2x1,2y + V2x,2y1 + V2x1,2y1 */
+ const int16x8_t vn = vsubq_s16(vp, vs);
+ const uint8x8_t v8 = vqmovun_s16(vn); /* CLIP(vn) */
+ v.val[0] = v8;
+ vst2_u8(&pV[2 * x], v);
+ }
+ }
+
+ for (; x < halfWidth + roi->left / 2; x++)
+ {
+ const UINT32 val2x = (x * 2);
+ const UINT32 val2x1 = val2x + 1;
+ const BYTE inU = pU[val2x];
+ const BYTE inV = pV[val2x];
+ const INT32 up = inU * 4;
+ const INT32 vp = inV * 4;
+ INT32 u2020;
+ INT32 v2020;
+
+ if (val2x1 > nWidth)
+ continue;
+
+ u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
+ v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
+ pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
+ pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
+ const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 mod = 16;
+ UINT32 uY = 0;
+ UINT32 vY = 0;
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfWidth = (nWidth) / 2;
+ const UINT32 halfHeight = (nHeight) / 2;
+ const UINT32 oddY = 1;
+ const UINT32 evenY = 0;
+ const UINT32 oddX = 1;
+ /* The auxiliary frame is aligned to multiples of 16x16.
+ * We need the padded height for B4 and B5 conversion. */
+ const UINT32 padHeight = nHeight + 16 - nHeight % 16;
+ const UINT32 halfPad = halfWidth % 16;
+ const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+ pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+ pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+ BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+ pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+ pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+
+ /* The second half of U and V is a bit more tricky... */
+ /* B4 and B5 */
+ for (UINT32 y = 0; y < padHeight; y++)
+ {
+ const BYTE* Ya = pSrc[0] + srcStep[0] * y;
+ BYTE* pX;
+
+ if ((y) % mod < (mod + 1) / 2)
+ {
+ const UINT32 pos = (2 * uY++ + oddY);
+
+ if (pos >= nHeight)
+ continue;
+
+ pX = pDst[1] + dstStep[1] * pos;
+ }
+ else
+ {
+ const UINT32 pos = (2 * vY++ + oddY);
+
+ if (pos >= nHeight)
+ continue;
+
+ pX = pDst[2] + dstStep[2] * pos;
+ }
+
+ memcpy(pX, Ya, nWidth);
+ }
+
+ /* B6 and B7 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const UINT32 val2y = (y * 2 + evenY);
+ const BYTE* Ua = pSrc[1] + srcStep[1] * y;
+ const BYTE* Va = pSrc[2] + srcStep[2] * y;
+ BYTE* pU = pDst[1] + dstStep[1] * val2y;
+ BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+ UINT32 x = 0;
+ for (; x < halfWidth - halfPad; x += 16)
+ {
+ {
+ uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
+ u.val[1] = vld1q_u8(&Ua[x]);
+ vst2q_u8(&pU[2 * x], u);
+ }
+ {
+ uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
+ v.val[1] = vld1q_u8(&Va[x]);
+ vst2q_u8(&pV[2 * x], v);
+ }
+ }
+
+ for (; x < halfWidth; x++)
+ {
+ const UINT32 val2x1 = (x * 2 + oddX);
+ pU[val2x1] = Ua[x];
+ pV[val2x1] = Va[x];
+ }
+ }
+
+ /* Filter */
+ return neon_ChromaFilter(pDst, dstStep, roi);
+}
+
+static pstatus_t neon_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], UINT32 nTotalWidth,
+ UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3],
+ const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfWidth = (nWidth + 1) / 2;
+ const UINT32 halfPad = halfWidth % 16;
+ const UINT32 halfHeight = (nHeight + 1) / 2;
+ const UINT32 quarterWidth = (nWidth + 3) / 4;
+ const UINT32 quarterPad = quarterWidth % 16;
+
+ /* B4 and B5: odd UV values for width/2, height */
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ const UINT32 yTop = y + roi->top;
+ const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
+ const BYTE* pYaV = pYaU + nTotalWidth / 2;
+ BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
+ BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
+
+ UINT32 x = 0;
+ for (; x < halfWidth - halfPad; x += 16)
+ {
+ {
+ uint8x16x2_t u = vld2q_u8(&pU[2 * x]);
+ u.val[1] = vld1q_u8(&pYaU[x]);
+ vst2q_u8(&pU[2 * x], u);
+ }
+ {
+ uint8x16x2_t v = vld2q_u8(&pV[2 * x]);
+ v.val[1] = vld1q_u8(&pYaV[x]);
+ vst2q_u8(&pV[2 * x], v);
+ }
+ }
+
+ for (; x < halfWidth; x++)
+ {
+ const UINT32 odd = 2 * x + 1;
+ pU[odd] = pYaU[x];
+ pV[odd] = pYaV[x];
+ }
+ }
+
+ /* B6 - B9 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
+ const BYTE* pUaV = pUaU + nTotalWidth / 4;
+ const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
+ const BYTE* pVaV = pVaU + nTotalWidth / 4;
+ BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
+ BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
+
+ UINT32 x = 0;
+ for (; x < quarterWidth - quarterPad; x += 16)
+ {
+ {
+ uint8x16x4_t u = vld4q_u8(&pU[4 * x]);
+ u.val[0] = vld1q_u8(&pUaU[x]);
+ u.val[2] = vld1q_u8(&pVaU[x]);
+ vst4q_u8(&pU[4 * x], u);
+ }
+ {
+ uint8x16x4_t v = vld4q_u8(&pV[4 * x]);
+ v.val[0] = vld1q_u8(&pUaV[x]);
+ v.val[2] = vld1q_u8(&pVaV[x]);
+ vst4q_u8(&pV[4 * x], v);
+ }
+ }
+
+ for (; x < quarterWidth; x++)
+ {
+ pU[4 * x + 0] = pUaU[x];
+ pV[4 * x + 0] = pUaV[x];
+ pU[4 * x + 2] = pVaU[x];
+ pV[4 * x + 2] = pVaV[x];
+ }
+ }
+
+ return neon_ChromaFilter(pDst, dstStep, roi);
+}
+
+static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
+ const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
+ BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
+ return -1;
+
+ if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
+ return -1;
+
+ if (!roi)
+ return -1;
+
+ switch (type)
+ {
+ case AVC444_LUMA:
+ return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+ case AVC444_CHROMAv1:
+ return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+ case AVC444_CHROMAv2:
+ return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
+
+ default:
+ return -1;
+ }
+}
+
+void primitives_init_YUV_opt(primitives_t* prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_YUV(prims);
+
+ if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R;
+ prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
+ prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
+ }
+}
diff --git a/libfreerdp/primitives/prim_YUV_opencl.c b/libfreerdp/primitives/prim_YUV_opencl.c
new file mode 100644
index 0000000..2ca1b31
--- /dev/null
+++ b/libfreerdp/primitives/prim_YUV_opencl.c
@@ -0,0 +1,500 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Optimized YUV/RGB conversion operations using openCL
+ *
+ * Copyright 2019 David Fort <contact@hardening-consulting.com>
+ * Copyright 2019 Rangee Gmbh
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include "prim_internal.h"
+
+#if defined(WITH_OPENCL)
+#ifdef __APPLE__
+#include "OpenCL/opencl.h"
+#else
+#include <CL/cl.h>
+#endif
+#endif
+
+#include <freerdp/log.h>
+#define TAG FREERDP_TAG("primitives")
+
+typedef struct
+{
+ BOOL support;
+ cl_platform_id platformId;
+ cl_device_id deviceId;
+ cl_context context;
+ cl_command_queue commandQueue;
+ cl_program program;
+} primitives_opencl_context;
+
+typedef struct
+{
+ primitives_opencl_context* cl;
+ cl_kernel kernel;
+ cl_mem srcObjs[3];
+ cl_mem dstObj;
+ prim_size_t roi;
+ size_t dstStep;
+} primitives_cl_kernel;
+
+static primitives_opencl_context* primitives_get_opencl_context(void);
+
+static void cl_kernel_free(primitives_cl_kernel* kernel)
+{
+ if (!kernel)
+ return;
+
+ if (kernel->dstObj)
+ clReleaseMemObject(kernel->dstObj);
+
+ for (size_t i = 0; i < ARRAYSIZE(kernel->srcObjs); i++)
+ {
+ cl_mem obj = kernel->srcObjs[i];
+ kernel->srcObjs[i] = NULL;
+ if (obj)
+ clReleaseMemObject(obj);
+ }
+
+ if (kernel->kernel)
+ clReleaseKernel(kernel->kernel);
+
+ free(kernel);
+}
+
+static primitives_cl_kernel* cl_kernel_new(const char* kernelName, const prim_size_t* roi)
+{
+ WINPR_ASSERT(kernelName);
+ WINPR_ASSERT(roi);
+
+ primitives_cl_kernel* kernel = calloc(1, sizeof(primitives_cl_kernel));
+ if (!kernel)
+ goto fail;
+
+ kernel->roi = *roi;
+ kernel->cl = primitives_get_opencl_context();
+ if (!kernel->cl)
+ goto fail;
+
+ cl_int ret = CL_INVALID_VALUE;
+ kernel->kernel = clCreateKernel(kernel->cl->program, kernelName, &ret);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "openCL: unable to create kernel %s", kernelName);
+ goto fail;
+ }
+
+ return kernel;
+fail:
+ cl_kernel_free(kernel);
+ return NULL;
+}
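+
+/* All conversion kernels in primitives.cl share one argument layout: arguments
+ * 0..5 are the (buffer, stride) pairs for Y, U and V, argument 6 is the
+ * destination buffer and argument 7 the destination stride. cl_kernel_set_sources
+ * and cl_kernel_set_destination below rely on that convention. */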
+
+static BOOL cl_kernel_set_sources(primitives_cl_kernel* ctx,
+ const BYTE* const WINPR_RESTRICT pSrc[3], const UINT32 srcStep[3])
+{
+ const char* sourceNames[] = { "Y", "U", "V" };
+
+ WINPR_ASSERT(ctx);
+ WINPR_ASSERT(pSrc);
+ WINPR_ASSERT(srcStep);
+
+ for (cl_uint i = 0; i < ARRAYSIZE(ctx->srcObjs); i++)
+ {
+ cl_int ret = CL_INVALID_VALUE;
+ ctx->srcObjs[i] = clCreateBuffer(ctx->cl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ 1ull * srcStep[i] * ctx->roi.height, pSrc[i], &ret);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "unable to create %sobj", sourceNames[i]);
+ return FALSE;
+ }
+
+ ret = clSetKernelArg(ctx->kernel, i * 2, sizeof(cl_mem), &ctx->srcObjs[i]);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "unable to set arg for %sobj", sourceNames[i]);
+ return FALSE;
+ }
+
+ ret = clSetKernelArg(ctx->kernel, i * 2 + 1, sizeof(cl_uint), &srcStep[i]);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "unable to set arg stride for %sobj", sourceNames[i]);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+static BOOL cl_kernel_set_destination(primitives_cl_kernel* ctx, UINT32 dstStep)
+{
+
+ WINPR_ASSERT(ctx);
+
+ ctx->dstStep = dstStep;
+ cl_int ret = CL_INVALID_VALUE;
+ ctx->dstObj = clCreateBuffer(ctx->cl->context, CL_MEM_WRITE_ONLY,
+ 1ull * dstStep * ctx->roi.height, NULL, &ret);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "unable to create dest obj");
+ return FALSE;
+ }
+
+ ret = clSetKernelArg(ctx->kernel, 6, sizeof(cl_mem), &ctx->dstObj);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "unable to set arg destObj");
+ return FALSE;
+ }
+
+ ret = clSetKernelArg(ctx->kernel, 7, sizeof(cl_uint), &dstStep);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "unable to set arg dstStep");
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+static BOOL cl_kernel_process(primitives_cl_kernel* ctx, BYTE* pDst)
+{
+ WINPR_ASSERT(ctx);
+ WINPR_ASSERT(pDst);
+
+ size_t indexes[2] = { 0 };
+ indexes[0] = ctx->roi.width;
+ indexes[1] = ctx->roi.height;
+
+ cl_int ret = clEnqueueNDRangeKernel(ctx->cl->commandQueue, ctx->kernel, 2, NULL, indexes, NULL,
+ 0, NULL, NULL);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "unable to enqueue call kernel");
+ return FALSE;
+ }
+
+ /* Transfer result to host */
+ ret = clEnqueueReadBuffer(ctx->cl->commandQueue, ctx->dstObj, CL_TRUE, 0,
+ ctx->roi.height * ctx->dstStep, pDst, 0, NULL, NULL);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "unable to read back buffer");
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+static pstatus_t opencl_YUVToRGB(const char* kernelName, const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ pstatus_t res = -1;
+
+ primitives_cl_kernel* ctx = cl_kernel_new(kernelName, roi);
+ if (!ctx)
+ goto fail;
+
+ if (!cl_kernel_set_sources(ctx, pSrc, srcStep))
+ goto fail;
+
+ if (!cl_kernel_set_destination(ctx, dstStep))
+ goto fail;
+
+ if (!cl_kernel_process(ctx, pDst))
+ goto fail;
+
+ res = PRIMITIVES_SUCCESS;
+
+fail:
+ cl_kernel_free(ctx);
+ return res;
+}
+
+static primitives_opencl_context openclContext = { 0 };
+
+static primitives_opencl_context* primitives_get_opencl_context(void)
+{
+ return &openclContext;
+}
+
+static void cl_context_free(primitives_opencl_context* ctx)
+{
+ if (!ctx)
+ return;
+ clReleaseProgram(ctx->program);
+ clReleaseCommandQueue(ctx->commandQueue);
+ clReleaseContext(ctx->context);
+ clReleaseDevice(ctx->deviceId);
+ ctx->support = FALSE;
+}
+
+static pstatus_t primitives_uninit_opencl(void)
+{
+ if (!openclContext.support)
+ return PRIMITIVES_SUCCESS;
+
+ cl_context_free(&openclContext);
+ return PRIMITIVES_SUCCESS;
+}
+
+static const char openclProgram[] =
+#include "primitives.cl"
+ ;
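+/* primitives.cl must expand to a single C string literal here, i.e. the OpenCL
+ * source is embedded into the binary at compile time. */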
+
+static BOOL primitives_init_opencl_context(primitives_opencl_context* cl)
+{
+ cl_platform_id* platform_ids = NULL;
+ cl_uint ndevices = 0;
+ cl_uint nplatforms = 0;
+ cl_kernel kernel = NULL;
+ cl_int ret = 0;
+
+ BOOL gotGPU = FALSE;
+ size_t programLen = 0;
+
+ ret = clGetPlatformIDs(0, NULL, &nplatforms);
+ if (ret != CL_SUCCESS || nplatforms < 1)
+ return FALSE;
+
+ platform_ids = calloc(nplatforms, sizeof(*platform_ids));
+ if (!platform_ids)
+ return FALSE;
+
+ ret = clGetPlatformIDs(nplatforms, platform_ids, &nplatforms);
+ if (ret != CL_SUCCESS)
+ {
+ free(platform_ids);
+ return FALSE;
+ }
+
+ for (cl_uint i = 0; (i < nplatforms) && !gotGPU; i++)
+ {
+ cl_device_id device_id = NULL;
+ cl_context context = NULL;
+ char platformName[1000] = { 0 };
+ char deviceName[1000] = { 0 };
+
+ ret = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, sizeof(platformName),
+ platformName, NULL);
+ if (ret != CL_SUCCESS)
+ continue;
+
+ ret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &device_id, &ndevices);
+ if (ret != CL_SUCCESS)
+ continue;
+
+ ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "openCL: unable get device name for platform %s", platformName);
+ clReleaseDevice(device_id);
+ continue;
+ }
+
+ context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "openCL: unable to create context for platform %s, device %s",
+ platformName, deviceName);
+ clReleaseDevice(device_id);
+ continue;
+ }
+
+ cl->commandQueue = clCreateCommandQueue(context, device_id, 0, &ret);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "openCL: unable to create command queue");
+ clReleaseContext(context);
+ clReleaseDevice(device_id);
+ continue;
+ }
+
+ WLog_INFO(TAG, "openCL: using platform=%s device=%s", platformName, deviceName);
+
+ cl->platformId = platform_ids[i];
+ cl->deviceId = device_id;
+ cl->context = context;
+ gotGPU = TRUE;
+ }
+
+ free(platform_ids);
+
+ if (!gotGPU)
+ {
+ WLog_ERR(TAG, "openCL: no GPU found");
+ return FALSE;
+ }
+
+ programLen = strnlen(openclProgram, sizeof(openclProgram));
+ const char* ptr = openclProgram;
+ cl->program = clCreateProgramWithSource(cl->context, 1, &ptr, &programLen, &ret);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "openCL: unable to create program");
+ goto fail;
+ }
+
+ ret = clBuildProgram(cl->program, 1, &cl->deviceId, NULL, NULL, NULL);
+ if (ret != CL_SUCCESS)
+ {
+ size_t length = 0;
+ char buffer[2048];
+ ret = clGetProgramBuildInfo(cl->program, cl->deviceId, CL_PROGRAM_BUILD_LOG, sizeof(buffer),
+ buffer, &length);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG,
+ "openCL: building program failed but unable to retrieve buildLog, error=%d",
+ ret);
+ }
+ else
+ {
+ WLog_ERR(TAG, "openCL: unable to build program, errorLog=%s", buffer);
+ }
+ goto fail;
+ }
+
+ kernel = clCreateKernel(cl->program, "yuv420_to_bgra_1b", &ret);
+ if (ret != CL_SUCCESS)
+ {
+ WLog_ERR(TAG, "openCL: unable to create yuv420_to_bgra_1b kernel");
+ goto fail;
+ }
+ clReleaseKernel(kernel);
+
+ cl->support = TRUE;
+ return TRUE;
+
+fail:
+ cl_context_free(cl);
+ return FALSE;
+}
+
+static pstatus_t opencl_YUV420ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ const char* kernel_name = NULL;
+
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_ABGR32:
+ kernel_name = "yuv420_to_abgr_1b";
+ break;
+ case PIXEL_FORMAT_XBGR32:
+ kernel_name = "yuv420_to_xbgr_1b";
+ break;
+ case PIXEL_FORMAT_RGBX32:
+ kernel_name = "yuv420_to_rgbx_1b";
+ break;
+ case PIXEL_FORMAT_RGBA32:
+ kernel_name = "yuv420_to_rgba_1b";
+ break;
+ case PIXEL_FORMAT_BGRA32:
+ kernel_name = "yuv420_to_bgra_1b";
+ break;
+ case PIXEL_FORMAT_BGRX32:
+ kernel_name = "yuv420_to_bgrx_1b";
+ break;
+ case PIXEL_FORMAT_XRGB32:
+ kernel_name = "yuv420_to_xrgb_1b";
+ break;
+ case PIXEL_FORMAT_ARGB32:
+ kernel_name = "yuv420_to_argb_1b";
+ break;
+ default:
+ {
+ primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
+ if (!p)
+ return -1;
+ return p->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+ }
+
+ return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
+}
+
+static pstatus_t opencl_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ const char* kernel_name = NULL;
+
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_ABGR32:
+ kernel_name = "yuv444_to_abgr_1b";
+ break;
+ case PIXEL_FORMAT_XBGR32:
+ kernel_name = "yuv444_to_xbgr_1b";
+ break;
+ case PIXEL_FORMAT_RGBX32:
+ kernel_name = "yuv444_to_rgbx_1b";
+ break;
+ case PIXEL_FORMAT_RGBA32:
+ kernel_name = "yuv444_to_rgba_1b";
+ break;
+ case PIXEL_FORMAT_BGRA32:
+ kernel_name = "yuv444_to_bgra_1b";
+ break;
+ case PIXEL_FORMAT_BGRX32:
+ kernel_name = "yuv444_to_bgrx_1b";
+ break;
+ case PIXEL_FORMAT_XRGB32:
+ kernel_name = "yuv444_to_xrgb_1b";
+ break;
+ case PIXEL_FORMAT_ARGB32:
+ kernel_name = "yuv444_to_argb_1b";
+ break;
+ default:
+ {
+ primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
+ if (!p)
+ return -1;
+ return p->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+ }
+
+ return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
+}
+
+BOOL primitives_init_opencl(primitives_t* prims)
+{
+ primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
+ if (!prims || !p)
+ return FALSE;
+ *prims = *p;
+
+ if (!primitives_init_opencl_context(&openclContext))
+ return FALSE;
+
+ prims->YUV420ToRGB_8u_P3AC4R = opencl_YUV420ToRGB_8u_P3AC4R;
+ prims->YUV444ToRGB_8u_P3AC4R = opencl_YUV444ToRGB_8u_P3AC4R;
+ prims->flags |= PRIM_FLAGS_HAVE_EXTGPU;
+ prims->uninit = primitives_uninit_opencl;
+ return TRUE;
+}
diff --git a/libfreerdp/primitives/prim_YUV_ssse3.c b/libfreerdp/primitives/prim_YUV_ssse3.c
new file mode 100644
index 0000000..2fbef3e
--- /dev/null
+++ b/libfreerdp/primitives/prim_YUV_ssse3.c
@@ -0,0 +1,1515 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Optimized YUV/RGB conversion operations
+ *
+ * Copyright 2014 Thomas Erbesdobler
+ * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
+ * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
+ * Copyright 2016-2017 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include <winpr/crt.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#if !defined(WITH_SSE2)
+#error "This file needs WITH_SSE2 enabled!"
+#endif
+
+static primitives_t* generic = NULL;
+
+/****************************************************************************/
+/* SSSE3 YUV420 -> RGB conversion */
+/****************************************************************************/
+static __m128i* ssse3_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yraw, __m128i Uraw,
+ __m128i Vraw, UINT8 pos)
+{
+ /* Visual Studio 2010 doesn't like _mm_set_epi32 in array initializer list */
+ /* Note: This also applies to Visual Studio 2013 before Update 4 */
+#if !defined(_MSC_VER) || (_MSC_VER > 1600)
+ const __m128i mapY[] = { _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
+ _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480),
+ _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880),
+ _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) };
+ const __m128i mapUV[] = { _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080),
+ _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080),
+ _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080),
+ _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) };
+ const __m128i mask[] = { _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080),
+ _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
+ _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
+#else
+ /* Note: must be in little-endian format ! */
+ const __m128i mapY[] = { { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
+ 0x80, 0x80, 0x03, 0x80, 0x80 },
+ { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80,
+ 0x80, 0x80, 0x07, 0x80, 0x80 },
+ { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80,
+ 0x80, 0x80, 0x0b, 0x80, 0x80 },
+ { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80,
+ 0x80, 0x80, 0x0f, 0x80, 0x80 }
+ };
+ const __m128i mapUV[] = { { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01,
+ 0x80, 0x02, 0x80, 0x03, 0x80 },
+ { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05,
+ 0x80, 0x06, 0x80, 0x07, 0x80 },
+ { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09,
+ 0x80, 0x0a, 0x80, 0x0b, 0x80 },
+ { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d,
+ 0x80, 0x0e, 0x80, 0x0f, 0x80 } };
+ const __m128i mask[] = { { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02,
+ 0x80, 0x80, 0x80, 0x03, 0x80 },
+ { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80,
+ 0x80, 0x80, 0x03, 0x80, 0x80 },
+ { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80,
+ 0x80, 0x03, 0x80, 0x80, 0x80 } };
+#endif
+ const __m128i c128 = _mm_set1_epi16(128);
+ __m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst),
+ _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
+ {
+ __m128i C;
+ __m128i D;
+ __m128i E;
+ /* Load Y values and expand to 32 bit */
+ {
+ C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */
+ }
+ /* Load U values and expand to 32 bit */
+ {
+ const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */
+ D = _mm_sub_epi16(U, c128); /* D = U - 128 */
+ }
+ /* Load V values and expand to 32 bit */
+ {
+ const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */
+ E = _mm_sub_epi16(V, c128); /* E = V - 128 */
+ }
+ /* Get the R value */
+ {
+ const __m128i c403 = _mm_set1_epi16(403);
+ const __m128i e403 =
+ _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
+ const __m128i Rs = _mm_add_epi32(C, e403);
+ const __m128i R32 = _mm_srai_epi32(Rs, 8);
+ const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
+ const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
+ const __m128i packed = _mm_shuffle_epi8(R, mask[0]);
+ BGRX = _mm_or_si128(BGRX, packed);
+ }
+ /* Get the G value */
+ {
+ const __m128i c48 = _mm_set1_epi16(48);
+ const __m128i d48 =
+ _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
+ const __m128i c120 = _mm_set1_epi16(120);
+ const __m128i e120 =
+ _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
+ const __m128i de = _mm_add_epi32(d48, e120);
+ const __m128i Gs = _mm_sub_epi32(C, de);
+ const __m128i G32 = _mm_srai_epi32(Gs, 8);
+ const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
+ const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
+ const __m128i packed = _mm_shuffle_epi8(G, mask[1]);
+ BGRX = _mm_or_si128(BGRX, packed);
+ }
+ /* Get the B value */
+ {
+ const __m128i c475 = _mm_set1_epi16(475);
+ const __m128i d475 =
+ _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
+ const __m128i Bs = _mm_add_epi32(C, d475);
+ const __m128i B32 = _mm_srai_epi32(Bs, 8);
+ const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
+ const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
+ const __m128i packed = _mm_shuffle_epi8(B, mask[2]);
+ BGRX = _mm_or_si128(BGRX, packed);
+ }
+ }
+ _mm_storeu_si128(dst++, BGRX);
+ return dst;
+}
+
+static pstatus_t ssse3_YUV420ToRGB_BGRX(const BYTE* const WINPR_RESTRICT pSrc[],
+ const UINT32* WINPR_RESTRICT srcStep,
+ BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ const UINT32 nWidth = roi->width;
+ const UINT32 nHeight = roi->height;
+ const UINT32 pad = roi->width % 16;
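+ /* 4:2:0 carries one U and one V sample per 2x2 pixel block. This shuffle
+ * doubles each of the low 8 chroma bytes so that 8 subsampled values cover
+ * 16 luma samples (nearest-neighbour upsampling along the row). */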
+ const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ __m128i* dst = (__m128i*)(pDst + dstStep * y);
+ const BYTE* YData = pSrc[0] + y * srcStep[0];
+ const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1];
+ const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2];
+
+ for (UINT32 x = 0; x < nWidth - pad; x += 16)
+ {
+ const __m128i Y = _mm_loadu_si128((const __m128i*)YData);
+ const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData);
+ const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData);
+ const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
+ const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
+ YData += 16;
+ UData += 8;
+ VData += 8;
+ dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
+ dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
+ dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
+ dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
+ }
+
+ for (UINT32 x = 0; x < pad; x++)
+ {
+ const BYTE Y = *YData++;
+ const BYTE U = *UData;
+ const BYTE V = *VData;
+ const BYTE r = YUV2R(Y, U, V);
+ const BYTE g = YUV2G(Y, U, V);
+ const BYTE b = YUV2B(Y, U, V);
+ dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
+
+ if (x % 2)
+ {
+ UData++;
+ VData++;
+ }
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_YUV420ToRGB(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRX32:
+ case PIXEL_FORMAT_BGRA32:
+ return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi);
+
+ default:
+ return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+}
+
+static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(const BYTE* const WINPR_RESTRICT pSrc[],
+ const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ const UINT32 nWidth = roi->width;
+ const UINT32 nHeight = roi->height;
+ const UINT32 pad = roi->width % 16;
+
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ __m128i* dst = (__m128i*)(pDst + dstStep * y);
+ const BYTE* YData = pSrc[0] + y * srcStep[0];
+ const BYTE* UData = pSrc[1] + y * srcStep[1];
+ const BYTE* VData = pSrc[2] + y * srcStep[2];
+
+ for (UINT32 x = 0; x < nWidth - pad; x += 16)
+ {
+ __m128i Y = _mm_load_si128((const __m128i*)YData);
+ __m128i U = _mm_load_si128((const __m128i*)UData);
+ __m128i V = _mm_load_si128((const __m128i*)VData);
+ YData += 16;
+ UData += 16;
+ VData += 16;
+ dst = ssse3_YUV444Pixel(dst, Y, U, V, 0);
+ dst = ssse3_YUV444Pixel(dst, Y, U, V, 1);
+ dst = ssse3_YUV444Pixel(dst, Y, U, V, 2);
+ dst = ssse3_YUV444Pixel(dst, Y, U, V, 3);
+ }
+
+ for (UINT32 x = 0; x < pad; x++)
+ {
+ const BYTE Y = *YData++;
+ const BYTE U = *UData++;
+ const BYTE V = *VData++;
+ const BYTE r = YUV2R(Y, U, V);
+ const BYTE g = YUV2G(Y, U, V);
+ const BYTE b = YUV2B(Y, U, V);
+ dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT pSrc[],
+ const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ if ((uintptr_t)pSrc[0] % 16 || (uintptr_t)pSrc[1] % 16 || (uintptr_t)pSrc[2] % 16 ||
+ srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16)
+ return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRX32:
+ case PIXEL_FORMAT_BGRA32:
+ return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
+
+ default:
+ return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+}
+
+/****************************************************************************/
+/* SSSE3 RGB -> YUV420 conversion */
+/****************************************************************************/
+
+/**
+ * Note (nfedera):
+ * The forward RGB -> YUV transformation factors used here are based on the
+ * values specified in [Rec. ITU-R BT.709-6] Section 3:
+ * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en
+ *
+ * Y = 0.21260 * R + 0.71520 * G + 0.07220 * B + 0;
+ * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128;
+ * V = 0.50000 * R - 0.45415 * G - 0.04585 * B + 128;
+ *
+ * The most accurate integer arithmetic approximation when using 8-bit signed
+ * integer factors with 16-bit signed integer intermediate results is:
+ *
+ * Y = ( ( 27 * R + 92 * G + 9 * B) >> 7 );
+ * U = ( (-29 * R - 99 * G + 128 * B) >> 8 ) + 128;
+ * V = ( ( 128 * R - 116 * G - 12 * B) >> 8 ) + 128;
+ *
+ * Because the signed 8-bit range is [-128, 127], the U and V factors of 128
+ * are rounded down to 127.
+ */
+
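+/* Worked example of the effect of that rounding: for a fully saturated blue
+ * pixel (R = 0, G = 0, B = 255)
+ *
+ * U = ((127 * 255) >> 8) + 128 = 126 + 128 = 254
+ *
+ * whereas the exact factor 128 would give ((128 * 255) >> 8) + 128 = 255, so
+ * the clamped factor costs at most one level in the most saturated cases.
+ * For black (0, 0, 0) the results are exact: Y = 0, U = V = 128.
+ */
+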
+#define BGRX_Y_FACTORS _mm_set_epi8(0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9)
+#define BGRX_U_FACTORS \
+ _mm_set_epi8(0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127, 0, -29, -99, 127)
+#define BGRX_V_FACTORS \
+ _mm_set_epi8(0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12, 0, 127, -116, -12)
+#define CONST128_FACTORS _mm_set1_epi8(-128)
+
+#define Y_SHIFT 7
+#define U_SHIFT 8
+#define V_SHIFT 8
+
+/*
+TODO:
+RGB[AX] can simply be supported using the following factors. And instead of loading the
+globals directly the functions below could be passed pointers to the correct vectors
+depending on the source picture format.
+
+PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
+ 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0
+};
+PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
+ -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0
+};
+PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
+ 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0
+};
+*/
+
+/* compute the luma (Y) component from a single rgb source line */
+
+static INLINE void ssse3_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE* dst, UINT32 width)
+{
+ __m128i x0;
+ __m128i x1;
+ __m128i x2;
+ __m128i x3;
+ const __m128i y_factors = BGRX_Y_FACTORS;
+ const __m128i* argb = (const __m128i*)src;
+ __m128i* ydst = (__m128i*)dst;
+
+ for (UINT32 x = 0; x < width; x += 16)
+ {
+ /* store 16 rgba pixels in 4 128 bit registers */
+ x0 = _mm_load_si128(argb++); // 1st 4 pixels
+ x1 = _mm_load_si128(argb++); // 2nd 4 pixels
+ x2 = _mm_load_si128(argb++); // 3rd 4 pixels
+ x3 = _mm_load_si128(argb++); // 4th 4 pixels
+ /* multiplications and subtotals */
+ x0 = _mm_maddubs_epi16(x0, y_factors);
+ x1 = _mm_maddubs_epi16(x1, y_factors);
+ x2 = _mm_maddubs_epi16(x2, y_factors);
+ x3 = _mm_maddubs_epi16(x3, y_factors);
+ /* the total sums */
+ x0 = _mm_hadd_epi16(x0, x1);
+ x2 = _mm_hadd_epi16(x2, x3);
+ /* shift the results */
+ x0 = _mm_srli_epi16(x0, Y_SHIFT);
+ x2 = _mm_srli_epi16(x2, Y_SHIFT);
+ /* pack the 16 words into bytes */
+ x0 = _mm_packus_epi16(x0, x2);
+ /* save to y plane */
+ _mm_storeu_si128(ydst++, x0);
+ }
+}
+
+/* compute the chrominance (UV) components from two rgb source lines */
+
+static INLINE void ssse3_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
+ const BYTE* WINPR_RESTRICT src2,
+ BYTE* WINPR_RESTRICT dst1, BYTE* WINPR_RESTRICT dst2,
+ UINT32 width)
+{
+ const __m128i u_factors = BGRX_U_FACTORS;
+ const __m128i v_factors = BGRX_V_FACTORS;
+ const __m128i vector128 = CONST128_FACTORS;
+ __m128i x0;
+ __m128i x1;
+ __m128i x2;
+ __m128i x3;
+ __m128i x4;
+ __m128i x5;
+ const __m128i* rgb1 = (const __m128i*)src1;
+ const __m128i* rgb2 = (const __m128i*)src2;
+ __m64* udst = (__m64*)dst1;
+ __m64* vdst = (__m64*)dst2;
+
+ for (UINT32 x = 0; x < width; x += 16)
+ {
+ /* subsample 16x2 pixels into 16x1 pixels */
+ x0 = _mm_load_si128(rgb1++);
+ x4 = _mm_load_si128(rgb2++);
+ x0 = _mm_avg_epu8(x0, x4);
+ x1 = _mm_load_si128(rgb1++);
+ x4 = _mm_load_si128(rgb2++);
+ x1 = _mm_avg_epu8(x1, x4);
+ x2 = _mm_load_si128(rgb1++);
+ x4 = _mm_load_si128(rgb2++);
+ x2 = _mm_avg_epu8(x2, x4);
+ x3 = _mm_load_si128(rgb1++);
+ x4 = _mm_load_si128(rgb2++);
+ x3 = _mm_avg_epu8(x3, x4);
+ /* subsample these 16x1 pixels into 8x1 pixels */
+ /**
+ * shuffle controls
+ * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88
+ * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd
+ */
+ x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
+ x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
+ x0 = _mm_avg_epu8(x0, x4);
+ x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
+ x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
+ x1 = _mm_avg_epu8(x1, x4);
+ /* multiplications and subtotals */
+ x2 = _mm_maddubs_epi16(x0, u_factors);
+ x3 = _mm_maddubs_epi16(x1, u_factors);
+ x4 = _mm_maddubs_epi16(x0, v_factors);
+ x5 = _mm_maddubs_epi16(x1, v_factors);
+ /* the total sums */
+ x0 = _mm_hadd_epi16(x2, x3);
+ x1 = _mm_hadd_epi16(x4, x5);
+ /* shift the results */
+ x0 = _mm_srai_epi16(x0, U_SHIFT);
+ x1 = _mm_srai_epi16(x1, V_SHIFT);
+ /* pack the 16 words into bytes */
+ x0 = _mm_packs_epi16(x0, x1);
+ /* add 128 */
+ x0 = _mm_sub_epi8(x0, vector128);
+ /* the lower 8 bytes go to the u plane */
+ _mm_storel_pi(udst++, _mm_castsi128_ps(x0));
+ /* the upper 8 bytes go to the v plane */
+ _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
+ }
+}
+
+static pstatus_t ssse3_RGBToYUV420_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
+ const UINT32 dstStep[],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ const BYTE* argb = pSrc;
+ BYTE* ydst = pDst[0];
+ BYTE* udst = pDst[1];
+ BYTE* vdst = pDst[2];
+
+ if (roi->height < 1 || roi->width < 1)
+ {
+ return !PRIMITIVES_SUCCESS;
+ }
+
+ if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
+ {
+ return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
+ }
+
+ for (UINT32 y = 0; y < roi->height - 1; y += 2)
+ {
+ const BYTE* line1 = argb;
+ const BYTE* line2 = argb + srcStep;
+ ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
+ ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
+ ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
+ argb += 2 * srcStep;
+ ydst += 2 * dstStep[0];
+ udst += 1 * dstStep[1];
+ vdst += 1 * dstStep[2];
+ }
+
+ if (roi->height & 1)
+ {
+ /* pass the same last line of an odd height twice for UV */
+ ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
+ ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_RGBToYUV420(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst[],
+ const UINT32 dstStep[], const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (srcFormat)
+ {
+ case PIXEL_FORMAT_BGRX32:
+ case PIXEL_FORMAT_BGRA32:
+ return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
+
+ default:
+ return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
+ }
+}
+
+/****************************************************************************/
+/* SSSE3 RGB -> AVC444-YUV conversion */
+/****************************************************************************/
+
+static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
+ const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
+ BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,
+ BYTE* WINPR_RESTRICT b3, BYTE* WINPR_RESTRICT b4, BYTE* WINPR_RESTRICT b5,
+ BYTE* WINPR_RESTRICT b6, BYTE* WINPR_RESTRICT b7, UINT32 width)
+{
+ const __m128i* argbEven = (const __m128i*)srcEven;
+ const __m128i* argbOdd = (const __m128i*)srcOdd;
+ const __m128i y_factors = BGRX_Y_FACTORS;
+ const __m128i u_factors = BGRX_U_FACTORS;
+ const __m128i v_factors = BGRX_V_FACTORS;
+ const __m128i vector128 = CONST128_FACTORS;
+
+ for (UINT32 x = 0; x < width; x += 16)
+ {
+ /* store 16 rgba pixels in 4 128 bit registers */
+ const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
+ const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
+ const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
+ const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
+ const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels
+ const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels
+ const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels
+ const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels
+ {
+ /* Y: multiplications with subtotals and horizontal sums */
+ const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
+ _mm_maddubs_epi16(xe2, y_factors)),
+ Y_SHIFT);
+ const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
+ _mm_maddubs_epi16(xe4, y_factors)),
+ Y_SHIFT);
+ const __m128i ye = _mm_packus_epi16(ye1, ye2);
+ const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
+ _mm_maddubs_epi16(xo2, y_factors)),
+ Y_SHIFT);
+ const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
+ _mm_maddubs_epi16(xo4, y_factors)),
+ Y_SHIFT);
+ const __m128i yo = _mm_packus_epi16(yo1, yo2);
+ /* store y [b1] */
+ _mm_storeu_si128((__m128i*)b1Even, ye);
+ b1Even += 16;
+
+ if (b1Odd)
+ {
+ _mm_storeu_si128((__m128i*)b1Odd, yo);
+ b1Odd += 16;
+ }
+ }
+ {
+ /* We have now
+ * 16 even U values in ue
+ * 16 odd U values in uo
+ *
+ * We need to split these according to
+ * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
+ __m128i ue;
+ __m128i uo = { 0 };
+ {
+ const __m128i ue1 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
+ _mm_maddubs_epi16(xe2, u_factors)),
+ U_SHIFT);
+ const __m128i ue2 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
+ _mm_maddubs_epi16(xe4, u_factors)),
+ U_SHIFT);
+ ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
+ }
+
+ if (b1Odd)
+ {
+ const __m128i uo1 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
+ _mm_maddubs_epi16(xo2, u_factors)),
+ U_SHIFT);
+ const __m128i uo2 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
+ _mm_maddubs_epi16(xo4, u_factors)),
+ U_SHIFT);
+ uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
+ }
+
+ /* Now we need the following storage distribution:
+ * 2x 2y -> b2
+ * x 2y+1 -> b4
+ * 2x+1 2y -> b6 */
+ if (b1Odd) /* b2 */
+ {
+ const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
+ const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
+ const __m128i hi = _mm_add_epi16(ueh, uoh);
+ const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
+ const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
+ const __m128i lo = _mm_add_epi16(uel, uol);
+ const __m128i added = _mm_hadd_epi16(lo, hi);
+ const __m128i avg16 = _mm_srai_epi16(added, 2);
+ const __m128i avg = _mm_packus_epi16(avg16, avg16);
+ _mm_storel_epi64((__m128i*)b2, avg);
+ }
+ else
+ {
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ const __m128i ud = _mm_shuffle_epi8(ue, mask);
+ _mm_storel_epi64((__m128i*)b2, ud);
+ }
+
+ b2 += 8;
+
+ if (b1Odd) /* b4 */
+ {
+ _mm_store_si128((__m128i*)b4, uo);
+ b4 += 16;
+ }
+
+ {
+ /* b6 */
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ const __m128i ude = _mm_shuffle_epi8(ue, mask);
+ _mm_storel_epi64((__m128i*)b6, ude);
+ b6 += 8;
+ }
+ }
+ {
+ /* We have now
+ * 16 even V values in ue
+ * 16 odd V values in uo
+ *
+ * We need to split these according to
+ * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
+ __m128i ve;
+ __m128i vo = { 0 };
+ {
+ const __m128i ve1 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
+ _mm_maddubs_epi16(xe2, v_factors)),
+ V_SHIFT);
+ const __m128i ve2 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
+ _mm_maddubs_epi16(xe4, v_factors)),
+ V_SHIFT);
+ ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
+ }
+
+ if (b1Odd)
+ {
+ const __m128i vo1 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
+ _mm_maddubs_epi16(xo2, v_factors)),
+ V_SHIFT);
+ const __m128i vo2 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
+ _mm_maddubs_epi16(xo4, v_factors)),
+ V_SHIFT);
+ vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
+ }
+
+ /* Now we need the following storage distribution:
+ * 2x 2y -> b3
+ * x 2y+1 -> b5
+ * 2x+1 2y -> b7 */
+ if (b1Odd) /* b3 */
+ {
+ const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
+ const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
+ const __m128i hi = _mm_add_epi16(veh, voh);
+ const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
+ const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
+ const __m128i lo = _mm_add_epi16(vel, vol);
+ const __m128i added = _mm_hadd_epi16(lo, hi);
+ const __m128i avg16 = _mm_srai_epi16(added, 2);
+ const __m128i avg = _mm_packus_epi16(avg16, avg16);
+ _mm_storel_epi64((__m128i*)b3, avg);
+ }
+ else
+ {
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ const __m128i vd = _mm_shuffle_epi8(ve, mask);
+ _mm_storel_epi64((__m128i*)b3, vd);
+ }
+
+ b3 += 8;
+
+ if (b1Odd) /* b5 */
+ {
+ _mm_store_si128((__m128i*)b5, vo);
+ b5 += 16;
+ }
+
+ {
+ /* b7 */
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ const __m128i vde = _mm_shuffle_epi8(ve, mask);
+ _mm_storel_epi64((__m128i*)b7, vde);
+ b7 += 8;
+ }
+ }
+ }
+}
+
+static pstatus_t ssse3_RGBToAVC444YUV_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
+ const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
+ const UINT32 dst2Step[],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
+
+ if (roi->height < 1 || roi->width < 1)
+ return !PRIMITIVES_SUCCESS;
+
+ if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
+ return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
+ roi);
+
+ for (UINT32 y = 0; y < roi->height; y += 2)
+ {
+ const BOOL last = (y >= (roi->height - 1));
+ const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
+ const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
+ const UINT32 i = y >> 1;
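+ /* The auxiliary frame packs the extra chroma rows into its Y plane in
+ * 16-row blocks: 8 rows of U data followed by 8 rows of V data (matching
+ * the y % 16 split in ssse3_ChromaV1ToYUV444). n maps half-row i into its
+ * U block; b5 points 8 rows below it, into the V block. */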
+ const UINT32 n = (i & ~7) + i;
+ BYTE* b1Even = pDst1[0] + y * dst1Step[0];
+ BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
+ BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
+ BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
+ BYTE* b4 = pDst2[0] + dst2Step[0] * n;
+ BYTE* b5 = b4 + 8 * dst2Step[0];
+ BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
+ BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
+ ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
+ roi->width);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_RGBToAVC444YUV(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
+ const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
+ const UINT32 dst2Step[],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (srcFormat)
+ {
+ case PIXEL_FORMAT_BGRX32:
+ case PIXEL_FORMAT_BGRA32:
+ return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
+ dst2Step, roi);
+
+ default:
+ return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
+ dst2Step, roi);
+ }
+}
+
+/* Mapping of arguments:
+ *
+ * b1 [even lines] -> yLumaDstEven
+ * b1 [odd lines] -> yLumaDstOdd
+ * b2 -> uLumaDst
+ * b3 -> vLumaDst
+ * b4 -> yChromaDst1
+ * b5 -> yChromaDst2
+ * b6 -> uChromaDst1
+ * b7 -> uChromaDst2
+ * b8 -> vChromaDst1
+ * b9 -> vChromaDst2
+ */
+static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
+ const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
+ BYTE* WINPR_RESTRICT yLumaDstEven, BYTE* WINPR_RESTRICT yLumaDstOdd,
+ BYTE* WINPR_RESTRICT uLumaDst, BYTE* WINPR_RESTRICT vLumaDst,
+ BYTE* WINPR_RESTRICT yEvenChromaDst1, BYTE* WINPR_RESTRICT yEvenChromaDst2,
+ BYTE* WINPR_RESTRICT yOddChromaDst1, BYTE* WINPR_RESTRICT yOddChromaDst2,
+ BYTE* WINPR_RESTRICT uChromaDst1, BYTE* WINPR_RESTRICT uChromaDst2,
+ BYTE* WINPR_RESTRICT vChromaDst1, BYTE* WINPR_RESTRICT vChromaDst2, UINT32 width)
+{
+ const __m128i vector128 = CONST128_FACTORS;
+ const __m128i* argbEven = (const __m128i*)srcEven;
+ const __m128i* argbOdd = (const __m128i*)srcOdd;
+
+ for (UINT32 x = 0; x < width; x += 16)
+ {
+ /* store 16 rgba pixels in 4 128 bit registers
+ * for even and odd rows.
+ */
+ const __m128i xe1 = _mm_load_si128(argbEven++); /* 1st 4 pixels */
+ const __m128i xe2 = _mm_load_si128(argbEven++); /* 2nd 4 pixels */
+ const __m128i xe3 = _mm_load_si128(argbEven++); /* 3rd 4 pixels */
+ const __m128i xe4 = _mm_load_si128(argbEven++); /* 4th 4 pixels */
+ const __m128i xo1 = _mm_load_si128(argbOdd++); /* 1st 4 pixels */
+ const __m128i xo2 = _mm_load_si128(argbOdd++); /* 2nd 4 pixels */
+ const __m128i xo3 = _mm_load_si128(argbOdd++); /* 3rd 4 pixels */
+ const __m128i xo4 = _mm_load_si128(argbOdd++); /* 4th 4 pixels */
+ {
+ /* Y: multiplications with subtotals and horizontal sums */
+ const __m128i y_factors = BGRX_Y_FACTORS;
+ const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
+ _mm_maddubs_epi16(xe2, y_factors)),
+ Y_SHIFT);
+ const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
+ _mm_maddubs_epi16(xe4, y_factors)),
+ Y_SHIFT);
+ const __m128i ye = _mm_packus_epi16(ye1, ye2);
+ /* store y [b1] */
+ _mm_storeu_si128((__m128i*)yLumaDstEven, ye);
+ yLumaDstEven += 16;
+ }
+
+ if (yLumaDstOdd)
+ {
+ const __m128i y_factors = BGRX_Y_FACTORS;
+ const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
+ _mm_maddubs_epi16(xo2, y_factors)),
+ Y_SHIFT);
+ const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
+ _mm_maddubs_epi16(xo4, y_factors)),
+ Y_SHIFT);
+ const __m128i yo = _mm_packus_epi16(yo1, yo2);
+ _mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
+ yLumaDstOdd += 16;
+ }
+
+ {
+ /* We have now
+ * 16 even U values in ue
+ * 16 odd U values in uo
+ *
+ * We need to split these according to
+ * 3.3.8.3.3 YUV420p Stream Combination for YUV444v2 mode */
+ /* U: multiplications with subtotals and horizontal sums */
+ __m128i ue;
+ __m128i uo;
+ __m128i uavg;
+ {
+ const __m128i u_factors = BGRX_U_FACTORS;
+ const __m128i ue1 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
+ _mm_maddubs_epi16(xe2, u_factors)),
+ U_SHIFT);
+ const __m128i ue2 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
+ _mm_maddubs_epi16(xe4, u_factors)),
+ U_SHIFT);
+ const __m128i ueavg = _mm_hadd_epi16(ue1, ue2);
+ ue = _mm_sub_epi8(_mm_packs_epi16(ue1, ue2), vector128);
+ uavg = ueavg;
+ }
+ {
+ const __m128i u_factors = BGRX_U_FACTORS;
+ const __m128i uo1 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
+ _mm_maddubs_epi16(xo2, u_factors)),
+ U_SHIFT);
+ const __m128i uo2 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
+ _mm_maddubs_epi16(xo4, u_factors)),
+ U_SHIFT);
+ const __m128i uoavg = _mm_hadd_epi16(uo1, uo2);
+ uo = _mm_sub_epi8(_mm_packs_epi16(uo1, uo2), vector128);
+ uavg = _mm_add_epi16(uavg, uoavg);
+ uavg = _mm_srai_epi16(uavg, 2);
+ uavg = _mm_packs_epi16(uavg, uoavg);
+ uavg = _mm_sub_epi8(uavg, vector128);
+ }
+ /* Now we need the following storage distribution:
+ * 2x 2y -> uLumaDst
+ * 2x+1 y -> yChromaDst1
+ * 4x 2y+1 -> uChromaDst1
+ * 4x+2 2y+1 -> vChromaDst1 */
+ {
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ const __m128i ude = _mm_shuffle_epi8(ue, mask);
+ _mm_storel_epi64((__m128i*)yEvenChromaDst1, ude);
+ yEvenChromaDst1 += 8;
+ }
+
+ if (yLumaDstOdd)
+ {
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ const __m128i udo = _mm_shuffle_epi8(uo, mask);
+ _mm_storel_epi64((__m128i*)yOddChromaDst1, udo);
+ yOddChromaDst1 += 8;
+ }
+
+ if (yLumaDstOdd)
+ {
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
+ const __m128i ud = _mm_shuffle_epi8(uo, mask);
+ int* uDst1 = (int*)uChromaDst1;
+ int* vDst1 = (int*)vChromaDst1;
+ const int* src = (const int*)&ud;
+ _mm_stream_si32(uDst1, src[0]);
+ _mm_stream_si32(vDst1, src[1]);
+ uChromaDst1 += 4;
+ vChromaDst1 += 4;
+ }
+
+ if (yLumaDstOdd)
+ {
+ _mm_storel_epi64((__m128i*)uLumaDst, uavg);
+ uLumaDst += 8;
+ }
+ else
+ {
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ const __m128i ud = _mm_shuffle_epi8(ue, mask);
+ _mm_storel_epi64((__m128i*)uLumaDst, ud);
+ uLumaDst += 8;
+ }
+ }
+
+ {
+ /* V: multiplications with subtotals and horizontal sums */
+ __m128i ve;
+ __m128i vo;
+ __m128i vavg;
+ {
+ const __m128i v_factors = BGRX_V_FACTORS;
+ const __m128i ve1 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
+ _mm_maddubs_epi16(xe2, v_factors)),
+ V_SHIFT);
+ const __m128i ve2 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
+ _mm_maddubs_epi16(xe4, v_factors)),
+ V_SHIFT);
+ const __m128i veavg = _mm_hadd_epi16(ve1, ve2);
+ ve = _mm_sub_epi8(_mm_packs_epi16(ve1, ve2), vector128);
+ vavg = veavg;
+ }
+ {
+ const __m128i v_factors = BGRX_V_FACTORS;
+ const __m128i vo1 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
+ _mm_maddubs_epi16(xo2, v_factors)),
+ V_SHIFT);
+ const __m128i vo2 =
+ _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
+ _mm_maddubs_epi16(xo4, v_factors)),
+ V_SHIFT);
+ const __m128i voavg = _mm_hadd_epi16(vo1, vo2);
+ vo = _mm_sub_epi8(_mm_packs_epi16(vo1, vo2), vector128);
+ vavg = _mm_add_epi16(vavg, voavg);
+ vavg = _mm_srai_epi16(vavg, 2);
+ vavg = _mm_packs_epi16(vavg, voavg);
+ vavg = _mm_sub_epi8(vavg, vector128);
+ }
+ /* Now we need the following storage distribution:
+ * 2x 2y -> vLumaDst
+ * 2x+1 y -> yChromaDst2
+ * 4x 2y+1 -> uChromaDst2
+ * 4x+2 2y+1 -> vChromaDst2 */
+ {
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ __m128i vde = _mm_shuffle_epi8(ve, mask);
+ _mm_storel_epi64((__m128i*)yEvenChromaDst2, vde);
+ yEvenChromaDst2 += 8;
+ }
+
+ if (yLumaDstOdd)
+ {
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ __m128i vdo = _mm_shuffle_epi8(vo, mask);
+ _mm_storel_epi64((__m128i*)yOddChromaDst2, vdo);
+ yOddChromaDst2 += 8;
+ }
+
+ if (yLumaDstOdd)
+ {
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 14, 10, 6, 2, 12, 8, 4, 0);
+ const __m128i vd = _mm_shuffle_epi8(vo, mask);
+ int* uDst2 = (int*)uChromaDst2;
+ int* vDst2 = (int*)vChromaDst2;
+ const int* src = (const int*)&vd;
+ _mm_stream_si32(uDst2, src[0]);
+ _mm_stream_si32(vDst2, src[1]);
+ uChromaDst2 += 4;
+ vChromaDst2 += 4;
+ }
+
+ if (yLumaDstOdd)
+ {
+ _mm_storel_epi64((__m128i*)vLumaDst, vavg);
+ vLumaDst += 8;
+ }
+ else
+ {
+ const __m128i mask =
+ _mm_set_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+ (char)0x80, (char)0x80, (char)0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i vd = _mm_shuffle_epi8(ve, mask);
+ _mm_storel_epi64((__m128i*)vLumaDst, vd);
+ vLumaDst += 8;
+ }
+ }
+ }
+}
+
+static pstatus_t ssse3_RGBToAVC444YUVv2_BGRX(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
+ const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
+ const UINT32 dst2Step[],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ if (roi->height < 1 || roi->width < 1)
+ return !PRIMITIVES_SUCCESS;
+
+ if (roi->width % 16 || (uintptr_t)pSrc % 16 || srcStep % 16)
+ return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step,
+ roi);
+
+ for (UINT32 y = 0; y < roi->height; y += 2)
+ {
+ const BYTE* srcEven = (pSrc + y * srcStep);
+ /* For an odd height the last iteration has no odd source row; reuse the
+ * even row so the unconditional SIMD loads in the double-row helper stay
+ * in bounds (its odd-row results are discarded when dstLumaYOdd is NULL).
+ */
+ const BYTE* srcOdd = (y < roi->height - 1) ? (srcEven + srcStep) : srcEven;
+ BYTE* dstLumaYEven = (pDst1[0] + y * dst1Step[0]);
+ BYTE* dstLumaYOdd = (y < roi->height - 1) ? (dstLumaYEven + dst1Step[0]) : NULL;
+ BYTE* dstLumaU = (pDst1[1] + (y / 2) * dst1Step[1]);
+ BYTE* dstLumaV = (pDst1[2] + (y / 2) * dst1Step[2]);
+ BYTE* dstEvenChromaY1 = (pDst2[0] + y * dst2Step[0]);
+ BYTE* dstEvenChromaY2 = dstEvenChromaY1 + roi->width / 2;
+ BYTE* dstOddChromaY1 = dstEvenChromaY1 + dst2Step[0];
+ BYTE* dstOddChromaY2 = dstEvenChromaY2 + dst2Step[0];
+ BYTE* dstChromaU1 = (pDst2[1] + (y / 2) * dst2Step[1]);
+ BYTE* dstChromaV1 = (pDst2[2] + (y / 2) * dst2Step[2]);
+ BYTE* dstChromaU2 = dstChromaU1 + roi->width / 4;
+ BYTE* dstChromaV2 = dstChromaV1 + roi->width / 4;
+ ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(srcEven, srcOdd, dstLumaYEven, dstLumaYOdd, dstLumaU,
+ dstLumaV, dstEvenChromaY1, dstEvenChromaY2,
+ dstOddChromaY1, dstOddChromaY2, dstChromaU1,
+ dstChromaU2, dstChromaV1, dstChromaV2, roi->width);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_RGBToAVC444YUVv2(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcFormat,
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst1[],
+ const UINT32 dst1Step[], BYTE* WINPR_RESTRICT pDst2[],
+ const UINT32 dst2Step[],
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (srcFormat)
+ {
+ case PIXEL_FORMAT_BGRX32:
+ case PIXEL_FORMAT_BGRA32:
+ return ssse3_RGBToAVC444YUVv2_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
+ dst2Step, roi);
+
+ default:
+ return generic->RGBToAVC444YUVv2(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2,
+ dst2Step, roi);
+ }
+}
+
+static pstatus_t ssse3_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[],
+ const UINT32 srcStep[], BYTE* WINPR_RESTRICT pDstRaw[],
+ const UINT32 dstStep[], const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfWidth = (nWidth + 1) / 2;
+ const UINT32 halfPad = halfWidth % 16;
+ const UINT32 halfHeight = (nHeight + 1) / 2;
+ const UINT32 oddY = 1;
+ const UINT32 evenY = 0;
+ const UINT32 oddX = 1;
+ const UINT32 evenX = 0;
+ const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+ pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+ pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+ BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+ pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+ pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+
+ /* Y data is already here... */
+ /* B1 */
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ const BYTE* Ym = pSrc[0] + srcStep[0] * y;
+ BYTE* pY = pDst[0] + dstStep[0] * y;
+ memcpy(pY, Ym, nWidth);
+ }
+
+ /* The first half of the U and V data is already part of this frame. */
+ /* B2 and B3 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const UINT32 val2y = (2 * y + evenY);
+ const UINT32 val2y1 = val2y + oddY;
+ const BYTE* Um = pSrc[1] + srcStep[1] * y;
+ const BYTE* Vm = pSrc[2] + srcStep[2] * y;
+ BYTE* pU = pDst[1] + dstStep[1] * val2y;
+ BYTE* pV = pDst[2] + dstStep[2] * val2y;
+ BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
+ BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
+
+ UINT32 x = 0;
+ for (; x < halfWidth - halfPad; x += 16)
+ {
+ const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+ const __m128i unpackLow =
+ _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
+ {
+ const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]);
+ const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
+ const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
+ _mm_storeu_si128((__m128i*)&pU[2 * x], uHigh);
+ _mm_storeu_si128((__m128i*)&pU[2 * x + 16], uLow);
+ _mm_storeu_si128((__m128i*)&pU1[2 * x], uHigh);
+ _mm_storeu_si128((__m128i*)&pU1[2 * x + 16], uLow);
+ }
+ {
+ const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]);
+ const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
+ const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
+ _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh);
+ _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow);
+ _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh);
+ _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow);
+ }
+ }
+
+ for (; x < halfWidth; x++)
+ {
+ const UINT32 val2x = 2 * x + evenX;
+ const UINT32 val2x1 = val2x + oddX;
+ pU[val2x] = Um[x];
+ pV[val2x] = Vm[x];
+ pU[val2x1] = Um[x];
+ pV[val2x1] = Vm[x];
+ pU1[val2x] = Um[x];
+ pV1[val2x] = Vm[x];
+ pU1[val2x1] = Um[x];
+ pV1[val2x1] = Vm[x];
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
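+/* AVC444 chroma reconstruction: the chroma samples delivered with the luma
+ * frame are effectively the 2x2 block averages, while the auxiliary frames
+ * deliver the other three samples of each block. With avg = (a+b+c+d) / 4 the
+ * top-left sample is recovered as a = 4 * avg - b - c - d, which is what this
+ * filter computes: uEven4 is the even (averaged) column shifted left by 2,
+ * from which the odd column and both samples of the following row are
+ * subtracted, saturating when the result is packed back to bytes.
+ */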
+static INLINE void ssse3_filter(BYTE* WINPR_RESTRICT pSrcDst, const BYTE* WINPR_RESTRICT pSrc2)
+{
+ const __m128i even = _mm_set_epi8((char)0x80, 14, (char)0x80, 12, (char)0x80, 10, (char)0x80, 8,
+ (char)0x80, 6, (char)0x80, 4, (char)0x80, 2, (char)0x80, 0);
+ const __m128i odd = _mm_set_epi8((char)0x80, 15, (char)0x80, 13, (char)0x80, 11, (char)0x80, 9,
+ (char)0x80, 7, (char)0x80, 5, (char)0x80, 3, (char)0x80, 1);
+ const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
+ const __m128i u = _mm_loadu_si128((const __m128i*)pSrcDst);
+ const __m128i u1 = _mm_loadu_si128((const __m128i*)pSrc2);
+ const __m128i uEven = _mm_shuffle_epi8(u, even);
+ const __m128i uEven4 = _mm_slli_epi16(uEven, 2);
+ const __m128i uOdd = _mm_shuffle_epi8(u, odd);
+ const __m128i u1Even = _mm_shuffle_epi8(u1, even);
+ const __m128i u1Odd = _mm_shuffle_epi8(u1, odd);
+ const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even);
+ const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd);
+ const __m128i result = _mm_sub_epi16(uEven4, tmp2);
+ const __m128i packed = _mm_packus_epi16(result, uOdd);
+ const __m128i interleaved = _mm_shuffle_epi8(packed, interleave);
+ _mm_storeu_si128((__m128i*)pSrcDst, interleaved);
+}
+
+static pstatus_t ssse3_ChromaFilter(BYTE* WINPR_RESTRICT pDst[], const UINT32 dstStep[],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 oddY = 1;
+ const UINT32 evenY = 0;
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfHeight = (nHeight + 1) / 2;
+ const UINT32 halfWidth = (nWidth + 1) / 2;
+ const UINT32 halfPad = halfWidth % 16;
+
+ /* Filter */
+ for (UINT32 y = roi->top; y < halfHeight + roi->top; y++)
+ {
+ UINT32 x = roi->left;
+ const UINT32 val2y = (y * 2 + evenY);
+ const UINT32 val2y1 = val2y + oddY;
+ BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;
+ BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;
+ BYTE* pU = pDst[1] + dstStep[1] * val2y;
+ BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+ if (val2y1 > nHeight)
+ continue;
+
+ for (; x < halfWidth + roi->left - halfPad; x += 16)
+ {
+ ssse3_filter(&pU[2 * x], &pU1[2 * x]);
+ ssse3_filter(&pV[2 * x], &pV1[2 * x]);
+ }
+
+ for (; x < halfWidth + roi->left; x++)
+ {
+ const UINT32 val2x = (x * 2);
+ const UINT32 val2x1 = val2x + 1;
+ const BYTE inU = pU[val2x];
+ const BYTE inV = pV[val2x];
+ const INT32 up = inU * 4;
+ const INT32 vp = inV * 4;
+ INT32 u2020 = 0;
+ INT32 v2020 = 0;
+
+ if (val2x1 > nWidth)
+ continue;
+
+ u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
+ v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
+ pU[val2x] = CONDITIONAL_CLIP(u2020, inU);
+ pV[val2x] = CONDITIONAL_CLIP(v2020, inV);
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
+ const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
+ const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 mod = 16;
+ UINT32 uY = 0;
+ UINT32 vY = 0;
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfWidth = (nWidth + 1) / 2;
+ const UINT32 halfPad = halfWidth % 16;
+ const UINT32 halfHeight = (nHeight + 1) / 2;
+ const UINT32 oddY = 1;
+ const UINT32 evenY = 0;
+ const UINT32 oddX = 1;
+ /* The auxiliary frame is aligned to multiples of 16x16.
+ * We need the padded height for B4 and B5 conversion. */
+ const UINT32 padHeight = nHeight + 16 - nHeight % 16;
+ const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left,
+ pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,
+ pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };
+ BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left,
+ pDstRaw[1] + roi->top * dstStep[1] + roi->left,
+ pDstRaw[2] + roi->top * dstStep[2] + roi->left };
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i mask = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
+ (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
+
+ /* The second half of U and V is a bit more tricky... */
+ /* B4 and B5 */
+ for (UINT32 y = 0; y < padHeight; y++)
+ {
+ const BYTE* Ya = pSrc[0] + srcStep[0] * y;
+ BYTE* pX = NULL;
+
+ if ((y) % mod < (mod + 1) / 2)
+ {
+ const UINT32 pos = (2 * uY++ + oddY);
+
+ if (pos >= nHeight)
+ continue;
+
+ pX = pDst[1] + dstStep[1] * pos;
+ }
+ else
+ {
+ const UINT32 pos = (2 * vY++ + oddY);
+
+ if (pos >= nHeight)
+ continue;
+
+ pX = pDst[2] + dstStep[2] * pos;
+ }
+
+ memcpy(pX, Ya, nWidth);
+ }
+
+ /* B6 and B7 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const UINT32 val2y = (y * 2 + evenY);
+ const BYTE* Ua = pSrc[1] + srcStep[1] * y;
+ const BYTE* Va = pSrc[2] + srcStep[2] * y;
+ BYTE* pU = pDst[1] + dstStep[1] * val2y;
+ BYTE* pV = pDst[2] + dstStep[2] * val2y;
+
+ UINT32 x = 0;
+ for (; x < halfWidth - halfPad; x += 16)
+ {
+ {
+ const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]);
+ const __m128i u2 = _mm_unpackhi_epi8(u, zero);
+ const __m128i u1 = _mm_unpacklo_epi8(u, zero);
+ _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
+ _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
+ }
+ {
+ const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]);
+ const __m128i u2 = _mm_unpackhi_epi8(u, zero);
+ const __m128i u1 = _mm_unpacklo_epi8(u, zero);
+ _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
+ _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
+ }
+ }
+
+ for (; x < halfWidth; x++)
+ {
+ const UINT32 val2x1 = (x * 2 + oddX);
+ pU[val2x1] = Ua[x];
+ pV[val2x1] = Va[x];
+ }
+ }
+
+ /* Filter */
+ return ssse3_ChromaFilter(pDst, dstStep, roi);
+}
+
+static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], UINT32 nTotalWidth,
+ UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3],
+ const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ const UINT32 nWidth = roi->right - roi->left;
+ const UINT32 nHeight = roi->bottom - roi->top;
+ const UINT32 halfWidth = (nWidth + 1) / 2;
+ const UINT32 halfPad = halfWidth % 16;
+ const UINT32 halfHeight = (nHeight + 1) / 2;
+ const UINT32 quarterWidth = (nWidth + 3) / 4;
+ const UINT32 quarterPad = quarterWidth % 16;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i mask = _mm_set_epi8((char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0,
+ (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0);
+ const __m128i mask2 = _mm_set_epi8(0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80,
+ 0, (char)0x80, 0, (char)0x80, 0, (char)0x80, 0, (char)0x80);
+ const __m128i shuffle1 =
+ _mm_set_epi8((char)0x80, 15, (char)0x80, 14, (char)0x80, 13, (char)0x80, 12, (char)0x80, 11,
+ (char)0x80, 10, (char)0x80, 9, (char)0x80, 8);
+ const __m128i shuffle2 =
+ _mm_set_epi8((char)0x80, 7, (char)0x80, 6, (char)0x80, 5, (char)0x80, 4, (char)0x80, 3,
+ (char)0x80, 2, (char)0x80, 1, (char)0x80, 0);
+
+ /* B4 and B5: odd UV values for width/2, height */
+ for (UINT32 y = 0; y < nHeight; y++)
+ {
+ const UINT32 yTop = y + roi->top;
+ const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;
+ const BYTE* pYaV = pYaU + nTotalWidth / 2;
+ BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;
+ BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;
+
+ UINT32 x = 0;
+ for (; x < halfWidth - halfPad; x += 16)
+ {
+ {
+ const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]);
+ const __m128i u2 = _mm_unpackhi_epi8(zero, u);
+ const __m128i u1 = _mm_unpacklo_epi8(zero, u);
+ _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
+ _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
+ }
+ {
+ const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]);
+ const __m128i v2 = _mm_unpackhi_epi8(zero, v);
+ const __m128i v1 = _mm_unpacklo_epi8(zero, v);
+ _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
+ _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
+ }
+ }
+
+ for (; x < halfWidth; x++)
+ {
+ const UINT32 odd = 2 * x + 1;
+ pU[odd] = pYaU[x];
+ pV[odd] = pYaV[x];
+ }
+ }
+
+ /* B6 - B9 */
+ for (UINT32 y = 0; y < halfHeight; y++)
+ {
+ const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;
+ const BYTE* pUaV = pUaU + nTotalWidth / 4;
+ const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;
+ const BYTE* pVaV = pVaU + nTotalWidth / 4;
+ BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;
+ BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;
+
+ UINT32 x = 0;
+ for (; x < quarterWidth - quarterPad; x += 16)
+ {
+ {
+ const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]);
+ const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]);
+ const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
+ const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
+ const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
+ const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
+ const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
+ const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
+ _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
+ _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
+ _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
+ _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
+ }
+ {
+ const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]);
+ const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]);
+ const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
+ const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
+ const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
+ const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
+ const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
+ const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
+ _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
+ _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
+ _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
+ _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
+ }
+ }
+
+ for (; x < quarterWidth; x++)
+ {
+ pU[4 * x + 0] = pUaU[x];
+ pV[4 * x + 0] = pUaV[x];
+ pU[4 * x + 2] = pVaU[x];
+ pV[4 * x + 2] = pVaV[x];
+ }
+ }
+
+ return ssse3_ChromaFilter(pDst, dstStep, roi);
+}
+
+static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
+ const BYTE* const WINPR_RESTRICT pSrc[3],
+ const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,
+ BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],
+ const RECTANGLE_16* WINPR_RESTRICT roi)
+{
+ if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])
+ return -1;
+
+ if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])
+ return -1;
+
+ if (!roi)
+ return -1;
+
+ switch (type)
+ {
+ case AVC444_LUMA:
+ return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+ case AVC444_CHROMAv1:
+ return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);
+
+ case AVC444_CHROMAv2:
+ return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);
+
+ default:
+ return -1;
+ }
+}
+
+void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_YUV(prims);
+
+ if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
+ IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
+ prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV;
+ prims->RGBToAVC444YUVv2 = ssse3_RGBToAVC444YUVv2;
+ prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
+ prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
+ prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
+ }
+}
diff --git a/libfreerdp/primitives/prim_add.c b/libfreerdp/primitives/prim_add.c
new file mode 100644
index 0000000..674e04f
--- /dev/null
+++ b/libfreerdp/primitives/prim_add.c
@@ -0,0 +1,48 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Add operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+/* ----------------------------------------------------------------------------
+ * 16-bit signed add with saturation (under and over).
+ */
+static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16* pDst, UINT32 len)
+{
+ while (len--)
+ {
+ INT32 k = (INT32)(*pSrc1++) + (INT32)(*pSrc2++);
+
+ if (k > 32767)
+ *pDst++ = ((INT16)32767);
+ else if (k < -32768)
+ *pDst++ = ((INT16)-32768);
+ else
+ *pDst++ = (INT16)k;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_add(primitives_t* prims)
+{
+ prims->add_16s = general_add_16s;
+}
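+
+/* Usage sketch (illustrative only, not part of the library): callers fetch
+ * the dispatch table and invoke the operation through its function pointer.
+ *
+ * INT16 a[64] = { 0 };
+ * INT16 b[64] = { 0 };
+ * INT16 d[64] = { 0 };
+ * primitives_t* prims = primitives_get();
+ * prims->add_16s(a, b, d, 64);
+ *
+ * The same call transparently reaches this generic version or an optimized
+ * replacement installed by primitives_init_add_opt().
+ */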
diff --git a/libfreerdp/primitives/prim_add_opt.c b/libfreerdp/primitives/prim_add_opt.c
new file mode 100644
index 0000000..88c8b66
--- /dev/null
+++ b/libfreerdp/primitives/prim_add_opt.c
@@ -0,0 +1,61 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized add operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+#include <ipps.h>
+#endif /* WITH_IPP */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+static primitives_t* generic = NULL;
+
+#ifdef WITH_SSE2
+#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+/* ------------------------------------------------------------------------- */
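+/* SSE3_SSD_ROUTINE (see prim_templates.h) expands to a complete
+ * source/source/destination function body: short runs are delegated to the
+ * named generic routine, unaligned data is loaded with LDDQU (hence the SSE3
+ * requirement below), the bulk is processed with the given intrinsic
+ * (_mm_adds_epi16 saturates eight 16 bit lanes per register), and leftover
+ * elements run through the per-element expression. */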
+SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
+ generic->add_16s(sptr1++, sptr2++, dptr++, 1))
+#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_add(prims);
+#ifdef WITH_IPP
+ prims->add_16s = (__add_16s_t)ippsAdd_16s;
+#elif defined(WITH_SSE2)
+
+ if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+ IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
+ {
+ prims->add_16s = sse3_add_16s;
+ }
+
+#endif
+}
diff --git a/libfreerdp/primitives/prim_alphaComp.c b/libfreerdp/primitives/prim_alphaComp.c
new file mode 100644
index 0000000..fe4f8dc
--- /dev/null
+++ b/libfreerdp/primitives/prim_alphaComp.c
@@ -0,0 +1,94 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Alpha blending routines.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ * Note: this code assumes the second operand is fully opaque,
+ * e.g.
+ * newval = alpha1*val1 + (1-alpha1)*val2
+ * rather than
+ * newval = alpha1*val1 + (1-alpha1)*alpha2*val2
+ * The IPP gives other options.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+#define ALPHA(_k_) (((_k_)&0xFF000000U) >> 24)
+#define RED(_k_) (((_k_)&0x00FF0000U) >> 16)
+#define GRN(_k_) (((_k_)&0x0000FF00U) >> 8)
+#define BLU(_k_) (((_k_)&0x000000FFU))
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t general_alphaComp_argb(const BYTE* pSrc1, UINT32 src1Step, const BYTE* pSrc2,
+ UINT32 src2Step, BYTE* pDst, UINT32 dstStep, UINT32 width,
+ UINT32 height)
+{
+ for (UINT32 y = 0; y < height; y++)
+ {
+ const UINT32* sptr1 = (const UINT32*)(pSrc1 + y * src1Step);
+ const UINT32* sptr2 = (const UINT32*)(pSrc2 + y * src2Step);
+ UINT32* dptr = (UINT32*)(pDst + y * dstStep);
+
+ for (UINT32 x = 0; x < width; x++)
+ {
+ const UINT32 src1 = *sptr1++;
+ const UINT32 src2 = *sptr2++;
+ UINT32 alpha = ALPHA(src1) + 1;
+
+ if (alpha == 256)
+ {
+ /* If alpha is 255+1, just copy src1. */
+ *dptr++ = src1;
+ }
+ else if (alpha <= 1)
+ {
+ /* If alpha is 0+1, just copy src2. */
+ *dptr++ = src2;
+ }
+ else
+ {
+ /* A perfectly accurate blend would do (a*src + (255-a)*dst)/255
+ * rather than adding one to alpha and dividing by 256, but this
+ * is much faster and only differs by one about 16% of the time.
+ * I'm not sure who first designed the paired-channel trick of
+ * processing Red/Blue and Alpha/Green together.
+ */
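+ /* Worked example of the off-by-one: alpha = 100, src1 = 200,
+ * src2 = 50. Exact: (100 * 200 + 155 * 50) / 255 = 108.
+ * Approximation: ((101 * 150) >> 8) + 50 = 59 + 50 = 109. */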
+ UINT32 rb = 0;
+ UINT32 ag = 0;
+ UINT32 s2rb = src2 & 0x00FF00FFU;
+ UINT32 s2ag = (src2 >> 8) & 0x00FF00FFU;
+ UINT32 s1rb = src1 & 0x00FF00FFU;
+ UINT32 s1ag = (src1 >> 8) & 0x00FF00FFU;
+ UINT32 drb = s1rb - s2rb;
+ UINT32 dag = s1ag - s2ag;
+ drb *= alpha;
+ dag *= alpha;
+ rb = ((drb >> 8) + s2rb) & 0x00FF00FFU;
+ ag = (((dag >> 8) + s2ag) << 8) & 0xFF00FF00U;
+ *dptr++ = rb | ag;
+ }
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_alphaComp(primitives_t* prims)
+{
+ prims->alphaComp_argb = general_alphaComp_argb;
+}
diff --git a/libfreerdp/primitives/prim_alphaComp_opt.c b/libfreerdp/primitives/prim_alphaComp_opt.c
new file mode 100644
index 0000000..2c675a4
--- /dev/null
+++ b/libfreerdp/primitives/prim_alphaComp_opt.c
@@ -0,0 +1,245 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized alpha blending routines.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ * Note: this code assumes the second operand is fully opaque,
+ * e.g.
+ * newval = alpha1*val1 + (1-alpha1)*val2
+ * rather than
+ * newval = alpha1*val1 + (1-alpha1)*alpha2*val2
+ * The IPP gives other options.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+#include <ippi.h>
+#endif /* WITH_IPP */
+
+#include "prim_internal.h"
+
+static primitives_t* generic = NULL;
+
+/* ------------------------------------------------------------------------- */
+#ifdef WITH_SSE2
+#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+
+static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
+ const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
+ BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 width,
+ UINT32 height)
+{
+ const UINT32* sptr1 = (const UINT32*)pSrc1;
+ const UINT32* sptr2 = (const UINT32*)pSrc2;
+ UINT32* dptr = NULL;
+ int linebytes = 0;
+ int src1Jump = 0;
+ int src2Jump = 0;
+ int dstJump = 0;
+ __m128i xmm0;
+ __m128i xmm1;
+
+ if ((width <= 0) || (height <= 0))
+ return PRIMITIVES_SUCCESS;
+
+ if (width < 4) /* pointless if too small */
+ {
+ return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width,
+ height);
+ }
+
+ dptr = (UINT32*)pDst;
+ linebytes = width * sizeof(UINT32);
+ src1Jump = (src1Step - linebytes) / sizeof(UINT32);
+ src2Jump = (src2Step - linebytes) / sizeof(UINT32);
+ dstJump = (dstStep - linebytes) / sizeof(UINT32);
+ xmm0 = _mm_set1_epi32(0);
+ xmm1 = _mm_set1_epi16(1);
+
+ for (UINT32 y = 0; y < height; ++y)
+ {
+ int pixels = width;
+ int count = 0;
+ /* Get to the 16-byte boundary now. */
+ int leadIn = 0;
+
+ switch ((ULONG_PTR)dptr & 0x0f)
+ {
+ case 0:
+ leadIn = 0;
+ break;
+
+ case 4:
+ leadIn = 3;
+ break;
+
+ case 8:
+ leadIn = 2;
+ break;
+
+ case 12:
+ leadIn = 1;
+ break;
+
+ default:
+ /* We'll never hit a 16-byte boundary, so do the whole
+ * thing the slow way.
+ */
+ leadIn = width;
+ break;
+ }
+
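+		/* Equivalently, for a 4-byte-aligned dptr the switch above computes
+		 * leadIn = (4 - (((ULONG_PTR)dptr & 0x0f) >> 2)) & 3, i.e. the number
+		 * of single pixels to emit before the next 16-byte boundary.
+		 */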
+ if (leadIn)
+ {
+ pstatus_t status = 0;
+ status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
+ src2Step, (BYTE*)dptr, dstStep, leadIn, 1);
+ if (status != PRIMITIVES_SUCCESS)
+ return status;
+
+ sptr1 += leadIn;
+ sptr2 += leadIn;
+ dptr += leadIn;
+ pixels -= leadIn;
+ }
+
+ /* Use SSE registers to do 4 pixels at a time. */
+ count = pixels >> 2;
+ pixels -= count << 2;
+
+ while (count--)
+ {
+ __m128i xmm2;
+ __m128i xmm3;
+ __m128i xmm4;
+ __m128i xmm5;
+ __m128i xmm6;
+ __m128i xmm7;
+ /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
+ xmm2 = LOAD_SI128(sptr1);
+ sptr1 += 4;
+ /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
+ xmm3 = LOAD_SI128(sptr2);
+ sptr2 += 4;
+			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
+			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
+			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
+			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
+			/* subtract */
+			xmm6 = _mm_subs_epi16(xmm4, xmm5);
+			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
+			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
+			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
+			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
+ /* Add one to alphas */
+ xmm4 = _mm_adds_epi16(xmm4, xmm1);
+ /* Multiply and take low word */
+ xmm4 = _mm_mullo_epi16(xmm4, xmm6);
+ /* Shift 8 right */
+ xmm4 = _mm_srai_epi16(xmm4, 8);
+ /* Add xmm5 */
+ xmm4 = _mm_adds_epi16(xmm4, xmm5);
+			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */
+			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
+			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
+			/* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
+			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
+			/* subtract */
+			xmm7 = _mm_subs_epi16(xmm5, xmm6);
+			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
+			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
+			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
+			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
+ /* Add one to alphas */
+ xmm5 = _mm_adds_epi16(xmm5, xmm1);
+ /* Multiply and take low word */
+ xmm5 = _mm_mullo_epi16(xmm5, xmm7);
+ /* Shift 8 right */
+ xmm5 = _mm_srai_epi16(xmm5, 8);
+ /* Add xmm6 */
+ xmm5 = _mm_adds_epi16(xmm5, xmm6);
+			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
+ /* Must mask off remainders or pack gets confused */
+ xmm3 = _mm_set1_epi16(0x00ffU);
+ xmm4 = _mm_and_si128(xmm4, xmm3);
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
+ xmm5 = _mm_packus_epi16(xmm5, xmm4);
+ _mm_store_si128((__m128i*)dptr, xmm5);
+ dptr += 4;
+ }
+
+ /* Finish off the remainder. */
+ if (pixels)
+ {
+ pstatus_t status = 0;
+ status = generic->alphaComp_argb((const BYTE*)sptr1, src1Step, (const BYTE*)sptr2,
+ src2Step, (BYTE*)dptr, dstStep, pixels, 1);
+ if (status != PRIMITIVES_SUCCESS)
+ return status;
+
+ sptr1 += pixels;
+ sptr2 += pixels;
+ dptr += pixels;
+ }
+
+ /* Jump to next row. */
+ sptr1 += src1Jump;
+ sptr2 += src2Jump;
+ dptr += dstJump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+#ifdef WITH_IPP
+/* ------------------------------------------------------------------------- */
+static pstatus_t ipp_alphaComp_argb(const BYTE* pSrc1, INT32 src1Step, const BYTE* pSrc2,
+ INT32 src2Step, BYTE* pDst, INT32 dstStep, INT32 width,
+ INT32 height)
+{
+ IppiSize sz;
+ sz.width = width;
+ sz.height = height;
+ return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, sz, ippAlphaOver);
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_alphaComp(prims);
+#ifdef WITH_IPP
+ prims->alphaComp_argb = ipp_alphaComp_argb;
+#elif defined(WITH_SSE2)
+
+ if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+ IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
+ {
+ prims->alphaComp_argb = sse2_alphaComp_argb;
+ }
+
+#endif
+}
diff --git a/libfreerdp/primitives/prim_andor.c b/libfreerdp/primitives/prim_andor.c
new file mode 100644
index 0000000..9216546
--- /dev/null
+++ b/libfreerdp/primitives/prim_andor.c
@@ -0,0 +1,57 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Logical operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+/* ----------------------------------------------------------------------------
+ * 32-bit AND with a constant.
+ */
+static pstatus_t general_andC_32u(const UINT32* pSrc, UINT32 val, UINT32* pDst, INT32 len)
+{
+	/* Note: val == 0 is treated as a no-op (pDst is left untouched) rather
+	 * than forcing the destination to zero as a strict bitwise AND would. */
+	if (val == 0)
+		return PRIMITIVES_SUCCESS;
+
+ while (len--)
+ *pDst++ = *pSrc++ & val;
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ----------------------------------------------------------------------------
+ * 32-bit OR with a constant.
+ */
+static pstatus_t general_orC_32u(const UINT32* pSrc, UINT32 val, UINT32* pDst, INT32 len)
+{
+	/* Note: as with andC, val == 0 returns early without copying pSrc
+	 * through to pDst. */
+	if (val == 0)
+		return PRIMITIVES_SUCCESS;
+
+ while (len--)
+ *pDst++ = *pSrc++ | val;
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_andor(primitives_t* prims)
+{
+ /* Start with the default. */
+ prims->andC_32u = general_andC_32u;
+ prims->orC_32u = general_orC_32u;
+}
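+
+/* Editorial usage sketch (not part of the upstream sources): callers never
+ * invoke general_andC_32u directly; they go through the function-pointer
+ * table so an optimized variant is picked up transparently when available.
+ * Assumes the public primitives_get() accessor; the function name below is
+ * hypothetical and the block is excluded from the build.
+ */
+#if 0
+static void example_mask_pixels(UINT32* buffer, INT32 count)
+{
+	primitives_t* prims = primitives_get();
+
+	/* Clear the alpha byte of each 32-bit pixel in place. */
+	prims->andC_32u(buffer, 0x00FFFFFFU, buffer, count);
+}
+#endif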
diff --git a/libfreerdp/primitives/prim_andor_opt.c b/libfreerdp/primitives/prim_andor_opt.c
new file mode 100644
index 0000000..bc51f1c
--- /dev/null
+++ b/libfreerdp/primitives/prim_andor_opt.c
@@ -0,0 +1,63 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized Logical operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+#include <ipps.h>
+#endif /* WITH_IPP */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+static primitives_t* generic = NULL;
+
+#ifdef WITH_SSE2
+#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128,
+ *dptr++ = *sptr++ & val)
+SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr++ = *sptr++ | val)
+#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
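+/* Editorial sketch (not part of the upstream sources): the rough shape of
+ * code a SSE3_SCD_PRE_ROUTINE expansion produces, written out by hand for
+ * the AND case. The real template in prim_templates.h also routes short or
+ * awkwardly aligned buffers to the generic fallback; this is an
+ * assumption-laden illustration, not the actual macro expansion, and it is
+ * excluded from the build.
+ */
+#if 0
+static pstatus_t example_andC_32u_shape(const UINT32* pSrc, UINT32 val, UINT32* pDst, INT32 len)
+{
+	const __m128i xval = _mm_set1_epi32((int)val);
+
+	/* Bulk: four 32-bit words per 128-bit register. */
+	while (len >= 4)
+	{
+		const __m128i x = _mm_lddqu_si128((const __m128i*)pSrc); /* SSE3 unaligned load */
+		_mm_storeu_si128((__m128i*)pDst, _mm_and_si128(x, xval));
+		pSrc += 4;
+		pDst += 4;
+		len -= 4;
+	}
+
+	/* Tail: remaining words one at a time. */
+	while (len-- > 0)
+		*pDst++ = *pSrc++ & val;
+
+	return PRIMITIVES_SUCCESS;
+}
+#endif
+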
+/* ------------------------------------------------------------------------- */
+void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_andor(prims);
+#if defined(WITH_IPP)
+ prims->andC_32u = (__andC_32u_t)ippsAndC_32u;
+ prims->orC_32u = (__orC_32u_t)ippsOrC_32u;
+#elif defined(WITH_SSE2)
+
+ if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+ IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->andC_32u = sse3_andC_32u;
+ prims->orC_32u = sse3_orC_32u;
+ }
+
+#endif
+}
diff --git a/libfreerdp/primitives/prim_colors.c b/libfreerdp/primitives/prim_colors.c
new file mode 100644
index 0000000..4a23129
--- /dev/null
+++ b/libfreerdp/primitives/prim_colors.c
@@ -0,0 +1,509 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * Copyright 2011 Stephen Erisman
+ * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
+ * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <freerdp/codec/color.h>
+
+#include "prim_internal.h"
+
+#ifndef MINMAX
+#define MINMAX(_v_, _l_, _h_) ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
+#endif /* !MINMAX */
+/* ------------------------------------------------------------------------- */
+static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* const WINPR_RESTRICT pSrc[3],
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ BYTE* pRGB = pDst;
+ const INT16* pY = pSrc[0];
+ const INT16* pCb = pSrc[1];
+ const INT16* pCr = pSrc[2];
+ const size_t srcPad = (srcStep - (roi->width * 2)) / 2;
+ const size_t dstPad = (dstStep - (roi->width * 4));
+ const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+
+ for (UINT32 y = 0; y < roi->height; y++)
+ {
+ for (UINT32 x = 0; x < roi->width; x++)
+ {
+ INT16 R = 0;
+ INT16 G = 0;
+ INT16 B = 0;
+ const INT32 divisor = 16;
+ const INT32 Y = (INT32)((UINT32)((*pY++) + 4096) << divisor);
+ const INT32 Cb = (*pCb++);
+ const INT32 Cr = (*pCr++);
+ const INT64 CrR = Cr * (INT64)(1.402525f * (1 << divisor)) * 1LL;
+ const INT64 CrG = Cr * (INT64)(0.714401f * (1 << divisor)) * 1LL;
+ const INT64 CbG = Cb * (INT64)(0.343730f * (1 << divisor)) * 1LL;
+ const INT64 CbB = Cb * (INT64)(1.769905f * (1 << divisor)) * 1LL;
+ R = ((INT16)((CrR + Y) >> divisor) >> 5);
+ G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
+ B = ((INT16)((CbB + Y) >> divisor) >> 5);
+ pRGB = writePixelBGRX(pRGB, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), 0);
+ }
+
+ pY += srcPad;
+ pCb += srcPad;
+ pCr += srcPad;
+ pRGB += dstPad;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
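+/* Editorial note (not part of the upstream sources): a quick sanity check of
+ * the fixed-point path above. With Y' = 0 and Cb = Cr = 0, Y becomes
+ * (0 + 4096) << 16; undoing the divisor shift (>> 16) and the block scale
+ * (>> 5) yields 4096 >> 5 = 128, i.e. zero luma and zero chroma decode to
+ * mid-gray, which is exactly the +4096 (= 128 << 5) bias at work.
+ */
+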
+static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R_general(const INT16* const WINPR_RESTRICT pSrc[3],
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ BYTE* pRGB = pDst;
+ const INT16* pY = pSrc[0];
+ const INT16* pCb = pSrc[1];
+ const INT16* pCr = pSrc[2];
+ const size_t srcPad = (srcStep - (roi->width * 2)) / 2;
+ const size_t dstPad = (dstStep - (roi->width * 4));
+ const fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);
+ const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+
+ for (UINT32 y = 0; y < roi->height; y++)
+ {
+ for (UINT32 x = 0; x < roi->width; x++)
+ {
+ INT64 R = 0;
+ INT64 G = 0;
+ INT64 B = 0;
+ const INT32 divisor = 16;
+ const INT32 Y = (INT32)((UINT32)((*pY++) + 4096) << divisor);
+ const INT32 Cb = (*pCb++);
+ const INT32 Cr = (*pCr++);
+ const INT64 CrR = Cr * (INT64)(1.402525f * (1 << divisor)) * 1LL;
+ const INT64 CrG = Cr * (INT64)(0.714401f * (1 << divisor)) * 1LL;
+ const INT64 CbG = Cb * (INT64)(0.343730f * (1 << divisor)) * 1LL;
+ const INT64 CbB = Cb * (INT64)(1.769905f * (1 << divisor)) * 1LL;
+ R = (INT64)((CrR + Y) >> (divisor + 5));
+ G = (INT64)((Y - CbG - CrG) >> (divisor + 5));
+ B = (INT64)((CbB + Y) >> (divisor + 5));
+ pRGB = writePixel(pRGB, formatSize, DstFormat, CLIP(R), CLIP(G), CLIP(B), 0);
+ }
+
+ pY += srcPad;
+ pCb += srcPad;
+ pCr += srcPad;
+ pRGB += dstPad;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3],
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return general_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat,
+ roi);
+
+ default:
+ return general_yCbCrToRGB_16s8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat,
+ roi);
+ }
+}
+
+/* ------------------------------------------------------------------------- */
+
+static pstatus_t
+general_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep,
+ INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+	/**
+	 * The decoded YCbCr coefficients are represented as 11.5 fixed-point
+	 * numbers:
+	 *
+	 * 1 sign bit + 10 integer bits + 5 fractional bits
+	 *
+	 * However, only 7 integer bits will actually be used since the value range
+	 * is [-128.0, 127.0]. In other words, the decoded coefficients are scaled
+	 * by << 5 when interpreted as INT16.
+	 * They were scaled up in the quantization phase, so we must scale them
+	 * back here.
+	 */
+ const INT16* yptr = pSrc[0];
+ const INT16* cbptr = pSrc[1];
+ const INT16* crptr = pSrc[2];
+ INT16* rptr = pDst[0];
+ INT16* gptr = pDst[1];
+ INT16* bptr = pDst[2];
+ UINT32 srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+ UINT32 dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+
+ for (UINT32 y = 0; y < roi->height; y++)
+ {
+ for (UINT32 x = 0; x < roi->width; ++x)
+ {
+ /* INT32 is used intentionally because we calculate
+ * with shifted factors!
+ */
+ INT32 cy = (INT32)(*yptr++);
+ INT32 cb = (INT32)(*cbptr++);
+ INT32 cr = (INT32)(*crptr++);
+ INT64 r = 0;
+ INT64 g = 0;
+ INT64 b = 0;
+ /*
+ * This is the slow floating point version kept here for reference.
+			 * y = y + 4096; // 128<<5=4096 so that we can scale the sum by >> 5
+ * r = y + cr*1.403f;
+ * g = y - cb*0.344f - cr*0.714f;
+ * b = y + cb*1.770f;
+ * y_r_buf[i] = CLIP(r>>5);
+ * cb_g_buf[i] = CLIP(g>>5);
+ * cr_b_buf[i] = CLIP(b>>5);
+ */
+ /*
+ * We scale the factors by << 16 into 32-bit integers in order to
+ * avoid slower floating point multiplications. Since the final
+ * result needs to be scaled by >> 5 we will extract only the
+ * upper 11 bits (>> 21) from the final sum.
+ * Hence we also have to scale the other terms of the sum by << 16.
+ * R: 1.403 << 16 = 91947
+ * G: 0.344 << 16 = 22544, 0.714 << 16 = 46792
+ * B: 1.770 << 16 = 115998
+ */
+ cy = (INT32)((UINT32)(cy + 4096) << 16);
+ r = cy + cr * 91947LL;
+ g = cy - cb * 22544LL - cr * 46792LL;
+ b = cy + cb * 115998LL;
+ *rptr++ = CLIP(r >> 21);
+ *gptr++ = CLIP(g >> 21);
+ *bptr++ = CLIP(b >> 21);
+ }
+
+ yptr += srcbump;
+ cbptr += srcbump;
+ crptr += srcbump;
+ rptr += dstbump;
+ gptr += dstbump;
+ bptr += dstbump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
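+/* Editorial sketch (not part of the upstream sources): how the integer
+ * factors in the comment above derive from the float matrix. Truncation
+ * (not rounding) reproduces the constants used by the code. The function
+ * name is hypothetical and the block is excluded from the build.
+ */
+#if 0
+static void example_ycbcr_factors(void)
+{
+	const INT32 r_cr = (INT32)(1.403 * (1 << 16)); /* 91947  */
+	const INT32 g_cb = (INT32)(0.344 * (1 << 16)); /* 22544  */
+	const INT32 g_cr = (INT32)(0.714 * (1 << 16)); /* 46792  */
+	const INT32 b_cb = (INT32)(1.770 * (1 << 16)); /* 115998 */
+	(void)r_cr; (void)g_cb; (void)g_cr; (void)b_cb;
+}
+#endif
+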
+/* ------------------------------------------------------------------------- */
+static pstatus_t
+general_RGBToYCbCr_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep,
+ INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ /* The encoded YCbCr coefficients are represented as 11.5 fixed-point
+ * numbers:
+ *
+ * 1 sign bit + 10 integer bits + 5 fractional bits
+ *
+	 * However, only 7 integer bits will actually be used since the value
+	 * range is [-128.0, 127.0]. In other words, the encoded coefficients
+	 * are scaled by << 5 when interpreted as INT16.
+	 * They will be scaled back to the original range during the quantization
+	 * phase.
+ */
+ const INT16* rptr = pSrc[0];
+ const INT16* gptr = pSrc[1];
+ const INT16* bptr = pSrc[2];
+ INT16* yptr = pDst[0];
+ INT16* cbptr = pDst[1];
+ INT16* crptr = pDst[2];
+ UINT32 srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+ UINT32 dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+
+ for (UINT32 y = 0; y < roi->height; y++)
+ {
+ for (UINT32 x = 0; x < roi->width; ++x)
+ {
+ /* INT32 is used intentionally because we calculate with
+ * shifted factors!
+ */
+ INT32 r = (INT32)(*rptr++);
+ INT32 g = (INT32)(*gptr++);
+ INT32 b = (INT32)(*bptr++);
+ /* We scale the factors by << 15 into 32-bit integers in order
+ * to avoid slower floating point multiplications. Since the
+ * terms need to be scaled by << 5 we simply scale the final
+ * sum by >> 10
+ *
+ * Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235,
+ * 0.114000 << 15 = 3735
+ * Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868,
+ * 0.500590 << 15 = 16403
+ * Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714,
+ * 0.081282 << 15 = 2663
+ */
+ INT32 cy = (r * 9798 + g * 19235 + b * 3735) >> 10;
+ INT32 cb = (r * -5535 + g * -10868 + b * 16403) >> 10;
+ INT32 cr = (r * 16377 + g * -13714 + b * -2663) >> 10;
+ *yptr++ = (INT16)MINMAX(cy - 4096, -4096, 4095);
+ *cbptr++ = (INT16)MINMAX(cb, -4096, 4095);
+ *crptr++ = (INT16)MINMAX(cr, -4096, 4095);
+ }
+
+		/* Advance each plane by its own stride: sources by srcbump,
+		 * destinations by dstbump. */
+		rptr += srcbump;
+		gptr += srcbump;
+		bptr += srcbump;
+		yptr += dstbump;
+		cbptr += dstbump;
+		crptr += dstbump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
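+/* Editorial note (not part of the upstream sources): the >> 10 above leaves
+ * each product scaled by << 5 (15 - 10), which is precisely the 11.5
+ * fixed-point representation described in the comment; subtracting
+ * 4096 (= 128 << 5) then re-centers luma into [-4096, 4095].
+ */
+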
+static INLINE void writeScanlineGeneric(BYTE* dst, DWORD formatSize, UINT32 DstFormat,
+ const INT16* r, const INT16* g, const INT16* b, DWORD width)
+{
+ fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);
+
+ for (UINT32 x = 0; x < width; x++)
+ dst = writePixel(dst, formatSize, DstFormat, *r++, *g++, *b++, 0);
+}
+
+static INLINE void writeScanlineRGB(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+ const INT16* g, const INT16* b, DWORD width)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(DstFormat);
+
+ for (UINT32 x = 0; x < width; x++)
+ {
+ const BYTE R = CLIP(*r++);
+ const BYTE G = CLIP(*g++);
+ const BYTE B = CLIP(*b++);
+ *dst++ = R;
+ *dst++ = G;
+ *dst++ = B;
+ }
+}
+
+static INLINE void writeScanlineBGR(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+ const INT16* g, const INT16* b, DWORD width)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(DstFormat);
+
+ for (UINT32 x = 0; x < width; x++)
+ {
+ const BYTE R = CLIP(*r++);
+ const BYTE G = CLIP(*g++);
+ const BYTE B = CLIP(*b++);
+ *dst++ = B;
+ *dst++ = G;
+ *dst++ = R;
+ }
+}
+
+static INLINE void writeScanlineBGRX(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+ const INT16* g, const INT16* b, DWORD width)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(DstFormat);
+
+ for (UINT32 x = 0; x < width; x++)
+ {
+ const BYTE R = CLIP(*r++);
+ const BYTE G = CLIP(*g++);
+ const BYTE B = CLIP(*b++);
+ *dst++ = B;
+ *dst++ = G;
+ *dst++ = R;
+ *dst++ = 0xFF;
+ }
+}
+
+static INLINE void writeScanlineRGBX(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+ const INT16* g, const INT16* b, DWORD width)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(DstFormat);
+
+ for (UINT32 x = 0; x < width; x++)
+ {
+ const BYTE R = CLIP(*r++);
+ const BYTE G = CLIP(*g++);
+ const BYTE B = CLIP(*b++);
+ *dst++ = R;
+ *dst++ = G;
+ *dst++ = B;
+ *dst++ = 0xFF;
+ }
+}
+
+static INLINE void writeScanlineXBGR(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+ const INT16* g, const INT16* b, DWORD width)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(DstFormat);
+
+ for (UINT32 x = 0; x < width; x++)
+ {
+ const BYTE R = CLIP(*r++);
+ const BYTE G = CLIP(*g++);
+ const BYTE B = CLIP(*b++);
+ *dst++ = 0xFF;
+ *dst++ = B;
+ *dst++ = G;
+ *dst++ = R;
+ }
+}
+
+static INLINE void writeScanlineXRGB(BYTE* dst, DWORD formatSize, UINT32 DstFormat, const INT16* r,
+ const INT16* g, const INT16* b, DWORD width)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(DstFormat);
+
+ for (UINT32 x = 0; x < width; x++)
+ {
+ const BYTE R = CLIP(*r++);
+ const BYTE G = CLIP(*g++);
+ const BYTE B = CLIP(*b++);
+ *dst++ = 0xFF;
+ *dst++ = R;
+ *dst++ = G;
+ *dst++ = B;
+ }
+}
+
+typedef void (*fkt_writeScanline)(BYTE*, DWORD, UINT32, const INT16*, const INT16*, const INT16*,
+ DWORD);
+
+static INLINE fkt_writeScanline getScanlineWriteFunction(DWORD format)
+{
+ switch (format)
+ {
+ case PIXEL_FORMAT_ARGB32:
+ case PIXEL_FORMAT_XRGB32:
+ return writeScanlineXRGB;
+
+ case PIXEL_FORMAT_ABGR32:
+ case PIXEL_FORMAT_XBGR32:
+ return writeScanlineXBGR;
+
+ case PIXEL_FORMAT_RGBA32:
+ case PIXEL_FORMAT_RGBX32:
+ return writeScanlineRGBX;
+
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return writeScanlineBGRX;
+
+ case PIXEL_FORMAT_BGR24:
+ return writeScanlineBGR;
+
+ case PIXEL_FORMAT_RGB24:
+ return writeScanlineRGB;
+
+ default:
+ return writeScanlineGeneric;
+ }
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t general_RGBToRGB_16s8u_P3AC4R_general(
+ const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+ UINT32 srcStep, /* bytes between rows in source data */
+ BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
+ UINT32 dstStep, /* bytes between rows in dest data */
+ UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ const INT16* r = pSrc[0];
+ const INT16* g = pSrc[1];
+ const INT16* b = pSrc[2];
+ const DWORD srcAdd = srcStep / sizeof(INT16);
+ fkt_writeScanline writeScanline = getScanlineWriteFunction(DstFormat);
+ const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+
+ for (UINT32 y = 0; y < roi->height; ++y)
+ {
+ (*writeScanline)(pDst, formatSize, DstFormat, r, g, b, roi->width);
+ pDst += dstStep;
+ r += srcAdd;
+ g += srcAdd;
+ b += srcAdd;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t general_RGBToRGB_16s8u_P3AC4R_BGRX(
+ const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+ UINT32 srcStep, /* bytes between rows in source data */
+ BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
+ UINT32 dstStep, /* bytes between rows in dest data */
+ UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ const INT16* r = pSrc[0];
+ const INT16* g = pSrc[1];
+ const INT16* b = pSrc[2];
+ const DWORD srcAdd = srcStep / sizeof(INT16);
+ const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+
+ for (UINT32 y = 0; y < roi->height; ++y)
+ {
+ writeScanlineBGRX(pDst, formatSize, DstFormat, r, g, b, roi->width);
+ pDst += dstStep;
+ r += srcAdd;
+ g += srcAdd;
+ b += srcAdd;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t general_RGBToRGB_16s8u_P3AC4R(
+ const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+ UINT32 srcStep, /* bytes between rows in source data */
+ BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
+ UINT32 dstStep, /* bytes between rows in dest data */
+ UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return general_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+
+ default:
+ return general_RGBToRGB_16s8u_P3AC4R_general(pSrc, srcStep, pDst, dstStep, DstFormat,
+ roi);
+ }
+}
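+
+/* Editorial usage sketch (not part of the upstream sources): how a caller
+ * drives the planar-to-interleaved conversion above. Buffer setup is elided,
+ * the function name is hypothetical, and the block is excluded from the
+ * build.
+ */
+#if 0
+static pstatus_t example_convert(const INT16* plane_r, const INT16* plane_g,
+                                 const INT16* plane_b, UINT32 srcStep, BYTE* dst, UINT32 dstStep,
+                                 UINT32 width, UINT32 height)
+{
+	primitives_t* prims = primitives_get();
+	const INT16* src[3] = { plane_r, plane_g, plane_b };
+	const prim_size_t roi = { width, height };
+
+	return prims->RGBToRGB_16s8u_P3AC4R(src, srcStep, dst, dstStep, PIXEL_FORMAT_BGRX32, &roi);
+}
+#endif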
+/* ------------------------------------------------------------------------- */
+void primitives_init_colors(primitives_t* prims)
+{
+ prims->yCbCrToRGB_16s8u_P3AC4R = general_yCbCrToRGB_16s8u_P3AC4R;
+ prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3;
+ prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
+ prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R;
+}
diff --git a/libfreerdp/primitives/prim_colors_opt.c b/libfreerdp/primitives/prim_colors_opt.c
new file mode 100644
index 0000000..60debc3
--- /dev/null
+++ b/libfreerdp/primitives/prim_colors_opt.c
@@ -0,0 +1,1591 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * Copyright 2011 Stephen Erisman
+ * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
+ * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#elif defined(WITH_NEON)
+#include <arm_neon.h>
+#endif /* WITH_SSE2 else WITH_NEON */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+static primitives_t* generic = NULL;
+
+#ifdef WITH_SSE2
+
+#ifdef __GNUC__
+#define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+#else
+#define GNU_INLINE
+#endif
+
+#define CACHE_LINE_BYTES 64
+
+#define _mm_between_epi16(_val, _min, _max) \
+ do \
+ { \
+ _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); \
+ } while (0)
+
+#ifdef DO_PREFETCH
+/*---------------------------------------------------------------------------*/
+static inline void GNU_INLINE _mm_prefetch_buffer(char* WINPR_RESTRICT buffer, int num_bytes)
+{
+ __m128i* buf = (__m128i*)buffer;
+
+ for (unsigned int i = 0; i < (num_bytes / sizeof(__m128i));
+ i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+ {
+ _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
+ }
+}
+#endif /* DO_PREFETCH */
+
+/*---------------------------------------------------------------------------*/
+static pstatus_t
+sse2_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], int srcStep,
+ INT16* WINPR_RESTRICT pDst[3], int dstStep,
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ __m128i zero;
+ __m128i max;
+ __m128i r_cr;
+ __m128i g_cb;
+ __m128i g_cr;
+ __m128i b_cb;
+ __m128i c4096;
+ const __m128i* y_buf = NULL;
+ const __m128i* cb_buf = NULL;
+ const __m128i* cr_buf = NULL;
+ __m128i* r_buf = NULL;
+ __m128i* g_buf = NULL;
+ __m128i* b_buf = NULL;
+ int srcbump = 0;
+ int dstbump = 0;
+ int imax = 0;
+
+ if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
+ ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
+ ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
+ (srcStep & 127) || (dstStep & 127))
+ {
+ /* We can't maintain 16-byte alignment. */
+ return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
+ }
+
+ zero = _mm_setzero_si128();
+ max = _mm_set1_epi16(255);
+ y_buf = (const __m128i*)(pSrc[0]);
+ cb_buf = (const __m128i*)(pSrc[1]);
+ cr_buf = (const __m128i*)(pSrc[2]);
+ r_buf = (__m128i*)(pDst[0]);
+ g_buf = (__m128i*)(pDst[1]);
+ b_buf = (__m128i*)(pDst[2]);
+ r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
+ g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
+ g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
+ b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
+ c4096 = _mm_set1_epi16(4096);
+ srcbump = srcStep / sizeof(__m128i);
+ dstbump = dstStep / sizeof(__m128i);
+#ifdef DO_PREFETCH
+
+ /* Prefetch Y's, Cb's, and Cr's. */
+ for (UINT32 yp = 0; yp < roi->height; yp++)
+ {
+ for (int i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
+ i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+ {
+ _mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
+ _mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
+ _mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
+ }
+
+ y_buf += srcbump;
+ cb_buf += srcbump;
+ cr_buf += srcbump;
+ }
+
+ y_buf = (__m128i*)(pSrc[0]);
+ cb_buf = (__m128i*)(pSrc[1]);
+ cr_buf = (__m128i*)(pSrc[2]);
+#endif /* DO_PREFETCH */
+ imax = roi->width * sizeof(INT16) / sizeof(__m128i);
+
+ for (UINT32 yp = 0; yp < roi->height; ++yp)
+ {
+ for (int i = 0; i < imax; i++)
+ {
+ /* In order to use SSE2 signed 16-bit integer multiplication
+ * we need to convert the floating point factors to signed int
+ * without losing information.
+ * The result of this multiplication is 32 bit and we have two
+ * SSE instructions that return either the hi or lo word.
+ * Thus we will multiply the factors by the highest possible 2^n,
+ * take the upper 16 bits of the signed 32-bit result
+ * (_mm_mulhi_epi16) and correct this result by multiplying
+ * it by 2^(16-n).
+ *
+ * For the given factors in the conversion matrix the best
+ * possible n is 14.
+ *
+ * Example for calculating r:
+ * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
+ * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
+ * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
+ * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
+ */
+ /* y = (y_r_buf[i] + 4096) >> 2 */
+ __m128i y;
+ __m128i cb;
+ __m128i cr;
+ __m128i r;
+ __m128i g;
+ __m128i b;
+ y = _mm_load_si128(y_buf + i);
+ y = _mm_add_epi16(y, c4096);
+ y = _mm_srai_epi16(y, 2);
+ /* cb = cb_g_buf[i]; */
+ cb = _mm_load_si128(cb_buf + i);
+ /* cr = cr_b_buf[i]; */
+ cr = _mm_load_si128(cr_buf + i);
+ /* (y + HIWORD(cr*22986)) >> 3 */
+ r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
+ r = _mm_srai_epi16(r, 3);
+ /* r_buf[i] = CLIP(r); */
+ _mm_between_epi16(r, zero, max);
+ _mm_store_si128(r_buf + i, r);
+ /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+ g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
+ g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
+ g = _mm_srai_epi16(g, 3);
+ /* g_buf[i] = CLIP(g); */
+ _mm_between_epi16(g, zero, max);
+ _mm_store_si128(g_buf + i, g);
+ /* (y + HIWORD(cb*28999)) >> 3 */
+ b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
+ b = _mm_srai_epi16(b, 3);
+ /* b_buf[i] = CLIP(b); */
+ _mm_between_epi16(b, zero, max);
+ _mm_store_si128(b_buf + i, b);
+ }
+
+ y_buf += srcbump;
+ cb_buf += srcbump;
+ cr_buf += srcbump;
+ r_buf += dstbump;
+ g_buf += dstbump;
+ b_buf += dstbump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
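+/* Editorial sketch (not part of the upstream sources): the scalar equivalent
+ * of the _mm_mulhi_epi16 trick described in the comment above, for one red
+ * sample. HIWORD(x) is the upper 16 bits of a 32-bit product; the function
+ * name is hypothetical and the block is excluded from the build.
+ */
+#if 0
+static INT16 example_red_mulhi(INT16 y, INT16 cr)
+{
+	const INT16 yb = (INT16)((y + 4096) >> 2);           /* y biased, pre-shifted  */
+	const INT16 hi = (INT16)(((INT32)cr * 22986) >> 16); /* HIWORD(cr * 1.403<<14) */
+	return (INT16)((yb + hi) >> 3);                      /* == (y>>5)+128+(cr*1.403)>>5 */
+}
+#endif
+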
+/*---------------------------------------------------------------------------*/
+static pstatus_t
+sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* const WINPR_RESTRICT pSrc[3], UINT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i max = _mm_set1_epi16(255);
+ const __m128i r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
+ const __m128i g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
+ const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
+ const __m128i b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
+ const __m128i c4096 = _mm_set1_epi16(4096);
+ const INT16* y_buf = (const INT16*)pSrc[0];
+ const INT16* cb_buf = (const INT16*)pSrc[1];
+ const INT16* cr_buf = (const INT16*)pSrc[2];
+ const UINT32 pad = roi->width % 16;
+ const UINT32 step = sizeof(__m128i) / sizeof(INT16);
+ const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
+ BYTE* d_buf = pDst;
+ const size_t dstPad = (dstStep - roi->width * 4);
+#ifdef DO_PREFETCH
+
+ /* Prefetch Y's, Cb's, and Cr's. */
+ for (UINT32 yp = 0; yp < roi->height; yp++)
+ {
+ for (int i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+ {
+ _mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
+ _mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
+ _mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
+ }
+
+ y_buf += srcStep / sizeof(INT16);
+ cb_buf += srcStep / sizeof(INT16);
+ cr_buf += srcStep / sizeof(INT16);
+ }
+
+ y_buf = (INT16*)pSrc[0];
+ cb_buf = (INT16*)pSrc[1];
+ cr_buf = (INT16*)pSrc[2];
+#endif /* DO_PREFETCH */
+
+ for (UINT32 yp = 0; yp < roi->height; ++yp)
+ {
+ for (UINT32 i = 0; i < imax; i += 2)
+ {
+ /* In order to use SSE2 signed 16-bit integer multiplication
+ * we need to convert the floating point factors to signed int
+ * without losing information.
+ * The result of this multiplication is 32 bit and we have two
+ * SSE instructions that return either the hi or lo word.
+ * Thus we will multiply the factors by the highest possible 2^n,
+ * take the upper 16 bits of the signed 32-bit result
+ * (_mm_mulhi_epi16) and correct this result by multiplying
+ * it by 2^(16-n).
+ *
+ * For the given factors in the conversion matrix the best
+ * possible n is 14.
+ *
+ * Example for calculating r:
+ * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
+ * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
+ * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
+ * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
+ */
+ /* y = (y_r_buf[i] + 4096) >> 2 */
+ __m128i y1;
+ __m128i y2;
+ __m128i cb1;
+ __m128i cb2;
+ __m128i cr1;
+ __m128i cr2;
+ __m128i r1;
+ __m128i r2;
+ __m128i g1;
+ __m128i g2;
+ __m128i b1;
+ __m128i b2;
+ y1 = _mm_load_si128((const __m128i*)y_buf);
+ y_buf += step;
+ y1 = _mm_add_epi16(y1, c4096);
+ y1 = _mm_srai_epi16(y1, 2);
+ /* cb = cb_g_buf[i]; */
+ cb1 = _mm_load_si128((const __m128i*)cb_buf);
+ cb_buf += step;
+ /* cr = cr_b_buf[i]; */
+ cr1 = _mm_load_si128((const __m128i*)cr_buf);
+ cr_buf += step;
+ /* (y + HIWORD(cr*22986)) >> 3 */
+ r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
+ r1 = _mm_srai_epi16(r1, 3);
+ /* r_buf[i] = CLIP(r); */
+ _mm_between_epi16(r1, zero, max);
+ /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+ g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
+ g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
+ g1 = _mm_srai_epi16(g1, 3);
+ /* g_buf[i] = CLIP(g); */
+ _mm_between_epi16(g1, zero, max);
+ /* (y + HIWORD(cb*28999)) >> 3 */
+ b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
+ b1 = _mm_srai_epi16(b1, 3);
+ /* b_buf[i] = CLIP(b); */
+ _mm_between_epi16(b1, zero, max);
+ y2 = _mm_load_si128((const __m128i*)y_buf);
+ y_buf += step;
+ y2 = _mm_add_epi16(y2, c4096);
+ y2 = _mm_srai_epi16(y2, 2);
+ /* cb = cb_g_buf[i]; */
+ cb2 = _mm_load_si128((const __m128i*)cb_buf);
+ cb_buf += step;
+ /* cr = cr_b_buf[i]; */
+ cr2 = _mm_load_si128((const __m128i*)cr_buf);
+ cr_buf += step;
+ /* (y + HIWORD(cr*22986)) >> 3 */
+ r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
+ r2 = _mm_srai_epi16(r2, 3);
+ /* r_buf[i] = CLIP(r); */
+ _mm_between_epi16(r2, zero, max);
+ /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+ g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
+ g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
+ g2 = _mm_srai_epi16(g2, 3);
+ /* g_buf[i] = CLIP(g); */
+ _mm_between_epi16(g2, zero, max);
+ /* (y + HIWORD(cb*28999)) >> 3 */
+ b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
+ b2 = _mm_srai_epi16(b2, 3);
+ /* b_buf[i] = CLIP(b); */
+ _mm_between_epi16(b2, zero, max);
+ {
+ __m128i R0;
+ __m128i R1;
+ __m128i R2;
+ __m128i R3;
+ __m128i R4;
+ /* The comments below pretend these are 8-byte registers
+ * rather than 16-byte, for readability.
+ */
+ R0 = b1; /* R0 = 00B300B200B100B0 */
+ R1 = b2; /* R1 = 00B700B600B500B4 */
+ R0 = _mm_packus_epi16(R0, R1); /* R0 = B7B6B5B4B3B2B1B0 */
+ R1 = g1; /* R1 = 00G300G200G100G0 */
+ R2 = g2; /* R2 = 00G700G600G500G4 */
+ R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */
+ R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
+ R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = B3G3B2G2B1G1B0G0 */
+ R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = B7G7B6G6B5G5B4G4 */
+ R0 = r1; /* R0 = 00R300R200R100R0 */
+ R3 = r2; /* R3 = 00R700R600R500R4 */
+ R0 = _mm_packus_epi16(R0, R3); /* R0 = R7R6R5R4R3R2R1R0 */
+ R3 = _mm_set1_epi32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */
+ R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
+ R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = R3FFR2FFR1FFR0FF */
+ R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = R7FFR6FFR5FFR4FF */
+ R0 = R4; /* R0 = R4 */
+ R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = B1G1R1FFB0G0R0FF */
+ R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = B3G3R3FFB2G2R2FF */
+ R2 = R3; /* R2 = R3 */
+ R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */
+ R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */
+ _mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF */
+ d_buf += sizeof(__m128i);
+ _mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF */
+ d_buf += sizeof(__m128i);
+ _mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF */
+ d_buf += sizeof(__m128i);
+ _mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF */
+ d_buf += sizeof(__m128i);
+ }
+ }
+
+ for (UINT32 i = 0; i < pad; i++)
+ {
+ const INT32 divisor = 16;
+ const INT32 Y = ((*y_buf++) + 4096) << divisor;
+ const INT32 Cb = (*cb_buf++);
+ const INT32 Cr = (*cr_buf++);
+ const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
+ const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
+ const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
+ const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
+ const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
+ const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
+ const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
+ *d_buf++ = CLIP(B);
+ *d_buf++ = CLIP(G);
+ *d_buf++ = CLIP(R);
+ *d_buf++ = 0xFF;
+ }
+
+ d_buf += dstPad;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static pstatus_t
+sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* const WINPR_RESTRICT pSrc[3], UINT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst, UINT32 dstStep,
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i max = _mm_set1_epi16(255);
+ const __m128i r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
+ const __m128i g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
+ const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
+ const __m128i b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
+ const __m128i c4096 = _mm_set1_epi16(4096);
+ const INT16* y_buf = (const INT16*)pSrc[0];
+ const INT16* cb_buf = (const INT16*)pSrc[1];
+ const INT16* cr_buf = (const INT16*)pSrc[2];
+ const UINT32 pad = roi->width % 16;
+ const UINT32 step = sizeof(__m128i) / sizeof(INT16);
+ const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
+ BYTE* d_buf = pDst;
+ const size_t dstPad = (dstStep - roi->width * 4);
+#ifdef DO_PREFETCH
+
+ /* Prefetch Y's, Cb's, and Cr's. */
+ for (UINT32 yp = 0; yp < roi->height; yp++)
+ {
+ for (int i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+ {
+ _mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
+ _mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
+ _mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
+ }
+
+ y_buf += srcStep / sizeof(INT16);
+ cb_buf += srcStep / sizeof(INT16);
+ cr_buf += srcStep / sizeof(INT16);
+ }
+
+ y_buf = (INT16*)(pSrc[0]);
+ cb_buf = (INT16*)(pSrc[1]);
+ cr_buf = (INT16*)(pSrc[2]);
+#endif /* DO_PREFETCH */
+
+ for (UINT32 yp = 0; yp < roi->height; ++yp)
+ {
+ for (UINT32 i = 0; i < imax; i += 2)
+ {
+ /* In order to use SSE2 signed 16-bit integer multiplication
+ * we need to convert the floating point factors to signed int
+ * without losing information.
+ * The result of this multiplication is 32 bit and we have two
+ * SSE instructions that return either the hi or lo word.
+ * Thus we will multiply the factors by the highest possible 2^n,
+ * take the upper 16 bits of the signed 32-bit result
+ * (_mm_mulhi_epi16) and correct this result by multiplying
+ * it by 2^(16-n).
+ *
+ * For the given factors in the conversion matrix the best
+ * possible n is 14.
+ *
+ * Example for calculating r:
+ * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
+ * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
+ * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
+ * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
+ */
+ /* y = (y_r_buf[i] + 4096) >> 2 */
+ __m128i y1;
+ __m128i y2;
+ __m128i cb1;
+ __m128i cb2;
+ __m128i cr1;
+ __m128i cr2;
+ __m128i r1;
+ __m128i r2;
+ __m128i g1;
+ __m128i g2;
+ __m128i b1;
+ __m128i b2;
+ y1 = _mm_load_si128((const __m128i*)y_buf);
+ y_buf += step;
+ y1 = _mm_add_epi16(y1, c4096);
+ y1 = _mm_srai_epi16(y1, 2);
+ /* cb = cb_g_buf[i]; */
+ cb1 = _mm_load_si128((const __m128i*)cb_buf);
+ cb_buf += step;
+ /* cr = cr_b_buf[i]; */
+ cr1 = _mm_load_si128((const __m128i*)cr_buf);
+ cr_buf += step;
+ /* (y + HIWORD(cr*22986)) >> 3 */
+ r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
+ r1 = _mm_srai_epi16(r1, 3);
+ /* r_buf[i] = CLIP(r); */
+ _mm_between_epi16(r1, zero, max);
+ /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+ g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
+ g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
+ g1 = _mm_srai_epi16(g1, 3);
+ /* g_buf[i] = CLIP(g); */
+ _mm_between_epi16(g1, zero, max);
+ /* (y + HIWORD(cb*28999)) >> 3 */
+ b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
+ b1 = _mm_srai_epi16(b1, 3);
+ /* b_buf[i] = CLIP(b); */
+ _mm_between_epi16(b1, zero, max);
+ y2 = _mm_load_si128((const __m128i*)y_buf);
+ y_buf += step;
+ y2 = _mm_add_epi16(y2, c4096);
+ y2 = _mm_srai_epi16(y2, 2);
+ /* cb = cb_g_buf[i]; */
+ cb2 = _mm_load_si128((const __m128i*)cb_buf);
+ cb_buf += step;
+ /* cr = cr_b_buf[i]; */
+ cr2 = _mm_load_si128((const __m128i*)cr_buf);
+ cr_buf += step;
+ /* (y + HIWORD(cr*22986)) >> 3 */
+ r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
+ r2 = _mm_srai_epi16(r2, 3);
+ /* r_buf[i] = CLIP(r); */
+ _mm_between_epi16(r2, zero, max);
+ /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+ g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
+ g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
+ g2 = _mm_srai_epi16(g2, 3);
+ /* g_buf[i] = CLIP(g); */
+ _mm_between_epi16(g2, zero, max);
+ /* (y + HIWORD(cb*28999)) >> 3 */
+ b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
+ b2 = _mm_srai_epi16(b2, 3);
+ /* b_buf[i] = CLIP(b); */
+ _mm_between_epi16(b2, zero, max);
+ {
+ __m128i R0;
+ __m128i R1;
+ __m128i R2;
+ __m128i R3;
+ __m128i R4;
+ /* The comments below pretend these are 8-byte registers
+ * rather than 16-byte, for readability.
+ */
+ R0 = r1; /* R0 = 00R300R200R100R0 */
+ R1 = r2; /* R1 = 00R700R600R500R4 */
+ R0 = _mm_packus_epi16(R0, R1); /* R0 = R7R6R5R4R3R2R1R0 */
+ R1 = g1; /* R1 = 00G300G200G100G0 */
+ R2 = g2; /* R2 = 00G700G600G500G4 */
+ R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */
+ R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
+ R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = R3G3R2G2R1G1R0G0 */
+ R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = R7G7R6G6R5G5R4G4 */
+ R0 = b1; /* R0 = 00B300B200B100B0 */
+ R3 = b2; /* R3 = 00B700B600B500B4 */
+ R0 = _mm_packus_epi16(R0, R3); /* R0 = B7B6B5B4B3B2B1B0 */
+ R3 = _mm_set1_epi32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */
+ R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
+ R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = B3FFB2FFB1FFB0FF */
+ R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = B7FFB6FFB5FFB4FF */
+ R0 = R4; /* R0 = R4 */
+ R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = R1G1B1FFR0G0B0FF */
+ R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = R3G3B3FFR2G2B2FF */
+ R2 = R3; /* R2 = R3 */
+ R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */
+ R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */
+ _mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF */
+ d_buf += sizeof(__m128i);
+ _mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF */
+ d_buf += sizeof(__m128i);
+ _mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF */
+ d_buf += sizeof(__m128i);
+ _mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF */
+ d_buf += sizeof(__m128i);
+ }
+ }
+
+ for (UINT32 i = 0; i < pad; i++)
+ {
+ const INT32 divisor = 16;
+ const INT32 Y = ((*y_buf++) + 4096) << divisor;
+ const INT32 Cb = (*cb_buf++);
+ const INT32 Cr = (*cr_buf++);
+ const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
+ const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
+ const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
+ const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
+ const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
+ const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
+ const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
+ *d_buf++ = CLIP(R);
+ *d_buf++ = CLIP(G);
+ *d_buf++ = CLIP(B);
+ *d_buf++ = 0xFF;
+ }
+
+ d_buf += dstPad;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t
+sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], UINT32 srcStep,
+ BYTE* WINPR_RESTRICT pDst, UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
+ ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) ||
+ (dstStep & 0x0f))
+ {
+ /* We can't maintain 16-byte alignment. */
+ return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
+
+ case PIXEL_FORMAT_RGBA32:
+ case PIXEL_FORMAT_RGBX32:
+ return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
+
+ default:
+ return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+}
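+
+/* Editorial note (not part of the upstream sources): the alignment test above
+ * exists because the BGRX/RGBX fast paths use the aligned _mm_load_si128 and
+ * _mm_store_si128 forms; planes, destination, or strides that are not
+ * 16-byte aligned would fault, so such inputs fall back to the generic
+ * implementation. */
+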
+/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
+ * numbers. See the general code above.
+ */
+static pstatus_t
+sse2_RGBToYCbCr_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], int srcStep,
+ INT16* WINPR_RESTRICT pDst[3], int dstStep,
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ __m128i min;
+ __m128i max;
+ __m128i y_r;
+ __m128i y_g;
+ __m128i y_b;
+ __m128i cb_r;
+ __m128i cb_g;
+ __m128i cb_b;
+ __m128i cr_r;
+ __m128i cr_g;
+ __m128i cr_b;
+ const __m128i* r_buf = (const __m128i*)(pSrc[0]);
+ const __m128i* g_buf = (const __m128i*)(pSrc[1]);
+ const __m128i* b_buf = (const __m128i*)(pSrc[2]);
+ __m128i* y_buf = (__m128i*)(pDst[0]);
+ __m128i* cb_buf = (__m128i*)(pDst[1]);
+ __m128i* cr_buf = (__m128i*)(pDst[2]);
+ int srcbump = 0;
+ int dstbump = 0;
+ int imax = 0;
+
+ if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
+ ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
+ ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
+ (srcStep & 127) || (dstStep & 127))
+ {
+ /* We can't maintain 16-byte alignment. */
+ return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
+ }
+
+ min = _mm_set1_epi16(-128 * 32);
+ max = _mm_set1_epi16(127 * 32);
+
+ y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */
+ y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */
+ y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */
+ cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */
+ cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
+ cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */
+ cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */
+ cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
+ cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */
+ srcbump = srcStep / sizeof(__m128i);
+ dstbump = dstStep / sizeof(__m128i);
+#ifdef DO_PREFETCH
+
+ /* Prefetch RGB's. */
+ for (UINT32 yp = 0; yp < roi->height; yp++)
+ {
+ for (int i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
+ i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+ {
+ _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
+ _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
+ _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
+ }
+
+ r_buf += srcbump;
+ g_buf += srcbump;
+ b_buf += srcbump;
+ }
+
+ r_buf = (__m128i*)(pSrc[0]);
+ g_buf = (__m128i*)(pSrc[1]);
+ b_buf = (__m128i*)(pSrc[2]);
+#endif /* DO_PREFETCH */
+ imax = roi->width * sizeof(INT16) / sizeof(__m128i);
+
+ for (UINT32 yp = 0; yp < roi->height; ++yp)
+ {
+ for (int i = 0; i < imax; i++)
+ {
+			/* In order to use SSE2 signed 16-bit integer multiplication we
+			 * need to convert the floating point factors to signed int
+			 * without losing information. The result of this multiplication
+			 * is 32 bit and using SSE2 we get either the product's hi or lo
+			 * word. Thus we will multiply the factors by the highest
+			 * possible 2^n and take the upper 16 bits of the signed 32-bit
+			 * result (_mm_mulhi_epi16). Since the final result needs to
+			 * be scaled by << 5 and, in order to keep the precision
+			 * within the upper 16 bits, we will also have to scale the RGB
+			 * values used in the multiplication by << 5+(16-n).
+			 */
+ __m128i r;
+ __m128i g;
+ __m128i b;
+ __m128i y;
+ __m128i cb;
+ __m128i cr;
+ r = _mm_load_si128(r_buf + i);
+ g = _mm_load_si128(g_buf + i);
+ b = _mm_load_si128(b_buf + i);
+ /* r<<6; g<<6; b<<6 */
+ r = _mm_slli_epi16(r, 6);
+ g = _mm_slli_epi16(g, 6);
+ b = _mm_slli_epi16(b, 6);
+ /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
+ y = _mm_mulhi_epi16(r, y_r);
+ y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
+ y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
+ y = _mm_add_epi16(y, min);
+ /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
+ _mm_between_epi16(y, min, max);
+ _mm_store_si128(y_buf + i, y);
+ /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
+ cb = _mm_mulhi_epi16(r, cb_r);
+ cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
+ cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
+ /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
+ _mm_between_epi16(cb, min, max);
+ _mm_store_si128(cb_buf + i, cb);
+ /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
+ cr = _mm_mulhi_epi16(r, cr_r);
+ cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
+ cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
+ /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
+ _mm_between_epi16(cr, min, max);
+ _mm_store_si128(cr_buf + i, cr);
+ }
+
+		/* Advance each plane by its own stride: sources by srcbump,
+		 * destinations by dstbump. */
+		r_buf += srcbump;
+		g_buf += srcbump;
+		b_buf += srcbump;
+		y_buf += dstbump;
+		cb_buf += dstbump;
+		cr_buf += dstbump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
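+/* Editorial sketch (not part of the upstream sources): scalar view of the
+ * << 6 pre-scale used above. With n = 15, HIWORD((r << 6) * 9798) equals
+ * (r * 0.299) << 5 to within truncation, i.e. the 11.5 fixed-point luma
+ * contribution. The function name is hypothetical and the block is excluded
+ * from the build.
+ */
+#if 0
+static INT16 example_luma_term(INT16 r)
+{
+	const INT32 r6 = (INT32)r << 6;    /* scale input by 2^(5 + 16 - 15)  */
+	return (INT16)((r6 * 9798) >> 16); /* HIWORD: ~ (r * 0.299) << 5      */
+}
+#endif
+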
+/*---------------------------------------------------------------------------*/
+static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
+ const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+ UINT32 srcStep, /* bytes between rows in source data */
+ BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
+ UINT32 dstStep, /* bytes between rows in dest data */
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ const UINT16* pr = (const UINT16*)(pSrc[0]);
+ const UINT16* pg = (const UINT16*)(pSrc[1]);
+ const UINT16* pb = (const UINT16*)(pSrc[2]);
+ const UINT32 pad = roi->width % 16;
+ const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
+ BYTE* out = NULL;
+ UINT32 srcbump = 0;
+ UINT32 dstbump = 0;
+ out = (BYTE*)pDst;
+ srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+ dstbump = (dstStep - (roi->width * sizeof(UINT32)));
+
+ for (UINT32 y = 0; y < roi->height; ++y)
+ {
+ for (UINT32 x = 0; x < roi->width - pad; x += 16)
+ {
+ __m128i r;
+ __m128i g;
+ __m128i b;
+ /* The comments below pretend these are 8-byte registers
+ * rather than 16-byte, for readability.
+ */
+ {
+ __m128i R0;
+ __m128i R1;
+ R0 = _mm_load_si128((const __m128i*)pb);
+ pb += 8; /* R0 = 00B300B200B100B0 */
+ R1 = _mm_load_si128((const __m128i*)pb);
+ pb += 8; /* R1 = 00B700B600B500B4 */
+ b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
+ }
+			{
+				__m128i R0;
+				__m128i R1;
+				R0 = _mm_load_si128((const __m128i*)pg);
+				pg += 8; /* R0 = 00G300G200G100G0 */
+				R1 = _mm_load_si128((const __m128i*)pg);
+				pg += 8; /* R1 = 00G700G600G500G4 */
+				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
+			}
+			{
+				__m128i R0;
+				__m128i R1;
+				R0 = _mm_load_si128((const __m128i*)pr);
+				pr += 8; /* R0 = 00R300R200R100R0 */
+				R1 = _mm_load_si128((const __m128i*)pr);
+				pr += 8; /* R1 = 00R700R600R500R4 */
+				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
+			}
+			{
+				__m128i gbHi;
+				__m128i gbLo;
+				__m128i arHi;
+				__m128i arLo;
+				{
+					gbLo = _mm_unpacklo_epi8(b, g); /* gbLo = G3B3G2B2G1B1G0B0 */
+					gbHi = _mm_unpackhi_epi8(b, g); /* gbHi = G7B7G6B6G5B5G4B4 */
+					arLo = _mm_unpacklo_epi8(r, a); /* arLo = FFR3FFR2FFR1FFR0 */
+					arHi = _mm_unpackhi_epi8(r, a); /* arHi = FFR7FFR6FFR5FFR4 */
+				}
+ }
+ {
+ const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* FFR1G1B1FFR0G0B0 */
+ }
+ {
+ const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* FFR3G3B3FFR2G2B2 */
+ }
+ {
+ const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* FFR5G5B5FFR4G4B4 */
+ }
+ {
+ const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* FFR7G7B7FFR6G6B6 */
+ }
+ }
+ }
+
+ for (UINT32 x = 0; x < pad; x++)
+ {
+ const BYTE R = CLIP(*pr++);
+ const BYTE G = CLIP(*pg++);
+ const BYTE B = CLIP(*pb++);
+ *out++ = B;
+ *out++ = G;
+ *out++ = R;
+ *out++ = 0xFF;
+ }
+
+ /* Jump to next row. */
+ pr += srcbump;
+ pg += srcbump;
+ pb += srcbump;
+ out += dstbump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
+ const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+ UINT32 srcStep, /* bytes between rows in source data */
+ BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
+ UINT32 dstStep, /* bytes between rows in dest data */
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ const UINT16* pr = (const UINT16*)(pSrc[0]);
+ const UINT16* pg = (const UINT16*)(pSrc[1]);
+ const UINT16* pb = (const UINT16*)(pSrc[2]);
+ const UINT32 pad = roi->width % 16;
+ const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
+ BYTE* out = NULL;
+ UINT32 srcbump = 0;
+ UINT32 dstbump = 0;
+ out = (BYTE*)pDst;
+ srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+ dstbump = (dstStep - (roi->width * sizeof(UINT32)));
+
+ for (UINT32 y = 0; y < roi->height; ++y)
+ {
+ for (UINT32 x = 0; x < roi->width - pad; x += 16)
+ {
+ __m128i r;
+ __m128i g;
+ __m128i b;
+ /* The comments below pretend these are 8-byte registers
+ * rather than 16-byte, for readability.
+ */
+ {
+ __m128i R0;
+ __m128i R1;
+ R0 = _mm_load_si128((const __m128i*)pb);
+ pb += 8; /* R0 = 00B300B200B100B0 */
+ R1 = _mm_load_si128((const __m128i*)pb);
+ pb += 8; /* R1 = 00B700B600B500B4 */
+ b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
+ }
+ {
+ __m128i R0;
+ __m128i R1;
+ R0 = _mm_load_si128((const __m128i*)pg);
+ pg += 8; /* R0 = 00G300G200G100G0 */
+ R1 = _mm_load_si128((const __m128i*)pg);
+ pg += 8; /* R1 = 00G700G600G500G4 */
+ g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
+ }
+ {
+ __m128i R0;
+ __m128i R1;
+ R0 = _mm_load_si128((const __m128i*)pr);
+ pr += 8; /* R0 = 00R300R200R100R0 */
+ R1 = _mm_load_si128((const __m128i*)pr);
+ pr += 8; /* R1 = 00R700R600R500R4 */
+ r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
+ }
+ {
+ __m128i gbHi;
+ __m128i gbLo;
+ __m128i arHi;
+ __m128i arLo;
+ {
+ gbLo = _mm_unpacklo_epi8(r, g); /* gbLo = G3R3G2R2G1R1G0R0 */
+ gbHi = _mm_unpackhi_epi8(r, g); /* gbHi = G7R7G6R6G5R5G4R4 */
+ arLo = _mm_unpacklo_epi8(b, a); /* arLo = FFB3FFB2FFB1FFB0 */
+ arHi = _mm_unpackhi_epi8(b, a); /* arHi = FFB7FFB6FFB5FFB4 */
+ }
+ {
+ const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* FFB1G1R1FFB0G0R0 */
+ }
+ {
+ const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* FFB3G3R3FFB2G2R2 */
+ }
+ {
+ const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* FFB5G5R5FFB4G4R4 */
+ }
+ {
+ const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* FFB7G7R7FFB6G6R6 */
+ }
+ }
+ }
+
+ for (UINT32 x = 0; x < pad; x++)
+ {
+ const BYTE R = CLIP(*pr++);
+ const BYTE G = CLIP(*pg++);
+ const BYTE B = CLIP(*pb++);
+ *out++ = R;
+ *out++ = G;
+ *out++ = B;
+ *out++ = 0xFF;
+ }
+
+ /* Jump to next row. */
+ pr += srcbump;
+ pg += srcbump;
+ pb += srcbump;
+ out += dstbump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
+ const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+ UINT32 srcStep, /* bytes between rows in source data */
+ BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved XBGR data */
+ UINT32 dstStep, /* bytes between rows in dest data */
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ const UINT16* pr = (const UINT16*)(pSrc[0]);
+ const UINT16* pg = (const UINT16*)(pSrc[1]);
+ const UINT16* pb = (const UINT16*)(pSrc[2]);
+ const UINT32 pad = roi->width % 16;
+ const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
+ BYTE* out = NULL;
+ UINT32 srcbump = 0;
+ UINT32 dstbump = 0;
+ out = (BYTE*)pDst;
+ srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+ dstbump = (dstStep - (roi->width * sizeof(UINT32)));
+
+ for (UINT32 y = 0; y < roi->height; ++y)
+ {
+ for (UINT32 x = 0; x < roi->width - pad; x += 16)
+ {
+ __m128i r;
+ __m128i g;
+ __m128i b;
+ /* The comments below pretend these are 8-byte registers
+ * rather than 16-byte, for readability.
+ */
+ {
+ __m128i R0;
+ __m128i R1;
+ R0 = _mm_load_si128((const __m128i*)pb);
+ pb += 8; /* R0 = 00B300B200B100B0 */
+ R1 = _mm_load_si128((const __m128i*)pb);
+ pb += 8; /* R1 = 00B700B600B500B4 */
+ b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
+ }
+ {
+ __m128i R0;
+ __m128i R1;
+ R0 = _mm_load_si128((const __m128i*)pg);
+ pg += 8; /* R0 = 00G300G200G100G0 */
+ R1 = _mm_load_si128((const __m128i*)pg);
+ pg += 8; /* R1 = 00G700G600G500G4 */
+ g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
+ }
+ {
+ __m128i R0;
+ __m128i R1;
+ R0 = _mm_load_si128((const __m128i*)pr);
+ pr += 8; /* R0 = 00R300R200R100R0 */
+ R1 = _mm_load_si128((const __m128i*)pr);
+ pr += 8; /* R1 = 00R700R600R500R4 */
+ r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
+ }
+ {
+ __m128i gbHi;
+ __m128i gbLo;
+ __m128i arHi;
+ __m128i arLo;
+ {
+ gbLo = _mm_unpacklo_epi8(a, b); /* gbLo = B3FFB2FFB1FFB0FF */
+ gbHi = _mm_unpackhi_epi8(a, b); /* gbHi = B7FFB6FFB5FFB4FF */
+ arLo = _mm_unpacklo_epi8(g, r); /* arLo = R3G3R2G2R1G1R0G0 */
+ arHi = _mm_unpackhi_epi8(g, r); /* arHi = R7G7R6G6R5G5R4G4 */
+ }
+ {
+ const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* R1G1B1FFR0G0B0FF */
+ }
+ {
+ const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* R3G3B3FFR2G2B2FF */
+ }
+ {
+ const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* R5G5B5FFR4G4B4FF */
+ }
+ {
+ const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* R7G7B7FFR6G6B6FF */
+ }
+ }
+ }
+
+ for (UINT32 x = 0; x < pad; x++)
+ {
+ const BYTE R = CLIP(*pr++);
+ const BYTE G = CLIP(*pg++);
+ const BYTE B = CLIP(*pb++);
+ *out++ = 0xFF;
+ *out++ = B;
+ *out++ = G;
+ *out++ = R;
+ }
+
+ /* Jump to next row. */
+ pr += srcbump;
+ pg += srcbump;
+ pb += srcbump;
+ out += dstbump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
+ const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+ UINT32 srcStep, /* bytes between rows in source data */
+ BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved XRGB data */
+ UINT32 dstStep, /* bytes between rows in dest data */
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ const UINT16* pr = (const UINT16*)(pSrc[0]);
+ const UINT16* pg = (const UINT16*)(pSrc[1]);
+ const UINT16* pb = (const UINT16*)(pSrc[2]);
+ const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
+ const UINT32 pad = roi->width % 16;
+ BYTE* out = NULL;
+ UINT32 srcbump = 0;
+ UINT32 dstbump = 0;
+ out = (BYTE*)pDst;
+ srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+ dstbump = (dstStep - (roi->width * sizeof(UINT32)));
+
+ for (UINT32 y = 0; y < roi->height; ++y)
+ {
+ for (UINT32 x = 0; x < roi->width - pad; x += 16)
+ {
+ __m128i r;
+ __m128i g;
+ __m128i b;
+ /* The comments below pretend these are 8-byte registers
+ * rather than 16-byte, for readability.
+ */
+ {
+ __m128i R0;
+ __m128i R1;
+ R0 = _mm_load_si128((const __m128i*)pb);
+ pb += 8; /* R0 = 00B300B200B100B0 */
+ R1 = _mm_load_si128((const __m128i*)pb);
+ pb += 8; /* R1 = 00B700B600B500B4 */
+ b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
+ }
+ {
+ __m128i R0;
+ __m128i R1;
+ R0 = _mm_load_si128((const __m128i*)pg);
+ pg += 8; /* R0 = 00G300G200G100G0 */
+ R1 = _mm_load_si128((const __m128i*)pg);
+ pg += 8; /* R1 = 00G700G600G500G4 */
+ g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
+ }
+ {
+ __m128i R0;
+ __m128i R1;
+ R0 = _mm_load_si128((const __m128i*)pr);
+ pr += 8; /* R0 = 00R300R200R100R0 */
+ R1 = _mm_load_si128((const __m128i*)pr);
+ pr += 8; /* R1 = 00R700R600R500R4 */
+ r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
+ }
+ {
+ __m128i gbHi;
+ __m128i gbLo;
+ __m128i arHi;
+ __m128i arLo;
+ {
+ gbLo = _mm_unpacklo_epi8(a, r); /* gbLo = R3FFR2FFR1FFR0FF */
+ gbHi = _mm_unpackhi_epi8(a, r); /* gbHi = R7FFR6FFR5FFR4FF */
+ arLo = _mm_unpacklo_epi8(g, b); /* arLo = B3G3B2G2B1G1B0G0 */
+ arHi = _mm_unpackhi_epi8(g, b); /* arHi = B7G7B6G6B5G5B4G4 */
+ }
+ {
+ const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* B1G1R1FFB0G0R0FF */
+ }
+ {
+ const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* B3G3R3FFB2G2R2FF */
+ }
+ {
+ const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* B5G5R5FFB4G4R4FF */
+ }
+ {
+ const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
+ _mm_store_si128((__m128i*)out, bgrx);
+ out += 16; /* B7G7R7FFB6G6R6FF */
+ }
+ }
+ }
+
+ for (UINT32 x = 0; x < pad; x++)
+ {
+ const BYTE R = CLIP(*pr++);
+ const BYTE G = CLIP(*pg++);
+ const BYTE B = CLIP(*pb++);
+ *out++ = 0xFF;
+ *out++ = R;
+ *out++ = G;
+ *out++ = B;
+ }
+
+ /* Jump to next row. */
+ pr += srcbump;
+ pg += srcbump;
+ pb += srcbump;
+ out += dstbump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t
+sse2_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+ UINT32 srcStep, /* bytes between rows in source data */
+ BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved output, layout per DstFormat */
+ UINT32 dstStep, /* bytes between rows in dest data */
+ UINT32 DstFormat, const prim_size_t* WINPR_RESTRICT roi)
+{
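+ /* The per-format SSE routines above use aligned 16-byte loads and stores,
+ * so fall back to the generic implementation unless the three source
+ * planes, the destination and both strides are all 16-byte aligned. */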
+ if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
+ (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
+ return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
+
+ case PIXEL_FORMAT_RGBA32:
+ case PIXEL_FORMAT_RGBX32:
+ return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
+
+ case PIXEL_FORMAT_ABGR32:
+ case PIXEL_FORMAT_XBGR32:
+ return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
+
+ case PIXEL_FORMAT_ARGB32:
+ case PIXEL_FORMAT_XRGB32:
+ return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
+
+ default:
+ return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+}
+#endif /* WITH_SSE2 */
+
+/*---------------------------------------------------------------------------*/
+#ifdef WITH_NEON
+static pstatus_t
+neon_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep,
+ INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ /* TODO: If necessary, check alignments and call the general version. */
+ int16x8_t zero = vdupq_n_s16(0);
+ int16x8_t max = vdupq_n_s16(255);
+ int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 * 2^14
+ int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 * 2^14
+ int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 * 2^14
+ int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 * 2^14
+ int16x8_t c4096 = vdupq_n_s16(4096);
+ int16x8_t* y_buf = (int16x8_t*)pSrc[0];
+ int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
+ int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
+ int16x8_t* r_buf = (int16x8_t*)pDst[0];
+ int16x8_t* g_buf = (int16x8_t*)pDst[1];
+ int16x8_t* b_buf = (int16x8_t*)pDst[2];
+ int srcbump = srcStep / sizeof(int16x8_t);
+ int dstbump = dstStep / sizeof(int16x8_t);
+ int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
+
+ for (int yp = 0; yp < roi->height; ++yp)
+ {
+ for (int i = 0; i < imax; i++)
+ {
+ /*
+ In order to use NEON signed 16-bit integer multiplication we need to convert
+ the floating point factors to signed int without losing information.
+ The result of this multiplication is 32 bit and we have a NEON instruction
+ (vqdmulhq_s16) that returns the high word of the saturated, doubled product.
+ Thus we multiply the factors by the highest possible 2^n, take the
+ upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
+ shift by 1 to reverse the doubling) and correct this result by multiplying it
+ by 2^(16-n).
+ For the given factors in the conversion matrix the best possible n is 14.
+
+ Example for calculating r:
+ r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
+ r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14))<<2)>>5 // see above
+ r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
+ r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
+ */
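+ /* Illustrative sanity check of the derivation (cr = 64, y = 0):
+ * HIWORD(64 * 22986) = 1471104 >> 16 = 22, so
+ * r = ((0 + 4096) >> 2 + 22) >> 3 = (1024 + 22) >> 3 = 130,
+ * matching the base formula 128 + (64 * 1.403) / 32 = 130.8 ~ 130. */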
+ /* y = (y_buf[i] + 4096) >> 2 */
+ int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
+ y = vaddq_s16(y, c4096);
+ y = vshrq_n_s16(y, 2);
+ /* cb = cb_buf[i]; */
+ int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
+ /* cr = cr_buf[i]; */
+ int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
+ /* (y + HIWORD(cr*22986)) >> 3 */
+ int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
+ r = vshrq_n_s16(r, 3);
+ /* r_buf[i] = CLIP(r); */
+ r = vminq_s16(vmaxq_s16(r, zero), max);
+ vst1q_s16((INT16*)&r_buf[i], r);
+ /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+ int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
+ g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
+ g = vshrq_n_s16(g, 3);
+ /* g_buf[i] = CLIP(g); */
+ g = vminq_s16(vmaxq_s16(g, zero), max);
+ vst1q_s16((INT16*)&g_buf[i], g);
+ /* (y + HIWORD(cb*28999)) >> 3 */
+ int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
+ b = vshrq_n_s16(b, 3);
+ /* b_buf[i] = CLIP(b); */
+ b = vminq_s16(vmaxq_s16(b, zero), max);
+ vst1q_s16((INT16*)&b_buf[i], b);
+ }
+
+ y_buf += srcbump;
+ cb_buf += srcbump;
+ cr_buf += srcbump;
+ r_buf += dstbump;
+ g_buf += dstbump;
+ b_buf += dstbump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const WINPR_RESTRICT pSrc[3],
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep,
+ const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
+ uint8_t gPos, uint8_t bPos, uint8_t aPos)
+{
+ BYTE* pRGB = pDst;
+ const INT16* pY = pSrc[0];
+ const INT16* pCb = pSrc[1];
+ const INT16* pCr = pSrc[2];
+ const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
+ const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
+ const size_t pad = roi->width % 8;
+ const int16x4_t c4096 = vdup_n_s16(4096);
+
+ for (UINT32 y = 0; y < roi->height; y++)
+ {
+ for (UINT32 x = 0; x < roi->width - pad; x += 8)
+ {
+ const int16x8_t Y = vld1q_s16(pY);
+ const int16x4_t Yh = vget_high_s16(Y);
+ const int16x4_t Yl = vget_low_s16(Y);
+ const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
+ const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
+ const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
+ const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
+ const int16x8_t Cr = vld1q_s16(pCr);
+ const int16x4_t Crh = vget_high_s16(Cr);
+ const int16x4_t Crl = vget_low_s16(Cr);
+ const int16x8_t Cb = vld1q_s16(pCb);
+ const int16x4_t Cbh = vget_high_s16(Cb);
+ const int16x4_t Cbl = vget_low_s16(Cb);
+ uint8x8x4_t bgrx;
+ {
+ /* R */
+ const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
+ const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
+ const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
+ const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
+ const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
+ const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
+ const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
+ bgrx.val[rPos] = vqmovun_s16(Rs);
+ }
+ {
+ /* G */
+ const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
+ const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
+ const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
+ const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
+ const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
+ const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
+ const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
+ const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
+ const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
+ const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
+ const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
+ const uint8x8_t G = vqmovun_s16(Gs);
+ bgrx.val[gPos] = G;
+ }
+ {
+ /* B */
+ const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
+ const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
+ const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
+ const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
+ const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
+ const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
+ const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
+ const uint8x8_t B = vqmovun_s16(Bs);
+ bgrx.val[bPos] = B;
+ }
+ /* A */
+ {
+ bgrx.val[aPos] = vdup_n_u8(0xFF);
+ }
+ vst4_u8(pRGB, bgrx);
+ pY += 8;
+ pCb += 8;
+ pCr += 8;
+ pRGB += 32;
+ }
+
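+ /* Scalar tail: same fixed-point math as the vector path above, with the
+ * >>21 of the vector path split into >>divisor (undoing the 2^16 scaling)
+ * and a final >>5. */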
+ for (UINT32 x = 0; x < pad; x++)
+ {
+ const INT32 divisor = 16;
+ const INT32 Y = ((*pY++) + 4096) << divisor;
+ const INT32 Cb = (*pCb++);
+ const INT32 Cr = (*pCr++);
+ const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
+ const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
+ const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
+ const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
+ INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
+ INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
+ INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
+ BYTE bgrx[4];
+ bgrx[bPos] = CLIP(B);
+ bgrx[gPos] = CLIP(G);
+ bgrx[rPos] = CLIP(R);
+ bgrx[aPos] = 0xFF;
+ *pRGB++ = bgrx[0];
+ *pRGB++ = bgrx[1];
+ *pRGB++ = bgrx[2];
+ *pRGB++ = bgrx[3];
+ }
+
+ pY += srcPad;
+ pCb += srcPad;
+ pCr += srcPad;
+ pRGB += dstPad;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3],
+ UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+ UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi)
+{
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
+
+ case PIXEL_FORMAT_RGBA32:
+ case PIXEL_FORMAT_RGBX32:
+ return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
+
+ case PIXEL_FORMAT_ARGB32:
+ case PIXEL_FORMAT_XRGB32:
+ return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
+
+ case PIXEL_FORMAT_ABGR32:
+ case PIXEL_FORMAT_XBGR32:
+ return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
+
+ default:
+ return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+}
+
+static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X(
+ const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+ UINT32 srcStep, /* bytes between rows in source data */
+ BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved output; byte order given by rPos/gPos/bPos/aPos */
+ UINT32 dstStep, /* bytes between rows in dest data */
+ const prim_size_t* WINPR_RESTRICT roi, /* region of interest */
+ uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
+{
+ UINT32 pad = roi->width % 8;
+
+ for (UINT32 y = 0; y < roi->height; y++)
+ {
+ const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
+ const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
+ const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
+ BYTE* dst = pDst + y * dstStep;
+
+ for (UINT32 x = 0; x < roi->width - pad; x += 8)
+ {
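+ /* vqmovun_s16 narrows each 16-bit lane to u8 with unsigned saturation,
+ * so out-of-range values are clamped rather than truncated. */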
+ int16x8_t r = vld1q_s16(pr);
+ int16x8_t g = vld1q_s16(pg);
+ int16x8_t b = vld1q_s16(pb);
+ uint8x8x4_t bgrx;
+ bgrx.val[aPos] = vdup_n_u8(0xFF);
+ bgrx.val[rPos] = vqmovun_s16(r);
+ bgrx.val[gPos] = vqmovun_s16(g);
+ bgrx.val[bPos] = vqmovun_s16(b);
+ vst4_u8(dst, bgrx);
+ pr += 8;
+ pg += 8;
+ pb += 8;
+ dst += 32;
+ }
+
+ for (UINT32 x = 0; x < pad; x++)
+ {
+ BYTE bgrx[4];
+ bgrx[bPos] = CLIP(*pb++);
+ bgrx[gPos] = CLIP(*pg++);
+ bgrx[rPos] = CLIP(*pr++);
+ bgrx[aPos] = 0xFF;
+ *dst++ = bgrx[0];
+ *dst++ = bgrx[1];
+ *dst++ = bgrx[2];
+ *dst++ = bgrx[3];
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t
+neon_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+ UINT32 srcStep, /* bytes between rows in source data */
+ BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved output, layout per DstFormat */
+ UINT32 dstStep, /* bytes between rows in dest data */
+ UINT32 DstFormat,
+ const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+ switch (DstFormat)
+ {
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
+
+ case PIXEL_FORMAT_RGBA32:
+ case PIXEL_FORMAT_RGBX32:
+ return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
+
+ case PIXEL_FORMAT_ARGB32:
+ case PIXEL_FORMAT_XRGB32:
+ return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
+
+ case PIXEL_FORMAT_ABGR32:
+ case PIXEL_FORMAT_XBGR32:
+ return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
+
+ default:
+ return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+ }
+}
+#endif /* WITH_NEON */
+/* I don't see a direct IPP version of this, since the input is INT16
+ * YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
+ * But that would likely be slower.
+ */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_colors_opt(primitives_t* prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_colors(prims);
+#if defined(WITH_SSE2)
+
+ if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
+ prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
+ prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
+ prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
+ }
+
+#elif defined(WITH_NEON)
+
+ if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
+ prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
+ prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
+ }
+
+#endif /* WITH_SSE2 || WITH_NEON */
+}
diff --git a/libfreerdp/primitives/prim_copy.c b/libfreerdp/primitives/prim_copy.c
new file mode 100644
index 0000000..f140c20
--- /dev/null
+++ b/libfreerdp/primitives/prim_copy.c
@@ -0,0 +1,178 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Copy operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#ifdef WITH_IPP
+#include <ipps.h>
+#include <ippi.h>
+#endif /* WITH_IPP */
+#include "prim_internal.h"
+
+static primitives_t* generic = NULL;
+
+/* ------------------------------------------------------------------------- */
+/*static inline BOOL memory_regions_overlap_1d(*/
+static BOOL memory_regions_overlap_1d(const BYTE* p1, const BYTE* p2, size_t bytes)
+{
+ const ULONG_PTR p1m = (const ULONG_PTR)p1;
+ const ULONG_PTR p2m = (const ULONG_PTR)p2;
+
+ if (p1m <= p2m)
+ {
+ if (p1m + bytes > p2m)
+ return TRUE;
+ }
+ else
+ {
+ if (p2m + bytes > p1m)
+ return TRUE;
+ }
+
+ /* else */
+ return FALSE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*static inline BOOL memory_regions_overlap_2d( */
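+/* Note: this 2-D check is conservative - each surface is treated as one
+ * contiguous span from its first to its last byte, so surfaces with
+ * interleaved but disjoint rows are still reported as overlapping. */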
+static BOOL memory_regions_overlap_2d(const BYTE* p1, int p1Step, int p1Size, const BYTE* p2,
+ int p2Step, int p2Size, int width, int height)
+{
+ ULONG_PTR p1m = (ULONG_PTR)p1;
+ ULONG_PTR p2m = (ULONG_PTR)p2;
+
+ if (p1m <= p2m)
+ {
+ ULONG_PTR p1mEnd = p1m + 1ull * (height - 1) * p1Step + 1ull * width * p1Size;
+
+ if (p1mEnd > p2m)
+ return TRUE;
+ }
+ else
+ {
+ ULONG_PTR p2mEnd = p2m + 1ull * (height - 1) * p2Step + 1ull * width * p2Size;
+
+ if (p2mEnd > p1m)
+ return TRUE;
+ }
+
+ /* else */
+ return FALSE;
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t general_copy_8u(const BYTE* pSrc, BYTE* pDst, INT32 len)
+{
+ if (memory_regions_overlap_1d(pSrc, pDst, (size_t)len))
+ {
+ memmove((void*)pDst, (const void*)pSrc, (size_t)len);
+ }
+ else
+ {
+ memcpy((void*)pDst, (const void*)pSrc, (size_t)len);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Copy a block of pixels from one buffer to another.
+ * The addresses are assumed to have been already offset to the upper-left
+ * corners of the source and destination region of interest.
+ */
+static pstatus_t general_copy_8u_AC4r(const BYTE* pSrc, INT32 srcStep, BYTE* pDst, INT32 dstStep,
+ INT32 width, INT32 height)
+{
+ const BYTE* src = (const BYTE*)pSrc;
+ BYTE* dst = (BYTE*)pDst;
+ int rowbytes = width * sizeof(UINT32);
+
+ if ((width == 0) || (height == 0))
+ return PRIMITIVES_SUCCESS;
+
+ if (memory_regions_overlap_2d(pSrc, srcStep, sizeof(UINT32), pDst, dstStep, sizeof(UINT32),
+ width, height))
+ {
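+ /* Overlapping surfaces: copy row by row through generic->copy,
+ * which resolves to memmove when the individual rows overlap. */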
+ do
+ {
+ generic->copy(src, dst, rowbytes);
+ src += srcStep;
+ dst += dstStep;
+ } while (--height);
+ }
+ else
+ {
+ /* TODO: do it in one operation when the rowdata is adjacent. */
+ do
+ {
+ /* If we find a replacement for memcpy that is consistently
+ * faster, this could be replaced with that.
+ */
+ memcpy(dst, src, rowbytes);
+ src += srcStep;
+ dst += dstStep;
+ } while (--height);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+#ifdef WITH_IPP
+/* ------------------------------------------------------------------------- */
+/* This is just ippiCopy_8u_AC4R without the IppiSize structure parameter. */
+static pstatus_t ippiCopy_8u_AC4r(const BYTE* pSrc, INT32 srcStep, BYTE* pDst, INT32 dstStep,
+ INT32 width, INT32 height)
+{
+ IppiSize roi;
+ roi.width = width;
+ roi.height = height;
+ return (pstatus_t)ippiCopy_8u_AC4R(pSrc, srcStep, pDst, dstStep, roi);
+}
+#endif /* WITH_IPP */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_copy(primitives_t* prims)
+{
+ /* Start with the default. */
+ prims->copy_8u = general_copy_8u;
+ prims->copy_8u_AC4r = general_copy_8u_AC4r;
+ /* This is just an alias with void* parameters */
+ prims->copy = (__copy_t)(prims->copy_8u);
+}
+
+#if defined(WITH_SSE2) || defined(WITH_NEON)
+void primitives_init_copy_opt(primitives_t* prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_copy(prims);
+ /* Pick tuned versions if possible. */
+#ifdef WITH_IPP
+ prims->copy_8u = (__copy_8u_t)ippsCopy_8u;
+ prims->copy_8u_AC4r = (__copy_8u_AC4r_t)ippiCopy_8u_AC4r;
+#endif
+ /* Performance with an SSE2 version with no prefetch seemed to be
+ * all over the map vs. memcpy.
+ * Sometimes it was significantly faster, sometimes dreadfully slower,
+ * and it seemed to vary a lot depending on block size and processor.
+ * Hence, no SSE version is used here unless one can be written that
+ * is consistently faster than memcpy.
+ */
+ /* This is just an alias with void* parameters */
+ prims->copy = (__copy_t)(prims->copy_8u);
+}
+#endif
diff --git a/libfreerdp/primitives/prim_internal.h b/libfreerdp/primitives/prim_internal.h
new file mode 100644
index 0000000..cf5c124
--- /dev/null
+++ b/libfreerdp/primitives/prim_internal.h
@@ -0,0 +1,297 @@
+/* prim_internal.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License. Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifndef FREERDP_LIB_PRIM_INTERNAL_H
+#define FREERDP_LIB_PRIM_INTERNAL_H
+
+#include <freerdp/config.h>
+
+#include <freerdp/primitives.h>
+#include <freerdp/api.h>
+
+#ifdef __GNUC__
+#define PRIM_ALIGN_128 __attribute__((aligned(16)))
+#else
+#ifdef _WIN32
+#define PRIM_ALIGN_128 __declspec(align(16))
+#endif
+#endif
+
+#if defined(WITH_SSE2) || defined(WITH_NEON) || defined(WITH_OPENCL)
+#define HAVE_OPTIMIZED_PRIMITIVES 1
+#endif
+
+#if defined(WITH_SSE2) || defined(WITH_NEON)
+#define HAVE_CPU_OPTIMIZED_PRIMITIVES 1
+#endif
+
+#if defined(WITH_SSE2)
+/* Use lddqu for unaligned; load for 16-byte aligned. */
+#define LOAD_SI128(_ptr_) \
+ (((const ULONG_PTR)(_ptr_)&0x0f) ? _mm_lddqu_si128((const __m128i*)(_ptr_)) \
+ : _mm_load_si128((const __m128i*)(_ptr_)))
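+/* LOAD_SI128 trades a runtime alignment test for the faster aligned load on
+ * the common path; see e.g. ssse3_sign_16s in prim_sign_opt.c for a caller. */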
+#endif
+
+static INLINE BYTE* writePixelBGRA(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+ BYTE B, BYTE A)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(format);
+
+ *dst++ = B;
+ *dst++ = G;
+ *dst++ = R;
+ *dst++ = A;
+ return dst;
+}
+
+static INLINE BYTE* writePixelBGRX(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+ BYTE B, BYTE A)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(format);
+ WINPR_UNUSED(A);
+
+ *dst++ = B;
+ *dst++ = G;
+ *dst++ = R;
+ dst++; /* Do not touch alpha */
+
+ return dst;
+}
+
+static INLINE BYTE* writePixelRGBA(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+ BYTE B, BYTE A)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(format);
+
+ *dst++ = R;
+ *dst++ = G;
+ *dst++ = B;
+ *dst++ = A;
+ return dst;
+}
+
+static INLINE BYTE* writePixelRGBX(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+ BYTE B, BYTE A)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(format);
+ WINPR_UNUSED(A);
+
+ *dst++ = R;
+ *dst++ = G;
+ *dst++ = B;
+ dst++; /* Do not touch alpha */
+
+ return dst;
+}
+
+static INLINE BYTE* writePixelABGR(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+ BYTE B, BYTE A)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(format);
+
+ *dst++ = A;
+ *dst++ = B;
+ *dst++ = G;
+ *dst++ = R;
+ return dst;
+}
+
+static INLINE BYTE* writePixelXBGR(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+ BYTE B, BYTE A)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(format);
+ WINPR_UNUSED(A);
+
+ dst++; /* Do not touch alpha */
+ *dst++ = B;
+ *dst++ = G;
+ *dst++ = R;
+ return dst;
+}
+
+static INLINE BYTE* writePixelARGB(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+ BYTE B, BYTE A)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(format);
+
+ *dst++ = A;
+ *dst++ = R;
+ *dst++ = G;
+ *dst++ = B;
+ return dst;
+}
+
+static INLINE BYTE* writePixelXRGB(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+ BYTE B, BYTE A)
+{
+ WINPR_UNUSED(formatSize);
+ WINPR_UNUSED(format);
+ WINPR_UNUSED(A);
+
+ dst++; /* Do not touch alpha */
+ *dst++ = R;
+ *dst++ = G;
+ *dst++ = B;
+ return dst;
+}
+
+static INLINE BYTE* writePixelGenericAlpha(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R,
+ BYTE G, BYTE B, BYTE A)
+{
+ UINT32 color = FreeRDPGetColor(format, R, G, B, A);
+ FreeRDPWriteColor(dst, format, color);
+ return dst + formatSize;
+}
+
+static INLINE BYTE* writePixelGeneric(BYTE* dst, DWORD formatSize, UINT32 format, BYTE R, BYTE G,
+ BYTE B, BYTE A)
+{
+ UINT32 color = FreeRDPGetColor(format, R, G, B, A);
+ FreeRDPWriteColorIgnoreAlpha(dst, format, color);
+ return dst + formatSize;
+}
+
+typedef BYTE* (*fkt_writePixel)(BYTE*, DWORD, UINT32, BYTE, BYTE, BYTE, BYTE);
+
+static INLINE fkt_writePixel getPixelWriteFunction(DWORD format, BOOL useAlpha)
+{
+ switch (format)
+ {
+ case PIXEL_FORMAT_ARGB32:
+ case PIXEL_FORMAT_XRGB32:
+ return useAlpha ? writePixelARGB : writePixelXRGB;
+
+ case PIXEL_FORMAT_ABGR32:
+ case PIXEL_FORMAT_XBGR32:
+ return useAlpha ? writePixelABGR : writePixelXBGR;
+
+ case PIXEL_FORMAT_RGBA32:
+ case PIXEL_FORMAT_RGBX32:
+ return useAlpha ? writePixelRGBA : writePixelRGBX;
+
+ case PIXEL_FORMAT_BGRA32:
+ case PIXEL_FORMAT_BGRX32:
+ return useAlpha ? writePixelBGRA : writePixelBGRX;
+
+ default:
+ return useAlpha ? writePixelGenericAlpha : writePixelGeneric;
+ }
+}
+
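+/* Typical call pattern (sketch; FreeRDPGetBytesPerPixel is assumed to come
+ * from <freerdp/codec/color.h>, like FreeRDPGetColor below):
+ *
+ *   const fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);
+ *   const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+ *   dst = writePixel(dst, formatSize, DstFormat, R, G, B, 0xFF);
+ */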
+static INLINE BYTE CLIP(INT64 X)
+{
+ if (X > 255L)
+ return 255L;
+
+ if (X < 0L)
+ return 0L;
+
+ return (BYTE)X;
+}
+
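+/* Clip to [0, 255], but return the original byte when the clipped value
+ * differs from it by fewer than 30 steps, presumably to absorb small
+ * conversion errors rather than change the pixel. */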
+static INLINE BYTE CONDITIONAL_CLIP(INT32 in, BYTE original)
+{
+ BYTE out = CLIP(in);
+ BYTE diff;
+ if (out > original)
+ diff = out - original;
+ else
+ diff = original - out;
+ if (diff < 30)
+ return original;
+ return out;
+}
+
+/**
+ * | R |   ( | 256     0    403 | |    Y    | )
+ * | G | = ( | 256   -48   -120 | | U - 128 | ) >> 8
+ * | B |   ( | 256   475     0  | | V - 128 | )
+ */
+static INLINE INT32 C(INT32 Y)
+{
+ return (Y)-0L;
+}
+
+static INLINE INT32 D(INT32 U)
+{
+ return (U)-128L;
+}
+
+static INLINE INT32 E(INT32 V)
+{
+ return (V)-128L;
+}
+
+static INLINE BYTE YUV2R(INT32 Y, INT32 U, INT32 V)
+{
+ const INT32 r = (256L * C(Y) + 0L * D(U) + 403L * E(V));
+ const INT32 r8 = r >> 8L;
+ return CLIP(r8);
+}
+
+static INLINE BYTE YUV2G(INT32 Y, INT32 U, INT32 V)
+{
+ const INT32 g = (256L * C(Y) - 48L * D(U) - 120L * E(V));
+ const INT32 g8 = g >> 8L;
+ return CLIP(g8);
+}
+
+static INLINE BYTE YUV2B(INT32 Y, INT32 U, INT32 V)
+{
+ const INT32 b = (256L * C(Y) + 475L * D(U) + 0L * E(V));
+ const INT32 b8 = b >> 8L;
+ return CLIP(b8);
+}
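+
+/* Example: mid-grey YUV (128, 128, 128) gives D(U) = E(V) = 0, so
+ * R = G = B = (256 * 128) >> 8 = 128, as the matrix above requires. */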
+
+/* Function prototypes for all the init/deinit routines. */
+FREERDP_LOCAL void primitives_init_copy(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_set(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_add(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_andor(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_shift(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_sign(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_alphaComp(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_colors(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_YCoCg(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_YUV(primitives_t* prims);
+
+#if defined(WITH_SSE2) || defined(WITH_NEON)
+FREERDP_LOCAL void primitives_init_copy_opt(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_set_opt(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_add_opt(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_andor_opt(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_shift_opt(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_sign_opt(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_alphaComp_opt(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_colors_opt(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_YCoCg_opt(primitives_t* prims);
+FREERDP_LOCAL void primitives_init_YUV_opt(primitives_t* prims);
+#endif
+
+#if defined(WITH_OPENCL)
+FREERDP_LOCAL BOOL primitives_init_opencl(primitives_t* prims);
+#endif
+
+FREERDP_LOCAL primitives_t* primitives_get_by_type(DWORD type);
+
+#endif /* FREERDP_LIB_PRIM_INTERNAL_H */
diff --git a/libfreerdp/primitives/prim_set.c b/libfreerdp/primitives/prim_set.c
new file mode 100644
index 0000000..c4012e6
--- /dev/null
+++ b/libfreerdp/primitives/prim_set.c
@@ -0,0 +1,122 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Routines to set a chunk of memory to a constant.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+#include <freerdp/config.h>
+
+#include <string.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+/* ========================================================================= */
+static pstatus_t general_set_8u(BYTE val, BYTE* pDst, UINT32 len)
+{
+ memset((void*)pDst, (int)val, (size_t)len);
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t general_zero(void* pDst, size_t len)
+{
+ memset(pDst, 0, len);
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ========================================================================= */
+static pstatus_t general_set_32s(INT32 val, INT32* pDst, UINT32 len)
+{
+ INT32* dptr = (INT32*)pDst;
+ size_t span = 0;
+ size_t remaining = 0;
+ primitives_t* prims = NULL;
+
+ if (len < 256)
+ {
+ while (len--)
+ *dptr++ = val;
+
+ return PRIMITIVES_SUCCESS;
+ }
+
+ /* else: doubling memcpy algorithm - each pass copies the already-set span */
+ span = 1;
+ *dptr = val;
+ remaining = len - 1;
+ prims = primitives_get();
+
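+ /* e.g. len = 1000: copies with spans 1, 2, 4, ..., 512 fill the
+ * remaining 999 values in 10 copy_8u calls instead of 999 scalar stores. */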
+ while (remaining)
+ {
+ size_t thiswidth = span;
+
+ if (thiswidth > remaining)
+ thiswidth = remaining;
+
+ prims->copy_8u((BYTE*)dptr, (BYTE*)(dptr + span), thiswidth << 2);
+ remaining -= thiswidth;
+ span <<= 1;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t general_set_32u(UINT32 val, UINT32* pDst, UINT32 len)
+{
+ UINT32* dptr = (UINT32*)pDst;
+ size_t span = 0;
+ size_t remaining = 0;
+ primitives_t* prims = NULL;
+
+ if (len < 256)
+ {
+ while (len--)
+ *dptr++ = val;
+
+ return PRIMITIVES_SUCCESS;
+ }
+
+ /* else: doubling memcpy algorithm - each pass copies the already-set span */
+ span = 1;
+ *dptr = val;
+ remaining = len - 1;
+ prims = primitives_get();
+
+ while (remaining)
+ {
+ size_t thiswidth = span;
+
+ if (thiswidth > remaining)
+ thiswidth = remaining;
+
+ prims->copy_8u((BYTE*)dptr, (BYTE*)(dptr + span), thiswidth << 2);
+ remaining -= thiswidth;
+ span <<= 1;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_set(primitives_t* prims)
+{
+ /* Start with the default. */
+ prims->set_8u = general_set_8u;
+ prims->set_32s = general_set_32s;
+ prims->set_32u = general_set_32u;
+ prims->zero = general_zero;
+}
diff --git a/libfreerdp/primitives/prim_set_opt.c b/libfreerdp/primitives/prim_set_opt.c
new file mode 100644
index 0000000..546d1ac
--- /dev/null
+++ b/libfreerdp/primitives/prim_set_opt.c
@@ -0,0 +1,256 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized routines to set a chunk of memory to a constant.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+#include <freerdp/config.h>
+
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#endif /* WITH_SSE2 */
+#ifdef WITH_IPP
+#include <ipps.h>
+#endif /* WITH_IPP */
+
+#include "prim_internal.h"
+
+static primitives_t* generic = NULL;
+
+/* ========================================================================= */
+#ifdef WITH_SSE2
+#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
+{
+ BYTE byte = 0;
+ BYTE* dptr = NULL;
+ __m128i xmm0;
+ size_t count = 0;
+
+ if (len < 16)
+ return generic->set_8u(val, pDst, len);
+
+ byte = val;
+ dptr = (BYTE*)pDst;
+
+ /* Seek 16-byte alignment. */
+ while ((ULONG_PTR)dptr & 0x0f)
+ {
+ *dptr++ = byte;
+
+ if (--len == 0)
+ return PRIMITIVES_SUCCESS;
+ }
+
+ xmm0 = _mm_set1_epi8(byte);
+ /* Cover 256-byte chunks via SSE register stores. */
+ count = len >> 8;
+ len -= count << 8;
+
+ /* Do 256-byte chunks using one XMM register. */
+ while (count--)
+ {
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ }
+
+ /* Cover 16-byte chunks via SSE register stores. */
+ count = len >> 4;
+ len -= count << 4;
+
+ /* Do 16-byte chunks using one XMM register. */
+ while (count--)
+ {
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 16;
+ }
+
+ /* Do leftover bytes. */
+ while (len--)
+ *dptr++ = byte;
+
+ return PRIMITIVES_SUCCESS;
+}
+#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+#ifdef WITH_SSE2
+#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 len)
+{
+ const primitives_t* prim = primitives_get_generic();
+ UINT32* dptr = (UINT32*)pDst;
+ __m128i xmm0;
+ size_t count = 0;
+
+ /* If really short, just do it here. */
+ if (len < 32)
+ {
+ while (len--)
+ *dptr++ = val;
+
+ return PRIMITIVES_SUCCESS;
+ }
+
+ /* Ensure 16-byte alignment is reachable: if the pointer is not even
+ * 4-byte aligned, the UINT32 stores below can never get there. */
+ if (((ULONG_PTR)dptr & 0x03) != 0)
+ {
+ return prim->set_32u(val, pDst, len);
+ }
+
+ /* Seek 16-byte alignment. */
+ while ((ULONG_PTR)dptr & 0x0f)
+ {
+ *dptr++ = val;
+
+ if (--len == 0)
+ return PRIMITIVES_SUCCESS;
+ }
+
+ xmm0 = _mm_set1_epi32(val);
+ /* Cover 256-byte chunks via SSE register stores. */
+ count = len >> 6;
+ len -= count << 6;
+
+ /* Do 256-byte chunks using one XMM register. */
+ while (count--)
+ {
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ }
+
+ /* Cover 16-byte chunks via SSE register stores. */
+ count = len >> 2;
+ len -= count << 2;
+
+ /* Do 16-byte chunks using one XMM register. */
+ while (count--)
+ {
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 4;
+ }
+
+ /* Do leftover values. */
+ while (len--)
+ *dptr++ = val;
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
+{
+ UINT32 uval = *((UINT32*)&val);
+ return sse2_set_32u(uval, (UINT32*)pDst, len);
+}
+#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+/* ------------------------------------------------------------------------- */
+static pstatus_t ipp_wrapper_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, INT32 len)
+{
+ /* A little type conversion, then use the signed version. */
+ INT32 sval = *((INT32*)&val);
+ return ippsSet_32s(sval, (INT32*)pDst, len);
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_set(prims);
+ /* Pick tuned versions if possible. */
+#ifdef WITH_IPP
+ prims->set_8u = (__set_8u_t)ippsSet_8u;
+ prims->set_32s = (__set_32s_t)ippsSet_32s;
+ prims->set_32u = (__set_32u_t)ipp_wrapper_set_32u;
+ prims->zero = (__zero_t)ippsZero_8u;
+#elif defined(WITH_SSE2)
+
+ if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->set_8u = sse2_set_8u;
+ prims->set_32s = sse2_set_32s;
+ prims->set_32u = sse2_set_32u;
+ }
+
+#endif
+}
diff --git a/libfreerdp/primitives/prim_shift.c b/libfreerdp/primitives/prim_shift.c
new file mode 100644
index 0000000..3729266
--- /dev/null
+++ b/libfreerdp/primitives/prim_shift.c
@@ -0,0 +1,115 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Shift operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+/* ------------------------------------------------------------------------- */
+static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16* pDst, UINT32 len)
+{
+ if (val == 0)
+ return PRIMITIVES_SUCCESS;
+ if (val >= 16)
+ return -1;
+
+ while (len--)
+ *pDst++ = (INT16)((UINT16)*pSrc++ << val);
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static INLINE pstatus_t general_rShiftC_16s(const INT16* pSrc, UINT32 val, INT16* pDst, UINT32 len)
+{
+ if (val == 0)
+ return PRIMITIVES_SUCCESS;
+ if (val >= 16)
+ return -1;
+
+ while (len--)
+ *pDst++ = *pSrc++ >> val;
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static INLINE pstatus_t general_lShiftC_16u(const UINT16* pSrc, UINT32 val, UINT16* pDst,
+ UINT32 len)
+{
+ if (val == 0)
+ return PRIMITIVES_SUCCESS;
+ if (val >= 16)
+ return -1;
+
+ while (len--)
+ *pDst++ = (UINT16)(*pSrc++ << val);
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static INLINE pstatus_t general_rShiftC_16u(const UINT16* pSrc, UINT32 val, UINT16* pDst,
+ UINT32 len)
+{
+ if (val == 0)
+ return PRIMITIVES_SUCCESS;
+ if (val >= 16)
+ return -1;
+
+ while (len--)
+ *pDst++ = *pSrc++ >> val;
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static INLINE pstatus_t general_shiftC_16s(const INT16* pSrc, INT32 val, INT16* pDst, UINT32 len)
+{
+ if (val == 0)
+ return PRIMITIVES_SUCCESS;
+
+ if (val < 0)
+ return general_rShiftC_16s(pSrc, -val, pDst, len);
+ else
+ return general_lShiftC_16s(pSrc, val, pDst, len);
+}
+
+/* ------------------------------------------------------------------------- */
+static INLINE pstatus_t general_shiftC_16u(const UINT16* pSrc, INT32 val, UINT16* pDst, UINT32 len)
+{
+ if (val == 0)
+ return PRIMITIVES_SUCCESS;
+
+ if (val < 0)
+ return general_rShiftC_16u(pSrc, -val, pDst, len);
+ else
+ return general_lShiftC_16u(pSrc, val, pDst, len);
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_shift(primitives_t* prims)
+{
+ /* Start with the default. */
+ prims->lShiftC_16s = general_lShiftC_16s;
+ prims->rShiftC_16s = general_rShiftC_16s;
+ prims->lShiftC_16u = general_lShiftC_16u;
+ prims->rShiftC_16u = general_rShiftC_16u;
+ /* Wrappers */
+ prims->shiftC_16s = general_shiftC_16s;
+ prims->shiftC_16u = general_shiftC_16u;
+}
diff --git a/libfreerdp/primitives/prim_shift_opt.c b/libfreerdp/primitives/prim_shift_opt.c
new file mode 100644
index 0000000..9ac9533
--- /dev/null
+++ b/libfreerdp/primitives/prim_shift_opt.c
@@ -0,0 +1,80 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Shift operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+#include <ipps.h>
+#endif /* WITH_IPP */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+static primitives_t* generic = NULL;
+
+#ifdef WITH_SSE2
+#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16,
+ *dptr++ = (INT16)((UINT16)*sptr++ << val))
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16,
+ *dptr++ = *sptr++ >> val)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16,
+ *dptr++ = (UINT16)(*sptr++ << val))
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16,
+ *dptr++ = *sptr++ >> val)
+#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
+ * depending on the sign of val. To avoid using the deprecated inplace
+ * routines, a wrapper can use the src for the dest.
+ */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_shift(prims);
+#if defined(WITH_IPP)
+ prims->lShiftC_16s = ippsLShiftC_16s;
+ prims->rShiftC_16s = ippsRShiftC_16s;
+ prims->lShiftC_16u = ippsLShiftC_16u;
+ prims->rShiftC_16u = ippsRShiftC_16u;
+#elif defined(WITH_SSE2)
+
+ if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
+ IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->lShiftC_16s = sse2_lShiftC_16s;
+ prims->rShiftC_16s = sse2_rShiftC_16s;
+ prims->lShiftC_16u = sse2_lShiftC_16u;
+ prims->rShiftC_16u = sse2_rShiftC_16u;
+ }
+
+#endif
+}
diff --git a/libfreerdp/primitives/prim_sign.c b/libfreerdp/primitives/prim_sign.c
new file mode 100644
index 0000000..d89dc47
--- /dev/null
+++ b/libfreerdp/primitives/prim_sign.c
@@ -0,0 +1,42 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Sign operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+/* ----------------------------------------------------------------------------
+ * Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
+ */
+static pstatus_t general_sign_16s(const INT16* pSrc, INT16* pDst, UINT32 len)
+{
+ while (len--)
+ {
+ INT16 src = *pSrc++;
+ *pDst++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_sign(primitives_t* prims)
+{
+ /* Start with the default. */
+ prims->sign_16s = general_sign_16s;
+}
diff --git a/libfreerdp/primitives/prim_sign_opt.c b/libfreerdp/primitives/prim_sign_opt.c
new file mode 100644
index 0000000..dae76a6
--- /dev/null
+++ b/libfreerdp/primitives/prim_sign_opt.c
@@ -0,0 +1,185 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized sign operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#endif /* WITH_SSE2 */
+
+#include "prim_internal.h"
+
+static primitives_t* generic = NULL;
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
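+/* _mm_sign_epi16(x, y) negates each lane of x where y < 0, zeroes it where
+ * y == 0 and leaves it unchanged where y > 0; applied to a register of ones
+ * it therefore yields exactly the -1/0/+1 sign of y. */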
+static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
+ UINT32 len)
+{
+ const INT16* sptr = (const INT16*)pSrc;
+ INT16* dptr = (INT16*)pDst;
+ size_t count = 0;
+
+ if (len < 16)
+ {
+ return generic->sign_16s(pSrc, pDst, len);
+ }
+
+ /* The destination must be even-aligned, or 16-byte alignment can never be reached. */
+ if ((ULONG_PTR)pDst & 0x01)
+ {
+ return generic->sign_16s(pSrc, pDst, len);
+ }
+
+ /* Seek 16-byte alignment. */
+ while ((ULONG_PTR)dptr & 0x0f)
+ {
+ INT16 src = *sptr++;
+ *dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
+
+ if (--len == 0)
+ return PRIMITIVES_SUCCESS;
+ }
+
+ /* Do 32-short chunks using 8 XMM registers. */
+ count = len >> 5; /* / 32 */
+ len -= count << 5; /* * 32 */
+
+ if ((ULONG_PTR)sptr & 0x0f)
+ {
+ /* Unaligned */
+ while (count--)
+ {
+ __m128i xmm0;
+ __m128i xmm1;
+ __m128i xmm2;
+ __m128i xmm3;
+ __m128i xmm4;
+ __m128i xmm5;
+ __m128i xmm6;
+ __m128i xmm7;
+ xmm0 = _mm_set1_epi16(0x0001U);
+ xmm1 = _mm_set1_epi16(0x0001U);
+ xmm2 = _mm_set1_epi16(0x0001U);
+ xmm3 = _mm_set1_epi16(0x0001U);
+ xmm4 = _mm_lddqu_si128((const __m128i*)sptr);
+ sptr += 8;
+ xmm5 = _mm_lddqu_si128((const __m128i*)sptr);
+ sptr += 8;
+ xmm6 = _mm_lddqu_si128((const __m128i*)sptr);
+ sptr += 8;
+ xmm7 = _mm_lddqu_si128((const __m128i*)sptr);
+ sptr += 8;
+ xmm0 = _mm_sign_epi16(xmm0, xmm4);
+ xmm1 = _mm_sign_epi16(xmm1, xmm5);
+ xmm2 = _mm_sign_epi16(xmm2, xmm6);
+ xmm3 = _mm_sign_epi16(xmm3, xmm7);
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 8;
+ _mm_store_si128((__m128i*)dptr, xmm1);
+ dptr += 8;
+ _mm_store_si128((__m128i*)dptr, xmm2);
+ dptr += 8;
+ _mm_store_si128((__m128i*)dptr, xmm3);
+ dptr += 8;
+ }
+ }
+ else
+ {
+ /* Aligned */
+ while (count--)
+ {
+ __m128i xmm0;
+ __m128i xmm1;
+ __m128i xmm2;
+ __m128i xmm3;
+ __m128i xmm4;
+ __m128i xmm5;
+ __m128i xmm6;
+ __m128i xmm7;
+ xmm0 = _mm_set1_epi16(0x0001U);
+ xmm1 = _mm_set1_epi16(0x0001U);
+ xmm2 = _mm_set1_epi16(0x0001U);
+ xmm3 = _mm_set1_epi16(0x0001U);
+ xmm4 = _mm_load_si128((const __m128i*)sptr);
+ sptr += 8;
+ xmm5 = _mm_load_si128((const __m128i*)sptr);
+ sptr += 8;
+ xmm6 = _mm_load_si128((const __m128i*)sptr);
+ sptr += 8;
+ xmm7 = _mm_load_si128((const __m128i*)sptr);
+ sptr += 8;
+ xmm0 = _mm_sign_epi16(xmm0, xmm4);
+ xmm1 = _mm_sign_epi16(xmm1, xmm5);
+ xmm2 = _mm_sign_epi16(xmm2, xmm6);
+ xmm3 = _mm_sign_epi16(xmm3, xmm7);
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 8;
+ _mm_store_si128((__m128i*)dptr, xmm1);
+ dptr += 8;
+ _mm_store_si128((__m128i*)dptr, xmm2);
+ dptr += 8;
+ _mm_store_si128((__m128i*)dptr, xmm3);
+ dptr += 8;
+ }
+ }
+
+ /* Do 8-short chunks using two XMM registers. */
+ count = len >> 3;
+ len -= count << 3;
+
+ while (count--)
+ {
+ __m128i xmm0 = _mm_set1_epi16(0x0001U);
+ __m128i xmm1 = LOAD_SI128(sptr);
+ sptr += 8;
+ xmm0 = _mm_sign_epi16(xmm0, xmm1);
+ _mm_store_si128((__m128i*)dptr, xmm0);
+ dptr += 8;
+ }
+
+ /* Do leftovers. */
+ while (len--)
+ {
+ INT16 src = *sptr++;
+ *dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
+{
+ generic = primitives_get_generic();
+ primitives_init_sign(prims);
+ /* Pick tuned versions if possible. */
+ /* I didn't spot an IPP version of this. */
+#if defined(WITH_SSE2)
+
+ if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
+ IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->sign_16s = ssse3_sign_16s;
+ }
+
+#endif
+}
diff --git a/libfreerdp/primitives/prim_templates.h b/libfreerdp/primitives/prim_templates.h
new file mode 100644
index 0000000..5ab85a8
--- /dev/null
+++ b/libfreerdp/primitives/prim_templates.h
@@ -0,0 +1,444 @@
+/* prim_templates.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License. Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ */
+
+#ifdef __GNUC__
+#pragma once
+#endif
+
+#ifndef FREERDP_LIB_PRIM_TEMPLATES_H
+#define FREERDP_LIB_PRIM_TEMPLATES_H
+
+/* These are macro templates for SSE (and potentially NEON) routines that
+ * apply a simple SIMD operation over an array of data. Since so much of
+ * this code is shared except for the operation itself, the templates are
+ * expanded rather than duplicating code. The naming convention encodes
+ * the parameters: S=Source param; C=Constant; D=Destination.
+ * All the macros take a fallback procedure for data that is too small
+ * and an operation done "the slow way" for use at 16-byte edges.
+ */
+
+/* SSE3 note: If someone needs to support an SSE2 version of these without
+ * SSE3 support, an alternative version could be added that merely checks
+ * that 16-byte alignment on both destination and source(s) can be
+ * achieved, rather than using LDDQU for unaligned reads.
+ */
+
+/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
+ * It cannot easily do that if the value is stored in a variable,
+ * so don't save it as an intermediate value.
+ */
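+
+/* Illustrative instantiation (hypothetical name, shown only as an example of
+ * how these templates are meant to be expanded): a left-shift-by-constant
+ * primitive could be generated as
+ *
+ *   SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s,
+ *                    _mm_slli_epi16, *dptr++ = (INT16)(*sptr++ << val))
+ *
+ * which expands into a complete routine with the small-length fallback, the
+ * destination-alignment preamble and the unrolled SSE loops wrapped around
+ * the single _mm_slli_epi16 operation.
+ */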
+
+/* ----------------------------------------------------------------------------
+ * SCD = Source, Constant, Destination
+ */
+#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+ static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \
+ { \
+ INT32 shifts = 0; \
+ UINT32 offBeatMask; \
+ const _type_* sptr = pSrc; \
+ _type_* dptr = pDst; \
+ int count; \
+ if (val == 0) \
+ return PRIMITIVES_SUCCESS; \
+ if (val >= 16) \
+ return -1; \
+ if (len < 16) /* pointless if too small */ \
+ { \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ if (sizeof(_type_) == 1) \
+ shifts = 1; \
+ else if (sizeof(_type_) == 2) \
+ shifts = 2; \
+ else if (sizeof(_type_) == 4) \
+ shifts = 3; \
+ else if (sizeof(_type_) == 8) \
+ shifts = 4; \
+ offBeatMask = (1 << (shifts - 1)) - 1; \
+ if ((ULONG_PTR)pDst & offBeatMask) \
+ { \
+            /* Stepping element-by-element can never reach a 16-byte boundary. */ \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ /* Get to the 16-byte boundary now. */ \
+ while ((ULONG_PTR)dptr & 0x0f) \
+ { \
+ _slowWay_; \
+ if (--len == 0) \
+ return PRIMITIVES_SUCCESS; \
+ } \
+ /* Use 8 128-bit SSE registers. */ \
+ count = len >> (8 - shifts); \
+ len -= count << (8 - shifts); \
+ if ((const ULONG_PTR)sptr & 0x0f) \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, val); \
+ xmm1 = _op_(xmm1, val); \
+ xmm2 = _op_(xmm2, val); \
+ xmm3 = _op_(xmm3, val); \
+ xmm4 = _op_(xmm4, val); \
+ xmm5 = _op_(xmm5, val); \
+ xmm6 = _op_(xmm6, val); \
+ xmm7 = _op_(xmm7, val); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm5); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm6); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm7); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ else \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, val); \
+ xmm1 = _op_(xmm1, val); \
+ xmm2 = _op_(xmm2, val); \
+ xmm3 = _op_(xmm3, val); \
+ xmm4 = _op_(xmm4, val); \
+ xmm5 = _op_(xmm5, val); \
+ xmm6 = _op_(xmm6, val); \
+ xmm7 = _op_(xmm7, val); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm5); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm6); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm7); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ /* Use a single 128-bit SSE register. */ \
+ count = len >> (5 - shifts); \
+ len -= count << (5 - shifts); \
+ while (count--) \
+ { \
+ __m128i xmm0 = LOAD_SI128(sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, val); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ /* Finish off the remainder. */ \
+ while (len--) \
+ { \
+ _slowWay_; \
+ } \
+ return PRIMITIVES_SUCCESS; \
+ }
+
+/* ----------------------------------------------------------------------------
+ * SCD = Source, Constant, Destination
+ * PRE = preload xmm0 with the constant.
+ */
+#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+ static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 len) \
+ { \
+ int shifts = 0; \
+ UINT32 offBeatMask; \
+ const _type_* sptr = pSrc; \
+ _type_* dptr = pDst; \
+ size_t count; \
+ __m128i xmm0; \
+ if (len < 16) /* pointless if too small */ \
+ { \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ if (sizeof(_type_) == 1) \
+ shifts = 1; \
+ else if (sizeof(_type_) == 2) \
+ shifts = 2; \
+ else if (sizeof(_type_) == 4) \
+ shifts = 3; \
+ else if (sizeof(_type_) == 8) \
+ shifts = 4; \
+ offBeatMask = (1 << (shifts - 1)) - 1; \
+ if ((ULONG_PTR)pDst & offBeatMask) \
+ { \
+            /* Stepping element-by-element can never reach a 16-byte boundary. */ \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ /* Get to the 16-byte boundary now. */ \
+ while ((ULONG_PTR)dptr & 0x0f) \
+ { \
+ _slowWay_; \
+ if (--len == 0) \
+ return PRIMITIVES_SUCCESS; \
+ } \
+ /* Use 4 128-bit SSE registers. */ \
+ count = len >> (7 - shifts); \
+ len -= count << (7 - shifts); \
+ xmm0 = _mm_set1_epi32(val); \
+ if ((const ULONG_PTR)sptr & 0x0f) \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm1 = _op_(xmm1, xmm0); \
+ xmm2 = _op_(xmm2, xmm0); \
+ xmm3 = _op_(xmm3, xmm0); \
+ xmm4 = _op_(xmm4, xmm0); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ else \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm1 = _op_(xmm1, xmm0); \
+ xmm2 = _op_(xmm2, xmm0); \
+ xmm3 = _op_(xmm3, xmm0); \
+ xmm4 = _op_(xmm4, xmm0); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ /* Use a single 128-bit SSE register. */ \
+ count = len >> (5 - shifts); \
+ len -= count << (5 - shifts); \
+ while (count--) \
+ { \
+ __m128i xmm1 = LOAD_SI128(sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm1 = _op_(xmm1, xmm0); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ /* Finish off the remainder. */ \
+ while (len--) \
+ { \
+ _slowWay_; \
+ } \
+ return PRIMITIVES_SUCCESS; \
+ }
+
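+/* Illustrative instantiation (hypothetical name, for example only): a
+ * bitwise-AND-with-constant primitive could be generated as
+ *
+ *   SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u,
+ *                        _mm_and_si128, *dptr++ = *sptr++ & val)
+ *
+ * where the constant is broadcast once into xmm0 before the loops.
+ */
+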
+/* ----------------------------------------------------------------------------
+ * SSD = Source1, Source2, Destination
+ */
+#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+ static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \
+ { \
+ int shifts = 0; \
+ UINT32 offBeatMask; \
+ const _type_* sptr1 = pSrc1; \
+ const _type_* sptr2 = pSrc2; \
+ _type_* dptr = pDst; \
+ size_t count; \
+ if (len < 16) /* pointless if too small */ \
+ { \
+ return _fallback_(pSrc1, pSrc2, pDst, len); \
+ } \
+ if (sizeof(_type_) == 1) \
+ shifts = 1; \
+ else if (sizeof(_type_) == 2) \
+ shifts = 2; \
+ else if (sizeof(_type_) == 4) \
+ shifts = 3; \
+ else if (sizeof(_type_) == 8) \
+ shifts = 4; \
+ offBeatMask = (1 << (shifts - 1)) - 1; \
+ if ((ULONG_PTR)pDst & offBeatMask) \
+ { \
+            /* Stepping element-by-element can never reach a 16-byte boundary. */ \
+ return _fallback_(pSrc1, pSrc2, pDst, len); \
+ } \
+ /* Get to the 16-byte boundary now. */ \
+ while ((ULONG_PTR)dptr & 0x0f) \
+ { \
+ pstatus_t status; \
+ status = _slowWay_; \
+ if (status != PRIMITIVES_SUCCESS) \
+ return status; \
+ if (--len == 0) \
+ return PRIMITIVES_SUCCESS; \
+ } \
+ /* Use 4 128-bit SSE registers. */ \
+ count = len >> (7 - shifts); \
+ len -= count << (7 - shifts); \
+ if (((const ULONG_PTR)sptr1 & 0x0f) || ((const ULONG_PTR)sptr2 & 0x0f)) \
+ { \
+ /* Unaligned loads */ \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, xmm4); \
+ xmm1 = _op_(xmm1, xmm5); \
+ xmm2 = _op_(xmm2, xmm6); \
+ xmm3 = _op_(xmm3, xmm7); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ else \
+ { \
+ /* Aligned loads */ \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, xmm4); \
+ xmm1 = _op_(xmm1, xmm5); \
+ xmm2 = _op_(xmm2, xmm6); \
+ xmm3 = _op_(xmm3, xmm7); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ /* Use a single 128-bit SSE register. */ \
+ count = len >> (5 - shifts); \
+ len -= count << (5 - shifts); \
+ while (count--) \
+ { \
+ __m128i xmm0 = LOAD_SI128(sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm1 = LOAD_SI128(sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, xmm1); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ /* Finish off the remainder. */ \
+ while (len--) \
+ { \
+ _slowWay_; \
+ } \
+ return PRIMITIVES_SUCCESS; \
+ }
+
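+/* Illustrative instantiation (hypothetical name, for example only): a
+ * saturated 16-bit add primitive could be generated as
+ *
+ *   SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
+ *                    generic->add_16s(sptr1++, sptr2++, dptr++, 1))
+ *
+ * where the slow-way expression processes one element at a time at the edges.
+ */
+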
+#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */
diff --git a/libfreerdp/primitives/primitives.c b/libfreerdp/primitives/primitives.c
new file mode 100644
index 0000000..da8bd40
--- /dev/null
+++ b/libfreerdp/primitives/primitives.c
@@ -0,0 +1,412 @@
+/* primitives.c
+ * This code queries processor features and calls the init/deinit routines.
+ * vi:ts=4 sw=4
+ *
+ * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Copyright 2019 David Fort <contact@hardening-consulting.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <string.h>
+#include <stdlib.h>
+
+#include <winpr/synch.h>
+#include <winpr/sysinfo.h>
+#include <winpr/crypto.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+
+#include <freerdp/log.h>
+#define TAG FREERDP_TAG("primitives")
+
+/* hints to know which kind of primitives to use */
+static primitive_hints primitivesHints = PRIMITIVES_AUTODETECT;
+static BOOL primitives_init_optimized(primitives_t* prims);
+
+void primitives_set_hints(primitive_hints hints)
+{
+ primitivesHints = hints;
+}
+
+primitive_hints primitives_get_hints(void)
+{
+ return primitivesHints;
+}
+
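+/* Typical usage (illustrative sketch, not taken from a caller in this tree):
+ * select a backend before the first call to primitives_get(), which performs
+ * the one-time initialization:
+ *
+ *   primitives_set_hints(PRIMITIVES_ONLY_CPU);
+ *   primitives_t* prims = primitives_get();
+ *   prims->add_16s(src1, src2, dst, len);
+ */
+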
+/* Singleton pointer used throughout the program when requested. */
+static primitives_t pPrimitivesGeneric = { 0 };
+static INIT_ONCE generic_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
+
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+static primitives_t pPrimitivesCpu = { 0 };
+static INIT_ONCE cpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
+
+#endif
+#if defined(WITH_OPENCL)
+static primitives_t pPrimitivesGpu = { 0 };
+static INIT_ONCE gpu_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
+
+#endif
+
+static INIT_ONCE auto_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
+
+static primitives_t pPrimitives = { 0 };
+
+/* ------------------------------------------------------------------------- */
+static BOOL primitives_init_generic(primitives_t* prims)
+{
+ primitives_init_add(prims);
+ primitives_init_andor(prims);
+ primitives_init_alphaComp(prims);
+ primitives_init_copy(prims);
+ primitives_init_set(prims);
+ primitives_init_shift(prims);
+ primitives_init_sign(prims);
+ primitives_init_colors(prims);
+ primitives_init_YCoCg(prims);
+ primitives_init_YUV(prims);
+ prims->uninit = NULL;
+ return TRUE;
+}
+
+static BOOL CALLBACK primitives_init_generic_cb(PINIT_ONCE once, PVOID param, PVOID* context)
+{
+ WINPR_UNUSED(once);
+ WINPR_UNUSED(param);
+ WINPR_UNUSED(context);
+ return primitives_init_generic(&pPrimitivesGeneric);
+}
+
+static BOOL primitives_init_optimized(primitives_t* prims)
+{
+ primitives_init_generic(prims);
+
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+ primitives_init_add_opt(prims);
+ primitives_init_andor_opt(prims);
+ primitives_init_alphaComp_opt(prims);
+ primitives_init_copy_opt(prims);
+ primitives_init_set_opt(prims);
+ primitives_init_shift_opt(prims);
+ primitives_init_sign_opt(prims);
+ primitives_init_colors_opt(prims);
+ primitives_init_YCoCg_opt(prims);
+ primitives_init_YUV_opt(prims);
+ prims->flags |= PRIM_FLAGS_HAVE_EXTCPU;
+#endif
+ return TRUE;
+}
+
+typedef struct
+{
+ BYTE* channels[3];
+ UINT32 steps[3];
+ prim_size_t roi;
+ BYTE* outputBuffer;
+ UINT32 outputStride;
+ UINT32 testedFormat;
+} primitives_YUV_benchmark;
+
+static void primitives_YUV_benchmark_free(primitives_YUV_benchmark* bench)
+{
+ if (!bench)
+ return;
+
+ free(bench->outputBuffer);
+
+ for (int i = 0; i < 3; i++)
+ free(bench->channels[i]);
+ memset(bench, 0, sizeof(primitives_YUV_benchmark));
+}
+
+static primitives_YUV_benchmark* primitives_YUV_benchmark_init(primitives_YUV_benchmark* ret)
+{
+ prim_size_t* roi = NULL;
+ if (!ret)
+ return NULL;
+
+ memset(ret, 0, sizeof(primitives_YUV_benchmark));
+ roi = &ret->roi;
+ roi->width = 1024;
+ roi->height = 768;
+ ret->outputStride = roi->width * 4;
+ ret->testedFormat = PIXEL_FORMAT_BGRA32;
+
+ ret->outputBuffer = calloc(ret->outputStride, roi->height);
+ if (!ret->outputBuffer)
+ goto fail;
+
+ for (int i = 0; i < 3; i++)
+ {
+ BYTE* buf = ret->channels[i] = calloc(roi->width, roi->height);
+ if (!buf)
+ goto fail;
+
+ winpr_RAND(buf, 1ull * roi->width * roi->height);
+ ret->steps[i] = roi->width;
+ }
+
+ return ret;
+
+fail:
+ primitives_YUV_benchmark_free(ret);
+	return NULL; /* the buffers were released above; signal failure to the caller */
+}
+
+static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims,
+ UINT64 runTime, UINT32* computations)
+{
+ ULONGLONG dueDate = 0;
+ const BYTE* channels[3] = { 0 };
+ pstatus_t status = 0;
+
+ *computations = 0;
+
+ for (size_t i = 0; i < 3; i++)
+ channels[i] = bench->channels[i];
+
+ /* do a first dry run to initialize cache and such */
+ status = prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer,
+ bench->outputStride, bench->testedFormat, &bench->roi);
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* let's run the benchmark */
+ dueDate = GetTickCount64() + runTime;
+ while (GetTickCount64() < dueDate)
+ {
+ pstatus_t cstatus =
+ prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer,
+ bench->outputStride, bench->testedFormat, &bench->roi);
+ if (cstatus != PRIMITIVES_SUCCESS)
+ return FALSE;
+ *computations = *computations + 1;
+ }
+ return TRUE;
+}
+
+static BOOL primitives_autodetect_best(primitives_t* prims)
+{
+ BOOL ret = FALSE;
+ struct prim_benchmark
+ {
+ const char* name;
+ primitives_t* prims;
+ UINT32 flags;
+ UINT32 count;
+ };
+
+ struct prim_benchmark testcases[] =
+ {
+ { "generic", NULL, PRIMITIVES_PURE_SOFT, 0 },
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+ { "optimized", NULL, PRIMITIVES_ONLY_CPU, 0 },
+#endif
+#if defined(WITH_OPENCL)
+ { "opencl", NULL, PRIMITIVES_ONLY_GPU, 0 },
+#endif
+ };
+ const struct prim_benchmark* best = NULL;
+
+#if !defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) && !defined(WITH_OPENCL)
+ {
+ struct prim_benchmark* cur = &testcases[0];
+ cur->prims = primitives_get_by_type(cur->flags);
+ if (!cur->prims)
+ {
+ WLog_WARN(TAG, "Failed to initialize %s primitives", cur->name);
+ return FALSE;
+ }
+ WLog_DBG(TAG, "primitives benchmark: only one backend, skipping...");
+ best = cur;
+ }
+#else
+ {
+ UINT64 benchDuration = 150; /* 150 ms */
+ primitives_YUV_benchmark bench = { 0 };
+ primitives_YUV_benchmark* yuvBench = primitives_YUV_benchmark_init(&bench);
+ if (!yuvBench)
+ return FALSE;
+
+ WLog_DBG(TAG, "primitives benchmark result:");
+ for (size_t x = 0; x < ARRAYSIZE(testcases); x++)
+ {
+ struct prim_benchmark* cur = &testcases[x];
+ cur->prims = primitives_get_by_type(cur->flags);
+ if (!cur->prims)
+ {
+ WLog_WARN(TAG, "Failed to initialize %s primitives", cur->name);
+ continue;
+ }
+ if (!primitives_YUV_benchmark_run(yuvBench, cur->prims, benchDuration, &cur->count))
+ {
+ WLog_WARN(TAG, "error running %s YUV bench", cur->name);
+ continue;
+ }
+
+ WLog_DBG(TAG, " * %s= %" PRIu32, cur->name, cur->count);
+ if (!best || (best->count < cur->count))
+ best = cur;
+ }
+ primitives_YUV_benchmark_free(yuvBench);
+ }
+#endif
+
+ if (!best)
+ {
+ WLog_ERR(TAG, "No primitives to test, aborting.");
+ goto out;
+ }
+	/* finally, install the winning implementation */
+ *prims = *best->prims;
+
+ WLog_DBG(TAG, "primitives autodetect, using %s", best->name);
+ ret = TRUE;
+out:
+ if (!ret)
+ *prims = pPrimitivesGeneric;
+
+ return ret;
+}
+
+#if defined(WITH_OPENCL)
+static BOOL CALLBACK primitives_init_gpu_cb(PINIT_ONCE once, PVOID param, PVOID* context)
+{
+ WINPR_UNUSED(once);
+ WINPR_UNUSED(param);
+ WINPR_UNUSED(context);
+
+ if (!primitives_init_opencl(&pPrimitivesGpu))
+ return FALSE;
+
+ return TRUE;
+}
+#endif
+
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+static BOOL CALLBACK primitives_init_cpu_cb(PINIT_ONCE once, PVOID param, PVOID* context)
+{
+ WINPR_UNUSED(once);
+ WINPR_UNUSED(param);
+ WINPR_UNUSED(context);
+
+ if (!primitives_init_optimized(&pPrimitivesCpu))
+ return FALSE;
+
+ return TRUE;
+}
+#endif
+
+static BOOL CALLBACK primitives_auto_init_cb(PINIT_ONCE once, PVOID param, PVOID* context)
+{
+ WINPR_UNUSED(once);
+ WINPR_UNUSED(param);
+ WINPR_UNUSED(context);
+
+ return primitives_init(&pPrimitives, primitivesHints);
+}
+
+BOOL primitives_init(primitives_t* p, primitive_hints hints)
+{
+ switch (hints)
+ {
+ case PRIMITIVES_AUTODETECT:
+ return primitives_autodetect_best(p);
+ case PRIMITIVES_PURE_SOFT:
+ *p = pPrimitivesGeneric;
+ return TRUE;
+		case PRIMITIVES_ONLY_CPU:
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+			*p = pPrimitivesCpu;
+			return TRUE;
+#endif
+			/* fall through when no CPU-optimized backend was compiled in */
+		case PRIMITIVES_ONLY_GPU:
+#if defined(WITH_OPENCL)
+			*p = pPrimitivesGpu;
+			return TRUE;
+#endif
+			/* fall through when OpenCL support was not compiled in */
+ default:
+ WLog_ERR(TAG, "unknown hint %d", hints);
+ return FALSE;
+ }
+}
+
+void primitives_uninit(void)
+{
+#if defined(WITH_OPENCL)
+ if (pPrimitivesGpu.uninit)
+ pPrimitivesGpu.uninit();
+#endif
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+ if (pPrimitivesCpu.uninit)
+ pPrimitivesCpu.uninit();
+#endif
+ if (pPrimitivesGeneric.uninit)
+ pPrimitivesGeneric.uninit();
+}
+
+/* ------------------------------------------------------------------------- */
+static void setup(void)
+{
+ InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, NULL, NULL);
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+ InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL);
+#endif
+#if defined(WITH_OPENCL)
+ InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, NULL, NULL);
+#endif
+ InitOnceExecuteOnce(&auto_primitives_InitOnce, primitives_auto_init_cb, NULL, NULL);
+}
+
+primitives_t* primitives_get(void)
+{
+ setup();
+ return &pPrimitives;
+}
+
+primitives_t* primitives_get_generic(void)
+{
+ InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, NULL, NULL);
+ return &pPrimitivesGeneric;
+}
+
+primitives_t* primitives_get_by_type(DWORD type)
+{
+ InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, NULL, NULL);
+
+ switch (type)
+ {
+		case PRIMITIVES_ONLY_GPU:
+#if defined(WITH_OPENCL)
+			if (!InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, NULL, NULL))
+				return NULL;
+			return &pPrimitivesGpu;
+#endif
+			/* fall through to the CPU backend when OpenCL is not compiled in */
+		case PRIMITIVES_ONLY_CPU:
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+			if (!InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL))
+				return NULL;
+			return &pPrimitivesCpu;
+#endif
+			/* fall through to the generic backend otherwise */
+ case PRIMITIVES_PURE_SOFT:
+ default:
+ return &pPrimitivesGeneric;
+ }
+}
+
+DWORD primitives_flags(primitives_t* p)
+{
+ return p->flags;
+}
diff --git a/libfreerdp/primitives/primitives.cl b/libfreerdp/primitives/primitives.cl
new file mode 100644
index 0000000..5e094df
--- /dev/null
+++ b/libfreerdp/primitives/primitives.cl
@@ -0,0 +1,463 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Optimized operations using openCL
+ * vi:ts=4 sw=4
+ *
+ * Copyright 2019 David Fort <contact@hardening-consulting.com>
+ * Copyright 2019 Rangee Gmbh
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#define STRINGIFY(x) #x
+
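+/* The kernel source below is wrapped in STRINGIFY() so that a host C file
+ * can obtain it as a string literal via #include; the OpenCL backend (see
+ * primitives_init_opencl in this tree) is then expected to hand it to
+ * clCreateProgramWithSource() at run time. */
+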
+STRINGIFY(
+uchar clamp_uc(int v, short l, short h)
+{
+ if (v > h)
+ v = h;
+ if (v < l)
+ v = l;
+ return (uchar)v;
+}
+
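+/* The integer coefficients used by all kernels below correspond to the
+ * BT.709 conversion constants scaled by 256 (403 ~ 1.5748*256 for V->R,
+ * 48 ~ 0.1873*256 and 120 ~ 0.4681*256 for the G terms, 475 ~ 1.8556*256
+ * for U->B); the final >> 8 undoes the scaling. */
+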
+__kernel void yuv420_to_rgba_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short Udim = bufU[(y / 2) * strideU + (x / 2)] - 128;
+ short Vdim = bufV[(y / 2) * strideV + (x / 2)] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = clamp_uc((y256 + (403 * Vdim)) >> 8, 0, 255); /* R */
+ destPtr[1] = clamp_uc((y256 - (48 * Udim) - (120 * Vdim)) >> 8 , 0, 255); /* G */
+ destPtr[2] = clamp_uc((y256 + (475 * Udim)) >> 8, 0, 255); /* B */
+ /* A */
+}
+
+__kernel void yuv420_to_abgr_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[(y / 2) * strideU + (x / 2)] - 128;
+ short V = bufV[(y / 2) * strideV + (x / 2)] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ /* A */
+ destPtr[1] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[3] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+}
+
+__kernel void yuv444_to_abgr_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[y * strideU + x] - 128;
+ short V = bufV[y * strideV + x] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ /* A */
+ destPtr[1] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[3] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+}
+
+__kernel void yuv444_to_rgba_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[y * strideU + x] - 128;
+ short V = bufV[y * strideV + x] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+ destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[2] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ /* A */
+}
+
+__kernel void yuv420_to_rgbx_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short Udim = bufU[(y / 2) * strideU + (x / 2)] - 128;
+ short Vdim = bufV[(y / 2) * strideV + (x / 2)] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = clamp_uc((y256 + (403 * Vdim)) >> 8, 0, 255); /* R */
+ destPtr[1] = clamp_uc((y256 - (48 * Udim) - (120 * Vdim)) >> 8 , 0, 255); /* G */
+ destPtr[2] = clamp_uc((y256 + (475 * Udim)) >> 8, 0, 255); /* B */
+ destPtr[3] = 0xff; /* A */
+}
+
+__kernel void yuv420_to_xbgr_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[(y / 2) * strideU + (x / 2)] - 128;
+ short V = bufV[(y / 2) * strideV + (x / 2)] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = 0xff; /* A */
+ destPtr[1] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[3] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+}
+
+__kernel void yuv444_to_xbgr_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[y * strideU + x] - 128;
+ short V = bufV[y * strideV + x] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = 0xff; /* A */
+ destPtr[1] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[3] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+}
+
+__kernel void yuv444_to_rgbx_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[y * strideU + x] - 128;
+ short V = bufV[y * strideV + x] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+ destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[2] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[3] = 0xff; /* A */
+}
+
+
+__kernel void yuv420_to_argb_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short Udim = bufU[(y / 2) * strideU + (x / 2)] - 128;
+ short Vdim = bufV[(y / 2) * strideV + (x / 2)] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ /* A */
+ destPtr[1] = clamp_uc((y256 + (403 * Vdim)) >> 8, 0, 255); /* R */
+ destPtr[2] = clamp_uc((y256 - (48 * Udim) - (120 * Vdim)) >> 8 , 0, 255); /* G */
+ destPtr[3] = clamp_uc((y256 + (475 * Udim)) >> 8, 0, 255); /* B */
+}
+
+__kernel void yuv420_to_bgra_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[(y / 2) * strideU + (x / 2)] - 128;
+ short V = bufV[(y / 2) * strideV + (x / 2)] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+ /* A */
+}
+
+__kernel void yuv444_to_bgra_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[y * strideU + x] - 128;
+ short V = bufV[y * strideV + x] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+ /* A */
+}
+
+__kernel void yuv444_to_argb_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[y * strideU + x] - 128;
+ short V = bufV[y * strideV + x] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[3] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[1] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+ /* A */
+}
+
+__kernel void yuv420_to_xrgb_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short Udim = bufU[(y / 2) * strideU + (x / 2)] - 128;
+ short Vdim = bufV[(y / 2) * strideV + (x / 2)] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = 0xff; /* A */
+ destPtr[1] = clamp_uc((y256 + (403 * Vdim)) >> 8, 0, 255); /* R */
+ destPtr[2] = clamp_uc((y256 - (48 * Udim) - (120 * Vdim)) >> 8 , 0, 255); /* G */
+ destPtr[3] = clamp_uc((y256 + (475 * Udim)) >> 8, 0, 255); /* B */
+}
+
+__kernel void yuv420_to_bgrx_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[(y / 2) * strideU + (x / 2)] - 128;
+ short V = bufV[(y / 2) * strideV + (x / 2)] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+ destPtr[3] = 0xff; /* A */
+}
+
+__kernel void yuv444_to_bgrx_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[y * strideU + x] - 128;
+ short V = bufV[y * strideV + x] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[0] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+ destPtr[3] = 0xff; /* A */
+}
+
+__kernel void yuv444_to_xrgb_1b(
+ __global const uchar *bufY, unsigned strideY,
+ __global const uchar *bufU, unsigned strideU,
+ __global const uchar *bufV, unsigned strideV,
+ __global uchar *dest, unsigned strideDest)
+{
+ unsigned int x = get_global_id(0);
+ unsigned int y = get_global_id(1);
+
+ short Y = bufY[y * strideY + x];
+ short U = bufU[y * strideU + x] - 128;
+ short V = bufV[y * strideV + x] - 128;
+
+ __global uchar *destPtr = dest + (strideDest * y) + (x * 4);
+
+ /**
+ * | R | ( | 256 0 403 | | Y | )
+ * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
+ * | B | ( | 256 475 0 | | V - 128 | )
+ */
+ int y256 = 256 * Y;
+ destPtr[3] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */
+ destPtr[2] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */
+ destPtr[1] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
+ destPtr[0] = 0xff; /* A */
+}
+)
diff --git a/libfreerdp/primitives/test/CMakeLists.txt b/libfreerdp/primitives/test/CMakeLists.txt
new file mode 100644
index 0000000..f3b7b72
--- /dev/null
+++ b/libfreerdp/primitives/test/CMakeLists.txt
@@ -0,0 +1,45 @@
+
+set(MODULE_NAME "TestPrimitives")
+set(MODULE_PREFIX "TEST_FREERDP_PRIMITIVES")
+
+set(${MODULE_PREFIX}_DRIVER ${MODULE_NAME}.c)
+
+set(${MODULE_PREFIX}_TESTS
+ TestPrimitivesAdd.c
+ TestPrimitivesAlphaComp.c
+ TestPrimitivesAndOr.c
+ TestPrimitivesColors.c
+ TestPrimitivesCopy.c
+ TestPrimitivesSet.c
+ TestPrimitivesShift.c
+ TestPrimitivesSign.c
+ TestPrimitivesYUV.c
+ TestPrimitivesYCbCr.c
+ TestPrimitivesYCoCg.c)
+
+create_test_sourcelist(${MODULE_PREFIX}_SRCS
+ ${${MODULE_PREFIX}_DRIVER}
+ ${${MODULE_PREFIX}_TESTS})
+
+set(${MODULE_PREFIX}_EXTRA_SRCS
+ prim_test.c
+ prim_test.h
+ measure.h)
+
+add_executable(${MODULE_NAME} ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_EXTRA_SRCS})
+
+set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} winpr freerdp)
+
+target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})
+
+add_definitions(-DPRIM_STATIC=auto -DALL_PRIMITIVES_VERSIONS)
+
+set_target_properties(${MODULE_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${TESTING_OUTPUT_DIRECTORY}")
+
+foreach(test ${${MODULE_PREFIX}_TESTS})
+ get_filename_component(TestName ${test} NAME_WE)
+ add_test(${TestName} ${TESTING_OUTPUT_DIRECTORY}/${MODULE_NAME} ${TestName})
+endforeach()
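+
+# Note: create_test_sourcelist() generates a single driver executable, so an
+# individual case can also be run directly, e.g.: ./TestPrimitives TestPrimitivesAdd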
+
+set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/Test")
+
diff --git a/libfreerdp/primitives/test/TestPrimitivesAdd.c b/libfreerdp/primitives/test/TestPrimitivesAdd.c
new file mode 100644
index 0000000..9edbae9
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesAdd.c
@@ -0,0 +1,82 @@
+/* test_add.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+#define FUNC_TEST_SIZE 65536
+/* ========================================================================= */
+static BOOL test_add16s_func(void)
+{
+ pstatus_t status = 0;
+
+ INT16 ALIGN(src1[FUNC_TEST_SIZE + 3]) = { 0 };
+ INT16 ALIGN(src2[FUNC_TEST_SIZE + 3]) = { 0 };
+ INT16 ALIGN(d1[FUNC_TEST_SIZE + 3]) = { 0 };
+ INT16 ALIGN(d2[FUNC_TEST_SIZE + 3]) = { 0 };
+
+ winpr_RAND(src1, sizeof(src1));
+ winpr_RAND(src2, sizeof(src2));
+ status = generic->add_16s(src1 + 1, src2 + 1, d1 + 1, FUNC_TEST_SIZE);
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+	/* Unaligned destination */
+	status = optimized->add_16s(src1 + 1, src2 + 1, d2 + 2, FUNC_TEST_SIZE);
+	if (status != PRIMITIVES_SUCCESS)
+		return FALSE;
+
+	/* Both implementations must produce identical results. */
+	if (memcmp(d1 + 1, d2 + 2, FUNC_TEST_SIZE * sizeof(INT16)) != 0)
+		return FALSE;
+
+	return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_add16s_speed(void)
+{
+	/* add_16s operates on INT16 elements; size the buffers in elements so
+	 * the FUNC_TEST_SIZE run below stays in bounds. */
+	INT16 ALIGN(src1[FUNC_TEST_SIZE + 3]) = { 0 };
+	INT16 ALIGN(src2[FUNC_TEST_SIZE + 3]) = { 0 };
+	INT16 ALIGN(dst[FUNC_TEST_SIZE + 3]) = { 0 };
+
+ if (!g_TestPrimitivesPerformance)
+ return TRUE;
+
+ winpr_RAND(src1, sizeof(src1));
+ winpr_RAND(src2, sizeof(src2));
+
+ if (!speed_test("add16s", "aligned", g_Iterations, (speed_test_fkt)generic->add_16s,
+ (speed_test_fkt)optimized->add_16s, src1, src2, dst, FUNC_TEST_SIZE))
+ return FALSE;
+
+ return TRUE;
+}
+
+int TestPrimitivesAdd(int argc, char* argv[])
+{
+ WINPR_UNUSED(argc);
+ WINPR_UNUSED(argv);
+
+ prim_test_setup(FALSE);
+ if (!test_add16s_func())
+ return -1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_add16s_speed())
+ return -1;
+ }
+
+ return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c b/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c
new file mode 100644
index 0000000..5aecc2e
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c
@@ -0,0 +1,202 @@
+/* test_alphaComp.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+
+#include "prim_test.h"
+
+#define MAX_BLOCK_SIZE 256
+#define SIZE_SQUARED (MAX_BLOCK_SIZE * MAX_BLOCK_SIZE)
+
+/* ========================================================================= */
+#define ALF(_c_) (((_c_)&0xFF000000U) >> 24)
+#define RED(_c_) (((_c_)&0x00FF0000U) >> 16)
+#define GRN(_c_) (((_c_)&0x0000FF00U) >> 8)
+#define BLU(_c_) ((_c_)&0x000000FFU)
+#define TOLERANCE 1
+static inline const UINT32* PIXEL(const BYTE* _addr_, UINT32 _bytes_, UINT32 _x_, UINT32 _y_)
+{
+ const BYTE* addr = _addr_ + _x_ * sizeof(UINT32) + _y_ * _bytes_;
+ return (const UINT32*)addr;
+}
+
+#define SRC1_WIDTH 6
+#define SRC1_HEIGHT 6
+#define SRC2_WIDTH 7
+#define SRC2_HEIGHT 7
+#define DST_WIDTH 9
+#define DST_HEIGHT 9
+#define TEST_WIDTH 4
+#define TEST_HEIGHT 5
+
+/* ------------------------------------------------------------------------- */
+static UINT32 alpha_add(UINT32 c1, UINT32 c2)
+{
+ UINT32 a1 = ALF(c1);
+ UINT32 r1 = RED(c1);
+ UINT32 g1 = GRN(c1);
+ UINT32 b1 = BLU(c1);
+ UINT32 a2 = ALF(c2);
+ UINT32 r2 = RED(c2);
+ UINT32 g2 = GRN(c2);
+ UINT32 b2 = BLU(c2);
+ UINT32 a3 = ((a1 * a1 + (255 - a1) * a2) / 255) & 0xff;
+ UINT32 r3 = ((a1 * r1 + (255 - a1) * r2) / 255) & 0xff;
+ UINT32 g3 = ((a1 * g1 + (255 - a1) * g2) / 255) & 0xff;
+ UINT32 b3 = ((a1 * b1 + (255 - a1) * b2) / 255) & 0xff;
+ return (a3 << 24) | (r3 << 16) | (g3 << 8) | b3;
+}
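+
+/* Sanity check on the blend above: a fully opaque first operand (a1 == 255)
+ * returns c1 unchanged, while a fully transparent one (a1 == 0) returns c2. */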
+
+/* ------------------------------------------------------------------------- */
+static UINT32 colordist(UINT32 c1, UINT32 c2)
+{
+ int d = 0;
+ int maxd = 0;
+ d = ABS((INT32)(ALF(c1) - ALF(c2)));
+
+ if (d > maxd)
+ maxd = d;
+
+ d = ABS((INT32)(RED(c1) - RED(c2)));
+
+ if (d > maxd)
+ maxd = d;
+
+ d = ABS((INT32)(GRN(c1) - GRN(c2)));
+
+ if (d > maxd)
+ maxd = d;
+
+ d = ABS((INT32)(BLU(c1) - BLU(c2)));
+
+ if (d > maxd)
+ maxd = d;
+
+ return maxd;
+}
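+
+/* colordist() returns the largest per-channel difference between two pixels;
+ * the tests compare it against TOLERANCE to allow off-by-one rounding
+ * differences between the generic and optimized implementations. */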
+
+/* ------------------------------------------------------------------------- */
+static BOOL check(const BYTE* pSrc1, UINT32 src1Step, const BYTE* pSrc2, UINT32 src2Step,
+ BYTE* pDst, UINT32 dstStep, UINT32 width, UINT32 height)
+{
+ for (UINT32 y = 0; y < height; ++y)
+ {
+ for (UINT32 x = 0; x < width; ++x)
+ {
+ UINT32 s1 = *PIXEL(pSrc1, src1Step, x, y);
+ UINT32 s2 = *PIXEL(pSrc2, src2Step, x, y);
+ UINT32 c0 = alpha_add(s1, s2);
+ UINT32 c1 = *PIXEL(pDst, dstStep, x, y);
+
+ if (colordist(c0, c1) > TOLERANCE)
+ {
+ printf("alphaComp-general: [%" PRIu32 ",%" PRIu32 "] 0x%08" PRIx32 "+0x%08" PRIx32
+ "=0x%08" PRIx32 ", got 0x%08" PRIx32 "\n",
+ x, y, s1, s2, c0, c1);
+ return FALSE;
+ }
+ }
+ }
+
+ return TRUE;
+}
+
+static BOOL test_alphaComp_func(void)
+{
+ pstatus_t status = 0;
+ BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT * 4]) = { 0 };
+ BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT * 4]) = { 0 };
+ BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT * 4]) = { 0 };
+ UINT32* ptr = NULL;
+ winpr_RAND(src1, sizeof(src1));
+	/* Special-case the first two pixels: force the first transparent and the
+	 * second fully opaque. The array is BYTE, so use UINT32 access per pixel. */
+	ptr = (UINT32*)src1;
+	ptr[0] &= 0x00FFFFFFU;
+	ptr[1] |= 0xFF000000U;
+ winpr_RAND(src2, sizeof(src2));
+ /* Set the second operand to fully-opaque. */
+ ptr = (UINT32*)src2;
+
+ for (UINT32 i = 0; i < sizeof(src2) / 4; ++i)
+ *ptr++ |= 0xFF000000U;
+
+ status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1,
+ 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (!check(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH,
+ TEST_HEIGHT))
+ return FALSE;
+
+ status = optimized->alphaComp_argb((const BYTE*)src1, 4 * SRC1_WIDTH, (const BYTE*)src2,
+ 4 * SRC2_WIDTH, (BYTE*)dst1, 4 * DST_WIDTH, TEST_WIDTH,
+ TEST_HEIGHT);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (!check(src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH,
+ TEST_HEIGHT))
+ return FALSE;
+
+ return TRUE;
+}
+
+static int test_alphaComp_speed(void)
+{
+	/* Pixel buffers need 4 bytes per pixel to match the strides passed below. */
+	BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT * 4]) = { 0 };
+	BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT * 4]) = { 0 };
+	BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT * 4]) = { 0 };
+ UINT32* ptr = NULL;
+
+ winpr_RAND(src1, sizeof(src1));
+	/* Special-case the first two pixels: force the first transparent and the
+	 * second fully opaque (UINT32 access, since the array is BYTE). */
+	ptr = (UINT32*)src1;
+	ptr[0] &= 0x00FFFFFFU;
+	ptr[1] |= 0xFF000000U;
+ winpr_RAND(src2, sizeof(src2));
+ /* Set the second operand to fully-opaque. */
+ ptr = (UINT32*)src2;
+
+ for (UINT32 i = 0; i < sizeof(src2) / 4; ++i)
+ *ptr++ |= 0xFF000000U;
+
+	if (!speed_test("alphaComp_argb", "aligned", g_Iterations, (speed_test_fkt)generic->alphaComp_argb,
+ (speed_test_fkt)optimized->alphaComp_argb, src1, 4 * SRC1_WIDTH, src2,
+ 4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT))
+ return FALSE;
+
+ return TRUE;
+}
+
+int TestPrimitivesAlphaComp(int argc, char* argv[])
+{
+ WINPR_UNUSED(argc);
+ WINPR_UNUSED(argv);
+
+ prim_test_setup(FALSE);
+
+ if (!test_alphaComp_func())
+ return -1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_alphaComp_speed())
+ return -1;
+ }
+
+ return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesAndOr.c b/libfreerdp/primitives/test/TestPrimitivesAndOr.c
new file mode 100644
index 0000000..b3e52f6
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesAndOr.c
@@ -0,0 +1,169 @@
+/* test_andor.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+
+#include "prim_test.h"
+
+#define FUNC_TEST_SIZE 65536
+
+#define VALUE (0xA5A5A5A5U)
+
+/* ========================================================================= */
+static BOOL test_and_32u_impl(const char* name, __andC_32u_t fkt, const UINT32* src,
+ const UINT32 val, UINT32* dst, size_t size)
+{
+ pstatus_t status = fkt(src, val, dst, size);
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ for (size_t i = 0; i < size; ++i)
+ {
+ if (dst[i] != (src[i] & val))
+ {
+
+			printf("AND %s FAIL[%" PRIuz "] 0x%08" PRIx32 "&0x%08" PRIx32 "=0x%08" PRIx32
+			       ", got 0x%08" PRIx32 "\n",
+			       name, i, src[i], val, (src[i] & val), dst[i]);
+ }
+ }
+
+ return TRUE;
+}
+
+static BOOL test_and_32u_func(void)
+{
+ UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]) = { 0 };
+ UINT32 ALIGN(dst[FUNC_TEST_SIZE + 3]) = { 0 };
+
+ winpr_RAND(src, sizeof(src));
+
+ if (!test_and_32u_impl("generic->andC_32u aligned", generic->andC_32u, src + 1, VALUE, dst + 1,
+ FUNC_TEST_SIZE))
+ return FALSE;
+ if (!test_and_32u_impl("generic->andC_32u unaligned", generic->andC_32u, src + 1, VALUE,
+ dst + 2, FUNC_TEST_SIZE))
+ return FALSE;
+ if (!test_and_32u_impl("optimized->andC_32u aligned", optimized->andC_32u, src + 1, VALUE,
+ dst + 1, FUNC_TEST_SIZE))
+ return FALSE;
+ if (!test_and_32u_impl("optimized->andC_32u unaligned", optimized->andC_32u, src + 1, VALUE,
+ dst + 2, FUNC_TEST_SIZE))
+ return FALSE;
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_and_32u_speed(void)
+{
+ UINT32 ALIGN(src[MAX_TEST_SIZE + 3]) = { 0 };
+ UINT32 ALIGN(dst[MAX_TEST_SIZE + 3]) = { 0 };
+
+ winpr_RAND(src, sizeof(src));
+
+ if (!speed_test("andC_32u", "aligned", g_Iterations, (speed_test_fkt)generic->andC_32u,
+ (speed_test_fkt)optimized->andC_32u, src + 1, VALUE, dst + 1, MAX_TEST_SIZE))
+ return FALSE;
+ if (!speed_test("andC_32u", "unaligned", g_Iterations, (speed_test_fkt)generic->andC_32u,
+ (speed_test_fkt)optimized->andC_32u, src + 1, VALUE, dst + 2, MAX_TEST_SIZE))
+ return FALSE;
+
+ return TRUE;
+}
+
+/* ========================================================================= */
+static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value)
+{
+ for (UINT32 i = 0; i < size; ++i)
+ {
+ if (dst[i] != (src[i] | value))
+ {
+			printf("OR general FAIL[%" PRIu32 "] 0x%08" PRIx32 "|0x%08" PRIx32
+ "=0x%08" PRIx32 ", got 0x%08" PRIx32 "\n",
+ i, src[i], value, src[i] | value, dst[i]);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+static BOOL test_or_32u_func(void)
+{
+ pstatus_t status = 0;
+ UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]) = { 0 };
+ UINT32 ALIGN(dst[FUNC_TEST_SIZE + 3]) = { 0 };
+
+ winpr_RAND(src, sizeof(src));
+
+ status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (!check(src + 1, dst + 1, FUNC_TEST_SIZE, VALUE))
+ return FALSE;
+
+ status = optimized->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (!check(src + 1, dst + 1, FUNC_TEST_SIZE, VALUE))
+ return FALSE;
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_or_32u_speed(void)
+{
+ UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]) = { 0 };
+ UINT32 ALIGN(dst[FUNC_TEST_SIZE + 3]) = { 0 };
+
+ winpr_RAND(src, sizeof(src));
+
+	if (!speed_test("orC_32u", "aligned", g_Iterations, (speed_test_fkt)generic->orC_32u,
+ (speed_test_fkt)optimized->orC_32u, src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
+ return FALSE;
+
+ return TRUE;
+}
+
+int TestPrimitivesAndOr(int argc, char* argv[])
+{
+ WINPR_UNUSED(argc);
+ WINPR_UNUSED(argv);
+
+ prim_test_setup(FALSE);
+
+ if (!test_and_32u_func())
+ return -1;
+
+ if (!test_or_32u_func())
+ return -1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_and_32u_speed())
+ return -1;
+ if (!test_or_32u_speed())
+ return -1;
+ }
+
+ return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesColors.c b/libfreerdp/primitives/test/TestPrimitivesColors.c
new file mode 100644
index 0000000..c297b4f
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesColors.c
@@ -0,0 +1,298 @@
+/* test_colors.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include <freerdp/utils/profiler.h>
+
+#include "prim_test.h"
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_RGBToRGB_16s8u_P3AC4R_func(prim_size_t roi, DWORD DstFormat)
+{
+ INT16* r = NULL;
+ INT16* g = NULL;
+ INT16* b = NULL;
+ BYTE* out1 = NULL;
+ BYTE* out2 = NULL;
+ BOOL failed = FALSE;
+ const INT16* ptrs[3];
+ const UINT32 rgbStride = roi.width * 2;
+ const UINT32 dstStride = roi.width * 4;
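+	/* strides: 2 bytes per 16-bit input sample, 4 bytes per 32bpp output pixel */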
+ PROFILER_DEFINE(genericProf)
+ PROFILER_DEFINE(optProf)
+ PROFILER_CREATE(genericProf, "RGBToRGB_16s8u_P3AC4R-GENERIC")
+ PROFILER_CREATE(optProf, "RGBToRGB_16s8u_P3AC4R-OPTIMIZED")
+ r = winpr_aligned_calloc(1, rgbStride * roi.height, 16);
+ g = winpr_aligned_calloc(1, rgbStride * roi.height, 16);
+ b = winpr_aligned_calloc(1, rgbStride * roi.height, 16);
+ out1 = winpr_aligned_calloc(1, dstStride * roi.height, 16);
+ out2 = winpr_aligned_calloc(1, dstStride * roi.height, 16);
+
+ if (!r || !g || !b || !out1 || !out2)
+ goto fail;
+
+#if 0
+ {
+ for (UINT32 y = 0; y < roi.height; y++)
+ {
+ for (UINT32 x = 0; x < roi.width; x++)
+ {
+ r[y * roi.width + x] = 0x01;
+ g[y * roi.width + x] = 0x02;
+ b[y * roi.width + x] = 0x04;
+ }
+ }
+ }
+#else
+ winpr_RAND(r, rgbStride * roi.height);
+ winpr_RAND(g, rgbStride * roi.height);
+ winpr_RAND(b, rgbStride * roi.height);
+#endif
+ ptrs[0] = r;
+ ptrs[1] = g;
+ ptrs[2] = b;
+ PROFILER_ENTER(genericProf)
+
+ if (generic->RGBToRGB_16s8u_P3AC4R(ptrs, rgbStride, out1, dstStride, DstFormat, &roi) !=
+ PRIMITIVES_SUCCESS)
+ goto fail;
+
+ PROFILER_EXIT(genericProf)
+ PROFILER_ENTER(optProf)
+
+ if (optimized->RGBToRGB_16s8u_P3AC4R(ptrs, rgbStride, out2, dstStride, DstFormat, &roi) !=
+ PRIMITIVES_SUCCESS)
+ goto fail;
+
+ PROFILER_EXIT(optProf)
+
+ if (memcmp(out1, out2, dstStride * roi.height) != 0)
+ {
+ for (UINT64 i = 0; i < 1ull * roi.width * roi.height; ++i)
+ {
+ const UINT32 o1 = FreeRDPReadColor(out1 + 4 * i, DstFormat);
+ const UINT32 o2 = FreeRDPReadColor(out2 + 4 * i, DstFormat);
+
+ if (o1 != o2)
+ {
+					printf("RGBToRGB_16s8u_P3AC4R FAIL: out1[%" PRIu64 "]=0x%08" PRIx32
+					       " out2[%" PRIu64 "]=0x%08" PRIx32 "\n",
+					       i, o1, i, o2);
+ failed = TRUE;
+ }
+ }
+ }
+
+ printf("Results for %" PRIu32 "x%" PRIu32 " [%s]", roi.width, roi.height,
+ FreeRDPGetColorFormatName(DstFormat));
+ PROFILER_PRINT_HEADER
+ PROFILER_PRINT(genericProf)
+ PROFILER_PRINT(optProf)
+ PROFILER_PRINT_FOOTER
+fail:
+ PROFILER_FREE(genericProf)
+ PROFILER_FREE(optProf)
+ winpr_aligned_free(r);
+ winpr_aligned_free(g);
+ winpr_aligned_free(b);
+ winpr_aligned_free(out1);
+ winpr_aligned_free(out2);
+ return !failed;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_RGBToRGB_16s8u_P3AC4R_speed(void)
+{
+ union
+ {
+ const INT16** cpv;
+ INT16** pv;
+ } cnv;
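+	/* the union sidesteps the const-qualification warning when INT16** is passed where const INT16** is expected */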
+ const prim_size_t roi64x64 = { 64, 64 };
+ INT16 ALIGN(r[4096 + 1]);
+ INT16 ALIGN(g[4096 + 1]);
+ INT16 ALIGN(b[4096 + 1]);
+ UINT32 ALIGN(dst[4096 + 1]);
+ INT16* ptrs[3];
+ winpr_RAND(r, sizeof(r));
+ winpr_RAND(g, sizeof(g));
+ winpr_RAND(b, sizeof(b));
+
+ /* clear upper bytes */
+ for (int i = 0; i < 4096; ++i)
+ {
+ r[i] &= 0x00FFU;
+ g[i] &= 0x00FFU;
+ b[i] &= 0x00FFU;
+ }
+
+ ptrs[0] = r + 1;
+ ptrs[1] = g + 1;
+ ptrs[2] = b + 1;
+
+ cnv.pv = ptrs;
+ if (!speed_test("RGBToRGB_16s8u_P3AC4R", "aligned", g_Iterations,
+ generic->RGBToRGB_16s8u_P3AC4R, optimized->RGBToRGB_16s8u_P3AC4R, cnv.cpv,
+ 64 * 2, (BYTE*)dst, 64 * 4, &roi64x64))
+ return FALSE;
+
+ if (!speed_test("RGBToRGB_16s8u_P3AC4R", "unaligned", g_Iterations,
+ generic->RGBToRGB_16s8u_P3AC4R, optimized->RGBToRGB_16s8u_P3AC4R, cnv.cpv,
+ 64 * 2, ((BYTE*)dst) + 1, 64 * 4, &roi64x64))
+ return FALSE;
+
+ return TRUE;
+}
+
+/* ========================================================================= */
+static BOOL test_yCbCrToRGB_16s16s_P3P3_func(void)
+{
+ pstatus_t status = 0;
+ INT16 ALIGN(y[4096]) = { 0 };
+ INT16 ALIGN(cb[4096]) = { 0 };
+ INT16 ALIGN(cr[4096]) = { 0 };
+ INT16 ALIGN(r1[4096]) = { 0 };
+ INT16 ALIGN(g1[4096]) = { 0 };
+ INT16 ALIGN(b1[4096]) = { 0 };
+ INT16 ALIGN(r2[4096]) = { 0 };
+ INT16 ALIGN(g2[4096]) = { 0 };
+ INT16 ALIGN(b2[4096]) = { 0 };
+ const INT16* in[3];
+ INT16* out1[3];
+ INT16* out2[3];
+ prim_size_t roi = { 64, 64 };
+ winpr_RAND(y, sizeof(y));
+ winpr_RAND(cb, sizeof(cb));
+ winpr_RAND(cr, sizeof(cr));
+
+ /* Normalize to 11.5 fixed radix */
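+	/* (the 0x1FE0 mask keeps 8 significant bits immediately above the 5-bit fraction) */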
+ for (int i = 0; i < 4096; ++i)
+ {
+ y[i] &= 0x1FE0U;
+ cb[i] &= 0x1FE0U;
+ cr[i] &= 0x1FE0U;
+ }
+
+ in[0] = y;
+ in[1] = cb;
+ in[2] = cr;
+ out1[0] = r1;
+ out1[1] = g1;
+ out1[2] = b1;
+ out2[0] = r2;
+ out2[1] = g2;
+ out2[2] = b2;
+ status = generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
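+	/* Allow a difference of +/-1: the optimized path may round slightly differently than the generic C code. */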
+ for (int i = 0; i < 4096; ++i)
+ {
+ if ((ABS(r1[i] - r2[i]) > 1) || (ABS(g1[i] - g2[i]) > 1) || (ABS(b1[i] - b2[i]) > 1))
+ {
+			printf("yCbCrToRGB FAIL[%d]: %" PRId16 ",%" PRId16 ",%" PRId16 " vs %" PRId16
+			       ",%" PRId16 ",%" PRId16 "\n",
+			       i, r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
+{
+ prim_size_t roi = { 64, 64 };
+ INT16 ALIGN(y[4096]);
+ INT16 ALIGN(cb[4096]);
+ INT16 ALIGN(cr[4096]);
+ INT16 ALIGN(r[4096]);
+ INT16 ALIGN(g[4096]);
+ INT16 ALIGN(b[4096]);
+ const INT16* input[3];
+ INT16* output[3];
+ winpr_RAND(y, sizeof(y));
+ winpr_RAND(cb, sizeof(cb));
+ winpr_RAND(cr, sizeof(cr));
+
+ /* Normalize to 11.5 fixed radix */
+ for (int i = 0; i < 4096; ++i)
+ {
+ y[i] &= 0x1FE0U;
+ cb[i] &= 0x1FE0U;
+ cr[i] &= 0x1FE0U;
+ }
+
+ input[0] = y;
+ input[1] = cb;
+ input[2] = cr;
+ output[0] = r;
+ output[1] = g;
+ output[2] = b;
+
+ if (!speed_test("yCbCrToRGB_16s16s_P3P3", "aligned", g_Iterations,
+ (speed_test_fkt)generic->yCbCrToRGB_16s16s_P3P3,
+ (speed_test_fkt)optimized->yCbCrToRGB_16s16s_P3P3, input, 64 * 2, output,
+ 64 * 2, &roi))
+ return FALSE;
+
+ return TRUE;
+}
+
+int TestPrimitivesColors(int argc, char* argv[])
+{
+ const DWORD formats[] = { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_ABGR32,
+ PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32,
+ PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
+ prim_size_t roi = { 1920 / 4, 1080 / 4 };
+ WINPR_UNUSED(argc);
+ WINPR_UNUSED(argv);
+ prim_test_setup(FALSE);
+
+ for (UINT32 x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
+ {
+ if (!test_RGBToRGB_16s8u_P3AC4R_func(roi, formats[x]))
+ return 1;
+
+#if 0
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_RGBToRGB_16s8u_P3AC4R_speed())
+ return 1;
+ }
+
+ if (!test_yCbCrToRGB_16s16s_P3P3_func())
+ return 1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_yCbCrToRGB_16s16s_P3P3_speed())
+ return 1;
+ }
+
+#endif
+ }
+
+ return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesCopy.c b/libfreerdp/primitives/test/TestPrimitivesCopy.c
new file mode 100644
index 0000000..8c681f2
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesCopy.c
@@ -0,0 +1,90 @@
+/* test_copy.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+#define COPY_TESTSIZE (256 * 2 + 16 * 2 + 15 + 15)
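+/* presumably sized to cover all 16x16 source/destination offset combinations with slack for unaligned tails */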
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_copy8u_func(void)
+{
+ primitives_t* prims = primitives_get();
+ BYTE ALIGN(data[COPY_TESTSIZE + 15]) = { 0 };
+ winpr_RAND(data, sizeof(data));
+
+ for (int soff = 0; soff < 16; ++soff)
+ {
+ for (int doff = 0; doff < 16; ++doff)
+ {
+ for (int length = 1; length <= COPY_TESTSIZE - doff; ++length)
+ {
+ BYTE ALIGN(dest[COPY_TESTSIZE + 15]) = { 0 };
+
+ if (prims->copy_8u(data + soff, dest + doff, length) != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ for (int i = 0; i < length; ++i)
+ {
+ if (dest[i + doff] != data[i + soff])
+ {
+						printf("COPY8U FAIL: soff=%d doff=%d len=%d, dest[%d]=0x%02" PRIx8
+						       " data[%d]=0x%02" PRIx8 "\n",
+						       soff, doff, length, i + doff, dest[i + doff], i + soff,
+						       data[i + soff]);
+ return FALSE;
+ }
+ }
+ }
+ }
+ }
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_copy8u_speed(void)
+{
+	BYTE ALIGN(src[MAX_TEST_SIZE + 4]) = { 0 };
+	BYTE ALIGN(dst[MAX_TEST_SIZE + 4]) = { 0 };
+
+	winpr_RAND(src, sizeof(src));
+
+ if (!speed_test("copy_8u", "aligned", g_Iterations, (speed_test_fkt)generic->copy_8u,
+ (speed_test_fkt)optimized->copy_8u, src, dst, MAX_TEST_SIZE))
+ return FALSE;
+
+ if (!speed_test("copy_8u", "unaligned", g_Iterations, (speed_test_fkt)generic->copy_8u,
+ (speed_test_fkt)optimized->copy_8u, src + 1, dst + 1, MAX_TEST_SIZE))
+ return FALSE;
+
+ return TRUE;
+}
+
+int TestPrimitivesCopy(int argc, char* argv[])
+{
+ WINPR_UNUSED(argc);
+ WINPR_UNUSED(argv);
+ prim_test_setup(FALSE);
+
+ if (!test_copy8u_func())
+ return 1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_copy8u_speed())
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesSet.c b/libfreerdp/primitives/test/TestPrimitivesSet.c
new file mode 100644
index 0000000..c6cefcc
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesSet.c
@@ -0,0 +1,274 @@
+/* test_set.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+/* ------------------------------------------------------------------------- */
+static BOOL check8(const BYTE* src, UINT32 length, UINT32 offset, BYTE value)
+{
+ for (UINT32 i = 0; i < length; ++i)
+ {
+ if (src[offset + i] != value)
+ {
+ printf("SET8U FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%02" PRIx8
+ "\n",
+ offset, length, i + offset, src[i + offset]);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+static BOOL test_set8u_func(void)
+{
+ pstatus_t status = 0;
+
+ for (UINT32 off = 0; off < 16; ++off)
+ {
+ BYTE dest[1024];
+
+ memset(dest, 3, sizeof(dest));
+ for (UINT32 len = 1; len < 48 - off; ++len)
+ {
+ status = generic->set_8u(0xa5, dest + off, len);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (!check8(dest, len, off, 0xa5))
+ return FALSE;
+ }
+ }
+
+ for (UINT32 off = 0; off < 16; ++off)
+ {
+ BYTE dest[1024];
+
+ memset(dest, 3, sizeof(dest));
+ for (UINT32 len = 1; len < 48 - off; ++len)
+ {
+ status = optimized->set_8u(0xa5, dest + off, len);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (!check8(dest, len, off, 0xa5))
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_set8u_speed(void)
+{
+ BYTE dest[1024];
+ BYTE value = 0;
+
+ for (UINT32 x = 0; x < 16; x++)
+ {
+ winpr_RAND(&value, sizeof(value));
+
+ if (!speed_test("set_8u", "", g_Iterations, (speed_test_fkt)generic->set_8u,
+ (speed_test_fkt)optimized->set_8u, value, dest + x, x))
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+static BOOL check32s(const INT32* src, UINT32 length, UINT32 offset, INT32 value)
+{
+ for (UINT32 i = 0; i < length; ++i)
+ {
+ if (src[offset + i] != value)
+ {
+			printf("SET32S FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%08" PRIx32
+ "\n",
+ offset, length, i + offset, src[i + offset]);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_set32s_func(void)
+{
+ pstatus_t status = 0;
+ const INT32 value = -0x12345678;
+
+ for (UINT32 off = 0; off < 16; ++off)
+ {
+ INT32 dest[1024] = { 0 };
+
+ for (UINT32 len = 1; len < 48 - off; ++len)
+ {
+ status = generic->set_32s(value, dest + off, len);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (!check32s(dest, len, off, value))
+ return FALSE;
+ }
+ }
+
+ for (UINT32 off = 0; off < 16; ++off)
+ {
+ INT32 dest[1024] = { 0 };
+
+ for (UINT32 len = 1; len < 48 - off; ++len)
+ {
+ status = optimized->set_32s(value, dest + off, len);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (!check32s(dest, len, off, value))
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+static BOOL check32u(const UINT32* src, UINT32 length, UINT32 offset, UINT32 value)
+{
+ for (UINT32 i = 0; i < length; ++i)
+ {
+ if (src[offset + i] != value)
+ {
+			printf("SET32U FAILED: off=%" PRIu32 " len=%" PRIu32 " dest[%" PRIu32 "]=0x%08" PRIx32
+ "\n",
+ offset, length, i + offset, src[i + offset]);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_set32u_func(void)
+{
+ pstatus_t status = 0;
+ const UINT32 value = 0xABCDEF12;
+
+ for (UINT32 off = 0; off < 16; ++off)
+ {
+ UINT32 dest[1024] = { 0 };
+
+ for (UINT32 len = 1; len < 48 - off; ++len)
+ {
+ status = generic->set_32u(value, dest + off, len);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (!check32u(dest, len, off, value))
+ return FALSE;
+ }
+ }
+
+ for (UINT32 off = 0; off < 16; ++off)
+ {
+ UINT32 dest[1024] = { 0 };
+
+ for (UINT32 len = 1; len < 48 - off; ++len)
+ {
+ status = optimized->set_32u(value, dest + off, len);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (!check32u(dest, len, off, value))
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_set32u_speed(void)
+{
+ UINT32 dest[1024];
+	UINT32 value = 0;
+
+ for (UINT32 x = 0; x < 16; x++)
+ {
+ winpr_RAND(&value, sizeof(value));
+
+ if (!speed_test("set_32u", "", g_Iterations, (speed_test_fkt)generic->set_32u,
+ (speed_test_fkt)optimized->set_32u, value, dest + x, x))
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_set32s_speed(void)
+{
+ INT32 dest[1024];
+	INT32 value = 0;
+
+ for (UINT32 x = 0; x < 16; x++)
+ {
+ winpr_RAND(&value, sizeof(value));
+
+ if (!speed_test("set_32s", "", g_Iterations, (speed_test_fkt)generic->set_32s,
+ (speed_test_fkt)optimized->set_32s, value, dest + x, x))
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+int TestPrimitivesSet(int argc, char* argv[])
+{
+ WINPR_UNUSED(argc);
+ WINPR_UNUSED(argv);
+ prim_test_setup(FALSE);
+
+ if (!test_set8u_func())
+ return -1;
+
+ if (!test_set32s_func())
+ return -1;
+
+ if (!test_set32u_func())
+ return -1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_set8u_speed())
+ return -1;
+
+ if (!test_set32s_speed())
+ return -1;
+
+ if (!test_set32u_speed())
+ return -1;
+ }
+
+ return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesShift.c b/libfreerdp/primitives/test/TestPrimitivesShift.c
new file mode 100644
index 0000000..8845838
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesShift.c
@@ -0,0 +1,470 @@
+/* test_shift.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+#define FUNC_TEST_SIZE 65536
+
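+/* These tests validate status codes only: in-range shift amounts must succeed, out-of-range ones must fail. */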
+static BOOL test_lShift_16s_func(void)
+{
+ pstatus_t status = 0;
+ INT16 ALIGN(src[FUNC_TEST_SIZE + 3]);
+ INT16 ALIGN(d1[FUNC_TEST_SIZE + 3]);
+ UINT32 val = 0;
+ winpr_RAND(&val, sizeof(val));
+ winpr_RAND(src, sizeof(src));
+ val = val % 16;
+ /* Negative tests */
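+	/* a shift count of 16 is out of range for 16-bit data and must be rejected */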
+ status = generic->lShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->lShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Aligned */
+ status = generic->lShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->lShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Unaligned */
+ status = generic->lShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->lShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ return TRUE;
+}
+
+static BOOL test_lShift_16u_func(void)
+{
+ pstatus_t status = 0;
+ UINT16 ALIGN(src[FUNC_TEST_SIZE + 3]);
+ UINT16 ALIGN(d1[FUNC_TEST_SIZE + 3]);
+ UINT32 val = 0;
+ winpr_RAND(&val, sizeof(val));
+ winpr_RAND(src, sizeof(src));
+ val = val % 16;
+
+ /* Negative tests */
+ status = generic->lShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->lShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Aligned */
+ status = generic->lShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->lShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Unaligned */
+ status = generic->lShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->lShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ return TRUE;
+}
+
+static BOOL test_rShift_16s_func(void)
+{
+ pstatus_t status = 0;
+ INT16 ALIGN(src[FUNC_TEST_SIZE + 3]);
+ INT16 ALIGN(d1[FUNC_TEST_SIZE + 3]);
+ UINT32 val = 0;
+ winpr_RAND(&val, sizeof(val));
+ winpr_RAND(src, sizeof(src));
+ val = val % 16;
+
+ /* Negative Tests */
+ status = generic->rShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->rShiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Aligned */
+ status = generic->rShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->rShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Unaligned */
+ status = generic->rShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->rShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ return TRUE;
+}
+
+static BOOL test_rShift_16u_func(void)
+{
+ pstatus_t status = 0;
+ UINT16 ALIGN(src[FUNC_TEST_SIZE + 3]);
+ UINT16 ALIGN(d1[FUNC_TEST_SIZE + 3]);
+ UINT32 val = 0;
+ winpr_RAND(&val, sizeof(val));
+ winpr_RAND(src, sizeof(src));
+ val = val % 16;
+ /* Negative tests */
+ status = generic->rShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->rShiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Aligned */
+ status = generic->rShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->rShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Unaligned */
+ status = generic->rShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->rShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ return TRUE;
+}
+
+static BOOL test_ShiftWrapper_16s_func(void)
+{
+ pstatus_t status = 0;
+ INT16 ALIGN(src[FUNC_TEST_SIZE + 3]);
+ INT16 ALIGN(d1[FUNC_TEST_SIZE + 3]);
+ UINT32 tmp = 0;
+ INT32 val = 0;
+ winpr_RAND(&tmp, sizeof(tmp));
+ winpr_RAND(src, sizeof(src));
+ val = tmp % 16;
+
+ /* Negative tests */
+ status = generic->shiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->shiftC_16s(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Aligned */
+ status = generic->shiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->shiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = generic->shiftC_16s(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->shiftC_16s(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Unaligned */
+ status = generic->shiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->shiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = generic->shiftC_16s(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->shiftC_16s(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ return TRUE;
+}
+
+static BOOL test_ShiftWrapper_16u_func(void)
+{
+ pstatus_t status = 0;
+ UINT16 ALIGN(src[FUNC_TEST_SIZE + 3]);
+ UINT16 ALIGN(d1[FUNC_TEST_SIZE + 3]);
+ UINT32 tmp = 0;
+ INT32 val = 0;
+ winpr_RAND(&tmp, sizeof(tmp));
+ winpr_RAND(src, sizeof(src));
+ val = tmp % 16;
+
+ /* Negative */
+ status = generic->shiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->shiftC_16u(src + 1, 16, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status == PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Aligned */
+ status = generic->shiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->shiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = generic->shiftC_16u(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->shiftC_16u(src + 1, -val, d1 + 1, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* Unaligned */
+ status = generic->shiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->shiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = generic->shiftC_16u(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->shiftC_16u(src + 1, -val, d1 + 2, FUNC_TEST_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_lShift_16s_speed(void)
+{
+ UINT32 val = 0;
+ INT16 ALIGN(src[MAX_TEST_SIZE + 1]);
+ INT16 ALIGN(dst[MAX_TEST_SIZE + 1]);
+ winpr_RAND(src, sizeof(src));
+ winpr_RAND(&val, sizeof(val));
+
+ val = val % 16;
+ if (!speed_test("lShift_16s", "aligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16s,
+ (speed_test_fkt)optimized->lShiftC_16s, src, val, dst, MAX_TEST_SIZE))
+ return FALSE;
+
+ if (!speed_test("lShift_16s", "unaligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16s,
+ (speed_test_fkt)optimized->lShiftC_16s, src + 1, val, dst, MAX_TEST_SIZE))
+ return FALSE;
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_lShift_16u_speed(void)
+{
+ UINT32 val = 0;
+ UINT16 ALIGN(src[MAX_TEST_SIZE + 1]);
+ UINT16 ALIGN(dst[MAX_TEST_SIZE + 1]);
+ winpr_RAND(&val, sizeof(val));
+ winpr_RAND(src, sizeof(src));
+
+ val = val % 16;
+ if (!speed_test("lShift_16u", "aligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16u,
+ (speed_test_fkt)optimized->lShiftC_16u, src, val, dst, MAX_TEST_SIZE))
+ return FALSE;
+
+ if (!speed_test("lShift_16u", "unaligned", g_Iterations, (speed_test_fkt)generic->lShiftC_16u,
+ (speed_test_fkt)optimized->lShiftC_16u, src + 1, val, dst, MAX_TEST_SIZE))
+ return FALSE;
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_rShift_16s_speed(void)
+{
+ UINT32 val = 0;
+ INT16 ALIGN(src[MAX_TEST_SIZE + 1]);
+ INT16 ALIGN(dst[MAX_TEST_SIZE + 1]);
+ winpr_RAND(src, sizeof(src));
+ winpr_RAND(&val, sizeof(val));
+
+ val = val % 16;
+ if (!speed_test("rShift_16s", "aligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16s,
+ (speed_test_fkt)optimized->rShiftC_16s, src, val, dst, MAX_TEST_SIZE))
+ return FALSE;
+
+ if (!speed_test("rShift_16s", "unaligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16s,
+ (speed_test_fkt)optimized->rShiftC_16s, src + 1, val, dst, MAX_TEST_SIZE))
+ return FALSE;
+
+ return TRUE;
+}
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_rShift_16u_speed(void)
+{
+ UINT32 val = 0;
+ UINT16 ALIGN(src[MAX_TEST_SIZE + 1]);
+ UINT16 ALIGN(dst[MAX_TEST_SIZE + 1]);
+ winpr_RAND(&val, sizeof(val));
+ winpr_RAND(src, sizeof(src));
+
+ val = val % 16;
+ if (!speed_test("rShift_16u", "aligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16u,
+ (speed_test_fkt)optimized->rShiftC_16u, src, val, dst, MAX_TEST_SIZE))
+ return FALSE;
+
+ if (!speed_test("rShift_16u", "unaligned", g_Iterations, (speed_test_fkt)generic->rShiftC_16u,
+ (speed_test_fkt)optimized->rShiftC_16u, src + 1, val, dst, MAX_TEST_SIZE))
+ return FALSE;
+
+ return TRUE;
+}
+
+int TestPrimitivesShift(int argc, char* argv[])
+{
+ WINPR_UNUSED(argc);
+ WINPR_UNUSED(argv);
+ prim_test_setup(FALSE);
+
+ if (!test_lShift_16s_func())
+ return 1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_lShift_16s_speed())
+ return 1;
+ }
+
+ if (!test_lShift_16u_func())
+ return 1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_lShift_16u_speed())
+ return 1;
+ }
+
+ if (!test_rShift_16s_func())
+ return 1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_rShift_16s_speed())
+ return 1;
+ }
+
+ if (!test_rShift_16u_func())
+ return 1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_rShift_16u_speed())
+ return 1;
+ }
+
+ if (!test_ShiftWrapper_16s_func())
+ return 1;
+
+ if (!test_ShiftWrapper_16u_func())
+ return 1;
+
+ return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesSign.c b/libfreerdp/primitives/test/TestPrimitivesSign.c
new file mode 100644
index 0000000..fb9549a
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesSign.c
@@ -0,0 +1,93 @@
+/* test_sign.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+#define TEST_BUFFER_SIZE 65535
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_sign16s_func(void)
+{
+ pstatus_t status = 0;
+ INT16 ALIGN(src[TEST_BUFFER_SIZE + 16]) = { 0 };
+ INT16 ALIGN(d1[TEST_BUFFER_SIZE + 16]) = { 0 };
+ INT16 ALIGN(d2[TEST_BUFFER_SIZE + 16]) = { 0 };
+ winpr_RAND(src, sizeof(src));
+ status = generic->sign_16s(src + 1, d1 + 1, TEST_BUFFER_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->sign_16s(src + 1, d2 + 1, TEST_BUFFER_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (memcmp(d1, d2, sizeof(d1)) != 0)
+ return FALSE;
+
+ status = generic->sign_16s(src + 1, d1 + 2, TEST_BUFFER_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ status = optimized->sign_16s(src + 1, d2 + 2, TEST_BUFFER_SIZE);
+
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ if (memcmp(d1, d2, sizeof(d1)) != 0)
+ return FALSE;
+
+ return TRUE;
+}
+
+static int test_sign16s_speed(void)
+{
+ INT16 ALIGN(src[MAX_TEST_SIZE + 3]) = { 0 };
+ INT16 ALIGN(dst[MAX_TEST_SIZE + 3]) = { 0 };
+ winpr_RAND(src, sizeof(src));
+
+ if (!speed_test("sign16s", "aligned", g_Iterations, (speed_test_fkt)generic->sign_16s,
+ (speed_test_fkt)optimized->sign_16s, src + 1, dst + 1, MAX_TEST_SIZE))
+ return FALSE;
+
+ if (!speed_test("sign16s", "unaligned", g_Iterations, (speed_test_fkt)generic->sign_16s,
+ (speed_test_fkt)optimized->sign_16s, src + 1, dst + 2, MAX_TEST_SIZE))
+ return FALSE;
+
+ return TRUE;
+}
+
+int TestPrimitivesSign(int argc, char* argv[])
+{
+ WINPR_UNUSED(argc);
+ WINPR_UNUSED(argv);
+
+ prim_test_setup(FALSE);
+
+ if (!test_sign16s_func())
+ return 1;
+
+ if (g_TestPrimitivesPerformance)
+ {
+ if (!test_sign16s_speed())
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesYCbCr.c b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c
new file mode 100644
index 0000000..64e7f91
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesYCbCr.c
@@ -0,0 +1,1835 @@
+#include <freerdp/config.h>
+
+#include <winpr/print.h>
+#include <winpr/wlog.h>
+
+#include <freerdp/codec/color.h>
+#include <freerdp/utils/profiler.h>
+
+#include "prim_test.h"
+
+#define TAG __FILE__
+
+static const INT16 TEST_Y_COMPONENT[4096] = {
+ -32, +16, +64, +272, -32, -16, +0, -16, -32, -24, -16, -8, +0,
+ -24, -48, -72, -96, -90, -84, -78, -72, -98, -124, -150, -176, -192,
+ -208, -224, -240, -256, -272, -288, -304, -304, -304, -304, -304, -336, -368,
+ -400, -432, -450, -468, -486, -504, -522, -540, -558, -576, -598, -620, -642,
+ -664, -686, -708, -730, -752, -768, -784, -800, -816, -816, -816, -816, +68,
+ +120, +172, +240, +53, +55, +57, +43, +30, +32, +34, +36, +38, +20,
+ +2, -16, -34, -36, -38, -40, -42, -68, -94, -120, -146, -148, -151,
+ -186, -220, -227, -233, -240, -247, -254, -261, -268, -275, -302, -329, -356,
+ -384, -403, -423, -443, -463, -484, -506, -528, -550, -572, -594, -616, -639,
+ -673, -707, -709, -712, -733, -754, -775, -796, -796, -796, -796, +168, +224,
+ +281, +209, +138, +126, +115, +103, +92, +88, +84, +80, +76, +64, +52,
+ +40, +28, +18, +8, -2, -12, -38, -64, -90, -116, -105, -95, -148,
+ -201, -198, -195, -192, -190, -204, -218, -232, -247, -269, -291, -313, -336,
+ -357, -379, -400, -422, -447, -473, -498, -524, -546, -569, -591, -614, -660,
+ -707, -689, -672, -698, -724, -750, -776, -776, -776, -776, +268, +312, +357,
+ +273, +191, +181, +172, +162, +154, +144, +134, +124, +114, +108, +102, +80,
+ +58, +56, +54, +52, +50, +24, -2, -44, -86, -61, -38, -93, -149,
+ -137, -124, -144, -165, -170, -175, -196, -218, -235, -252, -269, -288, -310,
+ -334, -357, -381, -409, -439, -468, -498, -520, -543, -565, -589, -647, -706,
+ -668, -632, -663, -694, -725, -756, -756, -756, -756, +368, +401, +434, +339,
+ +244, +237, +230, +223, +216, +200, +184, +168, +152, +152, +152, +120, +88,
+ +94, +100, +106, +112, +86, +60, +2, -56, -18, +19, -39, -98, -76,
+ -55, -97, -140, -136, -133, -161, -190, -202, -215, -227, -240, -265, -290,
+ -315, -340, -373, -406, -439, -472, -495, -518, -541, -564, -635, -706, -649,
+ -592, -628, -664, -700, -736, -736, -736, -736, +404, +556, +454, +383, +313,
+ +531, +239, +282, +326, +304, +282, +260, +238, +246, +254, +118, +238, +196,
+ +154, +32, -90, -88, -86, +76, +238, +243, +247, +29, -191, -232, -272,
+ -121, +29, -62, -153, -149, -145, -162, -180, -197, -216, -240, -265, -289,
+ -315, -345, -376, -406, -438, -446, -456, -497, -539, -595, -653, -502, -608,
+ -625, -642, -675, -708, -708, -708, -708, +440, +713, +475, +428, +382, +827,
+ +249, +342, +436, +408, +380, +352, +324, +340, +356, -140, -124, +42, +208,
+ +214, +220, +250, +280, +406, +532, +504, +476, +352, +229, +125, +22, -146,
+ -314, -244, -175, -138, -101, -123, -146, -169, -192, -216, -241, -265, -290,
+ -318, -347, -375, -404, -399, -395, -454, -514, -557, -601, -356, -624, -622,
+ -620, -650, -680, -680, -680, -680, +604, +677, +495, +457, +419, +770, +354,
+ +386, +418, +416, +414, +380, +346, +258, -342, -302, -6, +288, +582, +604,
+ +626, +588, +550, +688, +826, +829, +833, +724, +616, +481, +348, +181, +15,
+ -139, -292, -175, -56, -83, -112, -139, -168, -192, -216, -240, -265, -291,
+ -317, -343, -370, -351, -333, -411, -489, -486, -484, -402, -576, -587, -598,
+ -625, -652, -652, -652, -652, +1280, +1154, +1028, +998, +968, +970, +460, +430,
+ +400, +424, +448, +408, +368, +432, -528, -208, +112, +534, +956, +994, +1032,
+ +926, +820, +970, +1120, +1155, +1190, +1097, +1004, +839, +674, +509, +344, +223,
+ +102, +45, -12, -45, -78, -111, -144, -168, -192, -216, -240, -264, -288,
+ -312, -336, -304, -272, -368, -464, -416, -368, -448, -528, -552, -576, -600,
+ -624, -624, -624, -624, +770, +671, +573, +554, +536, +629, +467, +464, +462,
+ +492, +523, +490, +457, +281, -405, -101, +204, +599, +995, +1310, +1370, +1297,
+ +1225, +1296, +1368, +1432, +1498, +1402, +1308, +1184, +1062, +874, +688, +586, +485,
+ +303, +123, -82, -32, -76, -122, -174, -226, -199, -171, -193, -216, -238,
+ -261, -314, -368, -325, -283, -360, -438, -451, -465, -515, -565, -583, -601,
+ -617, -633, -633, +772, +701, +630, +623, +616, +545, +474, +499, +524, +561,
+ +599, +572, +546, +131, -283, +6, +296, +665, +1034, +1627, +1708, +1669, +1630,
+ +1623, +1616, +1711, +1806, +1709, +1612, +1531, +1450, +1241, +1032, +950, +869, +563,
+ +258, -120, +15, -42, -100, -180, -261, -182, -103, -123, -144, -165, -186,
+ -325, -464, -283, -102, -305, -508, -455, -402, -478, -554, -566, -578, -610,
+ -642, -642, +774, +730, +687, +675, +664, +620, +577, +581, +586, +597, +610,
+ +590, +571, -147, -96, +209, +516, +794, +1073, +1575, +1822, +1976, +1875, +1869,
+ +1864, +1988, +2114, +2014, +1916, +1876, +1838, +1606, +1376, +1266, +1156, +902, +137,
+ -61, -3, -120, -238, -122, -7, -69, -130, -164, -200, -219, -239, -271,
+ -304, -128, -209, -297, -386, -426, -467, -937, -895, -549, -459, -667, -619,
+ -619, +776, +760, +744, +728, +712, +696, +680, +664, +648, +635, +622, +609,
+ +596, -425, +90, +413, +736, +924, +1112, +1524, +1936, +2284, +2120, +2116, +2112,
+ +2267, +2422, +2321, +2220, +2223, +2226, +1973, +1720, +1582, +1444, +1242, +16, -2,
+ -20, +58, +136, -65, -267, -212, -158, -207, -257, -274, -292, -218, -144,
+ +26, -316, -290, -264, -142, -20, +2956, +2860, -788, -852, -980, -596, -596,
+ +826, +807, +789, +770, +752, +749, +747, +744, +742, +677, +613, +516, +421,
+ -285, +288, +573, +860, +1081, +1303, +1668, +2034, +2313, +2337, +2344, +2352, +2452,
+ +2554, +2574, +2596, +2506, +2418, +2248, +2080, +1961, +1843, +925, +7, +40, +74,
+ +748, +654, +453, +251, +48, -154, -107, -61, -111, -161, -28, +104, +45,
+ -271, -274, -278, -842, +1411, +3007, +3323, +327, -1389, -1197, -493, -493, +876,
+ +855, +834, +813, +792, +803, +814, +825, +836, +720, +605, +681, +758, +110,
+ +487, +735, +984, +1239, +1494, +1813, +2132, +2343, +2554, +2573, +2592, +2639, +2686,
+ +2829, +2972, +2791, +2610, +2525, +2440, +2341, +2243, +608, -2, +83, +169, +1438,
+ +1172, +970, +768, +565, +363, +249, +135, +52, -30, -95, -160, -193, -226,
+ -259, -292, +763, -742, +2290, +1738, -1118, -902, -902, -390, -390, +926, +902,
+ +879, +855, +832, +824, +817, +809, +802, +763, +724, +397, +2375, +970, +589,
+ +848, +1108, +1396, +1685, +1941, +2198, +2468, +2739, +2785, +2832, +2888, +2946, +3178,
+ +2900, +3058, +2962, +2848, +2736, +2896, +2546, -364, +309, +205, +871, +1760, +1626,
+ +1471, +1317, +1145, +975, +844, +714, +599, +485, +351, +216, +146, +75, -355,
+ +750, +2687, +529, -1067, -615, -835, -799, -847, -383, -383, +976, +950, +924,
+ +898, +872, +846, +820, +794, +768, +806, +844, +882, +1432, +2598, +692, +962,
+ +1232, +1554, +1876, +2070, +2264, +2594, +2924, +2998, +3072, +3139, +3206, +3273, +2316,
+ +3071, +3314, +3173, +3032, +2941, +1826, -57, +108, +73, +1574, +2083, +2080, +1973,
+ +1866, +1727, +1588, +1441, +1294, +1147, +1000, +796, +592, +484, +376, +828, +256,
+ +772, -248, -72, -408, +984, -184, -536, -376, -376, +1026, +997, +969, +941,
+ +913, +888, +864, +840, +816, +762, +709, +768, +1339, +2269, +2176, +1411, +1414,
+ +1677, +1941, +2188, +2436, +2730, +3023, +3157, +3291, +3349, +3409, +3420, +2152, +3000,
+ +3594, +3403, +3213, +3233, +951, +12, +97, -303, +2883, +2755, +2373, +2312, +2252,
+ +2143, +2036, +1861, +1687, +1544, +1403, +1254, +1106, +974, +842, +1229, +1105, +21,
+ +217, +46, -381, +1912, +3181, +2765, +301, -723, +1076, +1045, +1015, +984, +954,
+ +931, +909, +886, +864, +719, +575, +654, +1246, +1685, +3149, +1604, +1596, +1801,
+ +2006, +2307, +2609, +2866, +3123, +3316, +3510, +3561, +3613, +3568, +1988, +2931, +3875,
+ +3634, +3394, +3527, +76, +81, +86, +859, +3168, +2917, +2666, +2652, +2639, +2561,
+ +2484, +2282, +2081, +1943, +1806, +1713, +1621, +1464, +1308, +1119, +931, +550, +170,
+ -92, -354, +1560, +3986, +1970, -558, -558, +1126, +1092, +1060, +1027, +995, +973,
+ +953, +932, +912, +899, +888, -340, +1249, +1756, +2521, +2421, +1810, +2036, +2263,
+ +2521, +2781, +3066, +3350, +3443, +3537, +3612, +3688, +3476, +2496, +3021, +3803, +3833,
+ +3863, +2843, +33, +133, -21, +2099, +3197, +3061, +2927, +2944, +2961, +2882, +2804,
+ +2607, +2410, +2309, +2209, +2139, +2071, +1842, +1614, +1328, +1044, +663, +283, +10,
+ -263, -488, -201, -201, -457, -457, +1176, +1141, +1106, +1071, +1036, +1017, +998,
+ +979, +960, +825, +690, +203, +740, +1573, +1894, +3239, +2024, +2272, +2521, +2737,
+ +2954, +3010, +3067, +3315, +3564, +3664, +3764, +3384, +3004, +3112, +3732, +3776, +3820,
+ +1905, -10, +187, -128, +3341, +3226, +3207, +3188, +3236, +3284, +3204, +3124, +2932,
+ +2740, +2676, +2612, +2567, +2522, +2221, +1920, +1539, +1158, +777, +396, +112, -172,
+ -488, -292, -324, -356, -356, +1194, +1162, +1131, +1099, +1069, +1047, +1026, +972,
+ +920, +969, +507, +380, +767, +1428, +1834, +2799, +2486, +2347, +2721, +2919, +3118,
+ +3290, +3462, +3266, +3071, +3157, +3243, +3521, +3800, +3674, +3548, +3710, +3873, +874,
+ +179, +91, +517, +3439, +3291, +3333, +3377, +3403, +3430, +3361, +3292, +3174, +3057,
+ +3004, +2951, +2761, +2572, +2222, +1874, +1554, +1235, +883, +533, +220, -93, -470,
+ -335, -319, -303, -303, +1212, +1184, +1157, +1129, +1102, +1078, +1055, +967, +880,
+ +1114, +325, +559, +794, +1284, +1775, +2361, +2948, +2423, +2923, +3103, +3283, +3314,
+ +3346, +3474, +3602, +3674, +3747, +3659, +3572, +3980, +3877, +3901, +3926, -157, +368,
+ +253, +1674, +3795, +3356, +3461, +3566, +3571, +3577, +3518, +3460, +3417, +3375, +3332,
+ +3290, +2956, +2623, +2225, +1828, +1570, +1313, +991, +670, +328, -14, -452, -378,
+ -314, -250, -250, +1230, +1206, +1182, +1158, +1135, +1109, +1083, +1025, +968, +779,
+ +78, +481, +885, +1284, +1939, +2466, +3250, +2626, +2772, +3157, +3543, +3514, +3486,
+ +3729, +3717, +3775, +3834, +3780, +3728, +3934, +3885, +3915, +2667, +92, +333, +173,
+ +2831, +3701, +3549, +3587, +3627, +3642, +3659, +3643, +3628, +3675, +3724, +3436, +3149,
+ +2847, +2545, +2275, +2006, +1730, +1454, +1114, +775, +388, +1, -402, -293, -309,
+ -325, -325, +1248, +1228, +1208, +1188, +1168, +1140, +1112, +1084, +1056, +700, +344,
+ +660, +976, +1284, +2104, +2316, +3040, +2319, +2110, +2189, +2268, +2691, +3114, +3729,
+ +3832, +3877, +3922, +3903, +3884, +3889, +3894, +3931, +1408, +341, +298, +95, +3988,
+ +3609, +3742, +3715, +3688, +3715, +3742, +3769, +3796, +3679, +3562, +3285, +3008, +2738,
+ +2468, +2326, +2184, +1890, +1596, +1238, +880, +448, +16, -352, -208, -304, -400,
+ -400, +1296, +1284, +1272, +1260, +1249, +1165, +1081, +1093, +1106, +232, +382, +677,
+ +971, +973, +1232, +834, +693, +537, +639, +564, +490, +563, +637, -106, +944,
+ +2358, +3773, +3795, +4074, +3964, +3855, +4337, +212, +204, +197, +1341, +4023, +3813,
+ +3860, +3810, +3762, +3766, +3771, +3776, +3781, +3603, +3427, +3201, +2977, +2838, +2699,
+ +2400, +2101, +1982, +1607, +1280, +954, +545, -120, -321, -266, -314, -362, -362,
+ +1344, +1340, +1337, +1333, +1330, +1190, +1051, +1103, +1156, +20, +933, +950, +967,
+ +919, +872, +889, +906, +805, +705, +733, +761, +740, +720, +668, +616, +328,
+ +40, +1640, +3752, +3784, +3816, +3208, +40, +581, +97, +2589, +4058, +4018, +3979,
+ +3907, +3836, +3818, +3801, +3784, +3767, +3529, +3292, +3375, +3458, +3706, +3954, +3754,
+ +3555, +2843, +1619, +1067, +516, +386, -256, -290, -324, -324, -324, -324, +1392,
+ +1364, +1337, +1309, +1283, +1247, +1212, +968, +982, +1424, +1099, +1079, +1058, +1072,
+ +1088, +815, +799, +1056, +802, +772, +743, +645, +547, +769, +736, +649, +563,
+ +332, +102, +1939, +4033, +1982, +444, +332, -36, +4076, +4093, +4047, +4001, +3955,
+ +3910, +3870, +3830, +3791, +3752, +3806, +3861, +3835, +3811, +3678, +3545, +3380, +3216,
+ +3639, +3806, +2341, +1134, +1091, +24, -387, -286, -286, -286, -286, +1440, +1389,
+ +1338, +1287, +1236, +1305, +1374, +1091, +1320, +1037, +1267, +1208, +1150, +715, +281,
+ +486, +1204, +1564, +901, +1325, +1750, +1830, +1911, +1383, +344, +459, +574, +817,
+ +548, +351, +666, +757, +336, +340, +856, +4028, +4128, +4076, +4024, +4004, +3984,
+ +3922, +3861, +3799, +3738, +3828, +3919, +3785, +3652, +3394, +3137, +3007, +2878, +2900,
+ +2923, +3105, +3800, +1284, +1328, +28, -248, -248, -248, -248, +1456, +1406, +1358,
+ +1309, +1261, +1209, +1159, +1444, +1218, +1265, +33, -654, -1342, -977, -356, +394,
+ +1401, +1753, +1338, +1738, +2140, +2575, +3009, +3524, +3784, +2536, +1033, +265, +522,
+ +440, +615, +629, +388, +403, +2211, +4051, +4099, +4078, +4058, +3990, +3922, +3910,
+ +3898, +3886, +3875, +3805, +3735, +3553, +3373, +3126, +2879, +2585, +2291, +2026, +1762,
+ +2649, +3026, +2303, +2092, +665, -250, -250, -250, -250, +1472, +1425, +1379, +1332,
+ +1286, +1371, +1457, +1030, -932, -1834, -1712, -1237, -763, -621, +33, +815, +1598,
+ +1943, +1776, +2153, +2531, +2808, +3085, +3362, +3640, +4102, +4052, +3042, +496, +530,
+ +564, +502, +440, +211, +3055, +3818, +4070, +4081, +4093, +3976, +3860, +3898, +3936,
+ +3974, +4013, +3783, +3553, +3323, +3094, +2858, +2623, +2420, +2217, +1921, +1626, +915,
+ +2764, +250, +296, +22, -252, -252, -252, -252, +1488, +1443, +1399, +1371, +1343,
+ +1308, +1530, -408, -1834, -1589, -1089, -811, -535, -281, +485, +1171, +1859, +2132,
+ +2150, +2503, +2857, +3105, +3352, +3536, +3720, +3875, +3775, +4298, +4054, +2123, +449,
+ +502, +556, +546, +26, +2113, +3945, +4115, +4031, +3946, +3862, +3838, +3814, +3982,
+ +3894, +3488, +3338, +3140, +2943, +2622, +2302, +2030, +1758, +1495, +1234, +1259, +774,
+ -347, -188, -189, -190, -222, -254, -254, +1504, +1462, +1420, +1410, +1400, +1246,
+ +1604, -1334, -1712, -1089, -978, -643, -308, +59, +938, +1529, +2120, +2322, +2524,
+ +2854, +3184, +3402, +3620, +3710, +3800, +3905, +4010, +4019, +4028, +3973, +334, +503,
+ +672, +627, +582, +409, +236, +2359, +3970, +3917, +3864, +3778, +3692, +3990, +3776,
+ +3194, +3124, +2958, +2792, +2387, +1982, +1641, +1300, +1071, +842, +69, -192, -176,
+ -160, -144, -128, -192, -256, -256, +1546, +1496, +1447, +1430, +1413, +1627, +1330,
+ -2102, -1184, -819, -712, -395, -80, +405, +1148, +1713, +2280, +2486, +2692, +2995,
+ +3297, +3467, +3638, +3712, +3787, +3915, +4045, +3917, +4047, +3097, +357, +655, +699,
+ +198, +466, +381, +297, +376, +200, +1815, +3431, +3568, +3961, +4114, +3755, +3310,
+ +3121, +2804, +2487, +2208, +1931, +1189, +447, +37, -116, -254, -136, -111, -86,
+ -109, -132, -196, -260, -260, +1588, +1531, +1475, +1450, +1426, +1497, +33, -1591,
+ -1168, -807, -446, -149, +148, +753, +1358, +1899, +2440, +2650, +2861, +3136, +3411,
+ +3533, +3656, +3715, +3774, +3927, +4080, +3817, +4066, +2223, +380, +553, +214, +3610,
+ +350, +354, +358, +442, +526, +226, -74, +286, +1158, +1678, +1686, +1634, +1582,
+ +1114, +646, +239, -168, -31, +107, -228, -51, -65, -80, -46, -12, -74,
+ -136, -200, -264, -264, +1630, +1565, +1502, +1470, +1439, +1590, -817, -1399, -960,
+ -633, -308, -14, +280, +875, +1472, +1971, +2472, +2718, +2965, +3229, +3492, +3582,
+ +3674, +3701, +3729, +3793, +3859, +4147, +4181, +707, +563, +417, +1297, +3917, +4234,
+ +2198, +163, +267, +372, +348, +325, +108, +147, +186, -31, +38, +107, +96,
+ +85, +61, +38, -162, -106, -126, +111, +876, -152, -93, -34, -87, -140,
+ -204, -268, -268, +1672, +1601, +1530, +1491, +1452, +1685, -1666, -1209, -752, -461,
+ -170, +121, +412, +999, +1586, +2045, +2504, +2787, +3071, +3322, +3574, +3633, +3693,
+ +3688, +3684, +3661, +3638, +3711, +2760, +473, +746, +283, +2380, +4225, +4022, +4043,
+ +4064, +2141, +218, +215, +212, +186, +160, +230, +300, +234, +168, +102, +36,
+ -117, -269, +218, +1218, +2025, +2833, +1048, -224, -140, -56, -100, -144, -208,
+ -272, -272, +1626, +1607, +1589, +1458, +1585, +692, -1479, -1107, -736, -451, -168,
+ +115, +400, +805, +1468, +1937, +2408, +2703, +2999, +3327, +3655, +3568, +3482, +3620,
+ +3759, +3439, +3121, +1601, +851, +819, +533, +437, +3415, +4252, +4066, +4055, +4045,
+ +4084, +4124, +2995, +1867, +1068, +269, +62, -145, -38, +69, +704, +1339, +2183,
+ +3028, +2816, +2861, +2953, +2790, -349, +96, -19, -134, -137, -140, -204, -268,
+ -268, +1580, +1614, +1649, +1427, +1718, -300, -1293, -1006, -720, -443, -166, +111,
+ +388, +613, +1350, +1831, +2312, +2620, +2928, +3076, +3225, +3249, +3273, +3297, +3322,
+ +3475, +3628, +3333, +1502, +655, +832, +593, +3938, +4024, +4110, +4068, +4026, +3980,
+ +3934, +3984, +4034, +3998, +3962, +3990, +4018, +3786, +3554, +3610, +3666, +3459, +3253,
+ +3111, +2969, +2858, +2236, -210, -96, -154, -212, -174, -136, -200, -264, -264,
+ +1662, +1653, +1644, +1619, +1851, -988, -1266, -985, -704, -401, -100, +9, +120,
+ +403, +944, +1579, +2216, +2504, +2793, +2873, +2954, +2976, +2999, +3085, +3173, +3237,
+ +3303, +3575, +521, +553, +587, +1771, +3981, +4019, +4058, +4032, +4007, +3971, +3936,
+ +3948, +3961, +3920, +3879, +3806, +3989, +3866, +3743, +3636, +3529, +3375, +3222, +3069,
+ +2916, +2907, +1362, -119, -64, -113, -162, -147, -132, -196, -260, -260, +1744,
+ +1692, +1640, +1556, +1472, -1932, -1240, -964, -688, -361, -34, +165, +364, +707,
+ +1050, +1585, +2120, +2389, +2658, +2671, +2684, +2705, +2726, +2875, +3024, +3001, +2978,
+ +2283, +564, +965, +342, +2951, +4024, +4015, +4006, +3997, +3988, +3963, +3938, +3913,
+ +3888, +3842, +3796, +3622, +3960, +3946, +3932, +3662, +3392, +3292, +3192, +3028, +2864,
+ +2956, +488, -28, -32, -72, -112, -120, -128, -192, -256, -256, +1834, +1635,
+ +1692, +1718, +208, -1663, -1229, -924, -619, -283, +50, +256, +719, +705, +948,
+ +1126, +1562, +1845, +2129, +2236, +2344, +2447, +2551, +2654, +2759, +2738, +2719, +1562,
+ +663, +623, +327, +4207, +3992, +4012, +4034, +3990, +3948, +3922, +3898, +3872, +3848,
+ +3774, +3701, +3484, +3523, +3726, +3929, +3812, +3695, +3604, +3513, +3407, +3300, +3350,
+ -440, -231, -22, -48, -74, -100, -126, -174, -222, -222, +1924, +1578, +1745,
+ +1880, -1057, -1394, -1219, -884, -550, -207, +135, +93, +563, +449, +847, +669,
+ +1004, +1302, +1600, +1802, +2005, +2191, +2377, +2435, +2494, +2477, +2460, +843, +763,
+ +794, +1337, +3928, +3960, +4011, +4062, +3985, +3908, +3883, +3858, +3833, +3808, +3707,
+ +3607, +3603, +3599, +3506, +3414, +3706, +3998, +3916, +3835, +3786, +3737, +2208, -345,
+ +78, -12, -24, -36, -80, -124, -156, -188, -188, +1598, +1585, +1829, +2154,
+ -1873, -1413, -1208, -556, -417, -514, -102, +440, +214, +191, +681, +435, +702,
+ +870, +1039, +1224, +1409, +1709, +2010, +2039, +2069, +2086, +1849, +795, +766, +596,
+ +2474, +3953, +3896, +3928, +3962, +3914, +3868, +3842, +3818, +3792, +3768, +3687, +3608,
+ +3577, +3546, +3462, +3379, +3312, +3245, +3364, +3484, +3189, +2893, +858, -154, +35,
+ -34, -48, -62, -108, -154, -154, -154, -154, +1784, +1849, +1915, +892, -1666,
+ -1176, -1711, -741, -796, -822, +175, -748, +378, +191, +517, +202, +400, +439,
+ +479, +646, +814, +1229, +1645, +1644, +1644, +1697, +1239, +748, +770, +399, +3613,
+ +3978, +3832, +3847, +3862, +3845, +3828, +3803, +3778, +3753, +3728, +3669, +3611, +3552,
+ +3494, +3419, +3345, +3174, +3004, +2813, +2623, +2592, +2562, -237, +37, -9, -56,
+ -72, -88, -136, -184, -152, -120, -120, +1802, +1900, +2255, -286, -1290, -1129,
+ -712, -391, -327, -385, -445, +201, -178, +436, +27, -45, -118, +204, +270,
+ +384, +498, +685, +874, +998, +1123, +1252, +1127, +794, +717, +1161, +3654, +3843,
+ +3776, +3788, +3802, +3782, +3764, +3616, +3726, +3690, +3656, +3595, +3536, +3476, +3417,
+ +3341, +3265, +3078, +2891, +2687, +2484, +2617, +1982, -28, +8, +14, +18, -18,
+ -54, +6, +66, -30, -126, -126, +1820, +1696, +2084, -2232, -1939, -570, -1762,
+ -1834, -1394, -461, -552, -387, -223, -1110, -462, -37, -124, -31, -451, -134,
+ +183, +143, +104, +353, +602, +809, +1017, +841, +665, +1924, +3696, +3708, +3720,
+ +3731, +3742, +3721, +3700, +3431, +3674, +3629, +3584, +3523, +3462, +3401, +3341, +3264,
+ +3187, +2982, +2778, +2562, +2346, +2386, +891, -77, -20, +36, +92, +36, -20,
+ -108, -196, -164, -132, -132, +1710, +1955, +1177, -2833, -955, -2075, -2172, -364,
+ -1885, -1352, -820, -1599, -843, -1249, -887, -652, -674, -554, -435, -636, -325,
+ -304, -282, -101, -175, +493, +906, +871, +580, +2767, +3674, +3653, +3632, +3656,
+ +3682, +3626, +3572, +3436, +3558, +3534, +3512, +3449, +3388, +3325, +3264, +3186, +3108,
+ +2902, +2697, +2500, +2304, +2219, +343, +179, +271, +154, +38, -6, -50, -110,
+ -170, -154, -138, -138, +1600, +1959, -242, -2667, -2020, -2557, -2582, -1455, +696,
+ +316, +960, +2052, +2120, +1940, +1760, +1292, +824, -310, -932, -1394, -832, -750,
+ -668, -298, -440, +434, +796, +902, +496, +3610, +3652, +3598, +3544, +3583, +3622,
+ +3533, +3444, +3443, +3442, +3441, +3440, +3377, +3314, +3251, +3188, +3109, +3030, +2823,
+ +2616, +2439, +2262, +2053, -204, +179, +50, +17, -16, -48, -80, -112, -144,
+ -144, -144, -144, +1956, +1852, -2091, -3025, -1145, +322, +2045, +1672, +1555, +1328,
+ +1614, +1916, +1706, +1622, +1282, +1502, +1466, +1301, +1393, +940, -792, -1548, -768,
+ -820, -617, +926, +934, +909, +1397, +3323, +3456, +3446, +3436, +3393, +3351, +3388,
+ +3426, +3373, +3321, +3444, +3313, +3264, +3217, +3153, +3090, +2997, +2906, +2686, +2467,
+ +2290, +2115, +1282, -61, +136, +79, +36, -5, -37, -69, -101, -133, -133,
+ -133, -133, +1800, +1746, +669, +1992, +1779, +1665, +1552, +1727, +1390, +1317, +1245,
+ +1269, +1293, +1560, +1316, +1456, +1084, +1121, +1158, +971, +1297, +726, -869, -1343,
+ -794, +1419, +1072, +917, +2299, +3036, +3261, +3294, +3328, +3204, +3080, +3244, +3409,
+ +3305, +3201, +3449, +3186, +3153, +3121, +3056, +2992, +2887, +2783, +2550, +2318, +2143,
+ +1968, +513, +82, +95, +108, +57, +6, -26, -58, -90, -122, -122, -122,
+ -122, +1516, +1832, +1636, +1905, +1406, +1344, +1283, +1589, +1641, +1465, +1291, +1277,
+ +1263, +1386, +1254, +1314, +1118, +1116, +1115, +905, +953, +1160, +1111, +118, -363,
+ +807, +698, +700, +2240, +3325, +2361, +2934, +3252, +2998, +2745, +2924, +3103, +3155,
+ +2952, +3277, +3091, +3057, +3024, +2959, +2894, +2776, +2659, +2414, +2169, +2074, +1981,
+ +255, +65, +68, +73, +44, +17, -15, -47, -79, -111, -111, -111, -111,
+ +1744, +1662, +1581, +1563, +1546, +1536, +1527, +1453, +1380, +1359, +1339, +1286, +1234,
+ +1213, +1193, +1172, +1152, +1112, +1073, +1097, +1122, +826, +1043, +1067, +1092, +964,
+ +837, +741, +2182, +2078, +2487, +2831, +2664, +2793, +2923, +2860, +2798, +3007, +2705,
+ +3106, +2996, +2962, +2928, +2862, +2796, +2666, +2536, +2278, +2020, +1751, +1482, -259,
+ +48, +43, +38, +33, +28, -4, -36, -68, -100, -100, -100, -100, +1684,
+ +1640, +1596, +1584, +1573, +1543, +1513, +1451, +1391, +1359, +1329, +1282, +1236, +1213,
+ +1190, +1168, +1146, +1107, +1069, +1063, +1058, +920, +1038, +996, +955, +924, +894,
+ +880, +1635, +1679, +2235, +2439, +2132, +2451, +2771, +2580, +2644, +2713, +2528, +2742,
+ +2701, +2828, +2699, +2570, +2442, +2383, +2324, +2105, +1887, +1732, +811, -79, +55,
+ +62, +71, +46, +23, -7, -37, -67, -97, -113, -129, -129, +1624, +1618,
+ +1612, +1606, +1601, +1551, +1501, +1451, +1402, +1361, +1320, +1279, +1239, +1214, +1189,
+ +1164, +1140, +1103, +1067, +1031, +995, +1014, +1034, +926, +818, +885, +953, +1021,
+ +1089, +1024, +1472, +2048, +2112, +2110, +2109, +2044, +2491, +2421, +2352, +2379, +2406,
+ +2694, +2471, +2279, +2088, +2100, +2113, +1933, +1754, +1715, +140, +101, +62, +83,
+ +104, +61, +18, -10, -38, -66, -94, -126, -158, -158, +1724, +1788, +1852,
+ +1692, +1532, +1494, +1456, +1418, +1381, +1345, +1311, +1275, +1241, +1214, +1187, +1160,
+ +1134, +1098, +1064, +1029, +995, +996, +998, +935, +873, +877, +883, +792, +702,
+ +657, +1125, +1832, +2284, +1193, +1638, +1796, +2209, +2320, +2176, +2239, +2047, +2560,
+ +2562, +1891, +1734, +1673, +1613, +1744, +1621, +1152, -83, -8, +69, +70, +73,
+ +42, +13, -13, -39, -65, -91, -139, -187, -187, +1824, +1702, +1580, +1522,
+ +1464, +1438, +1412, +1386, +1360, +1331, +1302, +1273, +1244, +1215, +1186, +1157, +1128,
+ +1095, +1062, +1029, +996, +979, +962, +945, +928, +871, +814, +821, +828, +803,
+ +1290, +1617, +1944, +2068, +1168, +1292, +1416, +1708, +1488, +1844, +1688, +2171, +2142,
+ +1249, +1380, +1503, +1626, +1045, -48, +79, +206, +141, +76, +59, +42, +25,
+ +8, -16, -40, -64, -88, -152, -216, -216, +1688, +1615, +1542, +1501, +1460,
+ +1429, +1398, +1367, +1336, +1309, +1284, +1257, +1232, +1205, +1180, +1153, +1128, +1092,
+ +1058, +1022, +988, +968, +950, +930, +912, +861, +812, +793, +776, +595, +672,
+ +971, +1272, +330, +924, +1038, +1152, +1298, +1444, +1910, +1608, +1531, +1200, +515,
+ +344, +259, +176, +251, +72, +122, +174, +128, +84, +64, +46, +26, +8,
+ -18, -44, -70, -96, -144, -192, -192, +1552, +1528, +1504, +1480, +1456, +1420,
+ +1384, +1348, +1312, +1289, +1266, +1243, +1220, +1197, +1174, +1151, +1128, +1091, +1054,
+ +1017, +980, +959, +938, +917, +896, +853, +810, +767, +724, +645, +566, +583,
+ +600, +640, +680, +528, +376, +376, +888, +1464, +1016, +637, +258, +295, +332,
+ +297, +262, +227, +192, +167, +142, +117, +92, +71, +50, +29, +8, -20,
+ -48, -76, -104, -136, -168, -168, +1544, +1521, +1498, +1475, +1452, +1411, +1370,
+ +1329, +1288, +1267, +1248, +1227, +1208, +1187, +1168, +1147, +1128, +1088, +1050, +1010,
+ +972, +948, +926, +902, +880, +843, +808, +771, +736, +677, +620, +609, +600,
+ +614, +628, +546, +464, +238, +2060, +1690, +1576, +1709, +308, +313, +320, +285,
+ +252, +217, +184, +162, +142, +120, +100, +76, +54, +30, +8, -22, -52,
+ -82, -112, -128, -144, -144, +1536, +1514, +1492, +1470, +1448, +1402, +1356, +1310,
+ +1264, +1247, +1230, +1213, +1196, +1179, +1162, +1145, +1128, +1087, +1046, +1005, +964,
+ +939, +914, +889, +864, +835, +806, +777, +748, +711, +674, +637, +600, +588,
+ +576, +564, +552, +612, +160, +1916, +1112, +223, +358, +333, +308, +275, +242,
+ +209, +176, +159, +142, +125, +108, +83, +58, +33, +8, -24, -56, -88,
+ -120, -120, -120, -120, +1536, +1514, +1492, +1470, +1448, +1402, +1356, +1310, +1264,
+ +1246, +1230, +1212, +1196, +1178, +1162, +1144, +1128, +1086, +1046, +1004, +964, +938,
+ +914, +888, +864, +834, +806, +776, +748, +710, +674, +636, +600, +588, +576,
+ +564, +552, +644, +480, +108, +504, +158, +326, +316, +308, +274, +242, +208,
+ +176, +158, +142, +124, +108, +82, +58, +32, +8, -24, -56, -88, -120,
+ -120, -120, -120, +1536, +1514, +1492, +1470, +1448, +1402, +1356, +1310, +1264, +1247,
+ +1230, +1213, +1196, +1179, +1162, +1145, +1128, +1087, +1046, +1005, +964, +939, +914,
+ +889, +864, +835, +806, +777, +748, +711, +674, +637, +600, +588, +576, +564,
+ +552, +420, +288, +348, +408, +351, +294, +301, +308, +275, +242, +209, +176,
+ +159, +142, +125, +108, +83, +58, +33, +8, -24, -56, -88, -120, -120,
+ -120, -120, +1536, +1514, +1492, +1470, +1448, +1402, +1356, +1310, +1264, +1246, +1230,
+ +1212, +1196, +1178, +1162, +1144, +1128, +1086, +1046, +1004, +964, +938, +914, +888,
+ +864, +834, +806, +776, +748, +710, +674, +636, +600, +588, +576, +564, +552,
+ +420, +288, +348, +408, +350, +294, +300, +308, +274, +242, +208, +176, +158,
+ +142, +124, +108, +82, +58, +32, +8, -24, -56, -88, -120, -120, -120,
+ -120
+};
+
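+/**
+ * 64x64 Cb (blue-difference chroma) plane of the test image, stored as
+ * INT16 samples with a x32 fixed-point scale (divide by 32 for the
+ * nominal 8-bit component value)
+ */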
+static const INT16 TEST_CB_COMPONENT[4096] = {
+ +1728, +1730, +1732, +1734, +1736, +1738, +1740, +1742, +1744, +1740, +1736, +1732, +1728,
+ +1796, +1864, +1804, +1744, +1754, +1764, +1774, +1784, +1794, +1804, +1814, +1824, +1774,
+ +1724, +1802, +1880, +1814, +1748, +1810, +1872, +1878, +1884, +1890, +1896, +1910, +1924,
+ +1938, +1952, +1938, +1924, +1910, +1896, +1914, +1932, +1950, +1968, +1974, +1980, +1986,
+ +1992, +1998, +2004, +2010, +2016, +2016, +2016, +2016, +2016, +2016, +2016, +2016, +1710,
+ +1697, +1684, +1704, +1723, +1726, +1730, +1733, +1737, +1738, +1740, +1741, +1743, +1758,
+ +1774, +1757, +1741, +1762, +1783, +1788, +1793, +1774, +1755, +1784, +1813, +1817, +1821,
+ +1825, +1829, +1857, +1885, +1881, +1877, +1849, +1821, +1857, +1894, +1904, +1914, +1924,
+ +1935, +1928, +1922, +1915, +1909, +1922, +1936, +1949, +1963, +1974, +1985, +1997, +2008,
+ +2009, +2011, +2012, +2014, +2017, +2020, +2023, +2026, +2026, +2026, +2026, +1692, +1664,
+ +1637, +1674, +1711, +1715, +1720, +1725, +1730, +1737, +1744, +1751, +1758, +1721, +1684,
+ +1711, +1738, +1770, +1802, +1802, +1802, +1754, +1706, +1754, +1802, +1860, +1918, +1848,
+ +1778, +1900, +2022, +1952, +1882, +1820, +1759, +1825, +1892, +1898, +1905, +1911, +1918,
+ +1919, +1920, +1921, +1922, +1931, +1940, +1949, +1958, +1974, +1991, +2008, +2025, +2021,
+ +2018, +2015, +2012, +2018, +2024, +2030, +2036, +2036, +2036, +2036, +1674, +1631, +1589,
+ +1644, +1698, +1703, +1710, +1716, +1723, +1735, +1748, +1760, +1773, +1763, +1754, +1760,
+ +1767, +1794, +1821, +1800, +1779, +1830, +1881, +1900, +1919, +2047, +2175, +2015, +1855,
+ +1879, +1903, +1927, +1951, +1759, +1824, +1856, +1890, +1892, +1895, +1897, +1901, +1909,
+ +1918, +1926, +1935, +1939, +1944, +1948, +1953, +1974, +1996, +2019, +2041, +2032, +2025,
+ +2017, +2010, +2019, +2028, +2037, +2046, +2046, +2046, +2046, +1656, +1599, +1543, +1614,
+ +1686, +1693, +1701, +1708, +1716, +1734, +1752, +1770, +1788, +1806, +1824, +1810, +1796,
+ +1818, +1840, +2054, +2268, +1650, +1032, +510, -12, -70, -128, +390, +908, +1602,
+ +2296, +2158, +2020, +1699, +1890, +1889, +1888, +1887, +1886, +1885, +1884, +1900, +1916,
+ +1932, +1948, +1948, +1948, +1948, +1948, +1975, +2003, +2030, +2058, +2045, +2033, +2020,
+ +2008, +2020, +2032, +2044, +2056, +2056, +2056, +2056, +1590, +1570, +1551, +1612, +1673,
+ +1579, +1742, +1713, +1685, +1672, +1660, +1711, +1763, +1694, +1626, +1941, +2001, +2060,
+ +583, -654, -1891, -2046, -2201, -2084, -1967, -2049, -2131, -2053, -1975, -1751, -1527,
+ +41, +1609, +2374, +1859, +2000, +1886, +1898, +1912, +1909, +1907, +1900, +1894, +1919,
+ +1945, +1944, +1944, +1943, +1943, +1967, +1992, +2017, +2042, +2032, +2023, +2014, +2006,
+ +2017, +2028, +2039, +2050, +2050, +2050, +2050, +1524, +1542, +1560, +1610, +1661, +1467,
+ +1785, +1719, +1654, +1611, +1568, +1653, +1738, +1839, +1940, +793, -866, -2050, -2210,
+ -2082, -1954, -1902, -1850, -1862, -1874, -1980, -2086, -1936, -1786, -1776, -1766, -1820,
+ -1874, -534, +1829, +2112, +1884, +1911, +1939, +1934, +1930, +1901, +1872, +1907, +1942,
+ +1941, +1940, +1939, +1938, +1960, +1982, +2004, +2027, +2021, +2015, +2009, +2004, +2014,
+ +2024, +2034, +2044, +2044, +2044, +2044, +1586, +1641, +1697, +1704, +1712, +1577, +1699,
+ +1660, +1623, +1613, +1604, +1642, +1681, +1791, -402, -2036, -1877, -2144, -1899, -1942,
+ -1985, -1918, -1851, -1880, -1909, -1959, -2009, -1931, -1853, -1801, -1749, -1617, -1485,
+ -1939, -1882, +96, +2074, +1971, +1869, +1895, +1921, +1885, +1850, +1894, +1939, +1937,
+ +1936, +1934, +1933, +1952, +1972, +1991, +2011, +2008, +2006, +2003, +2002, +2011, +2020,
+ +2029, +2038, +2038, +2038, +2038, +1136, +1229, +1322, +1287, +1252, +1433, +1614, +1603,
+ +1592, +1616, +1640, +1632, +1624, +2256, -1720, -1792, -1864, -1982, -2100, -2058, -2016,
+ -1934, -1852, -1898, -1944, -1938, -1932, -1926, -1920, -1826, -1732, -1670, -1608, -1552,
+ -1496, -1664, -1320, +2288, +1800, +1856, +1912, +1870, +1828, +1882, +1936, +1934, +1932,
+ +1930, +1928, +1945, +1962, +1979, +1996, +1997, +1998, +1999, +2000, +2008, +2016, +2024,
+ +2032, +2032, +2032, +2032, +1552, +1624, +1698, +1674, +1652, +1644, +1638, +1614, +1592,
+ +1611, +1630, +1681, +1733, +1146, -2000, -1787, -1830, -1924, -2019, -2049, -2080, -1986,
+ -1893, -1895, -1898, -1896, -1894, -1860, -1827, -1779, -1731, -1667, -1604, -1615, -1626,
+ -1878, -594, +2063, +1903, +2016, +1873, +2132, +1880, +1884, +1888, +1921, +1955, +1941,
+ +1927, +1925, +1925, +1955, +1987, +2005, +2025, +2043, +2063, +1995, +1927, +2099, +2015,
+ +2095, +2175, +2175, +1456, +1509, +1562, +1551, +1540, +1601, +1662, +1627, +1592, +1606,
+ +1621, +1731, +1842, +37, -2281, -1782, -1796, -1867, -1938, -2041, -2144, -2039, -1934,
+ -1893, -1852, -1854, -1857, -1795, -1734, -1732, -1731, -1665, -1600, -1678, -1757, -1836,
+ +645, +2094, +2007, +1920, +1322, +2139, +1933, +1886, +1840, +1909, +1979, +1952, +1926,
+ +1907, +1888, +1933, +1978, +2015, +2052, +2089, +2126, +1982, +1838, +2174, +1998, +2158,
+ +2318, +2318, +1488, +1520, +1554, +1554, +1556, +1588, +1622, +1606, +1592, +1569, +1547,
+ +1700, +1855, -993, -2049, -1825, -1858, -1905, -1953, -2016, -2080, -1995, -1911, -1858,
+ -1806, -1812, -1819, -1729, -1641, -1685, -1730, -1678, -1628, -1677, -1727, -2194, +1947,
+ +2125, +2046, +945, -2205, +114, +2177, +2144, +1856, +1912, +1970, +1963, +1957, +1935,
+ +1915, +1925, +1937, +1991, +2047, +2181, +2061, +2337, +2613, +1817, +2301, +2157, +2269,
+ +2397, +1520, +1533, +1546, +1559, +1572, +1577, +1582, +1587, +1592, +1533, +1474, +1671,
+ +1868, -2023, -1818, -1869, -1920, -1944, -1968, -1992, -2016, -1952, -1888, -1824, -1760,
+ -1771, -1782, -1665, -1548, -1639, -1730, -1693, -1656, -1677, -1699, -1017, +2226, +1644,
+ +2087, -286, -2148, -2167, -1674, +611, +2384, +2173, +1962, +1975, +1988, +1965, +1942,
+ +1919, +1896, +1969, +2042, +2019, +1484, -1916, -1220, +2484, +1068, -916, +1708, +1964,
+ +1504, +1514, +1526, +1536, +1548, +1550, +1554, +1556, +1560, +1581, +1604, +1786, +689,
+ -2138, -1894, -1905, -1918, -1926, -1935, -1943, -1952, -1878, -1805, -1731, -1658, -1626,
+ -1596, -1549, -1503, -1507, -1513, -1518, -1524, -1526, -1785, +148, +2080, +1995, +2422,
+ -2094, -2003, -2033, -1809, -1665, -1776, -189, +1398, +2536, +2139, +2122, +2105, +2327,
+ +2295, +2204, +2113, +2870, -213, -1669, -1077, -1237, -1653, -1589, +2059, +1931, +1488,
+ +1497, +1506, +1515, +1524, +1525, +1526, +1527, +1528, +1631, +1735, +1902, -490, -2254,
+ -1971, -1943, -1916, -1909, -1902, -1895, -1888, -1805, -1722, -1639, -1556, -1483, -1411,
+ -1434, -1458, -1377, -1297, -1344, -1392, -1376, -1872, +1312, +1935, +1834, +1734, -2622,
+ -2370, -2157, -1945, -1892, -1840, -2039, -2239, -2022, -782, -281, +220, +433, +134,
+ -377, -888, -1655, -1398, -1166, -934, -1374, -1302, -726, +2410, +1898, +1472, +1478,
+ +1486, +1492, +1500, +1498, +1498, +1496, +1496, +1600, +1705, +1666, -933, -1474, -2015,
+ -1964, -1914, -1891, -1869, -1846, -1824, -1731, -1639, -1546, -1454, -1387, -1321, -1191,
+ -1317, -1150, -1240, -1250, -1260, -1545, -1575, +2459, +1885, +2057, +182, -2429, -2225,
+ -2088, -1952, -1928, -1904, -1905, -1907, -2149, -1879, -1835, -1793, -1670, -1803, -1645,
+ -1489, -1491, -1239, -1335, -1431, -1335, -1495, +681, +2345, +2089, +1456, +1461, +1466,
+ +1471, +1476, +1473, +1470, +1467, +1464, +1570, +1676, +1174, -1888, -950, -2060, -1986,
+ -1912, -1874, -1836, -1798, -1760, -1658, -1556, -1454, -1352, -1292, -1232, -1204, -1688,
+ -1180, -1184, -1156, -1128, -1203, -254, +2071, +1836, +2281, -1370, -2237, -2080, -2020,
+ -1960, -1964, -1968, -2028, -2088, -2020, -1952, -1855, -1758, -1725, -1692, -1635, -1578,
+ -1329, -1592, -1504, -1416, -1040, -1688, +2088, +2280, +2280, +1428, +1438, +1450, +1460,
+ +1472, +1463, +1454, +1493, +1533, +1512, +1748, -160, -2068, -1346, -1137, -1775, -1902,
+ -1848, -1794, -1708, -1622, -1544, -1466, -1356, -1247, -1198, -1149, -1196, -1755, -1246,
+ -993, -1012, -1032, -1202, +930, +2023, +1837, +2238, -2480, -2286, -1838, -1799, -1761,
+ -1835, -1909, -1954, -2000, -1982, -1964, -1908, -1853, -1829, -1807, -1749, -1692, -1538,
+ -1642, -1526, -1410, -638, -122, +774, +1926, +1926, +1400, +1417, +1434, +1451, +1469,
+ +1454, +1439, +1520, +1602, +1455, +1820, -1239, -1737, -1743, -726, -1821, -1892, -1822,
+ -1752, -1618, -1485, -1431, -1377, -1259, -1142, -1104, -1066, -1188, -1823, -1313, -803,
+ -869, -936, -1203, +2115, +1976, +1838, +916, -2055, -1569, -1596, -1579, -1563, -1706,
+ -1850, -1881, -1913, -1944, -1976, -1962, -1949, -1935, -1922, -1864, -1807, -1749, -1692,
+ -1548, -1404, -1004, -92, +996, +2084, +2084, +1372, +1394, +1418, +1441, +1465, +1444,
+ +1423, +1483, +1543, +1765, +1732, -2204, -1533, -1611, -1179, -1274, -1882, -1764, -1646,
+ -1560, -1475, -1301, -1127, -1113, -1101, -994, -887, -1052, -1730, -1395, -804, -709,
+ -872, -306, +2051, +1929, +2063, -151, -1597, -1347, -1354, -1326, -1300, -1417, -1535,
+ -1599, -1665, -1730, -1796, -1824, -1852, -1880, -1909, -1883, -1857, -1767, -1678, -1570,
+ -1462, -1434, +1154, +2402, +1858, +1858, +1344, +1373, +1403, +1432, +1462, +1435, +1409,
+ +1446, +1484, +1564, +621, -1890, -1842, -1737, -1633, -728, -1872, -1706, -1541, -1503,
+ -1466, -1428, -1391, -1225, -1060, -884, -709, -917, -1638, -1478, -807, -551, -808,
+ +590, +1988, +1882, +2288, -1218, -1140, -1126, -1112, -1075, -1038, -1129, -1220, -1319,
+ -1418, -1517, -1616, -1686, -1756, -1826, -1896, -1902, -1908, -1786, -1664, -1592, -1520,
+ -1864, +2400, +2016, +2144, +2144, +1348, +1372, +1398, +1424, +1450, +1463, +1477, +1491,
+ +1505, +1729, -607, -1838, -1790, -1735, -1681, -1003, -1350, -1710, -1558, -1519, -1480,
+ -1382, -1285, -1379, -1475, -1208, -941, -611, -793, -796, -800, -611, -680, +1364,
+ +1872, +1932, +1481, -1150, -966, -926, -886, -868, -851, -929, -1009, -1061, -1114,
+ -1230, -1348, -1521, -1695, -1805, -1915, -1900, -1886, -1792, -1698, -1604, -1766, -744,
+ +2326, +2134, +2198, +2198, +1352, +1373, +1395, +1417, +1439, +1492, +1546, +1536, +1526,
+ +1894, -1835, -1787, -1739, -1735, -1731, -1279, -828, -1714, -1577, -1536, -1495, -1337,
+ -1180, -1023, -866, -764, -663, -562, -973, -371, -282, -417, -552, +2138, +1757,
+ +1983, +674, -1083, -793, -726, -660, -662, -665, -731, -798, -804, -811, -945,
+ -1080, -1357, -1635, -1784, -1934, -1899, -1865, -1798, -1732, -1616, -2012, +376, +2252,
+ +2252, +2252, +2252, +1356, +1373, +1391, +1409, +1427, +1425, +1423, +1501, +1579, +907,
+ -1814, -1702, -1847, -1909, -1716, -1634, -786, -1686, -1819, -1712, -1605, -1371, -1139,
+ -921, -705, -656, -608, -384, -416, -233, -308, -477, +376, +1968, +1769, +2033,
+ -5, -839, -651, -606, -562, -584, -606, -660, -715, -739, -763, -963, -1164,
+ -1432, -1702, -1843, -1985, -1977, -1971, -1884, -1798, -2012, -2226, +2152, +2178, +2194,
+ +2210, +2210, +1360, +1374, +1388, +1402, +1416, +1358, +1300, +1466, +1632, -81, -1794,
+ -1619, -1956, -2085, -1702, -1991, -744, -891, -526, -353, -180, -383, -586, -821,
+ -1056, -805, -554, -463, -372, -353, -334, -539, +1304, +1799, +1782, +2085, -684,
+ -597, -510, -487, -464, -506, -548, -590, -632, -674, -716, -982, -1248, -1509,
+ -1770, -1903, -2036, -2057, -2078, -1971, -1864, -1896, -1416, +2392, +2104, +2136, +2168,
+ +2168, +1346, +1358, +1371, +1383, +1396, +1395, +1393, +1552, +1711, -1177, -1762, -2203,
+ -1364, -465, +690, +1942, +1913, +1747, +1837, +1816, +1794, +1889, +1983, +1774, +1564,
+ +548, -468, -299, -386, -391, -398, -147, +1895, +1920, +1946, +1284, -401, -397,
+ -393, -421, -450, -478, -507, -568, -629, -722, -815, -1068, -1321, -1697, -2074,
+ -2082, -2091, -2129, -2168, -2030, -1894, -2028, +142, +2280, +2114, +2082, +2050, +2050,
+ +1332, +1343, +1354, +1365, +1377, +1432, +1487, +1382, +1278, -1763, -195, +1308, +1788,
+ +1667, +1547, +1522, +1498, +1569, +1641, +1681, +1721, +1600, +1480, +1552, +1624, +1901,
+ +2179, +1145, -401, -431, -462, -12, +1974, +1786, +2111, +484, -119, -198, -277,
+ -356, -436, -451, -467, -547, -627, -770, -914, -898, -882, -606, -330, -470,
+ -611, -1435, -2259, -2091, -1924, -2160, +1700, +2168, +2124, +2028, +1932, +1932, +1318,
+ +1327, +1337, +1346, +1357, +1405, +1452, +1420, +1389, +1381, +1629, +1748, +1356, +1495,
+ +1635, +1631, +1627, +1551, +1732, +1689, +1647, +1728, +1809, +1730, +1652, +1686, +1721,
+ +1948, +1921, +874, -430, +363, +1925, +1764, +1859, +148, -28, -95, -160, -291,
+ -422, -423, -426, -557, -688, -370, -309, -280, -251, -570, -890, -858, -826,
+ -563, -301, -1079, -1858, -1636, +2170, +2296, +2166, +2118, +2070, +2070, +1304, +1312,
+ +1321, +1329, +1338, +1378, +1419, +1459, +1500, +1452, +1404, +1420, +1436, +1580, +1724,
+ +1484, +1244, +1022, +1313, +1187, +1062, +1088, +1115, +1397, +1680, +1728, +1777, +1729,
+ +1682, +1922, +1651, +1763, +1876, +1742, +1609, -189, +62, +8, -45, -226, -408,
+ -397, -387, -568, -750, -227, -217, -430, -644, -1047, -1451, -1502, -1554, -1229,
+ -905, -580, -256, -856, +1616, +1912, +2208, +2208, +2208, +2208, +1290, +1304, +1319,
+ +1334, +1350, +1377, +1404, +1271, +1395, +1525, +1655, +1769, +1884, +1802, +1720, +1430,
+ +1141, +1026, +1168, +1037, +908, +700, +491, +331, +172, +873, +1575, +1524, +1731,
+ +1991, +1738, +1774, +1811, +1914, +993, -119, +48, -74, -196, -271, -346, -407,
+ -470, -324, -179, -213, -503, -810, -1117, -1273, -1430, -1636, -1841, -1823, -1551,
+ -1246, -686, +1194, +1026, +1610, +2194, +2194, +2194, +2194, +1276, +1297, +1319, +1341,
+ +1363, +1376, +1390, +1340, +1802, +1854, +1907, +1863, +1820, +1768, +1717, +1377, +1038,
+ +1031, +1024, +889, +755, +568, +381, +290, +200, +19, -162, +553, +1781, +2060,
+ +1827, +1786, +1746, +2086, +378, -50, +35, -156, -348, -316, -284, -419, -554,
+ -337, -121, -456, -791, -934, -1078, -1244, -1411, -1514, -1617, -1907, -1686, -1657,
+ -1116, +1964, +1972, +2076, +2180, +2180, +2180, +2180, +1262, +1289, +1318, +1346, +1375,
+ +1359, +1344, +1632, +1921, +1927, +1934, +1876, +1820, +1702, +1585, +1259, +935, +907,
+ +880, +724, +569, +436, +302, +217, +132, +44, -43, -99, +102, +801, +2011,
+ +1878, +1745, +1426, +2131, +916, -43, -191, -340, -393, -446, -461, -478, -237,
+ -254, -522, -790, -962, -1135, -1519, -1647, -1760, -1872, -1446, -2045, -1827, -1354,
+ +2254, +2278, +2222, +2166, +2166, +2166, +2166, +1248, +1283, +1318, +1353, +1388, +1343,
+ +1298, +1925, +2040, +2001, +1962, +1891, +1820, +1637, +1454, +1143, +832, +784, +736,
+ +560, +384, +304, +224, +144, +64, +70, +76, +18, -40, +54, +1684, +1714,
+ +1744, +1790, +1836, +1882, +1928, +798, -332, -470, -608, -505, -402, -139, -388,
+ -589, -790, -991, -1192, -1794, -1884, -2006, -2128, -2266, -868, +818, +2504, +2288,
+ +2072, +2112, +2152, +2152, +2152, +2152, +1238, +1263, +1290, +1332, +1375, +1301, +1484,
+ +2002, +2009, +1973, +1939, +1871, +1805, +1608, +1411, +1118, +826, +751, +676, +505,
+ +334, +273, +212, +151, +91, +69, +48, +11, -26, +482, +1758, +1771, +1784,
+ +2033, +1771, +1860, +1950, +1989, +2029, +884, -260, -1156, -261, -309, -614, -922,
+ -975, -1411, -1848, -2062, -2019, -697, +626, +2060, +2471, +2273, +2076, +2051, +2026,
+ +2081, +2136, +2136, +2136, +2136, +1228, +1245, +1263, +1313, +1363, +1260, +1670, +2080,
+ +1978, +1947, +1916, +1853, +1791, +1580, +1369, +1094, +820, +718, +616, +450, +285,
+ +243, +201, +159, +118, +69, +20, +4, -13, +910, +1833, +1828, +1824, +229,
+ +1706, +1839, +1972, +1901, +1830, +1983, +2136, +2032, +1416, +1056, +696, +280, +376,
+ +728, +1080, +1767, +2454, +2405, +2356, +2035, +2226, +2193, +2160, +2070, +1980, +2050,
+ +2120, +2120, +2120, +2120, +1218, +1226, +1235, +1292, +1350, +1235, +1888, +2061, +1979,
+ +1935, +1893, +1834, +1776, +1551, +1326, +1070, +814, +685, +556, +395, +235, +212,
+ +189, +166, +145, +116, +88, -68, +33, +1306, +1811, +1949, +1576, -200, -183,
+ +905, +1994, +1956, +1919, +1881, +1844, +2004, +1909, +2005, +2102, +2042, +2239, +2195,
+ +2152, +2043, +1935, +2370, +2038, +2697, +1821, +368, +2244, +2121, +1998, +2051, +2104,
+ +2104, +2104, +2104, +1208, +1208, +1209, +1273, +1338, +1210, +2107, +2043, +1980, +1925,
+ +1871, +1816, +1762, +1523, +1285, +1046, +808, +652, +497, +341, +186, +182, +179,
+ +175, +172, +164, +157, +117, +590, +1958, +1791, +1815, +816, +140, -24, -28,
+ -32, +988, +2008, +2036, +2064, +1977, +1890, +1931, +1972, +2013, +2054, +2127, +2200,
+ +2320, +2440, +2080, +184, -1760, -3192, +336, +2328, +2172, +2016, +2052, +2088, +2088,
+ +2088, +2088, +1222, +1215, +1209, +1266, +1325, +1459, +2104, +2046, +1989, +1945, +1903,
+ +1861, +1819, +1612, +1406, +1136, +866, +715, +564, +446, +328, +295, +263, +230,
+ +199, +481, +764, +711, +1427, +2086, +1721, +1692, +128, -37, +55, -14, -82,
+ -108, -135, +335, +804, +1293, +1783, +2272, +2250, +2197, +1889, +1356, +568, -763,
+ -2095, -3010, -2646, -2931, -2705, +2305, +2196, +2159, +2122, +2117, +2112, +2112, +2112,
+ +2112, +1236, +1223, +1210, +1261, +1313, +1708, +2103, +2050, +1998, +1967, +1937, +1907,
+ +1877, +1702, +1528, +1226, +924, +778, +633, +552, +471, +409, +348, +287, +226,
+ +287, +349, +283, +1241, +1702, +1652, +1826, -48, +43, +134, +1, -132, -181,
+ -230, -343, -456, -670, -884, -202, -544, -946, -1860, -1718, -2088, -2311, -2534,
+ -2469, -2404, -2311, -1706, +2483, +2064, +2146, +2228, +2182, +2136, +2136, +2136, +2136,
+ +1250, +1230, +1211, +1255, +1300, +1957, +2101, +2054, +2007, +1956, +1906, +1856, +1806,
+ +1696, +1586, +1284, +982, +841, +701, +657, +613, +554, +497, +438, +381, +412,
+ +445, +717, +1758, +1782, +1807, +1095, -128, -70, -11, -97, -182, -253, -325,
+ -428, -532, -761, -991, -580, -170, -1033, -873, -1976, -1800, -2018, -2237, -2343,
+ -2450, -2650, -35, +2308, +2092, +2117, +2142, +2151, +2160, +2160, +2160, +2160, +1264,
+ +1238, +1212, +1250, +1288, +2206, +2100, +2058, +2016, +1946, +1876, +1806, +1736, +1690,
+ +1644, +1342, +1040, +905, +770, +763, +756, +701, +646, +591, +536, +539, +542,
+ +897, +1764, +1607, +1962, +365, -208, -182, -156, -194, -232, -326, -420, -514,
+ -608, -853, -1098, -1471, -820, -97, -910, -955, -2024, -2238, -2452, -2474, -2496,
+ -2990, +1636, +2134, +2120, +2088, +2056, +2120, +2184, +2184, +2184, +2184, +1198, +1191,
+ +1185, +1227, +1525, +2065, +2093, +2009, +1925, +1887, +1850, +1781, +1712, +1682, +1653,
+ +1464, +1275, +1130, +986, +937, +889, +840, +792, +743, +696, +684, +674, +1335,
+ +1741, +1839, +1939, +54, -294, -295, -297, -298, -300, -414, -527, -641, -755,
+ -947, -1140, -1732, -1813, -733, -166, -1038, -887, -1234, -1581, -1609, -1636, -1158,
+ +2392, +2279, +2166, +2119, +2072, +2121, +2170, +2170, +2170, +2170, +1132, +1145, +1159,
+ +1205, +1763, +1924, +2086, +1960, +1834, +1829, +1825, +1756, +1688, +1675, +1663, +1586,
+ +1510, +1356, +1202, +1112, +1023, +981, +939, +897, +856, +831, +807, +1774, +1718,
+ +1817, +1405, -512, -380, -409, -438, -403, -369, -502, -635, -768, -902, -1042,
+ -1182, -1482, -1782, -2138, -1982, -610, -262, -486, -711, -744, -777, +162, +2125,
+ +1912, +2212, +2150, +2088, +2122, +2156, +2156, +2156, +2156, +1194, +1146, +1100, +1182,
+ +1776, +1927, +2079, +1863, +1903, +1978, +1799, +1843, +1632, +1619, +1608, +1612, +1617,
+ +1517, +1418, +1351, +1284, +1216, +1149, +1098, +1048, +945, +1099, +1781, +1695, +1954,
+ +422, -566, -530, -554, -579, -571, -565, -686, -806, -927, -1049, -1232, -1416,
+ -1679, -1943, -2342, -2486, -2501, -2773, -2074, -1376, -1671, -2221, +458, +2369, +2137,
+ +2162, +2133, +2104, +2123, +2142, +2142, +2142, +2142, +1256, +1149, +1043, +1160, +1790,
+ +1931, +2073, +1766, +1972, +2129, +1774, +1931, +1576, +1565, +1554, +1639, +1724, +1679,
+ +1635, +1590, +1546, +1453, +1361, +1300, +1240, +1060, +1392, +1788, +1672, +2092, -560,
+ -620, -680, -700, -721, -741, -762, -870, -979, -1087, -1196, -1423, -1650, -1877,
+ -2104, -2291, -2478, -2857, -2724, -2895, -3067, -3110, -3666, +2547, +2103, +2107, +2112,
+ +2116, +2120, +2124, +2128, +2128, +2128, +2128, +1214, +1170, +1128, +1453, +1779, +1692,
+ +1861, +1807, +1753, +1732, +1712, +1803, +1640, +1759, +1623, +1710, +1799, +1666, +1790,
+ +1755, +1719, +1628, +1539, +1497, +1456, +1352, +1504, +1752, +1745, +1445, -902, -898,
+ -894, -907, -921, -935, -950, -1070, -1190, -1310, -1431, -1641, -1852, -2062, -2273,
+ -2431, -2590, -2812, -2779, -2929, -3080, -3279, -2198, +2298, +2187, +2124, +2062, +2081,
+ +2100, +2119, +2138, +2138, +2138, +2138, +1172, +1193, +1214, +1747, +1769, +1710, +2163,
+ +2360, +2046, +1592, +1651, +1677, +1704, +1954, +1693, +1783, +1874, +1654, +1947, +1920,
+ +1893, +1805, +1718, +1695, +1672, +1644, +1617, +1717, +1818, +798, -1245, -1176, -1108,
+ -1115, -1123, -1131, -1139, -1270, -1402, -1534, -1666, -1860, -2054, -2248, -2442, -2572,
+ -2702, -2768, -2834, -2964, -3094, -3192, -219, +2306, +2272, +2142, +2012, +2046, +2080,
+ +2114, +2148, +2148, +2148, +2148, +1194, +1150, +1364, +1784, +1694, +1983, +2272, +1441,
+ +2147, +1980, +1813, +1838, +1864, +1909, +1698, +1823, +1949, +1818, +1943, +1989, +2034,
+ +1933, +1833, +1812, +1792, +1712, +1633, +1649, +1923, -536, -1459, -1390, -1322, -1354,
+ -1388, -1421, -1455, -1566, -1678, -1789, -1901, -2078, -2256, -2433, -2611, -2744, -2878,
+ -2915, -2953, -2998, -3044, -3777, +1633, +2298, +1941, +2015, +2090, +2107, +2124, +2141,
+ +2158, +2158, +2158, +2158, +1216, +1109, +1514, +1823, +1620, +2001, +1870, +1803, +1224,
+ +1600, +1464, +1232, +1000, +1096, +1192, +1352, +1512, +1726, +1940, +2058, +2176, +2062,
+ +1948, +1930, +1912, +1781, +1650, +1583, +2028, -1871, -1674, -1605, -1536, -1595, -1654,
+ -1713, -1772, -1863, -1954, -2045, -2136, -2297, -2458, -2619, -2780, -2917, -3054, -3063,
+ -3072, -3033, -2994, -2827, +2460, +2035, +2122, +2145, +2168, +2168, +2168, +2168, +2168,
+ +2168, +2168, +2168, +1190, +1271, +1610, +1756, +1647, +1523, +1144, +1324, +1249, +1364,
+ +1224, +1211, +1199, +1255, +1566, +1430, +1294, +1404, +1514, +1800, +2087, +2075, +2063,
+ +2003, +1944, +1654, +1621, +1811, +979, -1997, -1903, -1888, -1874, -1927, -1982, -2036,
+ -2091, -2163, -2236, -2308, -2381, -2513, -2646, -2778, -2911, -3005, -3100, -3114, -3129,
+ -3039, -3206, -1084, +2317, +2104, +2148, +2159, +2171, +2175, +2179, +2183, +2187, +2187,
+ +2187, +2187, +1164, +1179, +1195, +1179, +1163, +1302, +1442, +1358, +1274, +1385, +1496,
+ +1447, +1399, +1158, +1429, +1508, +1588, +1594, +1601, +1543, +1486, +1832, +2179, +2077,
+ +1976, +1528, +1593, +1785, -582, -2381, -2133, -2172, -2212, -2261, -2311, -2361, -2411,
+ -2464, -2518, -2572, -2626, -2730, -2834, -2938, -3042, -3094, -3146, -3166, -3186, -3046,
+ -3418, +658, +2174, +2174, +2174, +2174, +2174, +2182, +2190, +2198, +2206, +2206, +2206,
+ +2206, +1202, +1230, +1259, +1272, +1286, +1321, +1356, +1343, +1331, +1405, +1480, +1474,
+ +1470, +1349, +1483, +1522, +1562, +1576, +1591, +1573, +1557, +1589, +1622, +1718, +1816,
+ +1690, +1820, +1694, -2015, -2556, -2330, -2376, -2422, -2610, -2799, -2700, -2602, -2669,
+ -2736, -2803, -2871, -2946, -3022, -3097, -3173, -3182, -3192, -3153, -3115, -3324, -3278,
+ +2256, +2159, +2147, +2136, +2156, +2177, +2189, +2201, +2213, +2225, +2225, +2225, +2225,
+ +1240, +1282, +1325, +1367, +1410, +1340, +1271, +1329, +1388, +1426, +1465, +1503, +1542,
+ +1540, +1539, +1537, +1536, +1559, +1582, +1605, +1628, +1603, +1578, +1617, +1656, +1596,
+ +1536, +1604, -2936, -2476, -2528, -2580, -2632, -2704, -2777, -2785, -2794, -2874, -2955,
+ -3035, -3116, -3163, -3210, -3257, -3304, -3271, -3238, -3141, -3044, -3091, -2114, +2319,
+ +2144, +2121, +2098, +2139, +2180, +2196, +2212, +2228, +2244, +2244, +2244, +2244, +1230,
+ +1255, +1281, +1306, +1333, +1303, +1272, +1338, +1405, +1436, +1468, +1500, +1533, +1535,
+ +1537, +1539, +1542, +1562, +1584, +1605, +1627, +1601, +1577, +1616, +1656, +1807, +1959,
+ -417, -2793, -2797, -2545, -2581, -2618, -2687, -2757, -2794, -2833, -2901, -2968, -3036,
+ -3105, -3145, -3186, -3178, -3171, -3149, -3128, -3058, -2989, -3221, -126, +2281, +2129,
+ +2084, +2040, +2107, +2175, +2189, +2203, +2217, +2231, +2231, +2231, +2231, +1220, +1229,
+ +1238, +1247, +1257, +1266, +1275, +1348, +1422, +1447, +1473, +1499, +1525, +1530, +1536,
+ +1542, +1548, +1567, +1587, +1606, +1626, +1601, +1577, +1616, +1656, +1763, +1871, +1658,
+ -2138, -2862, -2563, -2583, -2604, -2671, -2738, -2805, -2873, -2928, -2983, -3038, -3094,
+ -3128, -3162, -3100, -3038, -3028, -3018, -2976, -2934, -3352, +1862, +2244, +2114, +2048,
+ +1982, +2076, +2170, +2182, +2194, +2206, +2218, +2218, +2218, +2218, +1210, +1234, +1259,
+ +1283, +1308, +1325, +1341, +1390, +1439, +1457, +1477, +1496, +1516, +1525, +1535, +1544,
+ +1554, +1571, +1589, +1607, +1625, +1616, +1608, +1632, +1656, +1718, +1782, +1685, +1845,
+ +528, -2836, -2728, -2622, -2654, -2687, -2719, -2752, -2763, -2773, -2992, -2955, -3030,
+ -3106, -2813, -2777, -3226, -2908, -3134, -3359, -971, +2186, +2270, +2099, +2075, +2052,
+ +2108, +2165, +2175, +2185, +2195, +2205, +2205, +2205, +2205, +1200, +1240, +1280, +1320,
+ +1360, +1384, +1408, +1432, +1456, +1469, +1482, +1495, +1508, +1521, +1534, +1547, +1560,
+ +1576, +1592, +1608, +1624, +1632, +1640, +1648, +1656, +1675, +1694, +1713, +1732, +1871,
+ +986, -827, -2640, -2638, -2636, -2634, -2632, -2598, -2564, -2946, -2816, -2933, -3050,
+ -2783, -3028, -3169, -1774, +293, +2360, +2179, +1998, +2041, +2084, +2103, +2122, +2141,
+ +2160, +2168, +2176, +2184, +2192, +2192, +2192, +2192, +1232, +1266, +1300, +1334, +1368,
+ +1390, +1412, +1434, +1456, +1468, +1482, +1494, +1508, +1520, +1534, +1546, +1560, +1578,
+ +1596, +1614, +1632, +1640, +1648, +1656, +1664, +1645, +1628, +1705, +1784, +2101, +1908,
+ +1298, +688, +1071, -594, -1587, -2580, -2891, -3202, -2281, -2640, -2058, -1476, -94,
+ +1032, +2278, +2244, +2209, +2176, +2131, +2088, +2091, +2096, +2111, +2128, +2143, +2160,
+ +2168, +2176, +2184, +2192, +2192, +2192, +2192, +1264, +1292, +1320, +1348, +1376, +1396,
+ +1416, +1436, +1456, +1469, +1482, +1495, +1508, +1521, +1534, +1547, +1560, +1580, +1600,
+ +1620, +1640, +1648, +1656, +1664, +1672, +1617, +1562, +1699, +1836, +1821, +1806, +1887,
+ +1968, +1964, +1960, +2020, +2080, +1936, +1792, +1200, +1632, +1889, +2146, +2083, +2020,
+ +2093, +2166, +2079, +1992, +2085, +2178, +2143, +2108, +2121, +2134, +2147, +2160, +2168,
+ +2176, +2184, +2192, +2192, +2192, +2192, +1296, +1318, +1340, +1362, +1384, +1402, +1420,
+ +1438, +1456, +1468, +1482, +1494, +1508, +1520, +1534, +1546, +1560, +1582, +1604, +1626,
+ +1648, +1656, +1664, +1672, +1680, +1667, +1656, +1739, +1824, +1811, +1800, +1835, +1872,
+ +1881, +1890, +1819, +1748, +1995, +450, +937, +912, +715, +2056, +2019, +1984, +2035,
+ +2088, +2059, +2032, +2085, +2140, +2129, +2120, +2129, +2140, +2149, +2160, +2168, +2176,
+ +2184, +2192, +2192, +2192, +2192, +1328, +1344, +1360, +1376, +1392, +1408, +1424, +1440,
+ +1456, +1469, +1482, +1495, +1508, +1521, +1534, +1547, +1560, +1584, +1608, +1632, +1656,
+ +1664, +1672, +1680, +1688, +1719, +1750, +1781, +1812, +1803, +1794, +1785, +1776, +1798,
+ +1820, +1874, +1928, +1798, +2180, +674, +1216, +2103, +1966, +1957, +1948, +1979, +2010,
+ +2041, +2072, +2087, +2102, +2117, +2132, +2139, +2146, +2153, +2160, +2168, +2176, +2184,
+ +2192, +2192, +2192, +2192, +1328, +1344, +1360, +1376, +1392, +1408, +1424, +1440, +1456,
+ +1468, +1482, +1494, +1508, +1520, +1534, +1546, +1560, +1584, +1608, +1632, +1656, +1664,
+ +1672, +1680, +1688, +1718, +1750, +1780, +1812, +1802, +1794, +1784, +1776, +1798, +1820,
+ +1858, +1896, +1750, +1860, +2338, +1792, +2134, +1966, +1956, +1948, +1978, +2010, +2040,
+ +2072, +2086, +2102, +2116, +2132, +2138, +2146, +2152, +2160, +2168, +2176, +2184, +2192,
+ +2192, +2192, +2192, +1328, +1344, +1360, +1376, +1392, +1408, +1424, +1440, +1456, +1469,
+ +1482, +1495, +1508, +1521, +1534, +1547, +1560, +1584, +1608, +1632, +1656, +1664, +1672,
+ +1680, +1688, +1719, +1750, +1781, +1812, +1803, +1794, +1785, +1776, +1798, +1820, +1842,
+ +1864, +1958, +2052, +1954, +1856, +1911, +1966, +1957, +1948, +1979, +2010, +2041, +2072,
+ +2087, +2102, +2117, +2132, +2139, +2146, +2153, +2160, +2168, +2176, +2184, +2192, +2192,
+ +2192, +2192, +1328, +1344, +1360, +1376, +1392, +1408, +1424, +1440, +1456, +1468, +1482,
+ +1494, +1508, +1520, +1534, +1546, +1560, +1584, +1608, +1632, +1656, +1664, +1672, +1680,
+ +1688, +1718, +1750, +1780, +1812, +1802, +1794, +1784, +1776, +1798, +1820, +1842, +1864,
+ +1958, +2052, +1954, +1856, +1910, +1966, +1956, +1948, +1978, +2010, +2040, +2072, +2086,
+ +2102, +2116, +2132, +2138, +2146, +2152, +2160, +2168, +2176, +2184, +2192, +2192, +2192,
+ +2192
+};
+
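+/**
+ * 64x64 Cr (red-difference chroma) plane of the test image, in the same
+ * x32 fixed-point format as the Cb plane above
+ */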
+static const INT16 TEST_CR_COMPONENT[4096] = {
+ -2112, -2114, -2116, -2118, -2120, -2122, -2124, -2126, -2128, -2118, -2108, -2098, -2088,
+ -2150, -2212, -2146, -2080, -2100, -2120, -2140, -2160, -2164, -2168, -2172, -2176, -2092,
+ -2008, -2052, -2096, -2132, -2168, -2076, -1984, -2088, -2192, -2168, -2144, -2136, -2128,
+ -2120, -2112, -2126, -2140, -2154, -2168, -2150, -2132, -2114, -2096, -2096, -2096, -2096,
+ -2096, -2096, -2096, -2096, -2096, -2080, -2064, -2048, -2032, -2032, -2032, -2032, -2128,
+ -2113, -2098, -2115, -2132, -2133, -2134, -2135, -2137, -2127, -2117, -2107, -2097, -2117,
+ -2137, -2125, -2114, -2134, -2154, -2159, -2163, -2135, -2108, -2128, -2149, -2132, -2116,
+ -2116, -2115, -2115, -2114, -2098, -2082, -2112, -2142, -2141, -2139, -2133, -2128, -2122,
+ -2117, -2127, -2137, -2147, -2158, -2146, -2134, -2122, -2111, -2108, -2106, -2104, -2102,
+ -2101, -2101, -2101, -2101, -2087, -2073, -2059, -2045, -2045, -2045, -2045, -2144, -2112,
+ -2080, -2112, -2145, -2145, -2145, -2145, -2146, -2136, -2126, -2116, -2107, -2085, -2063,
+ -2105, -2148, -2168, -2189, -2178, -2167, -2107, -2048, -2085, -2122, -2173, -2225, -2180,
+ -2135, -2098, -2061, -2120, -2180, -2136, -2093, -2114, -2135, -2131, -2128, -2125, -2122,
+ -2128, -2135, -2141, -2148, -2142, -2137, -2131, -2126, -2121, -2117, -2112, -2108, -2107,
+ -2107, -2106, -2106, -2094, -2082, -2070, -2058, -2058, -2058, -2058, -2160, -2111, -2062,
+ -2109, -2157, -2156, -2155, -2154, -2155, -2145, -2135, -2125, -2116, -2132, -2148, -2132,
+ -2118, -2154, -2191, -2181, -2170, -2494, -2308, -2393, -2479, -2470, -2461, -2243, -2282,
+ -2353, -2167, -2174, -2182, -2160, -2139, -2135, -2130, -2128, -2128, -2127, -2127, -2129,
+ -2132, -2134, -2138, -2138, -2139, -2139, -2141, -2133, -2127, -2120, -2114, -2112, -2112,
+ -2111, -2111, -2101, -2091, -2081, -2071, -2071, -2071, -2071, -2176, -2110, -2045, -2107,
+ -2170, -2168, -2167, -2165, -2164, -2154, -2145, -2135, -2126, -2180, -2235, -2161, -2088,
+ -2141, -2195, -2440, -2686, -2371, -1033, -398, +236, +305, +375, -3, -894, -2096,
+ -2787, -2485, -2184, -2185, -2187, -2156, -2126, -2127, -2129, -2130, -2132, -2131, -2130,
+ -2129, -2128, -2135, -2142, -2149, -2156, -2147, -2138, -2129, -2120, -2119, -2118, -2117,
+ -2116, -2108, -2100, -2092, -2084, -2084, -2084, -2084, -2112, -2085, -2058, -2112, -2166,
+ -2067, -2225, -2190, -2157, -2107, -2057, -2104, -2151, -2119, -2088, -2632, -2666, -2263,
+ -837, +844, +2526, +3327, +2847, +2847, +2847, +2726, +2606, +2967, +3070, +2968, +2867,
+ +397, -2074, -2745, -2137, -2281, -2169, -2202, -2236, -2190, -2145, -2145, -2147, -2148,
+ -2150, -2152, -2156, -2159, -2163, -2159, -2156, -2152, -2150, -2130, -2111, -2123, -2137,
+ -2127, -2117, -2107, -2097, -2097, -2097, -2097, -2048, -2060, -2073, -2118, -2163, -1967,
+ -2284, -2217, -2150, -2060, -1971, -2074, -2177, -2315, -2454, -1057, +1364, +2990, +2568,
+ +2593, +2619, +2369, +2631, +2508, +2386, +2332, +2278, +2352, +2427, +2913, +2888, +3022,
+ +3156, +1302, -2088, -2406, -2213, -2279, -2345, -2251, -2158, -2161, -2165, -2168, -2172,
+ -2171, -2171, -2170, -2170, -2172, -2175, -2177, -2180, -2142, -2105, -2131, -2158, -2146,
+ -2134, -2122, -2110, -2110, -2110, -2110, -2112, -2163, -2215, -2235, -2255, -1994, -2247,
+ -2194, -2143, -2109, -2076, -2123, -2170, -2270, +700, +3527, +2770, +2035, +2325, +2293,
+ +2263, +2178, +2350, +2265, +2181, +2129, +2078, +2154, +2231, +2521, +2557, +2559, +2562,
+ +3221, +3113, +140, -2832, -2034, -2261, -2199, -2139, -2160, -2182, -2188, -2194, -2189,
+ -2185, -2181, -2177, -2185, -2193, -2201, -2210, -2154, -2098, -2138, -2179, -2165, -2151,
+ -2137, -2123, -2123, -2123, -2123, -1664, -1755, -1846, -1841, -1836, -1767, -2210, -2173,
+ -2136, -2159, -2182, -2173, -2164, -2739, +2830, +2735, +2640, +2361, +2082, +1995, +1908,
+ +1989, +2070, +2023, +1976, +1927, +1878, +1957, +2036, +2131, +2226, +2353, +2480, +2581,
+ +2682, +2943, +2692, -2815, -2178, -2149, -2120, -2160, -2200, -2208, -2216, -2208, -2200,
+ -2192, -2184, -2198, -2212, -2226, -2240, -2166, -2092, -2146, -2200, -2184, -2168, -2152,
+ -2136, -2136, -2136, -2136, -2096, -2166, -2238, -2228, -2220, -2087, -2210, -2173, -2137,
+ -2189, -2243, -2152, -2318, -2031, +3375, +2861, +2605, +2305, +2007, +1851, +1697, +1756,
+ +1815, +1810, +1806, +1756, +1707, +1754, +1801, +1911, +2023, +2149, +2277, +2299, +2323,
+ +2729, +1345, -2439, -2129, -2217, -2307, -2349, -2136, -2179, -2222, -2223, -2224, -2193,
+ -2162, -2171, -2180, -2190, -2199, -2198, -2198, -2213, -2229, -2172, -2115, -2170, -2225,
+ -2113, -2257, -2257, -2016, -2067, -2118, -2105, -2093, -2152, -2211, -2174, -2138, -2221,
+ -2305, -2132, -2472, +212, +2897, +2477, +2570, +2251, +1932, +1709, +1487, +1524, +1561,
+ +1598, +1636, +1586, +1537, +1552, +1567, +1693, +1820, +1947, +2074, +2019, +1964, +2261,
+ -514, -2321, -2080, -2031, -1982, -2283, -2073, -2151, -2229, -2238, -2248, -2194, -2140,
+ -2144, -2149, -2154, -2159, -2231, -2304, -2281, -2258, -2160, -2062, -2188, -2314, -2090,
+ -2378, -2378, -2064, -2094, -2126, -2125, -2125, -2152, -2179, -2159, -2139, -2204, -2270,
+ -2144, -2530, +1688, +2834, +2460, +2343, +2147, +1953, +1678, +1404, +1387, +1370, +1418,
+ +1466, +1416, +1366, +1349, +1332, +1442, +1553, +1663, +1775, +1817, +1861, +2415, -2405,
+ -2457, -1999, -2035, -281, -1464, -2393, -2378, -2363, -2301, -2240, -2195, -2150, -2165,
+ -2181, -2182, -2182, -2199, -2218, -2188, -2159, -2756, -2329, -1934, -2307, -2627, -2179,
+ -2307, -2112, -2123, -2135, -2146, -2158, -2153, -2149, -2144, -2140, -2188, -2236, -2156,
+ -2588, +3164, +2772, +2444, +2116, +2045, +1975, +1648, +1322, +1251, +1181, +1238, +1296,
+ +1246, +1197, +1147, +1098, +1192, +1287, +1381, +1476, +1617, +1758, +1291, -2760, -2083,
+ -2430, -1273, -628, -647, -667, -1582, -2498, -2365, -2233, -2196, -2160, -2187, -2215,
+ -2210, -2206, -2169, -2133, -2096, -2060, -280, -548, -2448, -1788, -860, -1980, -2236,
+ -2112, -2120, -2130, -2140, -2150, -2145, -2141, -2137, -2133, -2147, -2161, -2079, -718,
+ +3207, +2525, +2291, +2057, +1941, +1827, +1553, +1279, +1174, +1070, +1094, +1118, +1044,
+ +970, +976, +983, +1001, +1019, +1165, +1313, +1305, +1555, -212, -2491, -2189, -2401,
+ -867, -615, -642, -671, -603, -536, -1354, -2172, -2271, -2370, -2340, -2311, -2330,
+ -2349, -2315, -2282, -2697, -1321, -420, -543, -394, -757, -741, -2261, -2261, -2112,
+ -2119, -2127, -2135, -2143, -2138, -2134, -2130, -2126, -2106, -2087, -2259, +640, +2995,
+ +2279, +2138, +1998, +1839, +1681, +1459, +1237, +1098, +960, +950, +940, +842, +744,
+ +806, +869, +811, +753, +951, +1150, +995, +1352, -1715, -2222, -2297, -2372, -463,
+ -602, -639, -676, -649, -623, -600, -577, -810, -1044, -1214, -1384, -1426, -1469,
+ -1183, -897, -483, -582, -560, -538, -900, -750, -1134, -2542, -2286, -2112, -2117,
+ -2123, -2129, -2135, -2131, -2127, -2123, -2119, -2017, -1916, -2886, +1262, +2014, +2256,
+ +2097, +1939, +1736, +1534, +1364, +1194, +1022, +850, +806, +762, +736, +710, +508,
+ +818, +604, +646, +752, +859, +1131, +1149, -2865, -2273, -2339, -1639, -425, -493,
+ -522, -553, -566, -581, -677, -773, -661, -550, -567, -585, -586, -588, -657,
+ -727, -572, -675, -668, -661, -798, -679, -1799, -2407, -2151, -2112, -2116, -2120,
+ -2124, -2128, -2124, -2120, -2116, -2112, -2185, -2258, -1723, +1884, +1035, +2234, +2057,
+ +1880, +1634, +1388, +1270, +1152, +946, +740, +662, +584, +630, +676, +466, +1280,
+ +654, +540, +554, +568, +757, -78, -2481, -2324, -2383, -906, -389, -384, -407,
+ -430, -485, -540, -499, -458, -513, -568, -689, -810, -771, -732, -645, -558,
+ -663, -768, -776, -784, -696, -608, -2464, -2272, -2016, -2104, -2110, -2116, -2122,
+ -2129, -2105, -2081, -2105, -2130, -2204, -2536, -84, +1856, +1148, +1209, +1701, +1683,
+ +1507, +1332, +1188, +1045, +837, +630, +518, +407, +489, +572, +398, +1249, +662,
+ +330, +383, +436, +589, -1304, -2350, -2117, -2615, +213, -12, -239, -265, -293,
+ -320, -348, -377, -407, -484, -562, -626, -691, -675, -661, -625, -590, -682,
+ -776, -804, -832, -540, -248, -664, -1848, -2616, -2096, -2104, -2113, -2121, -2130,
+ -2086, -2043, -2095, -2148, -2225, -2815, +1555, +1829, +1519, +697, +1603, +1486, +1381,
+ +1276, +1107, +938, +729, +520, +375, +230, +349, +468, +331, +1219, +670, +121,
+ +212, +304, +423, -2531, -2477, -2423, -1569, +309, -149, -94, -125, -157, -157,
+ -157, -256, -356, -456, -556, -564, -573, -581, -590, -606, -623, -703, -784,
+ -832, -880, -384, +112, -1424, -2448, -2192, -2088, -2098, -2109, -2119, -2131, -2099,
+ -2068, -2100, -2134, -2485, -2325, +2921, +2025, +1536, +1048, +1088, +1385, +1270, +1156,
+ +993, +831, +700, +570, +407, +245, +256, +268, +343, +932, +662, +135, +185,
+ +236, -337, -2445, -2346, -2504, -793, +149, -75, -45, -64, -84, -88, -93,
+ -183, -273, -363, -454, -454, -454, -518, -583, -619, -655, -723, -792, -796,
+ -800, -868, -1960, -2296, -2376, -2248, -2080, -2093, -2106, -2119, -2132, -2113, -2094,
+ -2107, -2120, -2234, -813, +2752, +2222, +1555, +1401, +574, +1284, +1160, +1036, +880,
+ +724, +672, +620, +440, +260, +164, +69, +357, +646, +654, +151, +159, +168,
+ -1096, -2361, -2217, -2586, -18, -11, -3, +4, -4, -13, -21, -30, -110,
+ -191, -271, -352, -344, -336, -456, -576, -632, -688, -744, -800, -760, -720,
+ -584, -2496, -2400, -2304, -2304, -2072, -2086, -2102, -2117, -2133, -2171, -2211, -2170,
+ -2130, -2462, +1045, +2615, +2138, +1656, +1432, +807, +951, +1193, +924, +734, +545,
+ +397, +250, +486, +723, +569, +416, +311, +207, +384, +305, +242, +180, -1825,
+ -2295, -2348, -1891, +69, -19, -10, -3, -7, -12, -16, -22, -65, -107,
+ -182, -258, -309, -361, -477, -593, -640, -688, -736, -784, -752, -720, -1200,
+ -2448, -2384, -2320, -2320, -2064, -2081, -2099, -2116, -2134, -2231, -2329, -2234, -2140,
+ -2691, +2902, +2478, +2055, +1759, +1464, +1041, +618, +1227, +812, +589, +366, +379,
+ +392, +277, +162, +207, +253, +267, +281, +114, -52, +70, +192, -2555, -2230,
+ -2481, -1197, +156, -28, -19, -10, -11, -12, -13, -15, -20, -25, -94,
+ -164, -275, -387, -498, -610, -649, -689, -728, -768, -744, -720, -1816, -2400,
+ -2368, -2336, -2336, -2056, -2075, -2095, -2115, -2135, -2178, -2222, -2138, -2310, -1319,
+ +2743, +2293, +2099, +1893, +1432, +1242, +541, +1036, +1020, +699, +379, +376, +374,
+ +275, +177, +196, +217, +189, +162, +100, +39, +153, -756, -2420, -2293, -2549,
+ -502, +131, -4, -10, -17, -14, -12, -9, -7, -7, -6, -102, -198,
+ -320, -444, -519, -595, -641, -689, -720, -752, -768, -784, -2192, -2320, -2336,
+ -2352, -2352, -2048, -2070, -2092, -2114, -2136, -2126, -2116, -2042, -2480, +52, +2584,
+ +2108, +2144, +2028, +1400, +1444, +464, +78, -308, -470, -632, -394, -156, +18,
+ +192, +187, +182, +113, +44, +87, +130, +237, -1704, -2286, -2356, -2618, +192,
+ +106, +20, -2, -24, -18, -12, -6, +0, +6, +12, -110, -232, -367,
+ -502, -541, -580, -635, -690, -713, -736, -792, -848, -2568, -2240, -2304, -2368,
+ -2368, -2046, -2068, -2091, -2113, -2136, -2121, -2105, -2186, -2523, +1999, +2681, +2740,
+ +1518, +117, -1541, -2639, -2457, -2465, -2474, -2466, -2459, -2498, -2536, -2303, -2070,
+ -995, +81, -76, +24, +35, +47, -150, -2394, -2422, -2450, -1806, +117, +85,
+ +53, +21, -11, -11, -11, -11, -11, -11, -11, -107, -203, -404, -606,
+ -615, -625, -610, -596, -693, -791, -757, -1491, -2401, -2287, -2303, -2319, -2319,
+ -2044, -2067, -2090, -2113, -2137, -2116, -2095, -2074, -2054, +2923, +219, -1748, -2692,
+ -2563, -2435, -2114, -2306, -2193, -2080, -2159, -2239, -2298, -2357, -2320, -2284, -2432,
+ -2580, -1544, +4, -16, -36, -280, -2572, -2302, -2544, -994, +43, +64, +86,
+ +44, +2, -4, -10, -16, -22, -28, -34, -104, -174, -186, -198, -178,
+ -158, -330, -502, -674, -846, -722, -2134, -2234, -2334, -2302, -2270, -2270, -2042,
+ -2065, -2089, -2112, -2137, -2159, -2180, -2154, -2129, -2458, -2532, -2604, -2166, -2218,
+ -2272, -2293, -2315, -2000, -2198, -2219, -2242, -2322, -2401, -2385, -2370, -2285, -2201,
+ -2452, -2704, -1411, +137, -1402, -2174, -2502, -2830, +250, +0, +28, +55, +35,
+ +15, +3, -9, -21, -33, -45, -57, -101, -145, -175, -206, -220, -235,
+ -177, -120, -414, -709, -191, -2489, -2547, -2349, -2349, -2349, -2349, -2040, -2064,
+ -2089, -2113, -2138, -2202, -2267, -2235, -2204, -2207, -2210, -2181, -2152, -2131, -2110,
+ -2217, -1812, -1552, -2317, -2025, -1734, -1578, -1423, -1939, -2456, -2395, -2334, -2081,
+ -2340, -2551, -2250, -2013, -2288, -2446, -2093, -43, -42, -8, +25, +26, +28,
+ +10, -8, -26, -44, -62, -80, -98, -116, -165, -214, -263, -312, -281,
+ -250, -155, -60, -940, -1820, -2348, -2364, -2396, -2428, -2428, -2038, -2058, -2079,
+ -2100, -2122, -2123, -2124, -2285, -2191, -2065, -1940, -1910, -1882, -2232, -2327, -2149,
+ -1717, -1485, -2022, -1759, -1497, -1242, -987, -716, -446, -1226, -2007, -2723, -2160,
+ -2330, -2245, -2175, -2362, -2338, -1034, +109, -28, -19, -10, +15, +41, +19,
+ -3, -25, -47, -89, -131, -141, -151, -208, -266, -355, -445, -458, -472,
+ -405, -83, -1135, -1163, -1895, -2371, -2387, -2403, -2403, -2036, -2053, -2071, -2089,
+ -2107, -2044, -1982, -2080, -1666, -1668, -1671, -1897, -2124, -2590, -2545, -2083, -1622,
+ -1419, -1729, -1495, -1261, -1162, -1064, -774, -484, -314, -144, -806, -2492, -2366,
+ -2240, -2338, -2436, -2486, -489, +4, -15, -30, -45, +4, +54, +28, +2,
+ -24, -50, -116, -182, -184, -186, -252, -318, -448, -578, -636, -694, -656,
+ -106, -2098, -2042, -2210, -2378, -2378, -2378, -2378, -2034, -2047, -2062, -2076, -2091,
+ -2093, -2096, -1650, -1461, -1687, -1913, -2155, -2398, -2676, -2442, -2016, -1591, -1448,
+ -1563, -1341, -1120, -986, -853, -623, -394, -265, -137, +200, +24, -1554, -2363,
+ -2324, -2286, -2122, -2727, -1220, +31, +136, -15, +25, +67, +37, +7, -7,
+ -21, -111, -201, -211, -221, -295, -370, -460, -551, -509, -468, -634, -545,
+ -2805, -2249, -2301, -2353, -2353, -2353, -2353, -2032, -2043, -2054, -2065, -2076, -2143,
+ -2210, -1477, -1768, -1962, -2156, -2414, -2672, -2762, -2340, -1950, -1560, -1479, -1398,
+ -1189, -980, -811, -642, -473, -304, -217, -130, -75, -20, +27, -2486, -2311,
+ -2136, -2527, -2406, -2445, -2484, -979, +14, +47, +80, +46, +12, +10, +8,
+ -106, -220, -238, -256, -339, -422, -473, -524, -639, -754, -1637, -2520, -2232,
+ -2456, -2392, -2328, -2328, -2328, -2328, -2012, -2030, -2049, -2052, -2055, -2191, -2073,
+ -1585, -1867, -2081, -2296, -2526, -2757, -2653, -2294, -1886, -1479, -1380, -1282, -1087,
+ -893, -748, -604, -491, -379, -243, -109, -181, +1, -606, -2493, -2283, -2331,
+ -2481, -2376, -2413, -2452, -2308, -2421, -1350, -278, -124, +30, +88, +145, +127,
+ +109, +27, -56, -278, -501, -1107, -1714, -2162, -2612, -2532, -2453, -2297, -2397,
+ -2369, -2341, -2341, -2341, -2341, -1992, -2018, -2045, -2040, -2035, -2241, -1936, -1695,
+ -1966, -2201, -2436, -2639, -2842, -2545, -2248, -1823, -1398, -1282, -1166, -986, -806,
+ -686, -566, -510, -454, -271, -88, -289, +22, -1239, -2500, -2257, -2526, -388,
+ -2346, -2383, -2421, -2358, -2296, -2490, -2684, -2342, -2001, -1627, -1254, -1176, -1099,
+ -1501, -1904, -2266, -2628, -2510, -2393, -2407, -2422, -2404, -2386, -2362, -2338, -2346,
+ -2354, -2354, -2354, -2354, -1972, -2006, -2040, -2043, -2046, -2194, -1831, -1835, -2097,
+ -2336, -2576, -2735, -2895, -2564, -2234, -1839, -1445, -1279, -1114, -916, -719, -623,
+ -528, -528, -529, -425, -323, -59, -53, -2527, -2443, -2517, -2081, +170, -140,
+ -1312, -2485, -2440, -2395, -2382, -2370, -2400, -2431, -2509, -2589, -2559, -2530, -2500,
+ -2472, -2429, -2387, -2489, -2335, -2939, -2008, -1331, -2447, -2395, -2343, -2355, -2367,
+ -2367, -2367, -2367, -1952, -1994, -2037, -2047, -2058, -2148, -1727, -1977, -2228, -2472,
+ -2716, -2832, -2948, -2584, -2220, -1856, -1492, -1277, -1062, -847, -632, -561, -490,
+ -547, -604, -581, -558, -343, -1152, -2281, -2386, -2523, -1124, -40, +19, +15,
+ +10, -1242, -2495, -2531, -2568, -2459, -2350, -2369, -2388, -2407, -2426, -2477, -2528,
+ -2593, -2659, -2212, -1254, +369, +967, -1026, -2508, -2428, -2348, -2364, -2380, -2380,
+ -2380, -2380, -1948, -1996, -2044, -2060, -2077, -1957, -1837, -2069, -2303, -2545, -2788,
+ -2918, -3049, -2873, -2442, -2026, -1611, -1374, -1138, -965, -793, -732, -672, -707,
+ -743, -847, -953, -2017, -2059, -2441, -2313, -2327, -295, +99, -19, +23, +65,
+ +26, -13, -629, -1246, -1795, -2345, -2509, -2675, -2540, -2406, -1887, -1368, -467,
+ +434, +439, +699, +1162, +856, -2695, -2409, -2413, -2417, -2389, -2361, -2361, -2361,
+ -2361, -1944, -1998, -2052, -2074, -2097, -1767, -1949, -2163, -2378, -2619, -2860, -3005,
+ -3150, -3163, -2664, -2197, -1730, -1472, -1214, -1084, -954, -904, -854, -868, -882,
+ -859, -836, -877, -1942, -2091, -2240, -2389, +22, -18, -57, +32, +121, +14,
+ -93, -9, +76, +149, +221, +166, +110, +143, +175, +239, +304, +379, +455,
+ +530, +605, +676, +235, -2573, -2310, -2398, -2486, -2414, -2342, -2342, -2342, -2342,
+ -1940, -2000, -2060, -2072, -2084, -1640, -1964, -2144, -2325, -2532, -2740, -2899, -3059,
+ -3052, -2790, -2319, -1849, -1569, -1290, -1202, -1115, -1075, -1036, -1028, -1021, -1077,
+ -1135, -503, -2689, -2395, -2359, -1553, +19, -6, -30, +25, +80, +34, -12,
+ +37, +86, +124, +162, +137, +111, +137, +163, +237, +312, +393, +475, +525,
+ +574, +654, -803, -2466, -2339, -2383, -2427, -2375, -2323, -2323, -2323, -2323, -1936,
+ -2002, -2068, -2070, -2072, -1514, -1980, -2126, -2272, -2446, -2620, -2794, -2968, -2942,
+ -2916, -2442, -1968, -1667, -1366, -1321, -1276, -1247, -1218, -1189, -1160, -1041, -922,
+ -1411, -2412, -2189, -2478, -719, +16, +6, -4, +18, +40, +54, +68, +82,
+ +96, +100, +104, +108, +112, +132, +152, +236, +320, +408, +496, +520, +544,
+ +632, -1840, -2360, -2368, -2368, -2368, -2336, -2304, -2304, -2304, -2304, -1898, -1921,
+ -1944, -2111, -1766, -1551, -1848, -1985, -2122, -2318, -2515, -2664, -2813, -3074, -3079,
+ -2828, -2321, -2024, -1729, -1608, -1489, -1457, -1425, -1393, -1362, -1246, -1131, -1879,
+ -2372, -2532, -2693, +331, +25, +40, +55, +54, +54, +71, +88, +105, +123,
+ +151, +180, +208, +237, +83, -70, +48, +167, +248, +329, +346, +363, +733,
+ -2738, -2577, -2416, -2395, -2374, -2353, -2332, -2332, -2332, -2332, -1860, -1840, -1820,
+ -2152, -1460, -1588, -1716, -1844, -1972, -2191, -2411, -2535, -2659, -2950, -2730, -2958,
+ -2674, -2383, -2092, -1897, -1703, -1668, -1633, -1598, -1564, -1452, -1340, -2348, -2333,
+ -2365, -1885, -157, +34, +74, +115, +91, +68, +88, +109, +129, +150, +203,
+ +256, +309, +362, +291, +220, +117, +14, +88, +162, +172, +183, -702, -2612,
+ -2282, -2464, -2422, -2380, -2370, -2360, -2360, -2360, -2360, -2110, -1967, -1824, -1953,
+ -1314, -1513, -1712, -1815, -1918, -2207, -2242, -2453, -2408, -2602, -2541, -2752, -2707,
+ -2692, -2679, -2409, -2140, -2054, -1968, -1867, -1766, -1721, -1677, -2369, -2293, -2516,
+ -948, -53, +75, +92, +110, +95, +82, +105, +129, +152, +177, +222, +268,
+ +313, +359, +354, +350, +441, +533, +472, +411, +414, +674, -1689, -2518, -2339,
+ -2416, -2401, -2386, -2387, -2388, -2388, -2388, -2388, -1848, -1838, -1828, -1754, -1168,
+ -1438, -1708, -1786, -1864, -2225, -2075, -2372, -2158, -2255, -2353, -2546, -2740, -2747,
+ -2755, -2666, -2578, -2441, -2305, -2136, -1968, -1991, -2015, -2390, -2254, -2669, -13,
+ +51, +116, +111, +106, +101, +96, +123, +150, +177, +204, +242, +280, +318,
+ +356, +418, +480, +510, +540, +600, +661, +657, +1166, -2677, -2425, -2396, -2368,
+ -2380, -2392, -2404, -2416, -2416, -2416, -2416, -1882, -1711, -1796, -1369, -1198, -1419,
+ -1640, -1749, -1858, -1977, -1842, -2058, -2019, -2113, -2207, -2366, -2525, -2478, -2689,
+ -2836, -2983, -2759, -2536, -2393, -2250, -2194, -2139, -2357, -2318, -2018, +72, +113,
+ +157, +150, +145, +139, +134, +159, +186, +212, +239, +273, +308, +342, +377,
+ +439, +502, +548, +595, +632, +669, +931, +170, -2666, -2430, -2403, -2376, -2385,
+ -2394, -2403, -2412, -2412, -2412, -2412, -1916, -1840, -2276, -1240, -1228, -1400, -1572,
+ -1712, -1852, -1731, -1610, -1745, -1881, -1972, -2063, -2186, -2310, -2211, -2625, -2751,
+ -2877, -2822, -2768, -2650, -2532, -2398, -2265, -2324, -2383, -1369, +156, +177, +198,
+ +191, +185, +178, +172, +197, +223, +248, +274, +305, +336, +367, +398, +461,
+ +524, +587, +650, +664, +679, +1206, -827, -2656, -2437, -2410, -2384, -2390, -2396,
+ -2402, -2408, -2408, -2408, -2408, -1950, -1953, -1956, -1063, -1194, -1317, -1440, -1435,
+ -1430, -1499, -1314, -1431, -1550, -1638, -1726, -1798, -1871, -1927, -2240, -2409, -2578,
+ -2597, -2616, -2731, -2846, -2554, -2262, -2259, -2511, -527, +176, +207, +239, +231,
+ +224, +217, +210, +234, +259, +284, +309, +336, +364, +391, +419, +482, +546,
+ +609, +673, +744, +816, +936, -2015, -2485, -2187, -2289, -2392, -2395, -2398, -2401,
+ -2404, -2404, -2404, -2404, -1984, -2066, -1636, -886, -1160, -1234, -1308, -1414, -1520,
+ -2037, -2042, -1887, -1732, -1817, -1902, -1923, -1944, -1900, -1856, -2068, -2280, -2372,
+ -2464, -2556, -2648, -2454, -2260, -2194, -2640, +314, +196, +238, +280, +272, +264,
+ +256, +248, +272, +296, +320, +344, +368, +392, +416, +440, +504, +568, +632,
+ +696, +825, +954, +923, -2692, -2315, -2450, -2425, -2400, -2400, -2400, -2400, -2400,
+ -2400, -2400, -2400, -2252, -1953, -1142, -1035, -1441, -1826, -2211, -2244, -2278, -2220,
+ -1908, -1914, -1922, -2001, -2336, -2095, -2111, -2171, -2231, -2131, -2031, -2143, -2255,
+ -2303, -2352, -2306, -2260, -2359, -1689, +442, +269, +305, +341, +333, +325, +317,
+ +309, +329, +349, +369, +389, +415, +441, +468, +494, +536, +579, +669, +760,
+ +797, +1091, -248, -2610, -2406, -2459, -2431, -2404, -2400, -2396, -2392, -2388, -2388,
+ -2388, -2388, -2008, -2096, -1673, -1953, -2234, -2162, -2091, -2051, -2012, -2149, -2286,
+ -2199, -2113, -1930, -2259, -2012, -2278, -2186, -2094, -2194, -2295, -2171, -2047, -2051,
+ -2056, -2158, -2261, -2524, -739, +570, +343, +372, +402, +394, +386, +378, +370,
+ +386, +402, +418, +434, +462, +491, +520, +549, +569, +590, +707, +824, +770,
+ +1228, -1418, -2528, -2498, -2468, -2438, -2408, -2400, -2392, -2384, -2376, -2376, -2376,
+ -2376, -1988, -2191, -2139, -2150, -2163, -2130, -2098, -2081, -2066, -2140, -2216, -2179,
+ -2143, -2066, -2245, -2137, -2285, -2233, -2181, -2225, -2270, -2326, -2382, -2166, -1952,
+ -2250, -2549, -2465, +180, +394, +352, +407, +463, +455, +447, +423, +399, +523,
+ +391, +547, +447, +493, +540, +572, +603, +633, +665, +792, +920, +1094, +1269,
+ -2764, -2446, -2429, -2413, -2412, -2412, -2400, -2388, -2376, -2364, -2364, -2364, -2364,
+ -1968, -2031, -2094, -2093, -2092, -2099, -2106, -2113, -2120, -2133, -2147, -2160, -2174,
+ -2203, -2233, -2262, -2292, -2280, -2269, -2257, -2246, -2226, -2207, -2283, -2360, -2343,
+ -2327, -2406, +586, -38, +363, +443, +524, +516, +508, +468, +428, +660, +380,
+ +676, +460, +525, +591, +624, +658, +699, +741, +878, +1016, +907, +286, -2575,
+ -2364, -2361, -2358, -2387, -2416, -2400, -2384, -2368, -2352, -2352, -2352, -2352, -2020,
+ -2071, -2124, -2080, -2037, -2062, -2089, -2115, -2142, -2152, -2164, -2176, -2188, -2211,
+ -2235, -2259, -2283, -2275, -2267, -2260, -2253, -2249, -2246, -2290, -2336, -2337, -2339,
+ -1205, -71, -16, +296, +496, +441, +469, +497, +381, +521, +635, +493, +735,
+ +465, +544, +624, +640, +656, +747, +839, +899, +960, +1115, -1033, -2493, -2418,
+ -2378, -2339, -2379, -2420, -2408, -2396, -2384, -2372, -2372, -2372, -2372, -2072, -2113,
+ -2155, -2068, -1982, -2027, -2073, -2118, -2164, -2173, -2183, -2193, -2203, -2220, -2238,
+ -2256, -2274, -2270, -2267, -2264, -2261, -2273, -2286, -2299, -2312, -2332, -2352, -2052,
+ -729, +7, +230, +550, +358, +422, +486, +294, +614, +610, +606, +794, +470,
+ +564, +658, +656, +655, +797, +939, +921, +904, +1324, -2352, -2412, -2472, -2396,
+ -2320, -2372, -2424, -2416, -2408, -2400, -2392, -2392, -2392, -2392, -1996, -1930, -1865,
+ -1960, -2055, -2087, -2120, -2153, -2186, -2193, -2201, -2209, -2217, -2229, -2241, -2253,
+ -2265, -2265, -2266, -2267, -2268, -2280, -2294, -2306, -2320, -2342, -2365, -2707, -2538,
+ -1491, -188, +172, +275, +327, +379, +287, +451, +505, +559, +773, +475, +551,
+ +628, +512, +653, +909, +654, +1007, +1104, -739, -2583, -2506, -2430, -2397, -2365,
+ -2396, -2428, -2424, -2420, -2416, -2412, -2412, -2412, -2412, -1920, -2004, -2088, -2108,
+ -2128, -2148, -2168, -2188, -2208, -2214, -2220, -2226, -2232, -2238, -2244, -2250, -2256,
+ -2261, -2266, -2271, -2276, -2289, -2302, -2315, -2328, -2353, -2378, -2339, -2300, -2477,
+ -1630, -719, +192, +232, +272, +280, +288, +400, +512, +752, +480, +539, +598,
+ +369, +652, +767, -142, -1211, -2792, -2547, -2302, -2345, -2388, -2399, -2410, -2421,
+ -2432, -2432, -2432, -2432, -2432, -2432, -2432, -2432, -2024, -2070, -2116, -2130, -2144,
+ -2164, -2184, -2204, -2224, -2228, -2232, -2236, -2240, -2244, -2248, -2252, -2256, -2262,
+ -2270, -2276, -2284, -2296, -2310, -2322, -2336, -2319, -2304, -2287, -2272, -2559, -2336,
+ -1855, -1376, -2264, -1104, -520, +64, +384, +704, +704, +192, -44, -280, -1236,
+ -1936, -3018, -2564, -2349, -2392, -2390, -2390, -2388, -2388, -2398, -2410, -2420, -2432,
+ -2432, -2432, -2432, -2432, -2432, -2432, -2432, -2128, -2136, -2144, -2152, -2160, -2180,
+ -2200, -2220, -2240, -2242, -2244, -2246, -2248, -2250, -2252, -2254, -2256, -2265, -2274,
+ -2283, -2292, -2305, -2318, -2331, -2344, -2287, -2230, -2237, -2244, -2387, -2530, -2481,
+ -2432, -2456, -2480, -2600, -2720, -2448, -2176, -1904, -2144, -2419, -2694, -2585, -2476,
+ -2451, -2426, -2465, -2504, -2491, -2478, -2433, -2388, -2399, -2410, -2421, -2432, -2432,
+ -2432, -2432, -2432, -2432, -2432, -2432, -2104, -2122, -2140, -2158, -2176, -2196, -2216,
+ -2236, -2256, -2256, -2256, -2256, -2256, -2256, -2256, -2256, -2256, -2266, -2278, -2288,
+ -2300, -2312, -2326, -2338, -2352, -2317, -2284, -2281, -2280, -2357, -2436, -2417, -2400,
+ -2408, -2416, -2360, -2304, -2480, -864, -1648, -1408, -1225, -2580, -2509, -2440, -2427,
+ -2416, -2435, -2456, -2446, -2438, -2412, -2388, -2398, -2410, -2420, -2432, -2432, -2432,
+ -2432, -2432, -2432, -2432, -2432, -2080, -2108, -2136, -2164, -2192, -2212, -2232, -2252,
+ -2272, -2270, -2268, -2266, -2264, -2262, -2260, -2258, -2256, -2269, -2282, -2295, -2308,
+ -2321, -2334, -2347, -2360, -2349, -2338, -2327, -2316, -2329, -2342, -2355, -2368, -2360,
+ -2352, -2376, -2400, -2256, -2624, -1392, -1696, -2593, -2466, -2435, -2404, -2405, -2406,
+ -2407, -2408, -2403, -2398, -2393, -2388, -2399, -2410, -2421, -2432, -2432, -2432, -2432,
+ -2432, -2432, -2432, -2432, -2080, -2108, -2136, -2164, -2192, -2212, -2232, -2252, -2272,
+ -2270, -2268, -2266, -2264, -2262, -2260, -2258, -2256, -2268, -2282, -2294, -2308, -2320,
+ -2334, -2346, -2360, -2348, -2338, -2326, -2316, -2328, -2342, -2354, -2368, -2360, -2352,
+ -2360, -2368, -2352, -2592, -2192, -2560, -2768, -2466, -2434, -2404, -2404, -2406, -2406,
+ -2408, -2402, -2398, -2392, -2388, -2398, -2410, -2420, -2432, -2432, -2432, -2432, -2432,
+ -2432, -2432, -2432, -2080, -2108, -2136, -2164, -2192, -2212, -2232, -2252, -2272, -2270,
+ -2268, -2266, -2264, -2262, -2260, -2258, -2256, -2269, -2282, -2295, -2308, -2321, -2334,
+ -2347, -2360, -2349, -2338, -2327, -2316, -2329, -2342, -2355, -2368, -2360, -2352, -2344,
+ -2336, -2448, -2560, -2480, -2400, -2433, -2466, -2435, -2404, -2405, -2406, -2407, -2408,
+ -2403, -2398, -2393, -2388, -2399, -2410, -2421, -2432, -2432, -2432, -2432, -2432, -2432,
+ -2432, -2432, -2080, -2108, -2136, -2164, -2192, -2212, -2232, -2252, -2272, -2270, -2268,
+ -2266, -2264, -2262, -2260, -2258, -2256, -2268, -2282, -2294, -2308, -2320, -2334, -2346,
+ -2360, -2348, -2338, -2326, -2316, -2328, -2342, -2354, -2368, -2360, -2352, -2344, -2336,
+ -2448, -2560, -2480, -2400, -2432, -2466, -2434, -2404, -2404, -2406, -2406, -2408, -2402,
+ -2398, -2392, -2388, -2398, -2410, -2420, -2432, -2432, -2432, -2432, -2432, -2432, -2432,
+ -2432
+};
+
+/**
+ * 64x64 XRGB reference image (PIXEL_FORMAT_XRGB32): the expected result of
+ * converting the TEST_Y_COMPONENT / TEST_CB_COMPONENT / TEST_CR_COMPONENT
+ * planes above back to RGB.
+ */
+
+static const UINT32 TEST_XRGB_IMAGE[4096] = {
+ 0xFF229cdf, 0xFF249de0, 0xFF259fe2, 0xFF2ca5e8, 0xFF229cdf, 0xFF229ce0, 0xFF239de0, 0xFF229ce0,
+ 0xFF229cdf, 0xFF229cdf, 0xFF239ce0, 0xFF249ce0, 0xFF249ce0, 0xFF219ce3, 0xFF1e9ce6, 0xFF209ae2,
+ 0xFF2299dd, 0xFF2199de, 0xFF209adf, 0xFF209ae0, 0xFF1f9be0, 0xFF1e9ae0, 0xFF1d99e0, 0xFF1c98e0,
+ 0xFF1b97df, 0xFF1e96dc, 0xFF2194d9, 0xFF1f93dd, 0xFF1d93e0, 0xFF1b94dc, 0xFF1895d8, 0xFF1c92db,
+ 0xFF208fde, 0xFF1b91de, 0xFF1693df, 0xFF1793df, 0xFF1992df, 0xFF1891df, 0xFF178fdf, 0xFF178edf,
+ 0xFF168dde, 0xFF158cdd, 0xFF148cdc, 0xFF128cda, 0xFF118cd9, 0xFF118bd9, 0xFF128ada, 0xFF1289da,
+ 0xFF1288db, 0xFF1187da, 0xFF1186da, 0xFF1085da, 0xFF0f85d9, 0xFF0f84d9, 0xFF0e83d9, 0xFF0d82d8,
+ 0xFF0d82d8, 0xFF0d81d8, 0xFF0d80d7, 0xFF0d7fd7, 0xFF0d7ed6, 0xFF0d7ed6, 0xFF0d7ed6, 0xFF0d7ed6,
+ 0xFF259fe1, 0xFF27a1e2, 0xFF29a2e3, 0xFF2ba4e6, 0xFF249fe1, 0xFF249fe1, 0xFF249fe1, 0xFF249ee1,
+ 0xFF239ee1, 0xFF249ee1, 0xFF249ee1, 0xFF259de1, 0xFF259de2, 0xFF249de2, 0xFF229de2, 0xFF229ce1,
+ 0xFF229bdf, 0xFF219ce0, 0xFF209ce1, 0xFF209ce2, 0xFF209ce2, 0xFF209ae0, 0xFF2199de, 0xFF1f99df,
+ 0xFF1d98e0, 0xFF1e97e0, 0xFF1f97e0, 0xFF1d96df, 0xFF1c95de, 0xFF1c94e0, 0xFF1c94e1, 0xFF1d93e1,
+ 0xFF1d92e0, 0xFF1b93de, 0xFF1a94dc, 0xFF1a93de, 0xFF1a93e0, 0xFF1992e0, 0xFF1891df, 0xFF188fdf,
+ 0xFF178edf, 0xFF168ede, 0xFF158edd, 0xFF148ddc, 0xFF138ddb, 0xFF138cdb, 0xFF138bdb, 0xFF128adb,
+ 0xFF1289db, 0xFF1288db, 0xFF1187db, 0xFF1186db, 0xFF1085db, 0xFF0f84da, 0xFF0e83d9, 0xFF0e83d9,
+ 0xFF0e83d9, 0xFF0e82d9, 0xFF0e81d8, 0xFF0e80d8, 0xFF0d7fd7, 0xFF0d7fd7, 0xFF0d7fd7, 0xFF0d7fd7,
+ 0xFF27a3e3, 0xFF2aa4e3, 0xFF2ea6e3, 0xFF2aa4e3, 0xFF26a2e3, 0xFF26a1e3, 0xFF25a1e3, 0xFF25a0e3,
+ 0xFF25a0e3, 0xFF25a0e3, 0xFF259fe3, 0xFF269fe3, 0xFF269ee4, 0xFF279ee1, 0xFF279edf, 0xFF259ee0,
+ 0xFF239ee1, 0xFF219ee2, 0xFF209ee4, 0xFF209de4, 0xFF219de3, 0xFF229be0, 0xFF2499dc, 0xFF2299de,
+ 0xFF1f98e0, 0xFF1d99e4, 0xFF1b9ae7, 0xFF1c98e2, 0xFF1c96dc, 0xFF1e94e3, 0xFF2092ea, 0xFF1d94e6,
+ 0xFF1a96e2, 0xFF1c96de, 0xFF1d95da, 0xFF1c94de, 0xFF1b94e1, 0xFF1a93e0, 0xFF1a92e0, 0xFF1991e0,
+ 0xFF1890e0, 0xFF1790df, 0xFF178fde, 0xFF168fde, 0xFF158edd, 0xFF148ddd, 0xFF138cdc, 0xFF138bdc,
+ 0xFF128adc, 0xFF1289dc, 0xFF1188dc, 0xFF1187dd, 0xFF1086dd, 0xFF0f85db, 0xFF0e83d9, 0xFF0e84da,
+ 0xFF0f84da, 0xFF0e83da, 0xFF0e82d9, 0xFF0e81d9, 0xFF0e80d8, 0xFF0e80d8, 0xFF0e80d8, 0xFF0e80d8,
+ 0xFF2aa7e5, 0xFF2da7e4, 0xFF31a8e3, 0xFF2ca6e3, 0xFF27a4e4, 0xFF27a3e4, 0xFF27a3e4, 0xFF27a3e4,
+ 0xFF26a2e4, 0xFF26a2e4, 0xFF27a1e5, 0xFF27a0e5, 0xFF27a0e6, 0xFF26a0e5, 0xFF25a0e4, 0xFF259fe4,
+ 0xFF259ee3, 0xFF239ee5, 0xFF229fe6, 0xFF229fe5, 0xFF229fe4, 0xFF13a5e6, 0xFF1b9fe8, 0xFF16a0e8,
+ 0xFF11a0e7, 0xFF129fef, 0xFF139ef7, 0xFF1b99ec, 0xFF179ae2, 0xFF149ce4, 0xFF1d98e5, 0xFF1c97e6,
+ 0xFF1b96e7, 0xFF1c98dc, 0xFF1d97df, 0xFF1c96e1, 0xFF1c94e2, 0xFF1b94e1, 0xFF1b93e1, 0xFF1a93e0,
+ 0xFF1a92e0, 0xFF1991e0, 0xFF1890e0, 0xFF1790df, 0xFF168fdf, 0xFF158ede, 0xFF158dde, 0xFF148cdd,
+ 0xFF138bdc, 0xFF128add, 0xFF1289dd, 0xFF1188de, 0xFF1187de, 0xFF0f85dc, 0xFF0d83da, 0xFF0f85db,
+ 0xFF1086db, 0xFF0f84db, 0xFF0f83da, 0xFF0e82da, 0xFF0e81da, 0xFF0e81da, 0xFF0e81da, 0xFF0e81da,
+ 0xFF2caae7, 0xFF30aae5, 0xFF34abe3, 0xFF2ea8e4, 0xFF29a6e5, 0xFF28a6e5, 0xFF28a5e5, 0xFF28a5e5,
+ 0xFF28a5e6, 0xFF28a4e6, 0xFF28a3e7, 0xFF28a2e7, 0xFF28a1e8, 0xFF25a2e9, 0xFF23a3ea, 0xFF25a0e8,
+ 0xFF279ee6, 0xFF259fe7, 0xFF23a0e9, 0xFF18a4f5, 0xFF0ea7ff, 0xFF1ba6de, 0xFF558ebb, 0xFF6f839c,
+ 0xFF89797e, 0xFF8d797c, 0xFF917979, 0xFF7f7b94, 0xFF5687af, 0xFF229bd6, 0xFF04a4fd, 0xFF109df4,
+ 0xFF1c97eb, 0xFF1c9ada, 0xFF1c98e4, 0xFF1c97e3, 0xFF1d95e2, 0xFF1c95e2, 0xFF1c94e2, 0xFF1c94e1,
+ 0xFF1b94e1, 0xFF1a93e1, 0xFF1a92e1, 0xFF1991e1, 0xFF1890e1, 0xFF178fe0, 0xFF158edf, 0xFF148dde,
+ 0xFF138cdd, 0xFF128bde, 0xFF128adf, 0xFF1289df, 0xFF1188e0, 0xFF0f85dd, 0xFF0d83da, 0xFF0f85db,
+ 0xFF1187dd, 0xFF1086dc, 0xFF0f84dc, 0xFF0e83db, 0xFF0e81db, 0xFF0e81db, 0xFF0e81db, 0xFF0e81db,
+ 0xFF30abe5, 0xFF36afe8, 0xFF34abe4, 0xFF2faae5, 0xFF2ba8e6, 0xFF36aee8, 0xFF26a6e8, 0xFF29a7e7,
+ 0xFF2ca8e7, 0xFF2da7e6, 0xFF2fa5e5, 0xFF2ca5e7, 0xFF29a4e9, 0xFF2ba5e5, 0xFF2ca5e2, 0xFF10aaef,
+ 0xFF13adf6, 0xFF23a3f8, 0xFF6091a5, 0xFFa6755d, 0xFFec5915, 0xFFff490c, 0xFFfa5504, 0xFFff590f,
+ 0xFFff5d1b, 0xFFff6116, 0xFFfa6412, 0xFFff550f, 0xFFff4b0d, 0xFFfb4918, 0xFFf54823, 0xFF8e737e,
+ 0xFF269eda, 0xFF06a2ff, 0xFF1d97e2, 0xFF1799ea, 0xFF1c97e4, 0xFF1a98e4, 0xFF1898e4, 0xFF1a96e3,
+ 0xFF1b95e3, 0xFF1a94e2, 0xFF1a93e0, 0xFF1992e1, 0xFF1891e2, 0xFF1790e1, 0xFF168fe0, 0xFF158fdf,
+ 0xFF138ede, 0xFF138ddf, 0xFF138ce0, 0xFF128be0, 0xFF1189e0, 0xFF1087de, 0xFF0f85db, 0xFF138ae0,
+ 0xFF0f87dc, 0xFF0f86dc, 0xFF0f85dc, 0xFF0f84dc, 0xFF0e83db, 0xFF0e83db, 0xFF0e83db, 0xFF0e83db,
+ 0xFF34abe2, 0xFF3cb4ec, 0xFF34ace5, 0xFF31abe6, 0xFF2daae8, 0xFF44b6eb, 0xFF24a7ea, 0xFF29aaea,
+ 0xFF2face9, 0xFF32a9e6, 0xFF35a7e3, 0xFF30a7e6, 0xFF2ba8ea, 0xFF25aaf0, 0xFF20adf6, 0xFF4d8ba7,
+ 0xFFb8674c, 0xFFff5510, 0xFFf7650c, 0xFFf86313, 0xFFfa611b, 0xFFf0671f, 0xFFfc6222, 0xFFfb6926,
+ 0xFFf96f29, 0xFFf67122, 0xFFf3721b, 0xFFf26b20, 0xFFf16424, 0xFFff5622, 0xFFff531f, 0xFFff4b17,
+ 0xFFff440e, 0xFFb1615b, 0xFF1f95e0, 0xFF129bf0, 0xFF1c9ae5, 0xFF189ae6, 0xFF159be7, 0xFF1898e6,
+ 0xFF1b95e5, 0xFF1b95e2, 0xFF1995e0, 0xFF1994e1, 0xFF1892e2, 0xFF1792e1, 0xFF1691e0, 0xFF1590df,
+ 0xFF148fdf, 0xFF148fe0, 0xFF148fe1, 0xFF128de1, 0xFF108be0, 0xFF1189de, 0xFF1186dd, 0xFF178fe4,
+ 0xFF0e87db, 0xFF0e87dc, 0xFF0f87dd, 0xFF0f85dc, 0xFF0e84dc, 0xFF0e84dc, 0xFF0e84dc, 0xFF0e84dc,
+ 0xFF36b1eb, 0xFF36b4f0, 0xFF2eafed, 0xFF2caeec, 0xFF2aadec, 0xFF41b4ef, 0xFF29abe9, 0xFF2cabe8,
+ 0xFF2fabe7, 0xFF31abe6, 0xFF32aae6, 0xFF2faae7, 0xFF2ca9e8, 0xFF25a7eb, 0xFF946a5f, 0xFFff3e06,
+ 0xFFf95618, 0xFFe27312, 0xFFf87329, 0xFFf77427, 0xFFf77626, 0xFFf27628, 0xFFf8712b, 0xFFf9772e,
+ 0xFFf97e30, 0xFFf77f2e, 0xFFf5812b, 0xFFf57b2c, 0xFFf5752d, 0xFFfd6a2b, 0xFFfb652a, 0xFFf65e2c,
+ 0xFFf1572e, 0xFFff4810, 0xFFff460f, 0xFF817680, 0xFF02a7f1, 0xFF2496ea, 0xFF199be4, 0xFF1b98e4,
+ 0xFF1d96e5, 0xFF1b96e2, 0xFF1a96e0, 0xFF1995e1, 0xFF1794e3, 0xFF1793e2, 0xFF1692e1, 0xFF1691e0,
+ 0xFF1590df, 0xFF1591e1, 0xFF1591e3, 0xFF138fe1, 0xFF108ce0, 0xFF128be0, 0xFF158ae0, 0xFF168de2,
+ 0xFF0f89dd, 0xFF0f88dd, 0xFF0f88dd, 0xFF0f86dd, 0xFF0f85dc, 0xFF0f85dc, 0xFF0f85dc, 0xFF0f85dc,
+ 0xFF5fc1e7, 0xFF57bee8, 0xFF4fbbe9, 0xFF4ebae6, 0xFF4ebae3, 0xFF51b6ee, 0xFF2eaee8, 0xFF2eade6,
+ 0xFF2fabe5, 0xFF2face7, 0xFF2eade9, 0xFF2eace7, 0xFF2daae5, 0xFF15b2ff, 0xFFec4310, 0xFFf15016,
+ 0xFFf75d1c, 0xFFf87123, 0xFFf9862a, 0xFFf6882d, 0xFFf48b31, 0xFFf48532, 0xFFf47f33, 0xFFf78535,
+ 0xFFfa8c37, 0xFFf88e39, 0xFFf7903a, 0xFFf88b38, 0xFFf98635, 0xFFf87e35, 0xFFf77635, 0xFFf76d34,
+ 0xFFf76532, 0xFFf85e31, 0xFFf95730, 0xFFff5125, 0xFFf65237, 0xFF03a5fd, 0xFF1e9be1, 0xFF1e98e3,
+ 0xFF1f96e5, 0xFF1c97e2, 0xFF1a97df, 0xFF1896e1, 0xFF1795e4, 0xFF1794e3, 0xFF1793e2, 0xFF1692e1,
+ 0xFF1692e0, 0xFF1693e2, 0xFF1794e4, 0xFF1391e2, 0xFF0f8ee0, 0xFF148ee1, 0xFF198ee3, 0xFF148ce1,
+ 0xFF0f8bde, 0xFF0f8ade, 0xFF0f89de, 0xFF0f88dd, 0xFF0f86dd, 0xFF0f86dd, 0xFF0f86dd, 0xFF0f86dd,
+ 0xFF3cb6ee, 0xFF36b4ef, 0xFF30b2f0, 0xFF30b1ee, 0xFF2fb1ec, 0xFF38b0ef, 0xFF2eaee9, 0xFF2faee8,
+ 0xFF31ade6, 0xFF2fafe8, 0xFF2eb1ea, 0xFF31adec, 0xFF29afee, 0xFF30aac8, 0xFFff3d05, 0xFFfa501a,
+ 0xFFf96021, 0xFFf87428, 0xFFf7882f, 0xFFfa9638, 0xFFf59b38, 0xFFf5973b, 0xFFf6923e, 0xFFf89440,
+ 0xFFfa9742, 0xFFfa9a44, 0xFFfa9d46, 0xFFf99845, 0xFFf89444, 0xFFf98d43, 0xFFfa8641, 0xFFf97d3f,
+ 0xFFf9743d, 0xFFf77039, 0xFFf56d35, 0xFFff6122, 0xFFbf6c63, 0xFF129eef, 0xFF229ae8, 0xFF1c99ed,
+ 0xFF179ce4, 0xFF1498f0, 0xFF1b94e1, 0xFF1a96e2, 0xFF1998e3, 0xFF1897e4, 0xFF1896e5, 0xFF1895e4,
+ 0xFF1993e2, 0xFF1792e1, 0xFF1590df, 0xFF1692e2, 0xFF1793e5, 0xFF1490e4, 0xFF128ee2, 0xFF118de3,
+ 0xFF108de3, 0xFF118bde, 0xFF1289d9, 0xFF0f88e2, 0xFF0c89dd, 0xFF1085e0, 0xFF0987e4, 0xFF0987e4,
+ 0xFF40b5e9, 0xFF3bb4e9, 0xFF37b2ea, 0xFF37b2e9, 0xFF38b1e8, 0xFF33b0ea, 0xFF2eaeeb, 0xFF30afe9,
+ 0xFF33afe8, 0xFF30b2ea, 0xFF2eb5ec, 0xFF34aff2, 0xFF25b4f7, 0xFF8d7f86, 0xFFf64f00, 0xFFed5c1e,
+ 0xFFfa6326, 0xFFf7762d, 0xFFf58a35, 0xFFfea242, 0xFFf7ab3f, 0xFFf7a843, 0xFFf7a548, 0xFFf9a34a,
+ 0xFFfaa24c, 0xFFfba64f, 0xFFfcaa52, 0xFFf9a652, 0xFFf7a252, 0xFFfa9c50, 0xFFfd974e, 0xFFfc8d4b,
+ 0xFFfb8348, 0xFFf68341, 0xFFf1823a, 0xFFf5732c, 0xFF718cac, 0xFF179af0, 0xFF2599ef, 0xFF2697e9,
+ 0xFF269bc6, 0xFF1696f1, 0xFF1d91e3, 0xFF1c96e3, 0xFF1b9be3, 0xFF1a99e6, 0xFF1998e9, 0xFF1b97e7,
+ 0xFF1c95e5, 0xFF1891df, 0xFF138dda, 0xFF1992e2, 0xFF1e98ea, 0xFF1592e6, 0xFF0b8de2, 0xFF0e8ee5,
+ 0xFF108fe9, 0xFF128cdf, 0xFF1489d4, 0xFF0e88e6, 0xFF088cdc, 0xFF1184e4, 0xFF0488ec, 0xFF0488ec,
+ 0xFF3eb6ea, 0xFF3bb5eb, 0xFF38b4eb, 0xFF38b4eb, 0xFF38b3eb, 0xFF35b2eb, 0xFF33b1ec, 0xFF34b1eb,
+ 0xFF35b1ea, 0xFF32b3e9, 0xFF30b5e9, 0xFF34b0f0, 0xFF23b6f8, 0xFFc56044, 0xFFf9540c, 0xFFf26322,
+ 0xFFf77029, 0xFFf77d2f, 0xFFf78b35, 0xFFfba142, 0xFFf6b046, 0xFFfbb44f, 0xFFf7b051, 0xFFf9af54,
+ 0xFFfbad56, 0xFFfcb25a, 0xFFfeb75d, 0xFFfab35f, 0xFFf6b061, 0xFFfaac5d, 0xFFfda95a, 0xFFfb9f55,
+ 0xFFf99551, 0xFFf7914b, 0xFFf68d45, 0xFFff7e23, 0xFF1ba5f0, 0xFF129ef4, 0xFF2896f1, 0xFF239fb1,
+ 0xFF6c9600, 0xFF3c9c82, 0xFF179ef8, 0xFF169cf4, 0xFF149de3, 0xFF169ae5, 0xFF1897e7, 0xFF1995e6,
+ 0xFF1a93e5, 0xFF1993e3, 0xFF1793e0, 0xFF1c98e6, 0xFF1a95e5, 0xFF1692e5, 0xFF138fe5, 0xFF138ceb,
+ 0xFF138be3, 0xFF0087e4, 0xFF007cf5, 0xFF1a86d3, 0xFF0d8cf1, 0xFF008fe2, 0xFF0d85ea, 0xFF0886f1,
+ 0xFF3cb7ec, 0xFF3bb7ed, 0xFF3ab6ed, 0xFF39b6ed, 0xFF38b5ed, 0xFF37b5ed, 0xFF37b4ed, 0xFF37b3ed,
+ 0xFF36b3ec, 0xFF34b4e9, 0xFF31b5e5, 0xFF35b1ef, 0xFF21b8fa, 0xFFfd4203, 0xFFfc581e, 0xFFf86a26,
+ 0xFFf47c2d, 0xFFf78431, 0xFFf98c36, 0xFFf8a041, 0xFFf6b54d, 0xFFfec05b, 0xFFf6bc5a, 0xFFf8ba5d,
+ 0xFFfbb861, 0xFFfdbe65, 0xFFffc469, 0xFFfbc16c, 0xFFf5bd70, 0xFFfabc6b, 0xFFfebb66, 0xFFfab160,
+ 0xFFf6a75a, 0xFFf89f55, 0xFFfa984f, 0xFFdf956f, 0xFF08a6fc, 0xFF259ddb, 0xFF159ff3, 0xFF4aa172,
+ 0xFF69a90d, 0xFF62a406, 0xFF5a981b, 0xFF34969b, 0xFF0e99ff, 0xFF1297f2, 0xFF1695e4, 0xFF1793e5,
+ 0xFF1892e5, 0xFF1995e6, 0xFF1a98e7, 0xFF209deb, 0xFF1593df, 0xFF1892e4, 0xFF1a91e9, 0xFF2095eb,
+ 0xFF259dd1, 0xFFd0f772, 0xFFc1f396, 0xFF0083f1, 0xFF1782a0, 0xFF3c7e2f, 0xFF1787cc, 0xFF0b8ada,
+ 0xFF3db9ed, 0xFF3cb8ed, 0xFF3bb8ed, 0xFF3ab7ed, 0xFF39b7ed, 0xFF39b7ed, 0xFF39b6ed, 0xFF3ab6ed,
+ 0xFF3ab6ed, 0xFF37b4ed, 0xFF34b2ec, 0xFF35abf3, 0xFF6e96b3, 0xFFff4601, 0xFFf86520, 0xFFf67329,
+ 0xFFf58131, 0xFFf78b37, 0xFFf9953e, 0xFFf8a649, 0xFFf8b854, 0xFFfcc260, 0xFFf8c465, 0xFFf9c36a,
+ 0xFFfac26e, 0xFFfac773, 0xFFfacb77, 0xFFfbcb7b, 0xFFfccb7e, 0xFFfac87b, 0xFFf8c578, 0xFFf9bc72,
+ 0xFFfbb46d, 0xFFf6b069, 0xFFfeaa57, 0xFF94a0a5, 0xFF13a1f3, 0xFF219df0, 0xFF199eff, 0xFF71c124,
+ 0xFF79b826, 0xFF72b21e, 0xFF6aaa24, 0xFF67a125, 0xFF649a19, 0xFF419d72, 0xFF1f9fcb, 0xFF1994ff,
+ 0xFF1399f1, 0xFF199cf4, 0xFF1ea0f8, 0xFF1b9cff, 0xFF1193f6, 0xFF1293f1, 0xFF1393ec, 0xFF0083ff,
+ 0xFF72cca0, 0xFFcbf982, 0xFFd0ffac, 0xFF79a046, 0xFF337700, 0xFF3a7c03, 0xFF0d8de2, 0xFF0d8edb,
+ 0xFF3fbbee, 0xFF3ebaed, 0xFF3db9ed, 0xFF3cb9ed, 0xFF3bb8ed, 0xFF3bb8ed, 0xFF3cb9ee, 0xFF3cb9ee,
+ 0xFF3db9ef, 0xFF3ab4f1, 0xFF37aff3, 0xFF32b3fe, 0xFFb48f7d, 0xFFff5907, 0xFFf37122, 0xFFf57c2b,
+ 0xFFf68735, 0xFFf7923d, 0xFFf89d45, 0xFFf9ac50, 0xFFf9bb5a, 0xFFf9c465, 0xFFfacd71, 0xFFfacd76,
+ 0xFFfacd7b, 0xFFf7cf80, 0xFFf4d286, 0xFFfcd689, 0xFFffd98c, 0xFFfbd48b, 0xFFf3cf8a, 0xFFf9c885,
+ 0xFFffc17f, 0xFFf5c27d, 0xFFffbc5e, 0xFF48abdc, 0xFF1e9deb, 0xFF1ea2e8, 0xFF1da8e5, 0xFF99d31c,
+ 0xFF8acb22, 0xFF82c427, 0xFF7abc2c, 0xFF75b429, 0xFF70ad25, 0xFF6dab17, 0xFF6ba908, 0xFF5ea912,
+ 0xFF519f54, 0xFF489b6d, 0xFF3e9887, 0xFF3b9592, 0xFF389880, 0xFF449663, 0xFF509446, 0xFF83b43c,
+ 0xFF4f851b, 0xFFafe187, 0xFF9fcc83, 0xFF368011, 0xFF43821c, 0xFF32853c, 0xFF0492f9, 0xFF1092dd,
+ 0xFF40bcee, 0xFF3fbcee, 0xFF3ebbee, 0xFF3dbaed, 0xFF3cbaed, 0xFF3cb9ed, 0xFF3cb9ec, 0xFF3cb9ec,
+ 0xFF3cb8ec, 0xFF3fb4f0, 0xFF43aff5, 0xFF0ebbe9, 0xFFffb897, 0xFFf7814d, 0xFFf57623, 0xFFf6812e,
+ 0xFFf88c39, 0xFFf89943, 0xFFf8a64d, 0xFFf8b257, 0xFFf9bd60, 0xFFfac96d, 0xFFfbd47b, 0xFFfad681,
+ 0xFFfad788, 0xFFfbd98e, 0xFFfbda93, 0xFFfae5a1, 0xFFfed692, 0xFFfadea0, 0xFFf9db98, 0xFFfad694,
+ 0xFFfbd090, 0xFFffd285, 0xFFffc778, 0xFF009afd, 0xFF26a8f2, 0xFF20a4f8, 0xFF53bea5, 0xFFa4da31,
+ 0xFF9dd638, 0xFF97d03a, 0xFF91ca3d, 0xFF8bc539, 0xFF85c035, 0xFF7dbe31, 0xFF74bc2d, 0xFF76b81c,
+ 0xFF77b027, 0xFF72ab25, 0xFF6da724, 0xFF6ba328, 0xFF68a31f, 0xFF58951a, 0xFF78b745, 0xFFbbf181,
+ 0xFF73ad4c, 0xFF417c15, 0xFF508b1e, 0xFF43861c, 0xFF498614, 0xFF17868b, 0xFF0b90f6, 0xFF168ee8,
+ 0xFF42beef, 0xFF41bdee, 0xFF40bcee, 0xFF3fbced, 0xFF3ebbed, 0xFF3dbaec, 0xFF3db9eb, 0xFF3cb8ea,
+ 0xFF3bb7e9, 0xFF39b9f0, 0xFF37bbf7, 0xFF50b5dc, 0xFFff9744, 0xFFfec49d, 0xFFf87a24, 0xFFf88530,
+ 0xFFf9913d, 0xFFf8a049, 0xFFf7af55, 0xFFf8b85d, 0xFFf9c065, 0xFFface75, 0xFFfcdb85, 0xFFfbde8d,
+ 0xFFfae195, 0xFFfee29b, 0xFFffe2a0, 0xFFfbe9a4, 0xFFffbe6b, 0xFFfdde9f, 0xFFffe8a6, 0xFFfbe3a3,
+ 0xFFf8dea0, 0xFFfdd899, 0xFFb6bdab, 0xFF119ff1, 0xFF1ea4e9, 0xFF1a9fff, 0xFF89d465, 0xFFb0e245,
+ 0xFFb0e04e, 0xFFacdc4e, 0xFFa7d94e, 0xFFa1d649, 0xFF9ad345, 0xFF97ce3d, 0xFF94c935, 0xFF8dc534,
+ 0xFF86c133, 0xFF7bbc32, 0xFF6fb731, 0xFF6db330, 0xFF6cae2e, 0xFF7eba3f, 0xFF70a531, 0xFF7bb54f,
+ 0xFF579a20, 0xFF5c9f2b, 0xFF519425, 0xFF80b965, 0xFF609a1d, 0xFF0390e3, 0xFF118ef2, 0xFF1c89f2,
+ 0xFF44c0ef, 0xFF43bfef, 0xFF42beee, 0xFF40bdee, 0xFF3fbcee, 0xFF3fbbed, 0xFF40baeb, 0xFF3eb9ed,
+ 0xFF3cb9ee, 0xFF37b9eb, 0xFF27bcf7, 0xFF949c8f, 0xFFfb9637, 0xFFf9bc7c, 0xFFf9b585, 0xFFf7994a,
+ 0xFFf69b43, 0xFFf6a64e, 0xFFf7b259, 0xFFf8bc66, 0xFFfac672, 0xFFfad380, 0xFFfae08d, 0xFFf9e698,
+ 0xFFf9eba2, 0xFFfeeaa6, 0xFFffeaab, 0xFFfcefa9, 0xFFfaba62, 0xFFfbdc99, 0xFFfff4b9, 0xFFfbecb2,
+ 0xFFf7e6ab, 0xFFffe5a3, 0xFF64b1d1, 0xFF199ff0, 0xFF269fe9, 0xFF0499f2, 0xFFe3f051, 0xFFd5ef58,
+ 0xFFc0e364, 0xFFbde165, 0xFFbae065, 0xFFb5de5d, 0xFFb0dc56, 0xFFaad74e, 0xFFa3d346, 0xFF9bd043,
+ 0xFF93cd3f, 0xFF8cc93e, 0xFF84c63c, 0xFF81c139, 0xFF7dbc36, 0xFF8bc746, 0xFF89c245, 0xFF63a02c,
+ 0xFF65aa2c, 0xFF5ea42d, 0xFF509626, 0xFFa4cf98, 0xFFd9eadd, 0xFFb9ddff, 0xFF389ef4, 0xFF008fd4,
+ 0xFF46c1ef, 0xFF44c0ef, 0xFF43bfef, 0xFF42beef, 0xFF40bdef, 0xFF42bced, 0xFF43baec, 0xFF40baf0,
+ 0xFF3dbaf4, 0xFF35b8e7, 0xFF17bdf7, 0xFFd97f50, 0xFFf79147, 0xFFf7a554, 0xFFffdbba, 0xFFf8a24d,
+ 0xFFf3a549, 0xFFf5ad53, 0xFFf7b55e, 0xFFf9c16f, 0xFFfbcc7f, 0xFFf9d88a, 0xFFf8e595, 0xFFf8eda2,
+ 0xFFf8f5ae, 0xFFfff3b2, 0xFFfff2b6, 0xFFfef5ae, 0xFFf4b659, 0xFFf9db93, 0xFFfeffcd, 0xFFfbf6c1,
+ 0xFFf7edb6, 0xFFfff2ac, 0xFF13a4f7, 0xFF16a5f0, 0xFF18a5e8, 0xFF56b4cd, 0xFFf1f271, 0xFFd5ef84,
+ 0xFFcfe67b, 0xFFcde77c, 0xFFcbe77c, 0xFFc9e672, 0xFFc7e567, 0xFFbce15f, 0xFFb1dd57, 0xFFa9dc51,
+ 0xFFa0da4b, 0xFF9dd749, 0xFF9ad447, 0xFF94cf43, 0xFF8fcb3f, 0xFF88c43c, 0xFF82be39, 0xFF72b430,
+ 0xFF63a928, 0xFF59a028, 0xFF4e9827, 0xFFa0c479, 0xFFfffbf7, 0xFF7fd3f5, 0xFF038fe2, 0xFF0e89e2,
+ 0xFF48c3ef, 0xFF46c2ef, 0xFF45c1f0, 0xFF43c0f0, 0xFF42bff0, 0xFF42beee, 0xFF43bdec, 0xFF41bcef,
+ 0xFF3fbcf2, 0xFF2fc0fe, 0xFF36bdfc, 0xFFf54c00, 0xFFff8a52, 0xFFfaa65e, 0xFFfdc48e, 0xFFfbc185,
+ 0xFFf5ae50, 0xFFf7b65e, 0xFFf9be6c, 0xFFfac978, 0xFFfbd485, 0xFFfede98, 0xFFffe8aa, 0xFFfdeeae,
+ 0xFFf9f5b2, 0xFFfcf6ba, 0xFFfff7c2, 0xFFfcf0b2, 0xFFf7cc6e, 0xFFfbde91, 0xFFfdfcca, 0xFFfffbd1,
+ 0xFFfffdc8, 0xFFcae4c8, 0xFF16a1f2, 0xFF1da4ef, 0xFF12a1f1, 0xFF9fd5b9, 0xFFeaf28c, 0xFFdcf095,
+ 0xFFd9eb90, 0xFFd9ec93, 0xFFd9ec95, 0xFFd6eb8c, 0xFFd4ea83, 0xFFc9e779, 0xFFbfe36f, 0xFFb8e368,
+ 0xFFb1e262, 0xFFafe05e, 0xFFaddf5a, 0xFFa3d952, 0xFF99d449, 0xFF8ecb41, 0xFF84c33a, 0xFF75b833,
+ 0xFF66ac2c, 0xFF5da329, 0xFF559927, 0xFF4b9421, 0xFF2499b9, 0xFF1593fe, 0xFF0993d8, 0xFF0f90d8,
+ 0xFF4ac5ef, 0xFF48c4f0, 0xFF46c2f0, 0xFF45c1f1, 0xFF43c0f1, 0xFF43bfef, 0xFF43bfed, 0xFF42beee,
+ 0xFF41bdf0, 0xFF38bbf0, 0xFF72a1b8, 0xFFff5d1e, 0xFFf97931, 0xFFf5a151, 0xFFf9ad61, 0xFFfee0bd,
+ 0xFFf8b758, 0xFFfabf69, 0xFFfcc87a, 0xFFfcd282, 0xFFfcdc8b, 0xFFfbde8f, 0xFFfbe193, 0xFFfbeba4,
+ 0xFFfbf5b5, 0xFFfaf8c2, 0xFFf9fcce, 0xFFf9ecb7, 0xFFfae183, 0xFFfee290, 0xFFfbfac8, 0xFFfdf8d8,
+ 0xFFfffccb, 0xFF8bcedc, 0xFF189fee, 0xFF25a3ee, 0xFF0b9dfb, 0xFFe8f6a5, 0xFFe4f1a6, 0xFFe4f0a6,
+ 0xFFe4efa6, 0xFFe5f1aa, 0xFFe6f2ad, 0xFFe3f1a6, 0xFFe0ef9e, 0xFFd7ec93, 0xFFcde987, 0xFFc8ea80,
+ 0xFFc2eb78, 0xFFc1ea73, 0xFFc0e96e, 0xFFb1e360, 0xFFa3dd53, 0xFF94d247, 0xFF86c83b, 0xFF78bc35,
+ 0xFF69b030, 0xFF62a52b, 0xFF5b9b27, 0xFF57920a, 0xFF0995fc, 0xFF0d96e5, 0xFF1091eb, 0xFF1091eb,
+ 0xFF4ac5f0, 0xFF49c4f0, 0xFF47c3f1, 0xFF45c2f1, 0xFF44c1f2, 0xFF41c1f2, 0xFF3fc1f2, 0xFF3fbff1,
+ 0xFF3fbcf0, 0xFF32c3fe, 0xFFbe7f6e, 0xFFfe6526, 0xFFf67b35, 0xFFf59a4d, 0xFFf8ab5c, 0xFFfbd0a0,
+ 0xFFf7c783, 0xFFfec16b, 0xFFfdd17f, 0xFFfbdb87, 0xFFf9e590, 0xFFf8ed9a, 0xFFf7f4a5, 0xFFfbea9a,
+ 0xFFffdf8e, 0xFFfce3a0, 0xFFf7e6b1, 0xFFfceecc, 0xFFfffbcb, 0xFFfff3c7, 0xFFfcf1c3, 0xFFfef5d2,
+ 0xFFfffcd3, 0xFF4bb5e7, 0xFF21a5ed, 0xFF1ca2ee, 0xFF3daae2, 0xFFeef6ac, 0xFFe6f2b1, 0xFFe8f2b5,
+ 0xFFe9f3b8, 0xFFeaf4ba, 0xFFebf5bc, 0xFFe8f3b6, 0xFFe6f2af, 0xFFe0f0a8, 0xFFdbeea2, 0xFFd6ef9a,
+ 0xFFd1f092, 0xFFc9ed82, 0xFFc1eb73, 0xFFb0e362, 0xFFa1dc51, 0xFF94d347, 0xFF88ca3e, 0xFF7bbf38,
+ 0xFF6eb433, 0xFF66a92e, 0xFF5da01b, 0xFF3d9448, 0xFF0a93f6, 0xFF0e94ec, 0xFF1193f0, 0xFF1193f0,
+ 0xFF4bc5f1, 0xFF4ac5f1, 0xFF48c4f1, 0xFF47c3f2, 0xFF45c3f2, 0xFF40c3f4, 0xFF3bc4f6, 0xFF3cbff3,
+ 0xFF3ebbf0, 0xFF2dcaff, 0xFFff5d25, 0xFFfe6d2f, 0xFFf37d39, 0xFFf59348, 0xFFf8a958, 0xFFf7c083,
+ 0xFFf7d7ae, 0xFFffc36d, 0xFFffda84, 0xFFfbe48c, 0xFFf7ee94, 0xFFf8ed9e, 0xFFfaeca7, 0xFFf9f1b4,
+ 0xFFf8f6c1, 0xFFfcf6c8, 0xFFfff6d0, 0xFFfef2d3, 0xFFfcf4ba, 0xFFfffee8, 0xFFf7fdea, 0xFFfdfde3,
+ 0xFFfffcdc, 0xFF0b9df1, 0xFF2aaaed, 0xFF1baaf6, 0xFF80c8da, 0xFFfdffbb, 0xFFe8f2bd, 0xFFebf4c4,
+ 0xFFeff7cb, 0xFFeff7cb, 0xFFeff7cb, 0xFFedf6c5, 0xFFebf5c0, 0xFFeaf4be, 0xFFe8f3bd, 0xFFe4f4b4,
+ 0xFFe0f6ab, 0xFFd0f191, 0xFFc1ec77, 0xFFb0e463, 0xFF9edb4e, 0xFF95d448, 0xFF8bcc42, 0xFF7fc23b,
+ 0xFF73b935, 0xFF6aac31, 0xFF60a510, 0xFF229687, 0xFF0b91f1, 0xFF0e93f3, 0xFF1294f5, 0xFF1294f5,
+ 0xFF4cc6f1, 0xFF4bc5f2, 0xFF49c5f2, 0xFF47c4f2, 0xFF46c4f2, 0xFF43c4f1, 0xFF40c4f0, 0xFF42c0f3,
+ 0xFF39c1f6, 0xFF5eacca, 0xFFfb591e, 0xFFf36e31, 0xFFf88135, 0xFFfb923f, 0xFFfbaf5e, 0xFFffc373,
+ 0xFFfde2ba, 0xFFffcd75, 0xFFffd372, 0xFFffe584, 0xFFfff796, 0xFFfef4a2, 0xFFfdf1ae, 0xFFfff8c2,
+ 0xFFfcf8cd, 0xFFfef8d2, 0xFFfff9d6, 0xFFfef6e1, 0xFFfcf5dd, 0xFFfffbee, 0xFFfbfce8, 0xFFfffce0,
+ 0xFFb2e0e8, 0xFF19a4f0, 0xFF26abec, 0xFF16a8f6, 0xFFc2e4d8, 0xFFf9fac5, 0xFFeff6cb, 0xFFf0f7ce,
+ 0xFFf1f8d2, 0xFFf1f8d1, 0xFFf2f9d1, 0xFFf1f9cd, 0xFFf1f9ca, 0xFFf2fbca, 0xFFf4fdca, 0xFFe7f8b6,
+ 0xFFdaf3a2, 0xFFcbef8a, 0xFFbcec71, 0xFFb0e661, 0xFFa5e151, 0xFF9ad949, 0xFF8fd240, 0xFF83c73b,
+ 0xFF77bc35, 0xFF6ab31d, 0xFF5ea905, 0xFF138dea, 0xFF1193ef, 0xFF1093f0, 0xFF0f93f0, 0xFF0f93f0,
+ 0xFF4dc6f2, 0xFF4cc6f2, 0xFF4ac5f3, 0xFF48c5f3, 0xFF47c5f3, 0xFF46c4ef, 0xFF46c4eb, 0xFF48c0f3,
+ 0xFF34c7fb, 0xFF989591, 0xFFfc6428, 0xFFf1773b, 0xFFfc8432, 0xFFff9135, 0xFFffb564, 0xFFffbe5a,
+ 0xFFf3ddb6, 0xFFccd097, 0xFFb4cea5, 0xFFb0d3b1, 0xFFabd7bd, 0xFFc3e1bf, 0xFFdaebc1, 0xFFf5fdc7,
+ 0xFFffffbd, 0xFFfffecd, 0xFFfffcdc, 0xFFfffce0, 0xFFfbfce5, 0xFFfdfbe6, 0xFFfffae7, 0xFFfffbdd,
+ 0xFF61c4f4, 0xFF26aaee, 0xFF22abec, 0xFF10a7f6, 0xFFffffd7, 0xFFf5f5d0, 0xFFf6fad9, 0xFFf4f9d9,
+ 0xFFf2f9da, 0xFFf3fad8, 0xFFf4fbd7, 0xFFf5fcd5, 0xFFf7fdd4, 0xFFf3face, 0xFFf0f7c8, 0xFFe2f4b0,
+ 0xFFd4f199, 0xFFc5ee82, 0xFFb7eb6b, 0xFFb1e95f, 0xFFabe754, 0xFF9fdf49, 0xFF94d83f, 0xFF87cc3a,
+ 0xFF7bc034, 0xFF6bb425, 0xFF5ba332, 0xFF0495f9, 0xFF1795ee, 0xFF1293ed, 0xFF0c91eb, 0xFF0c91eb,
+ 0xFF4fc8f3, 0xFF4dc8f3, 0xFF4cc8f4, 0xFF4bc8f4, 0xFF49c8f4, 0xFF47c5f2, 0xFF45c2ef, 0xFF42c2f8,
+ 0xFF34c8ff, 0xFFdf6746, 0xFFff632a, 0xFFff701b, 0xFFe18b53, 0xFFa4a185, 0xFF63c1cd, 0xFF26c0ff,
+ 0xFF2ab8ff, 0xFF25b5f1, 0xFF27b7f9, 0xFF26b5f6, 0xFF23b3f2, 0xFF24b5fa, 0xFF25b7ff, 0xFF189ddf,
+ 0xFF43bbf4, 0xFF9edae8, 0xFFf9f9dc, 0xFFf3fbe6, 0xFFffffea, 0xFFfdffe6, 0xFFfafce2, 0xFFffffff,
+ 0xFF1ea8ef, 0xFF1ca8f1, 0xFF1ba8f2, 0xFF5bc4f1, 0xFFffffe7, 0xFFfbf9e1, 0xFFfbfce3, 0xFFf8fbe0,
+ 0xFFf5fadd, 0xFFf5fbdb, 0xFFf5fbda, 0xFFf6fcd7, 0xFFf6fdd3, 0xFFf0f8c9, 0xFFebf4be, 0xFFdff2a9,
+ 0xFFd4f094, 0xFFc7f47b, 0xFFbaf862, 0xFFb0ef58, 0xFFa6e64e, 0xFFa3e248, 0xFF98d73a, 0xFF8acd38,
+ 0xFF7bc435, 0xFF70b821, 0xFF3b9c84, 0xFF0d93f4, 0xFF1394ed, 0xFF1193e9, 0xFF0f92e6, 0xFF0f92e6,
+ 0xFF50c9f4, 0xFF4fcaf4, 0xFF4ecaf5, 0xFF4dcaf5, 0xFF4ccaf6, 0xFF48c5f4, 0xFF45c0f3, 0xFF47c2ef,
+ 0xFF4ac4eb, 0xFFff521f, 0xFFa79a92, 0xFF51b7e6, 0xFF28c7ff, 0xFF2cc4f9, 0xFF31c1f1, 0xFF3fbbf0,
+ 0xFF37c0ef, 0xFF39b9f0, 0xFF3bb3f1, 0xFF38b5f4, 0xFF36b7f7, 0xFF32b9f0, 0xFF2fbbe8, 0xFF2fb8eb,
+ 0xFF2fb5ed, 0xFF20acf3, 0xFF10a3fa, 0xFF70c9f3, 0xFFf5f9df, 0xFFf6fbde, 0xFFf6fdde, 0xFFd8ebe4,
+ 0xFF11a5ee, 0xFF2db2f5, 0xFF14a5f8, 0xFFa5e2ec, 0xFFfffff8, 0xFFfffef3, 0xFFfffded, 0xFFfcfde6,
+ 0xFFf8fce0, 0xFFf7fcde, 0xFFf6fcdd, 0xFFf6fcd8, 0xFFf5fdd3, 0xFFedf7c4, 0xFFe5f1b4, 0xFFe5f5b8,
+ 0xFFe4f9bb, 0xFFecfed2, 0xFFf3ffe9, 0xFFedfedb, 0xFFe8f9cd, 0xFFcaef89, 0xFF9cd636, 0xFF84c72e,
+ 0xFF6bb826, 0xFF6cb315, 0xFF1a95d6, 0xFF1591ef, 0xFF1093eb, 0xFF1193e6, 0xFF1294e1, 0xFF1294e1,
+ 0xFF52cbf4, 0xFF50caf4, 0xFF4ecaf4, 0xFF4ccaf3, 0xFF4ac9f3, 0xFF48c8f5, 0xFF46c7f6, 0xFF40bfed,
+ 0xFF41bfeb, 0xFF41d4f9, 0xFF33c9fc, 0xFF2fc9ff, 0xFF42c3ec, 0xFF40c3f4, 0xFF3ec3fc, 0xFF35bbf4,
+ 0xFF33bbf3, 0xFF49bdf7, 0xFF39b7f9, 0xFF37b7f6, 0xFF35b7f2, 0xFF2eb5f4, 0xFF28b3f5, 0xFF2fbbf8,
+ 0xFF2fbaf2, 0xFF30b5f2, 0xFF31b0f1, 0xFF1facf6, 0xFF0dabed, 0xFF7fd2ed, 0xFFffffe6, 0xFF80d9d2,
+ 0xFF2faaf8, 0xFF1dafec, 0xFF03aae6, 0xFFfff8ff, 0xFFfffffe, 0xFFfffff9, 0xFFfffdf4, 0xFFfdfeeb,
+ 0xFFfbfee3, 0xFFf9fde1, 0xFFf7fce0, 0xFFf5fdd8, 0xFFf4fdcf, 0xFFf5fce2, 0xFFf6fde8, 0xFFf3fde8,
+ 0xFFf1fde9, 0xFFebfdd3, 0xFFe6fdbe, 0xFFe0f8ba, 0xFFdaf2b7, 0xFFeafcd2, 0xFFf2fde6, 0xFFb7de8d,
+ 0xFF84c73d, 0xFF9ab848, 0xFF14a1f9, 0xFF0494f3, 0xFF1094ef, 0xFF1095ec, 0xFF1095e9, 0xFF1095e9,
+ 0xFF54ccf5, 0xFF51cbf4, 0xFF4ecaf3, 0xFF4cc9f2, 0xFF49c8f1, 0xFF48cbf5, 0xFF48cef9, 0xFF40c4f3,
+ 0xFF49cafc, 0xFF40c2f1, 0xFF47caf5, 0xFF46c7f4, 0xFF46c4f3, 0xFF39b5ee, 0xFF2ca5e8, 0xFF2eb1e1,
+ 0xFF56c1ea, 0xFF6dc9e9, 0xFF37c2e5, 0xFF51caeb, 0xFF6bd2f1, 0xFF74d1f5, 0xFF7dcff9, 0xFF56c7f8,
+ 0xFF1fafe8, 0xFF25b1ee, 0xFF2cb3f4, 0xFF3eb5f9, 0xFF2bb3ee, 0xFF1baff5, 0xFF32b5f0, 0xFF3fb2f9,
+ 0xFF26a9f2, 0xFF1faeeb, 0xFF3fb8f4, 0xFFfcfff3, 0xFFffffff, 0xFFffffff, 0xFFfffefb, 0xFFfefff1,
+ 0xFFfeffe6, 0xFFfbffe5, 0xFFf8fde3, 0xFFf5fdd7, 0xFFf3fecb, 0xFFf5fbeb, 0xFFf7feee, 0xFFf2fdde,
+ 0xFFedfccf, 0xFFe3f9b0, 0xFFd9f692, 0xFFd2f48b, 0xFFccf184, 0xFFceee97, 0xFFd0eaa9, 0xFFdaebc1,
+ 0xFFf4fbe9, 0xFF7fc679, 0xFF5ac1ff, 0xFF1aa1eb, 0xFF1195f2, 0xFF0f96f2, 0xFF0e97f2, 0xFF0e97f2,
+ 0xFF54cdf5, 0xFF52ccf4, 0xFF4fcbf3, 0xFF4dc9f3, 0xFF4ac8f2, 0xFF49c6f2, 0xFF47c4f2, 0xFF49d2f3,
+ 0xFF46c8f3, 0xFF4dc5fc, 0xFF2c9add, 0xFF1883cd, 0xFF046cbe, 0xFF0080c5, 0xFF0f96d4, 0xFF2eaddb,
+ 0xFF60c6eb, 0xFF76cdef, 0xFF51caea, 0xFF69d2f0, 0xFF81daf5, 0xFF9ae4f7, 0xFFb3eff9, 0xFFcffaff,
+ 0xFFe3feff, 0xFF9ae1ff, 0xFF48bcf7, 0xFF11b5dd, 0xFF32aef0, 0xFF28acfc, 0xFF31b2f3, 0xFF34b1f6,
+ 0xFF25adf0, 0xFF26acf6, 0xFF98d1fc, 0xFFfffdf8, 0xFFffffff, 0xFFfffffb, 0xFFfefff4, 0xFFfdffee,
+ 0xFFfcfde7, 0xFFfbfee4, 0xFFfaffe0, 0xFFf8fde7, 0xFFf7fcef, 0xFFf3fbeb, 0xFFeffdd9, 0xFFe9fbc2,
+ 0xFFe3f9ac, 0xFFd9f49b, 0xFFceef8b, 0xFFc1ea76, 0xFFb4e562, 0xFFabdd5a, 0xFFa2d261, 0xFFc1e98e,
+ 0xFFdbe8b9, 0xFF96d4ff, 0xFF8ed0fa, 0xFF42aeee, 0xFF1095f1, 0xFF1096f1, 0xFF0f96f1, 0xFF0f96f1,
+ 0xFF55cef5, 0xFF53ccf4, 0xFF50cbf4, 0xFF4ecaf4, 0xFF4cc8f4, 0xFF51caf7, 0xFF57cbfa, 0xFF45c0ea,
+ 0xFF1a75c7, 0xFF0058ad, 0xFF015bb4, 0xFF066fc0, 0xFF0b84cd, 0xFF0093ce, 0xFF11a7e0, 0xFF3eb9e6,
+ 0xFF6bcbeb, 0xFF7ed1f6, 0xFF6cd3f0, 0xFF82dbf4, 0xFF98e3f9, 0xFFa5ecf7, 0xFFb2f4f5, 0xFFc7f7f9,
+ 0xFFddfafd, 0xFFf2ffff, 0xFFf8fff6, 0xFFbcebfe, 0xFF22b4f2, 0xFF29afff, 0xFF2fb0f7, 0xFF29b1f2,
+ 0xFF23b1ee, 0xFF1aa7fa, 0xFFcae6f4, 0xFFf7f8f4, 0xFFfeffff, 0xFFfefff7, 0xFFfeffed, 0xFFfcffeb,
+ 0xFFfbfae9, 0xFFfbfee3, 0xFFfbffdc, 0xFFfbffe9, 0xFFfbfff7, 0xFFf1fedd, 0xFFe7fbc3, 0xFFe0f6b4,
+ 0xFFd8f0a5, 0xFFceec94, 0xFFc4e884, 0xFFb8e678, 0xFFace36c, 0xFFa0df53, 0xFF94d455, 0xFF80bd41,
+ 0xFFd2e599, 0xFF2ca1f4, 0xFF30a2f6, 0xFF209cf3, 0xFF1096f1, 0xFF1096f1, 0xFF1096f1, 0xFF1096f1,
+ 0xFF55cef4, 0xFF53cdf4, 0xFF51cbf5, 0xFF50cbf5, 0xFF4ecaf6, 0xFF4dc9f4, 0xFF54d0fa, 0xFF2b86ce,
+ 0xFF0752b1, 0xFF045fb9, 0xFF0a74c9, 0xFF0882ce, 0xFF0691d4, 0xFF02a0d5, 0xFF24b5e7, 0xFF4cc4ea,
+ 0xFF74d3ee, 0xFF83d9f5, 0xFF7fddf4, 0xFF93e4f6, 0xFFa8ecf9, 0xFFb6f2f9, 0xFFc3f9f9, 0xFFd3fafb,
+ 0xFFe3fcfc, 0xFFedfefb, 0xFFf0f9f3, 0xFFffffff, 0xFFfffdff, 0xFF7edcef, 0xFF26adfd, 0xFF2aaff7,
+ 0xFF2db2f2, 0xFF34b1e0, 0xFF09a7f7, 0xFF8dd3f5, 0xFFfdfbf9, 0xFFfffff6, 0xFFfdffeb, 0xFFfcffe6,
+ 0xFFfcfce0, 0xFFf9fcde, 0xFFf7fcdd, 0xFFfcffef, 0xFFf9fdec, 0xFFe8f5d0, 0xFFdff5bd, 0xFFd9f1ad,
+ 0xFFd2ed9d, 0xFFc5e97e, 0xFFb8e26d, 0xFFabdd5e, 0xFF9fd74f, 0xFF98c95f, 0xFF92c735, 0xFF8bc942,
+ 0xFF80b34d, 0xFF009bf2, 0xFF1894f8, 0xFF1595f5, 0xFF1397f2, 0xFF1296f1, 0xFF1195f0, 0xFF1195f0,
+ 0xFF56cff4, 0xFF54cdf5, 0xFF52ccf5, 0xFF51cbf7, 0xFF51cbf9, 0xFF49c8f1, 0xFF51d5fa, 0xFF1662c1,
+ 0xFF005cbb, 0xFF0874cd, 0xFF037cce, 0xFF028dd4, 0xFF019edb, 0xFF09aedc, 0xFF37c2ee, 0xFF5acfef,
+ 0xFF7edcf0, 0xFF88e1f4, 0xFF92e6f8, 0xFFa5eef8, 0xFFb9f5f9, 0xFFc7f9fb, 0xFFd5fdfe, 0xFFdffdfc,
+ 0xFFe9fdfa, 0xFFf0fefe, 0xFFf8ffff, 0xFFfafffe, 0xFFfdfffc, 0xFFfdfbff, 0xFF1db0e8, 0xFF2ab1ee,
+ 0xFF37b2f5, 0xFF25b9f7, 0xFF29b4f8, 0xFF22aff5, 0xFF1baaf2, 0xFF9fd7f6, 0xFFfdffea, 0xFFfcfee0,
+ 0xFFfcfdd7, 0xFFf8fada, 0xFFf4f7dd, 0xFFfdfef5, 0xFFf6fae1, 0xFFdfecc3, 0xFFd8efb6, 0xFFd2eca6,
+ 0xFFccea95, 0xFFbce567, 0xFFabdb56, 0xFF9fd344, 0xFF92cb33, 0xFF85c824, 0xFF79b46a, 0xFF3a9eaf,
+ 0xFF0c97ff, 0xFF1994f9, 0xFF0f9bee, 0xFF139af0, 0xFF1699f3, 0xFF1497f1, 0xFF1295ef, 0xFF1295ef,
+ 0xFF58d0f5, 0xFF56cef5, 0xFF53cdf4, 0xFF53ccf6, 0xFF52cbf8, 0xFF53d6fb, 0xFF4fc8fc, 0xFF004cad,
+ 0xFF096fca, 0xFF0b80d4, 0xFF0588d5, 0xFF0598db, 0xFF05a8e1, 0xFF18b6e6, 0xFF3fc8f2, 0xFF63d3f3,
+ 0xFF86dff5, 0xFF91e4f7, 0xFF9ce9fa, 0xFFaef0f9, 0xFFc0f7f9, 0xFFcbfafb, 0xFFd7fdfd, 0xFFdefdfc,
+ 0xFFe6fefb, 0xFFf0fffe, 0xFFfaffff, 0xFFf2fefb, 0xFFfefffd, 0xFFc6e9fb, 0xFF1eb0ec, 0xFF30b4f6,
+ 0xFF30b7f8, 0xFF19a8f7, 0xFF26b0f0, 0xFF22aef3, 0xFF1eabf5, 0xFF27aafa, 0xFF1ca6f6, 0xFF7dcdea,
+ 0xFFdff4dd, 0xFFeaffb0, 0xFFfdfeed, 0xFFffffef, 0xFFfcf9d3, 0xFFedeeb4, 0xFFe6e9ac, 0xFFd9e68a,
+ 0xFFcbe367, 0xFFb9e153, 0xFFa6dd4d, 0xFF75c57f, 0xFF43adb0, 0xFF229bf3, 0xFF0a9cff, 0xFF0998f6,
+ 0xFF109cef, 0xFF189aee, 0xFF149ded, 0xFF159bf0, 0xFF1599f2, 0xFF1397f0, 0xFF1195ee, 0xFF1195ee,
+ 0xFF5ad1f6, 0xFF57cff5, 0xFF54cef4, 0xFF54cdf6, 0xFF53cbf8, 0xFF4dd3f4, 0xFF2c9add, 0xFF045ec1,
+ 0xFF0572c9, 0xFF0683d2, 0xFF0794dc, 0xFF08a2e2, 0xFF08b1e8, 0xFF28bfef, 0xFF48cef6, 0xFF6bd8f8,
+ 0xFF8fe3fa, 0xFF9be8fa, 0xFFa6edfb, 0xFFb7f3fb, 0xFFc7f9fa, 0xFFd0fbfc, 0xFFd9fdfd, 0xFFdefefd,
+ 0xFFe2fffc, 0xFFeffffe, 0xFFfcffff, 0xFFebfef7, 0xFFfffffe, 0xFF8fd7f8, 0xFF1eb0f1, 0xFF2eb0f6,
+ 0xFF18abec, 0xFFe0f7fd, 0xFF24ade9, 0xFF23acf1, 0xFF21acf8, 0xFF26aef7, 0xFF2cb0f6, 0xFF1aa9f5,
+ 0xFF08a3f4, 0xFF22a7f9, 0xFF4cc2f2, 0xFF6dcdef, 0xFF7ec9db, 0xFF7fcac2, 0xFF81c6c6, 0xFF61bccb,
+ 0xFF41b3d0, 0xFF24a7e9, 0xFF089bff, 0xFF119dff, 0xFF1a9fff, 0xFF0f99e9, 0xFF149cf9, 0xFF159cf7,
+ 0xFF159cf5, 0xFF179df1, 0xFF199eed, 0xFF179cef, 0xFF1599f1, 0xFF1397ef, 0xFF1195ed, 0xFF1195ed,
+ 0xFF5cd2f6, 0xFF59d0f5, 0xFF55cff3, 0xFF54cdf5, 0xFF53ccf8, 0xFF51d5f6, 0xFF167bcf, 0xFF0467c6,
+ 0xFF067bcf, 0xFF068bd7, 0xFF059cdf, 0xFF08a9e5, 0xFF0ab6eb, 0xFF2bc4f1, 0xFF4cd2f7, 0xFF6ddbf9,
+ 0xFF8ee5fa, 0xFF9deafb, 0xFFaceffb, 0xFFbdf5fb, 0xFFcefbfa, 0xFFd5fbfc, 0xFFdcfcfd, 0xFFdcfefd,
+ 0xFFddfffd, 0xFFe4fffd, 0xFFeafffd, 0xFFfffffe, 0xFFffffff, 0xFF27c0de, 0xFF26b5f6, 0xFF1fb0f9,
+ 0xFF4dc6ff, 0xFFfff9ef, 0xFFfefffa, 0xFF8bd8f7, 0xFF18a7f3, 0xFF1daaf4, 0xFF23acf6, 0xFF22acf3,
+ 0xFF22abf0, 0xFF1aa3f2, 0xFF1aa6ee, 0xFF18a8f5, 0xFF0ea2f3, 0xFF11a4f2, 0xFF14a4ff, 0xFF15a3fc,
+ 0xFF16a3fa, 0xFF17a2f3, 0xFF19a2ec, 0xFF0e99fe, 0xFF169bed, 0xFF00a1ff, 0xFF2b9de8, 0xFF61b5b0,
+ 0xFF109af7, 0xFF149cf2, 0xFF189eed, 0xFF169cef, 0xFF149af0, 0xFF1298ee, 0xFF1096ec, 0xFF1096ec,
+ 0xFF5fd3f7, 0xFF5bd2f5, 0xFF56d0f3, 0xFF55cef5, 0xFF53cdf7, 0xFF56d8f8, 0xFF005cc0, 0xFF0370cb,
+ 0xFF0785d6, 0xFF0594dc, 0xFF04a3e2, 0xFF08afe8, 0xFF0cbcee, 0xFF2ec8f3, 0xFF50d5f9, 0xFF6fdefa,
+ 0xFF8de7fb, 0xFF9fecfb, 0xFFb1f2fb, 0xFFc3f7fb, 0xFFd4fcfa, 0xFFd9fcfc, 0xFFdefcfd, 0xFFdbfdfd,
+ 0xFFd9fffd, 0xFFd9fdfb, 0xFFd9fcfa, 0xFFe5fafa, 0xFFa4eaf7, 0xFF2badfb, 0xFF2fb9fa, 0xFF1aaeed,
+ 0xFF99dbf8, 0xFFffffff, 0xFFfefdfc, 0xFFfffefd, 0xFFfffffd, 0xFF8cd4fa, 0xFF19a9f6, 0xFF18a9f7,
+ 0xFF16aaf9, 0xFF1aa7f3, 0xFF1ea5ee, 0xFF1fa7f2, 0xFF21a9f6, 0xFF1ea7f7, 0xFF1ba5f7, 0xFF17a4f9,
+ 0xFF12a2fb, 0xFF0b9dfd, 0xFF0399fe, 0xFF26a2fa, 0xFF6fc0b0, 0xFFcfca5e, 0xFFffe528, 0xFF74b4b3,
+ 0xFF0b98fa, 0xFF119af4, 0xFF179dee, 0xFF159cee, 0xFF139aef, 0xFF1198ed, 0xFF0f96eb, 0xFF0f96eb,
+ 0xFF5dd1f6, 0xFF5bd2f5, 0xFF58d2f4, 0xFF53cef4, 0xFF56d2fb, 0xFF40b2e6, 0xFF0164c6, 0xFF0376cf,
+ 0xFF0487d7, 0xFF0296dd, 0xFF01a4e4, 0xFF04b1ea, 0xFF07bdf1, 0xFF1bc8f2, 0xFF43d5fc, 0xFF64ddfb,
+ 0xFF85e6fb, 0xFF98ebfc, 0xFFacf1fd, 0xFFbef9ff, 0xFFcfffff, 0xFFcffdff, 0xFFcff9fb, 0xFFd2fefe,
+ 0xFFd5ffff, 0xFFc6f9ff, 0xFFb8efff, 0xFF5ad7d9, 0xFF40b9e9, 0xFF2fb9ff, 0xFF2bb2f0, 0xFF28afeb,
+ 0xFFdef0f2, 0xFFffffff, 0xFFfeffff, 0xFFfffefe, 0xFFfffefa, 0xFFfffffa, 0xFFfffff9, 0xFFc2e8f0,
+ 0xFF84cde7, 0xFF53bbe9, 0xFF22a9eb, 0xFF14a1ff, 0xFF069ff8, 0xFF0fa0f8, 0xFF19a3eb, 0xFF43b1e1,
+ 0xFF6ec2c9, 0xFFb0d79a, 0xFFf2eb6b, 0xFFebee32, 0xFFf8e647, 0xFFffe23a, 0xFFfde142, 0xFF0098f4,
+ 0xFF19a1fc, 0xFF169ef7, 0xFF129bf1, 0xFF139af1, 0xFF149af0, 0xFF1298ee, 0xFF1096ec, 0xFF1096ec,
+ 0xFF5ccff6, 0xFF5bd2f6, 0xFF5ad4f6, 0xFF52cdf2, 0xFF5ad6fe, 0xFF298cd5, 0xFF026ccc, 0xFF027bd2,
+ 0xFF0189d8, 0xFF0097df, 0xFF00a6e6, 0xFF00b2ed, 0xFF02bef4, 0xFF09c7f1, 0xFF35d5ff, 0xFF59ddfd,
+ 0xFF7ce5fb, 0xFF91eafd, 0xFFa6f0ff, 0xFFb1f2ff, 0xFFbbf5ff, 0xFFbef5fc, 0xFFc1f6f9, 0xFFc1f7f7,
+ 0xFFc1f9f4, 0xFFc7fdfc, 0xFFcdffff, 0xFFc2f9f8, 0xFF5acdf4, 0xFF39b1f3, 0xFF38baf5, 0xFF2ab4f7,
+ 0xFFfcfbf8, 0xFFfdfeff, 0xFFfeffff, 0xFFfffeff, 0xFFfffcf6, 0xFFfdfef2, 0xFFf7ffee, 0xFFfcffea,
+ 0xFFffffe5, 0xFFffffd8, 0xFFffffcb, 0xFFfffbf1, 0xFFffffdf, 0xFFfdfdc2, 0xFFf7ff88, 0xFFfbfe92,
+ 0xFFffff7f, 0xFFfdfc6c, 0xFFfaf759, 0xFFf8f059, 0xFFf7e958, 0xFFf7e359, 0xFFd0d368, 0xFF0998ff,
+ 0xFF189aef, 0xFF129af2, 0xFF0c99f5, 0xFF1199f3, 0xFF1599f2, 0xFF1397f0, 0xFF1195ee, 0xFF1195ee,
+ 0xFF5fd2f9, 0xFF5cd3f8, 0xFF59d4f6, 0xFF58d3f8, 0xFF5edaff, 0xFF1971cd, 0xFF026ecd, 0xFF037bd3,
+ 0xFF0488d9, 0xFF0497e0, 0xFF05a6e6, 0xFF01ade7, 0xFF00b5e8, 0xFF07beea, 0xFF23cbf5, 0xFF4cd7f8,
+ 0xFF74e4fc, 0xFF89e8fd, 0xFF9fecfe, 0xFFa5edfe, 0xFFabeffe, 0xFFaeeffc, 0xFFb0eff9, 0xFFb3f3f9,
+ 0xFFb6f6f8, 0xFFb6f9fc, 0xFFb5fcff, 0xFFdaf3ff, 0xFF1ab9f1, 0xFF28b3f4, 0xFF2bb3f6, 0xFF73cef4,
+ 0xFFfdfdf5, 0xFFfdfefa, 0xFFfdfffe, 0xFFfffef9, 0xFFfffdf3, 0xFFfdfeee, 0xFFfaffe9, 0xFFfdffe4,
+ 0xFFffffde, 0xFFffffd0, 0xFFffffc2, 0xFFfdfad7, 0xFFfffcf3, 0xFFffffc0, 0xFFfcfbc5, 0xFFfcff84,
+ 0xFFfcfb8b, 0xFFfbf67a, 0xFFf9f269, 0xFFf7ed5e, 0xFFf4e954, 0xFFf7e948, 0xFF87bda9, 0xFF109afc,
+ 0xFF179cf2, 0xFF149bf1, 0xFF119af1, 0xFF1399f2, 0xFF1698f3, 0xFF1496f1, 0xFF1294ef, 0xFF1294ef,
+ 0xFF62d4fc, 0xFF5dd4f9, 0xFF59d4f6, 0xFF56d1f6, 0xFF53cef5, 0xFF014ebe, 0xFF026fcd, 0xFF057bd4,
+ 0xFF0787da, 0xFF0996e0, 0xFF0ca5e7, 0xFF0bb0e9, 0xFF09bbeb, 0xFF15c5f3, 0xFF21d0fc, 0xFF46dafc,
+ 0xFF6ce3fc, 0xFF82e6fd, 0xFF97e9fe, 0xFF99e9fe, 0xFF9ce8fe, 0xFF9ee9fb, 0xFFa0e9f9, 0xFFa6eefa,
+ 0xFFacf3fc, 0xFFb0effc, 0xFFb5ecfb, 0xFF89ddf9, 0xFF28b4f3, 0xFF3ebef7, 0xFF1eadf7, 0xFFbde8f0,
+ 0xFFfefff2, 0xFFfefff3, 0xFFfdfff4, 0xFFfefef2, 0xFFfefef0, 0xFFfefeea, 0xFFfefee4, 0xFFfefede,
+ 0xFFfefed8, 0xFFfcffc9, 0xFFfbffba, 0xFFf6fea0, 0xFFffffce, 0xFFfff9f6, 0xFFffffc9, 0xFFfdf7be,
+ 0xFFf8f87a, 0xFFf9f66b, 0xFFf9f35c, 0xFFf5ee56, 0xFFf1e84f, 0xFFf8ee37, 0xFF3fa7ea, 0xFF189df5,
+ 0xFF179df4, 0xFF169cf1, 0xFF159bee, 0xFF169af2, 0xFF1798f5, 0xFF1596f3, 0xFF1394f1, 0xFF1394f1,
+ 0xFF66d7fc, 0xFF5fd1f5, 0xFF60d4f6, 0xFF59d8f9, 0xFF399ddb, 0xFF0858be, 0xFF096ccd, 0xFF0c7ad2,
+ 0xFF1087d7, 0xFF1296df, 0xFF13a6e8, 0xFF13b0eb, 0xFF1bc3f5, 0xFF0fc8f3, 0xFF17d0f9, 0xFF27d3f4,
+ 0xFF4bd7f7, 0xFF61dbf8, 0xFF77def9, 0xFF7fe0fa, 0xFF88e1fa, 0xFF8de4fb, 0xFF91e7fb, 0xFF96eafc,
+ 0xFF9aedfd, 0xFF9feafb, 0xFFa3e7fa, 0xFF5eccfb, 0xFF2db7f5, 0xFF24b8f9, 0xFF14b1f5, 0xFFfffbff,
+ 0xFFfeffec, 0xFFffffed, 0xFFffffee, 0xFFffffec, 0xFFfefdeb, 0xFFfefde4, 0xFFfefddd, 0xFFfefed6,
+ 0xFFfefece, 0xFFfcfdc1, 0xFFfcfcb5, 0xFFf6fb8d, 0xFFf8fc8a, 0xFFf8facc, 0xFFf8fef2, 0xFFf9ffbe,
+ 0xFFfbf9c2, 0xFFfbf8ac, 0xFFfcf796, 0xFFfaf491, 0xFFf7f18d, 0xFFffe5a9, 0xFF0096f7, 0xFF089af7,
+ 0xFF159ef7, 0xFF169df4, 0xFF169cf0, 0xFF169bf2, 0xFF1699f4, 0xFF1497f3, 0xFF1396f1, 0xFF1396f1,
+ 0xFF6bd9fb, 0xFF61cef1, 0xFF67d3f7, 0xFF5cdefd, 0xFF1f6cc0, 0xFF0f63bf, 0xFF0f6acd, 0xFF1478d1,
+ 0xFF1887d4, 0xFF1997df, 0xFF1aa6e9, 0xFF14a9e4, 0xFF1dbbef, 0xFF0dbeeb, 0xFF23c5f6, 0xFF13c6ed,
+ 0xFF2acbf3, 0xFF40cff4, 0xFF56d4f4, 0xFF65d7f6, 0xFF74daf7, 0xFF7bdffb, 0xFF83e5fe, 0xFF86e6fe,
+ 0xFF89e8fd, 0xFF8ee5fb, 0xFF92e2fa, 0xFF33bcfc, 0xFF32b9f7, 0xFF31bafd, 0xFF57c5f7, 0xFFf4ffde,
+ 0xFFfdffe7, 0xFFffffe7, 0xFFffffe7, 0xFFffffe6, 0xFFfdfce6, 0xFFfdfddd, 0xFFfdfdd5, 0xFFfdfdcd,
+ 0xFFfefdc5, 0xFFfdfaba, 0xFFfcf8af, 0xFFfef99f, 0xFFfffb8e, 0xFFfafe77, 0xFFf4fb7d, 0xFFf9f8d2,
+ 0xFFfdffee, 0xFFfefedf, 0xFFfffcd0, 0xFFfefacd, 0xFFfdf9ca, 0xFFa6d3ce, 0xFF0399eb, 0xFF1ea1ec,
+ 0xFF149ffa, 0xFF159ef6, 0xFF179ef2, 0xFF169cf3, 0xFF159af3, 0xFF1499f2, 0xFF1398f1, 0xFF1398f1,
+ 0xFF55d4f4, 0xFF5bd1f1, 0xFF69d6f6, 0xFF6ee2ff, 0xFF0c50a8, 0xFF1161be, 0xFF0f6acd, 0xFF1f83d6,
+ 0xFF1f89dc, 0xFF0f8cdd, 0xFF1a9be0, 0xFF22b1f4, 0xFF1dabe1, 0xFF14aedf, 0xFF26bdee, 0xFF15bae7,
+ 0xFF1fc1ef, 0xFF25c7ef, 0xFF2bcdef, 0xFF3dcdf1, 0xFF4ecef3, 0xFF5bd6f9, 0xFF68defe, 0xFF6eddfc,
+ 0xFF73ddfb, 0xFF76ddf5, 0xFF70d3f7, 0xFF31bafb, 0xFF33b9f6, 0xFF24b6ff, 0xFFa4dee5, 0xFFf9ffdc,
+ 0xFFfdfedc, 0xFFffffdc, 0xFFffffdc, 0xFFfefedb, 0xFFfcfdda, 0xFFfdfdd2, 0xFFfdfdcb, 0xFFfdfdc3,
+ 0xFFfefdbc, 0xFFfdfbaf, 0xFFfcfaa2, 0xFFfdfb93, 0xFFfefb83, 0xFFfcfd6b, 0xFFf9fc60, 0xFFfbf85d,
+ 0xFFfdf74c, 0xFFfef576, 0xFFfff2a1, 0xFFf6ec87, 0xFFf8e360, 0xFF51bbb4, 0xFF0d9afe, 0xFF1a9ef7,
+ 0xFF159ef6, 0xFF159df4, 0xFF159df2, 0xFF149bf2, 0xFF1299f2, 0xFF1299f2, 0xFF1299f2, 0xFF1299f2,
+ 0xFF67d4fd, 0xFF69d6f9, 0xFF6cd9f5, 0xFF4fb7dc, 0xFF1953af, 0xFF1c67c6, 0xFF005abd, 0xFF1a7eca,
+ 0xFF157bd4, 0xFF0581dc, 0xFF2aa1e7, 0xFF0189d3, 0xFF2dabe3, 0xFF23a7dc, 0xFF29b4e6, 0xFF17ade1,
+ 0xFF14b7ec, 0xFF15b9ea, 0xFF16bbe9, 0xFF1fbfec, 0xFF28c2ef, 0xFF3bcdf7, 0xFF4ed8ff, 0xFF56d5fb,
+ 0xFF5dd2f8, 0xFF5ed6f0, 0xFF4ec5f4, 0xFF2fb9fa, 0xFF35b8f4, 0xFF17b1ff, 0xFFf0f7d2, 0xFFfeffda,
+ 0xFFfdfcd2, 0xFFfdfdd1, 0xFFfdfed1, 0xFFfdfecf, 0xFFfcfecd, 0xFFfcfdc7, 0xFFfdfdc0, 0xFFfdfdb9,
+ 0xFFfdfdb2, 0xFFfdfca4, 0xFFfdfc95, 0xFFfdfc87, 0xFFfdfc79, 0xFFfdfa6c, 0xFFfef85f, 0xFFf9f645,
+ 0xFFf6ef47, 0xFFf2e938, 0xFFefe428, 0xFFeee425, 0xFFffdd05, 0xFF0399ff, 0xFF17a1f5, 0xFF179ef4,
+ 0xFF169cf3, 0xFF159cf3, 0xFF149cf3, 0xFF129bf1, 0xFF1099f0, 0xFF119af1, 0xFF129bf2, 0xFF129bf2,
+ 0xFF66d5fb, 0xFF70d5fc, 0xFF78e2ff, 0xFF3b86c7, 0xFF235fba, 0xFF1e6aba, 0xFF227ad1, 0xFF2787d8,
+ 0xFF248cd7, 0xFF1d8dd4, 0xFF2189d1, 0xFF2ca1ea, 0xFF2296d5, 0xFF31aaef, 0xFF20a1db, 0xFF17a1dd,
+ 0xFF0ea1e0, 0xFF1aace3, 0xFF13b1eb, 0xFF10b8ed, 0xFF0dc0ef, 0xFF1cc1ef, 0xFF2cc3f0, 0xFF36c4f2,
+ 0xFF40c5f4, 0xFF47c9f2, 0xFF45c3f6, 0xFF31bafa, 0xFF31b7f7, 0xFF4cc2f4, 0xFFf5fac0, 0xFFfdffc6,
+ 0xFFfdfcc5, 0xFFfdfdc4, 0xFFfdfdc4, 0xFFfcfdc2, 0xFFfbfdc1, 0xFFf8f9b6, 0xFFfdfdb3, 0xFFfdfdab,
+ 0xFFfdfca3, 0xFFfcfc95, 0xFFfcfb88, 0xFFfcfb7b, 0xFFfbfb6d, 0xFFfcf962, 0xFFfcf757, 0xFFf8f245,
+ 0xFFf4eb41, 0xFFf0e532, 0xFFebe023, 0xFFfbe01c, 0xFFc5d244, 0xFF0aa2fe, 0xFF169ff9, 0xFF179ff6,
+ 0xFF189ff3, 0xFF179ef2, 0xFF159df2, 0xFF179ff5, 0xFF18a1f8, 0xFF159ef5, 0xFF129bf2, 0xFF129bf2,
+ 0xFF65d7fa, 0xFF64d1f7, 0xFF5de7ff, 0xFF04439b, 0xFF0e4ca5, 0xFF317bcd, 0xFF0455c1, 0xFF0053c9,
+ 0xFF0368c6, 0xFF2687ca, 0xFF2881ca, 0xFF2789d1, 0xFF2791d7, 0xFF0774c9, 0xFF178dcf, 0xFF1f9ce1,
+ 0xFF179be4, 0xFF1e9eda, 0xFF0097de, 0xFF03a5e6, 0xFF08b1ee, 0xFF09b0e8, 0xFF0aafe2, 0xFF17b4e9,
+ 0xFF24b9ef, 0xFF30bdf4, 0xFF3cc1f9, 0xFF34bcf9, 0xFF2cb6f9, 0xFF80d2e8, 0xFFfafdaf, 0xFFfcfdb3,
+ 0xFFfdfcb7, 0xFFfdfcb7, 0xFFfdfdb7, 0xFFfcfcb6, 0xFFfbfcb5, 0xFFf4f4a5, 0xFFfdfda5, 0xFFfcfc9d,
+ 0xFFfcfc94, 0xFFfbfb87, 0xFFfbfb7b, 0xFFfafa6e, 0xFFfafa61, 0xFFfaf758, 0xFFfaf54e, 0xFFf7ee44,
+ 0xFFf3e73a, 0xFFede12c, 0xFFe7db1e, 0xFFffd21a, 0xFF78b090, 0xFF09a0fd, 0xFF159dfd, 0xFF18a0f8,
+ 0xFF1aa2f2, 0xFF18a0f2, 0xFF169ef2, 0xFF139bf2, 0xFF1099f1, 0xFF119af2, 0xFF129bf3, 0xFF129bf3,
+ 0xFF60d4f7, 0xFF67dcfd, 0xFF4fc2f0, 0xFF002c8a, 0xFF2e6bc0, 0xFF0547ad, 0xFF0044ba, 0xFF3685c4,
+ 0xFF064ebc, 0xFF1462c3, 0xFF2d70cb, 0xFF0f5ab4, 0xFF2274cd, 0xFF1169c2, 0xFF1979c2, 0xFF1d80d0,
+ 0xFF1980d7, 0xFF1a86d3, 0xFF1090de, 0xFF038dda, 0xFF0599e6, 0xFF059ce1, 0xFF049edd, 0xFF05a6e1,
+ 0xFF00a7de, 0xFF1fb6ee, 0xFF39bdf7, 0xFF38bcf6, 0xFF24b5fc, 0xFFbfe8b9, 0xFFfafea2, 0xFFfbfca5,
+ 0xFFfcfaa8, 0xFFfcfca7, 0xFFfdfda6, 0xFFfbfca3, 0xFFf9fb9f, 0xFFf6f795, 0xFFfafb92, 0xFFfbfb8b,
+ 0xFFfbfb85, 0xFFfafa79, 0xFFfafa6d, 0xFFf9f961, 0xFFf8f956, 0xFFf9f64c, 0xFFf9f442, 0xFFf5ec39,
+ 0xFFf2e531, 0xFFefde28, 0xFFecd620, 0xFFeed900, 0xFF32a6e5, 0xFF19a4ff, 0xFF29a4f4, 0xFF20a2f4,
+ 0xFF18a0f5, 0xFF179ef4, 0xFF159df4, 0xFF139bf3, 0xFF1199f2, 0xFF129af2, 0xFF129af3, 0xFF129af3,
+ 0xFF5bd1f5, 0xFF63dffa, 0xFF318dcc, 0xFF062d91, 0xFF0e499a, 0xFF00369f, 0xFF003897, 0xFF155fb6,
+ 0xFF53aad9, 0xFF31a6e2, 0xFF45bcef, 0xFF6dddff, 0xFF76defa, 0xFF6dd9f9, 0xFF64d5f9, 0xFF54c5f3,
+ 0xFF45b5ed, 0xFF238ed6, 0xFF1277ce, 0xFF006cc6, 0xFF0282de, 0xFF0187db, 0xFF008dd7, 0xFF079be1,
+ 0xFF0099dc, 0xFF22b1f0, 0xFF36baf4, 0xFF3cbcf4, 0xFF1cb5ff, 0xFFfffe89, 0xFFfbff96, 0xFFfbfc98,
+ 0xFFfbf99a, 0xFFfcfb98, 0xFFfdfd96, 0xFFfafb90, 0xFFf6f98a, 0xFFf7f984, 0xFFf8fa7f, 0xFFfafa7a,
+ 0xFFfbfb75, 0xFFfafa6a, 0xFFf9f960, 0xFFf8f855, 0xFFf7f84a, 0xFFf7f540, 0xFFf8f336, 0xFFf4eb2f,
+ 0xFFf0e328, 0xFFf0da24, 0xFFf0d121, 0xFFe9ca24, 0xFF049bff, 0xFF20a3f6, 0xFF16a1f7, 0xFF16a0f7,
+ 0xFF169ef7, 0xFF159df6, 0xFF149cf5, 0xFF139bf4, 0xFF129af3, 0xFF129af3, 0xFF129af3, 0xFF129af3,
+ 0xFF5ae3ff, 0xFF64d8ff, 0xFF0d4798, 0xFF002682, 0xFF1d6bb7, 0xFF3aa2de, 0xFF5fe5ff, 0xFF52d8fd,
+ 0xFF4dd6f6, 0xFF48ccf5, 0xFF5fd0f6, 0xFF68d9ff, 0xFF61d3f8, 0xFF5bd2f8, 0xFF42cbff, 0xFF53cefe,
+ 0xFF51cff5, 0xFF49caf6, 0xFF4acdff, 0xFF40baff, 0xFF0e7edb, 0xFF0069c2, 0xFF0584da, 0xFF0184d5,
+ 0xFF068cd8, 0xFF38bef8, 0xFF3abef7, 0xFF35beff, 0xFF62c7e2, 0xFFfbf379, 0xFFf8fa83, 0xFFf9f983,
+ 0xFFfaf884, 0xFFf9f77f, 0xFFf7f77b, 0xFFf8f979, 0xFFf9fa77, 0xFFf8f972, 0xFFf7f86c, 0xFFfcfc6c,
+ 0xFFf9f864, 0xFFf8f85b, 0xFFf8f752, 0xFFf7f649, 0xFFf6f53f, 0xFFf5f237, 0xFFf4ef2f, 0xFFf1e628,
+ 0xFFeede20, 0xFFead61f, 0xFFf2cc11, 0xFF9db96c, 0xFF0c9ffe, 0xFF1ba3f9, 0xFF17a2f9, 0xFF17a0f9,
+ 0xFF169ef8, 0xFF169df7, 0xFF159cf6, 0xFF149bf5, 0xFF139af5, 0xFF139af5, 0xFF139af5, 0xFF139af5,
+ 0xFF60d8f9, 0xFF5bd9f8, 0xFF4cadd7, 0xFF69ddff, 0xFF56ddf8, 0xFF55d6fc, 0xFF55d0ff, 0xFF5cd5ff,
+ 0xFF53cbf2, 0xFF4bcaf6, 0xFF43cafa, 0xFF47c9f8, 0xFF4cc8f6, 0xFF5ccff1, 0xFF46ccf8, 0xFF55caff,
+ 0xFF3ec4fa, 0xFF43c3fb, 0xFF48c2fd, 0xFF3ebff4, 0xFF44ccfb, 0xFF37b3fc, 0xFF0b7bdd, 0xFF006dc9,
+ 0xFF0d80d4, 0xFF4eccff, 0xFF3ec3fa, 0xFF2ec2ff, 0xFFa7dea8, 0xFFf8ec5b, 0xFFf5f570, 0xFFf7f66f,
+ 0xFFfaf76e, 0xFFf5f467, 0xFFf1f060, 0xFFf6f663, 0xFFfbfc65, 0xFFf8f95f, 0xFFf6f659, 0xFFfefe5d,
+ 0xFFf7f652, 0xFFf7f54c, 0xFFf7f545, 0xFFf6f33d, 0xFFf6f235, 0xFFf3ef2f, 0xFFf1eb29, 0xFFefe221,
+ 0xFFecd818, 0xFFe5d21a, 0xFFf3c700, 0xFF52a9b4, 0xFF14a4fb, 0xFF15a3fb, 0xFF17a3fc, 0xFF17a1fa,
+ 0xFF179ff8, 0xFF169df8, 0xFF159cf7, 0xFF159bf7, 0xFF1499f6, 0xFF1499f6, 0xFF1499f6, 0xFF1499f6,
+ 0xFF58cff2, 0xFF59ddfd, 0xFF55d5f9, 0xFF5ddeff, 0xFF4dcef3, 0xFF4dcbf3, 0xFF4cc8f3, 0xFF56d2fc,
+ 0xFF59d3fd, 0xFF50cefb, 0xFF47cafa, 0xFF48c9f9, 0xFF49c7f9, 0xFF51cbf6, 0xFF45c9f9, 0xFF4bc8fd,
+ 0xFF3fc5f9, 0xFF41c4fa, 0xFF43c2fb, 0xFF3bbdf3, 0xFF3ac0f4, 0xFF3ec7fc, 0xFF3ac6fc, 0xFF25a1e3,
+ 0xFF1f8dd9, 0xFF37b9f7, 0xFF26bbfa, 0xFF2abbf4, 0xFFced857, 0xFFf9fa5b, 0xFFd9db49, 0xFFedec58,
+ 0xFFfaf560, 0xFFf2ef4d, 0xFFe9ea3b, 0xFFeeef46, 0xFFf2f451, 0xFFf9f34f, 0xFFedf145, 0xFFfef84b,
+ 0xFFf4f542, 0xFFf5f43d, 0xFFf6f337, 0xFFf5f131, 0xFFf5ef2b, 0xFFf2eb27, 0xFFf0e622, 0xFFeedb1d,
+ 0xFFecd117, 0xFFf1cc09, 0xFFf5c509, 0xFF0fadff, 0xFF17a1f9, 0xFF18a1f9, 0xFF18a1f8, 0xFF18a0f9,
+ 0xFF179ff9, 0xFF169df9, 0xFF169cf8, 0xFF159bf8, 0xFF1599f8, 0xFF1599f8, 0xFF1599f8, 0xFF1599f8,
+ 0xFF60d5fb, 0xFF5bd3fb, 0xFF56d2fb, 0xFF55d1fc, 0xFF55d0fe, 0xFF54d0fa, 0xFF53d1f6, 0xFF51cef7,
+ 0xFF4ecbf8, 0xFF4dcbf9, 0xFF4ccafb, 0xFF49c8fb, 0xFF47c6fc, 0xFF45c6fb, 0xFF43c6fa, 0xFF41c6fa,
+ 0xFF40c7f9, 0xFF3fc5f9, 0xFF3ec3f9, 0xFF3fc3fb, 0xFF41c4fd, 0xFF38baf2, 0xFF40c1f8, 0xFF3dc3fb,
+ 0xFF3bc5fe, 0xFF37c1f6, 0xFF34beef, 0xFF2ebcf0, 0xFFded722, 0xFFbfdc38, 0xFFdee142, 0xFFecea4a,
+ 0xFFeae442, 0xFFeee942, 0xFFf2ee42, 0xFFeeed3f, 0xFFeaec3d, 0xFFfbee3f, 0xFFe5ec31, 0xFFfff239,
+ 0xFFf2f531, 0xFFf4f32e, 0xFFf5f12a, 0xFFf5ee25, 0xFFf4ec21, 0xFFf2e71e, 0xFFf0e11c, 0xFFeed519,
+ 0xFFecc917, 0xFFdec40c, 0xFFbbbe39, 0xFF0798f8, 0xFF1a9ff8, 0xFF1a9ff7, 0xFF1a9ff5, 0xFF189ff7,
+ 0xFF179ff9, 0xFF179ef9, 0xFF169cf9, 0xFF169bf9, 0xFF1699f9, 0xFF1699f9, 0xFF1699f9, 0xFF1699f9,
+ 0xFF5cd4f9, 0xFF58d4f9, 0xFF55d3f9, 0xFF56d2fa, 0xFF58d0fb, 0xFF56d0f8, 0xFF54d0f6, 0xFF51cef7,
+ 0xFF4dccf9, 0xFF4ccbfa, 0xFF4bcafb, 0xFF49c8fb, 0xFF47c7fb, 0xFF45c7fb, 0xFF43c6fa, 0xFF41c6fa,
+ 0xFF40c6f9, 0xFF3fc4f9, 0xFF3ec3f9, 0xFF3ec2fa, 0xFF3ec2fb, 0xFF3abef5, 0xFF3ec2f8, 0xFF3bc1f9,
+ 0xFF37c0f9, 0xFF36beff, 0xFF35bbff, 0xFF67bb84, 0xFFb0d219, 0xFFb4d31a, 0xFFd3da39, 0xFFe2dd3d,
+ 0xFFd6d532, 0xFFe1df38, 0xFFece93e, 0xFFe1e636, 0xFFe9e536, 0xFFf1e634, 0xFFe5e42b, 0xFFf6e62e,
+ 0xFFe9eb29, 0xFFf0ee2a, 0xFFf0e824, 0xFFece420, 0xFFe9e01d, 0xFFebdb1c, 0xFFedd71c, 0xFFe9ce19,
+ 0xFFe5c516, 0xFFe7c004, 0xFF6cb292, 0xFF109dfc, 0xFF18a1f7, 0xFF1aa0f5, 0xFF1ca0f3, 0xFF19a0f6,
+ 0xFF179ff9, 0xFF169ef9, 0xFF169cf9, 0xFF159bf8, 0xFF159af8, 0xFF1499f8, 0xFF1499f7, 0xFF1499f7,
+ 0xFF58d4f6, 0xFF56d4f6, 0xFF54d5f7, 0xFF57d3f7, 0xFF5bd1f8, 0xFF58d0f6, 0xFF54cff5, 0xFF50cef8,
+ 0xFF4dcdfa, 0xFF4bcbfb, 0xFF4acafb, 0xFF48c9fb, 0xFF46c7fb, 0xFF45c7fa, 0xFF43c7fa, 0xFF42c6fa,
+ 0xFF40c6f9, 0xFF3fc4f9, 0xFF3ec3f9, 0xFF3dc1f9, 0xFF3cc0f9, 0xFF3cc1f8, 0xFF3cc2f7, 0xFF38bff6,
+ 0xFF34bbf5, 0xFF35bdfd, 0xFF37beff, 0xFF46bcfc, 0xFF82c92c, 0xFFa0be02, 0xFFb8c420, 0xFFd8cf31,
+ 0xFFd2d632, 0xFFd4d52e, 0xFFd7d42a, 0xFFcdd725, 0xFFe9df2f, 0xFFe6dd2a, 0xFFe4dc25, 0xFFedd922,
+ 0xFFe0e220, 0xFFede927, 0xFFeae01e, 0xFFe4da1c, 0xFFded319, 0xFFe5d01a, 0xFFebcd1b, 0xFFe5c818,
+ 0xFFdec214, 0xFFf0bc00, 0xFF1da5eb, 0xFF19a1ff, 0xFF16a2f7, 0xFF19a2f4, 0xFF1ea2f1, 0xFF1aa0f5,
+ 0xFF169ff9, 0xFF169ef8, 0xFF159df8, 0xFF159cf8, 0xFF149bf8, 0xFF139af7, 0xFF1299f6, 0xFF1299f6,
+ 0xFF5ed5f9, 0xFF63d6fc, 0xFF68d6ff, 0xFF5fd3fc, 0xFF56d0f8, 0xFF53cff8, 0xFF51cef8, 0xFF4ecdf9,
+ 0xFF4bccfb, 0xFF4acbfb, 0xFF48cafb, 0xFF47c9fa, 0xFF46c8fb, 0xFF44c7fa, 0xFF43c7fa, 0xFF42c6fa,
+ 0xFF40c5f9, 0xFF3fc4f9, 0xFF3ec3f9, 0xFF3dc1f9, 0xFF3cc0f9, 0xFF3bc1f9, 0xFF3bc1f8, 0xFF38bff7,
+ 0xFF36bdf7, 0xFF35bdfa, 0xFF34bdfe, 0xFF22c3f6, 0xFF27bbfc, 0xFF53b0b2, 0xFF9bc606, 0xFFc1d322,
+ 0xFFd3dd36, 0xFFb4ba12, 0xFFc4c71f, 0xFFc5cf22, 0xFFd9d82d, 0xFFdfdb30, 0xFFdcd52b, 0xFFe8d520,
+ 0xFFd5d51c, 0xFFe8e428, 0xFFece324, 0xFFd1ce1f, 0xFFd3c51d, 0xFFdcc302, 0xFFcfc312, 0xFFe3c209,
+ 0xFFe3be00, 0xFF84bf6e, 0xFF0ca0f6, 0xFF129ffd, 0xFF18a2f6, 0xFF19a1f5, 0xFF1ba1f4, 0xFF18a0f6,
+ 0xFF169ff8, 0xFF159ef8, 0xFF159df8, 0xFF149cf7, 0xFF139bf7, 0xFF129af6, 0xFF1098f4, 0xFF1098f4,
+ 0xFF65d7fb, 0xFF5dd4fa, 0xFF56d2f8, 0xFF53d0f9, 0xFF50cff9, 0xFF4fcef9, 0xFF4dcdfa, 0xFF4bcdfa,
+ 0xFF4accfb, 0xFF48cbfb, 0xFF47cafb, 0xFF46c9fa, 0xFF45c8fa, 0xFF44c7fa, 0xFF43c7fa, 0xFF42c6fa,
+ 0xFF40c5fa, 0xFF3fc4f9, 0xFF3ec3f9, 0xFF3dc1f9, 0xFF3bc0f9, 0xFF3ac0f9, 0xFF39c0f9, 0xFF38bff9,
+ 0xFF37bff9, 0xFF34bef8, 0xFF31bcf7, 0xFF33bbf8, 0xFF35bbfa, 0xFF2cbcff, 0xFF61c2df, 0xFF93cb85,
+ 0xFFc5d52b, 0xFFcbd82f, 0xFFb0bb13, 0xFFb5be17, 0xFFb9c21b, 0xFFc7c826, 0xFFc5bf21, 0xFFdbc817,
+ 0xFFcac819, 0xFFdbd722, 0xFFddd61a, 0xFFb7bd0d, 0xFFc8bd04, 0xFFd0c000, 0xFFadc951, 0xFF6cb8b1,
+ 0xFF04a3ff, 0xFF13a4fb, 0xFF21a4f5, 0xFF1ea3f5, 0xFF1aa1f6, 0xFF19a1f6, 0xFF18a0f7, 0xFF17a0f7,
+ 0xFF169ff8, 0xFF159ef7, 0xFF149ef7, 0xFF139df7, 0xFF139cf6, 0xFF119af4, 0xFF0f98f2, 0xFF0f98f2,
+ 0xFF5cd5f9, 0xFF58d3f8, 0xFF53d1f8, 0xFF52d0f9, 0xFF50cff9, 0xFF4ecefa, 0xFF4ccdfa, 0xFF4accfa,
+ 0xFF48ccfa, 0xFF47cbfa, 0xFF46cafa, 0xFF45c9fa, 0xFF44c8fa, 0xFF43c7fa, 0xFF42c7fa, 0xFF41c6fa,
+ 0xFF40c5fa, 0xFF3fc4f9, 0xFF3ec2f9, 0xFF3cc1f9, 0xFF3bc0f9, 0xFF3ac0f9, 0xFF38bff9, 0xFF37bff9,
+ 0xFF36bff9, 0xFF35bdf6, 0xFF34bbf3, 0xFF35b9f7, 0xFF35b8fb, 0xFF22b5ff, 0xFF2fb5ff, 0xFF4dbae6,
+ 0xFF6bbfce, 0xFF27b1c5, 0xFF6cbc7c, 0xFF8abd49, 0xFFa7be15, 0xFFb9bf09, 0xFFccc000, 0xFFdac43d,
+ 0xFFbbca20, 0xFFaec73e, 0xFF99bc54, 0xFF5aad8b, 0xFF36abc4, 0xFF04b3ff, 0xFF15a7ff, 0xFF21a4ff,
+ 0xFF19a0fb, 0xFF1ba2fa, 0xFF1da4f9, 0xFF1ba3f8, 0xFF1aa1f7, 0xFF19a1f7, 0xFF18a0f7, 0xFF17a0f7,
+ 0xFF169ff8, 0xFF159ef7, 0xFF149ef7, 0xFF139df7, 0xFF129cf6, 0xFF119af5, 0xFF0f99f3, 0xFF0f99f3,
+ 0xFF53d2f6, 0xFF52d1f7, 0xFF51d1f8, 0xFF50d0f9, 0xFF4fcffa, 0xFF4dcefa, 0xFF4bcdfa, 0xFF49ccfa,
+ 0xFF47cbfa, 0xFF46caf9, 0xFF45caf9, 0xFF44c9f9, 0xFF44c8fa, 0xFF43c7fa, 0xFF42c6f9, 0xFF41c6f9,
+ 0xFF40c5fa, 0xFF3fc4f9, 0xFF3dc2f9, 0xFF3cc1f9, 0xFF3ac0f9, 0xFF39c0f9, 0xFF38bff9, 0xFF36bff9,
+ 0xFF35bef8, 0xFF36bcf4, 0xFF38baf0, 0xFF36b8f6, 0xFF34b5fc, 0xFF2cb6f9, 0xFF23b7f6, 0xFF25b5fa,
+ 0xFF28b4ff, 0xFF28b6ff, 0xFF29b7ff, 0xFF1fb5ff, 0xFF15b2ff, 0xFF20aef7, 0xFF3cb9ff, 0xFF5acbf0,
+ 0xFF42befa, 0xFF2ab6fc, 0xFF12adff, 0xFF18acfc, 0xFF1eacfa, 0xFF1ea9fd, 0xFF1ea7ff, 0xFF1ba8fa,
+ 0xFF18a8f4, 0xFF18a6f8, 0xFF18a4fd, 0xFF19a3fa, 0xFF1aa1f7, 0xFF19a1f7, 0xFF18a0f8, 0xFF17a0f8,
+ 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf6, 0xFF119af5, 0xFF1099f4, 0xFF1099f4,
+ 0xFF54d1f8, 0xFF52d1f8, 0xFF51d0f9, 0xFF4fcff9, 0xFF4ecffa, 0xFF4ccefa, 0xFF4acdf9, 0xFF48ccf9,
+ 0xFF45cbf9, 0xFF45caf9, 0xFF44c9f9, 0xFF43c8f9, 0xFF43c8f9, 0xFF42c7f9, 0xFF42c6f9, 0xFF41c5f9,
+ 0xFF40c5fa, 0xFF3fc4f9, 0xFF3dc2f9, 0xFF3bc1f9, 0xFF3ac0fa, 0xFF38bff9, 0xFF37bff9, 0xFF36bef9,
+ 0xFF34bef8, 0xFF35bcf6, 0xFF35baf5, 0xFF34b8f8, 0xFF33b6fc, 0xFF2eb6f9, 0xFF29b6f7, 0xFF29b5f8,
+ 0xFF2ab4fa, 0xFF2ab5fb, 0xFF2ab5fc, 0xFF2ab2f6, 0xFF2aafef, 0xFF1ba9f6, 0xFF9bcfd9, 0xFF6dcfe9,
+ 0xFF74c7e4, 0xFF80c9dd, 0xFF19adfb, 0xFF1cacf9, 0xFF1fabf8, 0xFF1fa9f9, 0xFF1ea7fb, 0xFF1ca7f9,
+ 0xFF1aa7f6, 0xFF1aa5f8, 0xFF1aa4fb, 0xFF1aa3fa, 0xFF1aa2f8, 0xFF19a1f8, 0xFF18a0f8, 0xFF17a0f8,
+ 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf6, 0xFF119bf5, 0xFF119af5, 0xFF119af5,
+ 0xFF55d0f9, 0xFF53d0fa, 0xFF51d0fa, 0xFF4fcffa, 0xFF4dcffa, 0xFF4bcefa, 0xFF49cdf9, 0xFF46ccf9,
+ 0xFF44caf8, 0xFF43caf8, 0xFF43c9f8, 0xFF43c8f9, 0xFF42c8f9, 0xFF42c7f9, 0xFF41c6f9, 0xFF41c6f9,
+ 0xFF40c5fa, 0xFF3ec3f9, 0xFF3dc2fa, 0xFF3bc1fa, 0xFF39c0fa, 0xFF38bff9, 0xFF36bff9, 0xFF35bef9,
+ 0xFF34bdf8, 0xFF33bcf9, 0xFF33bafa, 0xFF32b9fb, 0xFF32b8fc, 0xFF30b7fa, 0xFF2eb6f8, 0xFF2db5f7,
+ 0xFF2bb4f5, 0xFF2bb4f6, 0xFF2bb3f7, 0xFF29b2f9, 0xFF28b2fc, 0xFF30b2f7, 0xFF12a8fe, 0xFF7fd4e1,
+ 0xFF58bbe6, 0xFF15aafb, 0xFF1fadf8, 0xFF20acf7, 0xFF20aaf5, 0xFF1fa9f6, 0xFF1ea8f7, 0xFF1da6f7,
+ 0xFF1ca5f8, 0xFF1ca4f8, 0xFF1ba3f9, 0xFF1ba3f9, 0xFF1ba2f9, 0xFF19a1f9, 0xFF18a0f8, 0xFF17a0f8,
+ 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5,
+ 0xFF55d0f9, 0xFF53d0fa, 0xFF51d0fa, 0xFF4fcffa, 0xFF4dcffa, 0xFF4bcefa, 0xFF49cdf9, 0xFF46ccf9,
+ 0xFF44caf8, 0xFF43caf8, 0xFF43c9f8, 0xFF43c8f9, 0xFF42c8f9, 0xFF42c7f9, 0xFF41c6f9, 0xFF41c6f9,
+ 0xFF40c5fa, 0xFF3ec3f9, 0xFF3dc2fa, 0xFF3bc1fa, 0xFF39c0fa, 0xFF38bff9, 0xFF36bff9, 0xFF35bef9,
+ 0xFF34bdf8, 0xFF33bcf9, 0xFF33bafa, 0xFF32b9fb, 0xFF32b8fc, 0xFF30b7fa, 0xFF2eb6f8, 0xFF2db5f7,
+ 0xFF2bb4f5, 0xFF2bb4f6, 0xFF2bb3f7, 0xFF2ab2f8, 0xFF29b2fa, 0xFF2db6f5, 0xFF1db5f6, 0xFF239bff,
+ 0xFF20b6f3, 0xFF0cacfb, 0xFF1eacf7, 0xFF1fabf6, 0xFF20aaf5, 0xFF1fa9f6, 0xFF1ea8f7, 0xFF1da6f7,
+ 0xFF1ca5f8, 0xFF1ca4f8, 0xFF1ba3f9, 0xFF1ba3f9, 0xFF1ba2f9, 0xFF19a1f9, 0xFF18a0f8, 0xFF17a0f8,
+ 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5,
+ 0xFF55d0f9, 0xFF53d0fa, 0xFF51d0fa, 0xFF4fcffa, 0xFF4dcffa, 0xFF4bcefa, 0xFF49cdf9, 0xFF46ccf9,
+ 0xFF44caf8, 0xFF43caf8, 0xFF43c9f8, 0xFF43c8f9, 0xFF42c8f9, 0xFF42c7f9, 0xFF41c6f9, 0xFF41c6f9,
+ 0xFF40c5fa, 0xFF3ec3f9, 0xFF3dc2fa, 0xFF3bc1fa, 0xFF39c0fa, 0xFF38bff9, 0xFF36bff9, 0xFF35bef9,
+ 0xFF34bdf8, 0xFF33bcf9, 0xFF33bafa, 0xFF32b9fb, 0xFF32b8fc, 0xFF30b7fa, 0xFF2eb6f8, 0xFF2db5f7,
+ 0xFF2bb4f5, 0xFF2bb4f6, 0xFF2bb3f7, 0xFF2bb2f8, 0xFF2bb1f8, 0xFF22aff9, 0xFF19acfa, 0xFF1eadf7,
+ 0xFF24aef3, 0xFF20adf5, 0xFF1dabf6, 0xFF1fabf6, 0xFF20aaf5, 0xFF1fa9f6, 0xFF1ea8f7, 0xFF1da6f7,
+ 0xFF1ca5f8, 0xFF1ca4f8, 0xFF1ba3f9, 0xFF1ba3f9, 0xFF1ba2f9, 0xFF19a1f9, 0xFF18a0f8, 0xFF17a0f8,
+ 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5,
+ 0xFF55d0f9, 0xFF53d0fa, 0xFF51d0fa, 0xFF4fcffa, 0xFF4dcffa, 0xFF4bcefa, 0xFF49cdf9, 0xFF46ccf9,
+ 0xFF44caf8, 0xFF43caf8, 0xFF43c9f8, 0xFF43c8f9, 0xFF42c8f9, 0xFF42c7f9, 0xFF41c6f9, 0xFF41c6f9,
+ 0xFF40c5fa, 0xFF3ec3f9, 0xFF3dc2fa, 0xFF3bc1fa, 0xFF39c0fa, 0xFF38bff9, 0xFF36bff9, 0xFF35bef9,
+ 0xFF34bdf8, 0xFF33bcf9, 0xFF33bafa, 0xFF32b9fb, 0xFF32b8fc, 0xFF30b7fa, 0xFF2eb6f8, 0xFF2db5f7,
+ 0xFF2bb4f5, 0xFF2bb4f6, 0xFF2bb3f7, 0xFF2bb2f8, 0xFF2bb1f8, 0xFF22aff9, 0xFF19acfa, 0xFF1eadf7,
+ 0xFF24aef3, 0xFF20adf5, 0xFF1dabf6, 0xFF1fabf6, 0xFF20aaf5, 0xFF1fa9f6, 0xFF1ea8f7, 0xFF1da6f7,
+ 0xFF1ca5f8, 0xFF1ca4f8, 0xFF1ba3f9, 0xFF1ba3f9, 0xFF1ba2f9, 0xFF19a1f9, 0xFF18a0f8, 0xFF17a0f8,
+ 0xFF169ff8, 0xFF159ef7, 0xFF149df7, 0xFF139cf6, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5, 0xFF129bf5
+};
+
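+/**
+ * Count the pixels of two 32bpp images whose given colour channel differs
+ * by more than the allowed margin. size is in bytes and is divided by 4 to
+ * obtain the pixel count; channel is the byte offset of the channel within
+ * each pixel (0 = blue, 1 = green, 2 = red for little-endian
+ * PIXEL_FORMAT_XRGB32 data, matching the callers below).
+ */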
+static int test_bmp_cmp_count(const BYTE* mem1, const BYTE* mem2, int size, int channel, int margin)
+{
+ int error = 0;
+ int count = 0;
+ size /= 4;
+ mem1 += channel;
+ mem2 += channel;
+
+ for (int index = 0; index < size; index++)
+ {
+ if (*mem1 != *mem2)
+ {
+ error = (*mem1 > *mem2) ? *mem1 - *mem2 : *mem2 - *mem1;
+
+ if (error > margin)
+ count++;
+ }
+
+ mem1 += 4;
+ mem2 += 4;
+ }
+
+ return count;
+}
+
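+/**
+ * Companion to test_bmp_cmp_count() that prints one diagnostic line per
+ * mismatched pixel: its (x,y) position in the 64x64 image, the source
+ * Y/Cb/Cr components and the actual/expected R, G and B values.
+ */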
+static int test_bmp_cmp_dump(const BYTE* actual, const BYTE* expected, int size, int channel,
+ int margin)
+{
+ int error[3];
+ int count = 0;
+ size /= 4;
+ actual += channel;
+ expected += channel;
+
+ for (int index = 0; index < size; index++)
+ {
+ if (*actual != *expected)
+ {
+ const UINT32 pixel = *((const UINT32*)&actual[-channel]);
+ const UINT32 ePixel = *((const UINT32*)&expected[-channel]);
+ const INT16 Y = TEST_Y_COMPONENT[index];
+ const INT16 Cb = TEST_CB_COMPONENT[index];
+ const INT16 Cr = TEST_CR_COMPONENT[index];
+ const int x = index % 64;
+ const int y = (index - x) / 64;
+ BYTE R = 0;
+ BYTE G = 0;
+ BYTE B = 0;
+ BYTE eR = 0;
+ BYTE eG = 0;
+ BYTE eB = 0;
+
+ FreeRDPSplitColor(pixel, PIXEL_FORMAT_XRGB32, &R, &G, &B, NULL, NULL);
+ FreeRDPSplitColor(ePixel, PIXEL_FORMAT_XRGB32, &eR, &eG, &eB, NULL, NULL);
+ error[0] = (R > eR) ? R - eR : eR - R;
+ error[1] = (G > eG) ? G - eG : eG - G;
+ error[2] = (B > eB) ? B - eB : eB - B;
+
+ if ((error[0] > margin) || (error[1] > margin) || (error[2] > margin))
+ {
+ printf("(%2d,%2d) Y: %+5" PRId16 " Cb: %+5" PRId16 " Cr: %+5" PRId16
+ " R: %03" PRIu8 "/%03" PRIu8 " G: %03" PRIu8 "/%03" PRIu8 " B: %03" PRIu8
+ "/%03" PRIu8 " %d %d %d\n",
+ x, y, Y, Cb, Cr, R, eR, G, eG, B, eB, R - eR, G - eG, B - eB);
+ count++;
+ }
+ }
+
+ actual += 4;
+ expected += 4;
+ }
+
+ return count;
+}
+
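+/**
+ * Exercise the YCbCr to RGB primitives for one destination format, once via
+ * the combined yCbCrToRGB_16s8u_P3AC4R conversion and once via the two-step
+ * yCbCrToRGB_16s16s_P3P3 / RGBToRGB_16s8u_P3AC4R path. With compare set the
+ * fixed Y/Cb/Cr test planes are used as input and both results are checked
+ * against TEST_XRGB_IMAGE with a per-channel margin of 1 to absorb rounding
+ * differences; otherwise random input is converted and only profiled.
+ */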
+static int test_PrimitivesYCbCr(const primitives_t* prims, UINT32 format, prim_size_t roi,
+ BOOL compare)
+{
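+ /* The primitives take const INT16** sources while the planes allocated
+ * here are mutable; this union converts between the two pointer types
+ * without a qualifier-dropping cast. */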
+ union
+ {
+ const INT16** cpi;
+ INT16** pi;
+ const UINT16** cpv;
+ UINT16** pv;
+ } cnv;
+ pstatus_t status = -1;
+ int cnt[3];
+ float err[3];
+ BYTE* actual = NULL;
+ BYTE* actual1 = NULL;
+ const BYTE* expected = (const BYTE*)TEST_XRGB_IMAGE;
+ int margin = 1;
+ INT16* pYCbCr[3] = { NULL, NULL, NULL };
+ const UINT32 srcStride = roi.width * 2;
+ const UINT32 dstStride = roi.width * FreeRDPGetBytesPerPixel(format);
+ const UINT32 srcSize = srcStride * roi.height;
+ const UINT32 dstSize = dstStride * roi.height;
+ PROFILER_DEFINE(prof)
+ PROFILER_DEFINE(prof1)
+ PROFILER_DEFINE(prof2)
+ // return test_YCbCr_pixels();
+
+ actual = winpr_aligned_malloc(dstSize, 16);
+ actual1 = winpr_aligned_malloc(dstSize, 16);
+ PROFILER_CREATE(prof, "yCbCrToRGB_16s8u")
+ PROFILER_CREATE(prof1, "yCbCrToRGB16s16s")
+ PROFILER_CREATE(prof2, "RGBToRGB_16s8u")
+
+ if (!actual || !actual1)
+ goto fail;
+
+ ZeroMemory(actual, dstSize);
+ ZeroMemory(actual1, dstSize);
+ pYCbCr[0] = winpr_aligned_malloc(srcSize, 16);
+ pYCbCr[1] = winpr_aligned_malloc(srcSize, 16);
+ pYCbCr[2] = winpr_aligned_malloc(srcSize, 16);
+
+ if (!pYCbCr[0] || !pYCbCr[1] || !pYCbCr[2])
+ goto fail;
+
+ winpr_RAND(pYCbCr[0], srcSize);
+ winpr_RAND(pYCbCr[1], srcSize);
+ winpr_RAND(pYCbCr[2], srcSize);
+
+ if (compare)
+ {
+ memcpy(pYCbCr[0], TEST_Y_COMPONENT, srcSize);
+ memcpy(pYCbCr[1], TEST_CB_COMPONENT, srcSize);
+ memcpy(pYCbCr[2], TEST_CR_COMPONENT, srcSize);
+ }
+
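+ /* First path: convert the three planes straight to the destination
+ * format in a single call. */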
+ {
+ PROFILER_ENTER(prof)
+ cnv.pi = pYCbCr;
+ status =
+ prims->yCbCrToRGB_16s8u_P3AC4R(cnv.cpi, srcStride, actual, dstStride, format, &roi);
+ if (status != PRIMITIVES_SUCCESS)
+ goto fail;
+
+ PROFILER_EXIT(prof)
+ }
+
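+ /* Second path: convert YCbCr to RGB in 16-bit planes first, then pack
+ * the planes into the destination format; the result is checked against
+ * the same reference image below. */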
+ {
+ INT16* pSrcDst[3];
+ pSrcDst[0] = winpr_aligned_malloc(srcSize, 16);
+ pSrcDst[1] = winpr_aligned_malloc(srcSize, 16);
+ pSrcDst[2] = winpr_aligned_malloc(srcSize, 16);
+ /* Guard against allocation failure before copying; winpr_aligned_free()
+ * at fail2 accepts NULL, as the existing fail path already relies on. */
+ if (!pSrcDst[0] || !pSrcDst[1] || !pSrcDst[2])
+ {
+ status = -1;
+ goto fail2;
+ }
+ CopyMemory(pSrcDst[0], pYCbCr[0], srcSize);
+ CopyMemory(pSrcDst[1], pYCbCr[1], srcSize);
+ CopyMemory(pSrcDst[2], pYCbCr[2], srcSize);
+ PROFILER_ENTER(prof1)
+ cnv.pi = pSrcDst;
+ status = prims->yCbCrToRGB_16s16s_P3P3(cnv.cpi, srcStride, pSrcDst, srcStride, &roi);
+ PROFILER_EXIT(prof1)
+
+ if (status != PRIMITIVES_SUCCESS)
+ goto fail2;
+
+ PROFILER_ENTER(prof2)
+ status = prims->RGBToRGB_16s8u_P3AC4R(cnv.cpi, srcStride, actual1, dstStride, format, &roi);
+ PROFILER_EXIT(prof2)
+ fail2:
+ winpr_aligned_free(pSrcDst[0]);
+ winpr_aligned_free(pSrcDst[1]);
+ winpr_aligned_free(pSrcDst[2]);
+
+ if (status != PRIMITIVES_SUCCESS)
+ goto fail;
+ }
+
+ if (compare)
+ {
+ cnt[2] = test_bmp_cmp_count(actual, expected, dstSize, 2, margin); /* red */
+ err[2] = ((float)cnt[2]) / ((float)dstSize / 4.0f) * 100.0f;
+ cnt[1] = test_bmp_cmp_count(actual, expected, dstSize, 1, margin); /* green */
+ err[1] = ((float)cnt[1]) / ((float)dstSize / 4.0f) * 100.0f;
+ cnt[0] = test_bmp_cmp_count(actual, expected, dstSize, 0, margin); /* blue */
+ err[0] = ((float)cnt[0]) / ((float)dstSize / 4.0f) * 100.0f;
+
+ if (cnt[0] || cnt[1] || cnt[2])
+ {
+ printf("Summary information yCbCrToRGB_16s8u_P3AC4R\n");
+ printf("Red Error Dump:\n");
+ test_bmp_cmp_dump(actual, expected, dstSize, 2, margin); /* red */
+ printf("Green Error Dump:\n");
+ test_bmp_cmp_dump(actual, expected, dstSize, 1, margin); /* green */
+ printf("Blue Error Dump:\n");
+ test_bmp_cmp_dump(actual, expected, dstSize, 0, margin); /* blue */
+ printf("R: diff: %d (%f%%)\n", cnt[2], err[2]);
+ printf("G: diff: %d (%f%%)\n", cnt[1], err[1]);
+ printf("B: diff: %d (%f%%)\n", cnt[0], err[0]);
+ }
+
+ cnt[2] = test_bmp_cmp_count(actual1, expected, dstSize, 2, margin); /* red */
+ err[2] = ((float)cnt[2]) / ((float)dstSize / 4.0f) * 100.0f;
+ cnt[1] = test_bmp_cmp_count(actual1, expected, dstSize, 1, margin); /* green */
+ err[1] = ((float)cnt[1]) / ((float)dstSize / 4.0f) * 100.0f;
+ cnt[0] = test_bmp_cmp_count(actual1, expected, dstSize, 0, margin); /* blue */
+ err[0] = ((float)cnt[0]) / ((float)dstSize / 4.0f) * 100.0f;
+
+ if (cnt[0] || cnt[1] || cnt[2])
+ {
+ printf("Summary information yCbCrToRGB_16s16s_P3P3 & RGBToRGB_16s8u_P3AC4R\n");
+ printf("Red Error Dump:\n");
+ test_bmp_cmp_dump(actual1, expected, dstSize, 2, margin); /* red */
+ printf("Green Error Dump:\n");
+ test_bmp_cmp_dump(actual1, expected, dstSize, 1, margin); /* green */
+ printf("Blue Error Dump:\n");
+ test_bmp_cmp_dump(actual1, expected, dstSize, 0, margin); /* blue */
+ printf("R: diff: %d (%f%%)\n", cnt[2], err[2]);
+ printf("G: diff: %d (%f%%)\n", cnt[1], err[1]);
+ printf("B: diff: %d (%f%%)\n", cnt[0], err[0]);
+ }
+ }
+
+ PROFILER_PRINT_HEADER
+ PROFILER_PRINT(prof)
+ PROFILER_PRINT(prof1)
+ PROFILER_PRINT(prof2)
+ PROFILER_PRINT_FOOTER
+fail:
+ winpr_aligned_free((BYTE*)pYCbCr[0]);
+ winpr_aligned_free((BYTE*)pYCbCr[1]);
+ winpr_aligned_free((BYTE*)pYCbCr[2]);
+ winpr_aligned_free(actual);
+ winpr_aligned_free(actual1);
+ PROFILER_FREE(prof)
+ PROFILER_FREE(prof1)
+ PROFILER_FREE(prof2)
+ return status;
+}
+
+int TestPrimitivesYCbCr(int argc, char* argv[])
+{
+ const UINT32 formats[] = { PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_ARGB32,
+ PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32,
+ PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
+ const primitives_t* prims = primitives_get();
+ const primitives_t* generics = primitives_get_generic();
+
+ WINPR_UNUSED(argv);
+
+ if (argc < 2)
+ {
+ {
+ /* Do content comparison. */
+ for (UINT32 x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
+ {
+ prim_size_t roi = { 64, 64 };
+ int rc = 0;
+ printf("----------------------- GENERIC %s [%" PRIu32 "x%" PRIu32
+ "] COMPARE CONTENT ----\n",
+ FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height);
+ rc = test_PrimitivesYCbCr(generics, formats[x], roi, TRUE);
+
+ if (rc != PRIMITIVES_SUCCESS)
+ return rc;
+
+ printf("------------------------- END %s ----------------------\n",
+ FreeRDPGetColorFormatName(formats[x]));
+ printf("---------------------- OPTIMIZED %s [%" PRIu32 "x%" PRIu32
+ "] COMPARE CONTENT ----\n",
+ FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height);
+ rc = test_PrimitivesYCbCr(prims, formats[x], roi, TRUE);
+
+ if (rc != PRIMITIVES_SUCCESS)
+ return rc;
+
+ printf("------------------------- END %s ----------------------\n",
+ FreeRDPGetColorFormatName(formats[x]));
+ }
+ }
+ /* Do random data conversion with random sizes */
+ {
+ prim_size_t roi;
+
+ do
+ {
+ winpr_RAND(&roi.width, sizeof(roi.width));
+ roi.width %= 2048 / 4;
+ } while (roi.width < 16);
+
+ do
+ {
+ winpr_RAND(&roi.height, sizeof(roi.height));
+ roi.height %= 2048 / 4;
+ } while (roi.height < 16);
+
+ for (size_t x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
+ {
+ int rc = 0;
+ printf("----------------------- GENERIC %s [%" PRIu32 "x%" PRIu32
+			       "] RANDOM DATA ----\n",
+ FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height);
+ rc = test_PrimitivesYCbCr(generics, formats[x], roi, FALSE);
+
+ if (rc != PRIMITIVES_SUCCESS)
+ return rc;
+
+ printf("------------------------- END %s ----------------------\n",
+ FreeRDPGetColorFormatName(formats[x]));
+ printf("---------------------- OPTIMIZED %s [%" PRIu32 "x%" PRIu32
+			       "] RANDOM DATA ----\n",
+ FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height);
+ rc = test_PrimitivesYCbCr(prims, formats[x], roi, FALSE);
+
+ if (rc != PRIMITIVES_SUCCESS)
+ return rc;
+
+ printf("------------------------- END %s ----------------------\n",
+ FreeRDPGetColorFormatName(formats[x]));
+ }
+ }
+ }
+	/* Do a performance run with a 1/8 scaled full HD frame (240x135) */
+	else
+	{
+		prim_size_t roi = { 1920 / 8, 1080 / 8 };
+
+ for (size_t x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
+ {
+ int rc = 0;
+ printf("----------------------- GENERIC %s [%" PRIu32 "x%" PRIu32
+			       "] PERFORMANCE ----\n",
+ FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height);
+ rc = test_PrimitivesYCbCr(generics, formats[x], roi, FALSE);
+
+ if (rc != PRIMITIVES_SUCCESS)
+ return rc;
+
+ printf("------------------------- END %s ----------------------\n",
+ FreeRDPGetColorFormatName(formats[x]));
+ printf("---------------------- OPTIMIZED %s [%" PRIu32 "x%" PRIu32
+			       "] PERFORMANCE ----\n",
+ FreeRDPGetColorFormatName(formats[x]), roi.width, roi.height);
+ rc = test_PrimitivesYCbCr(prims, formats[x], roi, FALSE);
+
+ if (rc != PRIMITIVES_SUCCESS)
+ return rc;
+
+ printf("------------------------- END %s ----------------------\n",
+ FreeRDPGetColorFormatName(formats[x]));
+ }
+ }
+
+ return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesYCoCg.c b/libfreerdp/primitives/test/TestPrimitivesYCoCg.c
new file mode 100644
index 0000000..318aec6
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesYCoCg.c
@@ -0,0 +1,145 @@
+/* TestPrimitivesYCoCg.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+#include <freerdp/utils/profiler.h>
+
+/* ------------------------------------------------------------------------- */
+static BOOL test_YCoCgRToRGB_8u_AC4R_func(UINT32 width, UINT32 height)
+{
+ pstatus_t status = -1;
+ BYTE* out_sse = NULL;
+ BYTE* in = NULL;
+ BYTE* out_c = NULL;
+ const UINT32 srcStride = width * 4;
+ const UINT32 size = srcStride * height;
+ const UINT32 formats[] = { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32,
+ PIXEL_FORMAT_RGBX32, PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
+ PROFILER_DEFINE(genericProf)
+ PROFILER_DEFINE(optProf)
+ in = winpr_aligned_calloc(1, size, 16);
+ out_c = winpr_aligned_calloc(1, size, 16);
+ out_sse = winpr_aligned_calloc(1, size, 16);
+
+ if (!in || !out_c || !out_sse)
+ goto fail;
+
+ winpr_RAND(in, size);
+
+ for (size_t x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
+ {
+ const UINT32 format = formats[x];
+ const UINT32 dstStride = width * FreeRDPGetBytesPerPixel(format);
+ const char* formatName = FreeRDPGetColorFormatName(format);
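+		/* The last two arguments below are the YCoCg chroma shift (2) and
+		 * withAlpha (TRUE), per the YCoCgToRGB_8u_AC4R prototype. */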
+ PROFILER_CREATE(genericProf, "YCoCgRToRGB_8u_AC4R-GENERIC")
+ PROFILER_CREATE(optProf, "YCoCgRToRGB_8u_AC4R-OPT")
+ PROFILER_ENTER(genericProf)
+ status = generic->YCoCgToRGB_8u_AC4R(in, srcStride, out_c, format, dstStride, width, height,
+ 2, TRUE);
+ PROFILER_EXIT(genericProf)
+
+ if (status != PRIMITIVES_SUCCESS)
+ goto loop_fail;
+
+ PROFILER_ENTER(optProf)
+ status = optimized->YCoCgToRGB_8u_AC4R(in, srcStride, out_sse, format, dstStride, width,
+ height, 2, TRUE);
+ PROFILER_EXIT(optProf)
+
+ if (status != PRIMITIVES_SUCCESS)
+ goto loop_fail;
+
+ if (memcmp(out_c, out_sse, dstStride * height) != 0)
+ {
+ for (size_t i = 0; i < 1ull * width * height; ++i)
+ {
+ const UINT32 c = FreeRDPReadColor(out_c + 4 * i, format);
+ const UINT32 sse = FreeRDPReadColor(out_sse + 4 * i, format);
+
+ if (c != sse)
+ {
+					printf("optimized->YCoCgRToRGB FAIL[%s] [%" PRIuz "]: 0x%08" PRIx32
+					       " -> C 0x%08" PRIx32 " vs optimized 0x%08" PRIx32 "\n",
+					       formatName, i, *((const UINT32*)&in[4 * i]), c, sse);
+ status = -1;
+ }
+ }
+ }
+
+ printf("--------------------------- [%s] [%" PRIu32 "x%" PRIu32
+ "] ---------------------------\n",
+ formatName, width, height);
+ PROFILER_PRINT_HEADER
+ PROFILER_PRINT(genericProf)
+ PROFILER_PRINT(optProf)
+ PROFILER_PRINT_FOOTER
+ loop_fail:
+ PROFILER_FREE(genericProf)
+ PROFILER_FREE(optProf)
+
+ if (status != PRIMITIVES_SUCCESS)
+ goto fail;
+ }
+
+fail:
+ winpr_aligned_free(in);
+ winpr_aligned_free(out_c);
+ winpr_aligned_free(out_sse);
+ return status == PRIMITIVES_SUCCESS;
+}
+
+int TestPrimitivesYCoCg(int argc, char* argv[])
+{
+	WINPR_UNUSED(argv);
+ prim_test_setup(FALSE);
+
+ /* Random resolution tests */
+ if (argc < 2)
+ {
+ for (UINT32 x = 0; x < 10; x++)
+ {
+ UINT32 w = 0;
+ UINT32 h = 0;
+
+ do
+ {
+ winpr_RAND(&w, sizeof(w));
+ w %= 2048 / 4;
+ } while (w < 16);
+
+ do
+ {
+ winpr_RAND(&h, sizeof(h));
+ h %= 2048 / 4;
+ } while (h < 16);
+
+ if (!test_YCoCgRToRGB_8u_AC4R_func(w, h))
+ return 1;
+ }
+ }
+
+ /* Test once with full HD/4 */
+ if (!test_YCoCgRToRGB_8u_AC4R_func(1920 / 4, 1080 / 4))
+ return 1;
+
+ return 0;
+}
diff --git a/libfreerdp/primitives/test/TestPrimitivesYUV.c b/libfreerdp/primitives/test/TestPrimitivesYUV.c
new file mode 100644
index 0000000..f679c07
--- /dev/null
+++ b/libfreerdp/primitives/test/TestPrimitivesYUV.c
@@ -0,0 +1,979 @@
+
+#include <freerdp/config.h>
+
+#include <math.h>
+
+#include "prim_test.h"
+
+#include <winpr/wlog.h>
+#include <winpr/crypto.h>
+#include <freerdp/primitives.h>
+#include <freerdp/utils/profiler.h>
+
+#define TAG __FILE__
+
+#define PADDING_FILL_VALUE 0x37
+
+/* YUV to RGB conversion is lossy, so treat two values as equal if they
+ * differ by at most 4. */
+static BOOL similar(const BYTE* src, const BYTE* dst, size_t size)
+{
+ for (size_t x = 0; x < size; x++)
+ {
+ int diff = src[x] - dst[x];
+
+ if (abs(diff) > 4)
+ {
+ fprintf(stderr, "%" PRIuz " %02" PRIX8 " : %02" PRIX8 " diff=%d\n", x, src[x], dst[x],
+ abs(diff));
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+static BOOL similarRGB(const BYTE* src, const BYTE* dst, size_t size, UINT32 format, BOOL use444)
+{
+ const UINT32 bpp = FreeRDPGetBytesPerPixel(format);
+ BYTE fill = PADDING_FILL_VALUE;
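+	/* Formats with an alpha channel must leave the destination alpha byte
+	 * untouched (it keeps the prefilled value), while formats without
+	 * alpha are expected to write 0xFF into the unused byte. */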
+ if (!FreeRDPColorHasAlpha(format))
+ fill = 0xFF;
+
+ for (size_t x = 0; x < size; x++)
+ {
+ const LONG maxDiff = 4;
+ UINT32 sColor = 0;
+ UINT32 dColor = 0;
+ BYTE sR = 0;
+ BYTE sG = 0;
+ BYTE sB = 0;
+ BYTE sA = 0;
+ BYTE dR = 0;
+ BYTE dG = 0;
+ BYTE dB = 0;
+ BYTE dA = 0;
+ sColor = FreeRDPReadColor(src, format);
+ dColor = FreeRDPReadColor(dst, format);
+ src += bpp;
+ dst += bpp;
+ FreeRDPSplitColor(sColor, format, &sR, &sG, &sB, &sA, NULL);
+ FreeRDPSplitColor(dColor, format, &dR, &dG, &dB, &dA, NULL);
+
+ if ((labs(sR - dR) > maxDiff) || (labs(sG - dG) > maxDiff) || (labs(sB - dB) > maxDiff))
+ {
+ fprintf(
+ stderr,
+ "Color value mismatch R[%02X %02X], G[%02X %02X], B[%02X %02X] at position %" PRIuz
+ "\n",
+			    sR, dR, sG, dG, sB, dB, x);
+ return FALSE;
+ }
+
+ if (dA != fill)
+ {
+ fprintf(
+ stderr,
+ "[%s] Invalid destination alpha value 0x%02X [expected 0x%02X] at position %" PRIuz
+ "\n",
+ use444 ? "AVC444" : "AVC420", dA, fill, x);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+static void get_size(BOOL large, UINT32* width, UINT32* height)
+{
+ UINT32 shift = large ? 8 : 1;
+ winpr_RAND(width, sizeof(*width));
+ winpr_RAND(height, sizeof(*height));
+ // TODO: Algorithm only works on even resolutions...
+ *width = (*width % 64 + 1) << shift;
+ *height = (*height % 64 + 1) << shift;
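+	/* shift == 8 yields multiples of 256 in [256, 16384], shift == 1
+	 * yields even values in [2, 128]; either way both dimensions are even. */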
+}
+
+static BOOL check_padding(const BYTE* psrc, size_t size, size_t padding, const char* buffer)
+{
+ BOOL rc = TRUE;
+ const BYTE* src = NULL;
+ const BYTE* esrc = NULL;
+ size_t halfPad = (padding + 1) / 2;
+
+ if (!psrc)
+ return FALSE;
+
+ src = psrc - halfPad;
+ esrc = src + size + halfPad;
+
+ for (size_t x = 0; x < halfPad; x++)
+ {
+ const BYTE s = *src++;
+ const BYTE d = *esrc++;
+
+		if (s != 'A')
+		{
+			size_t start = x;
+
+			while ((x < halfPad) && (*src++ != 'A'))
+				x++;
+
+			fprintf(stderr,
+			        "Buffer underflow detected %02" PRIx8 " != %02X %s [%" PRIuz "-%" PRIuz "]\n",
+			        s, 'A', buffer, start, x);
+			return FALSE;
+		}
+
+ if (d != 'A')
+ {
+ size_t start = x;
+
+ while ((x < halfPad) && (*esrc++ != 'A'))
+ x++;
+
+ fprintf(stderr,
+ "Buffer overflow detected %02" PRIx8 " != %02X %s [%" PRIuz "-%" PRIuz "]\n", d,
+ 'A', buffer, start, x);
+ return FALSE;
+ }
+ }
+
+ return rc;
+}
+
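+/* Guard-byte layout used by set_padding()/check_padding()/free_padding():
+ *
+ *   [ halfPad x 'A' | payload filled with PADDING_FILL_VALUE | halfPad x 'A' ]
+ *                     ^-- pointer returned to the caller
+ *
+ * check_padding() verifies that both guard regions still contain 'A',
+ * catching primitives that read or write outside the payload. */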
+static void* set_padding(size_t size, size_t padding)
+{
+ size_t halfPad = (padding + 1) / 2;
+ BYTE* psrc = NULL;
+ BYTE* src = winpr_aligned_malloc(size + 2 * halfPad, 16);
+
+ if (!src)
+ return NULL;
+
+ memset(&src[0], 'A', halfPad);
+ memset(&src[halfPad], PADDING_FILL_VALUE, size);
+ memset(&src[halfPad + size], 'A', halfPad);
+ psrc = &src[halfPad];
+
+ if (!check_padding(psrc, size, padding, "init"))
+ {
+ winpr_aligned_free(src);
+ return NULL;
+ }
+
+ return psrc;
+}
+
+static void free_padding(void* src, size_t padding)
+{
+ BYTE* ptr = NULL;
+
+ if (!src)
+ return;
+
+ ptr = ((BYTE*)src) - (padding + 1) / 2;
+ winpr_aligned_free(ptr);
+}
+
+/* Create two pseudo-YUV420 frames of the same size, combine them into a
+ * YUV444 frame and check whether the data ends up at the expected
+ * positions, then split the result back into YUV420 frames. */
+static BOOL TestPrimitiveYUVCombine(primitives_t* prims, prim_size_t roi)
+{
+ union
+ {
+ const BYTE** cpv;
+ BYTE** pv;
+ } cnv;
+ UINT32 awidth = 0;
+ UINT32 aheight = 0;
+ BOOL rc = FALSE;
+ BYTE* luma[3] = { 0 };
+ BYTE* chroma[3] = { 0 };
+ BYTE* yuv[3] = { 0 };
+ BYTE* pmain[3] = { 0 };
+ BYTE* paux[3] = { 0 };
+ UINT32 lumaStride[3];
+ UINT32 chromaStride[3];
+ UINT32 yuvStride[3];
+ const size_t padding = 10000;
+ RECTANGLE_16 rect;
+ PROFILER_DEFINE(yuvCombine)
+ PROFILER_DEFINE(yuvSplit)
+ awidth = roi.width + 16 - roi.width % 16;
+ aheight = roi.height + 16 - roi.height % 16;
+ fprintf(stderr,
+ "Running YUVCombine on frame size %" PRIu32 "x%" PRIu32 " [%" PRIu32 "x%" PRIu32 "]\n",
+ roi.width, roi.height, awidth, aheight);
+ PROFILER_CREATE(yuvCombine, "YUV420CombineToYUV444")
+ PROFILER_CREATE(yuvSplit, "YUV444SplitToYUV420")
+ rect.left = 0;
+ rect.top = 0;
+ rect.right = roi.width;
+ rect.bottom = roi.height;
+
+ if (!prims || !prims->YUV420CombineToYUV444)
+ goto fail;
+
+ for (UINT32 x = 0; x < 3; x++)
+ {
+ size_t halfStride = ((x > 0) ? awidth / 2 : awidth);
+ size_t size = aheight * awidth;
+ size_t halfSize = ((x > 0) ? halfStride * aheight / 2 : awidth * aheight);
+ yuvStride[x] = awidth;
+
+ if (!(yuv[x] = set_padding(size, padding)))
+ goto fail;
+
+ lumaStride[x] = halfStride;
+
+ if (!(luma[x] = set_padding(halfSize, padding)))
+ goto fail;
+
+ if (!(pmain[x] = set_padding(halfSize, padding)))
+ goto fail;
+
+ chromaStride[x] = halfStride;
+
+ if (!(chroma[x] = set_padding(halfSize, padding)))
+ goto fail;
+
+ if (!(paux[x] = set_padding(halfSize, padding)))
+ goto fail;
+
+ memset(luma[x], 0xAB + 3 * x, halfSize);
+ memset(chroma[x], 0x80 + 2 * x, halfSize);
+
+ if (!check_padding(luma[x], halfSize, padding, "luma"))
+ goto fail;
+
+ if (!check_padding(chroma[x], halfSize, padding, "chroma"))
+ goto fail;
+
+ if (!check_padding(pmain[x], halfSize, padding, "main"))
+ goto fail;
+
+ if (!check_padding(paux[x], halfSize, padding, "aux"))
+ goto fail;
+
+ if (!check_padding(yuv[x], size, padding, "yuv"))
+ goto fail;
+ }
+
+ PROFILER_ENTER(yuvCombine)
+
+ cnv.pv = luma;
+ if (prims->YUV420CombineToYUV444(AVC444_LUMA, cnv.cpv, lumaStride, roi.width, roi.height, yuv,
+ yuvStride, &rect) != PRIMITIVES_SUCCESS)
+ {
+ PROFILER_EXIT(yuvCombine)
+ goto fail;
+ }
+
+ cnv.pv = chroma;
+ if (prims->YUV420CombineToYUV444(AVC444_CHROMAv1, cnv.cpv, chromaStride, roi.width, roi.height,
+ yuv, yuvStride, &rect) != PRIMITIVES_SUCCESS)
+ {
+ PROFILER_EXIT(yuvCombine)
+ goto fail;
+ }
+
+ PROFILER_EXIT(yuvCombine)
+
+ for (UINT32 x = 0; x < 3; x++)
+ {
+ size_t halfStride = ((x > 0) ? awidth / 2 : awidth);
+ size_t size = aheight * awidth;
+ size_t halfSize = ((x > 0) ? halfStride * aheight / 2 : awidth * aheight);
+
+ if (!check_padding(luma[x], halfSize, padding, "luma"))
+ goto fail;
+
+ if (!check_padding(chroma[x], halfSize, padding, "chroma"))
+ goto fail;
+
+ if (!check_padding(yuv[x], size, padding, "yuv"))
+ goto fail;
+ }
+
+ PROFILER_ENTER(yuvSplit)
+
+ cnv.pv = yuv;
+ if (prims->YUV444SplitToYUV420(cnv.cpv, yuvStride, pmain, lumaStride, paux, chromaStride,
+ &roi) != PRIMITIVES_SUCCESS)
+ {
+ PROFILER_EXIT(yuvSplit)
+ goto fail;
+ }
+
+ PROFILER_EXIT(yuvSplit)
+
+ for (UINT32 x = 0; x < 3; x++)
+ {
+ size_t halfStride = ((x > 0) ? awidth / 2 : awidth);
+ size_t size = aheight * awidth;
+ size_t halfSize = ((x > 0) ? halfStride * aheight / 2 : awidth * aheight);
+
+ if (!check_padding(pmain[x], halfSize, padding, "main"))
+ goto fail;
+
+ if (!check_padding(paux[x], halfSize, padding, "aux"))
+ goto fail;
+
+ if (!check_padding(yuv[x], size, padding, "yuv"))
+ goto fail;
+ }
+
+ for (UINT32 i = 0; i < 3; i++)
+ {
+ for (UINT32 y = 0; y < roi.height; y++)
+ {
+ UINT32 w = roi.width;
+ UINT32 lstride = lumaStride[i];
+ UINT32 cstride = chromaStride[i];
+
+ if (i > 0)
+ {
+ w = (roi.width + 3) / 4;
+
+				/* The U/V planes of the 420 frames only contain half the lines. */
+				if (y > (roi.height + 1) / 2)
+					continue;
+ }
+
+ if (!similar(luma[i] + y * lstride, pmain[i] + y * lstride, w))
+ goto fail;
+
+ /* Need to ignore lines of destination Y plane,
+ * if the lines are not a multiple of 16
+ * as the UV planes are packed in 8 line stripes. */
+ if (i == 0)
+ {
+ /* TODO: This check is not perfect, it does not
+ * include the last V lines packed to the Y
+ * frame. */
+ UINT32 rem = roi.height % 16;
+
+ if (y > roi.height - rem)
+ continue;
+ }
+
+ if (!similar(chroma[i] + y * cstride, paux[i] + y * cstride, w))
+ goto fail;
+ }
+ }
+
+ PROFILER_PRINT_HEADER
+ PROFILER_PRINT(yuvSplit)
+ PROFILER_PRINT(yuvCombine)
+ PROFILER_PRINT_FOOTER
+ rc = TRUE;
+fail:
+ PROFILER_FREE(yuvCombine)
+ PROFILER_FREE(yuvSplit)
+
+ for (UINT32 x = 0; x < 3; x++)
+ {
+ free_padding(yuv[x], padding);
+ free_padding(luma[x], padding);
+ free_padding(chroma[x], padding);
+ free_padding(pmain[x], padding);
+ free_padding(paux[x], padding);
+ }
+
+ return rc;
+}
+
+static BOOL TestPrimitiveYUV(primitives_t* prims, prim_size_t roi, BOOL use444)
+{
+ union
+ {
+ const BYTE** cpv;
+ BYTE** pv;
+ } cnv;
+ BOOL res = FALSE;
+ UINT32 awidth = 0;
+ UINT32 aheight = 0;
+ BYTE* yuv[3] = { 0 };
+ UINT32 yuv_step[3];
+ BYTE* rgb = NULL;
+ BYTE* rgb_dst = NULL;
+ size_t size = 0;
+ size_t uvsize = 0;
+ size_t uvwidth = 0;
+ size_t padding = 100 * 16;
+ UINT32 stride = 0;
+ const UINT32 formats[] = { PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_ARGB32,
+ PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32,
+ PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
+ PROFILER_DEFINE(rgbToYUV420)
+ PROFILER_DEFINE(rgbToYUV444)
+ PROFILER_DEFINE(yuv420ToRGB)
+ PROFILER_DEFINE(yuv444ToRGB)
+ /* Buffers need to be 16x16 aligned. */
+ awidth = roi.width + 16 - roi.width % 16;
+ aheight = roi.height + 16 - roi.height % 16;
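+	/* e.g. roi 61x29 -> awidth = 61 + 16 - (61 % 16) = 64 and
+	 * aheight = 29 + 16 - (29 % 16) = 32; note that an already aligned
+	 * size is still rounded up by another 16. */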
+ stride = awidth * sizeof(UINT32);
+ size = awidth * aheight;
+
+ if (use444)
+ {
+ uvwidth = awidth;
+ uvsize = size;
+
+ if (!prims || !prims->RGBToYUV444_8u_P3AC4R || !prims->YUV444ToRGB_8u_P3AC4R)
+ return FALSE;
+ }
+ else
+ {
+ uvwidth = (awidth + 1) / 2;
+ uvsize = (aheight + 1) / 2 * uvwidth;
+
+ if (!prims || !prims->RGBToYUV420_8u_P3AC4R || !prims->YUV420ToRGB_8u_P3AC4R)
+ return FALSE;
+ }
+
+ fprintf(stderr, "Running AVC%s on frame size %" PRIu32 "x%" PRIu32 "\n", use444 ? "444" : "420",
+ roi.width, roi.height);
+
+ /* Test RGB to YUV444 conversion and vice versa */
+ if (!(rgb = set_padding(size * sizeof(UINT32), padding)))
+ goto fail;
+
+ if (!(rgb_dst = set_padding(size * sizeof(UINT32), padding)))
+ goto fail;
+
+ if (!(yuv[0] = set_padding(size, padding)))
+ goto fail;
+
+ if (!(yuv[1] = set_padding(uvsize, padding)))
+ goto fail;
+
+ if (!(yuv[2] = set_padding(uvsize, padding)))
+ goto fail;
+
+ for (UINT32 y = 0; y < roi.height; y++)
+ {
+ BYTE* line = &rgb[y * stride];
+
+ for (UINT32 x = 0; x < roi.width; x++)
+ {
+ line[x * 4 + 0] = 0x81;
+ line[x * 4 + 1] = 0x33;
+ line[x * 4 + 2] = 0xAB;
+ line[x * 4 + 3] = 0xFF;
+ }
+ }
+
+ yuv_step[0] = awidth;
+ yuv_step[1] = uvwidth;
+ yuv_step[2] = uvwidth;
+
+ for (UINT32 x = 0; x < ARRAYSIZE(formats); x++)
+ {
+ pstatus_t rc = 0;
+ const UINT32 DstFormat = formats[x];
+ printf("Testing destination color format %s\n", FreeRDPGetColorFormatName(DstFormat));
+ memset(rgb_dst, PADDING_FILL_VALUE, size * sizeof(UINT32));
+
+ PROFILER_CREATE(rgbToYUV420, "RGBToYUV420")
+ PROFILER_CREATE(rgbToYUV444, "RGBToYUV444")
+ PROFILER_CREATE(yuv420ToRGB, "YUV420ToRGB")
+ PROFILER_CREATE(yuv444ToRGB, "YUV444ToRGB")
+
+ if (use444)
+ {
+ PROFILER_ENTER(rgbToYUV444)
+ rc = prims->RGBToYUV444_8u_P3AC4R(rgb, DstFormat, stride, yuv, yuv_step, &roi);
+ PROFILER_EXIT(rgbToYUV444)
+
+ if (rc != PRIMITIVES_SUCCESS)
+ goto loop_fail;
+
+ PROFILER_PRINT_HEADER
+ PROFILER_PRINT(rgbToYUV444)
+ PROFILER_PRINT_FOOTER
+ }
+ else
+ {
+ PROFILER_ENTER(rgbToYUV420)
+ rc = prims->RGBToYUV420_8u_P3AC4R(rgb, DstFormat, stride, yuv, yuv_step, &roi);
+ PROFILER_EXIT(rgbToYUV420)
+
+ if (rc != PRIMITIVES_SUCCESS)
+ goto loop_fail;
+
+ PROFILER_PRINT_HEADER
+ PROFILER_PRINT(rgbToYUV420)
+ PROFILER_PRINT_FOOTER
+ }
+
+ if (!check_padding(rgb, size * sizeof(UINT32), padding, "rgb"))
+ {
+ rc = -1;
+ goto loop_fail;
+ }
+
+ if ((!check_padding(yuv[0], size, padding, "Y")) ||
+ (!check_padding(yuv[1], uvsize, padding, "U")) ||
+ (!check_padding(yuv[2], uvsize, padding, "V")))
+ {
+ rc = -1;
+ goto loop_fail;
+ }
+
+ cnv.pv = yuv;
+ if (use444)
+ {
+			PROFILER_ENTER(yuv444ToRGB)
+			rc = prims->YUV444ToRGB_8u_P3AC4R(cnv.cpv, yuv_step, rgb_dst, stride, DstFormat, &roi);
+
+		/* Shared failure label for the conversion and padding checks above;
+		 * the profiler is stopped exactly once on success and failure. */
+		loop_fail:
+			PROFILER_EXIT(yuv444ToRGB)
+ PROFILER_PRINT_HEADER
+ PROFILER_PRINT(yuv444ToRGB)
+ PROFILER_PRINT_FOOTER
+
+ if (rc != PRIMITIVES_SUCCESS)
+ goto fail;
+ }
+ else
+ {
+ PROFILER_ENTER(yuv420ToRGB)
+
+ if (prims->YUV420ToRGB_8u_P3AC4R(cnv.cpv, yuv_step, rgb_dst, stride, DstFormat, &roi) !=
+ PRIMITIVES_SUCCESS)
+ {
+ PROFILER_EXIT(yuv420ToRGB)
+ goto fail;
+ }
+
+ PROFILER_EXIT(yuv420ToRGB)
+ PROFILER_PRINT_HEADER
+ PROFILER_PRINT(yuv420ToRGB)
+ PROFILER_PRINT_FOOTER
+ }
+
+ if (!check_padding(rgb_dst, size * sizeof(UINT32), padding, "rgb dst"))
+ goto fail;
+
+ if ((!check_padding(yuv[0], size, padding, "Y")) ||
+ (!check_padding(yuv[1], uvsize, padding, "U")) ||
+ (!check_padding(yuv[2], uvsize, padding, "V")))
+ goto fail;
+
+ for (UINT32 y = 0; y < roi.height; y++)
+ {
+ BYTE* srgb = &rgb[y * stride];
+ BYTE* drgb = &rgb_dst[y * stride];
+
+ if (!similarRGB(srgb, drgb, roi.width, DstFormat, use444))
+ goto fail;
+ }
+
+ PROFILER_FREE(rgbToYUV420)
+ PROFILER_FREE(rgbToYUV444)
+ PROFILER_FREE(yuv420ToRGB)
+ PROFILER_FREE(yuv444ToRGB)
+ }
+
+ res = TRUE;
+fail:
+ free_padding(rgb, padding);
+ free_padding(rgb_dst, padding);
+ free_padding(yuv[0], padding);
+ free_padding(yuv[1], padding);
+ free_padding(yuv[2], padding);
+ return res;
+}
+
+static BOOL allocate_yuv420(BYTE** planes, UINT32 width, UINT32 height, UINT32 padding)
+{
+ const size_t size = width * height;
+ const size_t uvwidth = (width + 1) / 2;
+ const size_t uvsize = (height + 1) / 2 * uvwidth;
+
+ if (!(planes[0] = set_padding(size, padding)))
+ goto fail;
+
+ if (!(planes[1] = set_padding(uvsize, padding)))
+ goto fail;
+
+ if (!(planes[2] = set_padding(uvsize, padding)))
+ goto fail;
+
+ return TRUE;
+fail:
+ free_padding(planes[0], padding);
+ free_padding(planes[1], padding);
+ free_padding(planes[2], padding);
+ return FALSE;
+}
+
+static void free_yuv420(BYTE** planes, UINT32 padding)
+{
+ if (!planes)
+ return;
+
+ free_padding(planes[0], padding);
+ free_padding(planes[1], padding);
+ free_padding(planes[2], padding);
+ planes[0] = NULL;
+ planes[1] = NULL;
+ planes[2] = NULL;
+}
+static BOOL check_yuv420(BYTE** planes, UINT32 width, UINT32 height, UINT32 padding)
+{
+ const size_t size = width * height;
+ const size_t uvwidth = (width + 1) / 2;
+ const size_t uvsize = (height + 1) / 2 * uvwidth;
+ const BOOL yOk = check_padding(planes[0], size, padding, "Y");
+ const BOOL uOk = check_padding(planes[1], uvsize, padding, "U");
+ const BOOL vOk = check_padding(planes[2], uvsize, padding, "V");
+ return (yOk && uOk && vOk);
+}
+
+static BOOL check_for_mismatches(const BYTE* planeA, const BYTE* planeB, UINT32 size)
+{
+ BOOL rc = FALSE;
+
+ for (UINT32 x = 0; x < size; x++)
+ {
+ const BYTE a = planeA[x];
+ const BYTE b = planeB[x];
+
+ if (fabsf((float)a - (float)b) > 2.0f)
+ {
+ rc = TRUE;
+ fprintf(stderr, "[%08x] %02x != %02x\n", x, a, b);
+ }
+ }
+
+ return rc;
+}
+
+static BOOL compare_yuv420(BYTE** planesA, BYTE** planesB, UINT32 width, UINT32 height,
+ UINT32 padding)
+{
+ BOOL rc = TRUE;
+ const size_t size = width * height;
+ const size_t uvwidth = (width + 1) / 2;
+ const size_t uvsize = (height + 1) / 2 * uvwidth;
+
+ if (check_for_mismatches(planesA[0], planesB[0], size))
+ {
+		fprintf(stderr, "Mismatch in Y planes!\n");
+ rc = FALSE;
+ }
+
+ if (check_for_mismatches(planesA[1], planesB[1], uvsize))
+ {
+		fprintf(stderr, "Mismatch in U planes!\n");
+ rc = FALSE;
+ }
+
+ if (check_for_mismatches(planesA[2], planesB[2], uvsize))
+ {
+		fprintf(stderr, "Mismatch in V planes!\n");
+ rc = FALSE;
+ }
+
+ return rc;
+}
+
+static BOOL TestPrimitiveRgbToLumaChroma(primitives_t* prims, prim_size_t roi, UINT32 version)
+{
+ BOOL res = FALSE;
+ UINT32 awidth = 0;
+ UINT32 aheight = 0;
+ BYTE* luma[3] = { 0 };
+ BYTE* chroma[3] = { 0 };
+ BYTE* lumaGeneric[3] = { 0 };
+ BYTE* chromaGeneric[3] = { 0 };
+ UINT32 yuv_step[3];
+ BYTE* rgb = NULL;
+ size_t size = 0;
+ size_t uvwidth = 0;
+ const size_t padding = 0x1000;
+ UINT32 stride = 0;
+ __RGBToAVC444YUV_t fkt = NULL;
+ __RGBToAVC444YUV_t gen = NULL;
+ const UINT32 formats[] = { PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_XBGR32, PIXEL_FORMAT_ARGB32,
+ PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32,
+ PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 };
+ PROFILER_DEFINE(rgbToYUV444)
+ PROFILER_DEFINE(rgbToYUV444opt)
+ /* Buffers need to be 16x16 aligned. */
+ awidth = roi.width;
+
+ if (awidth % 16 != 0)
+ awidth += 16 - roi.width % 16;
+
+ aheight = roi.height;
+
+ if (aheight % 16 != 0)
+ aheight += 16 - roi.height % 16;
+
+ stride = awidth * sizeof(UINT32);
+ size = awidth * aheight;
+ uvwidth = (awidth + 1) / 2;
+
+ if (!prims || !generic)
+ return FALSE;
+
+ switch (version)
+ {
+ case 1:
+ fkt = prims->RGBToAVC444YUV;
+ gen = generic->RGBToAVC444YUV;
+ break;
+
+ case 2:
+ fkt = prims->RGBToAVC444YUVv2;
+ gen = generic->RGBToAVC444YUVv2;
+ break;
+
+ default:
+ return FALSE;
+ }
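+	/* AVC444 v1 and v2 differ in how the auxiliary (chroma) frame is
+	 * packed; both share the same argument layout, so one test path
+	 * covers either version. */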
+
+ if (!fkt || !gen)
+ return FALSE;
+
+ fprintf(stderr, "Running AVC444 on frame size %" PRIu32 "x%" PRIu32 "\n", roi.width,
+ roi.height);
+
+ /* Test RGB to YUV444 conversion and vice versa */
+ if (!(rgb = set_padding(size * sizeof(UINT32), padding)))
+ goto fail;
+
+ if (!allocate_yuv420(luma, awidth, aheight, padding))
+ goto fail;
+
+ if (!allocate_yuv420(chroma, awidth, aheight, padding))
+ goto fail;
+
+ if (!allocate_yuv420(lumaGeneric, awidth, aheight, padding))
+ goto fail;
+
+ if (!allocate_yuv420(chromaGeneric, awidth, aheight, padding))
+ goto fail;
+
+ for (UINT32 y = 0; y < roi.height; y++)
+ {
+ BYTE* line = &rgb[y * stride];
+
+ for (UINT32 x = 0; x < roi.width; x++)
+ {
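+			/* Keep the #if below at 1 for random input; the #else branch
+			 * produces a deterministic pattern that is easier to inspect
+			 * when chasing a reported mismatch. */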
+#if 1
+ line[x * 4 + 0] = rand();
+ line[x * 4 + 1] = rand();
+ line[x * 4 + 2] = rand();
+ line[x * 4 + 3] = rand();
+#else
+ line[x * 4 + 0] = (y * roi.width + x) * 16 + 5;
+ line[x * 4 + 1] = (y * roi.width + x) * 16 + 7;
+ line[x * 4 + 2] = (y * roi.width + x) * 16 + 11;
+ line[x * 4 + 3] = (y * roi.width + x) * 16 + 0;
+#endif
+ }
+ }
+
+ yuv_step[0] = awidth;
+ yuv_step[1] = uvwidth;
+ yuv_step[2] = uvwidth;
+
+ for (UINT32 x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
+ {
+ pstatus_t rc = -1;
+ const UINT32 DstFormat = formats[x];
+ printf("Testing destination color format %s\n", FreeRDPGetColorFormatName(DstFormat));
+ PROFILER_CREATE(rgbToYUV444, "RGBToYUV444-generic")
+ PROFILER_CREATE(rgbToYUV444opt, "RGBToYUV444-optimized")
+
+ for (UINT32 cnt = 0; cnt < 10; cnt++)
+ {
+ PROFILER_ENTER(rgbToYUV444opt)
+ rc = fkt(rgb, DstFormat, stride, luma, yuv_step, chroma, yuv_step, &roi);
+ PROFILER_EXIT(rgbToYUV444opt)
+
+ if (rc != PRIMITIVES_SUCCESS)
+ goto loop_fail;
+ }
+
+ PROFILER_PRINT_HEADER
+ PROFILER_PRINT(rgbToYUV444opt)
+ PROFILER_PRINT_FOOTER
+
+ if (!check_padding(rgb, size * sizeof(UINT32), padding, "rgb"))
+ {
+ rc = -1;
+ goto loop_fail;
+ }
+
+ if (!check_yuv420(luma, awidth, aheight, padding) ||
+ !check_yuv420(chroma, awidth, aheight, padding))
+ {
+ rc = -1;
+ goto loop_fail;
+ }
+
+ for (UINT32 cnt = 0; cnt < 10; cnt++)
+ {
+ PROFILER_ENTER(rgbToYUV444)
+ rc = gen(rgb, DstFormat, stride, lumaGeneric, yuv_step, chromaGeneric, yuv_step, &roi);
+ PROFILER_EXIT(rgbToYUV444)
+
+ if (rc != PRIMITIVES_SUCCESS)
+ goto loop_fail;
+ }
+
+ PROFILER_PRINT_HEADER
+ PROFILER_PRINT(rgbToYUV444)
+ PROFILER_PRINT_FOOTER
+
+ if (!check_padding(rgb, size * sizeof(UINT32), padding, "rgb"))
+ {
+ rc = -1;
+ goto loop_fail;
+ }
+
+ if (!check_yuv420(lumaGeneric, awidth, aheight, padding) ||
+ !check_yuv420(chromaGeneric, awidth, aheight, padding))
+ {
+ rc = -1;
+ goto loop_fail;
+ }
+
+ if (!compare_yuv420(luma, lumaGeneric, awidth, aheight, padding) ||
+ !compare_yuv420(chroma, chromaGeneric, awidth, aheight, padding))
+ {
+ rc = -1;
+ goto loop_fail;
+ }
+
+ loop_fail:
+ PROFILER_FREE(rgbToYUV444)
+ PROFILER_FREE(rgbToYUV444opt)
+
+ if (rc != PRIMITIVES_SUCCESS)
+ goto fail;
+ }
+
+ res = TRUE;
+fail:
+ free_padding(rgb, padding);
+ free_yuv420(luma, padding);
+ free_yuv420(chroma, padding);
+ free_yuv420(lumaGeneric, padding);
+ free_yuv420(chromaGeneric, padding);
+ return res;
+}
+
+int TestPrimitivesYUV(int argc, char* argv[])
+{
+ BOOL large = (argc > 1);
+ int rc = -1;
+ prim_test_setup(FALSE);
+ primitives_t* prims = primitives_get();
+
+ for (UINT32 x = 0; x < 5; x++)
+ {
+ prim_size_t roi;
+
+ if (argc > 1)
+ {
+			int crc = sscanf(argv[1], "%" SCNu32 "x%" SCNu32, &roi.width, &roi.height);
+
+ if (crc != 2)
+ {
+ roi.width = 1920;
+ roi.height = 1080;
+ }
+ }
+ else
+ get_size(large, &roi.width, &roi.height);
+
+ printf("-------------------- GENERIC ------------------------\n");
+
+ if (!TestPrimitiveYUV(generic, roi, TRUE))
+ {
+ printf("TestPrimitiveYUV (444) failed.\n");
+ goto end;
+ }
+
+ printf("---------------------- END --------------------------\n");
+ printf("------------------- OPTIMIZED -----------------------\n");
+
+ if (!TestPrimitiveYUV(prims, roi, TRUE))
+ {
+ printf("TestPrimitiveYUV (444) failed.\n");
+ goto end;
+ }
+
+ printf("---------------------- END --------------------------\n");
+ printf("-------------------- GENERIC ------------------------\n");
+
+ if (!TestPrimitiveYUV(generic, roi, FALSE))
+ {
+ printf("TestPrimitiveYUV (420) failed.\n");
+ goto end;
+ }
+
+ printf("---------------------- END --------------------------\n");
+ printf("------------------- OPTIMIZED -----------------------\n");
+
+ if (!TestPrimitiveYUV(prims, roi, FALSE))
+ {
+ printf("TestPrimitiveYUV (420) failed.\n");
+ goto end;
+ }
+
+ printf("---------------------- END --------------------------\n");
+ printf("-------------------- GENERIC ------------------------\n");
+
+ if (!TestPrimitiveYUVCombine(generic, roi))
+ {
+ printf("TestPrimitiveYUVCombine failed.\n");
+ goto end;
+ }
+
+ printf("---------------------- END --------------------------\n");
+ printf("------------------- OPTIMIZED -----------------------\n");
+
+ if (!TestPrimitiveYUVCombine(prims, roi))
+ {
+ printf("TestPrimitiveYUVCombine failed.\n");
+ goto end;
+ }
+
+ printf("---------------------- END --------------------------\n");
+ printf("------------------- OPTIMIZED -----------------------\n");
+
+ if (!TestPrimitiveRgbToLumaChroma(prims, roi, 1))
+ {
+ printf("TestPrimitiveRgbToLumaChroma failed.\n");
+ goto end;
+ }
+
+ printf("---------------------- END --------------------------\n");
+		printf("------------------- OPTIMIZED -----------------------\n");
+
+		if (!TestPrimitiveRgbToLumaChroma(prims, roi, 2))
+		{
+			printf("TestPrimitiveRgbToLumaChroma failed.\n");
+ goto end;
+ }
+
+ printf("---------------------- END --------------------------\n");
+ }
+
+ rc = 0;
+end:
+ return rc;
+}
diff --git a/libfreerdp/primitives/test/measure.h b/libfreerdp/primitives/test/measure.h
new file mode 100644
index 0000000..ee04abd
--- /dev/null
+++ b/libfreerdp/primitives/test/measure.h
@@ -0,0 +1,145 @@
+/* measure.h
+ * Macros to help with performance measurement.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License. Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ * MEASURE_LOOP_START("measurement", 2000)
+ * code to be measured
+ * MEASURE_LOOP_STOP
+ * buffer flush and such
+ * MEASURE_SHOW_RESULTS
+ *
+ * Define GOOGLE_PROFILER if you want gperftools included.
+ */
+
+#ifndef TEST_MEASURE_H_INCLUDED
+#define TEST_MEASURE_H_INCLUDED
+
+#include <freerdp/config.h>
+
+#include <time.h>
+#include <winpr/string.h>
+
+#ifndef _WIN32
+#include <sys/param.h>
+#endif
+
+#include <winpr/crt.h>
+
+#ifdef _WIN32
+
+#define PROFILER_START(_prefix_)
+#define PROFILER_STOP
+
+#define MEASURE_LOOP_START(_prefix_, _count_)
+#define MEASURE_LOOP_STOP
+#define MEASURE_GET_RESULTS(_result_)
+#define MEASURE_SHOW_RESULTS(_result_)
+#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_)
+#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_)
+
+#else
+
+#ifdef GOOGLE_PROFILER
+#include <gperftools/profiler.h>
+#define PROFILER_START(_prefix_) \
+ do \
+ { \
+ char _path[PATH_MAX]; \
+ sprintf_s(_path, sizeof(_path), "./%s.prof", (_prefix_)); \
+ ProfilerStart(_path); \
+	} while (0)
+#define PROFILER_STOP \
+ do \
+ { \
+ ProfilerStop(); \
+	} while (0)
+#else
+#define PROFILER_START(_prefix_)
+#define PROFILER_STOP
+#endif // GOOGLE_PROFILER
+
+extern float _delta_time(const struct timespec* t0, const struct timespec* t1);
+extern void _floatprint(float t, char* output);
+
+#ifndef CLOCK_MONOTONIC_RAW
+#define CLOCK_MONOTONIC_RAW 4
+#endif // !CLOCK_MONOTONIC_RAW
+
+#define MEASURE_LOOP_START(_prefix_, _count_) \
+ { \
+ struct timespec _start, _stop; \
+ char* _prefix; \
+ int _count = (_count_); \
+ int _loop; \
+ float _delta; \
+ char _str1[32], _str2[32]; \
+ _prefix = _strdup(_prefix_); \
+ _str1[0] = '\0'; \
+ _str2[0] = '\0'; \
+ clock_gettime(CLOCK_MONOTONIC_RAW, &_start); \
+ PROFILER_START(_prefix); \
+ _loop = (_count); \
+ do \
+ {
+
+#define MEASURE_LOOP_STOP \
+ } \
+ while (--_loop) \
+ ;
+
+#define MEASURE_GET_RESULTS(_result_) \
+ PROFILER_STOP; \
+ clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
+ _delta = _delta_time(&_start, &_stop); \
+ (_result_) = (float)_count / _delta; \
+ free(_prefix); \
+ }
+
+#define MEASURE_SHOW_RESULTS(_result_) \
+ PROFILER_STOP; \
+ clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
+ _delta = _delta_time(&_start, &_stop); \
+ (_result_) = (float)_count / _delta; \
+ _floatprint((float)_count / _delta, _str1); \
+ printf("%s: %9d iterations in %5.1f seconds = %s/s \n", _prefix, _count, _delta, _str1); \
+ free(_prefix); \
+ }
+
+#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_) \
+ PROFILER_STOP; \
+ clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
+ _delta = _delta_time(&_start, &_stop); \
+ _floatprint((float)_count / _delta, _str1); \
+ _floatprint((float)_count / _delta * (_scale_), _str2); \
+ printf("%s: %9d iterations in %5.1f seconds = %s/s = %s%s \n", _prefix, _count, _delta, _str1, \
+ _str2, _label_); \
+ free(_prefix); \
+ }
+
+#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_) \
+ { \
+ float _r; \
+ MEASURE_LOOP_START(_label_, _init_iter_); \
+ _call_; \
+ MEASURE_LOOP_STOP; \
+ MEASURE_GET_RESULTS(_r); \
+		MEASURE_LOOP_START(_label_, _r * _test_time_);                  \
+ _call_; \
+ MEASURE_LOOP_STOP; \
+ MEASURE_SHOW_RESULTS(_result_); \
+ }
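+/* Example usage (sketch; dst/src/len are placeholders):
+ *
+ *   float rate = 0.0f;
+ *   MEASURE_TIMED("memcpy", 1000, 2.0f, rate, memcpy(dst, src, len));
+ *
+ * runs 1000 calibration iterations, then repeats the call for roughly
+ * two seconds and leaves the measured iterations/second in rate. */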
+
+#endif
+
+#endif // TEST_MEASURE_H_INCLUDED
diff --git a/libfreerdp/primitives/test/prim_test.c b/libfreerdp/primitives/test/prim_test.c
new file mode 100644
index 0000000..ede8316
--- /dev/null
+++ b/libfreerdp/primitives/test/prim_test.c
@@ -0,0 +1,109 @@
+/* prim_test.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include "prim_test.h"
+
+#ifndef _WIN32
+#include <fcntl.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif
+
+#include <winpr/sysinfo.h>
+#include <winpr/platform.h>
+#include <winpr/crypto.h>
+
+primitives_t* generic = NULL;
+primitives_t* optimized = NULL;
+BOOL g_TestPrimitivesPerformance = FALSE;
+UINT32 g_Iterations = 1000;
+
+int test_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 };
+
+/* ------------------------------------------------------------------------- */
+
+#ifdef _WIN32
+float _delta_time(const struct timespec* t0, const struct timespec* t1)
+{
+ return 0.0f;
+}
+#else
+float _delta_time(const struct timespec* t0, const struct timespec* t1)
+{
+ INT64 secs = (INT64)(t1->tv_sec) - (INT64)(t0->tv_sec);
+ long nsecs = t1->tv_nsec - t0->tv_nsec;
+ double retval = NAN;
+
+ if (nsecs < 0)
+ {
+ --secs;
+ nsecs += 1000000000;
+ }
+
+ retval = (double)secs + (double)nsecs / (double)1000000000.0;
+ return (retval < 0.0) ? 0.0 : (float)retval;
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+void _floatprint(float t, char* output)
+{
+ /* I don't want to link against -lm, so avoid log,exp,... */
+ float f = 10.0;
+ int i = 0;
+
+ while (t > f)
+ f *= 10.0;
+
+ f /= 1000.0;
+ i = ((int)(t / f + 0.5f)) * (int)f;
+
+ if (t < 0.0f)
+ sprintf(output, "%f", t);
+ else if (i == 0)
+ sprintf(output, "%d", (int)(t + 0.5f));
+ else if (t < 1e+3f)
+ sprintf(output, "%3d", i);
+ else if (t < 1e+6f)
+ sprintf(output, "%3d,%03d", i / 1000, i % 1000);
+ else if (t < 1e+9f)
+ sprintf(output, "%3d,%03d,000", i / 1000000, (i % 1000000) / 1000);
+ else if (t < 1e+12f)
+ sprintf(output, "%3d,%03d,000,000", i / 1000000000, (i % 1000000000) / 1000000);
+ else
+ sprintf(output, "%f", t);
+}
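+
+/* For example, _floatprint(1234567.0f, buf) writes "  1,230,000": the
+ * value is rounded to three significant digits and grouped with commas,
+ * all without linking against libm. */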
+
+void prim_test_setup(BOOL performance)
+{
+ generic = primitives_get_generic();
+ optimized = primitives_get();
+ g_TestPrimitivesPerformance = performance;
+}
+
+BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, pstatus_t (*fkt_generic)(),
+ pstatus_t (*optimised)(), ...)
+{
+ if (!name || !generic || !optimised || (iterations == 0))
+ return FALSE;
+
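+	/* NOTE: this helper is currently a stub; the timing loop below has an
+	 * empty body and the individual tests measure via the PROFILER_*
+	 * macros instead. */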
+ for (UINT32 i = 0; i < iterations; i++)
+ {
+ }
+
+ return TRUE;
+}
diff --git a/libfreerdp/primitives/test/prim_test.h b/libfreerdp/primitives/test/prim_test.h
new file mode 100644
index 0000000..3642f51
--- /dev/null
+++ b/libfreerdp/primitives/test/prim_test.h
@@ -0,0 +1,59 @@
+/* prim_test.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License. Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ */
+
+#ifndef FREERDP_LIB_PRIMTEST_H
+#define FREERDP_LIB_PRIMTEST_H
+
+#include <winpr/crt.h>
+#include <winpr/spec.h>
+#include <winpr/wtypes.h>
+#include <winpr/platform.h>
+#include <winpr/crypto.h>
+
+#include <freerdp/primitives.h>
+
+#include "measure.h"
+
+#ifdef WITH_IPP
+#include <ipps.h>
+#include <ippi.h>
+#endif
+
+#ifdef _WIN32
+#define ALIGN(x) x
+#else
+#define ALIGN(x) x DECLSPEC_ALIGN(MEMORY_ALLOCATION_ALIGNMENT)
+#endif
+
+#define ABS(_x_) ((_x_) < 0 ? (-(_x_)) : (_x_))
+#define MAX_TEST_SIZE 4096
+
+extern int test_sizes[];
+#define NUM_TEST_SIZES 10
+
+extern BOOL g_TestPrimitivesPerformance;
+extern UINT32 g_Iterations;
+
+extern primitives_t* generic;
+extern primitives_t* optimized;
+
+void prim_test_setup(BOOL performance);
+
+typedef pstatus_t (*speed_test_fkt)();
+
+BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, speed_test_fkt generic,
+ speed_test_fkt optimized, ...);
+
+#endif /* FREERDP_LIB_PRIMTEST_H */