author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 01:24:41 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 01:24:41 +0000
commit    a9bcc81f821d7c66f623779fa5147e728eb3c388 (patch)
tree      98676963bcdd537ae5908a067a8eb110b93486a6 /libfreerdp/primitives/prim_templates.h
parent    Initial commit. (diff)
Adding upstream version 3.3.0+dfsg1. (upstream/3.3.0+dfsg1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'libfreerdp/primitives/prim_templates.h')
-rw-r--r--  libfreerdp/primitives/prim_templates.h  444
1 file changed, 444 insertions(+), 0 deletions(-)
diff --git a/libfreerdp/primitives/prim_templates.h b/libfreerdp/primitives/prim_templates.h
new file mode 100644
index 0000000..5ab85a8
--- /dev/null
+++ b/libfreerdp/primitives/prim_templates.h
@@ -0,0 +1,444 @@
+/* prim_templates.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License. Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ */
+
+#ifdef __GNUC__
+#pragma once
+#endif
+
+#ifndef FREERDP_LIB_PRIM_TEMPLATES_H
+#define FREERDP_LIB_PRIM_TEMPLATES_H
+
+/* These are templates for SSE (potentially NEON) routines that apply a
+ * simple SIMD operation over an array of data. Since so much of this
+ * code is shared except for the operation itself, these macros are used
+ * rather than duplicating code. The naming convention encodes the
+ * parameters: S=Source param; C=Constant; D=Destination.
+ * All the macros take parameters for a fallback procedure (used if the
+ * data is too small) and an operation done "the slow way" (used at
+ * 16-byte edges).
+ */
+
+/* SSE3 note: If someone needs to support an SSE2 version of these without
+ * SSE3 support, an alternative version could be added that merely checks
+ * that 16-byte alignment on both destination and source(s) can be
+ * achieved, rather than use LDDQU for unaligned reads.
+ */
+
+/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
+ * It can't easily do that if the value is stored in a variable,
+ * so don't save it as an intermediate value.
+ */
+
+/* ----------------------------------------------------------------------------
+ * SCD = Source, Constant, Destination
+ */
+#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+ static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \
+ { \
+ INT32 shifts = 0; \
+ UINT32 offBeatMask; \
+ const _type_* sptr = pSrc; \
+ _type_* dptr = pDst; \
+ int count; \
+ if (val == 0) \
+ return PRIMITIVES_SUCCESS; \
+ if (val >= 16) \
+ return -1; \
+ if (len < 16) /* pointless if too small */ \
+ { \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ if (sizeof(_type_) == 1) \
+ shifts = 1; \
+ else if (sizeof(_type_) == 2) \
+ shifts = 2; \
+ else if (sizeof(_type_) == 4) \
+ shifts = 3; \
+ else if (sizeof(_type_) == 8) \
+ shifts = 4; \
+ offBeatMask = (1 << (shifts - 1)) - 1; \
+ if ((ULONG_PTR)pDst & offBeatMask) \
+ { \
+ /* Incrementing the pointer skips over 16-byte boundary. */ \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ /* Get to the 16-byte boundary now. */ \
+ while ((ULONG_PTR)dptr & 0x0f) \
+ { \
+ _slowWay_; \
+ if (--len == 0) \
+ return PRIMITIVES_SUCCESS; \
+ } \
+ /* Use 8 128-bit SSE registers. */ \
+ count = len >> (8 - shifts); \
+ len -= count << (8 - shifts); \
+ if ((const ULONG_PTR)sptr & 0x0f) \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, val); \
+ xmm1 = _op_(xmm1, val); \
+ xmm2 = _op_(xmm2, val); \
+ xmm3 = _op_(xmm3, val); \
+ xmm4 = _op_(xmm4, val); \
+ xmm5 = _op_(xmm5, val); \
+ xmm6 = _op_(xmm6, val); \
+ xmm7 = _op_(xmm7, val); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm5); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm6); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm7); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ else \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, val); \
+ xmm1 = _op_(xmm1, val); \
+ xmm2 = _op_(xmm2, val); \
+ xmm3 = _op_(xmm3, val); \
+ xmm4 = _op_(xmm4, val); \
+ xmm5 = _op_(xmm5, val); \
+ xmm6 = _op_(xmm6, val); \
+ xmm7 = _op_(xmm7, val); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm5); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm6); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm7); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ /* Use a single 128-bit SSE register. */ \
+ count = len >> (5 - shifts); \
+ len -= count << (5 - shifts); \
+ while (count--) \
+ { \
+ __m128i xmm0 = LOAD_SI128(sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, val); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ /* Finish off the remainder. */ \
+ while (len--) \
+ { \
+ _slowWay_; \
+ } \
+ return PRIMITIVES_SUCCESS; \
+ }
+
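+/* Illustrative use of SSE3_SCD_ROUTINE (editor's sketch, not part of the
+ * upstream file): a constant left-shift over an array of UINT16. The names
+ * sse3_lShiftC_16u_example and generic_lShiftC_16u are hypothetical; real
+ * instantiations live in the SSE-optimized primitives sources.
+ *
+ *   SSE3_SCD_ROUTINE(sse3_lShiftC_16u_example, UINT16,
+ *                    generic_lShiftC_16u,                // fallback for small/misaligned data
+ *                    _mm_slli_epi16,                      // _op_: shift each 16-bit lane left by val
+ *                    *dptr++ = (UINT16)(*sptr++ << val))  // _slowWay_: one element per pass
+ *
+ * This expands to a static pstatus_t function taking
+ * (const UINT16* pSrc, UINT32 val, UINT16* pDst, UINT32 len).
+ */
+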
+/* ----------------------------------------------------------------------------
+ * SCD = Source, Constant, Destination
+ * PRE = preload xmm0 with the constant.
+ */
+#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+ static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 len) \
+ { \
+ int shifts = 0; \
+ UINT32 offBeatMask; \
+ const _type_* sptr = pSrc; \
+ _type_* dptr = pDst; \
+ size_t count; \
+ __m128i xmm0; \
+ if (len < 16) /* pointless if too small */ \
+ { \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ if (sizeof(_type_) == 1) \
+ shifts = 1; \
+ else if (sizeof(_type_) == 2) \
+ shifts = 2; \
+ else if (sizeof(_type_) == 4) \
+ shifts = 3; \
+ else if (sizeof(_type_) == 8) \
+ shifts = 4; \
+ offBeatMask = (1 << (shifts - 1)) - 1; \
+ if ((ULONG_PTR)pDst & offBeatMask) \
+ { \
+ /* Incrementing the pointer skips over 16-byte boundary. */ \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ /* Get to the 16-byte boundary now. */ \
+ while ((ULONG_PTR)dptr & 0x0f) \
+ { \
+ _slowWay_; \
+ if (--len == 0) \
+ return PRIMITIVES_SUCCESS; \
+ } \
+ /* Use 4 128-bit SSE registers. */ \
+ count = len >> (7 - shifts); \
+ len -= count << (7 - shifts); \
+ xmm0 = _mm_set1_epi32(val); \
+ if ((const ULONG_PTR)sptr & 0x0f) \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm1 = _op_(xmm1, xmm0); \
+ xmm2 = _op_(xmm2, xmm0); \
+ xmm3 = _op_(xmm3, xmm0); \
+ xmm4 = _op_(xmm4, xmm0); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ else \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm1 = _op_(xmm1, xmm0); \
+ xmm2 = _op_(xmm2, xmm0); \
+ xmm3 = _op_(xmm3, xmm0); \
+ xmm4 = _op_(xmm4, xmm0); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ /* Use a single 128-bit SSE register. */ \
+ count = len >> (5 - shifts); \
+ len -= count << (5 - shifts); \
+ while (count--) \
+ { \
+ __m128i xmm1 = LOAD_SI128(sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm1 = _op_(xmm1, xmm0); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ /* Finish off the remainder. */ \
+ while (len--) \
+ { \
+ _slowWay_; \
+ } \
+ return PRIMITIVES_SUCCESS; \
+ }
+
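+/* Illustrative use of SSE3_SCD_PRE_ROUTINE (editor's sketch, not part of the
+ * upstream file): AND each UINT32 element with a constant. Because the
+ * constant is preloaded into xmm0 with _mm_set1_epi32, this variant suits
+ * 32-bit element operations such as AND/OR with a constant. The names
+ * sse3_andC_32u_example and generic_andC_32u are hypothetical.
+ *
+ *   SSE3_SCD_PRE_ROUTINE(sse3_andC_32u_example, UINT32,
+ *                        generic_andC_32u,         // fallback for small/misaligned data
+ *                        _mm_and_si128,            // _op_: bitwise AND with the preloaded constant
+ *                        *dptr++ = *sptr++ & val)  // _slowWay_: one element per pass
+ */
+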
+/* ----------------------------------------------------------------------------
+ * SSD = Source1, Source2, Destination
+ */
+#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+ static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \
+ { \
+ int shifts = 0; \
+ UINT32 offBeatMask; \
+ const _type_* sptr1 = pSrc1; \
+ const _type_* sptr2 = pSrc2; \
+ _type_* dptr = pDst; \
+ size_t count; \
+ if (len < 16) /* pointless if too small */ \
+ { \
+ return _fallback_(pSrc1, pSrc2, pDst, len); \
+ } \
+ if (sizeof(_type_) == 1) \
+ shifts = 1; \
+ else if (sizeof(_type_) == 2) \
+ shifts = 2; \
+ else if (sizeof(_type_) == 4) \
+ shifts = 3; \
+ else if (sizeof(_type_) == 8) \
+ shifts = 4; \
+ offBeatMask = (1 << (shifts - 1)) - 1; \
+ if ((ULONG_PTR)pDst & offBeatMask) \
+ { \
+ /* Incrementing the pointer skips over 16-byte boundary. */ \
+ return _fallback_(pSrc1, pSrc2, pDst, len); \
+ } \
+ /* Get to the 16-byte boundary now. */ \
+ while ((ULONG_PTR)dptr & 0x0f) \
+ { \
+ pstatus_t status; \
+ status = _slowWay_; \
+ if (status != PRIMITIVES_SUCCESS) \
+ return status; \
+ if (--len == 0) \
+ return PRIMITIVES_SUCCESS; \
+ } \
+ /* Use 4 128-bit SSE registers. */ \
+ count = len >> (7 - shifts); \
+ len -= count << (7 - shifts); \
+ if (((const ULONG_PTR)sptr1 & 0x0f) || ((const ULONG_PTR)sptr2 & 0x0f)) \
+ { \
+ /* Unaligned loads */ \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, xmm4); \
+ xmm1 = _op_(xmm1, xmm5); \
+ xmm2 = _op_(xmm2, xmm6); \
+ xmm3 = _op_(xmm3, xmm7); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ else \
+ { \
+ /* Aligned loads */ \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, xmm4); \
+ xmm1 = _op_(xmm1, xmm5); \
+ xmm2 = _op_(xmm2, xmm6); \
+ xmm3 = _op_(xmm3, xmm7); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ /* Use a single 128-bit SSE register. */ \
+ count = len >> (5 - shifts); \
+ len -= count << (5 - shifts); \
+ while (count--) \
+ { \
+ __m128i xmm0 = LOAD_SI128(sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm1 = LOAD_SI128(sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, xmm1); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ /* Finish off the remainder. */ \
+ while (len--) \
+ { \
+ _slowWay_; \
+ } \
+ return PRIMITIVES_SUCCESS; \
+ }
+
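+/* Illustrative use of SSE3_SSD_ROUTINE (editor's sketch, not part of the
+ * upstream file): saturated addition of two INT16 arrays. Note that in this
+ * variant _slowWay_ must be an expression yielding a pstatus_t, since the
+ * alignment loop assigns its result to a status variable; a per-element call
+ * into a scalar fallback fits that shape. The names sse3_add_16s_example and
+ * generic_add_16s are hypothetical.
+ *
+ *   SSE3_SSD_ROUTINE(sse3_add_16s_example, INT16,
+ *                    generic_add_16s,                               // fallback for small/misaligned data
+ *                    _mm_adds_epi16,                                // _op_: lane-wise saturated add
+ *                    generic_add_16s(sptr1++, sptr2++, dptr++, 1))  // _slowWay_: one element, returns pstatus_t
+ */
+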
+#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */