Diffstat:
 -rw-r--r--  libfreerdp/primitives/prim_templates.h  444
 1 file changed, 444 insertions(+), 0 deletions(-)
diff --git a/libfreerdp/primitives/prim_templates.h b/libfreerdp/primitives/prim_templates.h
new file mode 100644
index 0000000..5ab85a8
--- /dev/null
+++ b/libfreerdp/primitives/prim_templates.h
@@ -0,0 +1,444 @@
+/* prim_templates.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License. Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ */
+
+#ifdef __GNUC__
+#pragma once
+#endif
+
+#ifndef FREERDP_LIB_PRIM_TEMPLATES_H
+#define FREERDP_LIB_PRIM_TEMPLATES_H
+
+/* These are templates for SSE (and potentially NEON) routines that apply a
+ * simple SIMD operation over an array of data.  Since so much of this code
+ * is shared except for the operation itself, these templates are used
+ * rather than duplicating code.  The naming convention encodes the
+ * parameters: S=Source param; C=Constant; D=Destination.
+ * Every macro takes a fallback procedure, used when the data is too small
+ * or misaligned, and a scalar operation done "the slow way", used at the
+ * unaligned edges around 16-byte boundaries.
+ */
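+
+/* For example (the name here is hypothetical): an "SCD" routine such as
+ * sse2_lShiftC_16s reads from a Source, applies a Constant, and writes to
+ * a Destination, so it would be generated with SSE3_SCD_ROUTINE below.
+ */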
+
+/* SSE3 note: if someone needs to support an SSE2-only version of these
+ * (without SSE3), an alternative version could be added that merely checks
+ * that 16-byte alignment can be achieved on both the destination and the
+ * source(s), rather than using LDDQU for unaligned reads.
+ */
+
+/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
+ * It can't easily do that if the value is stored in a variable,
+ * so don't save it as an intermediate value.
+ */
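+
+/* LOAD_SI128, used in the single-register loops below, is assumed to be
+ * provided by the including file.  A minimal fallback -- an assumption
+ * added here for self-containment, not part of the original template --
+ * is the SSE3 unaligned load, which also tolerates a source pointer that
+ * is still unaligned at that stage:
+ */
+#ifndef LOAD_SI128
+#define LOAD_SI128(_ptr_) _mm_lddqu_si128((const __m128i*)(_ptr_))
+#endif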
+
+/* ----------------------------------------------------------------------------
+ * SCD = Source, Constant, Destination
+ */
+#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+ static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \
+ { \
+ INT32 shifts = 0; \
+ UINT32 offBeatMask; \
+ const _type_* sptr = pSrc; \
+ _type_* dptr = pDst; \
+ int count; \
+ if (val == 0) \
+ return PRIMITIVES_SUCCESS; \
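+        /* Note: counts of 16 or more are rejected, presumably because    \
+         * these SCD routines are used for shifts of 16-bit elements,     \
+         * where such a count is out of range.                            \
+         */                                                               \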
+ if (val >= 16) \
+ return -1; \
+ if (len < 16) /* pointless if too small */ \
+ { \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ if (sizeof(_type_) == 1) \
+ shifts = 1; \
+ else if (sizeof(_type_) == 2) \
+ shifts = 2; \
+ else if (sizeof(_type_) == 4) \
+ shifts = 3; \
+ else if (sizeof(_type_) == 8) \
+ shifts = 4; \
+ offBeatMask = (1 << (shifts - 1)) - 1; \
+ if ((ULONG_PTR)pDst & offBeatMask) \
+ { \
+            /* pDst is misaligned for the element size; stepping by       \
+             * elements would skip over every 16-byte boundary. */        \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ /* Get to the 16-byte boundary now. */ \
+ while ((ULONG_PTR)dptr & 0x0f) \
+ { \
+ _slowWay_; \
+ if (--len == 0) \
+ return PRIMITIVES_SUCCESS; \
+ } \
+ /* Use 8 128-bit SSE registers. */ \
+ count = len >> (8 - shifts); \
+ len -= count << (8 - shifts); \
+        if ((ULONG_PTR)sptr & 0x0f)                                       \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, val); \
+ xmm1 = _op_(xmm1, val); \
+ xmm2 = _op_(xmm2, val); \
+ xmm3 = _op_(xmm3, val); \
+ xmm4 = _op_(xmm4, val); \
+ xmm5 = _op_(xmm5, val); \
+ xmm6 = _op_(xmm6, val); \
+ xmm7 = _op_(xmm7, val); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm5); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm6); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm7); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ else \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, val); \
+ xmm1 = _op_(xmm1, val); \
+ xmm2 = _op_(xmm2, val); \
+ xmm3 = _op_(xmm3, val); \
+ xmm4 = _op_(xmm4, val); \
+ xmm5 = _op_(xmm5, val); \
+ xmm6 = _op_(xmm6, val); \
+ xmm7 = _op_(xmm7, val); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm5); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm6); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm7); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ /* Use a single 128-bit SSE register. */ \
+ count = len >> (5 - shifts); \
+ len -= count << (5 - shifts); \
+ while (count--) \
+ { \
+ __m128i xmm0 = LOAD_SI128(sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, val); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ /* Finish off the remainder. */ \
+ while (len--) \
+ { \
+ _slowWay_; \
+ } \
+ return PRIMITIVES_SUCCESS; \
+ }
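+
+/* Illustrative use only -- the names below are hypothetical, modeled on
+ * FreeRDP's shift primitives.  This expands the SCD template into a
+ * 16-bit left-shift routine with a generic fallback and a scalar edge
+ * operation.  _mm_slli_epi16 takes the shift count as a plain integer,
+ * which is why the SCD form passes val directly to _op_:
+ *
+ *   SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s,
+ *                    _mm_slli_epi16, *dptr++ = *sptr++ << val)
+ */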
+
+/* ----------------------------------------------------------------------------
+ * SCD = Source, Constant, Destination
+ * PRE = preload xmm0 with the constant.
+ */
+#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+ static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 len) \
+ { \
+ int shifts = 0; \
+ UINT32 offBeatMask; \
+ const _type_* sptr = pSrc; \
+ _type_* dptr = pDst; \
+ size_t count; \
+ __m128i xmm0; \
+ if (len < 16) /* pointless if too small */ \
+ { \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ if (sizeof(_type_) == 1) \
+ shifts = 1; \
+ else if (sizeof(_type_) == 2) \
+ shifts = 2; \
+ else if (sizeof(_type_) == 4) \
+ shifts = 3; \
+ else if (sizeof(_type_) == 8) \
+ shifts = 4; \
+ offBeatMask = (1 << (shifts - 1)) - 1; \
+ if ((ULONG_PTR)pDst & offBeatMask) \
+ { \
+            /* pDst is misaligned for the element size; stepping by       \
+             * elements would skip over every 16-byte boundary. */        \
+ return _fallback_(pSrc, val, pDst, len); \
+ } \
+ /* Get to the 16-byte boundary now. */ \
+ while ((ULONG_PTR)dptr & 0x0f) \
+ { \
+ _slowWay_; \
+ if (--len == 0) \
+ return PRIMITIVES_SUCCESS; \
+ } \
+ /* Use 4 128-bit SSE registers. */ \
+ count = len >> (7 - shifts); \
+ len -= count << (7 - shifts); \
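+        /* Note: _mm_set1_epi32 broadcasts a 32-bit pattern, so this form \
+         * fits 32-bit element types directly; for narrower types the     \
+         * caller's _op_ must tolerate the 32-bit replication.            \
+         */                                                               \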
+ xmm0 = _mm_set1_epi32(val); \
+        if ((ULONG_PTR)sptr & 0x0f)                                       \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm1 = _op_(xmm1, xmm0); \
+ xmm2 = _op_(xmm2, xmm0); \
+ xmm3 = _op_(xmm3, xmm0); \
+ xmm4 = _op_(xmm4, xmm0); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ else \
+ { \
+ while (count--) \
+ { \
+ __m128i xmm1 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_load_si128((const __m128i*)sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm1 = _op_(xmm1, xmm0); \
+ xmm2 = _op_(xmm2, xmm0); \
+ xmm3 = _op_(xmm3, xmm0); \
+ xmm4 = _op_(xmm4, xmm0); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm4); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ /* Use a single 128-bit SSE register. */ \
+ count = len >> (5 - shifts); \
+ len -= count << (5 - shifts); \
+ while (count--) \
+ { \
+ __m128i xmm1 = LOAD_SI128(sptr); \
+ sptr += (16 / sizeof(_type_)); \
+ xmm1 = _op_(xmm1, xmm0); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ /* Finish off the remainder. */ \
+ while (len--) \
+ { \
+ _slowWay_; \
+ } \
+ return PRIMITIVES_SUCCESS; \
+ }
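+
+/* Illustrative use only -- hypothetical names, modeled on FreeRDP's
+ * bitwise primitives.  The PRE form suits operations whose second operand
+ * must be a register, e.g. ANDing every element with a 32-bit constant:
+ *
+ *   SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
+ *                        _mm_and_si128, *dptr++ = *sptr++ & val)
+ */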
+
+/* ----------------------------------------------------------------------------
+ * SSD = Source1, Source2, Destination
+ */
+#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+ static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \
+ { \
+ int shifts = 0; \
+ UINT32 offBeatMask; \
+ const _type_* sptr1 = pSrc1; \
+ const _type_* sptr2 = pSrc2; \
+ _type_* dptr = pDst; \
+ size_t count; \
+ if (len < 16) /* pointless if too small */ \
+ { \
+ return _fallback_(pSrc1, pSrc2, pDst, len); \
+ } \
+ if (sizeof(_type_) == 1) \
+ shifts = 1; \
+ else if (sizeof(_type_) == 2) \
+ shifts = 2; \
+ else if (sizeof(_type_) == 4) \
+ shifts = 3; \
+ else if (sizeof(_type_) == 8) \
+ shifts = 4; \
+ offBeatMask = (1 << (shifts - 1)) - 1; \
+ if ((ULONG_PTR)pDst & offBeatMask) \
+ { \
+            /* pDst is misaligned for the element size; stepping by       \
+             * elements would skip over every 16-byte boundary. */        \
+ return _fallback_(pSrc1, pSrc2, pDst, len); \
+ } \
+ /* Get to the 16-byte boundary now. */ \
+ while ((ULONG_PTR)dptr & 0x0f) \
+ { \
+ pstatus_t status; \
+ status = _slowWay_; \
+ if (status != PRIMITIVES_SUCCESS) \
+ return status; \
+ if (--len == 0) \
+ return PRIMITIVES_SUCCESS; \
+ } \
+ /* Use 4 128-bit SSE registers. */ \
+ count = len >> (7 - shifts); \
+ len -= count << (7 - shifts); \
+        if (((ULONG_PTR)sptr1 & 0x0f) || ((ULONG_PTR)sptr2 & 0x0f))       \
+ { \
+ /* Unaligned loads */ \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_lddqu_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_lddqu_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, xmm4); \
+ xmm1 = _op_(xmm1, xmm5); \
+ xmm2 = _op_(xmm2, xmm6); \
+ xmm3 = _op_(xmm3, xmm7); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ else \
+ { \
+ /* Aligned loads */ \
+ while (count--) \
+ { \
+ __m128i xmm0 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm1 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm2 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm3 = _mm_load_si128((const __m128i*)sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm4 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm5 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm6 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ __m128i xmm7 = _mm_load_si128((const __m128i*)sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, xmm4); \
+ xmm1 = _op_(xmm1, xmm5); \
+ xmm2 = _op_(xmm2, xmm6); \
+ xmm3 = _op_(xmm3, xmm7); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm1); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm2); \
+ dptr += (16 / sizeof(_type_)); \
+ _mm_store_si128((__m128i*)dptr, xmm3); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ } \
+ /* Use a single 128-bit SSE register. */ \
+ count = len >> (5 - shifts); \
+ len -= count << (5 - shifts); \
+ while (count--) \
+ { \
+ __m128i xmm0 = LOAD_SI128(sptr1); \
+ sptr1 += (16 / sizeof(_type_)); \
+ __m128i xmm1 = LOAD_SI128(sptr2); \
+ sptr2 += (16 / sizeof(_type_)); \
+ xmm0 = _op_(xmm0, xmm1); \
+ _mm_store_si128((__m128i*)dptr, xmm0); \
+ dptr += (16 / sizeof(_type_)); \
+ } \
+ /* Finish off the remainder. */ \
+    while (len--)                                                         \
+    {                                                                     \
+        pstatus_t status = _slowWay_;                                     \
+        if (status != PRIMITIVES_SUCCESS)                                 \
+            return status;                                                \
+    }                                                                     \
+ return PRIMITIVES_SUCCESS; \
+ }
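+
+/* Illustrative use only -- hypothetical names, modeled on FreeRDP's add
+ * primitive.  For SSD the "slow way" is an expression yielding a
+ * pstatus_t, here a one-element call through the scalar fallback:
+ *
+ *   SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
+ *                    _mm_adds_epi16,
+ *                    general_add_16s(sptr1++, sptr2++, dptr++, 1))
+ */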
+
+#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */