From d8bbc7858622b6d9c278469aab701ca0b609cddf Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Wed, 15 May 2024 05:35:49 +0200
Subject: Merging upstream version 126.0.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 third_party/aom/aom_dsp/arm/mem_neon.h | 56 +++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 7 deletions(-)

(limited to 'third_party/aom/aom_dsp/arm/mem_neon.h')

diff --git a/third_party/aom/aom_dsp/arm/mem_neon.h b/third_party/aom/aom_dsp/arm/mem_neon.h
index 52c7a34e3e..32a462a186 100644
--- a/third_party/aom/aom_dsp/arm/mem_neon.h
+++ b/third_party/aom/aom_dsp/arm/mem_neon.h
@@ -56,17 +56,10 @@ static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
 
 #elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
 #if __GNUC__ < 8
-
 static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
   uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
   return res;
 }
-
-static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
-  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
-                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
-  return res;
-}
 #endif  // __GNUC__ < 8
 
 #if __GNUC__ < 9
@@ -76,6 +69,15 @@ static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
   return res;
 }
 #endif  // __GNUC__ < 9
+
+// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards.
+#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+  return res;
+}
+#endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
 #endif  // defined(__GNUC__) && !defined(__clang__)
 
 static INLINE void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
@@ -457,6 +459,16 @@ static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
   *s3 = vld1_s16(s);
 }
 
+static INLINE void load_s16_4x3(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+}
+
 static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                 const uint8x8_t s1, const uint8x8_t s2,
                                 const uint8x8_t s3, const uint8x8_t s4,
@@ -525,6 +537,16 @@ static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
   vst1q_u16(s, s7);
 }
 
+static INLINE void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x4_t s0, const uint16x4_t s1,
+                                 const uint16x4_t s2) {
+  vst1_u16(s, s0);
+  s += dst_stride;
+  vst1_u16(s, s1);
+  s += dst_stride;
+  vst1_u16(s, s2);
+}
+
 static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x4_t s0, const uint16x4_t s1,
                                  const uint16x4_t s2, const uint16x4_t s3) {
@@ -544,6 +566,16 @@ static INLINE void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
   vst1q_u16(s, s1);
 }
 
+static INLINE void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2) {
+  vst1q_u16(s, s0);
+  s += dst_stride;
+  vst1q_u16(s, s1);
+  s += dst_stride;
+  vst1q_u16(s, s2);
+}
+
 static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x8_t s0, const uint16x8_t s1,
                                  const uint16x8_t s2, const uint16x8_t s3) {
@@ -857,6 +889,16 @@ static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
   *s3 = vld1q_s16(s);
 }
 
+static INLINE void load_s16_8x3(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+}
+
 // Load 2 sets of 4 bytes when alignment is not guaranteed.
 static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
   uint32_t a;
-- 
cgit v1.2.3