Diffstat (limited to 'third_party/dav1d/src/arm/64/looprestoration_tmpl.S')
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration_tmpl.S  597
1 file changed, 597 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/64/looprestoration_tmpl.S b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S
new file mode 100644
index 0000000000..7cdfd6f3f7
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S
@@ -0,0 +1,597 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
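+// SUM_STRIDE is expected to be defined by the bitdepth-specific file that
+// includes this template and instantiates sgr_funcs with bpc set to 8 or 16.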
+.macro sgr_funcs bpc
+// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
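+//
+// First (3x3, radius 1) box filter pass of SGR. For each pixel this roughly
+// computes
+//   A = 4*cross(b) + 3*diag(b)    (cross: centre plus the 4 edge neighbours,
+//   B = 4*cross(a) + 3*diag(a)     diag: the 4 corner neighbours)
+//   tmp[x] = (A*src[x] + B + (1 << 8)) >> 9
+// as a sketch of what the vector code below implements.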
+function sgr_finish_filter1_\bpc\()bpc_neon, export=1
+ sub x7, x3, #(4*SUM_STRIDE)
+ add x8, x3, #(4*SUM_STRIDE)
+ sub x9, x4, #(2*SUM_STRIDE)
+ add x10, x4, #(2*SUM_STRIDE)
+ mov x11, #SUM_STRIDE
+ mov x12, #FILTER_OUT_STRIDE
+ add x13, x5, #7
+ bic x13, x13, #7 // Aligned width
+.if \bpc == 8
+ sub x2, x2, x13
+.else
+ sub x2, x2, x13, lsl #1
+.endif
+ sub x12, x12, x13
+ sub x11, x11, x13
+ sub x11, x11, #4 // We read 4 extra elements from a
+ sub x14, x11, #4 // We read 8 extra elements from b
+ mov x13, x5
+ movi v6.8h, #3
+ movi v7.4s, #3
+1:
+ ld1 {v0.8h, v1.8h}, [x9], #32
+ ld1 {v2.8h, v3.8h}, [x4], #32
+ ld1 {v4.8h, v5.8h}, [x10], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
+ ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
+
+2:
+ subs x5, x5, #8
+ ext v25.16b, v0.16b, v1.16b, #2 // -stride
+ ext v26.16b, v2.16b, v3.16b, #2 // 0
+ ext v27.16b, v4.16b, v5.16b, #2 // +stride
+ ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v29.16b, v2.16b, v3.16b, #4 // +1
+ ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
+ add v2.8h, v2.8h, v25.8h // -1, -stride
+ add v26.8h, v26.8h, v27.8h // 0, +stride
+ add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
+ add v2.8h, v2.8h, v26.8h
+ add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
+ add v2.8h, v2.8h, v29.8h // +1
+ add v0.8h, v0.8h, v4.8h
+
+ ext v25.16b, v16.16b, v17.16b, #4 // -stride
+ ext v26.16b, v17.16b, v18.16b, #4
+ shl v2.8h, v2.8h, #2
+ ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v28.16b, v17.16b, v18.16b, #8
+ ext v29.16b, v19.16b, v20.16b, #4 // 0
+ ext v30.16b, v20.16b, v21.16b, #4
+ mla v2.8h, v0.8h, v6.8h // * 3 -> a
+ add v25.4s, v25.4s, v19.4s // -stride, -1
+ add v26.4s, v26.4s, v20.4s
+ add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v28.4s
+ ext v27.16b, v19.16b, v20.16b, #8 // +1
+ ext v28.16b, v20.16b, v21.16b, #8
+ add v16.4s, v16.4s, v22.4s // -1+stride
+ add v17.4s, v17.4s, v23.4s
+ add v29.4s, v29.4s, v27.4s // 0, +1
+ add v30.4s, v30.4s, v28.4s
+ add v25.4s, v25.4s, v29.4s
+ add v26.4s, v26.4s, v30.4s
+ ext v27.16b, v22.16b, v23.16b, #4 // +stride
+ ext v28.16b, v23.16b, v24.16b, #4
+ ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
+ ext v30.16b, v23.16b, v24.16b, #8
+.if \bpc == 8
+ ld1 {v19.8b}, [x1], #8 // src
+.else
+ ld1 {v19.8h}, [x1], #16 // src
+.endif
+ add v25.4s, v25.4s, v27.4s // +stride
+ add v26.4s, v26.4s, v28.4s
+ add v16.4s, v16.4s, v29.4s // +1+stride
+ add v17.4s, v17.4s, v30.4s
+ shl v25.4s, v25.4s, #2
+ shl v26.4s, v26.4s, #2
+ mla v25.4s, v16.4s, v7.4s // * 3 -> b
+ mla v26.4s, v17.4s, v7.4s
+.if \bpc == 8
+ uxtl v19.8h, v19.8b // src
+.endif
+ mov v0.16b, v1.16b
+ umlal v25.4s, v2.4h, v19.4h // b + a * src
+ umlal2 v26.4s, v2.8h, v19.8h
+ mov v2.16b, v3.16b
+ rshrn v25.4h, v25.4s, #9
+ rshrn2 v25.8h, v26.4s, #9
+ mov v4.16b, v5.16b
+ st1 {v25.8h}, [x0], #16
+
+ b.le 3f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ ld1 {v1.8h}, [x9], #16
+ ld1 {v3.8h}, [x4], #16
+ ld1 {v5.8h}, [x10], #16
+ ld1 {v17.4s, v18.4s}, [x7], #32
+ ld1 {v20.4s, v21.4s}, [x3], #32
+ ld1 {v23.4s, v24.4s}, [x8], #32
+ b 2b
+
+3:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x13
+ add x0, x0, x12, lsl #1
+ add x1, x1, x2
+ add x3, x3, x11, lsl #2
+ add x7, x7, x11, lsl #2
+ add x8, x8, x11, lsl #2
+ add x4, x4, x14, lsl #1
+ add x9, x9, x14, lsl #1
+ add x10, x10, x14, lsl #1
+ b 1b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
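+//
+// Second (5x5, radius 2) box filter pass of SGR. The a/b box sums only exist
+// on every second row, so rows are handled in pairs. For a row with sums
+// available one row above and one row below, this roughly computes
+//   A = 6*(b[-stride] + b[+stride]) + 5*(the four diagonal b values)
+//   B = the same weighting applied to a[]
+//   tmp[x] = (A*src[x] + B + (1 << 8)) >> 9
+// as a sketch of the main loop below; the in-between rows are handled by the
+// second loop further down.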
+function sgr_finish_filter2_\bpc\()bpc_neon, export=1
+ add x7, x3, #(4*(SUM_STRIDE))
+ sub x3, x3, #(4*(SUM_STRIDE))
+ add x8, x4, #(2*(SUM_STRIDE))
+ sub x4, x4, #(2*(SUM_STRIDE))
+ mov x9, #(2*SUM_STRIDE)
+ mov x10, #FILTER_OUT_STRIDE
+ add x11, x5, #7
+ bic x11, x11, #7 // Aligned width
+.if \bpc == 8
+ sub x2, x2, x11
+.else
+ sub x2, x2, x11, lsl #1
+.endif
+ sub x10, x10, x11
+ sub x9, x9, x11
+ sub x9, x9, #4 // We read 4 extra elements from a
+ sub x12, x9, #4 // We read 8 extra elements from b
+ mov x11, x5
+ movi v4.8h, #5
+ movi v5.4s, #5
+ movi v6.8h, #6
+ movi v7.4s, #6
+1:
+ ld1 {v0.8h, v1.8h}, [x4], #32
+ ld1 {v2.8h, v3.8h}, [x8], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
+
+2:
+ subs x5, x5, #8
+ ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
+ ext v22.16b, v0.16b, v1.16b, #2 // -stride
+ ext v23.16b, v2.16b, v3.16b, #2 // +stride
+ add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
+ add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
+ add v2.8h, v22.8h, v23.8h // -stride, +stride
+ add v0.8h, v0.8h, v25.8h
+
+ ext v22.16b, v16.16b, v17.16b, #4 // -stride
+ ext v23.16b, v17.16b, v18.16b, #4
+ ext v24.16b, v19.16b, v20.16b, #4 // +stride
+ ext v25.16b, v20.16b, v21.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v27.16b, v17.16b, v18.16b, #8
+ ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
+ ext v29.16b, v20.16b, v21.16b, #8
+ mul v0.8h, v0.8h, v4.8h // * 5
+ mla v0.8h, v2.8h, v6.8h // * 6
+.if \bpc == 8
+ ld1 {v31.8b}, [x1], #8
+.else
+ ld1 {v31.8h}, [x1], #16
+.endif
+ add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v27.4s
+ add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
+ add v20.4s, v20.4s, v29.4s
+ add v16.4s, v16.4s, v19.4s
+ add v17.4s, v17.4s, v20.4s
+
+ add v22.4s, v22.4s, v24.4s // -stride, +stride
+ add v23.4s, v23.4s, v25.4s
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v16.4s, v16.4s, v5.4s // * 5
+ mla v16.4s, v22.4s, v7.4s // * 6
+ mul v17.4s, v17.4s, v5.4s // * 5
+ mla v17.4s, v23.4s, v7.4s // * 6
+
+.if \bpc == 8
+ uxtl v31.8h, v31.8b
+.endif
+ umlal v16.4s, v0.4h, v31.4h // b + a * src
+ umlal2 v17.4s, v0.8h, v31.8h
+ mov v0.16b, v1.16b
+ rshrn v16.4h, v16.4s, #9
+ rshrn2 v16.8h, v17.4s, #9
+ mov v2.16b, v3.16b
+ st1 {v16.8h}, [x0], #16
+
+ b.le 3f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v3.8h}, [x8], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ ld1 {v20.4s, v21.4s}, [x7], #32
+ b 2b
+
+3:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x11
+ add x0, x0, x10, lsl #1
+ add x1, x1, x2
+ add x3, x3, x9, lsl #2
+ add x7, x7, x9, lsl #2
+ add x4, x4, x12, lsl #1
+ add x8, x8, x12, lsl #1
+ mov x13, x3
+ mov x14, x4
+
+ ld1 {v0.8h, v1.8h}, [x4], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
+
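+// The in-between rows are filtered horizontally from a single row of a/b
+// sums: roughly
+//   A = 6*b[x] + 5*(b[x-1] + b[x+1])    (and likewise B from a[])
+//   tmp[x] = (A*src[x] + B + (1 << 7)) >> 8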
+4:
+ subs x5, x5, #8
+ ext v23.16b, v0.16b, v1.16b, #4 // +1
+ ext v22.16b, v0.16b, v1.16b, #2 // 0
+ add v0.8h, v0.8h, v23.8h // -1, +1
+
+ ext v24.16b, v16.16b, v17.16b, #4 // 0
+ ext v25.16b, v17.16b, v18.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1
+ ext v27.16b, v17.16b, v18.16b, #8
+ mul v2.8h, v22.8h, v6.8h // * 6
+ mla v2.8h, v0.8h, v4.8h // * 5 -> a
+.if \bpc == 8
+ ld1 {v31.8b}, [x1], #8
+.else
+ ld1 {v31.8h}, [x1], #16
+.endif
+ add v16.4s, v16.4s, v26.4s // -1, +1
+ add v17.4s, v17.4s, v27.4s
+.if \bpc == 8
+ uxtl v31.8h, v31.8b
+.endif
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v24.4s, v24.4s, v7.4s // * 6
+ mla v24.4s, v16.4s, v5.4s // * 5 -> b
+ mul v25.4s, v25.4s, v7.4s // * 6
+ mla v25.4s, v17.4s, v5.4s // * 5 -> b
+
+ umlal v24.4s, v2.4h, v31.4h // b + a * src
+ umlal2 v25.4s, v2.8h, v31.8h
+ mov v0.16b, v1.16b
+ rshrn v24.4h, v24.4s, #8
+ rshrn2 v24.8h, v25.4s, #8
+ mov v16.16b, v18.16b
+ st1 {v24.8h}, [x0], #16
+
+ b.le 5f
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ b 4b
+
+5:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x11
+ add x0, x0, x10, lsl #1
+ add x1, x1, x2
+ mov x3, x13 // Rewind x3/x4 to where they started
+ mov x4, x14
+ b 1b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int w, const int h,
+// const int wt, const int bitdepth_max);
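+//
+// Blends the filter output t1 back onto the source. With u = src[x] << 4
+// this roughly computes
+//   dst[x] = clip(((u << 7) + wt * (t1[x] - u) + (1 << 10)) >> 11)
+// clamped to the pixel range (bitdepth_max for 16 bpc). Two rows are
+// processed per iteration, with a single-row tail for odd heights.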
+function sgr_weighted1_\bpc\()bpc_neon, export=1
+.if \bpc == 16
+ ldr w8, [sp]
+.endif
+ dup v31.8h, w7
+ cmp x6, #2
+.if \bpc == 16
+ dup v30.8h, w8
+.endif
+ add x9, x0, x1
+ add x10, x2, x3
+ add x11, x4, #2*FILTER_OUT_STRIDE
+ mov x7, #(4*FILTER_OUT_STRIDE)
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ add x8, x5, #7
+ bic x8, x8, #7 // Aligned width
+.if \bpc == 8
+ sub x1, x1, x8
+ sub x3, x3, x8
+.else
+ sub x1, x1, x8, lsl #1
+ sub x3, x3, x8, lsl #1
+.endif
+ sub x7, x7, x8, lsl #1
+ mov x8, x5
+ b.lt 2f
+1:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v4.8b}, [x10], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+ ld1 {v4.8h}, [x10], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v5.8h}, [x11], #16
+ subs x5, x5, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v4.8h, v4.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+ shl v4.8h, v4.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v5.8h, v5.8h, v4.8h // t1 - u
+ ushll v2.4s, v0.4h, #7 // u << 7
+ ushll2 v3.4s, v0.8h, #7 // u << 7
+ ushll v6.4s, v4.4h, #7 // u << 7
+ ushll2 v7.4s, v4.8h, #7 // u << 7
+ smlal v2.4s, v1.4h, v31.4h // v
+ smlal2 v3.4s, v1.8h, v31.8h // v
+ smlal v6.4s, v5.4h, v31.4h // v
+ smlal2 v7.4s, v5.8h, v31.8h // v
+.if \bpc == 8
+ rshrn v2.4h, v2.4s, #11
+ rshrn2 v2.8h, v3.4s, #11
+ rshrn v6.4h, v6.4s, #11
+ rshrn2 v6.8h, v7.4s, #11
+ sqxtun v2.8b, v2.8h
+ sqxtun v6.8b, v6.8h
+ st1 {v2.8b}, [x0], #8
+ st1 {v6.8b}, [x9], #8
+.else
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqrshrun v6.4h, v6.4s, #11
+ sqrshrun2 v6.8h, v7.4s, #11
+ umin v2.8h, v2.8h, v30.8h
+ umin v6.8h, v6.8h, v30.8h
+ st1 {v2.8h}, [x0], #16
+ st1 {v6.8h}, [x9], #16
+.endif
+ b.gt 1b
+
+ sub x6, x6, #2
+ cmp x6, #1
+ b.lt 0f
+ mov x5, x8
+ add x0, x0, x1
+ add x9, x9, x1
+ add x2, x2, x3
+ add x10, x10, x3
+ add x4, x4, x7
+ add x11, x11, x7
+ b.eq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ subs x5, x5, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ ushll v2.4s, v0.4h, #7 // u << 7
+ ushll2 v3.4s, v0.8h, #7 // u << 7
+ smlal v2.4s, v1.4h, v31.4h // v
+ smlal2 v3.4s, v1.8h, v31.8h // v
+.if \bpc == 8
+ rshrn v2.4h, v2.4s, #11
+ rshrn2 v2.8h, v3.4s, #11
+ sqxtun v2.8b, v2.8h
+ st1 {v2.8b}, [x0], #8
+.else
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ umin v2.8h, v2.8h, v30.8h
+ st1 {v2.8h}, [x0], #16
+.endif
+ b.gt 2b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h,
+// const int16_t wt[2], const int bitdepth_max);
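+//
+// As sgr_weighted1, but blends two filter outputs. With u = src[x] << 4 this
+// roughly computes
+//   dst[x] = clip(((u << 7) + wt[0]*(t1[x] - u) + wt[1]*(t2[x] - u)
+//                  + (1 << 10)) >> 11)
+// clamped to the pixel range, again two rows per iteration with a
+// single-row tail.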
+function sgr_weighted2_\bpc\()bpc_neon, export=1
+.if \bpc == 8
+ ldr x8, [sp]
+.else
+ ldp x8, x9, [sp]
+.endif
+ cmp x7, #2
+ add x10, x0, x1
+ add x11, x2, x3
+ add x12, x4, #2*FILTER_OUT_STRIDE
+ add x13, x5, #2*FILTER_OUT_STRIDE
+ ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
+.if \bpc == 16
+ dup v29.8h, w9
+.endif
+ mov x8, #4*FILTER_OUT_STRIDE
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ add x9, x6, #7
+ bic x9, x9, #7 // Aligned width
+.if \bpc == 8
+ sub x1, x1, x9
+ sub x3, x3, x9
+.else
+ sub x1, x1, x9, lsl #1
+ sub x3, x3, x9, lsl #1
+.endif
+ sub x8, x8, x9, lsl #1
+ mov x9, x6
+ b.lt 2f
+1:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v16.8b}, [x11], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+ ld1 {v16.8h}, [x11], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v17.8h}, [x12], #16
+ ld1 {v2.8h}, [x5], #16
+ ld1 {v18.8h}, [x13], #16
+ subs x6, x6, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v16.8h, v16.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+ shl v16.8h, v16.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ sub v17.8h, v17.8h, v16.8h // t1 - u
+ sub v18.8h, v18.8h, v16.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ ushll v19.4s, v16.4h, #7 // u << 7
+ ushll2 v20.4s, v16.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+ smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
+.if \bpc == 8
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ rshrn v19.4h, v19.4s, #11
+ rshrn2 v19.8h, v20.4s, #11
+ sqxtun v3.8b, v3.8h
+ sqxtun v19.8b, v19.8h
+ st1 {v3.8b}, [x0], #8
+ st1 {v19.8b}, [x10], #8
+.else
+ sqrshrun v3.4h, v3.4s, #11
+ sqrshrun2 v3.8h, v4.4s, #11
+ sqrshrun v19.4h, v19.4s, #11
+ sqrshrun2 v19.8h, v20.4s, #11
+ umin v3.8h, v3.8h, v29.8h
+ umin v19.8h, v19.8h, v29.8h
+ st1 {v3.8h}, [x0], #16
+ st1 {v19.8h}, [x10], #16
+.endif
+ b.gt 1b
+
+ subs x7, x7, #2
+ cmp x7, #1
+ b.lt 0f
+ mov x6, x9
+ add x0, x0, x1
+ add x10, x10, x1
+ add x2, x2, x3
+ add x11, x11, x3
+ add x4, x4, x8
+ add x12, x12, x8
+ add x5, x5, x8
+ add x13, x13, x8
+ b.eq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v2.8h}, [x5], #16
+ subs x6, x6, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+.if \bpc == 8
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ sqxtun v3.8b, v3.8h
+ st1 {v3.8b}, [x0], #8
+.else
+ sqrshrun v3.4h, v3.4s, #11
+ sqrshrun2 v3.8h, v4.4s, #11
+ umin v3.8h, v3.8h, v29.8h
+ st1 {v3.8h}, [x0], #16
+.endif
+ b.gt 2b
+0:
+ ret
+endfunc
+.endm