Diffstat (limited to 'third_party/dav1d/src/arm')
-rw-r--r--  third_party/dav1d/src/arm/32/itx.S     79
-rw-r--r--  third_party/dav1d/src/arm/32/itx16.S   19
-rw-r--r--  third_party/dav1d/src/arm/32/msac.S   167
-rw-r--r--  third_party/dav1d/src/arm/64/itx.S     99
-rw-r--r--  third_party/dav1d/src/arm/64/itx16.S   21
-rw-r--r--  third_party/dav1d/src/arm/64/mc.S     411
-rw-r--r--  third_party/dav1d/src/arm/64/mc16.S   373
-rw-r--r--  third_party/dav1d/src/arm/64/msac.S   167
-rw-r--r--  third_party/dav1d/src/arm/64/util.S    49
-rw-r--r--  third_party/dav1d/src/arm/asm.S        44
-rw-r--r--  third_party/dav1d/src/arm/cpu.c       137
-rw-r--r--  third_party/dav1d/src/arm/cpu.h         4
-rw-r--r--  third_party/dav1d/src/arm/itx.h         4
-rw-r--r--  third_party/dav1d/src/arm/msac.h        2
14 files changed, 1086 insertions, 490 deletions
diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S
index ceea025e45..9ba1df7a68 100644
--- a/third_party/dav1d/src/arm/32/itx.S
+++ b/third_party/dav1d/src/arm/32/itx.S
@@ -965,6 +965,8 @@ function inv_txfm_\variant\()add_8x8_neon
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
+
+ b L(itx_8x8_epilog)
.else
blx r4
@@ -976,8 +978,8 @@ function inv_txfm_\variant\()add_8x8_neon
vrshr.s16 q13, q13, #1
vrshr.s16 q14, q14, #1
vrshr.s16 q15, q15, #1
-.endif
+L(itx_8x8_epilog):
transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
blx r5
@@ -985,11 +987,12 @@ function inv_txfm_\variant\()add_8x8_neon
load_add_store_8x8 r0, r7
vpop {q4-q7}
pop {r4-r5,r7,pc}
+.endif
endfunc
.endm
-def_fn_8x8_base
def_fn_8x8_base identity_
+def_fn_8x8_base
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
@@ -1444,14 +1447,16 @@ function inv_txfm_horz\suffix\()_16x4_neon
.else
identity_4x16_shift1 d0[0]
.endif
+ b L(horz_16x4_epilog)
.else
blx r4
-.endif
-.if \shift > 0
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
vrshr.s16 \i, \i, #\shift
.endr
-.endif
+.if \shift == 1
+ b L(horz_16x4_epilog)
+.else
+L(horz_16x4_epilog):
transpose_4x4h q8, q9, d16, d17, d18, d19
transpose_4x4h q10, q11, d20, d21, d22, d23
transpose_4x4h q12, q13, d24, d25, d26, d27
@@ -1462,13 +1467,15 @@ function inv_txfm_horz\suffix\()_16x4_neon
.endr
pop {pc}
+.endif
+.endif
endfunc
.endm
-def_horz_16 scale=0, identity=0, shift=2
-def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
-def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=0, shift=2
function inv_txfm_add_vert_4x16_neon
push {lr}
@@ -1597,6 +1604,8 @@ function inv_txfm_\variant\()add_16x4_neon
.endr
identity_4x16_shift1 d0[0]
+
+ b L(itx_16x4_epilog)
.else
vmov.i16 q2, #0
vmov.i16 q3, #0
@@ -1615,30 +1624,25 @@ function inv_txfm_\variant\()add_16x4_neon
vswp d19, d22
vswp d18, d20
vswp d19, d21
-.irp i, q8, q9, q10, q11
+ vswp d25, d28
+ vswp d27, d30
+ vswp d26, d28
+ vswp d27, d29
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
vrshr.s16 \i, \i, #1
.endr
-.endif
+
+L(itx_16x4_epilog):
transpose_4x8h q8, q9, q10, q11
blx r5
mov r6, r0
load_add_store_8x4 r6, r7
-.ifc \variant, identity_
vmov q8, q12
vmov q9, q13
vmov q10, q14
vmov q11, q15
-.else
- vswp d25, d28
- vswp d27, d30
- vswp d26, d28
- vswp d27, d29
- vrshr.s16 q8, q12, #1
- vrshr.s16 q9, q13, #1
- vrshr.s16 q10, q14, #1
- vrshr.s16 q11, q15, #1
-.endif
+
transpose_4x8h q8, q9, q10, q11
blx r5
add r6, r0, #8
@@ -1646,6 +1650,7 @@ function inv_txfm_\variant\()add_16x4_neon
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
function inv_txfm_\variant\()add_4x16_neon
@@ -1696,12 +1701,14 @@ function inv_txfm_\variant\()add_4x16_neon
movw r12, #(5793-4096)*8
vdup.16 d0, r12
identity_8x4_shift1 q8, q9, q10, q11, d0[0]
+
+ b L(itx_4x16_epilog)
.else
blx r4
.irp i, q8, q9, q10, q11
vrshr.s16 \i, \i, #1
.endr
-.endif
+L(itx_4x16_epilog):
transpose_4x8h q8, q9, q10, q11
vswp d19, d21
vswp d18, d20
@@ -1714,11 +1721,12 @@ function inv_txfm_\variant\()add_4x16_neon
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
.endm
-def_fn_416_base
def_fn_416_base identity_
+def_fn_416_base
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1728,11 +1736,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
.if \w == 4
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon
+.endif
movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon
mov r10, #\eob_half
.else
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon
+.endif
movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
@@ -1765,8 +1777,7 @@ def_fn_416 \w, \h, identity, flipadst, 32
def_fns_416 4, 16
def_fns_416 16, 4
-.macro def_fn_816_base variant
-function inv_txfm_\variant\()add_16x8_neon
+function inv_txfm_add_16x8_neon
sub_sp_align 256
.irp i, 0, 4
@@ -1805,6 +1816,7 @@ function inv_txfm_\variant\()add_16x8_neon
pop {r4-r11,pc}
endfunc
+.macro def_fn_816_base variant
function inv_txfm_\variant\()add_8x16_neon
sub_sp_align 256
@@ -1849,6 +1861,10 @@ function inv_txfm_\variant\()add_8x16_neon
.endr
2:
+.ifc \variant, identity_
+ b L(itx_8x16_epilog)
+.else
+L(itx_8x16_epilog):
.irp i, 0, 4
add r6, r0, #(\i)
add r7, sp, #(\i*2)
@@ -1859,11 +1875,18 @@ function inv_txfm_\variant\()add_8x16_neon
add_sp_align 256
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
.endm
-def_fn_816_base
def_fn_816_base identity_
+def_fn_816_base
+
+/* Define numeric symbols for the transform names, so they can be
+   compared in .if expressions */
+.equ dct, 1
+.equ identity, 2
+.equ adst, 3
+.equ flipadst, 4
.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1873,7 +1896,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
.if \w == 8
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_8h_x8_neon
+.endif
movrel_local r5, inv_\txfm2\()_4h_x16_neon
.else
.ifc \txfm1, identity
@@ -1889,7 +1914,7 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.else
mov r10, #\eob_4x4
.endif
-.ifc \txfm1, identity
+.if \w == 8 && \txfm1 == identity
b inv_txfm_identity_add_\w\()x\h\()_neon
.else
b inv_txfm_add_\w\()x\h\()_neon
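
The hunks above restructure the 32-bit itx code so the identity variants no longer duplicate the shared epilog: each identity path now ends in a branch to an L(...) label defined inside the generic variant, and the def_fn_*_base instantiation order is swapped so the identity variant is assembled first. The new .equ block gives the transform names numeric values, which is what allows `.if \w == 8 && \txfm1 == identity` to be evaluated as an absolute expression (`.ifc` compares strings; `.if` needs defined symbols). The same pattern is applied to the AArch64 files further down. A minimal C model of the code-sharing idea (hypothetical names, sketch only — the real code is the assembly above):

    #include <stdint.h>

    /* One shared epilog instead of two copies; the identity variant
       tail-calls it, the generic variant falls through into it. */
    static void itx_8x8_epilog(int16_t coef[64], uint8_t *dst) {
        /* transpose, second 1-D transform, add into dst (elided) */
        (void)coef; (void)dst;
    }

    static void add_8x8_identity(int16_t coef[64], uint8_t *dst) {
        /* the identity shl #1 and the srshr #1 downshift cancel out */
        itx_8x8_epilog(coef, dst);               /* b L(itx_8x8_epilog) */
    }

    static void add_8x8_generic(int16_t coef[64], uint8_t *dst,
                                void (*txfm)(int16_t *)) {
        txfm(coef);                              /* blx r4 */
        for (int i = 0; i < 64; i++)             /* vrshr.s16 #1 */
            coef[i] = (int16_t)((coef[i] + 1) >> 1);
        itx_8x8_epilog(coef, dst);               /* label lives here in asm */
    }
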
diff --git a/third_party/dav1d/src/arm/32/itx16.S b/third_party/dav1d/src/arm/32/itx16.S
index aa6c272e71..7691272517 100644
--- a/third_party/dav1d/src/arm/32/itx16.S
+++ b/third_party/dav1d/src/arm/32/itx16.S
@@ -547,11 +547,11 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
vmov.i16 q15, #0
vld1.32 {q8, q9}, [r2, :128]
vst1.32 {q14, q15}, [r2, :128]!
- vshr.s16 q8, q8, #2
+ vshr.s32 q8, q8, #2
vld1.32 {q10, q11}, [r2, :128]
- vshr.s16 q9, q9, #2
- vshr.s16 q10, q10, #2
- vshr.s16 q11, q11, #2
+ vshr.s32 q9, q9, #2
+ vshr.s32 q10, q10, #2
+ vshr.s32 q11, q11, #2
iwht4
@@ -598,7 +598,9 @@ function inv_txfm_add_4x4_neon
vld1.16 {d3}, [r0, :64], r1
L(itx_4x4_end):
- vmvn.i16 q15, #0xfc00 // 0x3ff
+ // read bitdepth_max from the caller's stack
+ ldr r4, [sp, #44]
+ vdup.i16 q15, r4
sub r0, r0, r1, lsl #2
vqadd.s16 q8, q8, q0
vqadd.s16 q9, q9, q1
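
This hunk replaces the hardcoded 10-bit ceiling (the mvn-built 0x3ff) with a clamp against bitdepth_max read from the caller's stack, so the same 16 bpc path serves both 10- and 12-bit streams. A hedged C equivalent of the final add-and-clamp (illustrative helper, not dav1d's actual code):

    #include <stdint.h>

    /* Add a residual to a pixel, saturating to [0, bitdepth_max]
       instead of a fixed 0x3ff. */
    static inline uint16_t clamp_px(int px, int residual, int bitdepth_max) {
        int v = px + residual;
        if (v < 0) v = 0;
        if (v > bitdepth_max) v = bitdepth_max;
        return (uint16_t)v;
    }
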
@@ -1487,6 +1489,10 @@ function inv_txfm_horz\suffix\()_16x2_neon
vqrshrn.s32 d21, q13, #\shift
vqrshrn.s32 d22, q14, #\shift
vqrshrn.s32 d23, q15, #\shift
+.if \scale
+ b L(horz_16x2_epilog)
+.else
+L(horz_16x2_epilog):
vuzp.16 q8, q9
vuzp.16 q10, q11
@@ -1495,11 +1501,12 @@ function inv_txfm_horz\suffix\()_16x2_neon
.endr
pop {pc}
+.endif
endfunc
.endm
-def_horz_16 scale=0, shift=2
def_horz_16 scale=1, shift=1, suffix=_scale
+def_horz_16 scale=0, shift=2
function inv_txfm_add_vert_4x16_neon
push {lr}
diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S
index b06e109dda..b16957fb7e 100644
--- a/third_party/dav1d/src/arm/32/msac.S
+++ b/third_party/dav1d/src/arm/32/msac.S
@@ -279,60 +279,67 @@ L(renorm):
sub r4, r4, r3 // rng = u - v
clz r5, r4 // clz(rng)
eor r5, r5, #16 // d = clz(rng) ^ 16
- mvn r7, r7 // ~dif
- add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ sub r7, r7, r3, lsl #16 // dif - (v << 16)
L(renorm2):
lsl r4, r4, r5 // rng << d
subs r6, r6, r5 // cnt -= d
- lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ lsl r7, r7, r5 // (dif - (v << 16)) << d
str r4, [r0, #RNG]
- mvn r7, r7 // ~dif
- bhs 9f
+ bhs 4f
// refill
ldr r3, [r0, #BUF_POS] // BUF_POS
ldr r4, [r0, #BUF_END] // BUF_END
add r5, r3, #4
- cmp r5, r4
- bgt 2f
-
- ldr r3, [r3] // next_bits
- add r8, r6, #23 // shift_bits = cnt + 23
- add r6, r6, #16 // cnt += 16
- rev r3, r3 // next_bits = bswap(next_bits)
- sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
- and r8, r8, #24 // shift_bits &= 24
- lsr r3, r3, r8 // next_bits >>= shift_bits
- sub r8, r8, r6 // shift_bits -= 16 + cnt
- str r5, [r0, #BUF_POS]
- lsl r3, r3, r8 // next_bits <<= shift_bits
- rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
- eor r7, r7, r3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- rsb r5, r6, #8 // c = 8 - cnt
-3:
- cmp r3, r4
- bge 4f
- ldrb r8, [r3], #1
- lsl r8, r8, r5
- eor r7, r7, r8
- subs r5, r5, #8
- bge 3b
-
-4: // refill_eob_end
+ subs r5, r5, r4
+ bhi 6f
+
+ ldr r8, [r3] // next_bits
+ rsb r5, r6, #16
+ add r4, r6, #16 // shift_bits = cnt + 16
+ mvn r8, r8
+ lsr r5, r5, #3 // num_bytes_read
+ rev r8, r8 // next_bits = bswap(next_bits)
+ lsr r8, r8, r4 // next_bits >>= shift_bits
+
+2: // refill_end
+ add r3, r3, r5
+ add r6, r6, r5, lsl #3 // cnt += num_bits_read
str r3, [r0, #BUF_POS]
- rsb r6, r5, #8 // cnt = 8 - c
-9:
+3: // refill_end2
+ orr r7, r7, r8 // dif |= next_bits
+
+4: // end
str r6, [r0, #CNT]
str r7, [r0, #DIF]
-
mov r0, lr
add sp, sp, #48
-
pop {r4-r10,pc}
+
+5: // pad_with_ones
+ add r8, r6, #-240 // cnt - 240: the low byte holds the shift amount cnt+16
+ lsr r8, r8, r8 // shifts in all-ones below the valid window bits
+ b 3b
+
+6: // refill_eob
+ cmp r3, r4
+ bhs 5b
+
+ ldr r8, [r4, #-4]
+ lsl r5, r5, #3
+ lsr r8, r8, r5
+ add r5, r6, #16
+ mvn r8, r8
+ sub r4, r4, r3 // num_bytes_left
+ rev r8, r8
+ lsr r8, r8, r5
+ rsb r5, r6, #16
+ lsr r5, r5, #3
+ cmp r5, r4
+ it hs
+ movhs r5, r4
+ b 2b
endfunc
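
The rewritten refill above replaces the old byte-by-byte loop and the mvn/add/mvn inversion dance (which existed only to shift ones into dif) with a plain `dif - (v << 16)` and a bulk 4-byte load; the loaded word is inverted and byte-swapped so the end-of-buffer path can pad the window with ones almost for free. A rough C model of the fast path (struct layout and names are assumptions, not dav1d's C API):

    #include <stdint.h>
    #include <string.h>

    typedef struct {
        const uint8_t *buf_pos, *buf_end;
        uint32_t dif;  /* bit window; future bitstream bits kept inverted */
        int cnt;       /* valid-bit counter, negative on entry to refill */
    } Msac;

    /* Fast path, taken when at least 4 bytes remain in the buffer. */
    static void refill_fast(Msac *s) {
        uint32_t raw;
        memcpy(&raw, s->buf_pos, 4);              /* ldr  r8, [r3] */
        uint32_t next = __builtin_bswap32(~raw);  /* mvn + rev */
        next >>= s->cnt + 16;                     /* next_bits >>= shift_bits */
        int nbytes = (16 - s->cnt) >> 3;          /* num_bytes_read */
        s->buf_pos += nbytes;
        s->cnt += nbytes * 8;                     /* cnt += num_bytes_read*8 */
        s->dif |= next;                           /* orr  r7, r7, r8 */
    }

Because dif keeps the upcoming bits inverted, running past the end of the buffer only requires OR-ing ones into the window, which is exactly what the pad_with_ones label does.
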
function msac_decode_symbol_adapt8_neon, export=1
@@ -414,53 +421,38 @@ function msac_decode_hi_tok_neon, export=1
sub r4, r4, r3 // rng = u - v
clz r5, r4 // clz(rng)
eor r5, r5, #16 // d = clz(rng) ^ 16
- mvn r7, r7 // ~dif
- add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ sub r7, r7, r3, lsl #16 // dif - (v << 16)
lsl r4, r4, r5 // rng << d
subs r6, r6, r5 // cnt -= d
- lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ lsl r7, r7, r5 // (dif - (v << 16)) << d
str r4, [r0, #RNG]
vdup.16 d1, r4
- mvn r7, r7 // ~dif
- bhs 9f
+ bhs 5f
// refill
ldr r3, [r0, #BUF_POS] // BUF_POS
ldr r4, [r0, #BUF_END] // BUF_END
add r5, r3, #4
- cmp r5, r4
- bgt 2f
-
- ldr r3, [r3] // next_bits
- add r8, r6, #23 // shift_bits = cnt + 23
- add r6, r6, #16 // cnt += 16
- rev r3, r3 // next_bits = bswap(next_bits)
- sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
- and r8, r8, #24 // shift_bits &= 24
- lsr r3, r3, r8 // next_bits >>= shift_bits
- sub r8, r8, r6 // shift_bits -= 16 + cnt
- str r5, [r0, #BUF_POS]
- lsl r3, r3, r8 // next_bits <<= shift_bits
- rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
- eor r7, r7, r3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- rsb r5, r6, #8 // c = 40 - cnt
-3:
- cmp r3, r4
- bge 4f
- ldrb r8, [r3], #1
- lsl r8, r8, r5
- eor r7, r7, r8
- subs r5, r5, #8
- bge 3b
-
-4: // refill_eob_end
+ subs r5, r5, r4
+ bhi 7f
+
+ ldr r8, [r3] // next_bits
+ rsb r5, r6, #16
+ add r4, r6, #16 // shift_bits = cnt + 16
+ mvn r8, r8
+ lsr r5, r5, #3 // num_bytes_read
+ rev r8, r8 // next_bits = bswap(next_bits)
+ lsr r8, r8, r4 // next_bits >>= shift_bits
+
+3: // refill_end
+ add r3, r3, r5
+ add r6, r6, r5, lsl #3 // cnt += num_bits_read
str r3, [r0, #BUF_POS]
- rsb r6, r5, #8 // cnt = 40 - c
-9:
+4: // refill_end2
+ orr r7, r7, r8 // dif |= next_bits
+
+5: // end
lsl lr, lr, #1
sub lr, lr, #5
lsr r12, r7, #16
@@ -473,6 +465,30 @@ function msac_decode_hi_tok_neon, export=1
str r7, [r0, #DIF]
lsr r0, r2, #1
pop {r4-r10,pc}
+
+6: // pad_with_ones
+ add r8, r6, #-240 // cnt - 240: the low byte holds the shift amount cnt+16
+ lsr r8, r8, r8 // shifts in all-ones below the valid window bits
+ b 4b
+
+7: // refill_eob
+ cmp r3, r4
+ bhs 6b
+
+ ldr r8, [r4, #-4]
+ lsl r5, r5, #3
+ lsr r8, r8, r5
+ add r5, r6, #16
+ mvn r8, r8
+ sub r4, r4, r3 // num_bytes_left
+ rev r8, r8
+ lsr r8, r8, r5
+ rsb r5, r6, #16
+ lsr r5, r5, #3
+ cmp r5, r4
+ it hs
+ movhs r5, r4
+ b 3b
endfunc
function msac_decode_bool_equi_neon, export=1
@@ -493,7 +509,6 @@ function msac_decode_bool_equi_neon, export=1
movhs r7, r8 // if (ret) dif = dif - vw;
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
b L(renorm2)
@@ -519,7 +534,6 @@ function msac_decode_bool_neon, export=1
movhs r7, r8 // if (ret) dif = dif - vw;
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
b L(renorm2)
@@ -549,7 +563,6 @@ function msac_decode_bool_adapt_neon, export=1
cmp r10, #0
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S
index 53490cd677..7063cbde1d 100644
--- a/third_party/dav1d/src/arm/64/itx.S
+++ b/third_party/dav1d/src/arm/64/itx.S
@@ -879,6 +879,8 @@ function inv_txfm_\variant\()add_8x8_neon
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
+
+ b L(itx_8x8_epilog)
.else
blr x4
@@ -890,19 +892,20 @@ function inv_txfm_\variant\()add_8x8_neon
srshr v21.8h, v21.8h, #1
srshr v22.8h, v22.8h, #1
srshr v23.8h, v23.8h, #1
-.endif
+L(itx_8x8_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
blr x5
load_add_store_8x8 x0, x7
ret x15
+.endif
endfunc
.endm
-def_fn_8x8_base
def_fn_8x8_base identity_
+def_fn_8x8_base
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
@@ -1390,14 +1393,16 @@ function inv_txfm_horz\suffix\()_16x8_neon
.endif
.if \identity
identity_8x16_shift2 v0.h[0]
+ b L(horz_16x8_epilog)
.else
blr x4
-.endif
-.if \shift > 0
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
srshr \i, \i, #\shift
.endr
-.endif
+.if \shift == 1
+ b L(horz_16x8_epilog)
+.else
+L(horz_16x8_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
@@ -1406,12 +1411,14 @@ function inv_txfm_horz\suffix\()_16x8_neon
.endr
ret x14
+.endif
+.endif
endfunc
.endm
-def_horz_16 scale=0, identity=0, shift=2
def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
+def_horz_16 scale=0, identity=0, shift=2
function inv_txfm_add_vert_8x16_neon
mov x14, x30
@@ -1512,6 +1519,8 @@ function inv_txfm_\variant\()add_16x4_neon
.endr
identity_8x16_shift1 v0.h[0]
+
+ b L(itx_16x4_epilog)
.else
.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
ld1 {\i}, [x2]
@@ -1527,33 +1536,29 @@ function inv_txfm_\variant\()add_16x4_neon
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
-.endif
- transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
- blr x5
- mov x6, x0
- load_add_store_8x4 x6, x7
-.ifc \variant, identity_
- mov v16.16b, v20.16b
- mov v17.16b, v21.16b
- mov v18.16b, v22.16b
- mov v19.16b, v23.16b
-.else
ins v24.d[1], v28.d[0]
ins v25.d[1], v29.d[0]
ins v26.d[1], v30.d[0]
ins v27.d[1], v31.d[0]
- srshr v16.8h, v24.8h, #1
- srshr v17.8h, v25.8h, #1
- srshr v18.8h, v26.8h, #1
- srshr v19.8h, v27.8h, #1
-.endif
+ srshr v20.8h, v24.8h, #1
+ srshr v21.8h, v25.8h, #1
+ srshr v22.8h, v26.8h, #1
+ srshr v23.8h, v27.8h, #1
+
+L(itx_16x4_epilog):
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+ transpose_4x8h_mov v20, v21, v22, v23, v2, v3, v4, v5, v16, v17, v18, v19
+ blr x5
add x6, x0, #8
load_add_store_8x4 x6, x7
ret x15
+.endif
endfunc
function inv_txfm_\variant\()add_4x16_neon
@@ -1605,12 +1610,14 @@ function inv_txfm_\variant\()add_4x16_neon
mov w16, #(5793-4096)*8
dup v0.4h, w16
identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
+
+ b L(itx_4x16_epilog)
.else
blr x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
-.endif
+L(itx_4x16_epilog):
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
ins v20.d[0], v16.d[1]
ins v21.d[0], v17.d[1]
@@ -1622,11 +1629,12 @@ function inv_txfm_\variant\()add_4x16_neon
load_add_store_4x16 x0, x6
ret x15
+.endif
endfunc
.endm
-def_fn_416_base
def_fn_416_base identity_
+def_fn_416_base
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1634,11 +1642,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
idct_dc \w, \h, 1
.endif
.if \w == 4
+.ifnc \txfm1, identity
adr x4, inv_\txfm1\()_8h_x\w\()_neon
+.endif
adr x5, inv_\txfm2\()_4h_x\h\()_neon
mov w13, #\eob_half
.else
+.ifnc \txfm1, identity
adr x4, inv_\txfm1\()_4h_x\w\()_neon
+.endif
adr x5, inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
@@ -1690,13 +1702,16 @@ function inv_txfm_\variant\()add_16x8_neon
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
identity_8x16_shift1 v0.h[0]
+
+ b L(itx_16x8_epilog)
.else
blr x4
-.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
srshr \i, \i, #1
.endr
-.endif
+
+L(itx_16x8_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
blr x5
@@ -1704,27 +1719,7 @@ function inv_txfm_\variant\()add_16x8_neon
mov x6, x0
load_add_store_8x8 x6, x7
-.ifc \variant, identity_
- mov v16.16b, v24.16b
- mov v17.16b, v25.16b
- mov v18.16b, v26.16b
- mov v19.16b, v27.16b
- mov v20.16b, v28.16b
- mov v21.16b, v29.16b
- mov v22.16b, v30.16b
- mov v23.16b, v31.16b
-.else
- srshr v16.8h, v24.8h, #1
- srshr v17.8h, v25.8h, #1
- srshr v18.8h, v26.8h, #1
- srshr v19.8h, v27.8h, #1
- srshr v20.8h, v28.8h, #1
- srshr v21.8h, v29.8h, #1
- srshr v22.8h, v30.8h, #1
- srshr v23.8h, v31.8h, #1
-.endif
-
- transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23
blr x5
@@ -1732,6 +1727,7 @@ function inv_txfm_\variant\()add_16x8_neon
load_add_store_8x8 x0, x7
ret x15
+.endif
endfunc
function inv_txfm_\variant\()add_8x16_neon
@@ -1790,14 +1786,16 @@ function inv_txfm_\variant\()add_8x16_neon
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
+
+ b L(itx_8x16_epilog)
.else
blr x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
srshr \i, \i, #1
.endr
-.endif
+L(itx_8x16_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
blr x5
@@ -1805,18 +1803,21 @@ function inv_txfm_\variant\()add_8x16_neon
load_add_store_8x16 x0, x6
ret x15
+.endif
endfunc
.endm
-def_fn_816_base
def_fn_816_base identity_
+def_fn_816_base
.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc \w, \h, 1
.endif
+.ifnc \txfm1, identity
adr x4, inv_\txfm1\()_8h_x\w\()_neon
+.endif
adr x5, inv_\txfm2\()_8h_x\h\()_neon
.if \w == 8
mov x13, #\eob_half
diff --git a/third_party/dav1d/src/arm/64/itx16.S b/third_party/dav1d/src/arm/64/itx16.S
index eee3a9636d..31ee9be1b4 100644
--- a/third_party/dav1d/src/arm/64/itx16.S
+++ b/third_party/dav1d/src/arm/64/itx16.S
@@ -514,13 +514,17 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
b L(itx_4x4_end)
endfunc
+// HBD inv_txfm_add_4x4_neon deviates from the common pattern, using registers:
+// x0-x4 external parameters
+// x5 function pointer to first transform
+// x6 function pointer to second transform
function inv_txfm_add_4x4_neon
movi v30.4s, #0
movi v31.4s, #0
ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
st1 {v30.4s, v31.4s}, [x2], #32
- blr x4
+ blr x5
st1 {v30.4s, v31.4s}, [x2], #32
sqxtn v16.4h, v16.4s
@@ -529,7 +533,7 @@ function inv_txfm_add_4x4_neon
sqxtn v19.4h, v19.4s
transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
- blr x5
+ blr x6
ld1 {v0.d}[0], [x0], x1
ld1 {v0.d}[1], [x0], x1
@@ -541,7 +545,7 @@ function inv_txfm_add_4x4_neon
srshr v18.8h, v18.8h, #4
L(itx_4x4_end):
- mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ dup v31.8h, w4
sub x0, x0, x1, lsl #2
usqadd v0.8h, v16.8h
usqadd v1.8h, v18.8h
@@ -579,8 +583,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
b L(itx_4x4_end)
1:
.endif
- adr x4, inv_\txfm1\()_4s_x4_neon
- movrel x5, X(inv_\txfm2\()_4h_x4_neon)
+ adr x5, inv_\txfm1\()_4s_x4_neon
+ movrel x6, X(inv_\txfm2\()_4h_x4_neon)
b inv_txfm_add_4x4_neon
endfunc
.endm
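
As the comment above notes, the HBD 4x4 helper moves its transform function pointers from x4/x5 to x5/x6: x4 is the fifth AAPCS64 argument register and now carries bitdepth_max straight through to L(itx_4x4_end). A hedged C-level view of the resulting convention (hypothetical prototype mirroring the register comment):

    #include <stdint.h>
    #include <stddef.h>

    typedef void (*itx_fn)(void);

    /* x0-x4 are the external parameters, x5/x6 the transform hooks. */
    void inv_txfm_add_4x4_16bpc(uint16_t *dst, ptrdiff_t stride, /* x0, x1 */
                                int32_t *coef, int eob,          /* x2, x3 */
                                int bitdepth_max,                /* x4     */
                                itx_fn first, itx_fn second);    /* x5, x6 */
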
@@ -1381,6 +1385,10 @@ function inv_txfm_horz\suffix\()_16x4_neon
sqrshrn2 v21.8h, v29.4s, #\shift
sqrshrn2 v22.8h, v30.4s, #\shift
sqrshrn2 v23.8h, v31.4s, #\shift
+.if \scale
+ b L(horz_16x4_epilog)
+.else
+L(horz_16x4_epilog):
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7
@@ -1389,11 +1397,12 @@ function inv_txfm_horz\suffix\()_16x4_neon
.endr
ret x14
+.endif
endfunc
.endm
-def_horz_16 scale=0, shift=2
def_horz_16 scale=1, shift=1, suffix=_scale
+def_horz_16 scale=0, shift=2
function inv_txfm_add_vert_8x16_neon
mov x14, x30
diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S
index 9f7b4e7a89..3df0393c3a 100644
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -1154,7 +1154,7 @@ endfunc
uxtl \r6\().8h, \r6\().8b
.endif
.endm
-.macro mul_mla_4 d, s0, s1, s2, s3, wd
+.macro mul_mla_4tap d, s0, s1, s2, s3, wd
mul \d\wd, \s0\wd, v0.h[0]
mla \d\wd, \s1\wd, v0.h[1]
mla \d\wd, \s2\wd, v0.h[2]
@@ -1163,7 +1163,51 @@ endfunc
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
-.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().4h, \s1\().4h, v0.h[1]
+ mla \d0\().4h, \s2\().4h, v0.h[2]
+ mla \d0\().4h, \s3\().4h, v0.h[3]
+ mla \d0\().4h, \s4\().4h, v0.h[4]
+ mla \d0\().4h, \s5\().4h, v0.h[5]
+ mla \d0\().4h, \s6\().4h, v0.h[6]
+.endm
+.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mul \d1\().8h, \s2\().8h, v0.h[1]
+ mla \d1\().8h, \s3\().8h, v0.h[2]
+ mla \d1\().8h, \s4\().8h, v0.h[3]
+ mla \d1\().8h, \s5\().8h, v0.h[4]
+ mla \d1\().8h, \s6\().8h, v0.h[5]
+ mla \d1\().8h, \s7\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mul \d1\().8h, \s3\().8h, v0.h[1]
+ mla \d1\().8h, \s4\().8h, v0.h[2]
+ mla \d1\().8h, \s5\().8h, v0.h[3]
+ mla \d1\().8h, \s6\().8h, v0.h[4]
+ mla \d1\().8h, \s7\().8h, v0.h[5]
+ mla \d1\().8h, \s8\().8h, v0.h[6]
+.endm
+.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
mul \d0\().4h, \s0\().4h, v0.h[0]
mla \d0\().4h, \s1\().4h, v0.h[1]
mla \d0\().4h, \s2\().4h, v0.h[2]
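
The new mul_mla_6tap_* chains are the mul_mla_8tap_* chains with taps 0 and 7 dropped: the REGULAR and SMOOTH subpel filters have zero outer coefficients, so those two multiply-accumulates contribute nothing. In scalar terms (illustrative helper, not dav1d's code):

    #include <stdint.h>

    /* An 8-tap convolution whose coef[0] and coef[7] are zero
       degenerates to six taps; the asm skips the dead mul/mla. */
    static int filter_6tap(const uint8_t *src, const int8_t coef[8]) {
        int sum = 0;
        for (int i = 1; i <= 6; i++)  /* taps 0 and 7 known to be zero */
            sum += src[i] * coef[i];
        return sum;
    }
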
@@ -1173,7 +1217,7 @@ endfunc
mla \d0\().4h, \s6\().4h, v0.h[6]
mla \d0\().4h, \s7\().4h, v0.h[7]
.endm
-.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1183,7 +1227,7 @@ endfunc
mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
.endm
-.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1201,7 +1245,7 @@ endfunc
mla \d1\().8h, \s7\().8h, v0.h[6]
mla \d1\().8h, \s8\().8h, v0.h[7]
.endm
-.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1315,11 +1359,11 @@ endfunc
.endif
.endm
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_8bpc_neon, export=1
mov x8, \type_h
mov x9, \type_v
- b \op\()_8tap_neon
+ b \op\()_\taps\()_neon
endfunc
.endm
@@ -1328,18 +1372,8 @@ endfunc
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
-make_8tap_fn \type, regular, REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp, REGULAR, SHARP
-make_8tap_fn \type, smooth, SMOOTH, SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
-make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
-make_8tap_fn \type, sharp, SHARP, SHARP
-make_8tap_fn \type, sharp_regular, SHARP, REGULAR
-make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps
+function \type\()_\taps\()_neon
mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
mul \mx, \mx, w10
mul \my, \my, w10
@@ -1354,12 +1388,12 @@ function \type\()_8tap_neon
tst \mx, #(0x7f << 14)
sub w8, w8, #24
movrel x10, X(mc_subpel_filters), -8
- b.ne L(\type\()_8tap_h)
+ b.ne L(\type\()_\taps\()_h)
tst \my, #(0x7f << 14)
- b.ne L(\type\()_8tap_v)
+ b.ne L(\type\()_\taps\()_v)
b \type\()_neon
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
cmp \w, #4
ubfx w9, \mx, #7, #7
and \mx, \mx, #0x7f
@@ -1368,9 +1402,9 @@ L(\type\()_8tap_h):
4:
tst \my, #(0x7f << 14)
add \xmx, x10, \mx, uxtw #3
- b.ne L(\type\()_8tap_hv)
+ b.ne L(\type\()_\taps\()_hv)
- adr x9, L(\type\()_8tap_h_tbl)
+ adr x9, L(\type\()_\taps\()_h_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1471,6 +1505,18 @@ L(\type\()_8tap_h):
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
+.ifc \taps, 6tap
+ ext v19.16b, v16.16b, v17.16b, #2
+ ext v23.16b, v20.16b, v21.16b, #2
+ mul v18.8h, v19.8h, v0.h[1]
+ mul v22.8h, v23.8h, v0.h[1]
+.irpc i, 23456
+ ext v19.16b, v16.16b, v17.16b, #(2*\i)
+ ext v23.16b, v20.16b, v21.16b, #(2*\i)
+ mla v18.8h, v19.8h, v0.h[\i]
+ mla v22.8h, v23.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v18.8h, v16.8h, v0.h[0]
mul v22.8h, v20.8h, v0.h[0]
.irpc i, 1234567
@@ -1479,6 +1525,7 @@ L(\type\()_8tap_h):
mla v18.8h, v19.8h, v0.h[\i]
mla v22.8h, v23.8h, v0.h[\i]
.endr
+.endif
subs \h, \h, #2
srshr v18.8h, v18.8h, #2
srshr v22.8h, v22.8h, #2
@@ -1523,6 +1570,26 @@ L(\type\()_8tap_h):
uxtl v22.8h, v22.8b
16:
+.ifc \taps, 6tap
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v29.16b, v17.16b, v18.16b, #2
+ ext v30.16b, v20.16b, v21.16b, #2
+ ext v31.16b, v21.16b, v22.16b, #2
+ mul v24.8h, v28.8h, v0.h[1]
+ mul v25.8h, v29.8h, v0.h[1]
+ mul v26.8h, v30.8h, v0.h[1]
+ mul v27.8h, v31.8h, v0.h[1]
+.irpc i, 23456
+ ext v28.16b, v16.16b, v17.16b, #(2*\i)
+ ext v29.16b, v17.16b, v18.16b, #(2*\i)
+ ext v30.16b, v20.16b, v21.16b, #(2*\i)
+ ext v31.16b, v21.16b, v22.16b, #(2*\i)
+ mla v24.8h, v28.8h, v0.h[\i]
+ mla v25.8h, v29.8h, v0.h[\i]
+ mla v26.8h, v30.8h, v0.h[\i]
+ mla v27.8h, v31.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v24.8h, v16.8h, v0.h[0]
mul v25.8h, v17.8h, v0.h[0]
mul v26.8h, v20.8h, v0.h[0]
@@ -1537,6 +1604,7 @@ L(\type\()_8tap_h):
mla v26.8h, v30.8h, v0.h[\i]
mla v27.8h, v31.8h, v0.h[\i]
.endr
+.endif
srshr v24.8h, v24.8h, #2
srshr v25.8h, v25.8h, #2
srshr v26.8h, v26.8h, #2
@@ -1575,18 +1643,18 @@ L(\type\()_8tap_h):
b.gt 161b
ret
-L(\type\()_8tap_h_tbl):
- .hword L(\type\()_8tap_h_tbl) - 1280b
- .hword L(\type\()_8tap_h_tbl) - 640b
- .hword L(\type\()_8tap_h_tbl) - 320b
- .hword L(\type\()_8tap_h_tbl) - 160b
- .hword L(\type\()_8tap_h_tbl) - 80b
- .hword L(\type\()_8tap_h_tbl) - 40b
- .hword L(\type\()_8tap_h_tbl) - 20b
+L(\type\()_\taps\()_h_tbl):
+ .hword L(\type\()_\taps\()_h_tbl) - 1280b
+ .hword L(\type\()_\taps\()_h_tbl) - 640b
+ .hword L(\type\()_\taps\()_h_tbl) - 320b
+ .hword L(\type\()_\taps\()_h_tbl) - 160b
+ .hword L(\type\()_\taps\()_h_tbl) - 80b
+ .hword L(\type\()_\taps\()_h_tbl) - 40b
+ .hword L(\type\()_\taps\()_h_tbl) - 20b
.hword 0
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
cmp \h, #4
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
@@ -1595,7 +1663,7 @@ L(\type\()_8tap_v):
4:
add \xmy, x10, \my, uxtw #3
- adr x9, L(\type\()_8tap_v_tbl)
+ adr x9, L(\type\()_\taps\()_v_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1620,7 +1688,7 @@ L(\type\()_8tap_v):
interleave_1_h v1, v2, v3, v4, v5
b.gt 24f
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .4h
+ mul_mla_4tap v6, v1, v2, v3, v4, .4h
sqrshrun_b 6, v6
st_h \d_strd, v6, 2
ret
@@ -1630,7 +1698,7 @@ L(\type\()_8tap_v):
interleave_1_h v5, v6, v7
interleave_2_s v1, v2, v3, v4, v5, v6
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
sqrshrun_b 6, v6
st_h \d_strd, v6, 4
ret
@@ -1655,7 +1723,7 @@ L(\type\()_8tap_v):
interleave_1_h v7, v16, v17, v18, v19
interleave_2_s v5, v6, v7, v16, v17, v18
uxtl_b v5, v6, v7, v16
- mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
+ mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_b 6, v30
st_h \d_strd, v30, 4
b.le 0f
@@ -1673,7 +1741,7 @@ L(\type\()_8tap_v):
load_h \sr2, \src, \s_strd, v16, v17
interleave_1_h v7, v16, v17
uxtl_b v5, v6, v7, v16
- mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
+ mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_b 6, v30
st_h \d_strd, v30, 2
0:
@@ -1698,13 +1766,13 @@ L(\type\()_8tap_v):
load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
interleave_1_s v1, v2, v3, v4, v5
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
shift_store_4 \type, \d_strd, v6
b.le 0f
load_s \sr2, \src, \s_strd, v6, v7
interleave_1_s v5, v6, v7
uxtl_b v5, v6
- mul_mla_4 v7, v3, v4, v5, v6, .8h
+ mul_mla_4tap v7, v3, v4, v5, v6, .8h
shift_store_4 \type, \d_strd, v7
0:
ret
@@ -1729,28 +1797,28 @@ L(\type\()_8tap_v):
load_s \sr2, \src, \s_strd, v23, v24, v25, v26
interleave_1_s v22, v23, v24, v25, v26
uxtl_b v22, v23, v24, v25
- mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
shift_store_4 \type, \d_strd, v1, v2
b.le 0f
load_s \sr2, \src, \s_strd, v27, v16
subs \h, \h, #2
interleave_1_s v26, v27, v16
uxtl_b v26, v27
- mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
+ mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
shift_store_4 \type, \d_strd, v1
b.le 0f
load_s \sr2, \src, \s_strd, v17, v18
subs \h, \h, #2
interleave_1_s v16, v17, v18
uxtl_b v16, v17
- mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
+ mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
shift_store_4 \type, \d_strd, v2
b.le 0f
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v19, v20, v21, v22
interleave_1_s v18, v19, v20, v21, v22
uxtl_b v18, v19, v20, v21
- mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
shift_store_4 \type, \d_strd, v1, v2
b.gt 48b
0:
@@ -1773,14 +1841,14 @@ L(\type\()_8tap_v):
load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
uxtl_b v1, v2, v3, v4, v5
- mul_mla_4 v6, v1, v2, v3, v4, .8h
- mul_mla_4 v7, v2, v3, v4, v5, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v7, v2, v3, v4, v5, .8h
shift_store_8 \type, \d_strd, v6, v7
b.le 0f
load_8b \sr2, \src, \s_strd, v6, v7
uxtl_b v6, v7
- mul_mla_4 v1, v3, v4, v5, v6, .8h
- mul_mla_4 v2, v4, v5, v6, v7, .8h
+ mul_mla_4tap v1, v3, v4, v5, v6, .8h
+ mul_mla_4tap v2, v4, v5, v6, v7, .8h
shift_store_8 \type, \d_strd, v1, v2
0:
ret
@@ -1809,32 +1877,32 @@ L(\type\()_8tap_v):
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v23, v24
uxtl_b v23, v24
- mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
+ mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_8 \type, \d_strd, v1, v2
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v25, v26
uxtl_b v25, v26
- mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
+ mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v27, v16
uxtl_b v27, v16
- mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
+ mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
shift_store_8 \type, \d_strd, v1, v2
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v17, v18
uxtl_b v17, v18
- mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
+ mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #4
load_8b \sr2, \src, \s_strd, v19, v20, v21, v22
uxtl_b v19, v20, v21, v22
- mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
- mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
+ mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
+ mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.gt 88b
9:
@@ -1882,10 +1950,10 @@ L(\type\()_8tap_v):
uxtl2 v25.8h, v3.16b
uxtl2 v26.8h, v4.16b
uxtl2 v27.8h, v5.16b
- mul_mla_4 v1, v16, v17, v18, v19, .8h
- mul_mla_4 v16, v17, v18, v19, v20, .8h
- mul_mla_4 v2, v23, v24, v25, v26, .8h
- mul_mla_4 v17, v24, v25, v26, v27, .8h
+ mul_mla_4tap v1, v16, v17, v18, v19, .8h
+ mul_mla_4tap v16, v17, v18, v19, v20, .8h
+ mul_mla_4tap v2, v23, v24, v25, v26, .8h
+ mul_mla_4tap v17, v24, v25, v26, v27, .8h
shift_store_16 \type, \d_strd, v1, v2, v16, v17
b.le 0f
load_16b \sr2, \src, \s_strd, v6, v7
@@ -1893,25 +1961,25 @@ L(\type\()_8tap_v):
uxtl v22.8h, v7.8b
uxtl2 v28.8h, v6.16b
uxtl2 v29.8h, v7.16b
- mul_mla_4 v1, v18, v19, v20, v21, .8h
- mul_mla_4 v3, v19, v20, v21, v22, .8h
- mul_mla_4 v2, v25, v26, v27, v28, .8h
- mul_mla_4 v4, v26, v27, v28, v29, .8h
+ mul_mla_4tap v1, v18, v19, v20, v21, .8h
+ mul_mla_4tap v3, v19, v20, v21, v22, .8h
+ mul_mla_4tap v2, v25, v26, v27, v28, .8h
+ mul_mla_4tap v4, v26, v27, v28, v29, .8h
shift_store_16 \type, \d_strd, v1, v2, v3, v4
0:
ret
-L(\type\()_8tap_v_tbl):
- .hword L(\type\()_8tap_v_tbl) - 1280b
- .hword L(\type\()_8tap_v_tbl) - 640b
- .hword L(\type\()_8tap_v_tbl) - 320b
- .hword L(\type\()_8tap_v_tbl) - 160b
- .hword L(\type\()_8tap_v_tbl) - 80b
- .hword L(\type\()_8tap_v_tbl) - 40b
- .hword L(\type\()_8tap_v_tbl) - 20b
+L(\type\()_\taps\()_v_tbl):
+ .hword L(\type\()_\taps\()_v_tbl) - 1280b
+ .hword L(\type\()_\taps\()_v_tbl) - 640b
+ .hword L(\type\()_\taps\()_v_tbl) - 320b
+ .hword L(\type\()_\taps\()_v_tbl) - 160b
+ .hword L(\type\()_\taps\()_v_tbl) - 80b
+ .hword L(\type\()_\taps\()_v_tbl) - 40b
+ .hword L(\type\()_\taps\()_v_tbl) - 20b
.hword 0
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
cmp \h, #4
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
@@ -1920,7 +1988,7 @@ L(\type\()_8tap_hv):
4:
add \xmy, x10, \my, uxtw #3
- adr x9, L(\type\()_8tap_hv_tbl)
+ adr x9, L(\type\()_\taps\()_hv_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1952,13 +2020,13 @@ L(\type\()_8tap_hv):
addp v28.4h, v28.4h, v29.4h
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
trn1 v16.2s, v16.2s, v28.2s
mov v17.8b, v28.8b
2:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
smull v2.4s, v16.4h, v1.h[0]
@@ -1997,19 +2065,27 @@ L(\type\()_8tap_hv):
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
trn1 v16.2s, v16.2s, v28.2s
mov v17.8b, v28.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
mov v19.8b, v28.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v20.8b, v19.8b, v28.8b, #4
mov v21.8b, v28.8b
28:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v22.8b, v21.8b, v28.8b, #4
+.ifc \taps, 6tap
+ smull v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2018,6 +2094,7 @@ L(\type\()_8tap_hv):
smlal v2.4s, v21.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
smlal v2.4s, v28.4h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqxtun v2.8b, v2.8h
@@ -2036,7 +2113,7 @@ L(\type\()_8tap_hv):
0:
ret x15
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
ld1 {v28.8b}, [\sr2], \s_strd
ld1 {v30.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
@@ -2083,12 +2160,12 @@ L(\type\()_8tap_filter_2):
mla v31.4h, v30.4h, v0.h[3]
srshr v16.4h, v31.4h, #2
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v28.8b
mov v18.8b, v29.8b
4:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
@@ -2121,8 +2198,13 @@ L(\type\()_8tap_filter_2):
480: // 4x8, 4x16, 4x32 hv
ld1 {v1.8b}, [\xmy]
sub \src, \src, #1
+.ifc \taps, 6tap
+ sub \sr2, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+.else
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
+.endif
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
@@ -2139,20 +2221,38 @@ L(\type\()_8tap_filter_2):
mla v31.4h, v28.4h, v0.h[1]
mla v31.4h, v29.4h, v0.h[2]
mla v31.4h, v30.4h, v0.h[3]
+.ifc \taps, 6tap
+ srshr v18.4h, v31.4h, #2
+.else
srshr v16.4h, v31.4h, #2
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v28.8b
mov v18.8b, v29.8b
- bl L(\type\()_8tap_filter_4)
+.endif
+ bl L(\type\()_\taps\()_filter_4)
mov v19.8b, v28.8b
mov v20.8b, v29.8b
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v21.8b, v28.8b
mov v22.8b, v29.8b
48:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal v2.4s, v28.4h, v1.h[6]
+ smull v3.4s, v19.4h, v1.h[1]
+ smlal v3.4s, v20.4h, v1.h[2]
+ smlal v3.4s, v21.4h, v1.h[3]
+ smlal v3.4s, v22.4h, v1.h[4]
+ smlal v3.4s, v28.4h, v1.h[5]
+ smlal v3.4s, v29.4h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2169,6 +2269,7 @@ L(\type\()_8tap_filter_2):
smlal v3.4s, v22.4h, v1.h[5]
smlal v3.4s, v28.4h, v1.h[6]
smlal v3.4s, v29.4h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
subs \h, \h, #2
@@ -2182,8 +2283,10 @@ L(\type\()_8tap_filter_2):
st1 {v3.4h}, [\ds2], \d_strd
.endif
b.le 0f
+.ifc \taps, 8tap
mov v16.8b, v18.8b
mov v17.8b, v19.8b
+.endif
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
@@ -2193,7 +2296,7 @@ L(\type\()_8tap_filter_2):
0:
ret x15
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
ld1 {v26.8b}, [\sr2], \s_strd
ld1 {v27.8b}, [\src], \s_strd
uxtl v26.8h, v26.8b
@@ -2237,15 +2340,15 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
- bl L(\type\()_8tap_filter_8_first)
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8_first)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
8:
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2303,7 +2406,9 @@ L(\type\()_8tap_filter_4):
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
sub \src, \src, #3
+.ifc \taps, 8tap
sub \src, \src, \s_strd
+.endif
sub \src, \src, \s_strd, lsl #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
@@ -2316,21 +2421,52 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
- bl L(\type\()_8tap_filter_8_first)
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8_first)
+.ifc \taps, 6tap
+ mov v18.16b, v16.16b
+.else
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
- bl L(\type\()_8tap_filter_8)
+.endif
+ bl L(\type\()_\taps\()_filter_8)
mov v19.16b, v24.16b
mov v20.16b, v25.16b
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v21.16b, v24.16b
mov v22.16b, v25.16b
88:
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smull2 v3.4s, v18.8h, v1.h[1]
+ bl L(\type\()_\taps\()_filter_8)
+ smull v4.4s, v19.4h, v1.h[1]
+ smull2 v5.4s, v19.8h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal2 v3.4s, v19.8h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal2 v5.4s, v20.8h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal2 v3.4s, v20.8h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal2 v5.4s, v21.8h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal2 v3.4s, v21.8h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal2 v5.4s, v22.8h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal2 v3.4s, v22.8h, v1.h[5]
+ smlal v4.4s, v24.4h, v1.h[5]
+ smlal2 v5.4s, v24.8h, v1.h[5]
+ smlal v2.4s, v24.4h, v1.h[6]
+ smlal2 v3.4s, v24.8h, v1.h[6]
+ smlal v4.4s, v25.4h, v1.h[6]
+ smlal2 v5.4s, v25.8h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2361,6 +2497,7 @@ L(\type\()_8tap_filter_4):
smlal2 v3.4s, v24.8h, v1.h[7]
smlal v4.4s, v25.4h, v1.h[7]
smlal2 v5.4s, v25.8h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn2 v2.8h, v3.4s, #\shift_hv
sqrshrn v4.4h, v4.4s, #\shift_hv
@@ -2376,8 +2513,10 @@ L(\type\()_8tap_filter_4):
st1 {v4.8h}, [\ds2], \d_strd
.endif
b.le 9f
+.ifc \taps, 8tap
mov v16.16b, v18.16b
mov v17.16b, v19.16b
+.endif
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
@@ -2399,14 +2538,32 @@ L(\type\()_8tap_filter_4):
.else
add \dst, \dst, #16
.endif
+.ifc \taps, 6tap
+ add \src, \src, \s_strd, lsl #1
+.endif
b 168b
0:
ret x15
-L(\type\()_8tap_filter_8_first):
+L(\type\()_\taps\()_filter_8_first):
ld1 {v28.8b, v29.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
+.ifc \taps, 6tap
+ ext v24.16b, v28.16b, v29.16b, #(2*1)
+ ext v25.16b, v28.16b, v29.16b, #(2*2)
+ ext v26.16b, v28.16b, v29.16b, #(2*3)
+ ext v27.16b, v28.16b, v29.16b, #(2*4)
+ mul v16.8h, v24.8h, v0.h[1]
+ mla v16.8h, v25.8h, v0.h[2]
+ mla v16.8h, v26.8h, v0.h[3]
+ mla v16.8h, v27.8h, v0.h[4]
+ ext v24.16b, v28.16b, v29.16b, #(2*5)
+ ext v25.16b, v28.16b, v29.16b, #(2*6)
+ ext v26.16b, v28.16b, v29.16b, #(2*7)
+ mla v16.8h, v24.8h, v0.h[5]
+ mla v16.8h, v25.8h, v0.h[6]
+.else // 8tap
mul v16.8h, v28.8h, v0.h[0]
ext v24.16b, v28.16b, v29.16b, #(2*1)
ext v25.16b, v28.16b, v29.16b, #(2*2)
@@ -2422,16 +2579,29 @@ L(\type\()_8tap_filter_8_first):
mla v16.8h, v24.8h, v0.h[5]
mla v16.8h, v25.8h, v0.h[6]
mla v16.8h, v26.8h, v0.h[7]
+.endif
srshr v16.8h, v16.8h, #2
ret
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
ld1 {v30.8b, v31.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
uxtl v30.8h, v30.8b
uxtl v31.8h, v31.8b
+.ifc \taps, 6tap
+ ext v26.16b, v28.16b, v29.16b, #2
+ ext v27.16b, v30.16b, v31.16b, #2
+ mul v24.8h, v26.8h, v0.h[1]
+ mul v25.8h, v27.8h, v0.h[1]
+.irpc i, 23456
+ ext v26.16b, v28.16b, v29.16b, #(2*\i)
+ ext v27.16b, v30.16b, v31.16b, #(2*\i)
+ mla v24.8h, v26.8h, v0.h[\i]
+ mla v25.8h, v27.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v24.8h, v28.8h, v0.h[0]
mul v25.8h, v30.8h, v0.h[0]
.irpc i, 1234567
@@ -2440,22 +2610,25 @@ L(\type\()_8tap_filter_8):
mla v24.8h, v26.8h, v0.h[\i]
mla v25.8h, v27.8h, v0.h[\i]
.endr
+.endif
srshr v24.8h, v24.8h, #2
srshr v25.8h, v25.8h, #2
ret
-L(\type\()_8tap_hv_tbl):
- .hword L(\type\()_8tap_hv_tbl) - 1280b
- .hword L(\type\()_8tap_hv_tbl) - 640b
- .hword L(\type\()_8tap_hv_tbl) - 320b
- .hword L(\type\()_8tap_hv_tbl) - 160b
- .hword L(\type\()_8tap_hv_tbl) - 80b
- .hword L(\type\()_8tap_hv_tbl) - 40b
- .hword L(\type\()_8tap_hv_tbl) - 20b
+L(\type\()_\taps\()_hv_tbl):
+ .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+ .hword L(\type\()_\taps\()_hv_tbl) - 640b
+ .hword L(\type\()_\taps\()_hv_tbl) - 320b
+ .hword L(\type\()_\taps\()_hv_tbl) - 160b
+ .hword L(\type\()_\taps\()_hv_tbl) - 80b
+ .hword L(\type\()_\taps\()_hv_tbl) - 40b
+ .hword L(\type\()_\taps\()_hv_tbl) - 20b
.hword 0
endfunc
+.endm
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
function \type\()_bilin_8bpc_neon, export=1
dup v1.16b, \mx
dup v3.16b, \my
@@ -2987,8 +3160,34 @@ L(\type\()_bilin_hv_tbl):
endfunc
.endm
-filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
-filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn put, sharp, SHARP, SHARP, 8tap
+make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap
+
+make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap
+filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
+
+make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
+make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 8tap
+
+make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 6tap
+filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+
.macro load_filter_row dst, src, inc
asr w13, \src, #10
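
The instantiation block above encodes the dispatch rule: every combination involving a SHARP filter keeps the full 8-tap path, while the REGULAR/SMOOTH-only combinations branch to the cheaper 6-tap implementation. A sketch of the selection (names illustrative; it simply mirrors the make_8tap_fn groupings):

    enum { REGULAR, SMOOTH, SHARP };

    /* Only SHARP filters have nonzero outer taps. */
    static int use_8tap(int type_h, int type_v) {
        return type_h == SHARP || type_v == SHARP;
    }
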
diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S
index 1bfb12ebb3..576fab158a 100644
--- a/third_party/dav1d/src/arm/64/mc16.S
+++ b/third_party/dav1d/src/arm/64/mc16.S
@@ -1374,19 +1374,35 @@ endfunc
sub \r3\wd, \r3\wd, \c\wd
.endif
.endm
-.macro smull_smlal_4 d, s0, s1, s2, s3
+.macro smull_smlal_4tap d, s0, s1, s2, s3
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
smlal \d\().4s, \s3\().4h, v0.h[3]
.endm
-.macro smull2_smlal2_4 d, s0, s1, s2, s3
+.macro smull2_smlal2_4tap d, s0, s1, s2, s3
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
smlal2 \d\().4s, \s3\().8h, v0.h[3]
.endm
-.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull \d\().4s, \s1\().4h, v0.h[1]
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+ smlal \d\().4s, \s4\().4h, v0.h[4]
+ smlal \d\().4s, \s5\().4h, v0.h[5]
+ smlal \d\().4s, \s6\().4h, v0.h[6]
+.endm
+.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+ smlal2 \d\().4s, \s4\().8h, v0.h[4]
+ smlal2 \d\().4s, \s5\().8h, v0.h[5]
+ smlal2 \d\().4s, \s6\().8h, v0.h[6]
+.endm
+.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
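
The 16 bpc code gets the same 6-tap specialization, but since 16x16-bit products overflow 16-bit lanes, the chains widen into 32-bit accumulators (smull/smlal instead of mul/mla). Scalar sketch under the same assumptions as the 8 bpc example:

    #include <stdint.h>

    /* 16 bpc: accumulate 16x16-bit products in 32 bits. */
    static int32_t filter_6tap_16bpc(const uint16_t *src,
                                     const int16_t coef[8]) {
        int32_t sum = 0;
        for (int i = 1; i <= 6; i++)  /* taps 0 and 7 known to be zero */
            sum += (int32_t)src[i] * coef[i];
        return sum;
    }
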
@@ -1396,7 +1412,7 @@ endfunc
smlal \d\().4s, \s6\().4h, v0.h[6]
smlal \d\().4s, \s7\().4h, v0.h[7]
.endm
-.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
@@ -1499,11 +1515,11 @@ endfunc
st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_16bpc_neon, export=1
mov w9, \type_h
mov w10, \type_v
- b \op\()_8tap_neon
+ b \op\()_\taps\()_neon
endfunc
.endm
@@ -1512,18 +1528,8 @@ endfunc
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
-make_8tap_fn \type, regular, REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp, REGULAR, SHARP
-make_8tap_fn \type, smooth, SMOOTH, SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
-make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
-make_8tap_fn \type, sharp, SHARP, SHARP
-make_8tap_fn \type, sharp_regular, SHARP, REGULAR
-make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
+function \type\()_\taps\()_neon
.ifc \bdmax, w8
ldr w8, [sp]
.endif
@@ -1547,12 +1553,12 @@ function \type\()_8tap_neon
add w13, w12, \bdmax // 6 + intermediate_bits
sub w12, w12, \bdmax // 6 - intermediate_bits
movrel x11, X(mc_subpel_filters), -8
- b.ne L(\type\()_8tap_h)
+ b.ne L(\type\()_\taps\()_h)
tst \my, #(0x7f << 14)
- b.ne L(\type\()_8tap_v)
+ b.ne L(\type\()_\taps\()_v)
b \type\()_neon
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
cmp \w, #4
ubfx w10, \mx, #7, #7
and \mx, \mx, #0x7f
@@ -1561,9 +1567,9 @@ L(\type\()_8tap_h):
4:
tst \my, #(0x7f << 14)
add \xmx, x11, \mx, uxtw #3
- b.ne L(\type\()_8tap_hv)
+ b.ne L(\type\()_\taps\()_hv)
- adr x10, L(\type\()_8tap_h_tbl)
+ adr x10, L(\type\()_\taps\()_h_tbl)
dup v30.4s, w12 // 6 - intermediate_bits
ldrh w9, [x10, x9, lsl #1]
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -1682,6 +1688,22 @@ L(\type\()_8tap_h):
mov \mx, \w
8:
+.ifc \taps, 6tap
+ ext v24.16b, v16.16b, v17.16b, #2
+ ext v25.16b, v20.16b, v21.16b, #2
+ smull v18.4s, v24.4h, v0.h[1]
+ smull2 v19.4s, v24.8h, v0.h[1]
+ smull v22.4s, v25.4h, v0.h[1]
+ smull2 v23.4s, v25.8h, v0.h[1]
+.irpc i, 23456
+ ext v24.16b, v16.16b, v17.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+ smlal v18.4s, v24.4h, v0.h[\i]
+ smlal2 v19.4s, v24.8h, v0.h[\i]
+ smlal v22.4s, v25.4h, v0.h[\i]
+ smlal2 v23.4s, v25.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v18.4s, v16.4h, v0.h[0]
smull2 v19.4s, v16.8h, v0.h[0]
smull v22.4s, v20.4h, v0.h[0]
@@ -1694,6 +1716,7 @@ L(\type\()_8tap_h):
smlal v22.4s, v25.4h, v0.h[\i]
smlal2 v23.4s, v25.8h, v0.h[\i]
.endr
+.endif
subs \mx, \mx, #8
srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
@@ -1734,18 +1757,18 @@ L(\type\()_8tap_h):
b.gt 81b
ret
-L(\type\()_8tap_h_tbl):
- .hword L(\type\()_8tap_h_tbl) - 1280b
- .hword L(\type\()_8tap_h_tbl) - 640b
- .hword L(\type\()_8tap_h_tbl) - 320b
- .hword L(\type\()_8tap_h_tbl) - 160b
- .hword L(\type\()_8tap_h_tbl) - 80b
- .hword L(\type\()_8tap_h_tbl) - 40b
- .hword L(\type\()_8tap_h_tbl) - 20b
+L(\type\()_\taps\()_h_tbl):
+ .hword L(\type\()_\taps\()_h_tbl) - 1280b
+ .hword L(\type\()_\taps\()_h_tbl) - 640b
+ .hword L(\type\()_\taps\()_h_tbl) - 320b
+ .hword L(\type\()_\taps\()_h_tbl) - 160b
+ .hword L(\type\()_\taps\()_h_tbl) - 80b
+ .hword L(\type\()_\taps\()_h_tbl) - 40b
+ .hword L(\type\()_\taps\()_h_tbl) - 20b
.hword 0
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
cmp \h, #4
ubfx w10, \my, #7, #7
and \my, \my, #0x7f
@@ -1758,7 +1781,7 @@ L(\type\()_8tap_v):
dup v30.4s, w12 // 6 - intermediate_bits
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
- adr x10, L(\type\()_8tap_v_tbl)
+ adr x10, L(\type\()_\taps\()_v_tbl)
ldrh w9, [x10, x9, lsl #1]
.ifc \type, prep
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -1785,7 +1808,7 @@ L(\type\()_8tap_v):
load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
interleave_1_s v1, v2, v3, v4, v5
b.gt 24f
- smull_smlal_4 v6, v1, v2, v3, v4
+ smull_smlal_4tap v6, v1, v2, v3, v4
sqrshrun_h 6, v6
umin_h v31, .8h, v6
st_s \d_strd, v6, 2
@@ -1794,8 +1817,8 @@ L(\type\()_8tap_v):
24: // 2x4 v
load_s \sr2, \src, \s_strd, v6, v7
interleave_1_s v5, v6, v7
- smull_smlal_4 v16, v1, v2, v3, v4
- smull_smlal_4 v17, v3, v4, v5, v6
+ smull_smlal_4tap v16, v1, v2, v3, v4
+ smull_smlal_4tap v17, v3, v4, v5, v6
sqrshrun_h 6, v16, v17
umin_h v31, .8h, v16
st_s \d_strd, v16, 4
@@ -1817,8 +1840,8 @@ L(\type\()_8tap_v):
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v16, v17, v18, v19
interleave_1_s v7, v16, v17, v18, v19
- smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
- smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
+ smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18
sqrshrun_h 6, v24, v25
umin_h v31, .8h, v24
st_s \d_strd, v24, 4
@@ -1836,7 +1859,7 @@ L(\type\()_8tap_v):
26:
load_s \sr2, \src, \s_strd, v16, v17
interleave_1_s v7, v16, v17
- smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_h 6, v24
umin_h v31, .4h, v24
st_s \d_strd, v24, 2
@@ -1860,13 +1883,13 @@ L(\type\()_8tap_v):
sxtl v0.8h, v0.8b
load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
- smull_smlal_4 v6, v1, v2, v3, v4
- smull_smlal_4 v7, v2, v3, v4, v5
+ smull_smlal_4tap v6, v1, v2, v3, v4
+ smull_smlal_4tap v7, v2, v3, v4, v5
shift_store_4 \type, \d_strd, v6, v7
b.le 0f
load_4h \sr2, \src, \s_strd, v6, v7
- smull_smlal_4 v1, v3, v4, v5, v6
- smull_smlal_4 v2, v4, v5, v6, v7
+ smull_smlal_4tap v1, v3, v4, v5, v6
+ smull_smlal_4tap v2, v4, v5, v6, v7
shift_store_4 \type, \d_strd, v1, v2
0:
ret
@@ -1885,10 +1908,10 @@ L(\type\()_8tap_v):
48:
subs \h, \h, #4
load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
- smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25
- smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_4 \type, \d_strd, v1, v2, v3, v4
b.le 0f
cmp \h, #2
@@ -1903,8 +1926,8 @@ L(\type\()_8tap_v):
b 48b
46:
load_4h \sr2, \src, \s_strd, v23, v24
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_4 \type, \d_strd, v1, v2
0:
ret
@@ -1925,17 +1948,17 @@ L(\type\()_8tap_v):
sxtl v0.8h, v0.8b
load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
- smull_smlal_4 v16, v1, v2, v3, v4
- smull2_smlal2_4 v17, v1, v2, v3, v4
- smull_smlal_4 v18, v2, v3, v4, v5
- smull2_smlal2_4 v19, v2, v3, v4, v5
+ smull_smlal_4tap v16, v1, v2, v3, v4
+ smull2_smlal2_4tap v17, v1, v2, v3, v4
+ smull_smlal_4tap v18, v2, v3, v4, v5
+ smull2_smlal2_4tap v19, v2, v3, v4, v5
shift_store_8 \type, \d_strd, v16, v17, v18, v19
b.le 0f
load_8h \sr2, \src, \s_strd, v6, v7
- smull_smlal_4 v16, v3, v4, v5, v6
- smull2_smlal2_4 v17, v3, v4, v5, v6
- smull_smlal_4 v18, v4, v5, v6, v7
- smull2_smlal2_4 v19, v4, v5, v6, v7
+ smull_smlal_4tap v16, v3, v4, v5, v6
+ smull2_smlal2_4tap v17, v3, v4, v5, v6
+ smull_smlal_4tap v18, v4, v5, v6, v7
+ smull2_smlal2_4tap v19, v4, v5, v6, v7
shift_store_8 \type, \d_strd, v16, v17, v18, v19
0:
ret
@@ -1962,18 +1985,18 @@ L(\type\()_8tap_v):
88:
subs \h, \h, #2
load_8h \sr2, \src, \s_strd, v23, v24
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24
- smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24
+ smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.le 9f
subs \h, \h, #2
load_8h \sr2, \src, \s_strd, v25, v26
- smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25
- smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
- smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26
- smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25
+ smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26
+ smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.le 9f
mov v16.16b, v20.16b
@@ -2013,10 +2036,10 @@ L(\type\()_8tap_v):
16:
load_16h \src, \src, \s_strd, v22, v23
subs \h, \h, #1
- smull_smlal_4 v1, v16, v18, v20, v22
- smull2_smlal2_4 v2, v16, v18, v20, v22
- smull_smlal_4 v3, v17, v19, v21, v23
- smull2_smlal2_4 v4, v17, v19, v21, v23
+ smull_smlal_4tap v1, v16, v18, v20, v22
+ smull2_smlal2_4tap v2, v16, v18, v20, v22
+ smull_smlal_4tap v3, v17, v19, v21, v23
+ smull2_smlal2_4tap v4, v17, v19, v21, v23
shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4
b.le 0f
mov v16.16b, v18.16b
@@ -2029,17 +2052,17 @@ L(\type\()_8tap_v):
0:
ret
-L(\type\()_8tap_v_tbl):
- .hword L(\type\()_8tap_v_tbl) - 1280b
- .hword L(\type\()_8tap_v_tbl) - 640b
- .hword L(\type\()_8tap_v_tbl) - 320b
- .hword L(\type\()_8tap_v_tbl) - 160b
- .hword L(\type\()_8tap_v_tbl) - 80b
- .hword L(\type\()_8tap_v_tbl) - 40b
- .hword L(\type\()_8tap_v_tbl) - 20b
+L(\type\()_\taps\()_v_tbl):
+ .hword L(\type\()_\taps\()_v_tbl) - 1280b
+ .hword L(\type\()_\taps\()_v_tbl) - 640b
+ .hword L(\type\()_\taps\()_v_tbl) - 320b
+ .hword L(\type\()_\taps\()_v_tbl) - 160b
+ .hword L(\type\()_\taps\()_v_tbl) - 80b
+ .hword L(\type\()_\taps\()_v_tbl) - 40b
+ .hword L(\type\()_\taps\()_v_tbl) - 20b
.hword 0
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
cmp \h, #4
ubfx w10, \my, #7, #7
and \my, \my, #0x7f
@@ -2048,7 +2071,7 @@ L(\type\()_8tap_hv):
4:
add \xmy, x11, \my, uxtw #3
- adr x10, L(\type\()_8tap_hv_tbl)
+ adr x10, L(\type\()_\taps\()_hv_tbl)
dup v30.4s, w12 // 6 - intermediate_bits
ldrh w9, [x10, x9, lsl #1]
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -2089,7 +2112,7 @@ L(\type\()_8tap_hv):
addp v27.4s, v27.4s, v28.4s
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
 // The intermediates from the horizontal pass fit in 16 bits without
 // any bias; we could just as well keep them as .4s, but narrowing
 // them to .4h gives a significant speedup on out-of-order cores
@@ -2100,7 +2123,7 @@ L(\type\()_8tap_hv):
mov v17.8b, v24.8b
2:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v24.8b, #4
smull v2.4s, v16.4h, v1.h[0]
@@ -2143,20 +2166,28 @@ L(\type\()_8tap_hv):
 // them to .4h gives a significant speedup on out-of-order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
xtn v16.4h, v16.4s
trn1 v16.2s, v16.2s, v24.2s
mov v17.8b, v24.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v24.8b, #4
mov v19.8b, v24.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v20.8b, v19.8b, v24.8b, #4
mov v21.8b, v24.8b
28:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v22.8b, v21.8b, v24.8b, #4
+.ifc \taps, 6tap
+ smull v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+.else // 8tap
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
@@ -2165,6 +2196,7 @@ L(\type\()_8tap_hv):
smlal v3.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[6]
smlal v3.4s, v24.4h, v1.h[7]
+.endif
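+        // With a 6-tap filter the outer coefficients v1.h[0] and v1.h[7]
+        // are zero, so those two multiply-accumulates can be skipped.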
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
sqxtun v3.4h, v3.4s
@@ -2184,7 +2216,7 @@ L(\type\()_8tap_hv):
0:
ret x15
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
ld1 {v25.8h}, [\sr2], \s_strd
ld1 {v27.8h}, [\src], \s_strd
ext v26.16b, v25.16b, v25.16b, #2
@@ -2234,12 +2266,12 @@ L(\type\()_8tap_filter_2):
// (at the cost of a smaller slowdown on in-order cores such as A53).
xtn v16.4h, v16.4s
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v24.8b
mov v18.8b, v25.8b
4:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2272,8 +2304,13 @@ L(\type\()_8tap_filter_2):
480: // 4x8, 4x16, 4x32 hv
ld1 {v1.8b}, [\xmy]
sub \src, \src, #2
+.ifc \taps, 6tap
+ sub \sr2, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+.else
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
+.endif
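+        // The 6-tap filter only needs two rows above the current
+        // position; the 8-tap one needs three.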
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
@@ -2294,20 +2331,38 @@ L(\type\()_8tap_filter_2):
// any bias; we could just as well keep them as .4s, but narrowing
 // them to .4h gives a significant speedup on out-of-order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
+.ifc \taps, 6tap
+ xtn v18.4h, v16.4s
+.else
xtn v16.4h, v16.4s
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v24.8b
mov v18.8b, v25.8b
- bl L(\type\()_8tap_filter_4)
+.endif
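+        // 6-tap: the first horizontal row lands directly in v18; the
+        // 8-tap pipeline additionally primes v16 and v17 first.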
+ bl L(\type\()_\taps\()_filter_4)
mov v19.8b, v24.8b
mov v20.8b, v25.8b
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v21.8b, v24.8b
mov v22.8b, v25.8b
48:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+ smull v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v19.4h, v1.h[2]
+ smlal v3.4s, v20.4h, v1.h[3]
+ smlal v3.4s, v21.4h, v1.h[4]
+ smlal v3.4s, v22.4h, v1.h[5]
+ smlal v3.4s, v24.4h, v1.h[6]
+ smull v4.4s, v19.4h, v1.h[1]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal v4.4s, v24.4h, v1.h[5]
+ smlal v4.4s, v25.4h, v1.h[6]
+.else // 8tap
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
@@ -2324,6 +2379,7 @@ L(\type\()_8tap_filter_2):
smlal v4.4s, v22.4h, v1.h[5]
smlal v4.4s, v24.4h, v1.h[6]
smlal v4.4s, v25.4h, v1.h[7]
+.endif
.ifc \type, put
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
@@ -2339,8 +2395,10 @@ L(\type\()_8tap_filter_2):
st1 {v3.d}[0], [\dst], \d_strd
st1 {v3.d}[1], [\ds2], \d_strd
b.le 0f
+.ifc \taps, 8tap
mov v16.8b, v18.8b
mov v17.8b, v19.8b
+.endif
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
@@ -2350,7 +2408,7 @@ L(\type\()_8tap_filter_2):
0:
ret x15
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
ld1 {v24.8h}, [\sr2], \s_strd
ld1 {v25.8h}, [\src], \s_strd
ext v26.16b, v24.16b, v24.16b, #2
@@ -2411,14 +2469,14 @@ L(\type\()_8tap_filter_4):
// and conserves register space (no need to clobber v8-v15).
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v23.16b
mov v18.16b, v24.16b
8:
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2480,7 +2538,9 @@ L(\type\()_8tap_filter_4):
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
sub \src, \src, #6
+.ifc \taps, 8tap
sub \src, \src, \s_strd
+.endif
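+        // Back up three rows for 8-tap, two for 6-tap.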
sub \src, \src, \s_strd, lsl #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
@@ -2494,6 +2554,16 @@ L(\type\()_8tap_filter_4):
lsl \s_strd, \s_strd, #1
ld1 {v27.8h, v28.8h}, [\src], \s_strd
+.ifc \taps, 6tap
+ ext v26.16b, v27.16b, v28.16b, #2
+ smull v24.4s, v26.4h, v0.h[1]
+ smull2 v25.4s, v26.8h, v0.h[1]
+.irpc i, 23456
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
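+        // 6-tap horizontal: apply v0.h[1]..v0.h[6] to the pixels at
+        // offsets 1..6, skipping the zero outer taps v0.h[0] and v0.h[7].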
+.else // 8tap
smull v24.4s, v27.4h, v0.h[0]
smull2 v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
@@ -2501,6 +2571,7 @@ L(\type\()_8tap_filter_4):
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
+.endif
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
 // The intermediates from the horizontal pass fit in 16 bits without
@@ -2508,22 +2579,53 @@ L(\type\()_8tap_filter_4):
 // them to .4h gives a significant speedup on out-of-order cores
// (at the cost of a smaller slowdown on in-order cores such as A53),
// and conserves register space (no need to clobber v8-v15).
+.ifc \taps, 6tap
+ uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2
+.else
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v23.16b
mov v18.16b, v24.16b
- bl L(\type\()_8tap_filter_8)
+.endif
+ bl L(\type\()_\taps\()_filter_8)
mov v19.16b, v23.16b
mov v20.16b, v24.16b
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v21.16b, v23.16b
mov v22.16b, v24.16b
88:
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smull2 v3.4s, v18.8h, v1.h[1]
+ bl L(\type\()_\taps\()_filter_8)
+ smull v4.4s, v19.4h, v1.h[1]
+ smull2 v5.4s, v19.8h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal2 v3.4s, v19.8h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal2 v5.4s, v20.8h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal2 v3.4s, v20.8h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal2 v5.4s, v21.8h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal2 v3.4s, v21.8h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal2 v5.4s, v22.8h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal2 v3.4s, v22.8h, v1.h[5]
+ smlal v4.4s, v23.4h, v1.h[5]
+ smlal2 v5.4s, v23.8h, v1.h[5]
+ smlal v2.4s, v23.4h, v1.h[6]
+ smlal2 v3.4s, v23.8h, v1.h[6]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal2 v5.4s, v24.8h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2554,6 +2656,7 @@ L(\type\()_8tap_filter_4):
smlal2 v3.4s, v23.8h, v1.h[7]
smlal v4.4s, v24.4h, v1.h[7]
smlal2 v5.4s, v24.8h, v1.h[7]
+.endif
.ifc \type, put
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
@@ -2577,8 +2680,10 @@ L(\type\()_8tap_filter_4):
st1 {v2.8h}, [\dst], \d_strd
st1 {v3.8h}, [\ds2], \d_strd
b.le 9f
+.ifc \taps, 8tap
mov v16.16b, v18.16b
mov v17.16b, v19.16b
+.endif
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
@@ -2596,13 +2701,32 @@ L(\type\()_8tap_filter_4):
mov \h, \my
add \src, \src, #16
add \dst, \dst, #16
+.ifc \taps, 6tap
+ add \src, \src, \s_strd, lsl #1
+.endif
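+        // The 6-tap path reads two source rows less per column;
+        // re-align \src before starting on the next 16-pixel column.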
b 168b
0:
ret x15
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
ld1 {v4.8h, v5.8h}, [\sr2], \s_strd
ld1 {v6.8h, v7.8h}, [\src], \s_strd
+.ifc \taps, 6tap
+ ext v23.16b, v4.16b, v5.16b, #2
+ ext v24.16b, v6.16b, v7.16b, #2
+ smull v25.4s, v23.4h, v0.h[1]
+ smull2 v26.4s, v23.8h, v0.h[1]
+ smull v27.4s, v24.4h, v0.h[1]
+ smull2 v28.4s, v24.8h, v0.h[1]
+.irpc i, 23456
+ ext v23.16b, v4.16b, v5.16b, #(2*\i)
+ ext v24.16b, v6.16b, v7.16b, #(2*\i)
+ smlal v25.4s, v23.4h, v0.h[\i]
+ smlal2 v26.4s, v23.8h, v0.h[\i]
+ smlal v27.4s, v24.4h, v0.h[\i]
+ smlal2 v28.4s, v24.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v25.4s, v4.4h, v0.h[0]
smull2 v26.4s, v4.8h, v0.h[0]
smull v27.4s, v6.4h, v0.h[0]
@@ -2615,6 +2739,7 @@ L(\type\()_8tap_filter_8):
smlal v27.4s, v24.4h, v0.h[\i]
smlal2 v28.4s, v24.8h, v0.h[\i]
.endr
+.endif
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
@@ -2623,18 +2748,20 @@ L(\type\()_8tap_filter_8):
uzp1 v24.8h, v27.8h, v28.8h // Ditto
ret
-L(\type\()_8tap_hv_tbl):
- .hword L(\type\()_8tap_hv_tbl) - 1280b
- .hword L(\type\()_8tap_hv_tbl) - 640b
- .hword L(\type\()_8tap_hv_tbl) - 320b
- .hword L(\type\()_8tap_hv_tbl) - 160b
- .hword L(\type\()_8tap_hv_tbl) - 80b
- .hword L(\type\()_8tap_hv_tbl) - 40b
- .hword L(\type\()_8tap_hv_tbl) - 20b
+L(\type\()_\taps\()_hv_tbl):
+ .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+ .hword L(\type\()_\taps\()_hv_tbl) - 640b
+ .hword L(\type\()_\taps\()_hv_tbl) - 320b
+ .hword L(\type\()_\taps\()_hv_tbl) - 160b
+ .hword L(\type\()_\taps\()_hv_tbl) - 80b
+ .hword L(\type\()_\taps\()_hv_tbl) - 40b
+ .hword L(\type\()_\taps\()_hv_tbl) - 20b
.hword 0
endfunc
+.endm
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
ldr w8, [sp]
@@ -3236,8 +3363,34 @@ L(\type\()_bilin_hv_tbl):
endfunc
.endm
-filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
-filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn put, sharp, SHARP, SHARP, 8tap
+make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
+
+make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
+filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+
+make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
+make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
+
+make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
+filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+
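+// Combinations that include a SHARP filter use the 8-tap code path; the
+// regular/smooth-only combinations are instantiated as 6-tap, since the
+// first and last coefficients of those filters are zero and can be
+// skipped without changing the result.
+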
.macro load_filter_row dst, src, inc
asr w13, \src, #10
diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S
index 3a6cf900a9..7bef9243fb 100644
--- a/third_party/dav1d/src/arm/64/msac.S
+++ b/third_party/dav1d/src/arm/64/msac.S
@@ -208,60 +208,66 @@ L(renorm):
sub w4, w4, w3 // rng = u - v
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
- mvn x7, x7 // ~dif
- add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+ sub x7, x7, x3, lsl #48 // dif - (v << 48)
L(renorm2):
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
- lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
- mvn x7, x7 // ~dif
- b.hs 9f
+ b.hs 4f
// refill
ldp x3, x4, [x0] // BUF_POS, BUF_END
add x5, x3, #8
- cmp x5, x4
- b.gt 2f
-
- ldr x3, [x3] // next_bits
- add w8, w6, #23 // shift_bits = cnt + 23
- add w6, w6, #16 // cnt += 16
- rev x3, x3 // next_bits = bswap(next_bits)
- sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
- and w8, w8, #24 // shift_bits &= 24
- lsr x3, x3, x8 // next_bits >>= shift_bits
- sub w8, w8, w6 // shift_bits -= 16 + cnt
- str x5, [x0, #BUF_POS]
- lsl x3, x3, x8 // next_bits <<= shift_bits
- mov w4, #48
- sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
- eor x7, x7, x3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- mov w14, #40
- sub w5, w14, w6 // c = 40 - cnt
-3:
- cmp x3, x4
- b.ge 4f
- ldrb w8, [x3], #1
- lsl x8, x8, x5
- eor x7, x7, x8
- subs w5, w5, #8
- b.ge 3b
-
-4: // refill_eob_end
+ subs x5, x5, x4
+ b.hi 6f
+
+ ldr x8, [x3] // next_bits
+ add w4, w6, #-48 // shift_bits = cnt + 16 (- 64)
+ mvn x8, x8
+ neg w5, w4
+ rev x8, x8 // next_bits = bswap(next_bits)
+ lsr w5, w5, #3 // num_bytes_read
+ lsr x8, x8, x4 // next_bits >>= (shift_bits & 63)
+
+2: // refill_end
+ add x3, x3, x5
+ add w6, w6, w5, lsl #3 // cnt += num_bits_read
str x3, [x0, #BUF_POS]
- sub w6, w14, w5 // cnt = 40 - c
-9:
+3: // refill_end2
+ orr x7, x7, x8 // dif |= next_bits
+
+4: // end
str w6, [x0, #CNT]
str x7, [x0, #DIF]
mov w0, w15
add sp, sp, #48
ret
+
+5: // pad_with_ones
+ add w8, w6, #-16
+ ror x8, x8, x8
+ b 3b
+
+6: // refill_eob
+ cmp x3, x4
+ b.hs 5b
+
+ ldr x8, [x4, #-8]
+ lsl w5, w5, #3
+ lsr x8, x8, x5
+ add w5, w6, #-48
+ mvn x8, x8
+ sub w4, w4, w3 // num_bytes_left
+ rev x8, x8
+ lsr x8, x8, x5
+ neg w5, w5
+ lsr w5, w5, #3
+ cmp w5, w4
+ csel w5, w5, w4, lo // num_bytes_read
+ b 2b
endfunc
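
// A rough C model of the refill sequence above (and of the identical one
// in msac_decode_hi_tok_neon below). Illustrative sketch only: the helper
// name, the parameter plumbing and __builtin_bswap64 are our assumptions,
// not dav1d API. cnt is negative on entry; new bytes enter bit-inverted,
// and reads past the end of the data pad dif with one bits.
//
//     #include <stdint.h>
//     #include <string.h>
//
//     static void refill(uint64_t *dif, int *cnt,
//                        const uint8_t **pos, const uint8_t *end) {
//         int shift = *cnt + 16;              // insert position in dif
//         int nbytes = (64 - shift) >> 3;     // whole bytes that fit
//         uint64_t next;
//         if (*pos + 8 <= end) {              // fast path: load 8 bytes
//             memcpy(&next, *pos, 8);
//             next = __builtin_bswap64(~next) >> shift;
//         } else if (*pos < end) {            // refill_eob: reload from end-8
//             memcpy(&next, end - 8, 8);
//             next >>= 8 * (*pos + 8 - end);  // drop already-consumed bytes
//             next = __builtin_bswap64(~next) >> shift;
//             if (nbytes > end - *pos)        // clamp to num_bytes_left
//                 nbytes = (int)(end - *pos);
//         } else {                            // pad_with_ones, conceptually
//             next = ~(uint64_t)0 >> shift;
//             nbytes = 0;
//         }
//         *dif |= next;                       // dif |= next_bits
//         *pos += nbytes;                     // buf_pos += num_bytes_read
//         *cnt += nbytes << 3;                // cnt += num_bits_read
//     }
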
function msac_decode_symbol_adapt8_neon, export=1
@@ -334,54 +340,37 @@ function msac_decode_hi_tok_neon, export=1
sub w4, w4, w3 // rng = u - v
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
- mvn x7, x7 // ~dif
- add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+ sub x7, x7, x3, lsl #48 // dif - (v << 48)
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
- lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
dup v3.4h, w4
- mvn x7, x7 // ~dif
- b.hs 9f
+ b.hs 5f
// refill
ldp x3, x4, [x0] // BUF_POS, BUF_END
add x5, x3, #8
- cmp x5, x4
- b.gt 2f
-
- ldr x3, [x3] // next_bits
- add w8, w6, #23 // shift_bits = cnt + 23
- add w6, w6, #16 // cnt += 16
- rev x3, x3 // next_bits = bswap(next_bits)
- sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
- and w8, w8, #24 // shift_bits &= 24
- lsr x3, x3, x8 // next_bits >>= shift_bits
- sub w8, w8, w6 // shift_bits -= 16 + cnt
- str x5, [x0, #BUF_POS]
- lsl x3, x3, x8 // next_bits <<= shift_bits
- mov w4, #48
- sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
- eor x7, x7, x3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- mov w14, #40
- sub w5, w14, w6 // c = 40 - cnt
-3:
- cmp x3, x4
- b.ge 4f
- ldrb w8, [x3], #1
- lsl x8, x8, x5
- eor x7, x7, x8
- subs w5, w5, #8
- b.ge 3b
-
-4: // refill_eob_end
+ subs x5, x5, x4
+ b.hi 7f
+
+ ldr x8, [x3] // next_bits
+ add w4, w6, #-48 // shift_bits = cnt + 16 (- 64)
+ mvn x8, x8
+ neg w5, w4
+ rev x8, x8 // next_bits = bswap(next_bits)
+ lsr w5, w5, #3 // num_bytes_read
+ lsr x8, x8, x4 // next_bits >>= (shift_bits & 63)
+
+3: // refill_end
+ add x3, x3, x5
+ add w6, w6, w5, lsl #3 // cnt += num_bits_read
str x3, [x0, #BUF_POS]
- sub w6, w14, w5 // cnt = 40 - c
-9:
+4: // refill_end2
+ orr x7, x7, x8 // dif |= next_bits
+
+5: // end
lsl w15, w15, #1
sub w15, w15, #5
lsr x12, x7, #48
@@ -394,6 +383,29 @@ function msac_decode_hi_tok_neon, export=1
str x7, [x0, #DIF]
lsr w0, w13, #1
ret
+
+6: // pad_with_ones
+ add w8, w6, #-16
+ ror x8, x8, x8
+ b 4b
+
+7: // refill_eob
+ cmp x3, x4
+ b.hs 6b
+
+ ldr x8, [x4, #-8]
+ lsl w5, w5, #3
+ lsr x8, x8, x5
+ add w5, w6, #-48
+ mvn x8, x8
+ sub w4, w4, w3 // num_bytes_left
+ rev x8, x8
+ lsr x8, x8, x5
+ neg w5, w5
+ lsr w5, w5, #3
+ cmp w5, w4
+ csel w5, w5, w4, lo // num_bytes_read
+ b 3b
endfunc
function msac_decode_bool_equi_neon, export=1
@@ -410,7 +422,6 @@ function msac_decode_bool_equi_neon, export=1
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
clz w5, w4 // clz(rng)
- mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
endfunc
@@ -431,7 +442,6 @@ function msac_decode_bool_neon, export=1
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
clz w5, w4 // clz(rng)
- mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
endfunc
@@ -455,7 +465,6 @@ function msac_decode_bool_adapt_neon, export=1
ldr w10, [x0, #ALLOW_UPDATE_CDF]
clz w5, w4 // clz(rng)
- mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
cbz w10, L(renorm2)
diff --git a/third_party/dav1d/src/arm/64/util.S b/third_party/dav1d/src/arm/64/util.S
index 9013fd4b1e..1b3f319ce5 100644
--- a/third_party/dav1d/src/arm/64/util.S
+++ b/third_party/dav1d/src/arm/64/util.S
@@ -32,6 +32,10 @@
#include "config.h"
#include "src/arm/asm.S"
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
.macro movrel rd, val, offset=0
#if defined(__APPLE__)
.if \offset < 0
@@ -51,6 +55,10 @@
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
.endif
+#elif __has_feature(hwaddress_sanitizer)
+ adrp \rd, :pg_hi21_nc:\val+(\offset)
+ movk \rd, #:prel_g3:\val+0x100000000
+ add \rd, \rd, :lo12:\val+(\offset)
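+// With hwasan, globals may carry a pointer tag in the top address byte;
+// a plain adrp+add would leave it unset, so the movk re-materializes
+// bits 48-63 (and :pg_hi21_nc: skips the overflow check those bits
+// would otherwise trip).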
#elif defined(PIC)
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
@@ -149,6 +157,35 @@
trn2 \r7\().2d, \t9\().2d, \r7\().2d
.endm
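+// As transpose_8x8h, but writing the transposed rows to the separate
+// output registers \o0-\o7 (\r0-\r7 and the scratch registers are
+// clobbered).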
+.macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7
+ trn1 \t8\().8h, \r0\().8h, \r1\().8h
+ trn2 \t9\().8h, \r0\().8h, \r1\().8h
+ trn1 \r1\().8h, \r2\().8h, \r3\().8h
+ trn2 \r3\().8h, \r2\().8h, \r3\().8h
+ trn1 \r0\().8h, \r4\().8h, \r5\().8h
+ trn2 \r5\().8h, \r4\().8h, \r5\().8h
+ trn1 \r2\().8h, \r6\().8h, \r7\().8h
+ trn2 \r7\().8h, \r6\().8h, \r7\().8h
+
+ trn1 \r4\().4s, \r0\().4s, \r2\().4s
+ trn2 \r2\().4s, \r0\().4s, \r2\().4s
+ trn1 \r6\().4s, \r5\().4s, \r7\().4s
+ trn2 \r7\().4s, \r5\().4s, \r7\().4s
+ trn1 \r5\().4s, \t9\().4s, \r3\().4s
+ trn2 \t9\().4s, \t9\().4s, \r3\().4s
+ trn1 \r3\().4s, \t8\().4s, \r1\().4s
+ trn2 \t8\().4s, \t8\().4s, \r1\().4s
+
+ trn1 \o0\().2d, \r3\().2d, \r4\().2d
+ trn2 \o4\().2d, \r3\().2d, \r4\().2d
+ trn1 \o1\().2d, \r5\().2d, \r6\().2d
+ trn2 \o5\().2d, \r5\().2d, \r6\().2d
+ trn2 \o6\().2d, \t8\().2d, \r2\().2d
+ trn1 \o2\().2d, \t8\().2d, \r2\().2d
+ trn1 \o3\().2d, \t9\().2d, \r7\().2d
+ trn2 \o7\().2d, \t9\().2d, \r7\().2d
+.endm
+
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
trn1 \t8\().16b, \r0\().16b, \r1\().16b
trn2 \t9\().16b, \r0\().16b, \r1\().16b
@@ -226,4 +263,16 @@
trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm
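+// As transpose_4x8h, but writing the result to \o0-\o3, leaving
+// \r0-\r3 unmodified.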
+.macro transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3
+ trn1 \t4\().8h, \r0\().8h, \r1\().8h
+ trn2 \t5\().8h, \r0\().8h, \r1\().8h
+ trn1 \t6\().8h, \r2\().8h, \r3\().8h
+ trn2 \t7\().8h, \r2\().8h, \r3\().8h
+
+ trn1 \o0\().4s, \t4\().4s, \t6\().4s
+ trn2 \o2\().4s, \t4\().4s, \t6\().4s
+ trn1 \o1\().4s, \t5\().4s, \t7\().4s
+ trn2 \o3\().4s, \t5\().4s, \t7\().4s
+.endm
+
#endif /* DAV1D_SRC_ARM_64_UTIL_S */
diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S
index dc50415f1f..fed73b3048 100644
--- a/third_party/dav1d/src/arm/asm.S
+++ b/third_party/dav1d/src/arm/asm.S
@@ -34,6 +34,50 @@
#define x18 do_not_use_x18
#define w18 do_not_use_w18
+#if HAVE_AS_ARCH_DIRECTIVE
+ .arch AS_ARCH_LEVEL
+#endif
+
+#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
+#define ENABLE_DOTPROD .arch_extension dotprod
+#define DISABLE_DOTPROD .arch_extension nodotprod
+#else
+#define ENABLE_DOTPROD
+#define DISABLE_DOTPROD
+#endif
+#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
+#define ENABLE_I8MM .arch_extension i8mm
+#define DISABLE_I8MM .arch_extension noi8mm
+#else
+#define ENABLE_I8MM
+#define DISABLE_I8MM
+#endif
+#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
+#define ENABLE_SVE .arch_extension sve
+#define DISABLE_SVE .arch_extension nosve
+#else
+#define ENABLE_SVE
+#define DISABLE_SVE
+#endif
+#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
+#define ENABLE_SVE2 .arch_extension sve2
+#define DISABLE_SVE2 .arch_extension nosve2
+#else
+#define ENABLE_SVE2
+#define DISABLE_SVE2
+#endif
+
+/* If we do support the .arch_extension directives, disable support for all
+ * the extensions that we may use, in case they were implicitly enabled by
+ * the .arch level. This makes it clear if we try to assemble an instruction
+ * from an unintended extension set; we only allow assembling such instructions
+ * within regions where we explicitly enable those extensions. */
+DISABLE_DOTPROD
+DISABLE_I8MM
+DISABLE_SVE
+DISABLE_SVE2
+
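+/* Hypothetical usage sketch (the function name is ours): a routine that
+ * needs one of these extensions brackets itself with the matching pair:
+ *
+ *     ENABLE_DOTPROD
+ *     function mc_dotprod_example_neon, export=1
+ *         udot    v0.4s, v1.16b, v2.16b
+ *         ret
+ *     endfunc
+ *     DISABLE_DOTPROD
+ */
+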
+
/* Support macros for
* - Armv8.3-A Pointer Authentication and
* - Armv8.5-A Branch Target Identification
diff --git a/third_party/dav1d/src/arm/cpu.c b/third_party/dav1d/src/arm/cpu.c
index b7a0d3adbc..d9b1751a6a 100644
--- a/third_party/dav1d/src/arm/cpu.c
+++ b/third_party/dav1d/src/arm/cpu.c
@@ -31,22 +31,95 @@
#include "src/arm/cpu.h"
-#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
-// NEON is always available; runtime tests are not needed.
-#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
#include <sys/auxv.h>
+#if ARCH_AARCH64
+
+#define HWCAP_AARCH64_ASIMDDP (1 << 20)
+#define HWCAP_AARCH64_SVE (1 << 22)
+#define HWCAP2_AARCH64_SVE2 (1 << 1)
+#define HWCAP2_AARCH64_I8MM (1 << 13)
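+// Defined locally (with an explicit AARCH64 prefix) so this builds
+// against older libc/kernel headers; kernels that predate a feature
+// simply leave the corresponding HWCAP bit clear.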
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+#ifdef HAVE_GETAUXVAL
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+ unsigned long hw_cap2 = getauxval(AT_HWCAP2);
+#else
+ unsigned long hw_cap = 0;
+ unsigned long hw_cap2 = 0;
+ elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+ elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
+#endif
+
+ unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+ flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+ flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+ flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0;
+ flags |= (hw_cap2 & HWCAP2_AARCH64_SVE2) ? DAV1D_ARM_CPU_FLAG_SVE2 : 0;
+ return flags;
+}
+#else /* !ARCH_AARCH64 */
+
#ifndef HWCAP_ARM_NEON
-#define HWCAP_ARM_NEON (1 << 12)
+#define HWCAP_ARM_NEON (1 << 12)
#endif
-#define NEON_HWCAP HWCAP_ARM_NEON
+#define HWCAP_ARM_ASIMDDP (1 << 24)
+#define HWCAP_ARM_I8MM (1 << 27)
-#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
-#include <sys/auxv.h>
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+#ifdef HAVE_GETAUXVAL
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+#else
+ unsigned long hw_cap = 0;
+ elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+#endif
+
+ unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+ flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+ flags |= (hw_cap & HWCAP_ARM_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+ return flags;
+}
+#endif /* ARCH_AARCH64 */
+
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+
+static int have_feature(const char *feature) {
+ int supported = 0;
+ size_t size = sizeof(supported);
+ if (sysctlbyname(feature, &supported, &size, NULL, 0) != 0) {
+ return 0;
+ }
+ return supported;
+}
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+ if (have_feature("hw.optional.arm.FEAT_DotProd"))
+ flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+ if (have_feature("hw.optional.arm.FEAT_I8MM"))
+ flags |= DAV1D_ARM_CPU_FLAG_I8MM;
+ /* No SVE or SVE2 feature detection is available on Apple platforms. */
+ return flags;
+}
+
+#elif defined(_WIN32)
+#include <windows.h>
-#define NEON_HWCAP HWCAP_NEON
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
+ if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
+ flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+#endif
+ /* No I8MM or SVE feature detection is available on Windows at the time of
+ * writing. */
+ return flags;
+}
#elif defined(__ANDROID__)
+#include <ctype.h>
#include <stdio.h>
#include <string.h>
@@ -58,18 +131,25 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
char line_buffer[120];
const char *line;
+ size_t flaglen = strlen(flag);
while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
- if (strstr(line, flag)) {
- fclose(file);
- return 1;
+ // check all occurrences as whole words
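+ // (e.g. a search for "sve" must not match the separate "sve2" flag)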
+ const char *found = line;
+ while ((found = strstr(found, flag))) {
+ if ((found == line_buffer || !isgraph(found[-1])) &&
+ (isspace(found[flaglen]) || feof(file))) {
+ fclose(file);
+ return 1;
+ }
+ found += flaglen;
}
 // if the line is incomplete, seek back to avoid splitting the search
// string into two buffers
- if (!strchr(line, '\n') && strlen(line) > strlen(flag)) {
+ if (!strchr(line, '\n') && strlen(line) > flaglen) {
// use fseek since the 64 bit fseeko is only available since
// Android API level 24 and meson defines _FILE_OFFSET_BITS
 // to 64 by default
- if (fseek(file, -strlen(flag), SEEK_CUR))
+ if (fseek(file, -flaglen, SEEK_CUR))
break;
}
}
@@ -78,22 +158,23 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
return 0;
}
-#endif
COLD unsigned dav1d_get_cpu_flags_arm(void) {
- unsigned flags = 0;
-#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
- flags |= DAV1D_ARM_CPU_FLAG_NEON;
-#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
- unsigned long hw_cap = getauxval(AT_HWCAP);
- flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
- unsigned long hw_cap = 0;
- elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
- flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#elif defined(__ANDROID__)
- flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#endif
-
+ unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+ flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+ flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+ flags |= parse_proc_cpuinfo("i8mm") ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+#if ARCH_AARCH64
+ flags |= parse_proc_cpuinfo("sve") ? DAV1D_ARM_CPU_FLAG_SVE : 0;
+ flags |= parse_proc_cpuinfo("sve2") ? DAV1D_ARM_CPU_FLAG_SVE2 : 0;
+#endif /* ARCH_AARCH64 */
return flags;
}
+
+#else /* Unsupported OS */
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ return 0;
+}
+
+#endif
diff --git a/third_party/dav1d/src/arm/cpu.h b/third_party/dav1d/src/arm/cpu.h
index 8c10a1b6b0..de9bde6ccf 100644
--- a/third_party/dav1d/src/arm/cpu.h
+++ b/third_party/dav1d/src/arm/cpu.h
@@ -30,6 +30,10 @@
enum CpuFlags {
DAV1D_ARM_CPU_FLAG_NEON = 1 << 0,
+ DAV1D_ARM_CPU_FLAG_DOTPROD = 1 << 1,
+ DAV1D_ARM_CPU_FLAG_I8MM = 1 << 2,
+ DAV1D_ARM_CPU_FLAG_SVE = 1 << 3,
+ DAV1D_ARM_CPU_FLAG_SVE2 = 1 << 4,
};
unsigned dav1d_get_cpu_flags_arm(void);
diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h
index 2ecd086b3b..17234e027a 100644
--- a/third_party/dav1d/src/arm/itx.h
+++ b/third_party/dav1d/src/arm/itx.h
@@ -117,9 +117,11 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+ assign_itx_fn( , 4, 4, wht_wht, WHT_WHT, neon);
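+ // The 4x4 WHT is used for lossless coding, so it is assigned for
+ // every bitdepth/bpc combination, ahead of the bpc check below.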
+
if (BITDEPTH == 16 && bpc != 10) return;
- assign_itx17_fn( , 4, 4, neon);
+ assign_itx16_fn( , 4, 4, neon);
assign_itx16_fn(R, 4, 8, neon);
assign_itx16_fn(R, 4, 16, neon);
assign_itx16_fn(R, 8, 4, neon);
diff --git a/third_party/dav1d/src/arm/msac.h b/third_party/dav1d/src/arm/msac.h
index 9db0bf86ae..6eee0da424 100644
--- a/third_party/dav1d/src/arm/msac.h
+++ b/third_party/dav1d/src/arm/msac.h
@@ -39,7 +39,7 @@ unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
-#if ARCH_AARCH64 || defined(__ARM_NEON)
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon