summaryrefslogtreecommitdiffstats
path: root/third_party/dav1d/src/arm/32
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- third_party/dav1d/src/arm/32/itx.S 79
-rw-r--r-- third_party/dav1d/src/arm/32/itx16.S 19
-rw-r--r-- third_party/dav1d/src/arm/32/msac.S 167
3 files changed, 155 insertions, 110 deletions
diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S
index ceea025e45..9ba1df7a68 100644
--- a/third_party/dav1d/src/arm/32/itx.S
+++ b/third_party/dav1d/src/arm/32/itx.S
@@ -965,6 +965,8 @@ function inv_txfm_\variant\()add_8x8_neon
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
+
+ b L(itx_8x8_epilog)
.else
blx r4
@@ -976,8 +978,8 @@ function inv_txfm_\variant\()add_8x8_neon
vrshr.s16 q13, q13, #1
vrshr.s16 q14, q14, #1
vrshr.s16 q15, q15, #1
-.endif
+L(itx_8x8_epilog):
transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
blx r5
@@ -985,11 +987,12 @@ function inv_txfm_\variant\()add_8x8_neon
load_add_store_8x8 r0, r7
vpop {q4-q7}
pop {r4-r5,r7,pc}
+.endif
endfunc
.endm
-def_fn_8x8_base
def_fn_8x8_base identity_
+def_fn_8x8_base
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
@@ -1444,14 +1447,16 @@ function inv_txfm_horz\suffix\()_16x4_neon
.else
identity_4x16_shift1 d0[0]
.endif
+ b L(horz_16x4_epilog)
.else
blx r4
-.endif
-.if \shift > 0
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
vrshr.s16 \i, \i, #\shift
.endr
-.endif
+.if \shift == 1
+ b L(horz_16x4_epilog)
+.else
+L(horz_16x4_epilog):
transpose_4x4h q8, q9, d16, d17, d18, d19
transpose_4x4h q10, q11, d20, d21, d22, d23
transpose_4x4h q12, q13, d24, d25, d26, d27
@@ -1462,13 +1467,15 @@ function inv_txfm_horz\suffix\()_16x4_neon
.endr
pop {pc}
+.endif
+.endif
endfunc
.endm
-def_horz_16 scale=0, identity=0, shift=2
-def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
-def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=0, shift=2
function inv_txfm_add_vert_4x16_neon
push {lr}
@@ -1597,6 +1604,8 @@ function inv_txfm_\variant\()add_16x4_neon
.endr
identity_4x16_shift1 d0[0]
+
+ b L(itx_16x4_epilog)
.else
vmov.i16 q2, #0
vmov.i16 q3, #0
@@ -1615,30 +1624,25 @@ function inv_txfm_\variant\()add_16x4_neon
vswp d19, d22
vswp d18, d20
vswp d19, d21
-.irp i, q8, q9, q10, q11
+ vswp d25, d28
+ vswp d27, d30
+ vswp d26, d28
+ vswp d27, d29
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
vrshr.s16 \i, \i, #1
.endr
-.endif
+
+L(itx_16x4_epilog):
transpose_4x8h q8, q9, q10, q11
blx r5
mov r6, r0
load_add_store_8x4 r6, r7
-.ifc \variant, identity_
vmov q8, q12
vmov q9, q13
vmov q10, q14
vmov q11, q15
-.else
- vswp d25, d28
- vswp d27, d30
- vswp d26, d28
- vswp d27, d29
- vrshr.s16 q8, q12, #1
- vrshr.s16 q9, q13, #1
- vrshr.s16 q10, q14, #1
- vrshr.s16 q11, q15, #1
-.endif
+
transpose_4x8h q8, q9, q10, q11
blx r5
add r6, r0, #8
@@ -1646,6 +1650,7 @@ function inv_txfm_\variant\()add_16x4_neon
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
function inv_txfm_\variant\()add_4x16_neon
@@ -1696,12 +1701,14 @@ function inv_txfm_\variant\()add_4x16_neon
movw r12, #(5793-4096)*8
vdup.16 d0, r12
identity_8x4_shift1 q8, q9, q10, q11, d0[0]
+
+ b L(itx_4x16_epilog)
.else
blx r4
.irp i, q8, q9, q10, q11
vrshr.s16 \i, \i, #1
.endr
-.endif
+L(itx_4x16_epilog):
transpose_4x8h q8, q9, q10, q11
vswp d19, d21
vswp d18, d20
@@ -1714,11 +1721,12 @@ function inv_txfm_\variant\()add_4x16_neon
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
.endm
-def_fn_416_base
def_fn_416_base identity_
+def_fn_416_base
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1728,11 +1736,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
.if \w == 4
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon
+.endif
movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon
mov r10, #\eob_half
.else
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon
+.endif
movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
@@ -1765,8 +1777,7 @@ def_fn_416 \w, \h, identity, flipadst, 32
def_fns_416 4, 16
def_fns_416 16, 4
-.macro def_fn_816_base variant
-function inv_txfm_\variant\()add_16x8_neon
+function inv_txfm_add_16x8_neon
sub_sp_align 256
.irp i, 0, 4
@@ -1805,6 +1816,7 @@ function inv_txfm_\variant\()add_16x8_neon
pop {r4-r11,pc}
endfunc
+.macro def_fn_816_base variant
function inv_txfm_\variant\()add_8x16_neon
sub_sp_align 256
@@ -1849,6 +1861,10 @@ function inv_txfm_\variant\()add_8x16_neon
.endr
2:
+.ifc \variant, identity_
+ b L(itx_8x16_epilog)
+.else
+L(itx_8x16_epilog):
.irp i, 0, 4
add r6, r0, #(\i)
add r7, sp, #(\i*2)
@@ -1859,11 +1875,18 @@ function inv_txfm_\variant\()add_8x16_neon
add_sp_align 256
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
.endm
-def_fn_816_base
def_fn_816_base identity_
+def_fn_816_base
+
+/* Define the transform names as distinct numeric symbols: def_fn_816 compares txfm1 against identity in a numeric .if expression */
+.equ dct, 1
+.equ identity, 2
+.equ adst, 3
+.equ flipadst, 4
.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1873,7 +1896,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
.if \w == 8
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_8h_x8_neon
+.endif
movrel_local r5, inv_\txfm2\()_4h_x16_neon
.else
.ifc \txfm1, identity
@@ -1889,7 +1914,7 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.else
mov r10, #\eob_4x4
.endif
-.ifc \txfm1, identity
+.if \w == 8 && \txfm1 == identity
b inv_txfm_identity_add_\w\()x\h\()_neon
.else
b inv_txfm_add_\w\()x\h\()_neon
diff --git a/third_party/dav1d/src/arm/32/itx16.S b/third_party/dav1d/src/arm/32/itx16.S
index aa6c272e71..7691272517 100644
--- a/third_party/dav1d/src/arm/32/itx16.S
+++ b/third_party/dav1d/src/arm/32/itx16.S
@@ -547,11 +547,11 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
vmov.i16 q15, #0
vld1.32 {q8, q9}, [r2, :128]
vst1.32 {q14, q15}, [r2, :128]!
- vshr.s16 q8, q8, #2
+ vshr.s32 q8, q8, #2
vld1.32 {q10, q11}, [r2, :128]
- vshr.s16 q9, q9, #2
- vshr.s16 q10, q10, #2
- vshr.s16 q11, q11, #2
+ vshr.s32 q9, q9, #2
+ vshr.s32 q10, q10, #2
+ vshr.s32 q11, q11, #2
iwht4
@@ -598,7 +598,9 @@ function inv_txfm_add_4x4_neon
vld1.16 {d3}, [r0, :64], r1
L(itx_4x4_end):
- vmvn.i16 q15, #0xfc00 // 0x3ff
+ // read bitdepth_max from the stack of the caller
+ ldr r4, [sp, #44]
+ vdup.i16 q15, r4
sub r0, r0, r1, lsl #2
vqadd.s16 q8, q8, q0
vqadd.s16 q9, q9, q1
@@ -1487,6 +1489,10 @@ function inv_txfm_horz\suffix\()_16x2_neon
vqrshrn.s32 d21, q13, #\shift
vqrshrn.s32 d22, q14, #\shift
vqrshrn.s32 d23, q15, #\shift
+.if \scale
+ b L(horz_16x2_epilog)
+.else
+L(horz_16x2_epilog):
vuzp.16 q8, q9
vuzp.16 q10, q11
@@ -1495,11 +1501,12 @@ function inv_txfm_horz\suffix\()_16x2_neon
.endr
pop {pc}
+.endif
endfunc
.endm
-def_horz_16 scale=0, shift=2
def_horz_16 scale=1, shift=1, suffix=_scale
+def_horz_16 scale=0, shift=2
function inv_txfm_add_vert_4x16_neon
push {lr}
diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S
index b06e109dda..b16957fb7e 100644
--- a/third_party/dav1d/src/arm/32/msac.S
+++ b/third_party/dav1d/src/arm/32/msac.S
@@ -279,60 +279,67 @@ L(renorm):
sub r4, r4, r3 // rng = u - v
clz r5, r4 // clz(rng)
eor r5, r5, #16 // d = clz(rng) ^ 16
- mvn r7, r7 // ~dif
- add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ sub r7, r7, r3, lsl #16 // dif - (v << 16)
L(renorm2):
lsl r4, r4, r5 // rng << d
subs r6, r6, r5 // cnt -= d
- lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ lsl r7, r7, r5 // (dif - (v << 16)) << d
str r4, [r0, #RNG]
- mvn r7, r7 // ~dif
- bhs 9f
+ bhs 4f
// refill
ldr r3, [r0, #BUF_POS] // BUF_POS
ldr r4, [r0, #BUF_END] // BUF_END
add r5, r3, #4
- cmp r5, r4
- bgt 2f
-
- ldr r3, [r3] // next_bits
- add r8, r6, #23 // shift_bits = cnt + 23
- add r6, r6, #16 // cnt += 16
- rev r3, r3 // next_bits = bswap(next_bits)
- sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
- and r8, r8, #24 // shift_bits &= 24
- lsr r3, r3, r8 // next_bits >>= shift_bits
- sub r8, r8, r6 // shift_bits -= 16 + cnt
- str r5, [r0, #BUF_POS]
- lsl r3, r3, r8 // next_bits <<= shift_bits
- rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
- eor r7, r7, r3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- rsb r5, r6, #8 // c = 8 - cnt
-3:
- cmp r3, r4
- bge 4f
- ldrb r8, [r3], #1
- lsl r8, r8, r5
- eor r7, r7, r8
- subs r5, r5, #8
- bge 3b
-
-4: // refill_eob_end
+ subs r5, r5, r4
+ bhi 6f
+
+ ldr r8, [r3] // next_bits
+ rsb r5, r6, #16
+ add r4, r6, #16 // shift_bits = cnt + 16
+ mvn r8, r8
+ lsr r5, r5, #3 // num_bytes_read
+ rev r8, r8 // next_bits = bswap(next_bits)
+ lsr r8, r8, r4 // next_bits >>= shift_bits
+
+2: // refill_end
+ add r3, r3, r5
+ add r6, r6, r5, lsl #3 // cnt += num_bits_read
str r3, [r0, #BUF_POS]
- rsb r6, r5, #8 // cnt = 8 - c
-9:
+3: // refill_end2
+ orr r7, r7, r8 // dif |= next_bits
+
+4: // end
str r6, [r0, #CNT]
str r7, [r0, #DIF]
-
mov r0, lr
add sp, sp, #48
-
pop {r4-r10,pc}
+
+5: // pad_with_ones
+ add r8, r6, #-240
+ lsr r8, r8, r8
+ b 3b
+
+6: // refill_eob
+ cmp r3, r4
+ bhs 5b
+
+ ldr r8, [r4, #-4]
+ lsl r5, r5, #3
+ lsr r8, r8, r5
+ add r5, r6, #16
+ mvn r8, r8
+ sub r4, r4, r3 // num_bytes_left
+ rev r8, r8
+ lsr r8, r8, r5
+ rsb r5, r6, #16
+ lsr r5, r5, #3
+ cmp r5, r4
+ it hs
+ movhs r5, r4
+ b 2b
endfunc
function msac_decode_symbol_adapt8_neon, export=1
@@ -414,53 +421,38 @@ function msac_decode_hi_tok_neon, export=1
sub r4, r4, r3 // rng = u - v
clz r5, r4 // clz(rng)
eor r5, r5, #16 // d = clz(rng) ^ 16
- mvn r7, r7 // ~dif
- add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ sub r7, r7, r3, lsl #16 // dif - (v << 16)
lsl r4, r4, r5 // rng << d
subs r6, r6, r5 // cnt -= d
- lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ lsl r7, r7, r5 // (dif - (v << 16)) << d
str r4, [r0, #RNG]
vdup.16 d1, r4
- mvn r7, r7 // ~dif
- bhs 9f
+ bhs 5f
// refill
ldr r3, [r0, #BUF_POS] // BUF_POS
ldr r4, [r0, #BUF_END] // BUF_END
add r5, r3, #4
- cmp r5, r4
- bgt 2f
-
- ldr r3, [r3] // next_bits
- add r8, r6, #23 // shift_bits = cnt + 23
- add r6, r6, #16 // cnt += 16
- rev r3, r3 // next_bits = bswap(next_bits)
- sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
- and r8, r8, #24 // shift_bits &= 24
- lsr r3, r3, r8 // next_bits >>= shift_bits
- sub r8, r8, r6 // shift_bits -= 16 + cnt
- str r5, [r0, #BUF_POS]
- lsl r3, r3, r8 // next_bits <<= shift_bits
- rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
- eor r7, r7, r3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- rsb r5, r6, #8 // c = 40 - cnt
-3:
- cmp r3, r4
- bge 4f
- ldrb r8, [r3], #1
- lsl r8, r8, r5
- eor r7, r7, r8
- subs r5, r5, #8
- bge 3b
-
-4: // refill_eob_end
+ subs r5, r5, r4
+ bhi 7f
+
+ ldr r8, [r3] // next_bits
+ rsb r5, r6, #16
+ add r4, r6, #16 // shift_bits = cnt + 16
+ mvn r8, r8
+ lsr r5, r5, #3 // num_bytes_read
+ rev r8, r8 // next_bits = bswap(next_bits)
+ lsr r8, r8, r4 // next_bits >>= shift_bits
+
+3: // refill_end
+ add r3, r3, r5
+ add r6, r6, r5, lsl #3 // cnt += num_bits_read
str r3, [r0, #BUF_POS]
- rsb r6, r5, #8 // cnt = 40 - c
-9:
+4: // refill_end2
+ orr r7, r7, r8 // dif |= next_bits
+
+5: // end
lsl lr, lr, #1
sub lr, lr, #5
lsr r12, r7, #16
@@ -473,6 +465,30 @@ function msac_decode_hi_tok_neon, export=1
str r7, [r0, #DIF]
lsr r0, r2, #1
pop {r4-r10,pc}
+
+6: // pad_with_ones
+ add r8, r6, #-240
+ lsr r8, r8, r8
+ b 4b
+
+7: // refill_eob
+ cmp r3, r4
+ bhs 6b
+
+ ldr r8, [r4, #-4]
+ lsl r5, r5, #3
+ lsr r8, r8, r5
+ add r5, r6, #16
+ mvn r8, r8
+ sub r4, r4, r3 // num_bytes_left
+ rev r8, r8
+ lsr r8, r8, r5
+ rsb r5, r6, #16
+ lsr r5, r5, #3
+ cmp r5, r4
+ it hs
+ movhs r5, r4
+ b 3b
endfunc
function msac_decode_bool_equi_neon, export=1
@@ -493,7 +509,6 @@ function msac_decode_bool_equi_neon, export=1
movhs r7, r8 // if (ret) dif = dif - vw;
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
b L(renorm2)
@@ -519,7 +534,6 @@ function msac_decode_bool_neon, export=1
movhs r7, r8 // if (ret) dif = dif - vw;
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
b L(renorm2)
@@ -549,7 +563,6 @@ function msac_decode_bool_adapt_neon, export=1
cmp r10, #0
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2