/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
.byte 1 * \stride + 0, 2 * \stride + 0
.byte 1 * \stride + 0, 2 * \stride - 1
// Repeated, to avoid & 7
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
endconst
.endm
.macro tables
dir_table 8, 16
dir_table 4, 8
const pri_taps
.byte 4, 2, 3, 3
endconst
.endm
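
// load_px loads the two taps p0 and p1 at x +/- off for one block of rows.
// For w == 8 a full 8-element row is loaded per register; for w == 4 two
// consecutive 4-element rows (tmp stride of 8 elements) are packed into the
// low and high halves of each vector, matching how px itself is loaded below.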
.macro load_px d1, d2, w
.if \w == 8
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().8h}, [x6] // p0
ld1 {\d2\().8h}, [x9] // p1
.else
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().4h}, [x6] // p0
add x6, x6, #2*8 // += stride
ld1 {\d2\().4h}, [x9] // p1
add x9, x9, #2*8 // += stride
ld1 {\d1\().d}[1], [x6] // p0
ld1 {\d2\().d}[1], [x9] // p1
.endif
.endm
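
// handle_pixel applies one pair of taps (primary or secondary). A rough C
// sketch of what each lane computes, following dav1d's scalar constrain():
//   diff = p - px;
//   clip = imax(0, threshold - (abs(diff) >> shift));
//   sum += taps[k] * iclip(diff, -clip, clip);
// The \shift argument actually holds -shift, so "ushl" performs the right
// shift, and uqsub provides the imax(0, ...) for free. With \min set, the
// running min/max over all sampled taps is kept in v2/v3 for the final clamp.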
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
.if \min
umin v2.8h, v2.8h, \s1\().8h
smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
smax v3.8h, v3.8h, \s2\().8h
.endif
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
neg v16.8h, v17.8h // -clip
neg v20.8h, v21.8h // -clip
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
dup v19.8h, \tap // taps[k]
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
.endm
// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h, size_t edges);
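//
// A loose C sketch of the per-pixel update performed below (following the
// inline comments; the pri and sec parts are only compiled in when the
// corresponding macro arguments are set):
//   int sum = 0;
//   for (int k = 0; k < 2; k++) {
//       sum += pri_taps[k] * constrain(p0 - px, pri_strength, pri_shift);
//       sum += pri_taps[k] * constrain(p1 - px, pri_strength, pri_shift);
//       // plus four secondary taps, weighted by sec_taps[k] = 2 - k
//   }
//   dst = px + ((8 + sum - (sum < 0)) >> 4); // then clamped to the min/max
//                                            // of the sampled taps (\min)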
.macro filter_func w, bpc, pri, sec, min, suffix
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
.if \bpc == 8
ldr w8, [sp] // edges
cmp w8, #0xf
b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
.endif
.if \pri
.if \bpc == 16
ldr w9, [sp, #8] // bitdepth_max
clz w9, w9
sub w9, w9, #24 // -bitdepth_min_8
neg w9, w9 // bitdepth_min_8
.endif
movrel x8, pri_taps
.if \bpc == 16
lsr w9, w3, w9 // pri_strength >> bitdepth_min_8
and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1
.else
and w9, w3, #1
.endif
add x8, x8, w9, uxtw #1
.endif
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.4h, #15
dup v28.4h, w6 // damping
.if \pri
dup v25.8h, w3 // threshold
.endif
.if \sec
dup v27.8h, w4 // threshold
.endif
trn1 v24.4h, v25.4h, v27.4h
clz v24.4h, v24.4h // clz(threshold)
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
neg v24.4h, v24.4h // -shift
.if \sec
dup v26.8h, v24.h[1]
.endif
.if \pri
dup v24.8h, v24.h[0]
.endif
1:
.if \w == 8
ld1 {v0.8h}, [x2] // px
.else
add x12, x2, #2*8
ld1 {v0.4h}, [x2] // px
ld1 {v0.d}[1], [x12] // px
.endif
movi v1.8h, #0 // sum
.if \min
mov v2.16b, v0.16b // min
mov v3.16b, v0.16b // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov w11, #2 // sec_taps[0]
2:
.if \pri
ldrb w9, [x5] // off1
load_px v4, v5, \w
.endif
.if \sec
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px v6, v7, \w
.endif
.if \pri
ldrb w10, [x8] // *pri_taps
handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
.endif
.if \sec
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px v4, v5, \w
handle_pixel v6, v7, v27.8h, v26.8h, w11, \min
handle_pixel v4, v5, v27.8h, v26.8h, w11, \min
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
.else
add x5, x5, #1 // x5 += 1
.endif
subs w11, w11, #1 // sec_tap-- (value)
.if \pri
add x8, x8, #1 // pri_taps++ (pointer)
.endif
b.ne 2b
cmlt v4.8h, v1.8h, #0 // -(sum < 0)
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
.if \min
smin v0.8h, v0.8h, v3.8h
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
.endif
.if \bpc == 8
xtn v0.8b, v0.8h
.endif
.if \w == 8
add x2, x2, #2*16 // tmp += tmp_stride
subs w7, w7, #1 // h--
.if \bpc == 8
st1 {v0.8b}, [x0], x1
.else
st1 {v0.8h}, [x0], x1
.endif
.else
.if \bpc == 8
st1 {v0.s}[0], [x0], x1
.else
st1 {v0.d}[0], [x0], x1
.endif
add x2, x2, #2*16 // tmp += 2*tmp_stride
subs w7, w7, #2 // h -= 2
.if \bpc == 8
st1 {v0.s}[1], [x0], x1
.else
st1 {v0.d}[1], [x0], x1
.endif
.endif
// Reset pri_taps and directions back to the original point
sub x5, x5, #2
.if \pri
sub x8, x8, #2
.endif
b.gt 1b
ret
endfunc
.endm
.macro filter w, bpc
filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
function cdef_filter\w\()_\bpc\()bpc_neon, export=1
cbnz w3, 1f // pri_strength
b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
1:
cbnz w4, 1f // sec_strength
b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
1:
b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
endfunc
.endm
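
// Cost tables for cdef_find_dir: div_table[n] = 840 / (n + 1) weights the
// squared partial sums of the diagonal directions. alt_fact holds the
// weights for the "alt" directions: div_table[2*m+1] for the three outermost
// pairs of partial sums, 105 for the five central ones, and a trailing 0 to
// pad the table to twelve entries (three .4h loads).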
const div_table
.short 840, 420, 280, 210, 168, 140, 120, 105
endconst
const alt_fact
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
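
// cost_alt computes two "alt" direction costs at once: the twelve partial
// sums of each direction (an 8h plus a 4h register) are squared, weighted by
// the alt_fact values preloaded into v29-v31, and reduced to a single 32-bit
// cost with addv.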
.macro cost_alt d1, d2, s1, s2, s3, s4
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
smull2 v23.4s, \s1\().8h, \s1\().8h
smull v24.4s, \s2\().4h, \s2\().4h
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
smull2 v26.4s, \s3\().8h, \s3\().8h
smull v27.4s, \s4\().4h, \s4\().4h
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
mla v22.4s, v23.4s, v30.4s
mla v22.4s, v24.4s, v31.4s
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
mla v25.4s, v26.4s, v30.4s
mla v25.4s, v27.4s, v31.4s
addv \d1, v22.4s // *cost_ptr
addv \d2, v25.4s // *cost_ptr
.endm
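
// find_best is one step of the scalar argmax over the eight costs: w1 holds
// the running best_cost and w0 the best_dir. Each invocation compares up to
// two further costs, moving the next candidate out of its NEON lane one step
// early so the transfers overlap with the compare/select chains.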
.macro find_best s1, s2, s3
.ifnb \s2
mov w5, \s2\().s[0]
.endif
cmp w4, w1 // cost[n] > best_cost
csel w0, w3, w0, gt // best_dir = n
csel w1, w4, w1, gt // best_cost = cost[n]
.ifnb \s2
add w3, w3, #1 // n++
cmp w5, w1 // cost[n] > best_cost
mov w4, \s3\().s[0]
csel w0, w3, w0, gt // best_dir = n
csel w1, w5, w1, gt // best_cost = cost[n]
add w3, w3, #1 // n++
.endif
.endm
// Steps for loading and preparing each row
.macro dir_load_step1 s1, bpc
.if \bpc == 8
ld1 {\s1\().8b}, [x0], x1
.else
ld1 {\s1\().8h}, [x0], x1
.endif
.endm
.macro dir_load_step2 s1, bpc
.if \bpc == 8
usubl \s1\().8h, \s1\().8b, v31.8b
.else
ushl \s1\().8h, \s1\().8h, v8.8h
.endif
.endm
.macro dir_load_step3 s1, bpc
// Nothing for \bpc == 8
.if \bpc != 8
sub \s1\().8h, \s1\().8h, v31.8h
.endif
.endm
// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
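//
// Rough outline, mirroring dav1d's C cdef_find_dir: eight sets of directional
// partial sums are accumulated for the 8x8 block as its rows are loaded, each
// cost[n] is a weighted sum of squares of those partial sums, and the
// direction with the largest cost wins. The reported variance is
//   *var = (best_cost - cost[best_dir ^ 4]) >> 10;
// i.e. the contrast between the best direction and the one orthogonal to it.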
.macro find_dir bpc
function cdef_find_dir_\bpc\()bpc_neon, export=1
.if \bpc == 16
str d8, [sp, #-0x10]!
clz w3, w3 // clz(bitdepth_max)
sub w3, w3, #24 // -bitdepth_min_8
dup v8.8h, w3
.endif
sub sp, sp, #32 // cost
mov w3, #8
.if \bpc == 8
movi v31.16b, #128
.else
movi v31.8h, #128
.endif
movi v30.16b, #0
movi v1.8h, #0 // v0-v1 sum_diag[0]
movi v3.8h, #0 // v2-v3 sum_diag[1]
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
movi v7.8h, #0 // v6-v7 sum_alt[0]
dir_load_step1 v26, \bpc // Setup first row early
movi v17.8h, #0 // v16-v17 sum_alt[1]
movi v18.8h, #0 // v18-v19 sum_alt[2]
dir_load_step2 v26, \bpc
movi v19.8h, #0
dir_load_step3 v26, \bpc
movi v21.8h, #0 // v20-v21 sum_alt[3]
.irpc i, 01234567
addv h25, v26.8h // [y]
rev64 v27.8h, v26.8h
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
add v5.8h, v5.8h, v26.8h // sum_hv[1]
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
rev64 v29.4h, v28.4h // [-(x >> 1)]
ins v4.h[\i], v25.h[0] // sum_hv[0]
.if \i < 6
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
add v18.8h, v18.8h, v22.8h // sum_alt[2]
add v19.4h, v19.4h, v23.4h // sum_alt[2]
.else
add v18.8h, v18.8h, v26.8h // sum_alt[2]
.endif
.if \i == 0
mov v20.16b, v26.16b // sum_alt[3]
.elseif \i == 1
add v20.8h, v20.8h, v26.8h // sum_alt[3]
.else
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
add v20.8h, v20.8h, v24.8h // sum_alt[3]
add v21.4h, v21.4h, v25.4h // sum_alt[3]
.endif
.if \i == 0
mov v0.16b, v26.16b // sum_diag[0]
dir_load_step1 v26, \bpc
mov v2.16b, v27.16b // sum_diag[1]
dir_load_step2 v26, \bpc
mov v6.16b, v28.16b // sum_alt[0]
dir_load_step3 v26, \bpc
mov v16.16b, v29.16b // sum_alt[1]
.else
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
.if \i != 7 // Nothing to load for the final row
dir_load_step1 v26, \bpc // Start setting up the next row early.
.endif
add v0.8h, v0.8h, v22.8h // sum_diag[0]
add v1.8h, v1.8h, v23.8h // sum_diag[0]
add v2.8h, v2.8h, v24.8h // sum_diag[1]
add v3.8h, v3.8h, v25.8h // sum_diag[1]
.if \i != 7
dir_load_step2 v26, \bpc
.endif
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
.if \i != 7
dir_load_step3 v26, \bpc
.endif
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.endr
movi v31.4s, #105
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
smlal2 v26.4s, v4.8h, v4.8h
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
smlal2 v27.4s, v5.8h, v5.8h
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
addv s4, v26.4s // cost[2]
addv s5, v27.4s // cost[6]
rev64 v1.8h, v1.8h
rev64 v3.8h, v3.8h
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
str s4, [sp, #2*4] // cost[2]
str s5, [sp, #6*4] // cost[6]
movrel x4, div_table
ld1 {v31.8h}, [x4]
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
smull2 v23.4s, v0.8h, v0.8h
smlal v22.4s, v1.4h, v1.4h
smlal2 v23.4s, v1.8h, v1.8h
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
smull2 v25.4s, v2.8h, v2.8h
smlal v24.4s, v3.4h, v3.4h
smlal2 v25.4s, v3.8h, v3.8h
uxtl v30.4s, v31.4h // div_table
uxtl2 v31.4s, v31.8h
mul v22.4s, v22.4s, v30.4s // cost[0]
mla v22.4s, v23.4s, v31.4s // cost[0]
mul v24.4s, v24.4s, v30.4s // cost[4]
mla v24.4s, v25.4s, v31.4s // cost[4]
addv s0, v22.4s // cost[0]
addv s2, v24.4s // cost[4]
movrel x5, alt_fact
ld1 {v29.4h, v30.4h, v31.4h}, [x5] // div_table[2*m+1] + 105
str s0, [sp, #0*4] // cost[0]
str s2, [sp, #4*4] // cost[4]
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
uxtl v30.4s, v30.4h
uxtl v31.4s, v31.4h
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
str s6, [sp, #1*4] // cost[1]
str s16, [sp, #3*4] // cost[3]
mov w0, #0 // best_dir
mov w1, v0.s[0] // best_cost
mov w3, #1 // n
str s18, [sp, #5*4] // cost[5]
str s20, [sp, #7*4] // cost[7]
mov w4, v6.s[0]
find_best v6, v4, v16
find_best v16, v2, v18
find_best v18, v5, v20
find_best v20
eor w3, w0, #4 // best_dir ^4
ldr w4, [sp, w3, uxtw #2]
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
lsr w1, w1, #10
str w1, [x2] // *var
add sp, sp, #32
.if \bpc == 16
ldr d8, [sp], 0x10
.endif
ret
endfunc
.endm