Diffstat:
-rw-r--r--  third_party/dav1d/tests/checkasm/arm/checkasm_32.S  | 201
-rw-r--r--  third_party/dav1d/tests/checkasm/arm/checkasm_64.S  | 211
-rw-r--r--  third_party/dav1d/tests/checkasm/cdef.c             | 144
-rw-r--r--  third_party/dav1d/tests/checkasm/checkasm.c         | 986
-rw-r--r--  third_party/dav1d/tests/checkasm/checkasm.h         | 379
-rw-r--r--  third_party/dav1d/tests/checkasm/filmgrain.c        | 401
-rw-r--r--  third_party/dav1d/tests/checkasm/ipred.c            | 297
-rw-r--r--  third_party/dav1d/tests/checkasm/itx.c              | 318
-rw-r--r--  third_party/dav1d/tests/checkasm/loopfilter.c       | 203
-rw-r--r--  third_party/dav1d/tests/checkasm/looprestoration.c  | 196
-rw-r--r--  third_party/dav1d/tests/checkasm/mc.c               | 790
-rw-r--r--  third_party/dav1d/tests/checkasm/msac.c             | 294
-rw-r--r--  third_party/dav1d/tests/checkasm/refmvs.c           | 167
-rw-r--r--  third_party/dav1d/tests/checkasm/x86/checkasm.asm   | 475
14 files changed, 5062 insertions(+), 0 deletions(-)
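For orientation before the diff body: every per-module test file added in this patch (cdef.c, filmgrain.c, mc.c, etc.) is built from the same small set of harness macros defined in checkasm.h further down: check_func() decides whether a (function, cpu-flag) pair still needs testing, declare_func() records the signature and saves a crash-recovery context, call_ref()/call_new() invoke the C reference and the asm version (the latter through a register-clobber-checking wrapper), and bench_new() times the asm version when --bench is given. The sketch below is a minimal schematic of that pattern, not code from the patch; the names my_dsp_fn, check_my_func and the 16x8 buffer geometry are placeholder assumptions, and it presumes the file is compiled as a bitdepth-templated unit (so that pixel, BITDEPTH and checkasm_check_pixel are defined) like the real test files.

/* Minimal sketch of a checkasm test, following the pattern used by the
 * test files in this patch (compare check_cdef_direction() in cdef.c).
 * "my_dsp_fn" and the 16x8 geometry are placeholders, not dav1d code. */
#include "tests/checkasm/checkasm.h"

typedef void (*my_dsp_fn)(pixel *dst, ptrdiff_t stride);

static void check_my_func(const my_dsp_fn fn) {
    ALIGN_STK_64(pixel, c_dst, 16 * 8,);  /* written by the C reference */
    ALIGN_STK_64(pixel, a_dst, 16 * 8,);  /* written by the asm version */
    const ptrdiff_t stride = 16 * sizeof(pixel);

    /* Declares the func_ref/func_new pointers and saves a context so a
     * crash in the tested function is reported as a failure instead of
     * aborting the whole run. */
    declare_func(void, pixel *dst, ptrdiff_t stride);

    /* Non-zero only if this (name, cpu flag) pair hasn't been tested yet
     * and matches any --function pattern given on the command line. */
    if (check_func(fn, "my_dsp_func_%dbpc", BITDEPTH)) {
        call_ref(c_dst, stride);             /* C reference */
        call_new(a_dst, stride);             /* asm, via checked_call */
        checkasm_check_pixel(c_dst, stride, a_dst, stride,
                             16, 8, "dst");  /* compare + dump on mismatch */
        bench_new(a_dst, stride);            /* timed only with --bench */
    }
    report("my_dsp_func");
}

With that pattern in mind, the framework files below fall into place: checkasm.c implements the bookkeeping (function tree, RNG, reporting, signal handling), checkasm.h supplies the macros, and the arm/ and x86/ assembly files implement the checked_call wrappers that verify callee-saved registers and the stack survive each call.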
diff --git a/third_party/dav1d/tests/checkasm/arm/checkasm_32.S b/third_party/dav1d/tests/checkasm/arm/checkasm_32.S new file mode 100644 index 0000000000..a186ef8fc2 --- /dev/null +++ b/third_party/dav1d/tests/checkasm/arm/checkasm_32.S @@ -0,0 +1,201 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2015 Martin Storsjo + * Copyright © 2015 Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#define PRIVATE_PREFIX checkasm_ + +#include "src/arm/asm.S" +#include "src/arm/32/util.S" + +const register_init, align=3 + .quad 0x21f86d66c8ca00ce + .quad 0x75b6ba21077c48ad + .quad 0xed56bb2dcb3c7736 + .quad 0x8bda43d3fd1a7e06 + .quad 0xb64a9c9e5d318408 + .quad 0xdf9a54b303f1d3a3 + .quad 0x4a75479abd64e097 + .quad 0x249214109d5d1c88 +endconst + +const error_message_fpscr + .asciz "failed to preserve register FPSCR, changed bits: %x" +error_message_gpr: + .asciz "failed to preserve register r%d" +error_message_vfp: + .asciz "failed to preserve register d%d" +error_message_stack: + .asciz "failed to preserve stack" +endconst + +@ max number of args used by any asm function. +#define MAX_ARGS 15 + +#define ARG_STACK 4*(MAX_ARGS - 4) + +@ Align the used stack space to 8 to preserve the stack alignment. +@ +8 for stack canary reference. +#define ARG_STACK_A (((ARG_STACK + pushed + 7) & ~7) - pushed + 8) + +.macro clobbercheck variant +.equ pushed, 4*9 +function checked_call_\variant, export=1 + push {r4-r11, lr} +.ifc \variant, vfp + vpush {d8-d15} + fmrx r4, FPSCR + push {r4} +.equ pushed, pushed + 16*4 + 4 +.endif + + movrel r12, register_init +.ifc \variant, vfp + vldm r12, {d8-d15} +.endif + ldm r12, {r4-r11} + + sub sp, sp, #ARG_STACK_A +.equ pos, 0 +.rept MAX_ARGS-4 + ldr r12, [sp, #ARG_STACK_A + pushed + 8 + pos] + str r12, [sp, #pos] +.equ pos, pos + 4 +.endr + + @ For stack overflows, the callee is free to overwrite the parameters + @ that were passed on the stack (if any), so we can only check after + @ that point. 
First figure out how many parameters the function + @ really took on the stack: + ldr r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)] + @ Load the first non-parameter value from the stack, that should be + @ left untouched by the function. Store a copy of it inverted, so that + @ e.g. overwriting everything with zero would be noticed. + ldr r12, [sp, r12, lsl #2] + mvn r12, r12 + str r12, [sp, #ARG_STACK_A - 4] + + mov r12, r0 + mov r0, r2 + mov r1, r3 + ldrd r2, r3, [sp, #ARG_STACK_A + pushed] + @ Call the target function + blx r12 + + @ Load the number of stack parameters, stack canary and its reference + ldr r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)] + ldr r2, [sp, r12, lsl #2] + ldr r3, [sp, #ARG_STACK_A - 4] + + add sp, sp, #ARG_STACK_A + push {r0, r1} + + mvn r3, r3 + cmp r2, r3 + bne 5f + + movrel r12, register_init +.ifc \variant, vfp +.macro check_reg_vfp, dreg, offset + ldrd r2, r3, [r12, #8 * (\offset)] + vmov r0, lr, \dreg + eor r2, r2, r0 + eor r3, r3, lr + orrs r2, r2, r3 + bne 4f +.endm + +.irp n, 8, 9, 10, 11, 12, 13, 14, 15 + @ keep track of the checked double/SIMD register + mov r1, #\n + check_reg_vfp d\n, \n-8 +.endr +.purgem check_reg_vfp + + fmrx r1, FPSCR + ldr r3, [sp, #8] + eor r1, r1, r3 + @ Ignore changes in bits 0-4 and 7 + bic r1, r1, #0x9f + @ Ignore changes in the topmost 5 bits + bics r1, r1, #0xf8000000 + bne 3f +.endif + + @ keep track of the checked GPR + mov r1, #4 +.macro check_reg reg1, reg2= + ldrd r2, r3, [r12], #8 + eors r2, r2, \reg1 + bne 2f + add r1, r1, #1 +.ifnb \reg2 + eors r3, r3, \reg2 + bne 2f +.endif + add r1, r1, #1 +.endm + check_reg r4, r5 + check_reg r6, r7 +@ r9 is a volatile register in the ios ABI +#ifdef __APPLE__ + check_reg r8 +#else + check_reg r8, r9 +#endif + check_reg r10, r11 +.purgem check_reg + + b 0f +5: + movrel r0, error_message_stack + b 1f +4: + movrel r0, error_message_vfp + b 1f +3: + movrel r0, error_message_fpscr + b 1f +2: + movrel r0, error_message_gpr +1: +#ifdef PREFIX + bl _checkasm_fail_func +#else + bl checkasm_fail_func +#endif +0: + pop {r0, r1} +.ifc \variant, vfp + pop {r2} + fmxr FPSCR, r2 + vpop {d8-d15} +.endif + pop {r4-r11, pc} +endfunc +.endm + +clobbercheck vfp diff --git a/third_party/dav1d/tests/checkasm/arm/checkasm_64.S b/third_party/dav1d/tests/checkasm/arm/checkasm_64.S new file mode 100644 index 0000000000..25749145a5 --- /dev/null +++ b/third_party/dav1d/tests/checkasm/arm/checkasm_64.S @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2015 Martin Storsjo + * Copyright © 2015 Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#define PRIVATE_PREFIX checkasm_ + +#include "src/arm/asm.S" +#include "src/arm/64/util.S" + +const register_init, align=4 + .quad 0x21f86d66c8ca00ce + .quad 0x75b6ba21077c48ad + .quad 0xed56bb2dcb3c7736 + .quad 0x8bda43d3fd1a7e06 + .quad 0xb64a9c9e5d318408 + .quad 0xdf9a54b303f1d3a3 + .quad 0x4a75479abd64e097 + .quad 0x249214109d5d1c88 + .quad 0x1a1b2550a612b48c + .quad 0x79445c159ce79064 + .quad 0x2eed899d5a28ddcd + .quad 0x86b2536fcd8cf636 + .quad 0xb0856806085e7943 + .quad 0x3f2bf84fc0fcca4e + .quad 0xacbd382dcf5b8de2 + .quad 0xd229e1f5b281303f + .quad 0x71aeaff20b095fd9 + .quad 0xab63e2e11fa38ed9 +endconst + + +const error_message_register + .asciz "failed to preserve register" +error_message_stack: + .asciz "stack clobbered" +endconst + + +// max number of args used by any asm function. +#define MAX_ARGS 15 + +#define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15) + +function stack_clobber, export=1 + mov x3, sp + mov x2, #CLOBBER_STACK +1: + stp x0, x1, [sp, #-16]! + subs x2, x2, #16 + b.gt 1b + mov sp, x3 + ret +endfunc + +// + 16 for stack canary reference +#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15 + 16) + +function checked_call, export=1 + stp x29, x30, [sp, #-16]! + mov x29, sp + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! + + movrel x9, register_init + ldp d8, d9, [x9], #16 + ldp d10, d11, [x9], #16 + ldp d12, d13, [x9], #16 + ldp d14, d15, [x9], #16 + ldp x19, x20, [x9], #16 + ldp x21, x22, [x9], #16 + ldp x23, x24, [x9], #16 + ldp x25, x26, [x9], #16 + ldp x27, x28, [x9], #16 + + sub sp, sp, #ARG_STACK +.equ pos, 0 +.rept MAX_ARGS-8 + // Skip the first 8 args, that are loaded into registers + ldr x9, [x29, #16 + 8*8 + pos] + str x9, [sp, #pos] +.equ pos, pos + 8 +.endr + + // Fill x8-x17 with garbage. This doesn't have to be preserved, + // but avoids relying on them having any particular value. + movrel x9, register_init + ldp x10, x11, [x9], #32 + ldp x12, x13, [x9], #32 + ldp x14, x15, [x9], #32 + ldp x16, x17, [x9], #32 + ldp x8, x9, [x9] + + // For stack overflows, the callee is free to overwrite the parameters + // that were passed on the stack (if any), so we can only check after + // that point. First figure out how many parameters the function + // really took on the stack: + ldr w2, [x29, #16 + 8*8 + (MAX_ARGS-8)*8] + // Load the first non-parameter value from the stack, that should be + // left untouched by the function. Store a copy of it inverted, so that + // e.g. overwriting everything with zero would be noticed. 
+ ldr x2, [sp, x2, lsl #3] + mvn x2, x2 + str x2, [sp, #ARG_STACK-8] + + // Load the in-register arguments + mov x12, x0 + ldp x0, x1, [x29, #16] + ldp x2, x3, [x29, #32] + ldp x4, x5, [x29, #48] + ldp x6, x7, [x29, #64] + // Call the target function + blr x12 + + // Load the number of stack parameters, stack canary and its reference + ldr w2, [x29, #16 + 8*8 + (MAX_ARGS-8)*8] + ldr x2, [sp, x2, lsl #3] + ldr x3, [sp, #ARG_STACK-8] + + add sp, sp, #ARG_STACK + stp x0, x1, [sp, #-16]! + + mvn x3, x3 + cmp x2, x3 + b.ne 2f + + movrel x9, register_init + movi v3.8h, #0 + +.macro check_reg_neon reg1, reg2 + ldr q1, [x9], #16 + uzp1 v2.2d, v\reg1\().2d, v\reg2\().2d + eor v1.16b, v1.16b, v2.16b + orr v3.16b, v3.16b, v1.16b +.endm + check_reg_neon 8, 9 + check_reg_neon 10, 11 + check_reg_neon 12, 13 + check_reg_neon 14, 15 + uqxtn v3.8b, v3.8h + umov x3, v3.d[0] + +.macro check_reg reg1, reg2 + ldp x0, x1, [x9], #16 + eor x0, x0, \reg1 + eor x1, x1, \reg2 + orr x3, x3, x0 + orr x3, x3, x1 +.endm + check_reg x19, x20 + check_reg x21, x22 + check_reg x23, x24 + check_reg x25, x26 + check_reg x27, x28 + + cbz x3, 0f + + movrel x0, error_message_register + b 1f +2: + movrel x0, error_message_stack +1: +#ifdef PREFIX + bl _checkasm_fail_func +#else + bl checkasm_fail_func +#endif +0: + ldp x0, x1, [sp], #16 + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + ldp x27, x28, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ldp x29, x30, [sp], #16 + ret +endfunc diff --git a/third_party/dav1d/tests/checkasm/cdef.c b/third_party/dav1d/tests/checkasm/cdef.c new file mode 100644 index 0000000000..9a90e313da --- /dev/null +++ b/third_party/dav1d/tests/checkasm/cdef.c @@ -0,0 +1,144 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "tests/checkasm/checkasm.h" + +#include <string.h> +#include <stdio.h> + +#include "common/dump.h" + +#include "src/levels.h" +#include "src/cdef.h" + +static int to_binary(int x) { /* 0-15 -> 0000-1111 */ + return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8); +} + +static void init_tmp(pixel *buf, int n, const int bitdepth_max) { + const int fill_type = rnd() & 7; + if (fill_type == 0) + while (n--) /* check for cdef_filter underflows */ + *buf++ = rnd() & 1; + else if (fill_type == 1) + while (n--) /* check for cdef_filter overflows */ + *buf++ = bitdepth_max - (rnd() & 1); + else + while (n--) + *buf++ = rnd() & bitdepth_max; +} + +static void check_cdef_filter(const cdef_fn fn, const int w, const int h) { + ALIGN_STK_64(pixel, c_src, 16 * 10 + 16, ), *const c_dst = c_src + 8; + ALIGN_STK_64(pixel, a_src, 16 * 10 + 16, ), *const a_dst = a_src + 8; + ALIGN_STK_64(pixel, top_buf, 16 * 2 + 16, ), *const top = top_buf + 8; + ALIGN_STK_64(pixel, bot_buf, 16 * 2 + 16, ), *const bot = bot_buf + 8; + ALIGN_STK_16(pixel, left, 8,[2]); + const ptrdiff_t stride = 16 * sizeof(pixel); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2], + const pixel *top, const pixel *bot, int pri_strength, + int sec_strength, int dir, int damping, + enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX); + + for (int s = 0x1; s <= 0x3; s++) { + if (check_func(fn, "cdef_filter_%dx%d_%02d_%dbpc", w, h, to_binary(s), BITDEPTH)) { + for (int dir = 0; dir < 8; dir++) { + for (enum CdefEdgeFlags edges = 0x0; edges <= 0xf; edges++) { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + + init_tmp(c_src, 16 * 10 + 16, bitdepth_max); + init_tmp(top_buf, 16 * 2 + 16, bitdepth_max); + init_tmp(bot_buf, 16 * 2 + 16, bitdepth_max); + init_tmp((pixel *) left, 8 * 2, bitdepth_max); + memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel)); + + const int pri_strength = s & 2 ? (1 + (rnd() % 15)) << bitdepth_min_8 : 0; + const int sec_strength = s & 1 ? 1 << ((rnd() % 3) + bitdepth_min_8) : 0; + const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1)); + call_ref(c_dst, stride, left, top, bot, pri_strength, sec_strength, + dir, damping, edges HIGHBD_TAIL_SUFFIX); + call_new(a_dst, stride, left, top, bot, pri_strength, sec_strength, + dir, damping, edges HIGHBD_TAIL_SUFFIX); + if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) { + fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n", + pri_strength, sec_strength, dir, damping, to_binary(edges)); + return; + } + if (dir == 7 && (edges == 0x5 || edges == 0xa || edges == 0xf)) + bench_new(alternate(c_dst, a_dst), stride, left, top, bot, pri_strength, + sec_strength, dir, damping, edges HIGHBD_TAIL_SUFFIX); + } + } + } + } +} + +static void check_cdef_direction(const cdef_dir_fn fn) { + ALIGN_STK_64(pixel, src, 8 * 8,); + + declare_func(int, pixel *src, ptrdiff_t dst_stride, unsigned *var + HIGHBD_DECL_SUFFIX); + + if (check_func(fn, "cdef_dir_%dbpc", BITDEPTH)) { + unsigned c_var, a_var; +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + init_tmp(src, 64, bitdepth_max); + + const int c_dir = call_ref(src, 8 * sizeof(pixel), &c_var HIGHBD_TAIL_SUFFIX); + const int a_dir = call_new(src, 8 * sizeof(pixel), &a_var HIGHBD_TAIL_SUFFIX); + if (c_var != a_var || c_dir != a_dir) { + if (fail()) { + hex_fdump(stderr, src, 8 * sizeof(pixel), 8, 8, "src"); + fprintf(stderr, "c_dir %d a_dir %d\n", c_dir, a_dir); + } + } + bench_new(src, 8 * sizeof(pixel), &a_var HIGHBD_TAIL_SUFFIX); + } + report("cdef_dir"); +} + +void bitfn(checkasm_check_cdef)(void) { + Dav1dCdefDSPContext c; + bitfn(dav1d_cdef_dsp_init)(&c); + + check_cdef_direction(c.dir); + + check_cdef_filter(c.fb[0], 8, 8); + check_cdef_filter(c.fb[1], 4, 8); + check_cdef_filter(c.fb[2], 4, 4); + report("cdef_filter"); +} diff --git a/third_party/dav1d/tests/checkasm/checkasm.c b/third_party/dav1d/tests/checkasm/checkasm.c new file mode 100644 index 0000000000..bca2158b5b --- /dev/null +++ b/third_party/dav1d/tests/checkasm/checkasm.c @@ -0,0 +1,986 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "tests/checkasm/checkasm.h" + +#include <math.h> +#include <stdarg.h> +#include <stdio.h> +#include <string.h> + +#include "src/cpu.h" + +#ifdef _WIN32 +#include <windows.h> +#define COLOR_RED FOREGROUND_RED +#define COLOR_GREEN FOREGROUND_GREEN +#define COLOR_YELLOW (FOREGROUND_RED|FOREGROUND_GREEN) +#else +#include <unistd.h> +#include <signal.h> +#include <time.h> +#include <pthread.h> +#ifdef HAVE_PTHREAD_NP_H +#include <pthread_np.h> +#endif +#ifdef __APPLE__ +#include <mach/mach_time.h> +#endif +#define COLOR_RED 1 +#define COLOR_GREEN 2 +#define COLOR_YELLOW 3 +#endif + +/* List of tests to invoke */ +static const struct { + const char *name; + void (*func)(void); +} tests[] = { + { "msac", checkasm_check_msac }, + { "refmvs", checkasm_check_refmvs }, +#if CONFIG_8BPC + { "cdef_8bpc", checkasm_check_cdef_8bpc }, + { "filmgrain_8bpc", checkasm_check_filmgrain_8bpc }, + { "ipred_8bpc", checkasm_check_ipred_8bpc }, + { "itx_8bpc", checkasm_check_itx_8bpc }, + { "loopfilter_8bpc", checkasm_check_loopfilter_8bpc }, + { "looprestoration_8bpc", checkasm_check_looprestoration_8bpc }, + { "mc_8bpc", checkasm_check_mc_8bpc }, +#endif +#if CONFIG_16BPC + { "cdef_16bpc", checkasm_check_cdef_16bpc }, + { "filmgrain_16bpc", checkasm_check_filmgrain_16bpc }, + { "ipred_16bpc", checkasm_check_ipred_16bpc }, + { "itx_16bpc", checkasm_check_itx_16bpc }, + { "loopfilter_16bpc", checkasm_check_loopfilter_16bpc }, + { "looprestoration_16bpc", checkasm_check_looprestoration_16bpc }, + { "mc_16bpc", checkasm_check_mc_16bpc }, +#endif + { 0 } +}; + +/* List of cpu flags to check */ +static const struct { + const char *name; + const char *suffix; + unsigned flag; +} cpus[] = { +#if ARCH_X86 + { "SSE2", "sse2", DAV1D_X86_CPU_FLAG_SSE2 }, + { "SSSE3", "ssse3", DAV1D_X86_CPU_FLAG_SSSE3 }, + { "SSE4.1", "sse4", DAV1D_X86_CPU_FLAG_SSE41 }, + { "AVX2", "avx2", DAV1D_X86_CPU_FLAG_AVX2 }, + { "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL }, +#elif ARCH_AARCH64 || ARCH_ARM + { "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON }, +#elif ARCH_PPC64LE + { "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX }, +#endif + { 0 } +}; + +typedef struct CheckasmFuncVersion { + struct CheckasmFuncVersion *next; + void *func; + int ok; + unsigned cpu; + int iterations; + uint64_t cycles; +} CheckasmFuncVersion; + +/* Binary search tree node */ +typedef struct CheckasmFunc { + struct CheckasmFunc *child[2]; + CheckasmFuncVersion versions; + uint8_t color; /* 0 = red, 1 = black */ + char name[]; +} CheckasmFunc; + +/* Internal state */ +static struct { + CheckasmFunc *funcs; + CheckasmFunc *current_func; + CheckasmFuncVersion *current_func_ver; + const char *current_test_name; + int num_checked; + int num_failed; + double nop_time; + unsigned cpu_flag; + const char *cpu_flag_name; + const char *test_pattern; + const char *function_pattern; + unsigned seed; + int bench; + int bench_c; + int verbose; + int function_listing; + int catch_signals; + int suffix_length; + int max_function_name_length; +#if ARCH_X86_64 + void (*simd_warmup)(void); +#endif +} state; + +/* float compare support code */ +typedef union { + float f; + uint32_t i; +} intfloat; + +static uint32_t xs_state[4]; + +static void xor128_srand(unsigned seed) { + xs_state[0] = seed; + xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff); + xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff); + xs_state[3] = ~seed; +} + +// xor128 from Marsaglia, George (July 2003). "Xorshift RNGs". +// Journal of Statistical Software. 8 (14). 
+// doi:10.18637/jss.v008.i14. +int xor128_rand(void) { + const uint32_t x = xs_state[0]; + const uint32_t t = x ^ (x << 11); + + xs_state[0] = xs_state[1]; + xs_state[1] = xs_state[2]; + xs_state[2] = xs_state[3]; + uint32_t w = xs_state[3]; + + w = (w ^ (w >> 19)) ^ (t ^ (t >> 8)); + xs_state[3] = w; + + return w >> 1; +} + +static int is_negative(const intfloat u) { + return u.i >> 31; +} + +int float_near_ulp(const float a, const float b, const unsigned max_ulp) { + intfloat x, y; + + x.f = a; + y.f = b; + + if (is_negative(x) != is_negative(y)) { + // handle -0.0 == +0.0 + return a == b; + } + + if (llabs((int64_t)x.i - y.i) <= max_ulp) + return 1; + + return 0; +} + +int float_near_ulp_array(const float *const a, const float *const b, + const unsigned max_ulp, const int len) +{ + for (int i = 0; i < len; i++) + if (!float_near_ulp(a[i], b[i], max_ulp)) + return 0; + + return 1; +} + +int float_near_abs_eps(const float a, const float b, const float eps) { + return fabsf(a - b) < eps; +} + +int float_near_abs_eps_array(const float *const a, const float *const b, + const float eps, const int len) +{ + for (int i = 0; i < len; i++) + if (!float_near_abs_eps(a[i], b[i], eps)) + return 0; + + return 1; +} + +int float_near_abs_eps_ulp(const float a, const float b, const float eps, + const unsigned max_ulp) +{ + return float_near_ulp(a, b, max_ulp) || float_near_abs_eps(a, b, eps); +} + +int float_near_abs_eps_array_ulp(const float *const a, const float *const b, + const float eps, const unsigned max_ulp, + const int len) +{ + for (int i = 0; i < len; i++) + if (!float_near_abs_eps_ulp(a[i], b[i], eps, max_ulp)) + return 0; + + return 1; +} + +/* Print colored text to stderr if the terminal supports it */ +static void color_printf(const int color, const char *const fmt, ...) 
{ + static int8_t use_color = -1; + va_list arg; + +#ifdef _WIN32 + static HANDLE con; + static WORD org_attributes; + + if (use_color < 0) { + CONSOLE_SCREEN_BUFFER_INFO con_info; + con = GetStdHandle(STD_ERROR_HANDLE); + if (con && con != INVALID_HANDLE_VALUE && + GetConsoleScreenBufferInfo(con, &con_info)) + { + org_attributes = con_info.wAttributes; + use_color = 1; + } else + use_color = 0; + } + if (use_color) + SetConsoleTextAttribute(con, (org_attributes & 0xfff0) | + (color & 0x0f)); +#else + if (use_color < 0) { + const char *const term = getenv("TERM"); + use_color = term && strcmp(term, "dumb") && isatty(2); + } + if (use_color) + fprintf(stderr, "\x1b[%d;3%dm", (color & 0x08) >> 3, color & 0x07); +#endif + + va_start(arg, fmt); + vfprintf(stderr, fmt, arg); + va_end(arg); + + if (use_color) { +#ifdef _WIN32 + SetConsoleTextAttribute(con, org_attributes); +#else + fprintf(stderr, "\x1b[0m"); +#endif + } +} + +/* Deallocate a tree */ +static void destroy_func_tree(CheckasmFunc *const f) { + if (f) { + CheckasmFuncVersion *v = f->versions.next; + while (v) { + CheckasmFuncVersion *next = v->next; + free(v); + v = next; + } + + destroy_func_tree(f->child[0]); + destroy_func_tree(f->child[1]); + free(f); + } +} + +/* Allocate a zero-initialized block, clean up and exit on failure */ +static void *checkasm_malloc(const size_t size) { + void *const ptr = calloc(1, size); + if (!ptr) { + fprintf(stderr, "checkasm: malloc failed\n"); + destroy_func_tree(state.funcs); + exit(1); + } + return ptr; +} + +/* Get the suffix of the specified cpu flag */ +static const char *cpu_suffix(const unsigned cpu) { + for (int i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2; i >= 0; i--) + if (cpu & cpus[i].flag) + return cpus[i].suffix; + + return "c"; +} + +#ifdef readtime +static int cmp_nop(const void *a, const void *b) { + return *(const uint16_t*)a - *(const uint16_t*)b; +} + +/* Measure the overhead of the timing code (in decicycles) */ +static double measure_nop_time(void) { + uint16_t nops[10000]; + int nop_sum = 0; + + for (int i = 0; i < 10000; i++) { + uint64_t t = readtime(); + nops[i] = (uint16_t) (readtime() - t); + } + + qsort(nops, 10000, sizeof(uint16_t), cmp_nop); + for (int i = 2500; i < 7500; i++) + nop_sum += nops[i]; + + return nop_sum / 5000.0; +} + +static double avg_cycles_per_call(const CheckasmFuncVersion *const v) { + if (v->iterations) { + const double cycles = (double)v->cycles / v->iterations - state.nop_time; + if (cycles > 0.0) + return cycles / 4.0; /* 4 calls per iteration */ + } + return 0.0; +} + +/* Print benchmark results */ +static void print_benchs(const CheckasmFunc *const f) { + if (f) { + print_benchs(f->child[0]); + + /* Only print functions with at least one assembly version */ + const CheckasmFuncVersion *v = &f->versions; + if ((state.bench_c || v->cpu || v->next) && v->iterations) { + const double baseline = avg_cycles_per_call(v); + do { + const int pad_length = 10 + state.max_function_name_length - + printf("%s_%s:", f->name, cpu_suffix(v->cpu)); + const double cycles = avg_cycles_per_call(v); + const double ratio = cycles ? 
baseline / cycles : 0.0; + printf("%*.1f (%5.2fx)\n", imax(pad_length, 0), cycles, ratio); + } while ((v = v->next)); + } + + print_benchs(f->child[1]); + } +} +#endif + +static void print_functions(const CheckasmFunc *const f) { + if (f) { + print_functions(f->child[0]); + const CheckasmFuncVersion *v = &f->versions; + printf("%s (%s", f->name, cpu_suffix(v->cpu)); + while ((v = v->next)) + printf(", %s", cpu_suffix(v->cpu)); + printf(")\n"); + print_functions(f->child[1]); + } +} + +#define is_digit(x) ((x) >= '0' && (x) <= '9') + +/* ASCIIbetical sort except preserving natural order for numbers */ +static int cmp_func_names(const char *a, const char *b) { + const char *const start = a; + int ascii_diff, digit_diff; + + for (; !(ascii_diff = *(const unsigned char*)a - + *(const unsigned char*)b) && *a; a++, b++); + for (; is_digit(*a) && is_digit(*b); a++, b++); + + if (a > start && is_digit(a[-1]) && + (digit_diff = is_digit(*a) - is_digit(*b))) + { + return digit_diff; + } + + return ascii_diff; +} + +/* Perform a tree rotation in the specified direction and return the new root */ +static CheckasmFunc *rotate_tree(CheckasmFunc *const f, const int dir) { + CheckasmFunc *const r = f->child[dir^1]; + f->child[dir^1] = r->child[dir]; + r->child[dir] = f; + r->color = f->color; + f->color = 0; + return r; +} + +#define is_red(f) ((f) && !(f)->color) + +/* Balance a left-leaning red-black tree at the specified node */ +static void balance_tree(CheckasmFunc **const root) { + CheckasmFunc *const f = *root; + + if (is_red(f->child[0]) && is_red(f->child[1])) { + f->color ^= 1; + f->child[0]->color = f->child[1]->color = 1; + } + else if (!is_red(f->child[0]) && is_red(f->child[1])) + *root = rotate_tree(f, 0); /* Rotate left */ + else if (is_red(f->child[0]) && is_red(f->child[0]->child[0])) + *root = rotate_tree(f, 1); /* Rotate right */ +} + +/* Get a node with the specified name, creating it if it doesn't exist */ +static CheckasmFunc *get_func(CheckasmFunc **const root, const char *const name) { + CheckasmFunc *f = *root; + + if (f) { + /* Search the tree for a matching node */ + const int cmp = cmp_func_names(name, f->name); + if (cmp) { + f = get_func(&f->child[cmp > 0], name); + + /* Rebalance the tree on the way up if a new node was inserted */ + if (!f->versions.func) + balance_tree(root); + } + } else { + /* Allocate and insert a new node into the tree */ + const size_t name_length = strlen(name) + 1; + f = *root = checkasm_malloc(offsetof(CheckasmFunc, name) + name_length); + memcpy(f->name, name, name_length); + } + + return f; +} + +checkasm_context checkasm_context_buf; + +/* Crash handling: attempt to catch crashes and handle them + * gracefully instead of just aborting abruptly. 
*/ +#ifdef _WIN32 +static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) { + if (!state.catch_signals) + return EXCEPTION_CONTINUE_SEARCH; + + const char *err; + switch (e->ExceptionRecord->ExceptionCode) { + case EXCEPTION_FLT_DIVIDE_BY_ZERO: + case EXCEPTION_INT_DIVIDE_BY_ZERO: + err = "fatal arithmetic error"; + break; + case EXCEPTION_ILLEGAL_INSTRUCTION: + case EXCEPTION_PRIV_INSTRUCTION: + err = "illegal instruction"; + break; + case EXCEPTION_ACCESS_VIOLATION: + case EXCEPTION_ARRAY_BOUNDS_EXCEEDED: + case EXCEPTION_DATATYPE_MISALIGNMENT: + case EXCEPTION_IN_PAGE_ERROR: + case EXCEPTION_STACK_OVERFLOW: + err = "segmentation fault"; + break; + default: + return EXCEPTION_CONTINUE_SEARCH; + } + state.catch_signals = 0; + checkasm_fail_func(err); + checkasm_load_context(); + return EXCEPTION_CONTINUE_EXECUTION; /* never reached, but shuts up gcc */ +} +#else +static void signal_handler(const int s) { + if (state.catch_signals) { + state.catch_signals = 0; + checkasm_fail_func(s == SIGFPE ? "fatal arithmetic error" : + s == SIGILL ? "illegal instruction" : + "segmentation fault"); + checkasm_load_context(); + } else { + /* fall back to the default signal handler */ + static const struct sigaction default_sa = { .sa_handler = SIG_DFL }; + sigaction(s, &default_sa, NULL); + raise(s); + } +} +#endif + +/* Compares a string with a wildcard pattern. */ +static int wildstrcmp(const char *str, const char *pattern) { + const char *wild = strchr(pattern, '*'); + if (wild) { + const size_t len = wild - pattern; + if (strncmp(str, pattern, len)) return 1; + while (*++wild == '*'); + if (!*wild) return 0; + str += len; + while (*str && wildstrcmp(str, wild)) str++; + return !*str; + } + return strcmp(str, pattern); +} + +/* Perform tests and benchmarks for the specified + * cpu flag if supported by the host */ +static void check_cpu_flag(const char *const name, unsigned flag) { + const unsigned old_cpu_flag = state.cpu_flag; + + flag |= old_cpu_flag; + dav1d_set_cpu_flags_mask(flag); + state.cpu_flag = dav1d_get_cpu_flags(); + + if (!flag || state.cpu_flag != old_cpu_flag) { + state.cpu_flag_name = name; + state.suffix_length = (int)strlen(cpu_suffix(flag)) + 1; + for (int i = 0; tests[i].func; i++) { + if (state.test_pattern && wildstrcmp(tests[i].name, state.test_pattern)) + continue; + xor128_srand(state.seed); + state.current_test_name = tests[i].name; + tests[i].func(); + } + } +} + +/* Print the name of the current CPU flag, but only do it once */ +static void print_cpu_name(void) { + if (state.cpu_flag_name) { + color_printf(COLOR_YELLOW, "%s:\n", state.cpu_flag_name); + state.cpu_flag_name = NULL; + } +} + +static unsigned get_seed(void) { +#ifdef _WIN32 + LARGE_INTEGER i; + QueryPerformanceCounter(&i); + return i.LowPart; +#elif defined(__APPLE__) + return (unsigned) mach_absolute_time(); +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec); +#endif +} + +int main(int argc, char *argv[]) { + state.seed = get_seed(); + + while (argc > 1) { + if (!strncmp(argv[1], "--help", 6) || !strcmp(argv[1], "-h")) { + fprintf(stderr, + "checkasm [options] <random seed>\n" + " <random seed> Numeric value to seed the rng\n" + "Options:\n" + " --affinity=<cpu> Run the process on CPU <cpu>\n" + " --test=<pattern> Test only <pattern>\n" + " --function=<pattern> -f Test only the functions matching <pattern>\n" + " --bench -b Benchmark the tested functions\n" + " --list-functions List available functions\n" + " --list-tests 
List available tests\n" + " --bench-c -c Benchmark the C-only functions\n" + " --verbose -v Print verbose output\n"); + return 0; + } else if (!strcmp(argv[1], "--bench-c") || !strcmp(argv[1], "-c")) { + state.bench_c = 1; + } else if (!strcmp(argv[1], "--bench") || !strcmp(argv[1], "-b")) { +#ifndef readtime + fprintf(stderr, + "checkasm: --bench is not supported on your system\n"); + return 1; +#endif + state.bench = 1; + } else if (!strncmp(argv[1], "--test=", 7)) { + state.test_pattern = argv[1] + 7; + } else if (!strcmp(argv[1], "-t")) { + state.test_pattern = argc > 1 ? argv[2] : ""; + argc--; + argv++; + } else if (!strncmp(argv[1], "--function=", 11)) { + state.function_pattern = argv[1] + 11; + } else if (!strcmp(argv[1], "-f")) { + state.function_pattern = argc > 1 ? argv[2] : ""; + argc--; + argv++; + } else if (!strcmp(argv[1], "--list-functions")) { + state.function_listing = 1; + } else if (!strcmp(argv[1], "--list-tests")) { + for (int i = 0; tests[i].name; i++) + printf("%s\n", tests[i].name); + return 0; + } else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) { + state.verbose = 1; + } else if (!strncmp(argv[1], "--affinity=", 11)) { + unsigned long affinity = strtoul(argv[1] + 11, NULL, 16); +#ifdef _WIN32 + BOOL (WINAPI *spdcs)(HANDLE, const ULONG*, ULONG) = + (void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"), "SetProcessDefaultCpuSets"); + HANDLE process = GetCurrentProcess(); + int affinity_err; + if (spdcs) { + affinity_err = !spdcs(process, (ULONG[]){ affinity + 256 }, 1); + } else { + if (affinity < sizeof(DWORD_PTR) * 8) + affinity_err = !SetProcessAffinityMask(process, (DWORD_PTR)1 << affinity); + else + affinity_err = 1; + } + if (affinity_err) { + fprintf(stderr, "checkasm: invalid cpu affinity (%lu)\n", affinity); + return 1; + } else { + fprintf(stderr, "checkasm: running on cpu %lu\n", affinity); + } +#elif defined(HAVE_PTHREAD_SETAFFINITY_NP) && defined(CPU_SET) + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(affinity, &set); + if (pthread_setaffinity_np(pthread_self(), sizeof(set), &set)) { + fprintf(stderr, "checkasm: invalid cpu affinity (%lu)\n", affinity); + return 1; + } else { + fprintf(stderr, "checkasm: running on cpu %lu\n", affinity); + } +#else + (void)affinity; + fprintf(stderr, + "checkasm: --affinity is not supported on your system\n"); + return 1; +#endif + } else { + state.seed = (unsigned) strtoul(argv[1], NULL, 10); + } + + argc--; + argv++; + } + +#if TRIM_DSP_FUNCTIONS + fprintf(stderr, "checkasm: reference functions unavailable\n"); + return 0; +#endif + + dav1d_init_cpu(); + +#ifdef _WIN32 +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) + AddVectoredExceptionHandler(0, signal_handler); +#endif +#else + const struct sigaction sa = { + .sa_handler = signal_handler, + .sa_flags = SA_NODEFER, + }; + sigaction(SIGBUS, &sa, NULL); + sigaction(SIGFPE, &sa, NULL); + sigaction(SIGILL, &sa, NULL); + sigaction(SIGSEGV, &sa, NULL); +#endif + +#ifdef readtime + if (state.bench) { + static int testing = 0; + checkasm_save_context(); + if (!testing) { + checkasm_set_signal_handler_state(1); + testing = 1; + readtime(); + checkasm_set_signal_handler_state(0); + } else { + fprintf(stderr, "checkasm: unable to access cycle counter\n"); + return 1; + } + } +#endif + + int ret = 0; + + if (!state.function_listing) { +#if ARCH_X86_64 + void checkasm_warmup_avx2(void); + void checkasm_warmup_avx512(void); + const unsigned cpu_flags = dav1d_get_cpu_flags(); + if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL) + state.simd_warmup = 
checkasm_warmup_avx512; + else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2) + state.simd_warmup = checkasm_warmup_avx2; + checkasm_simd_warmup(); +#endif +#if ARCH_X86 + unsigned checkasm_init_x86(char *name); + char name[48]; + const unsigned cpuid = checkasm_init_x86(name); + for (size_t len = strlen(name); len && name[len-1] == ' '; len--) + name[len-1] = '\0'; /* trim trailing whitespace */ + fprintf(stderr, "checkasm: %s (%08X) using random seed %u\n", name, cpuid, state.seed); +#else + fprintf(stderr, "checkasm: using random seed %u\n", state.seed); +#endif + } + + check_cpu_flag(NULL, 0); + for (int i = 0; cpus[i].flag; i++) + check_cpu_flag(cpus[i].name, cpus[i].flag); + + if (state.function_listing) { + print_functions(state.funcs); + } else if (state.num_failed) { + fprintf(stderr, "checkasm: %d of %d tests failed\n", + state.num_failed, state.num_checked); + ret = 1; + } else { + if (state.num_checked) + fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked); + else + fprintf(stderr, "checkasm: no tests to perform\n"); +#ifdef readtime + if (state.bench && state.max_function_name_length) { + state.nop_time = measure_nop_time(); + if (state.verbose) + printf("nop:%*.1f\n", state.max_function_name_length + 6, state.nop_time); + print_benchs(state.funcs); + } +#endif + } + + destroy_func_tree(state.funcs); + return ret; +} + +/* Decide whether or not the specified function needs to be tested and + * allocate/initialize data structures if needed. Returns a pointer to a + * reference function if the function should be tested, otherwise NULL */ +void *checkasm_check_func(void *const func, const char *const name, ...) { + char name_buf[256]; + va_list arg; + + va_start(arg, name); + int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg); + va_end(arg); + + if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf) || + (state.function_pattern && wildstrcmp(name_buf, state.function_pattern))) + { + return NULL; + } + + state.current_func = get_func(&state.funcs, name_buf); + + state.funcs->color = 1; + CheckasmFuncVersion *v = &state.current_func->versions; + void *ref = func; + + if (v->func) { + CheckasmFuncVersion *prev; + do { + /* Only test functions that haven't already been tested */ + if (v->func == func) + return NULL; + + if (v->ok) + ref = v->func; + + prev = v; + } while ((v = v->next)); + + v = prev->next = checkasm_malloc(sizeof(CheckasmFuncVersion)); + } + + name_length += state.suffix_length; + if (name_length > state.max_function_name_length) + state.max_function_name_length = name_length; + + v->func = func; + v->ok = 1; + v->cpu = state.cpu_flag; + state.current_func_ver = v; + if (state.function_listing) /* Save function names without running tests */ + return NULL; + + xor128_srand(state.seed); + + if (state.cpu_flag) + state.num_checked++; + + return ref; +} + +/* Decide whether or not the current function needs to be benchmarked */ +int checkasm_bench_func(void) { + return !state.num_failed && state.bench; +} + +/* Indicate that the current test has failed, return whether verbose printing + * is requested. */ +int checkasm_fail_func(const char *const msg, ...) 
{ + if (state.current_func_ver && state.current_func_ver->cpu && + state.current_func_ver->ok) + { + va_list arg; + + print_cpu_name(); + fprintf(stderr, " %s_%s (", state.current_func->name, + cpu_suffix(state.current_func_ver->cpu)); + va_start(arg, msg); + vfprintf(stderr, msg, arg); + va_end(arg); + fprintf(stderr, ")\n"); + + state.current_func_ver->ok = 0; + state.num_failed++; + } + return state.verbose; +} + +/* Update benchmark results of the current function */ +void checkasm_update_bench(const int iterations, const uint64_t cycles) { + state.current_func_ver->iterations += iterations; + state.current_func_ver->cycles += cycles; +} + +/* Print the outcome of all tests performed since + * the last time this function was called */ +void checkasm_report(const char *const name, ...) { + static int prev_checked, prev_failed; + static size_t max_length; + + if (state.num_checked > prev_checked) { + int pad_length = (int) max_length + 4; + va_list arg; + + print_cpu_name(); + pad_length -= fprintf(stderr, " - %s.", state.current_test_name); + va_start(arg, name); + pad_length -= vfprintf(stderr, name, arg); + va_end(arg); + fprintf(stderr, "%*c", imax(pad_length, 0) + 2, '['); + + if (state.num_failed == prev_failed) + color_printf(COLOR_GREEN, "OK"); + else + color_printf(COLOR_RED, "FAILED"); + fprintf(stderr, "]\n"); + + prev_checked = state.num_checked; + prev_failed = state.num_failed; + } else if (!state.cpu_flag) { + /* Calculate the amount of padding required + * to make the output vertically aligned */ + size_t length = strlen(state.current_test_name); + va_list arg; + + va_start(arg, name); + length += vsnprintf(NULL, 0, name, arg); + va_end(arg); + + if (length > max_length) + max_length = length; + } +} + +void checkasm_set_signal_handler_state(const int enabled) { + state.catch_signals = enabled; +} + +static int check_err(const char *const file, const int line, + const char *const name, const int w, const int h, + int *const err) +{ + if (*err) + return 0; + if (!checkasm_fail_func("%s:%d", file, line)) + return 1; + *err = 1; + fprintf(stderr, "%s (%dx%d):\n", name, w, h); + return 0; +} + +#define DEF_CHECKASM_CHECK_FUNC(type, fmt) \ +int checkasm_check_##type(const char *const file, const int line, \ + const type *buf1, ptrdiff_t stride1, \ + const type *buf2, ptrdiff_t stride2, \ + const int w, int h, const char *const name, \ + const int align_w, const int align_h, \ + const int padding) \ +{ \ + int aligned_w = (w + align_w - 1) & ~(align_w - 1); \ + int aligned_h = (h + align_h - 1) & ~(align_h - 1); \ + int err = 0; \ + stride1 /= sizeof(*buf1); \ + stride2 /= sizeof(*buf2); \ + int y = 0; \ + for (y = 0; y < h; y++) \ + if (memcmp(&buf1[y*stride1], &buf2[y*stride2], w*sizeof(*buf1))) \ + break; \ + if (y != h) { \ + if (check_err(file, line, name, w, h, &err)) \ + return 1; \ + for (y = 0; y < h; y++) { \ + for (int x = 0; x < w; x++) \ + fprintf(stderr, " " fmt, buf1[x]); \ + fprintf(stderr, " "); \ + for (int x = 0; x < w; x++) \ + fprintf(stderr, " " fmt, buf2[x]); \ + fprintf(stderr, " "); \ + for (int x = 0; x < w; x++) \ + fprintf(stderr, "%c", buf1[x] != buf2[x] ? 
'x' : '.'); \ + buf1 += stride1; \ + buf2 += stride2; \ + fprintf(stderr, "\n"); \ + } \ + buf1 -= h*stride1; \ + buf2 -= h*stride2; \ + } \ + for (y = -padding; y < 0; y++) \ + if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \ + (w + 2*padding)*sizeof(*buf1))) { \ + if (check_err(file, line, name, w, h, &err)) \ + return 1; \ + fprintf(stderr, " overwrite above\n"); \ + break; \ + } \ + for (y = aligned_h; y < aligned_h + padding; y++) \ + if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \ + (w + 2*padding)*sizeof(*buf1))) { \ + if (check_err(file, line, name, w, h, &err)) \ + return 1; \ + fprintf(stderr, " overwrite below\n"); \ + break; \ + } \ + for (y = 0; y < h; y++) \ + if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \ + padding*sizeof(*buf1))) { \ + if (check_err(file, line, name, w, h, &err)) \ + return 1; \ + fprintf(stderr, " overwrite left\n"); \ + break; \ + } \ + for (y = 0; y < h; y++) \ + if (memcmp(&buf1[y*stride1 + aligned_w], &buf2[y*stride2 + aligned_w], \ + padding*sizeof(*buf1))) { \ + if (check_err(file, line, name, w, h, &err)) \ + return 1; \ + fprintf(stderr, " overwrite right\n"); \ + break; \ + } \ + return err; \ +} + +DEF_CHECKASM_CHECK_FUNC(int8_t, "%4d") +DEF_CHECKASM_CHECK_FUNC(int16_t, "%6d") +DEF_CHECKASM_CHECK_FUNC(int32_t, "%9d") +DEF_CHECKASM_CHECK_FUNC(uint8_t, "%02x") +DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x") +DEF_CHECKASM_CHECK_FUNC(uint32_t, "%08x") + +#if ARCH_X86_64 +void checkasm_simd_warmup(void) +{ + if (state.simd_warmup) + state.simd_warmup(); +} +#endif diff --git a/third_party/dav1d/tests/checkasm/checkasm.h b/third_party/dav1d/tests/checkasm/checkasm.h new file mode 100644 index 0000000000..29c1dbe2b9 --- /dev/null +++ b/third_party/dav1d/tests/checkasm/checkasm.h @@ -0,0 +1,379 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_TESTS_CHECKASM_CHECKASM_H +#define DAV1D_TESTS_CHECKASM_CHECKASM_H + +#include "config.h" + +#include <stdint.h> +#include <stdlib.h> + +#if ARCH_X86_64 && defined(_WIN32) +/* setjmp/longjmp on 64-bit Windows will try to use SEH to unwind the stack, + * which doesn't work for assembly functions without unwind information. */ +#include <windows.h> +#define checkasm_context CONTEXT +#define checkasm_save_context() RtlCaptureContext(&checkasm_context_buf) +#define checkasm_load_context() RtlRestoreContext(&checkasm_context_buf, NULL) +#else +#include <setjmp.h> +#define checkasm_context jmp_buf +#define checkasm_save_context() setjmp(checkasm_context_buf) +#define checkasm_load_context() longjmp(checkasm_context_buf, 1) +#endif + +#include "include/common/attributes.h" +#include "include/common/bitdepth.h" +#include "include/common/intops.h" + +int xor128_rand(void); +#define rnd xor128_rand + +#define decl_check_bitfns(name) \ +name##_8bpc(void); \ +name##_16bpc(void) + +void checkasm_check_msac(void); +void checkasm_check_refmvs(void); +decl_check_bitfns(void checkasm_check_cdef); +decl_check_bitfns(void checkasm_check_filmgrain); +decl_check_bitfns(void checkasm_check_ipred); +decl_check_bitfns(void checkasm_check_itx); +decl_check_bitfns(void checkasm_check_loopfilter); +decl_check_bitfns(void checkasm_check_looprestoration); +decl_check_bitfns(void checkasm_check_mc); + +void *checkasm_check_func(void *func, const char *name, ...); +int checkasm_bench_func(void); +int checkasm_fail_func(const char *msg, ...); +void checkasm_update_bench(int iterations, uint64_t cycles); +void checkasm_report(const char *name, ...); +void checkasm_set_signal_handler_state(int enabled); +extern checkasm_context checkasm_context_buf; + +/* float compare utilities */ +int float_near_ulp(float a, float b, unsigned max_ulp); +int float_near_abs_eps(float a, float b, float eps); +int float_near_abs_eps_ulp(float a, float b, float eps, unsigned max_ulp); +int float_near_ulp_array(const float *a, const float *b, unsigned max_ulp, + int len); +int float_near_abs_eps_array(const float *a, const float *b, float eps, + int len); +int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps, + unsigned max_ulp, int len); + +#define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */ + +/* Decide whether or not the specified function needs to be tested */ +#define check_func(func, ...)\ + (func_ref = checkasm_check_func((func_new = func), __VA_ARGS__)) + +/* Declare the function prototype. The first argument is the return value, + * the remaining arguments are the function parameters. Naming parameters + * is optional. 
*/ +#define declare_func(ret, ...)\ + declare_new(ret, __VA_ARGS__)\ + void *func_ref, *func_new;\ + typedef ret func_type(__VA_ARGS__);\ + checkasm_save_context() + +/* Indicate that the current test has failed */ +#define fail() checkasm_fail_func("%s:%d", __FILE__, __LINE__) + +/* Print the test outcome */ +#define report checkasm_report + +/* Call the reference function */ +#define call_ref(...)\ + (checkasm_set_signal_handler_state(1),\ + ((func_type *)func_ref)(__VA_ARGS__));\ + checkasm_set_signal_handler_state(0) + +#if HAVE_ASM +#if ARCH_X86 +#if defined(_MSC_VER) && !defined(__clang__) +#include <intrin.h> +#define readtime() (_mm_lfence(), __rdtsc()) +#else +static inline uint64_t readtime(void) { + uint32_t eax, edx; + __asm__ __volatile__("lfence\nrdtsc" : "=a"(eax), "=d"(edx)); + return (((uint64_t)edx) << 32) | eax; +} +#define readtime readtime +#endif +#elif (ARCH_AARCH64 || ARCH_ARM) && defined(__APPLE__) +#include <mach/mach_time.h> +#define readtime() mach_absolute_time() +#elif ARCH_AARCH64 +#ifdef _MSC_VER +#include <windows.h> +#define readtime() (_InstructionSynchronizationBarrier(), ReadTimeStampCounter()) +#else +static inline uint64_t readtime(void) { + uint64_t cycle_counter; + /* This requires enabling user mode access to the cycle counter (which + * can only be done from kernel space). + * This could also read cntvct_el0 instead of pmccntr_el0; that register + * might also be readable (depending on kernel version), but it has much + * worse precision (it's a fixed 50 MHz timer). */ + __asm__ __volatile__("isb\nmrs %0, pmccntr_el0" + : "=r"(cycle_counter) + :: "memory"); + return cycle_counter; +} +#define readtime readtime +#endif +#elif ARCH_ARM && !defined(_MSC_VER) && __ARM_ARCH >= 7 +static inline uint64_t readtime(void) { + uint32_t cycle_counter; + /* This requires enabling user mode access to the cycle counter (which + * can only be done from kernel space). */ + __asm__ __volatile__("isb\nmrc p15, 0, %0, c9, c13, 0" + : "=r"(cycle_counter) + :: "memory"); + return cycle_counter; +} +#define readtime readtime +#elif ARCH_PPC64LE +static inline uint64_t readtime(void) { + uint32_t tbu, tbl, temp; + + __asm__ __volatile__( + "1:\n" + "mfspr %2,269\n" + "mfspr %0,268\n" + "mfspr %1,269\n" + "cmpw %2,%1\n" + "bne 1b\n" + : "=r"(tbl), "=r"(tbu), "=r"(temp) + : + : "cc"); + + return (((uint64_t)tbu) << 32) | (uint64_t)tbl; +} +#define readtime readtime +#endif + +/* Verifies that clobbered callee-saved registers + * are properly saved and restored */ +void checkasm_checked_call(void *func, ...); + +#if ARCH_X86_64 +/* YMM and ZMM registers on x86 are turned off to save power when they haven't + * been used for some period of time. When they are used there will be a + * "warmup" period during which performance will be reduced and inconsistent + * which is problematic when trying to benchmark individual functions. We can + * work around this by periodically issuing "dummy" instructions that uses + * those registers to keep them powered on. */ +void checkasm_simd_warmup(void); + +/* The upper 32 bits of 32-bit data types are undefined when passed as function + * parameters. In practice those bits usually end up being zero which may hide + * certain bugs, such as using a register containing undefined bits as a pointer + * offset, so we want to intentionally clobber those bits with junk to expose + * any issues. The following set of macros automatically calculates a bitmask + * specifying which parameters should have their upper halves clobbered. 
*/ +#ifdef _WIN32 +/* Integer and floating-point parameters share "register slots". */ +#define IGNORED_FP_ARGS 0 +#else +/* Up to 8 floating-point parameters are passed in XMM registers, which are + * handled orthogonally from integer parameters passed in GPR registers. */ +#define IGNORED_FP_ARGS 8 +#endif +#ifdef HAVE_C11_GENERIC +#define clobber_type(arg) _Generic((void (*)(void*, arg))NULL,\ + void (*)(void*, int32_t ): clobber_mask |= 1 << mpos++,\ + void (*)(void*, uint32_t): clobber_mask |= 1 << mpos++,\ + void (*)(void*, float ): mpos += (fp_args++ >= IGNORED_FP_ARGS),\ + void (*)(void*, double ): mpos += (fp_args++ >= IGNORED_FP_ARGS),\ + default: mpos++) +#define init_clobber_mask(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, ...)\ + unsigned clobber_mask = 0;\ + {\ + int mpos = 0, fp_args = 0;\ + clobber_type(a); clobber_type(b); clobber_type(c); clobber_type(d);\ + clobber_type(e); clobber_type(f); clobber_type(g); clobber_type(h);\ + clobber_type(i); clobber_type(j); clobber_type(k); clobber_type(l);\ + clobber_type(m); clobber_type(n); clobber_type(o); clobber_type(p);\ + } +#else +/* Skip parameter clobbering on compilers without support for _Generic() */ +#define init_clobber_mask(...) unsigned clobber_mask = 0 +#endif +#define declare_new(ret, ...)\ + ret (*checked_call)(__VA_ARGS__, int, int, int, int, int, int, int,\ + int, int, int, int, int, int, int, int, int,\ + void*, unsigned) =\ + (void*)checkasm_checked_call;\ + init_clobber_mask(__VA_ARGS__, void*, void*, void*, void*,\ + void*, void*, void*, void*, void*, void*,\ + void*, void*, void*, void*, void*); +#define call_new(...)\ + (checkasm_set_signal_handler_state(1),\ + checkasm_simd_warmup(),\ + checked_call(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8,\ + 7, 6, 5, 4, 3, 2, 1, func_new, clobber_mask));\ + checkasm_set_signal_handler_state(0) +#elif ARCH_X86_32 +#define declare_new(ret, ...)\ + ret (*checked_call)(void *, __VA_ARGS__, int, int, int, int, int, int,\ + int, int, int, int, int, int, int, int, int) =\ + (void *)checkasm_checked_call; +#define call_new(...)\ + (checkasm_set_signal_handler_state(1),\ + checked_call(func_new, __VA_ARGS__, 15, 14, 13, 12,\ + 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1));\ + checkasm_set_signal_handler_state(0) +#elif ARCH_ARM +/* Use a dummy argument, to offset the real parameters by 2, not only 1. + * This makes sure that potential 8-byte-alignment of parameters is kept + * the same even when the extra parameters have been removed. 
*/ +void checkasm_checked_call_vfp(void *func, int dummy, ...); +#define declare_new(ret, ...)\ + ret (*checked_call)(void *, int dummy, __VA_ARGS__,\ + int, int, int, int, int, int, int, int,\ + int, int, int, int, int, int, int) =\ + (void *)checkasm_checked_call_vfp; +#define call_new(...)\ + (checkasm_set_signal_handler_state(1),\ + checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\ + checkasm_set_signal_handler_state(0) +#elif ARCH_AARCH64 && !defined(__APPLE__) +void checkasm_stack_clobber(uint64_t clobber, ...); +#define declare_new(ret, ...)\ + ret (*checked_call)(void *, int, int, int, int, int, int, int,\ + __VA_ARGS__, int, int, int, int, int, int, int, int,\ + int, int, int, int, int, int, int) =\ + (void *)checkasm_checked_call; +#define CLOB (UINT64_C(0xdeadbeefdeadbeef)) +#define call_new(...)\ + (checkasm_set_signal_handler_state(1),\ + checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\ + CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\ + CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\ + CLOB, CLOB, CLOB, CLOB, CLOB),\ + checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\ + 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\ + checkasm_set_signal_handler_state(0) +#else +#define declare_new(ret, ...) +#define call_new(...)\ + (checkasm_set_signal_handler_state(1),\ + ((func_type *)func_new)(__VA_ARGS__));\ + checkasm_set_signal_handler_state(0) +#endif +#else /* HAVE_ASM */ +#define declare_new(ret, ...) +/* Call the function */ +#define call_new(...)\ + (checkasm_set_signal_handler_state(1),\ + ((func_type *)func_new)(__VA_ARGS__));\ + checkasm_set_signal_handler_state(0) +#endif /* HAVE_ASM */ + +/* Benchmark the function */ +#ifdef readtime +#define bench_new(...)\ + do {\ + if (checkasm_bench_func()) {\ + func_type *const tfunc = func_new;\ + checkasm_set_signal_handler_state(1);\ + uint64_t tsum = 0;\ + int tcount = 0;\ + for (int ti = 0; ti < BENCH_RUNS; ti++) {\ + uint64_t t = readtime();\ + int talt = 0; (void)talt;\ + tfunc(__VA_ARGS__);\ + talt = 1;\ + tfunc(__VA_ARGS__);\ + talt = 0;\ + tfunc(__VA_ARGS__);\ + talt = 1;\ + tfunc(__VA_ARGS__);\ + t = readtime() - t;\ + if (t*tcount <= tsum*4 && ti > 0) {\ + tsum += t;\ + tcount++;\ + }\ + }\ + checkasm_set_signal_handler_state(0);\ + checkasm_update_bench(tcount, tsum);\ + } else {\ + const int talt = 0; (void)talt;\ + call_new(__VA_ARGS__);\ + }\ + } while (0) +#else +#define bench_new(...) do {} while (0) +#endif + +/* Alternates between two pointers. Intended to be used within bench_new() + * calls for functions which modifies their input buffer(s) to ensure that + * throughput, and not latency, is measured. */ +#define alternate(a, b) (talt ? 
(b) : (a))

+#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))
+/* Padded rectangle for detecting out-of-bounds writes: 16 guard rows
+ * above and below, a 64-pixel guard band on the left, and rows padded
+ * up to a multiple of 64 pixels. CLEAR_PIXEL_RECT() fills the whole
+ * buffer, guard bands included, with a 0x99 canary value. */
+#define PIXEL_RECT(name, w, h) \
+    ALIGN_STK_64(pixel, name##_buf, ((h)+32)*(ROUND_UP(w,64)+64) + 64,); \
+    ptrdiff_t name##_stride = sizeof(pixel)*(ROUND_UP(w,64)+64); \
+    (void)name##_stride; \
+    pixel *name = name##_buf + (ROUND_UP(w,64)+64)*16 + 64
+
+#define CLEAR_PIXEL_RECT(name) \
+    memset(name##_buf, 0x99, sizeof(name##_buf))
+
+#define DECL_CHECKASM_CHECK_FUNC(type) \
+int checkasm_check_##type(const char *const file, const int line, \
+                          const type *const buf1, const ptrdiff_t stride1, \
+                          const type *const buf2, const ptrdiff_t stride2, \
+                          const int w, const int h, const char *const name, \
+                          const int align_w, const int align_h, \
+                          const int padding)
+
+DECL_CHECKASM_CHECK_FUNC(int8_t);
+DECL_CHECKASM_CHECK_FUNC(int16_t);
+DECL_CHECKASM_CHECK_FUNC(int32_t);
+DECL_CHECKASM_CHECK_FUNC(uint8_t);
+DECL_CHECKASM_CHECK_FUNC(uint16_t);
+DECL_CHECKASM_CHECK_FUNC(uint32_t);
+
+#define CONCAT(a,b) a ## b
+
+/* checkasm_check(type, buf1, stride1, buf2, stride2, w, h, name) compares
+ * two rectangles; checkasm_check2() additionally takes the align_w,
+ * align_h and padding arguments declared above (they default to 0). */
+#define checkasm_check2(prefix, ...) CONCAT(checkasm_check_, prefix)(__FILE__, __LINE__, __VA_ARGS__)
+#define checkasm_check(prefix, ...) checkasm_check2(prefix, __VA_ARGS__, 0, 0, 0)
+
+#ifdef BITDEPTH
+#define checkasm_check_pixel(...) checkasm_check(PIXEL_TYPE, __VA_ARGS__)
+#define checkasm_check_pixel_padded(...) checkasm_check2(PIXEL_TYPE, __VA_ARGS__, 1, 1, 8)
+#define checkasm_check_pixel_padded_align(...) checkasm_check2(PIXEL_TYPE, __VA_ARGS__, 8)
+#define checkasm_check_coef(...) checkasm_check(COEF_TYPE, __VA_ARGS__)
+#endif
+
+#endif /* DAV1D_TESTS_CHECKASM_CHECKASM_H */
diff --git a/third_party/dav1d/tests/checkasm/filmgrain.c b/third_party/dav1d/tests/checkasm/filmgrain.c
new file mode 100644
index 0000000000..638e83fd11
--- /dev/null
+++ b/third_party/dav1d/tests/checkasm/filmgrain.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include <string.h>
+
+#include "src/levels.h"
+#include "src/filmgrain.h"
+#define UNIT_TEST 1
+#include "src/fg_apply_tmpl.c"
+
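+/* The grain LUT `entry` type is int8_t at 8 bpc and int16_t at higher
+ * bit depths, so pick the matching buffer-compare helper. */
+#if BITDEPTH == 8
+#define checkasm_check_entry(...) 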
checkasm_check(int8_t, __VA_ARGS__) +#else +#define checkasm_check_entry(...) checkasm_check(int16_t, __VA_ARGS__) +#endif + +static const char ss_name[][4] = { + [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420", + [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422", + [DAV1D_PIXEL_LAYOUT_I444 - 1] = "444", +}; + +static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) { + ALIGN_STK_16(entry, grain_lut_c, GRAIN_HEIGHT,[GRAIN_WIDTH]); + ALIGN_STK_16(entry, grain_lut_a, GRAIN_HEIGHT + 1,[GRAIN_WIDTH]); + + declare_func(void, entry grain_lut[][GRAIN_WIDTH], + const Dav1dFilmGrainData *data HIGHBD_DECL_SUFFIX); + + for (int i = 0; i < 4; i++) { + if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) { + ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,); + fg_data[0].seed = rnd() & 0xFFFF; + +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#endif + + fg_data[0].grain_scale_shift = rnd() & 3; + fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; + fg_data[0].ar_coeff_lag = i; + const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); + for (int n = 0; n < num_y_pos; n++) + fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128; + + call_ref(grain_lut_c, fg_data HIGHBD_TAIL_SUFFIX); + call_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX); + checkasm_check_entry(grain_lut_c[0], sizeof(entry) * GRAIN_WIDTH, + grain_lut_a[0], sizeof(entry) * GRAIN_WIDTH, + GRAIN_WIDTH, GRAIN_HEIGHT, "grain_lut"); + + bench_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX); + } + } + + report("gen_grain_y"); +} + +static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) { + ALIGN_STK_16(entry, grain_lut_y, GRAIN_HEIGHT + 1,[GRAIN_WIDTH]); + ALIGN_STK_16(entry, grain_lut_c, GRAIN_HEIGHT, [GRAIN_WIDTH]); + ALIGN_STK_16(entry, grain_lut_a, GRAIN_HEIGHT + 1,[GRAIN_WIDTH]); + + declare_func(void, entry grain_lut[][GRAIN_WIDTH], + const entry grain_lut_y[][GRAIN_WIDTH], + const Dav1dFilmGrainData *data, intptr_t uv HIGHBD_DECL_SUFFIX); + + for (int layout_idx = 0; layout_idx < 3; layout_idx++) { + const enum Dav1dPixelLayout layout = layout_idx + 1; + const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444; + const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420; + + for (int i = 0; i < 4; i++) { + if (check_func(dsp->generate_grain_uv[layout_idx], + "gen_grain_uv_ar%d_%dbpc_%s", + i, BITDEPTH, ss_name[layout_idx])) + { + ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,); + fg_data[0].seed = rnd() & 0xFFFF; + +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#endif + + fg_data[0].num_y_points = rnd() & 1; + fg_data[0].grain_scale_shift = rnd() & 3; + fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; + fg_data[0].ar_coeff_lag = i; + const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); + for (int n = 0; n < num_y_pos; n++) + fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128; + dsp->generate_grain_y(grain_lut_y, fg_data HIGHBD_TAIL_SUFFIX); + + const int uv = rnd() & 1; + const int num_uv_pos = num_y_pos + !!fg_data[0].num_y_points; + for (int n = 0; n < num_uv_pos; n++) + fg_data[0].ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128; + if (!fg_data[0].num_y_points) + fg_data[0].ar_coeffs_uv[uv][num_uv_pos] = 0; + memset(grain_lut_c, 0xff, sizeof(grain_lut_c)); + memset(grain_lut_a, 0xff, sizeof(grain_lut_a)); + call_ref(grain_lut_c, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX); + call_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX); + int w = ss_x ? 44 : GRAIN_WIDTH; + int h = ss_y ? 
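/* chroma grain is stored subsampled: 44 columns and 38 rows instead of the full GRAIN_WIDTH x GRAIN_HEIGHT */ 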
38 : GRAIN_HEIGHT; + checkasm_check_entry(grain_lut_c[0], sizeof(entry) * GRAIN_WIDTH, + grain_lut_a[0], sizeof(entry) * GRAIN_WIDTH, + w, h, "grain_lut"); + + bench_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX); + } + } + } + + report("gen_grain_uv"); +} + +static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { + PIXEL_RECT(c_dst, 128, 32); + PIXEL_RECT(a_dst, 128, 32); + PIXEL_RECT(src, 128, 32); + const ptrdiff_t stride = c_dst_stride; + + declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride, + const Dav1dFilmGrainData *data, size_t pw, + const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], + int bh, int row_num HIGHBD_DECL_SUFFIX); + + if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) { + ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,); + ALIGN_STK_16(entry, grain_lut, GRAIN_HEIGHT + 1,[GRAIN_WIDTH]); + ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,); + fg_data[0].seed = rnd() & 0xFFFF; + +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + fg_data[0].grain_scale_shift = rnd() & 3; + fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; + fg_data[0].ar_coeff_lag = rnd() & 3; + const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); + for (int n = 0; n < num_y_pos; n++) + fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128; + dsp->generate_grain_y(grain_lut, fg_data HIGHBD_TAIL_SUFFIX); + + fg_data[0].num_y_points = 2 + (rnd() % 13); + const int pad = 0xff / fg_data[0].num_y_points; + for (int n = 0; n < fg_data[0].num_y_points; n++) { + fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points; + fg_data[0].y_points[n][0] += rnd() % pad; + fg_data[0].y_points[n][1] = rnd() & 0xff; + } + generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points, + fg_data[0].num_y_points, scaling); + + fg_data[0].clip_to_restricted_range = rnd() & 1; + fg_data[0].scaling_shift = (rnd() & 3) + 8; + for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1; + fg_data[0].overlap_flag++) + { + for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) { + int w, h, row_num; + if (fg_data[0].overlap_flag) { + w = 35 + (rnd() % 93); + if (i == 0) { + row_num = 0; + h = 1 + (rnd() % 31); + } else { + row_num = 1 + (rnd() & 0x7ff); + if (i == 1) { + h = 3 + (rnd() % 30); + } else { + h = 1 + (rnd() & 1); + } + } + } else { + w = 1 + (rnd() & 127); + h = 1 + (rnd() & 31); + row_num = rnd() & 0x7ff; + } + + for (int y = 0; y < 32; y++) { + // Src pixels past the right edge can be uninitialized + for (int x = 0; x < 128; x++) + src[y * PXSTRIDE(stride) + x] = rnd(); + for (int x = 0; x < w; x++) + src[y * PXSTRIDE(stride) + x] &= bitdepth_max; + } + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h, + row_num HIGHBD_TAIL_SUFFIX); + call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut, h, + row_num HIGHBD_TAIL_SUFFIX); + + checkasm_check_pixel_padded_align(c_dst, stride, a_dst, stride, + w, h, "dst", 32, 2); + } + } + fg_data[0].overlap_flag = 1; + for (int y = 0; y < 32; y++) { + // Make sure all pixels are in range + for (int x = 0; x < 128; x++) + src[y * PXSTRIDE(stride) + x] &= bitdepth_max; + } + bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32, + 1 HIGHBD_TAIL_SUFFIX); + } + + report("fgy_32x32xn"); +} + +static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { + PIXEL_RECT(c_dst, 128, 32); + 
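/* c_ = reference C implementation output, a_ = asm output under test */ 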
PIXEL_RECT(a_dst, 128, 32); + PIXEL_RECT(src, 128, 32); + PIXEL_RECT(luma_src, 128, 32); + const ptrdiff_t lstride = luma_src_stride; + + declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride, + const Dav1dFilmGrainData *data, size_t pw, + const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, + const pixel *luma_row, ptrdiff_t luma_stride, int uv_pl, + int is_identity HIGHBD_DECL_SUFFIX); + + for (int layout_idx = 0; layout_idx < 3; layout_idx++) { + const enum Dav1dPixelLayout layout = layout_idx + 1; + const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444; + const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420; + const ptrdiff_t stride = c_dst_stride; + + for (int csfl = 0; csfl <= 1; csfl++) { + if (check_func(dsp->fguv_32x32xn[layout_idx], + "fguv_32x32xn_%dbpc_%s_csfl%d", + BITDEPTH, ss_name[layout_idx], csfl)) + { + ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,); + ALIGN_STK_16(entry, grain_lut, 2,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]); + ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,); + + fg_data[0].seed = rnd() & 0xFFFF; + +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + const int uv_pl = rnd() & 1; + const int is_identity = rnd() & 1; + + fg_data[0].grain_scale_shift = rnd() & 3; + fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; + fg_data[0].ar_coeff_lag = rnd() & 3; + fg_data[0].num_y_points = csfl ? 2 + (rnd() % 13) : 0; + const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); + for (int n = 0; n < num_y_pos; n++) + fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128; + const int num_uv_pos = num_y_pos + 1; + for (int n = 0; n < num_uv_pos; n++) + fg_data[0].ar_coeffs_uv[uv_pl][n] = (rnd() & 0xff) - 128; + dsp->generate_grain_y(grain_lut[0], fg_data HIGHBD_TAIL_SUFFIX); + dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0], + fg_data, uv_pl HIGHBD_TAIL_SUFFIX); + + if (csfl) { + const int pad = 0xff / fg_data[0].num_y_points; + for (int n = 0; n < fg_data[0].num_y_points; n++) { + fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points; + fg_data[0].y_points[n][0] += rnd() % pad; + fg_data[0].y_points[n][1] = rnd() & 0xff; + } + generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points, + fg_data[0].num_y_points, scaling); + } else { + fg_data[0].num_uv_points[uv_pl] = 2 + (rnd() % 9); + const int pad = 0xff / fg_data[0].num_uv_points[uv_pl]; + for (int n = 0; n < fg_data[0].num_uv_points[uv_pl]; n++) { + fg_data[0].uv_points[uv_pl][n][0] = 0xff * n / fg_data[0].num_uv_points[uv_pl]; + fg_data[0].uv_points[uv_pl][n][0] += rnd() % pad; + fg_data[0].uv_points[uv_pl][n][1] = rnd() & 0xff; + } + generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].uv_points[uv_pl], + fg_data[0].num_uv_points[uv_pl], scaling); + + fg_data[0].uv_mult[uv_pl] = (rnd() & 0xff) - 128; + fg_data[0].uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128; + fg_data[0].uv_offset[uv_pl] = (rnd() & 0x1ff) - 256; + } + + fg_data[0].clip_to_restricted_range = rnd() & 1; + fg_data[0].scaling_shift = (rnd() & 3) + 8; + fg_data[0].chroma_scaling_from_luma = csfl; + for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1; + fg_data[0].overlap_flag++) + { + for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) { + int w, h, row_num; + if (fg_data[0].overlap_flag) { + w = (36 >> ss_x) + (rnd() % (92 >> ss_x)); + if (i == 0) { + row_num = 0; + h = 1 + (rnd() & (31 >> ss_y)); + } else { + row_num = 1 + (rnd() & 0x7ff); + if (i == 1) { + h = 
(ss_y ? 2 : 3) + (rnd() % (ss_y ? 15 : 30)); + } else { + h = ss_y ? 1 : 1 + (rnd() & 1); + } + } + } else { + w = 1 + (rnd() & (127 >> ss_x)); + h = 1 + (rnd() & (31 >> ss_y)); + row_num = rnd() & 0x7ff; + } + + for (int y = 0; y < 32; y++) { + // Src pixels past the right edge can be uninitialized + for (int x = 0; x < 128; x++) { + src[y * PXSTRIDE(stride) + x] = rnd(); + luma_src[y * PXSTRIDE(lstride) + x] = rnd(); + } + for (int x = 0; x < w; x++) + src[y * PXSTRIDE(stride) + x] &= bitdepth_max; + for (int x = 0; x < (w << ss_x); x++) + luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max; + } + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h, + row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); + call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut[1], h, + row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); + + checkasm_check_pixel_padded_align(c_dst, stride, + a_dst, stride, + w, h, "dst", + 32 >> ss_x, 4); + } + } + + fg_data[0].overlap_flag = 1; + for (int y = 0; y < 32; y++) { + // Make sure all pixels are in range + for (int x = 0; x < 128; x++) { + src[y * PXSTRIDE(stride) + x] &= bitdepth_max; + luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max; + } + } + bench_new(a_dst, src, stride, fg_data, 64 >> ss_x, scaling, grain_lut[1], 32 >> ss_y, + 1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); + } + } + } + + report("fguv_32x32xn"); +} + +void bitfn(checkasm_check_filmgrain)(void) { + Dav1dFilmGrainDSPContext c; + + bitfn(dav1d_film_grain_dsp_init)(&c); + + check_gen_grny(&c); + check_gen_grnuv(&c); + check_fgy_sbrow(&c); + check_fguv_sbrow(&c); +} diff --git a/third_party/dav1d/tests/checkasm/ipred.c b/third_party/dav1d/tests/checkasm/ipred.c new file mode 100644 index 0000000000..3676b809b3 --- /dev/null +++ b/third_party/dav1d/tests/checkasm/ipred.c @@ -0,0 +1,297 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "tests/checkasm/checkasm.h" +#include "src/ipred.h" +#include "src/levels.h" + +#include <stdio.h> + +static const char *const intra_pred_mode_names[N_IMPL_INTRA_PRED_MODES] = { + [DC_PRED] = "dc", + [DC_128_PRED] = "dc_128", + [TOP_DC_PRED] = "dc_top", + [LEFT_DC_PRED] = "dc_left", + [HOR_PRED] = "h", + [VERT_PRED] = "v", + [PAETH_PRED] = "paeth", + [SMOOTH_PRED] = "smooth", + [SMOOTH_V_PRED] = "smooth_v", + [SMOOTH_H_PRED] = "smooth_h", + [Z1_PRED] = "z1", + [Z2_PRED] = "z2", + [Z3_PRED] = "z3", + [FILTER_PRED] = "filter" +}; + +static const char *const cfl_ac_names[3] = { "420", "422", "444" }; + +static const char *const cfl_pred_mode_names[DC_128_PRED + 1] = { + [DC_PRED] = "cfl", + [DC_128_PRED] = "cfl_128", + [TOP_DC_PRED] = "cfl_top", + [LEFT_DC_PRED] = "cfl_left", +}; + +static const uint8_t z_angles[27] = { + 3, 6, 9, + 14, 17, 20, 23, 26, 29, 32, + 36, 39, 42, 45, 48, 51, 54, + 58, 61, 64, 67, 70, 73, 76, + 81, 84, 87 +}; + +static void check_intra_pred(Dav1dIntraPredDSPContext *const c) { + PIXEL_RECT(c_dst, 64, 64); + PIXEL_RECT(a_dst, 64, 64); + ALIGN_STK_64(pixel, topleft_buf, 257,); + pixel *const topleft = topleft_buf + 128; + + declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft, + int width, int height, int angle, int max_width, int max_height + HIGHBD_DECL_SUFFIX); + + for (int mode = 0; mode < N_IMPL_INTRA_PRED_MODES; mode++) { + int bpc_min = BITDEPTH, bpc_max = BITDEPTH; + if (mode == FILTER_PRED && BITDEPTH == 16) { + bpc_min = 10; + bpc_max = 12; + } + for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) + for (int w = 4; w <= (mode == FILTER_PRED ? 32 : 64); w <<= 1) + if (check_func(c->intra_pred[mode], "intra_pred_%s_w%d_%dbpc", + intra_pred_mode_names[mode], w, bpc)) + { + for (int h = imax(w / 4, 4); h <= imin(w * 4, + (mode == FILTER_PRED ? 32 : 64)); h <<= 1) + { + const ptrdiff_t stride = c_dst_stride; + int nb_iters = (mode >= Z1_PRED && mode <= Z3_PRED) ? 5 : 1; + + for (int iter = 0; iter < nb_iters; iter++) { + int a = 0, maxw = 0, maxh = 0; + if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */ + a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) | + (rnd() & 0x600); + if (mode == Z2_PRED) { + maxw = rnd(), maxh = rnd(); + maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1)); + maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1)); + } + } else if (mode == FILTER_PRED) /* filter_idx */ + a = (rnd() % 5) | (rnd() & ~511); + + int bitdepth_max; + if (bpc == 16) + bitdepth_max = rnd() & 1 ? 
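/* at 16 bpc, randomly pick a 10- or 12-bit pixel maximum */ 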
0x3ff : 0xfff; + else + bitdepth_max = (1 << bpc) - 1; + + for (int i = -h * 2; i <= w * 2; i++) + topleft[i] = rnd() & bitdepth_max; + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh + HIGHBD_TAIL_SUFFIX); + call_new(a_dst, stride, topleft, w, h, a, maxw, maxh + HIGHBD_TAIL_SUFFIX); + if (checkasm_check_pixel_padded(c_dst, stride, + a_dst, stride, + w, h, "dst")) + { + if (mode == Z1_PRED || mode == Z3_PRED) + fprintf(stderr, "angle = %d (0x%03x)\n", + a & 0x1ff, a & 0x600); + else if (mode == Z2_PRED) + fprintf(stderr, "angle = %d (0x%03x), " + "max_width = %d, max_height = %d\n", + a & 0x1ff, a & 0x600, maxw, maxh); + else if (mode == FILTER_PRED) + fprintf(stderr, "filter_idx = %d\n", a & 0x1ff); + break; + } + + bench_new(a_dst, stride, topleft, w, h, a, 128, 128 + HIGHBD_TAIL_SUFFIX); + } + } + } + } + report("intra_pred"); +} + +static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) { + ALIGN_STK_64(int16_t, c_dst, 32 * 32,); + ALIGN_STK_64(int16_t, a_dst, 32 * 32,); + ALIGN_STK_64(pixel, luma, 32 * 32,); + + declare_func(void, int16_t *ac, const pixel *y, ptrdiff_t stride, + int w_pad, int h_pad, int cw, int ch); + + for (int layout = 1; layout <= DAV1D_PIXEL_LAYOUT_I444; layout++) { + const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; + const int h_step = 2 >> ss_hor, v_step = 2 >> ss_ver; + for (int w = 4; w <= (32 >> ss_hor); w <<= 1) + if (check_func(c->cfl_ac[layout - 1], "cfl_ac_%s_w%d_%dbpc", + cfl_ac_names[layout - 1], w, BITDEPTH)) + { + for (int h = imax(w / 4, 4); + h <= imin(w * 4, (32 >> ss_ver)); h <<= 1) + { + const ptrdiff_t stride = 32 * sizeof(pixel); + for (int w_pad = imax((w >> 2) - h_step, 0); + w_pad >= 0; w_pad -= h_step) + { + for (int h_pad = imax((h >> 2) - v_step, 0); + h_pad >= 0; h_pad -= v_step) + { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + for (int y = 0; y < (h << ss_ver); y++) + for (int x = 0; x < (w << ss_hor); x++) + luma[y * 32 + x] = rnd() & bitdepth_max; + + call_ref(c_dst, luma, stride, w_pad, h_pad, w, h); + call_new(a_dst, luma, stride, w_pad, h_pad, w, h); + checkasm_check(int16_t, c_dst, w * sizeof(*c_dst), + a_dst, w * sizeof(*a_dst), + w, h, "dst"); + } + } + + bench_new(a_dst, luma, stride, 0, 0, w, h); + } + } + } + report("cfl_ac"); +} + +static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) { + PIXEL_RECT(c_dst, 32, 32); + PIXEL_RECT(a_dst, 32, 32); + ALIGN_STK_64(int16_t, ac, 32 * 32,); + ALIGN_STK_64(pixel, topleft_buf, 257,); + pixel *const topleft = topleft_buf + 128; + + declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft, + int width, int height, const int16_t *ac, int alpha + HIGHBD_DECL_SUFFIX); + + for (int mode = 0; mode <= DC_128_PRED; mode += 1 + 2 * !mode) + for (int w = 4; w <= 32; w <<= 1) + if (check_func(c->cfl_pred[mode], "cfl_pred_%s_w%d_%dbpc", + cfl_pred_mode_names[mode], w, BITDEPTH)) + { + for (int h = imax(w / 4, 4); h <= imin(w * 4, 32); h <<= 1) + { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + int alpha = ((rnd() & 15) + 1) * (1 - (rnd() & 2)); + + for (int i = -h * 2; i <= w * 2; i++) + topleft[i] = rnd() & bitdepth_max; + + int luma_avg = w * h >> 1; + for (int i = 0; i < w * h; i++) + luma_avg += ac[i] = rnd() & (bitdepth_max << 3); + luma_avg /= w * h; + for (int i = 0; i < w * h; i++) + ac[i] -= luma_avg; + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, topleft, w, h, ac, alpha + HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, topleft, w, h, ac, alpha + HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(a_dst, a_dst_stride, topleft, w, h, ac, alpha + HIGHBD_TAIL_SUFFIX); + } + } + report("cfl_pred"); +} + +static void check_pal_pred(Dav1dIntraPredDSPContext *const c) { + PIXEL_RECT(c_dst, 64, 64); + PIXEL_RECT(a_dst, 64, 64); + ALIGN_STK_64(uint8_t, idx, 64 * 64,); + ALIGN_STK_16(uint16_t, pal, 8,); + + declare_func(void, pixel *dst, ptrdiff_t stride, const uint16_t *pal, + const uint8_t *idx, int w, int h); + + for (int w = 4; w <= 64; w <<= 1) + if (check_func(c->pal_pred, "pal_pred_w%d_%dbpc", w, BITDEPTH)) + for (int h = imax(w / 4, 4); h <= imin(w * 4, 64); h <<= 1) + { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + for (int i = 0; i < 8; i++) + pal[i] = rnd() & bitdepth_max; + + for (int i = 0; i < w * h; i++) + idx[i] = rnd() & 7; + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, pal, idx, w, h); + call_new(a_dst, a_dst_stride, pal, idx, w, h); + checkasm_check_pixel_padded(c_dst, c_dst_stride, + a_dst, a_dst_stride, w, h, "dst"); + + bench_new(a_dst, a_dst_stride, pal, idx, w, h); + } + report("pal_pred"); +} + +void bitfn(checkasm_check_ipred)(void) { + Dav1dIntraPredDSPContext c; + bitfn(dav1d_intra_pred_dsp_init)(&c); + + check_intra_pred(&c); + check_cfl_ac(&c); + check_cfl_pred(&c); + check_pal_pred(&c); +} diff --git a/third_party/dav1d/tests/checkasm/itx.c b/third_party/dav1d/tests/checkasm/itx.c new file mode 100644 index 0000000000..c7cc411ff5 --- /dev/null +++ b/third_party/dav1d/tests/checkasm/itx.c @@ -0,0 +1,318 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "tests/checkasm/checkasm.h" + +#include <math.h> + +#include "src/itx.h" +#include "src/levels.h" +#include "src/scan.h" +#include "src/tables.h" + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif +#ifndef M_SQRT1_2 +#define M_SQRT1_2 0.707106781186547524401 +#endif + +enum Tx1D { DCT, ADST, FLIPADST, IDENTITY, WHT }; + +static const uint8_t itx_1d_types[N_TX_TYPES_PLUS_LL][2] = { + [DCT_DCT] = { DCT, DCT }, + [ADST_DCT] = { DCT, ADST }, + [DCT_ADST] = { ADST, DCT }, + [ADST_ADST] = { ADST, ADST }, + [FLIPADST_DCT] = { DCT, FLIPADST }, + [DCT_FLIPADST] = { FLIPADST, DCT }, + [FLIPADST_FLIPADST] = { FLIPADST, FLIPADST }, + [ADST_FLIPADST] = { FLIPADST, ADST }, + [FLIPADST_ADST] = { ADST, FLIPADST }, + [IDTX] = { IDENTITY, IDENTITY }, + [V_DCT] = { IDENTITY, DCT }, + [H_DCT] = { DCT, IDENTITY }, + [V_ADST] = { IDENTITY, ADST }, + [H_ADST] = { ADST, IDENTITY }, + [V_FLIPADST] = { IDENTITY, FLIPADST }, + [H_FLIPADST] = { FLIPADST, IDENTITY }, + [WHT_WHT] = { WHT, WHT }, +}; + +static const char *const itx_1d_names[5] = { + [DCT] = "dct", + [ADST] = "adst", + [FLIPADST] = "flipadst", + [IDENTITY] = "identity", + [WHT] = "wht" +}; + +static const double scaling_factors[9] = { + 4.0000, /* 4x4 */ + 4.0000 * M_SQRT1_2, /* 4x8 8x4 */ + 2.0000, /* 4x16 8x8 16x4 */ + 2.0000 * M_SQRT1_2, /* 8x16 16x8 */ + 1.0000, /* 8x32 16x16 32x8 */ + 0.5000 * M_SQRT1_2, /* 16x32 32x16 */ + 0.2500, /* 16x64 32x32 64x16 */ + 0.1250 * M_SQRT1_2, /* 32x64 64x32 */ + 0.0625, /* 64x64 */ +}; + +/* FIXME: Ensure that those forward transforms are similar to the real AV1 + * transforms. The FLIPADST currently uses the ADST forward transform for + * example which is obviously "incorrect", but we're just using it for now + * since it does produce coefficients in the correct range at least. */ + +/* DCT-II */ +static void fdct_1d(double *const out, const double *const in, const int sz) { + for (int i = 0; i < sz; i++) { + out[i] = 0.0; + for (int j = 0; j < sz; j++) + out[i] += in[j] * cos(M_PI * (2 * j + 1) * i / (sz * 2.0)); + } + out[0] *= M_SQRT1_2; +} + +/* See "Towards jointly optimal spatial prediction and adaptive transform in + * video/image coding", by J. Han, A. Saxena, and K. Rose + * IEEE Proc. ICASSP, pp. 726-729, Mar. 2010. + * and "A Butterfly Structured Design of The Hybrid Transform Coding Scheme", + * by Jingning Han, Yaowu Xu, and Debargha Mukherjee + * http://research.google.com/pubs/archive/41418.pdf + */ +static void fadst_1d(double *const out, const double *const in, const int sz) { + for (int i = 0; i < sz; i++) { + out[i] = 0.0; + for (int j = 0; j < sz; j++) + out[i] += in[j] * sin(M_PI * + (sz == 4 ? 
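/* the 4-point ADST uses AV1's sin(k*pi/9)-style basis, hence the /9.0 */ 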
(    j + 1) * (2 * i + 1) / 9.0 :
+                                      (2 * j + 1) * (2 * i + 1) / (sz * 4.0)));
+    }
+}
+
+static void fwht4_1d(double *const out, const double *const in)
+{
+    const double t0 = in[0] + in[1];
+    const double t3 = in[3] - in[2];
+    const double t4 = (t0 - t3) * 0.5;
+    const double t1 = t4 - in[1];
+    const double t2 = t4 - in[2];
+    out[0] = t0 - t2;
+    out[1] = t2;
+    out[2] = t3 + t1;
+    out[3] = t1;
+}
+
+static int copy_subcoefs(coef *coeff,
+                         const enum RectTxfmSize tx, const enum TxfmType txtp,
+                         const int sw, const int sh, const int subsh)
+{
+    /* Copy the topleft coefficients such that the return value (being the
+     * coefficient scantable index for the eob token) guarantees that only
+     * the topleft $sub out of $sz (where $sz >= $sub) coefficients in both
+     * dimensions are non-zero. This leads to branching to specific optimized
+     * simd versions (e.g. dc-only) so that we get full asm coverage in this
+     * test */
+
+    const enum TxClass tx_class = dav1d_tx_type_class[txtp];
+    const uint16_t *const scan = dav1d_scans[tx];
+    const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;
+    const int sub_low  = subsh > 1 ? sub_high - 8 : 0;
+    int n, eob;
+
+    for (n = 0, eob = 0; n < sw * sh; n++) {
+        int rc, rcx, rcy;
+        if (tx_class == TX_CLASS_2D)
+            rc = scan[n], rcx = rc % sh, rcy = rc / sh;
+        else if (tx_class == TX_CLASS_H)
+            rcx = n % sh, rcy = n / sh, rc = n;
+        else /* tx_class == TX_CLASS_V */
+            rcx = n / sw, rcy = n % sw, rc = rcy * sh + rcx;
+
+        /* Pick a random eob within this sub-itx */
+        if (rcx > sub_high || rcy > sub_high) {
+            break; /* upper boundary */
+        } else if (!eob && (rcx > sub_low || rcy > sub_low))
+            eob = n; /* lower boundary */
+    }
+
+    if (eob)
+        eob += rnd() % (n - eob - 1);
+    if (tx_class == TX_CLASS_2D)
+        for (n = eob + 1; n < sw * sh; n++)
+            coeff[scan[n]] = 0;
+    else if (tx_class == TX_CLASS_H)
+        for (n = eob + 1; n < sw * sh; n++)
+            coeff[n] = 0;
+    else /* tx_class == TX_CLASS_V */ {
+        for (int rcx = eob / sw, rcy = eob % sw; rcx < sh; rcx++, rcy = -1)
+            while (++rcy < sw)
+                coeff[rcy * sh + rcx] = 0;
+        n = sw * sh;
+    }
+    for (; n < 32 * 32; n++)
+        coeff[n] = rnd();
+    return eob;
+}
+
+static int ftx(coef *const buf, const enum RectTxfmSize tx,
+               const enum TxfmType txtp, const int w, const int h,
+               const int subsh, const int bitdepth_max)
+{
+    double out[64 * 64], temp[64 * 64];
+    const double scale = scaling_factors[ctz(w * h) - 4];
+    const int sw = imin(w, 32), sh = imin(h, 32);
+
+    /* horizontal pass, one row at a time */
+    for (int i = 0; i < h; i++) {
+        double in[64], temp_out[64];
+
+        for (int i = 0; i < w; i++)
+            in[i] = (rnd() & (2 * bitdepth_max + 1)) - bitdepth_max;
+
+        switch (itx_1d_types[txtp][0]) {
+        case DCT:
+            fdct_1d(temp_out, in, w);
+            break;
+        case ADST:
+        case FLIPADST:
+            fadst_1d(temp_out, in, w);
+            break;
+        case WHT:
+            fwht4_1d(temp_out, in);
+            break;
+        case IDENTITY:
+            memcpy(temp_out, in, w * sizeof(*temp_out));
+            break;
+        }
+
+        for (int j = 0; j < w; j++)
+            temp[j * h + i] = temp_out[j] * scale;
+    }
+
+    /* vertical pass, one column at a time, using the second 1d type */
+    for (int i = 0; i < w; i++) {
+        switch (itx_1d_types[txtp][1]) {
+        case DCT:
+            fdct_1d(&out[i * h], &temp[i * h], h);
+            break;
+        case ADST:
+        case FLIPADST:
+            fadst_1d(&out[i * h], &temp[i * h], h);
+            break;
+        case WHT:
+            fwht4_1d(&out[i * h], &temp[i * h]);
+            break;
+        case IDENTITY:
+            memcpy(&out[i * h], &temp[i * h], h * sizeof(*out));
+            break;
+        }
+    }
+
+    for (int y = 0; y < sh; y++)
+        for (int x = 0; x < sw; x++)
+            buf[y * sw + x] = (coef) (out[y * w + x] + 0.5);
+
+    return copy_subcoefs(buf, tx, txtp, sw, sh, subsh);
+}
+
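+/* subsh controls how much of the top-left corner of the coefficient
+ * buffer stays populated (see copy_subcoefs() above): subsh == 0 keeps
+ * only the DC coefficient, subsh == 1 the top-left 8x8, subsh == 2 the
+ * top-left 16x16, and so on, so each eob-dependent asm path gets hit.
+ * For example, ftx(coeff, TX_4X4, DCT_DCT, 4, 4, 0, 255) returns an eob
+ * of 0 and zeroes all but the DC coefficient within the transform. */
+static void check_itxfm_add(Dav1dInvTxfmDSPContext *const c,
+                            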
const enum RectTxfmSize tx) +{ + ALIGN_STK_64(coef, coeff, 2, [32 * 32]); + PIXEL_RECT(c_dst, 64, 64); + PIXEL_RECT(a_dst, 64, 64); + + static const uint8_t subsh_iters[5] = { 2, 2, 3, 5, 5 }; + + const int w = dav1d_txfm_dimensions[tx].w * 4; + const int h = dav1d_txfm_dimensions[tx].h * 4; + const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw, + dav1d_txfm_dimensions[tx].lh)]; +#if BITDEPTH == 16 + const int bpc_min = 10, bpc_max = 12; +#else + const int bpc_min = 8, bpc_max = 8; +#endif + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, coef *coeff, + int eob HIGHBD_DECL_SUFFIX); + + for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) { + bitfn(dav1d_itx_dsp_init)(c, bpc); + for (enum TxfmType txtp = 0; txtp < N_TX_TYPES_PLUS_LL; txtp++) + for (int subsh = 0; subsh < subsh_max; subsh++) + if (check_func(c->itxfm_add[tx][txtp], + "inv_txfm_add_%dx%d_%s_%s_%d_%dbpc", + w, h, itx_1d_names[itx_1d_types[txtp][0]], + itx_1d_names[itx_1d_types[txtp][1]], subsh, + bpc)) + { + const int bitdepth_max = (1 << bpc) - 1; + const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max); + memcpy(coeff[1], coeff[0], sizeof(*coeff)); + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + c_dst[y*PXSTRIDE(c_dst_stride) + x] = + a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max; + + call_ref(c_dst, c_dst_stride, coeff[0], eob + HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, coeff[1], eob + HIGHBD_TAIL_SUFFIX); + + checkasm_check_pixel_padded(c_dst, c_dst_stride, + a_dst, a_dst_stride, + w, h, "dst"); + if (memcmp(coeff[0], coeff[1], sizeof(*coeff))) + fail(); + + bench_new(alternate(c_dst, a_dst), a_dst_stride, + alternate(coeff[0], coeff[1]), eob HIGHBD_TAIL_SUFFIX); + } + } + report("add_%dx%d", w, h); +} + +void bitfn(checkasm_check_itx)(void) { + static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = { + TX_4X4, RTX_4X8, RTX_4X16, + RTX_8X4, TX_8X8, RTX_8X16, RTX_8X32, + RTX_16X4, RTX_16X8, TX_16X16, RTX_16X32, RTX_16X64, + RTX_32X8, RTX_32X16, TX_32X32, RTX_32X64, + RTX_64X16, RTX_64X32, TX_64X64 + }; + + /* Zero unused function pointer elements. */ + Dav1dInvTxfmDSPContext c = { { { 0 } } }; + + for (int i = 0; i < N_RECT_TX_SIZES; i++) + check_itxfm_add(&c, txfm_size_order[i]); +} diff --git a/third_party/dav1d/tests/checkasm/loopfilter.c b/third_party/dav1d/tests/checkasm/loopfilter.c new file mode 100644 index 0000000000..7d70d0648c --- /dev/null +++ b/third_party/dav1d/tests/checkasm/loopfilter.c @@ -0,0 +1,203 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "tests/checkasm/checkasm.h" + +#include <string.h> + +#include "src/levels.h" +#include "src/loopfilter.h" + +static void init_lpf_border(pixel *const dst, const ptrdiff_t stride, + int E, int I, const int bitdepth_max) +{ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int F = 1 << bitdepth_min_8; + E <<= bitdepth_min_8; + I <<= bitdepth_min_8; + + const int filter_type = rnd() % 4; + const int edge_diff = rnd() % ((E + 2) * 4) - 2 * (E + 2); + switch (filter_type) { + case 0: // random, unfiltered + for (int i = -8; i < 8; i++) + dst[i * stride] = rnd() & bitdepth_max; + break; + case 1: // long flat + dst[-8 * stride] = rnd() & bitdepth_max; + dst[+7 * stride] = rnd() & bitdepth_max; + dst[+0 * stride] = rnd() & bitdepth_max; + dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff); + for (int i = 1; i < 7; i++) { + dst[-(1 + i) * stride] = iclip_pixel(dst[-1 * stride] + + rnd() % (2 * (F + 1)) - (F + 1)); + dst[+(0 + i) * stride] = iclip_pixel(dst[+0 * stride] + + rnd() % (2 * (F + 1)) - (F + 1)); + } + break; + case 2: // short flat + for (int i = 4; i < 8; i++) { + dst[-(1 + i) * stride] = rnd() & bitdepth_max; + dst[+(0 + i) * stride] = rnd() & bitdepth_max; + } + dst[+0 * stride] = rnd() & bitdepth_max; + dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff); + for (int i = 1; i < 4; i++) { + dst[-(1 + i) * stride] = iclip_pixel(dst[-1 * stride] + + rnd() % (2 * (F + 1)) - (F + 1)); + dst[+(0 + i) * stride] = iclip_pixel(dst[+0 * stride] + + rnd() % (2 * (F + 1)) - (F + 1)); + } + break; + case 3: // normal or hev + for (int i = 4; i < 8; i++) { + dst[-(1 + i) * stride] = rnd() & bitdepth_max; + dst[+(0 + i) * stride] = rnd() & bitdepth_max; + } + dst[+0 * stride] = rnd() & bitdepth_max; + dst[-1 * stride] = iclip_pixel(dst[+0 * stride] + edge_diff); + for (int i = 1; i < 4; i++) { + dst[-(1 + i) * stride] = iclip_pixel(dst[-(0 + i) * stride] + + rnd() % (2 * (I + 1)) - (I + 1)); + dst[+(0 + i) * stride] = iclip_pixel(dst[+(i - 1) * stride] + + rnd() % (2 * (I + 1)) - (I + 1)); + } + break; + } +} + +static void check_lpf_sb(loopfilter_sb_fn fn, const char *const name, + const int n_blks, const int lf_idx, + const int is_chroma, const int dir) +{ + ALIGN_STK_64(pixel, c_dst_mem, 128 * 16,); + ALIGN_STK_64(pixel, a_dst_mem, 128 * 16,); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const uint32_t *mask, + const uint8_t (*l)[4], ptrdiff_t b4_stride, + const Av1FilterLUT *lut, int w HIGHBD_DECL_SUFFIX); + + pixel *a_dst, *c_dst; + ptrdiff_t stride, b4_stride; + int w, h; + if (dir) { + a_dst = a_dst_mem + n_blks * 4 * 8; + c_dst = c_dst_mem + n_blks * 4 * 8; + w = n_blks * 4; + h = 16; + b4_stride = 32; + } else { + a_dst = a_dst_mem + 8; + c_dst = c_dst_mem + 8; + w = 16; + h = n_blks * 4; + b4_stride = 2; + } + stride = w * sizeof(pixel); + + Av1FilterLUT lut; + const int sharp = rnd() & 7; + for (int level = 0; level < 64; level++) { + int limit = level; + + if (sharp > 0) { + limit >>= 
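/* sharpness 1..4 halves the limit, 5..7 quarters it */ 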
(sharp + 3) >> 2; + limit = imin(limit, 9 - sharp); + } + limit = imax(limit, 1); + + lut.i[level] = limit; + lut.e[level] = 2 * (level + 2) + limit; + } + lut.sharp[0] = (sharp + 3) >> 2; + lut.sharp[1] = sharp ? 9 - sharp : 0xff; + + const int n_strengths = is_chroma ? 2 : 3; + for (int i = 0; i < n_strengths; i++) { + if (check_func(fn, "%s_w%d_%dbpc", name, + is_chroma ? 4 + 2 * i : 4 << i, BITDEPTH)) + { + uint32_t vmask[4] = { 0 }; + uint8_t l[32 * 2][4]; + + for (int j = 0; j < n_blks; j++) { + const int idx = rnd() % (i + 2); + if (idx) vmask[idx - 1] |= 1U << j; + if (dir) { + l[j][lf_idx] = rnd() & 63; + l[j + 32][lf_idx] = rnd() & 63; + } else { + l[j * 2][lf_idx] = rnd() & 63; + l[j * 2 + 1][lf_idx] = rnd() & 63; + } + } +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + for (int i = 0; i < 4 * n_blks; i++) { + const int x = i >> 2; + int L; + if (dir) { + L = l[32 + x][lf_idx] ? l[32 + x][lf_idx] : l[x][lf_idx]; + } else { + L = l[2 * x + 1][lf_idx] ? l[2 * x + 1][lf_idx] : l[2 * x][lf_idx]; + } + init_lpf_border(c_dst + i * (dir ? 1 : 16), dir ? n_blks * 4 : 1, + lut.e[L], lut.i[L], bitdepth_max); + } + memcpy(a_dst_mem, c_dst_mem, 128 * sizeof(pixel) * 16); + + call_ref(c_dst, stride, vmask, + (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], + b4_stride, &lut, n_blks HIGHBD_TAIL_SUFFIX); + call_new(a_dst, stride, vmask, + (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], + b4_stride, &lut, n_blks HIGHBD_TAIL_SUFFIX); + + checkasm_check_pixel(c_dst_mem, stride, a_dst_mem, stride, + w, h, "dst"); + bench_new(alternate(c_dst, a_dst), stride, vmask, + (const uint8_t(*)[4]) &l[dir ? 32 : 1][lf_idx], + b4_stride, &lut, n_blks HIGHBD_TAIL_SUFFIX); + } + } + report(name); +} + +void bitfn(checkasm_check_loopfilter)(void) { + Dav1dLoopFilterDSPContext c; + + bitfn(dav1d_loop_filter_dsp_init)(&c); + + check_lpf_sb(c.loop_filter_sb[0][0], "lpf_h_sb_y", 32, 0, 0, 0); + check_lpf_sb(c.loop_filter_sb[0][1], "lpf_v_sb_y", 32, 1, 0, 1); + check_lpf_sb(c.loop_filter_sb[1][0], "lpf_h_sb_uv", 16, 2, 1, 0); + check_lpf_sb(c.loop_filter_sb[1][1], "lpf_v_sb_uv", 16, 2, 1, 1); +} diff --git a/third_party/dav1d/tests/checkasm/looprestoration.c b/third_party/dav1d/tests/checkasm/looprestoration.c new file mode 100644 index 0000000000..d84f3c476a --- /dev/null +++ b/third_party/dav1d/tests/checkasm/looprestoration.c @@ -0,0 +1,196 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "tests/checkasm/checkasm.h" + +#include <stdio.h> +#include <string.h> + +#include "src/levels.h" +#include "src/looprestoration.h" +#include "src/tables.h" + +static int to_binary(int x) { /* 0-15 -> 0000-1111 */ + return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8); +} + +static void init_tmp(pixel *buf, const ptrdiff_t stride, + const int w, const int h, const int bitdepth_max) +{ + const int noise_mask = bitdepth_max >> 4; + const int x_off = rnd() & 7, y_off = rnd() & 7; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + buf[x] = (((x + x_off) ^ (y + y_off)) & 8 ? bitdepth_max : 0) ^ + (rnd() & noise_mask); + } + buf += PXSTRIDE(stride); + } +} + +static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) { + ALIGN_STK_64(pixel, c_src, 448 * 64 + 64,), *const c_dst = c_src + 64; + ALIGN_STK_64(pixel, a_src, 448 * 64 + 64,), *const a_dst = a_src + 64; + ALIGN_STK_64(pixel, edge_buf, 448 * 8 + 64,), *const h_edge = edge_buf + 64; + pixel left[64][4]; + LooprestorationParams params; + int16_t (*const filter)[8] = params.filter; + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, + const pixel (*const left)[4], + const pixel *lpf, int w, int h, + const LooprestorationParams *params, + enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + + for (int t = 0; t < 2; t++) { + if (check_func(c->wiener[t], "wiener_%dtap_%dbpc", t ? 5 : 7, bpc)) { + filter[0][0] = filter[0][6] = t ? 0 : (rnd() & 15) - 5; + filter[0][1] = filter[0][5] = (rnd() & 31) - 23; + filter[0][2] = filter[0][4] = (rnd() & 63) - 17; + filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2; +#if BITDEPTH != 8 + filter[0][3] += 128; +#endif + + filter[1][0] = filter[1][6] = t ? 0 : (rnd() & 15) - 5; + filter[1][1] = filter[1][5] = (rnd() & 31) - 23; + filter[1][2] = filter[1][4] = (rnd() & 63) - 17; + filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2; + + const int base_w = 1 + (rnd() % 384); + const int base_h = 1 + (rnd() & 63); + const int bitdepth_max = (1 << bpc) - 1; + + init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max); + init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max); + init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max); + + for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) { + const int w = edges & LR_HAVE_RIGHT ? 256 : base_w; + const int h = edges & LR_HAVE_BOTTOM ? 
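/* use the full 64-row height when a bottom edge exists, a random height otherwise */ 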
64 : base_h;
+
+                memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));
+
+                call_ref(c_dst, 448 * sizeof(pixel), left,
+                         h_edge, w, h, &params, edges HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst, 448 * sizeof(pixel), left,
+                         h_edge, w, h, &params, edges HIGHBD_TAIL_SUFFIX);
+                if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
+                                         a_dst, 448 * sizeof(pixel),
+                                         w, h, "dst"))
+                {
+                    fprintf(stderr, "size = %dx%d, edges = %04d\n",
+                            w, h, to_binary(edges));
+                    break;
+                }
+            }
+            bench_new(alternate(c_dst, a_dst), 448 * sizeof(pixel), left,
+                      h_edge, 256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
+        }
+    }
+}
+
+static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+    ALIGN_STK_64(pixel, c_src, 448 * 64 + 64,), *const c_dst = c_src + 64;
+    ALIGN_STK_64(pixel, a_src, 448 * 64 + 64,), *const a_dst = a_src + 64;
+    ALIGN_STK_64(pixel, edge_buf, 448 * 8 + 64,), *const h_edge = edge_buf + 64;
+    pixel left[64][4];
+    LooprestorationParams params;
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride,
+                 const pixel (*const left)[4],
+                 const pixel *lpf, int w, int h,
+                 const LooprestorationParams *params,
+                 enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+    static const struct { char name[4]; uint8_t idx; } sgr_data[3] = {
+        { "5x5", 14 },
+        { "3x3", 10 },
+        { "mix",  0 },
+    };
+
+    for (int i = 0; i < 3; i++) {
+        if (check_func(c->sgr[i], "sgr_%s_%dbpc", sgr_data[i].name, bpc)) {
+            const uint16_t *const sgr_params = dav1d_sgr_params[sgr_data[i].idx];
+            params.sgr.s0 = sgr_params[0];
+            params.sgr.s1 = sgr_params[1];
+            params.sgr.w0 = sgr_params[0] ? (rnd() & 127) - 96 : 0;
+            params.sgr.w1 = (sgr_params[1] ? 160 - (rnd() & 127) : 33) - params.sgr.w0;
+
+            const int base_w = 1 + (rnd() % 384);
+            const int base_h = 1 + (rnd() & 63);
+            const int bitdepth_max = (1 << bpc) - 1;
+
+            init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max);
+            init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max);
+            init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max);
+
+            for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
+                const int w = edges & LR_HAVE_RIGHT  ? 256 : base_w;
+                const int h = edges & LR_HAVE_BOTTOM ? 
64 : base_h;
+
+                memcpy(a_src, c_src, 448 * 64 * sizeof(pixel));
+
+                call_ref(c_dst, 448 * sizeof(pixel), left, h_edge,
+                         w, h, &params, edges HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst, 448 * sizeof(pixel), left, h_edge,
+                         w, h, &params, edges HIGHBD_TAIL_SUFFIX);
+                if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel),
+                                         a_dst, 448 * sizeof(pixel),
+                                         w, h, "dst"))
+                {
+                    fprintf(stderr, "size = %dx%d, edges = %04d\n",
+                            w, h, to_binary(edges));
+                    break;
+                }
+            }
+            bench_new(alternate(c_dst, a_dst), 448 * sizeof(pixel), left,
+                      h_edge, 256, 64, &params, 0xf HIGHBD_TAIL_SUFFIX);
+        }
+    }
+}
+
+void bitfn(checkasm_check_looprestoration)(void) {
+#if BITDEPTH == 16
+    const int bpc_min = 10, bpc_max = 12;
+#else
+    const int bpc_min = 8, bpc_max = 8;
+#endif
+    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
+        Dav1dLoopRestorationDSPContext c;
+        bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
+        check_wiener(&c, bpc);
+    }
+    report("wiener");
+    for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
+        Dav1dLoopRestorationDSPContext c;
+        bitfn(dav1d_loop_restoration_dsp_init)(&c, bpc);
+        check_sgr(&c, bpc);
+    }
+    report("sgr");
+}
diff --git a/third_party/dav1d/tests/checkasm/mc.c b/third_party/dav1d/tests/checkasm/mc.c
new file mode 100644
index 0000000000..047ef7b4a4
--- /dev/null
+++ b/third_party/dav1d/tests/checkasm/mc.c
@@ -0,0 +1,790 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "tests/checkasm/checkasm.h" + +#include "src/levels.h" +#include "src/mc.h" + +static const char *const filter_names[] = { + "8tap_regular", "8tap_regular_smooth", "8tap_regular_sharp", + "8tap_sharp_regular", "8tap_sharp_smooth", "8tap_sharp", + "8tap_smooth_regular", "8tap_smooth", "8tap_smooth_sharp", + "bilinear" +}; + +static const char *const mxy_names[] = { "0", "h", "v", "hv" }; +static const char *const scaled_paths[] = { "", "_dy1", "_dy2" }; + +static int mc_h_next(const int h) { + switch (h) { + case 4: + case 8: + case 16: + return (h * 3) >> 1; + case 6: + case 12: + case 24: + return (h & (h - 1)) * 2; + default: + return h * 2; + } +} + +static void check_mc(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, src_buf, 135 * 135,); + PIXEL_RECT(c_dst, 128, 128); + PIXEL_RECT(a_dst, 128, 128); + const pixel *src = src_buf + 135 * 3 + 3; + const ptrdiff_t src_stride = 135 * sizeof(pixel); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src, + ptrdiff_t src_stride, int w, int h, int mx, int my + HIGHBD_DECL_SUFFIX); + + for (int filter = 0; filter < N_2D_FILTERS; filter++) + for (int w = 2; w <= 128; w <<= 1) { + for (int mxy = 0; mxy < 4; mxy++) + if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc", + filter_names[filter], w, mxy_names[mxy], BITDEPTH)) + { + const int h_min = w <= 32 ? 2 : w / 4; + const int h_max = imax(imin(w * 4, 128), 32); + for (int h = h_min; h <= h_max; h = mc_h_next(h)) { + const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0; + const int my = (mxy & 2) ? rnd() % 15 + 1 : 0; +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + for (int i = 0; i < 135 * 135; i++) + src_buf[i] = rnd() & bitdepth_max; + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, src, src_stride, w, h, + mx, my HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, src, src_stride, w, h, + mx, my HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride, + a_dst, a_dst_stride, + w, h, "dst"); + + if (filter == FILTER_2D_8TAP_REGULAR || + filter == FILTER_2D_BILINEAR) + { + bench_new(a_dst, a_dst_stride, src, src_stride, w, h, + mx, my HIGHBD_TAIL_SUFFIX); + } + } + } + } + report("mc"); +} + +/* Generate worst case input in the topleft corner, randomize the rest */ +static void generate_mct_input(pixel *const buf, const int bitdepth_max) { + static const int8_t pattern[8] = { -1, 0, -1, 0, 0, -1, 0, -1 }; + const int sign = -(rnd() & 1); + + for (int y = 0; y < 135; y++) + for (int x = 0; x < 135; x++) + buf[135*y+x] = ((x | y) < 8 ? (pattern[x] ^ pattern[y] ^ sign) + : rnd()) & bitdepth_max; +} + +static void check_mct(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, src_buf, 135 * 135,); + ALIGN_STK_64(int16_t, c_tmp, 128 * 128,); + ALIGN_STK_64(int16_t, a_tmp, 128 * 128,); + const pixel *src = src_buf + 135 * 3 + 3; + const ptrdiff_t src_stride = 135 * sizeof(pixel); + + declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride, + int w, int h, int mx, int my HIGHBD_DECL_SUFFIX); + + for (int filter = 0; filter < N_2D_FILTERS; filter++) + for (int w = 4; w <= 128; w <<= 1) + for (int mxy = 0; mxy < 4; mxy++) + if (check_func(c->mct[filter], "mct_%s_w%d_%s_%dbpc", + filter_names[filter], w, mxy_names[mxy], BITDEPTH)) + for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) + { + const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0; + const int my = (mxy & 2) ? 
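/* random subpel phase 1..15 when filtering vertically, 0 otherwise */ 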
rnd() % 15 + 1 : 0; +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + generate_mct_input(src_buf, bitdepth_max); + + call_ref(c_tmp, src, src_stride, w, h, + mx, my HIGHBD_TAIL_SUFFIX); + call_new(a_tmp, src, src_stride, w, h, + mx, my HIGHBD_TAIL_SUFFIX); + checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp), + a_tmp, w * sizeof(*a_tmp), + w, h, "tmp"); + + if (filter == FILTER_2D_8TAP_REGULAR || + filter == FILTER_2D_BILINEAR) + { + bench_new(a_tmp, src, src_stride, w, h, + mx, my HIGHBD_TAIL_SUFFIX); + } + } + report("mct"); +} + +static void check_mc_scaled(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, src_buf, 263 * 263,); + PIXEL_RECT(c_dst, 128, 128); + PIXEL_RECT(a_dst, 128, 128); + const pixel *src = src_buf + 263 * 3 + 3; + const ptrdiff_t src_stride = 263 * sizeof(pixel); +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src, + ptrdiff_t src_stride, int w, int h, + int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX); + + for (int filter = 0; filter < N_2D_FILTERS; filter++) + for (int w = 2; w <= 128; w <<= 1) { + for (int p = 0; p < 3; ++p) { + if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc", + filter_names[filter], w, scaled_paths[p], BITDEPTH)) + { + const int h_min = w <= 32 ? 2 : w / 4; + const int h_max = imax(imin(w * 4, 128), 32); + for (int h = h_min; h <= h_max; h = mc_h_next(h)) { + const int mx = rnd() % 1024; + const int my = rnd() % 1024; + const int dx = rnd() % 2048 + 1; + const int dy = !p + ? rnd() % 2048 + 1 + : p << 10; // ystep=1.0 and ystep=2.0 paths + + for (int k = 0; k < 263 * 263; k++) + src_buf[k] = rnd() & bitdepth_max; + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride, + a_dst, a_dst_stride, + w, h, "dst"); + + if (filter == FILTER_2D_8TAP_REGULAR || + filter == FILTER_2D_BILINEAR) + bench_new(a_dst, a_dst_stride, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + } + } + } + } + report("mc_scaled"); +} + +static void check_mct_scaled(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, src_buf, 263 * 263,); + ALIGN_STK_64(int16_t, c_tmp, 128 * 128,); + ALIGN_STK_64(int16_t, a_tmp, 128 * 128,); + const pixel *src = src_buf + 263 * 3 + 3; + const ptrdiff_t src_stride = 263 * sizeof(pixel); +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride, + int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX); + + for (int filter = 0; filter < N_2D_FILTERS; filter++) + for (int w = 4; w <= 128; w <<= 1) + for (int p = 0; p < 3; ++p) { + if (check_func(c->mct_scaled[filter], "mct_scaled_%s_w%d%s_%dbpc", + filter_names[filter], w, scaled_paths[p], BITDEPTH)) + { + const int h_min = imax(w / 4, 4); + const int h_max = imin(w * 4, 128); + for (int h = h_min; h <= h_max; h = mc_h_next(h)) { + const int mx = rnd() % 1024; + const int my = rnd() % 1024; + const int dx = rnd() % 2048 + 1; + const int dy = !p + ? 
rnd() % 2048 + 1 + : p << 10; // ystep=1.0 and ystep=2.0 paths + + for (int k = 0; k < 263 * 263; k++) + src_buf[k] = rnd() & bitdepth_max; + + call_ref(c_tmp, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + call_new(a_tmp, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp), + a_tmp, w * sizeof(*a_tmp), + w, h, "tmp"); + + if (filter == FILTER_2D_8TAP_REGULAR || + filter == FILTER_2D_BILINEAR) + bench_new(a_tmp, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + } + } + } + report("mct_scaled"); +} + +static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf, + int16_t (*const tmp)[128 * 128], const int bitdepth_max) +{ + for (int i = 0; i < 2; i++) { + generate_mct_input(buf, bitdepth_max); + c->mct[FILTER_2D_8TAP_SHARP](tmp[i], buf + 135 * 3 + 3, + 135 * sizeof(pixel), 128, 128, + 8, 8 HIGHBD_TAIL_SUFFIX); + } +} + +static void check_avg(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]); + PIXEL_RECT(c_dst, 135, 135); + PIXEL_RECT(a_dst, 128, 128); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1, + const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX); + + for (int w = 4; w <= 128; w <<= 1) + if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) { + for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) + { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + init_tmp(c, c_dst, tmp, bitdepth_max); + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); + } + } + report("avg"); +} + +static void check_w_avg(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]); + PIXEL_RECT(c_dst, 135, 135); + PIXEL_RECT(a_dst, 128, 128); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1, + const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX); + + for (int w = 4; w <= 128; w <<= 1) + if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) { + for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) + { + int weight = rnd() % 15 + 1; +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + init_tmp(c, c_dst, tmp, bitdepth_max); + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); + } + } + report("w_avg"); +} + +static void check_mask(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]); + PIXEL_RECT(c_dst, 135, 135); + PIXEL_RECT(a_dst, 128, 128); + ALIGN_STK_64(uint8_t, mask, 128 * 128,); + + for (int i = 0; i < 128 * 128; i++) + mask[i] = rnd() % 65; + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1, + const int16_t *tmp2, int w, int h, const uint8_t *mask + HIGHBD_DECL_SUFFIX); + + for (int w = 4; w <= 128; w <<= 1) + if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) { + for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) + { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + init_tmp(c, c_dst, tmp, bitdepth_max); + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); + } + } + report("mask"); +} + +static void check_w_mask(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]); + PIXEL_RECT(c_dst, 135, 135); + PIXEL_RECT(a_dst, 128, 128); + ALIGN_STK_64(uint8_t, c_mask, 128 * 128,); + ALIGN_STK_64(uint8_t, a_mask, 128 * 128,); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1, + const int16_t *tmp2, int w, int h, uint8_t *mask, int sign + HIGHBD_DECL_SUFFIX); + + static const uint16_t ss[] = { 444, 422, 420 }; + static const uint8_t ss_hor[] = { 0, 1, 1 }; + static const uint8_t ss_ver[] = { 0, 0, 1 }; + + for (int i = 0; i < 3; i++) + for (int w = 4; w <= 128; w <<= 1) + if (check_func(c->w_mask[i], "w_mask_%d_w%d_%dbpc", ss[i], w, + BITDEPTH)) + { + for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) + { + int sign = rnd() & 1; +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + init_tmp(c, c_dst, tmp, bitdepth_max); + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, + c_mask, sign HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, + a_mask, sign HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride, + a_dst, a_dst_stride, + w, h, "dst"); + checkasm_check(uint8_t, c_mask, w >> ss_hor[i], + a_mask, w >> ss_hor[i], + w >> ss_hor[i], h >> ss_ver[i], + "mask"); + + bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, + a_mask, sign HIGHBD_TAIL_SUFFIX); + } + } + report("w_mask"); +} + +static void check_blend(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, tmp, 32 * 32,); + PIXEL_RECT(c_dst, 32, 32); + PIXEL_RECT(a_dst, 32, 32); + ALIGN_STK_64(uint8_t, mask, 32 * 32,); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, + int w, int h, const uint8_t *mask); + + for (int w = 4; w <= 32; w <<= 1) { + if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH)) + for (int h = imax(w / 2, 4); h <= imin(w * 2, 32); h <<= 1) { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + for (int i = 0; i < 32 * 32; i++) { + tmp[i] = rnd() & bitdepth_max; + mask[i] = rnd() % 65; + } + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + c_dst[y*PXSTRIDE(c_dst_stride) + x] = + a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max; + + call_ref(c_dst, c_dst_stride, tmp, w, h, mask); + call_new(a_dst, a_dst_stride, tmp, w, h, mask); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(alternate(c_dst, a_dst), a_dst_stride, tmp, w, h, mask); + } + } + report("blend"); +} + +static void check_blend_v(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, tmp, 32 * 128,); + PIXEL_RECT(c_dst, 32, 128); + PIXEL_RECT(a_dst, 32, 128); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, + int w, int h); + + for (int w = 2; w <= 32; w <<= 1) { + if (check_func(c->blend_v, "blend_v_w%d_%dbpc", w, BITDEPTH)) + for (int h = 2; h <= (w == 2 ? 64 : 128); h <<= 1) { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + c_dst[y*PXSTRIDE(c_dst_stride) + x] = + a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max; + + for (int i = 0; i < 32 * 128; i++) + tmp[i] = rnd() & bitdepth_max; + + call_ref(c_dst, c_dst_stride, tmp, w, h); + call_new(a_dst, a_dst_stride, tmp, w, h); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(alternate(c_dst, a_dst), a_dst_stride, tmp, w, h); + } + } + report("blend_v"); +} + +static void check_blend_h(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, tmp, 128 * 32,); + PIXEL_RECT(c_dst, 128, 32); + PIXEL_RECT(a_dst, 128, 32); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, + int w, int h); + + for (int w = 2; w <= 128; w <<= 1) { + if (check_func(c->blend_h, "blend_h_w%d_%dbpc", w, BITDEPTH)) + for (int h = (w == 128 ? 4 : 2); h <= 32; h <<= 1) { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + c_dst[y*PXSTRIDE(c_dst_stride) + x] = + a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max; + + for (int i = 0; i < 128 * 32; i++) + tmp[i] = rnd() & bitdepth_max; + + call_ref(c_dst, c_dst_stride, tmp, w, h); + call_new(a_dst, a_dst_stride, tmp, w, h); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(alternate(c_dst, a_dst), a_dst_stride, tmp, w, h); + } + } + report("blend_h"); +} + +static void check_warp8x8(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, src_buf, 15 * 15,); + PIXEL_RECT(c_dst, 8, 8); + PIXEL_RECT(a_dst, 8, 8); + int16_t abcd[4]; + const pixel *src = src_buf + 15 * 3 + 3; + const ptrdiff_t src_stride = 15 * sizeof(pixel); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src, + ptrdiff_t src_stride, const int16_t *abcd, int mx, int my + HIGHBD_DECL_SUFFIX); + + if (check_func(c->warp8x8, "warp_8x8_%dbpc", BITDEPTH)) { + const int mx = (rnd() & 0x1fff) - 0xa00; + const int my = (rnd() & 0x1fff) - 0xa00; +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + for (int i = 0; i < 4; i++) + abcd[i] = (rnd() & 0x1fff) - 0xa00; + + for (int i = 0; i < 15 * 15; i++) + src_buf[i] = rnd() & bitdepth_max; + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + 8, 8, "dst"); + + bench_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); + } + report("warp8x8"); +} + +static void check_warp8x8t(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, src_buf, 15 * 15,); + ALIGN_STK_64(int16_t, c_tmp, 8 * 8,); + ALIGN_STK_64(int16_t, a_tmp, 8 * 8,); + int16_t abcd[4]; + const pixel *src = src_buf + 15 * 3 + 3; + const ptrdiff_t src_stride = 15 * sizeof(pixel); + + declare_func(void, int16_t *tmp, ptrdiff_t tmp_stride, const pixel *src, + ptrdiff_t src_stride, const int16_t *abcd, int mx, int my + HIGHBD_DECL_SUFFIX); + + if (check_func(c->warp8x8t, "warp_8x8t_%dbpc", BITDEPTH)) { + const int mx = (rnd() & 0x1fff) - 0xa00; + const int my = (rnd() & 0x1fff) - 0xa00; +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + for (int i = 0; i < 4; i++) + abcd[i] = (rnd() & 0x1fff) - 0xa00; + + for (int i = 0; i < 15 * 15; i++) + src_buf[i] = rnd() & bitdepth_max; + + call_ref(c_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); + call_new(a_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); + checkasm_check(int16_t, c_tmp, 8 * sizeof(*c_tmp), + a_tmp, 8 * sizeof(*a_tmp), + 8, 8, "tmp"); + + bench_new(a_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); + } + report("warp8x8t"); +} + +enum EdgeFlags { + HAVE_TOP = 1, + HAVE_BOTTOM = 2, + HAVE_LEFT = 4, + HAVE_RIGHT = 8, +}; + +static void random_offset_for_edge(int *const x, int *const y, + const int bw, const int bh, + int *const iw, int *const ih, + const enum EdgeFlags edge) +{ +#define set_off(edge1, edge2, pos, dim) \ + *i##dim = edge & (HAVE_##edge1 | HAVE_##edge2) ? 
160 : 1 + (rnd() % (b##dim - 2)); \ + switch (edge & (HAVE_##edge1 | HAVE_##edge2)) { \ + case HAVE_##edge1 | HAVE_##edge2: \ + assert(b##dim <= *i##dim); \ + *pos = rnd() % (*i##dim - b##dim + 1); \ + break; \ + case HAVE_##edge1: \ + *pos = (*i##dim - b##dim) + 1 + (rnd() % (b##dim - 1)); \ + break; \ + case HAVE_##edge2: \ + *pos = -(1 + (rnd() % (b##dim - 1))); \ + break; \ + case 0: \ + assert(b##dim - 1 > *i##dim); \ + *pos = -(1 + (rnd() % (b##dim - *i##dim - 1))); \ + break; \ + } + set_off(LEFT, RIGHT, x, w); + set_off(TOP, BOTTOM, y, h); +} + +static void check_emuedge(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, c_dst, 135 * 192,); + ALIGN_STK_64(pixel, a_dst, 135 * 192,); + ALIGN_STK_64(pixel, src, 160 * 160,); + + for (int i = 0; i < 160 * 160; i++) + src[i] = rnd() & ((1U << BITDEPTH) - 1); + + declare_func(void, intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, + intptr_t x, intptr_t y, + pixel *dst, ptrdiff_t dst_stride, + const pixel *src, ptrdiff_t src_stride); + + int x, y, iw, ih; + for (int w = 4; w <= 128; w <<= 1) + if (check_func(c->emu_edge, "emu_edge_w%d_%dbpc", w, BITDEPTH)) { + for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) { + // we skip 0xf, since it implies that we don't need emu_edge + for (enum EdgeFlags edge = 0; edge < 0xf; edge++) { + const int bw = w + (rnd() & 7); + const int bh = h + (rnd() & 7); + random_offset_for_edge(&x, &y, bw, bh, &iw, &ih, edge); + call_ref(bw, bh, iw, ih, x, y, + c_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel)); + call_new(bw, bh, iw, ih, x, y, + a_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel)); + checkasm_check_pixel(c_dst, 192 * sizeof(pixel), + a_dst, 192 * sizeof(pixel), + bw, bh, "dst"); + } + } + for (enum EdgeFlags edge = 1; edge < 0xf; edge <<= 1) { + random_offset_for_edge(&x, &y, w + 7, w + 7, &iw, &ih, edge); + bench_new(w + 7, w + 7, iw, ih, x, y, + a_dst, 192 * sizeof(pixel), src, 160 * sizeof(pixel)); + } + } + report("emu_edge"); +} + +static int get_upscale_x0(const int in_w, const int out_w, const int step) { + const int err = out_w * step - (in_w << 14); + const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1); + return x0 & 0x3fff; +} + +static void check_resize(Dav1dMCDSPContext *const c) { + PIXEL_RECT(c_dst, 1024, 64); + PIXEL_RECT(a_dst, 1024, 64); + ALIGN_STK_64(pixel, src, 512 * 64,); + + const int height = 64; + const int max_src_width = 512; + const ptrdiff_t src_stride = 512 * sizeof(pixel); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, + const pixel *src, ptrdiff_t src_stride, + int dst_w, int src_w, int h, int dx, int mx0 + HIGHBD_DECL_SUFFIX); + + if (check_func(c->resize, "resize_%dbpc", BITDEPTH)) { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + for (int i = 0; i < max_src_width * height; i++) + src[i] = rnd() & bitdepth_max; + + const int w_den = 9 + (rnd() & 7); + const int src_w = 16 + (rnd() % (max_src_width - 16 + 1)); + const int dst_w = w_den * src_w >> 3; +#define scale_fac(ref_sz, this_sz) \ + ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz)) + const int dx = scale_fac(src_w, dst_w); +#undef scale_fac + const int mx0 = get_upscale_x0(src_w, dst_w, dx); + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, src, src_stride, + dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, src, src_stride, + dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded_align(c_dst, c_dst_stride, a_dst, a_dst_stride, + dst_w, height, "dst", 16, 1); + + bench_new(a_dst, a_dst_stride, src, src_stride, + 512, height, 512 * 8 / w_den, dx, mx0 HIGHBD_TAIL_SUFFIX); + } + + report("resize"); +} + +void bitfn(checkasm_check_mc)(void) { + Dav1dMCDSPContext c; + bitfn(dav1d_mc_dsp_init)(&c); + + check_mc(&c); + check_mct(&c); + check_mc_scaled(&c); + check_mct_scaled(&c); + check_avg(&c); + check_w_avg(&c); + check_mask(&c); + check_w_mask(&c); + check_blend(&c); + check_blend_v(&c); + check_blend_h(&c); + check_warp8x8(&c); + check_warp8x8t(&c); + check_emuedge(&c); + check_resize(&c); +} diff --git a/third_party/dav1d/tests/checkasm/msac.c b/third_party/dav1d/tests/checkasm/msac.c new file mode 100644 index 0000000000..b9c89b47cf --- /dev/null +++ b/third_party/dav1d/tests/checkasm/msac.c @@ -0,0 +1,294 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include "src/cpu.h"
+#include "src/msac.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#define BUF_SIZE 8192
+
+/* The normal code doesn't use function pointers */
+typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+typedef unsigned (*decode_adapt_fn)(MsacContext *s, uint16_t *cdf);
+typedef unsigned (*decode_bool_equi_fn)(MsacContext *s);
+typedef unsigned (*decode_bool_fn)(MsacContext *s, unsigned f);
+
+typedef struct {
+ decode_symbol_adapt_fn decode_symbol_adapt4;
+ decode_symbol_adapt_fn decode_symbol_adapt8;
+ decode_symbol_adapt_fn decode_symbol_adapt16;
+ decode_adapt_fn decode_bool_adapt;
+ decode_bool_equi_fn decode_bool_equi;
+ decode_bool_fn decode_bool;
+ decode_adapt_fn decode_hi_tok;
+} MsacDSPContext;
+
+static void randomize_cdf(uint16_t *const cdf, const int n) {
+ int i;
+ for (i = 15; i > n; i--)
+ cdf[i] = rnd(); // padding
+ cdf[i] = 0; // count
+ do {
+ cdf[i - 1] = cdf[i] + rnd() % (32768 - cdf[i] - i) + 1;
+ } while (--i > 0);
+}
+
+/* memcmp() on structs can have weird behavior due to padding etc. */
+static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
+ return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
+ a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt ||
+ a->allow_update_cdf != b->allow_update_cdf;
+}
+
+static void msac_dump(unsigned c_res, unsigned a_res,
+ const MsacContext *const a, const MsacContext *const b,
+ const uint16_t *const cdf_a, const uint16_t *const cdf_b,
+ const int num_cdf)
+{
+ if (c_res != a_res)
+ fprintf(stderr, "c_res %u a_res %u\n", c_res, a_res);
+ if (a->buf_pos != b->buf_pos)
+ fprintf(stderr, "buf_pos %p vs %p\n", a->buf_pos, b->buf_pos);
+ if (a->buf_end != b->buf_end)
+ fprintf(stderr, "buf_end %p vs %p\n", a->buf_end, b->buf_end);
+ if (a->dif != b->dif)
+ fprintf(stderr, "dif %zx vs %zx\n", a->dif, b->dif);
+ if (a->rng != b->rng)
+ fprintf(stderr, "rng %u vs %u\n", a->rng, b->rng);
+ if (a->cnt != b->cnt)
+ fprintf(stderr, "cnt %d vs %d\n", a->cnt, b->cnt);
+ if (a->allow_update_cdf != b->allow_update_cdf)
+ fprintf(stderr, "allow_update_cdf %d vs %d\n",
+ a->allow_update_cdf, b->allow_update_cdf);
+ if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) {
+ fprintf(stderr, "cdf:\n");
+ for (int i = 0; i <= num_cdf; i++)
+ fprintf(stderr, " %5u", cdf_a[i]);
+ fprintf(stderr, "\n");
+ for (int i = 0; i <= num_cdf; i++)
+ fprintf(stderr, " %5u", cdf_b[i]);
+ fprintf(stderr, "\n");
+ for (int i = 0; i <= num_cdf; i++)
+ fprintf(stderr, " %c", cdf_a[i] != cdf_b[i] ? 
'x' : '.'); + fprintf(stderr, "\n"); + } +} + +#define CHECK_SYMBOL_ADAPT(n, n_min, n_max) do { \ + if (check_func(c->decode_symbol_adapt##n, \ + "msac_decode_symbol_adapt%d", n)) \ + { \ + for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { \ + for (int ns = n_min; ns <= n_max; ns++) { \ + dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); \ + s_a = s_c; \ + randomize_cdf(cdf[0], ns); \ + memcpy(cdf[1], cdf[0], sizeof(*cdf)); \ + for (int i = 0; i < 64; i++) { \ + unsigned c_res = call_ref(&s_c, cdf[0], ns); \ + unsigned a_res = call_new(&s_a, cdf[1], ns); \ + if (c_res != a_res || msac_cmp(&s_c, &s_a) || \ + memcmp(cdf[0], cdf[1], sizeof(**cdf) * (ns + 1))) \ + { \ + if (fail()) \ + msac_dump(c_res, a_res, &s_c, &s_a, \ + cdf[0], cdf[1], ns); \ + } \ + } \ + if (cdf_update && ns == n - 1) \ + bench_new(alternate(&s_c, &s_a), \ + alternate(cdf[0], cdf[1]), ns); \ + } \ + } \ + } \ +} while (0) + +static void check_decode_symbol(MsacDSPContext *const c, uint8_t *const buf) { + ALIGN_STK_32(uint16_t, cdf, 2, [16]); + MsacContext s_c, s_a; + + declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols); + CHECK_SYMBOL_ADAPT( 4, 1, 4); + CHECK_SYMBOL_ADAPT( 8, 1, 7); + CHECK_SYMBOL_ADAPT(16, 3, 15); + report("decode_symbol"); +} + +static void check_decode_bool_adapt(MsacDSPContext *const c, uint8_t *const buf) { + MsacContext s_c, s_a; + + declare_func(unsigned, MsacContext *s, uint16_t *cdf); + if (check_func(c->decode_bool_adapt, "msac_decode_bool_adapt")) { + uint16_t cdf[2][2]; + for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { + dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); + s_a = s_c; + cdf[0][0] = cdf[1][0] = rnd() % 32767 + 1; + cdf[0][1] = cdf[1][1] = 0; + for (int i = 0; i < 64; i++) { + unsigned c_res = call_ref(&s_c, cdf[0]); + unsigned a_res = call_new(&s_a, cdf[1]); + if (c_res != a_res || msac_cmp(&s_c, &s_a) || + memcmp(cdf[0], cdf[1], sizeof(*cdf))) + { + if (fail()) + msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 1); + } + } + if (cdf_update) + bench_new(alternate(&s_c, &s_a), alternate(cdf[0], cdf[1])); + } + } +} + +static void check_decode_bool_equi(MsacDSPContext *const c, uint8_t *const buf) { + MsacContext s_c, s_a; + + declare_func(unsigned, MsacContext *s); + if (check_func(c->decode_bool_equi, "msac_decode_bool_equi")) { + dav1d_msac_init(&s_c, buf, BUF_SIZE, 1); + s_a = s_c; + for (int i = 0; i < 64; i++) { + unsigned c_res = call_ref(&s_c); + unsigned a_res = call_new(&s_a); + if (c_res != a_res || msac_cmp(&s_c, &s_a)) { + if (fail()) + msac_dump(c_res, a_res, &s_c, &s_a, NULL, NULL, 0); + } + } + bench_new(alternate(&s_c, &s_a)); + } +} + +static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) { + MsacContext s_c, s_a; + + declare_func(unsigned, MsacContext *s, unsigned f); + if (check_func(c->decode_bool, "msac_decode_bool")) { + dav1d_msac_init(&s_c, buf, BUF_SIZE, 1); + s_a = s_c; + for (int i = 0; i < 64; i++) { + const unsigned f = rnd() & 0x7fff; + unsigned c_res = call_ref(&s_c, f); + unsigned a_res = call_new(&s_a, f); + if (c_res != a_res || msac_cmp(&s_c, &s_a)) { + if (fail()) + msac_dump(c_res, a_res, &s_c, &s_a, NULL, NULL, 0); + } + } + bench_new(alternate(&s_c, &s_a), 16384); + } + +} + +static void check_decode_bool_funcs(MsacDSPContext *const c, uint8_t *const buf) { + check_decode_bool_adapt(c, buf); + check_decode_bool_equi(c, buf); + check_decode_bool(c, buf); + report("decode_bool"); +} + +static void check_decode_hi_tok(MsacDSPContext *const c, uint8_t *const buf) { + 
ALIGN_STK_16(uint16_t, cdf, 2, [16]); + MsacContext s_c, s_a; + + declare_func(unsigned, MsacContext *s, uint16_t *cdf); + if (check_func(c->decode_hi_tok, "msac_decode_hi_tok")) { + for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { + dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); + s_a = s_c; + randomize_cdf(cdf[0], 3); + memcpy(cdf[1], cdf[0], sizeof(*cdf)); + for (int i = 0; i < 64; i++) { + unsigned c_res = call_ref(&s_c, cdf[0]); + unsigned a_res = call_new(&s_a, cdf[1]); + if (c_res != a_res || msac_cmp(&s_c, &s_a) || + memcmp(cdf[0], cdf[1], sizeof(*cdf))) + { + if (fail()) + msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 3); + break; + } + } + if (cdf_update) + bench_new(alternate(&s_c, &s_a), alternate(cdf[0], cdf[1])); + } + } + report("decode_hi_tok"); +} + +void checkasm_check_msac(void) { + MsacDSPContext c; + c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c; + c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt_c; + c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c; + c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_c; + c.decode_bool_equi = dav1d_msac_decode_bool_equi_c; + c.decode_bool = dav1d_msac_decode_bool_c; + c.decode_hi_tok = dav1d_msac_decode_hi_tok_c; + +#if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM + if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) { + c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_neon; + c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_neon; + c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon; + c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_neon; + c.decode_bool_equi = dav1d_msac_decode_bool_equi_neon; + c.decode_bool = dav1d_msac_decode_bool_neon; + c.decode_hi_tok = dav1d_msac_decode_hi_tok_neon; + } +#elif ARCH_X86 && HAVE_ASM + if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) { + c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2; + c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_sse2; + c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; + c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_sse2; + c.decode_bool_equi = dav1d_msac_decode_bool_equi_sse2; + c.decode_bool = dav1d_msac_decode_bool_sse2; + c.decode_hi_tok = dav1d_msac_decode_hi_tok_sse2; + } + +#if ARCH_X86_64 + if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_AVX2) { + c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; + } +#endif +#endif + + uint8_t buf[BUF_SIZE]; + for (int i = 0; i < BUF_SIZE; i++) + buf[i] = rnd(); + + check_decode_symbol(&c, buf); + check_decode_bool_funcs(&c, buf); + check_decode_hi_tok(&c, buf); +} diff --git a/third_party/dav1d/tests/checkasm/refmvs.c b/third_party/dav1d/tests/checkasm/refmvs.c new file mode 100644 index 0000000000..f21c81f85a --- /dev/null +++ b/third_party/dav1d/tests/checkasm/refmvs.c @@ -0,0 +1,167 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "tests/checkasm/checkasm.h" +#include "src/refmvs.h" + +#include <stdio.h> + +static inline int gen_mv(const int total_bits, int spel_bits) { + int bits = rnd() & ((1 << spel_bits) - 1); + do { + bits |= (rnd() & 1) << spel_bits; + } while (rnd() & 1 && ++spel_bits < total_bits); + // the do/while makes it relatively more likely to be close to zero (fpel) + // than far away + return rnd() & 1 ? -bits : bits; +} + +static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) { + refmvs_block *rr[31]; + refmvs_block r[31 * 256]; + ALIGN_STK_64(refmvs_temporal_block, c_rp, 128 * 16,); + ALIGN_STK_64(refmvs_temporal_block, a_rp, 128 * 16,); + uint8_t ref_sign[7]; + + for (int i = 0; i < 31; i++) + rr[i] = &r[i * 256]; + + declare_func(void, refmvs_temporal_block *rp, const ptrdiff_t stride, + refmvs_block *const *const rr, const uint8_t *const ref_sign, + int col_end8, int row_end8, int col_start8, int row_start8); + + if (check_func(c->save_tmvs, "save_tmvs")) { + const int row_start8 = rnd() & 7; + const int row_end8 = 8 + (rnd() & 7); + const int col_start8 = rnd() & 31; + const int col_end8 = 96 + (rnd() & 31); + + for (int i = 0; i < 7; i++) + ref_sign[i] = rnd() & 1; + + for (int i = row_start8; i < row_end8; i++) + for (int j = col_start8; j < col_end8;) { + int bs = rnd() % N_BS_SIZES; + while (j + ((dav1d_block_dimensions[bs][0] + 1) >> 1) > col_end8) + bs++; + rr[i * 2][j * 2 + 1] = (refmvs_block) { + .mv.mv[0].x = gen_mv(14, 10), + .mv.mv[0].y = gen_mv(14, 10), + .mv.mv[1].x = gen_mv(14, 10), + .mv.mv[1].y = gen_mv(14, 10), + .ref.ref = { (rnd() % 9) - 1, (rnd() % 9) - 1 }, + .bs = bs + }; + for (int k = 0; k < (dav1d_block_dimensions[bs][0] + 1) >> 1; k++, j++) { + c_rp[i * 128 + j].mv.n = 0xdeadbeef; + c_rp[i * 128 + j].ref = 0xdd; + } + } + + call_ref(c_rp + row_start8 * 128, 128, rr, ref_sign, + col_end8, row_end8, col_start8, row_start8); + call_new(a_rp + row_start8 * 128, 128, rr, ref_sign, + col_end8, row_end8, col_start8, row_start8); + for (int i = row_start8; i < row_end8; i++) + for (int j = col_start8; j < col_end8; j++) + if (c_rp[i * 128 + j].mv.n != a_rp[i * 128 + j].mv.n || + c_rp[i * 128 + j].ref != a_rp[i * 128 + j].ref) + { + if (fail()) { + fprintf(stderr, "[%d][%d] c_rp.mv.x = 0x%x a_rp.mv.x = 0x%x\n", + i, j, c_rp[i * 128 + j].mv.x, a_rp[i * 128 + j].mv.x); + fprintf(stderr, "[%d][%d] c_rp.mv.y = 0x%x a_rp.mv.y = 0x%x\n", + i, j, c_rp[i * 128 + j].mv.y, a_rp[i * 128 + j].mv.y); + fprintf(stderr, "[%d][%d] c_rp.ref = %u a_rp.ref = %u\n", + i, j, c_rp[i * 128 + j].ref, a_rp[i * 128 + j].ref); + } + } + + for (int bs = BS_4x4; bs < N_BS_SIZES; bs++) { + const int bw8 = (dav1d_block_dimensions[bs][0] + 1) >> 1; + for (int i = 0; i < 16; i++) + for (int j = 0; 
j < 128; j += bw8) { + rr[i * 2][j * 2 + 1].ref.ref[0] = (rnd() % 9) - 1; + rr[i * 2][j * 2 + 1].ref.ref[1] = (rnd() % 9) - 1; + rr[i * 2][j * 2 + 1].bs = bs; + } + bench_new(alternate(c_rp, a_rp), 128, rr, ref_sign, 128, 16, 0, 0); + } + } + + report("save_tmvs"); +} + +static void check_splat_mv(const Dav1dRefmvsDSPContext *const c) { + ALIGN_STK_64(refmvs_block, c_buf, 32 * 32,); + ALIGN_STK_64(refmvs_block, a_buf, 32 * 32,); + refmvs_block *c_dst[32]; + refmvs_block *a_dst[32]; + const size_t stride = 32 * sizeof(refmvs_block); + + for (int i = 0; i < 32; i++) { + c_dst[i] = c_buf + 32 * i; + a_dst[i] = a_buf + 32 * i; + } + + declare_func(void, refmvs_block **rr, const refmvs_block *rmv, + int bx4, int bw4, int bh4); + + for (int w = 1; w <= 32; w *= 2) { + if (check_func(c->splat_mv, "splat_mv_w%d", w)) { + const int h_min = imax(w / 4, 1); + const int h_max = imin(w * 4, 32); + const int w_uint32 = w * sizeof(refmvs_block) / sizeof(uint32_t); + for (int h = h_min; h <= h_max; h *= 2) { + const int offset = (int) ((unsigned) w * rnd()) & 31; + union { + refmvs_block rmv; + uint32_t u32[3]; + } ALIGN(tmp, 16); + tmp.u32[0] = rnd(); + tmp.u32[1] = rnd(); + tmp.u32[2] = rnd(); + + call_ref(c_dst, &tmp.rmv, offset, w, h); + call_new(a_dst, &tmp.rmv, offset, w, h); + checkasm_check(uint32_t, (uint32_t*)(c_buf + offset), stride, + (uint32_t*)(a_buf + offset), stride, + w_uint32, h, "dst"); + + bench_new(a_dst, &tmp.rmv, 0, w, h); + } + } + } + report("splat_mv"); +} + +void checkasm_check_refmvs(void) { + Dav1dRefmvsDSPContext c; + dav1d_refmvs_dsp_init(&c); + + check_save_tmvs(&c); + check_splat_mv(&c); +} diff --git a/third_party/dav1d/tests/checkasm/x86/checkasm.asm b/third_party/dav1d/tests/checkasm/x86/checkasm.asm new file mode 100644 index 0000000000..8f19ef97f7 --- /dev/null +++ b/third_party/dav1d/tests/checkasm/x86/checkasm.asm @@ -0,0 +1,475 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
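+
+; This file implements the x86 side of checkasm's checked_call() wrapper.
+; The idea, summarized from the code below: load known random values into
+; the registers the calling convention requires the callee to preserve,
+; clobber the upper halves of 32-bit arguments, write canaries above any
+; stack-passed parameters, call the function under test, then verify the
+; GPRs, XMM6-15 on Win64, the stack canaries and (via xgetbv) the absence
+; of dirty YMM state, reporting any mismatch through fail_func() with one
+; of the errmsg_* strings defined below.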
+ +%include "config.asm" +%undef private_prefix +%define private_prefix checkasm +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%if ARCH_X86_64 +; just random numbers to reduce the chance of incidental match +%if WIN64 +x6: dq 0x1a1b2550a612b48c,0x79445c159ce79064 +x7: dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636 +x8: dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e +x9: dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f +x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9 +x11: dq 0x89b0c0765892729a,0x77d410d5c42c882d +x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b +x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786 +x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef +x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5 +n7: dq 0x21f86d66c8ca00ce +n8: dq 0x75b6ba21077c48ad +%endif +n9: dq 0xed56bb2dcb3c7736 +n10: dq 0x8bda43d3fd1a7e06 +n11: dq 0xb64a9c9e5d318408 +n12: dq 0xdf9a54b303f1d3a3 +n13: dq 0x4a75479abd64e097 +n14: dq 0x249214109d5d1c88 +%endif + +errmsg_stack: db "stack corruption", 0 +errmsg_register: db "failed to preserve register:%s", 0 +errmsg_vzeroupper: db "missing vzeroupper", 0 + +SECTION .bss + +check_vzeroupper: resd 1 + +SECTION .text + +cextern fail_func + +; max number of args used by any asm function. +; (max_args % 4) must equal 3 for stack alignment +%define max_args 15 + +%if UNIX64 + DECLARE_REG_TMP 0 +%else + DECLARE_REG_TMP 4 +%endif + +;----------------------------------------------------------------------------- +; unsigned checkasm_init_x86(char *name) +;----------------------------------------------------------------------------- +cglobal init_x86, 0, 5 +%if ARCH_X86_64 + push rbx +%endif + movifnidn t0, r0mp + mov eax, 0x80000000 + cpuid + cmp eax, 0x80000004 + jb .no_brand ; processor brand string not supported + mov eax, 0x80000002 + cpuid + mov [t0+4* 0], eax + mov [t0+4* 1], ebx + mov [t0+4* 2], ecx + mov [t0+4* 3], edx + mov eax, 0x80000003 + cpuid + mov [t0+4* 4], eax + mov [t0+4* 5], ebx + mov [t0+4* 6], ecx + mov [t0+4* 7], edx + mov eax, 0x80000004 + cpuid + mov [t0+4* 8], eax + mov [t0+4* 9], ebx + mov [t0+4*10], ecx + mov [t0+4*11], edx + xor eax, eax + cpuid + jmp .check_xcr1 +.no_brand: ; use manufacturer id as a fallback + xor eax, eax + mov [t0+4*3], eax + cpuid + mov [t0+4*0], ebx + mov [t0+4*1], edx + mov [t0+4*2], ecx +.check_xcr1: + test eax, eax + jz .end2 ; cpuid leaf 1 not supported + mov t0d, eax ; max leaf + mov eax, 1 + cpuid + and ecx, 0x18000000 + cmp ecx, 0x18000000 + jne .end2 ; osxsave/avx not supported + cmp t0d, 13 ; cpuid leaf 13 not supported + jb .end2 + mov t0d, eax ; cpuid signature + mov eax, 13 + mov ecx, 1 + cpuid + test al, 0x04 + jz .end ; xcr1 not supported + mov ecx, 1 + xgetbv + test al, 0x04 + jnz .end ; always-dirty ymm state +%if ARCH_X86_64 == 0 && PIC + LEA eax, check_vzeroupper + mov [eax], ecx +%else + mov [check_vzeroupper], ecx +%endif +.end: + mov eax, t0d +.end2: +%if ARCH_X86_64 + pop rbx +%endif + RET + +%if ARCH_X86_64 +%if WIN64 + %define stack_param rsp+32 ; shadow space + %define num_fn_args rsp+stack_offset+17*8 + %assign num_reg_args 4 + %assign free_regs 7 + %assign clobber_mask_stack_bit 16 + DECLARE_REG_TMP 4 +%else + %define stack_param rsp + %define num_fn_args rsp+stack_offset+11*8 + %assign num_reg_args 6 + %assign free_regs 9 + %assign clobber_mask_stack_bit 64 + DECLARE_REG_TMP 7 +%endif + +%macro CLOBBER_UPPER 2 ; reg, mask_bit + mov r13d, %1d + or r13, r8 + test r9b, %2 + cmovnz %1, r13 +%endmacro + +cglobal checked_call, 2, 15, 16, max_args*8+64+8 + mov r10d, [num_fn_args] + mov r8, 0xdeadbeef00000000 + mov r9d, 
[num_fn_args+r10*8+8] ; clobber_mask + mov t0, [num_fn_args+r10*8] ; func + + ; Clobber the upper halves of 32-bit parameters + CLOBBER_UPPER r0, 1 + CLOBBER_UPPER r1, 2 + CLOBBER_UPPER r2, 4 + CLOBBER_UPPER r3, 8 +%if UNIX64 + CLOBBER_UPPER r4, 16 + CLOBBER_UPPER r5, 32 +%else ; WIN64 +%assign i 6 +%rep 16-6 + mova m %+ i, [x %+ i] + %assign i i+1 +%endrep +%endif + + xor r11d, r11d + sub r10d, num_reg_args + cmovs r10d, r11d ; num stack args + + ; write stack canaries to the area above parameters passed on the stack + mov r12, [rsp+stack_offset] ; return address + not r12 +%assign i 0 +%rep 8 ; 64 bytes + mov [stack_param+(r10+i)*8], r12 + %assign i i+1 +%endrep + + test r10d, r10d + jz .stack_setup_done ; no stack parameters +.copy_stack_parameter: + mov r12, [stack_param+stack_offset+8+r11*8] + CLOBBER_UPPER r12, clobber_mask_stack_bit + shr r9d, 1 + mov [stack_param+r11*8], r12 + inc r11d + cmp r11d, r10d + jl .copy_stack_parameter +.stack_setup_done: + +%assign i 14 +%rep 15-free_regs + mov r %+ i, [n %+ i] + %assign i i-1 +%endrep + call t0 + + ; check for stack corruption + mov r0d, [num_fn_args] + xor r3d, r3d + sub r0d, num_reg_args + cmovs r0d, r3d ; num stack args + + mov r3, [rsp+stack_offset] + mov r4, [stack_param+r0*8] + not r3 + xor r4, r3 +%assign i 1 +%rep 6 + mov r5, [stack_param+(r0+i)*8] + xor r5, r3 + or r4, r5 + %assign i i+1 +%endrep + xor r3, [stack_param+(r0+7)*8] + or r4, r3 + jz .stack_ok + ; Save the return value located in rdx:rax first to prevent clobbering. + mov r10, rax + mov r11, rdx + lea r0, [errmsg_stack] + jmp .fail +.stack_ok: + + ; check for failure to preserve registers +%assign i 14 +%rep 15-free_regs + cmp r %+ i, [n %+ i] + setne r4b + lea r3d, [r4+r3*2] + %assign i i-1 +%endrep +%if WIN64 + lea r0, [rsp+32] ; account for shadow space + mov r5, r0 + test r3d, r3d + jz .gpr_ok +%else + test r3d, r3d + jz .gpr_xmm_ok + mov r0, rsp +%endif +%assign i free_regs +%rep 15-free_regs +%if i < 10 + mov dword [r0], " r0" + (i << 16) + lea r4, [r0+3] +%else + mov dword [r0], " r10" + ((i - 10) << 24) + lea r4, [r0+4] +%endif + test r3b, 1 << (i - free_regs) + cmovnz r0, r4 + %assign i i+1 +%endrep +%if WIN64 ; xmm registers +.gpr_ok: +%assign i 6 +%rep 16-6 + pxor m %+ i, [x %+ i] + %assign i i+1 +%endrep + packsswb m6, m7 + packsswb m8, m9 + packsswb m10, m11 + packsswb m12, m13 + packsswb m14, m15 + packsswb m6, m6 + packsswb m8, m10 + packsswb m12, m14 + packsswb m6, m6 + packsswb m8, m12 + packsswb m6, m8 + pxor m7, m7 + pcmpeqb m6, m7 + pmovmskb r3d, m6 + cmp r3d, 0xffff + je .xmm_ok + mov r7d, " xmm" +%assign i 6 +%rep 16-6 + mov [r0+0], r7d +%if i < 10 + mov byte [r0+4], "0" + i + lea r4, [r0+5] +%else + mov word [r0+4], "10" + ((i - 10) << 8) + lea r4, [r0+6] +%endif + test r3d, 1 << i + cmovz r0, r4 + %assign i i+1 +%endrep +.xmm_ok: + cmp r0, r5 + je .gpr_xmm_ok + mov byte [r0], 0 + mov r11, rdx + mov r1, r5 +%else + mov byte [r0], 0 + mov r11, rdx + mov r1, rsp +%endif + mov r10, rax + lea r0, [errmsg_register] + jmp .fail +.gpr_xmm_ok: + ; Check for dirty YMM state, i.e. missing vzeroupper + mov ecx, [check_vzeroupper] + test ecx, ecx + jz .ok ; not supported, skip + mov r10, rax + mov r11, rdx + xgetbv + test al, 0x04 + jz .restore_retval ; clean ymm state + lea r0, [errmsg_vzeroupper] + vzeroupper +.fail: + ; Call fail_func() with a descriptive message to mark it as a failure. 
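+ ; r0 holds the format string (for register failures, r1 points at the
+ ; buffer of clobbered register names built above). eax is zeroed since
+ ; fail_func() takes printf-style varargs and the SysV x86-64 ABI passes
+ ; the number of vector registers used in al; this is harmless on Win64.
+ ; The return value of the tested function was stashed in r10:r11 and is
+ ; restored at .restore_retval below.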
+ xor eax, eax
+ call fail_func
+.restore_retval:
+ mov rax, r10
+ mov rdx, r11
+.ok:
+ RET
+
+; trigger a warmup of vector units
+%macro WARMUP 0
+cglobal warmup, 0, 0
+ xorps m0, m0
+ mulps m0, m0
+ RET
+%endmacro
+
+INIT_YMM avx2
+WARMUP
+INIT_ZMM avx512
+WARMUP
+
+%else
+
+; just random numbers to reduce the chance of incidental match
+%assign n3 0x6549315c
+%assign n4 0xe02f3e23
+%assign n5 0xb78d0d1d
+%assign n6 0x33627ba7
+
+;-----------------------------------------------------------------------------
+; void checkasm_checked_call(void *func, ...)
+;-----------------------------------------------------------------------------
+cglobal checked_call, 1, 7
+ mov r3, [esp+stack_offset] ; return address
+ mov r1, [esp+stack_offset+17*4] ; num_stack_params
+ mov r2, 27
+ not r3
+ sub r2, r1
+.push_canary:
+ push r3
+ dec r2
+ jg .push_canary
+.push_parameter:
+ push dword [esp+32*4]
+ dec r1
+ jg .push_parameter
+ mov r3, n3
+ mov r4, n4
+ mov r5, n5
+ mov r6, n6
+ call r0
+
+ ; check for failure to preserve registers
+ cmp r3, n3
+ setne r3h
+ cmp r4, n4
+ setne r3b
+ shl r3d, 16
+ cmp r5, n5
+ setne r3h
+ cmp r6, n6
+ setne r3b
+ test r3, r3
+ jz .gpr_ok
+ lea r1, [esp+16]
+ mov [esp+4], r1
+%assign i 3
+%rep 4
+ mov dword [r1], " r0" + (i << 16)
+ lea r4, [r1+3]
+ test r3, 1 << ((6 - i) * 8)
+ cmovnz r1, r4
+ %assign i i+1
+%endrep
+ mov byte [r1], 0
+ mov r5, eax
+ mov r6, edx
+ LEA r1, errmsg_register
+ jmp .fail
+.gpr_ok:
+ ; check for stack corruption
+ mov r3, [esp+48*4] ; num_stack_params
+ mov r6, [esp+31*4] ; return address
+ mov r4, [esp+r3*4]
+ sub r3, 26
+ not r6
+ xor r4, r6
+.check_canary:
+ mov r5, [esp+(r3+27)*4]
+ xor r5, r6
+ or r4, r5
+ inc r3
+ jl .check_canary
+ mov r5, eax
+ mov r6, edx
+ test r4, r4
+ jz .stack_ok
+ LEA r1, errmsg_stack
+ jmp .fail
+.stack_ok:
+ ; check for dirty YMM state, i.e. missing vzeroupper
+ LEA ecx, check_vzeroupper
+ mov ecx, [ecx]
+ test ecx, ecx
+ jz .ok ; not supported, skip
+ xgetbv
+ test al, 0x04
+ jz .ok ; clean ymm state
+ LEA r1, errmsg_vzeroupper
+ vzeroupper
+.fail:
+ mov [esp], r1
+ call fail_func
+.ok:
+ add esp, 27*4
+ mov eax, r5
+ mov edx, r6
+ RET
+
+%endif ; ARCH_X86_64
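
Taken together, the C tests above all follow one checkasm pattern: declare_func() fixes the function signature once, check_func() selects an implementation for the active CPU-flag set (returning nonzero when there is a new version to verify), call_ref()/call_new() run the C reference and the asm candidate on identical inputs, fail() or the checkasm_check*() helpers flag mismatches, bench_new() times the asm version, and report() prints the per-group verdict. For reference, a minimal sketch of such a test in the same style, assuming the usual checkasm.h environment (pixel, BITDEPTH, rnd(), ALIGN_STK_64) plus <string.h> for memcmp(); the Dav1dExampleDSPContext type, its example member, and the "example" name are hypothetical, not part of dav1d:

static void check_example(Dav1dExampleDSPContext *const c) {
    /* separate output buffers for the C reference and the asm version */
    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
    ALIGN_STK_64(pixel, a_dst, 64 * 64,);

    declare_func(void, pixel *dst, ptrdiff_t stride, int w, int h);

    for (int w = 4; w <= 64; w <<= 1)
        if (check_func(c->example, "example_w%d_%dbpc", w, BITDEPTH)) {
            /* identical random input for both implementations */
            for (int i = 0; i < 64 * 64; i++)
                c_dst[i] = a_dst[i] = rnd() & ((1U << BITDEPTH) - 1);

            call_ref(c_dst, 64 * sizeof(pixel), w, w);
            call_new(a_dst, 64 * sizeof(pixel), w, w);
            if (memcmp(c_dst, a_dst, 64 * 64 * sizeof(pixel)))
                fail(); /* marks the current function as failed */

            bench_new(a_dst, 64 * sizeof(pixel), w, w);
        }
    report("example");
}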