summaryrefslogtreecommitdiffstats
path: root/media/libvpx/libvpx/vpx_util
diff options
context:
space:
mode:
Diffstat (limited to 'media/libvpx/libvpx/vpx_util')
-rw-r--r--media/libvpx/libvpx/vpx_util/endian_inl.h118
-rw-r--r--media/libvpx/libvpx/vpx_util/loongson_intrinsics.h2090
-rw-r--r--media/libvpx/libvpx/vpx_util/vpx_atomics.h111
-rw-r--r--media/libvpx/libvpx/vpx_util/vpx_debug_util.c282
-rw-r--r--media/libvpx/libvpx/vpx_util/vpx_debug_util.h70
-rw-r--r--media/libvpx/libvpx/vpx_util/vpx_thread.c181
-rw-r--r--media/libvpx/libvpx/vpx_util/vpx_thread.h438
-rw-r--r--media/libvpx/libvpx/vpx_util/vpx_timestamp.h49
-rw-r--r--media/libvpx/libvpx/vpx_util/vpx_util.mk20
-rw-r--r--media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c46
-rw-r--r--media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h27
11 files changed, 3432 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vpx_util/endian_inl.h b/media/libvpx/libvpx/vpx_util/endian_inl.h
new file mode 100644
index 0000000000..1b6ef56c69
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/endian_inl.h
@@ -0,0 +1,118 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Endian related functions.
+
+#ifndef VPX_VPX_UTIL_ENDIAN_INL_H_
+#define VPX_VPX_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if defined(__GNUC__)
+#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_GCC_VERSION 0
+#define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+// handle clang compatibility
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+ (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+ (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#define HToBE64(x) (x)
+#define HToBE32(x) (x)
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#define HToBE64(X) BSwap64(X)
+#define HToBE32(X) BSwap32(X)
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+#define HAVE_BUILTIN_BSWAP32
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
+#endif
+
+#if HAVE_MIPS32 && defined(__mips__) && !defined(__mips64) && \
+ defined(__mips_isa_rev) && (__mips_isa_rev >= 2) && (__mips_isa_rev < 6)
+#define VPX_USE_MIPS32_R2
+#endif
+
+static INLINE uint16_t BSwap16(uint16_t x) {
+#if defined(HAVE_BUILTIN_BSWAP16)
+ return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+ return _byteswap_ushort(x);
+#else
+ // gcc will recognize a 'rorw $8, ...' here:
+ return (x >> 8) | ((x & 0xff) << 8);
+#endif // HAVE_BUILTIN_BSWAP16
+}
+
+static INLINE uint32_t BSwap32(uint32_t x) {
+#if defined(VPX_USE_MIPS32_R2)
+ uint32_t ret;
+ __asm__ volatile(
+ "wsbh %[ret], %[x] \n\t"
+ "rotr %[ret], %[ret], 16 \n\t"
+ : [ret] "=r"(ret)
+ : [x] "r"(x));
+ return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
+ return __builtin_bswap32(x);
+#elif defined(__i386__) || defined(__x86_64__)
+ uint32_t swapped_bytes;
+ __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint32_t)_byteswap_ulong(x);
+#else
+ return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif // HAVE_BUILTIN_BSWAP32
+}
+
+static INLINE uint64_t BSwap64(uint64_t x) {
+#if defined(HAVE_BUILTIN_BSWAP64)
+ return __builtin_bswap64(x);
+#elif defined(__x86_64__)
+ uint64_t swapped_bytes;
+ __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint64_t)_byteswap_uint64(x);
+#else // generic code for swapping 64-bit values (suggested by bdb@)
+ x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+ x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+ x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8);
+ return x;
+#endif // HAVE_BUILTIN_BSWAP64
+}
+
+#endif // VPX_VPX_UTIL_ENDIAN_INL_H_
diff --git a/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h b/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h
new file mode 100644
index 0000000000..b8b9e6db02
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h
@@ -0,0 +1,2090 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
+#define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
+
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ * Xiwei Gu <guxiwei-hf@loongson.cn>
+ * Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is a header file for loongarch builtin extension.
+ *
+ */
+
+#ifndef LOONGSON_INTRINSICS_H
+#define LOONGSON_INTRINSICS_H
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: Add new functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LSOM_VERSION_MAJOR 1
+#define LSOM_VERSION_MINOR 2
+#define LSOM_VERSION_MICRO 1
+
+#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
+ { \
+ _OUT0 = _INS(_IN0); \
+ _OUT1 = _INS(_IN1); \
+ }
+
+#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
+ { \
+ _OUT0 = _INS(_IN0, _IN1); \
+ _OUT1 = _INS(_IN2, _IN3); \
+ }
+
+#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
+ { \
+ _OUT0 = _INS(_IN0, _IN1, _IN2); \
+ _OUT1 = _INS(_IN3, _IN4, _IN5); \
+ }
+
+#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
+ DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
+ }
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+ _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
+ DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
+ }
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+ _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
+ DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
+ }
+
+#ifdef __loongarch_sx
+#include <lsxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Signed byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and then added adjacent to
+ * each other to get a result twice the size of input. Then
+ * the results are added to signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * unsigned byte elements from in_l, and then added adjacent to
+ * each other to get a result twice the size of input.
+ * The results are added to signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and then added adjacent to
+ * each other to get a result twice the size of input.
+ * The results are added to signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ * in_c : 1,1,1,1, 1,1,1,1
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ * out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of half-word vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - __m128i
+ * Details : Signed half-word elements from in_h are multiplied by
+ * signed half-word elements from in_l, and then added adjacent to
+ * each other to get a result twice the size of input.
+ * Then the results are added to signed word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ * in_c : 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1
+ * out : 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Signed byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and then added adjacent to
+ * each other to get a result twice the size of input.
+ * Example : out = __lsx_vdp2_h_b(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_h_b(in_h, in_l);
+ out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * unsigned byte elements from in_l, and then added adjacent to
+ * each other to get a result twice the size of input.
+ * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_h_bu(in_h, in_l);
+ out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and then added adjacent to
+ * each other to get a result twice the size of input.
+ * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
+ * out : 22,38,38,22, 22,38,38,6
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+ out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Signed byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and then added adjacent to
+ * each other to get a result twice the size of input.
+ * Example : out = __lsx_vdp2_w_h(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_w_h(in_h, in_l);
+ out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Outputs - out
+ * Return Type - double
+ * Details : Signed byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and then added adjacent to
+ * each other to get a result twice the size of input.
+ * Example : out = __lsx_vdp2_d_w(in_h, in_l)
+ * in_h : 1,2,3,4
+ * in_l : 8,7,6,5
+ * out : 22,38
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmulwev_d_w(in_h, in_l);
+ out = __lsx_vmaddwod_d_w(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
+ * (_in))
+ * Arguments : Inputs - _in (input vector)
+ * - min (min threshold)
+ * - max (max threshold)
+ * Outputs - out (output vector with clipped elements)
+ * Return Type - signed halfword
+ * Example : out = __lsx_vclip_h(_in)
+ * _in : -8,2,280,249, -8,255,280,249
+ * min : 1,1,1,1, 1,1,1,1
+ * max : 9,9,9,9, 9,9,9,9
+ * out : 1,2,9,9, 1,9,9,9
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
+ __m128i out;
+
+ out = __lsx_vmax_h(min, _in);
+ out = __lsx_vmin_h(max, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments : Inputs - _in
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Signed byte elements from _in are clamped between 0 and 255.
+ * Example : out = __lsx_vclip255_h(_in)
+ * _in : -8,255,280,249, -8,255,280,249
+ * out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_h(__m128i _in) {
+ __m128i out;
+
+ out = __lsx_vmaxi_h(_in, 0);
+ out = __lsx_vsat_hu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments : Inputs - _in
+ * Outputs - out
+ * Return Type - word
+ * Details : Signed byte elements from _in are clamped between 0 and 255.
+ * Example : out = __lsx_vclip255_w(_in)
+ * _in : -8,255,280,249
+ * out : 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_w(__m128i _in) {
+ __m128i out;
+
+ out = __lsx_vmaxi_w(_in, 0);
+ out = __lsx_vsat_wu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Swap two variables
+ * Arguments : Inputs - _in0, _in1
+ * Outputs - _in0, _in1 (in-place)
+ * Details : Swapping of two input variables using xor
+ * Example : LSX_SWAP(_in0, _in1)
+ * _in0 : 1,2,3,4
+ * _in1 : 5,6,7,8
+ * _in0(out) : 5,6,7,8
+ * _in1(out) : 1,2,3,4
+ * =============================================================================
+ */
+#define LSX_SWAP(_in0, _in1) \
+ { \
+ _in0 = __lsx_vxor_v(_in0, _in1); \
+ _in1 = __lsx_vxor_v(_in0, _in1); \
+ _in0 = __lsx_vxor_v(_in0, _in1); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4 1, 5, 9,13
+ * 5, 6, 7, 8 to 2, 6,10,14
+ * 9,10,11,12 =====> 3, 7,11,15
+ * 13,14,15,16 4, 8,12,16
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ __m128i _t0, _t1, _t2, _t3; \
+ \
+ _t0 = __lsx_vilvl_w(_in1, _in0); \
+ _t1 = __lsx_vilvh_w(_in1, _in0); \
+ _t2 = __lsx_vilvl_w(_in3, _in2); \
+ _t3 = __lsx_vilvh_w(_in3, _in2); \
+ _out0 = __lsx_vilvl_d(_t2, _t0); \
+ _out1 = __lsx_vilvh_d(_t2, _t0); \
+ _out2 = __lsx_vilvl_d(_t3, _t1); \
+ _out3 = __lsx_vilvh_d(_t3, _t1); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with byte elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
+ * Details : The rows of the matrix become columns, and the columns
+ * become rows.
+ * Example : LSX_TRANSPOSE8x8_B
+ * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
+ * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
+ * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
+ * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
+ * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
+ * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
+ * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
+ * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
+ *
+ * _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ * _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ * _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ * _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
+ * _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
+ * _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
+ * _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m128i zero = { 0 }; \
+ __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \
+ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _t0 = __lsx_vilvl_b(_in2, _in0); \
+ _t1 = __lsx_vilvl_b(_in3, _in1); \
+ _t2 = __lsx_vilvl_b(_in6, _in4); \
+ _t3 = __lsx_vilvl_b(_in7, _in5); \
+ _t4 = __lsx_vilvl_b(_t1, _t0); \
+ _t5 = __lsx_vilvh_b(_t1, _t0); \
+ _t6 = __lsx_vilvl_b(_t3, _t2); \
+ _t7 = __lsx_vilvh_b(_t3, _t2); \
+ _out0 = __lsx_vilvl_w(_t6, _t4); \
+ _out2 = __lsx_vilvh_w(_t6, _t4); \
+ _out4 = __lsx_vilvl_w(_t7, _t5); \
+ _out6 = __lsx_vilvh_w(_t7, _t5); \
+ _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
+ _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
+ _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
+ _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details :
+ * Example :
+ * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70
+ * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71
+ * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72
+ * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73
+ * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74
+ * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75
+ * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76
+ * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _s0 = __lsx_vilvl_h(_in6, _in4); \
+ _s1 = __lsx_vilvl_h(_in7, _in5); \
+ _t0 = __lsx_vilvl_h(_s1, _s0); \
+ _t1 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvh_h(_in6, _in4); \
+ _s1 = __lsx_vilvh_h(_in7, _in5); \
+ _t2 = __lsx_vilvl_h(_s1, _s0); \
+ _t3 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvl_h(_in2, _in0); \
+ _s1 = __lsx_vilvl_h(_in3, _in1); \
+ _t4 = __lsx_vilvl_h(_s1, _s0); \
+ _t5 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvh_h(_in2, _in0); \
+ _s1 = __lsx_vilvh_h(_in3, _in1); \
+ _t6 = __lsx_vilvl_h(_s1, _s0); \
+ _t7 = __lsx_vilvh_h(_s1, _s0); \
+ \
+ _out0 = __lsx_vpickev_d(_t0, _t4); \
+ _out2 = __lsx_vpickev_d(_t1, _t5); \
+ _out4 = __lsx_vpickev_d(_t2, _t6); \
+ _out6 = __lsx_vpickev_d(_t3, _t7); \
+ _out1 = __lsx_vpickod_d(_t0, _t4); \
+ _out3 = __lsx_vpickod_d(_t1, _t5); \
+ _out5 = __lsx_vpickod_d(_t2, _t6); \
+ _out7 = __lsx_vpickod_d(_t3, _t7); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x4 byte block into 4x8
+ * Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block)
+ * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
+ * Return Type - as per RTYPE
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : LSX_TRANSPOSE8x4_B
+ * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
+ * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
+ *
+ * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3) \
+ { \
+ __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ \
+ _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
+ _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
+ _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
+ _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
+ \
+ _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
+ _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
+ \
+ _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
+ _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
+ _out1 = __lsx_vilvh_d(_out2, _out0); \
+ _out3 = __lsx_vilvh_d(_out0, _out2); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8
+ * in9, in10, in11, in12, in13, in14, in15
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details :
+ * Example :
+ * 000,001,002,003,004,005,006,007
+ * 008,009,010,011,012,013,014,015
+ * 016,017,018,019,020,021,022,023
+ * 024,025,026,027,028,029,030,031
+ * 032,033,034,035,036,037,038,039
+ * 040,041,042,043,044,045,046,047 000,008,...,112,120
+ * 048,049,050,051,052,053,054,055 001,009,...,113,121
+ * 056,057,058,059,060,061,062,063 to 002,010,...,114,122
+ * 064,068,066,067,068,069,070,071 =====> 003,011,...,115,123
+ * 072,073,074,075,076,077,078,079 004,012,...,116,124
+ * 080,081,082,083,084,085,086,087 005,013,...,117,125
+ * 088,089,090,091,092,093,094,095 006,014,...,118,126
+ * 096,097,098,099,100,101,102,103 007,015,...,119,127
+ * 104,105,106,107,108,109,110,111
+ * 112,113,114,115,116,117,118,119
+ * 120,121,122,123,124,125,126,127
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
+ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
+ _tmp0, _tmp1, _tmp2, _tmp3); \
+ DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
+ _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
+ DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
+ DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
+ DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
+ DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
+ DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
+ DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
+ DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
+ DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
+ DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
+ DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
+ DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
+ DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details : Butterfly operation
+ * Example :
+ * out0 = in0 + in3;
+ * out1 = in1 + in2;
+ * out2 = in1 - in2;
+ * out3 = in0 - in3;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_b(_in0, _in3); \
+ _out1 = __lsx_vadd_b(_in1, _in2); \
+ _out2 = __lsx_vsub_b(_in1, _in2); \
+ _out3 = __lsx_vsub_b(_in0, _in3); \
+ }
+#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_h(_in0, _in3); \
+ _out1 = __lsx_vadd_h(_in1, _in2); \
+ _out2 = __lsx_vsub_h(_in1, _in2); \
+ _out3 = __lsx_vsub_h(_in0, _in3); \
+ }
+#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_w(_in0, _in3); \
+ _out1 = __lsx_vadd_w(_in1, _in2); \
+ _out2 = __lsx_vsub_w(_in1, _in2); \
+ _out3 = __lsx_vsub_w(_in0, _in3); \
+ }
+#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lsx_vadd_d(_in0, _in3); \
+ _out1 = __lsx_vadd_d(_in1, _in2); \
+ _out2 = __lsx_vsub_d(_in1, _in2); \
+ _out3 = __lsx_vsub_d(_in0, _in3); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
+ * Outputs - _out0, _out1, _out2, _out3, ~
+ * Details : Butterfly operation
+ * Example :
+ * _out0 = _in0 + _in7;
+ * _out1 = _in1 + _in6;
+ * _out2 = _in2 + _in5;
+ * _out3 = _in3 + _in4;
+ * _out4 = _in3 - _in4;
+ * _out5 = _in2 - _in5;
+ * _out6 = _in1 - _in6;
+ * _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_b(_in0, _in7); \
+ _out1 = __lsx_vadd_b(_in1, _in6); \
+ _out2 = __lsx_vadd_b(_in2, _in5); \
+ _out3 = __lsx_vadd_b(_in3, _in4); \
+ _out4 = __lsx_vsub_b(_in3, _in4); \
+ _out5 = __lsx_vsub_b(_in2, _in5); \
+ _out6 = __lsx_vsub_b(_in1, _in6); \
+ _out7 = __lsx_vsub_b(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_h(_in0, _in7); \
+ _out1 = __lsx_vadd_h(_in1, _in6); \
+ _out2 = __lsx_vadd_h(_in2, _in5); \
+ _out3 = __lsx_vadd_h(_in3, _in4); \
+ _out4 = __lsx_vsub_h(_in3, _in4); \
+ _out5 = __lsx_vsub_h(_in2, _in5); \
+ _out6 = __lsx_vsub_h(_in1, _in6); \
+ _out7 = __lsx_vsub_h(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_w(_in0, _in7); \
+ _out1 = __lsx_vadd_w(_in1, _in6); \
+ _out2 = __lsx_vadd_w(_in2, _in5); \
+ _out3 = __lsx_vadd_w(_in3, _in4); \
+ _out4 = __lsx_vsub_w(_in3, _in4); \
+ _out5 = __lsx_vsub_w(_in2, _in5); \
+ _out6 = __lsx_vsub_w(_in1, _in6); \
+ _out7 = __lsx_vsub_w(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_d(_in0, _in7); \
+ _out1 = __lsx_vadd_d(_in1, _in6); \
+ _out2 = __lsx_vadd_d(_in2, _in5); \
+ _out3 = __lsx_vadd_d(_in3, _in4); \
+ _out4 = __lsx_vsub_d(_in3, _in4); \
+ _out5 = __lsx_vsub_d(_in2, _in5); \
+ _out6 = __lsx_vsub_d(_in1, _in6); \
+ _out7 = __lsx_vsub_d(_in0, _in7); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 16 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
+ * Outputs - _out0, _out1, _out2, _out3, ~
+ * Details : Butterfly operation
+ * Example :
+ * _out0 = _in0 + _in15;
+ * _out1 = _in1 + _in14;
+ * _out2 = _in2 + _in13;
+ * _out3 = _in3 + _in12;
+ * _out4 = _in4 + _in11;
+ * _out5 = _in5 + _in10;
+ * _out6 = _in6 + _in9;
+ * _out7 = _in7 + _in8;
+ * _out8 = _in7 - _in8;
+ * _out9 = _in6 - _in9;
+ * _out10 = _in5 - _in10;
+ * _out11 = _in4 - _in11;
+ * _out12 = _in3 - _in12;
+ * _out13 = _in2 - _in13;
+ * _out14 = _in1 - _in14;
+ * _out15 = _in0 - _in15;
+ * =============================================================================
+ */
+
+#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+ _out13, _out14, _out15) \
+ { \
+ _out0 = __lsx_vadd_b(_in0, _in15); \
+ _out1 = __lsx_vadd_b(_in1, _in14); \
+ _out2 = __lsx_vadd_b(_in2, _in13); \
+ _out3 = __lsx_vadd_b(_in3, _in12); \
+ _out4 = __lsx_vadd_b(_in4, _in11); \
+ _out5 = __lsx_vadd_b(_in5, _in10); \
+ _out6 = __lsx_vadd_b(_in6, _in9); \
+ _out7 = __lsx_vadd_b(_in7, _in8); \
+ \
+ _out8 = __lsx_vsub_b(_in7, _in8); \
+ _out9 = __lsx_vsub_b(_in6, _in9); \
+ _out10 = __lsx_vsub_b(_in5, _in10); \
+ _out11 = __lsx_vsub_b(_in4, _in11); \
+ _out12 = __lsx_vsub_b(_in3, _in12); \
+ _out13 = __lsx_vsub_b(_in2, _in13); \
+ _out14 = __lsx_vsub_b(_in1, _in14); \
+ _out15 = __lsx_vsub_b(_in0, _in15); \
+ }
+
+#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+ _out13, _out14, _out15) \
+ { \
+ _out0 = __lsx_vadd_h(_in0, _in15); \
+ _out1 = __lsx_vadd_h(_in1, _in14); \
+ _out2 = __lsx_vadd_h(_in2, _in13); \
+ _out3 = __lsx_vadd_h(_in3, _in12); \
+ _out4 = __lsx_vadd_h(_in4, _in11); \
+ _out5 = __lsx_vadd_h(_in5, _in10); \
+ _out6 = __lsx_vadd_h(_in6, _in9); \
+ _out7 = __lsx_vadd_h(_in7, _in8); \
+ \
+ _out8 = __lsx_vsub_h(_in7, _in8); \
+ _out9 = __lsx_vsub_h(_in6, _in9); \
+ _out10 = __lsx_vsub_h(_in5, _in10); \
+ _out11 = __lsx_vsub_h(_in4, _in11); \
+ _out12 = __lsx_vsub_h(_in3, _in12); \
+ _out13 = __lsx_vsub_h(_in2, _in13); \
+ _out14 = __lsx_vsub_h(_in1, _in14); \
+ _out15 = __lsx_vsub_h(_in0, _in15); \
+ }
+
+#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+ _out13, _out14, _out15) \
+ { \
+ _out0 = __lsx_vadd_w(_in0, _in15); \
+ _out1 = __lsx_vadd_w(_in1, _in14); \
+ _out2 = __lsx_vadd_w(_in2, _in13); \
+ _out3 = __lsx_vadd_w(_in3, _in12); \
+ _out4 = __lsx_vadd_w(_in4, _in11); \
+ _out5 = __lsx_vadd_w(_in5, _in10); \
+ _out6 = __lsx_vadd_w(_in6, _in9); \
+ _out7 = __lsx_vadd_w(_in7, _in8); \
+ \
+ _out8 = __lsx_vsub_w(_in7, _in8); \
+ _out9 = __lsx_vsub_w(_in6, _in9); \
+ _out10 = __lsx_vsub_w(_in5, _in10); \
+ _out11 = __lsx_vsub_w(_in4, _in11); \
+ _out12 = __lsx_vsub_w(_in3, _in12); \
+ _out13 = __lsx_vsub_w(_in2, _in13); \
+ _out14 = __lsx_vsub_w(_in1, _in14); \
+ _out15 = __lsx_vsub_w(_in0, _in15); \
+ }
+
+#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+ _out13, _out14, _out15) \
+ { \
+ _out0 = __lsx_vadd_d(_in0, _in15); \
+ _out1 = __lsx_vadd_d(_in1, _in14); \
+ _out2 = __lsx_vadd_d(_in2, _in13); \
+ _out3 = __lsx_vadd_d(_in3, _in12); \
+ _out4 = __lsx_vadd_d(_in4, _in11); \
+ _out5 = __lsx_vadd_d(_in5, _in10); \
+ _out6 = __lsx_vadd_d(_in6, _in9); \
+ _out7 = __lsx_vadd_d(_in7, _in8); \
+ \
+ _out8 = __lsx_vsub_d(_in7, _in8); \
+ _out9 = __lsx_vsub_d(_in6, _in9); \
+ _out10 = __lsx_vsub_d(_in5, _in10); \
+ _out11 = __lsx_vsub_d(_in4, _in11); \
+ _out12 = __lsx_vsub_d(_in3, _in12); \
+ _out13 = __lsx_vsub_d(_in2, _in13); \
+ _out14 = __lsx_vsub_d(_in1, _in14); \
+ _out15 = __lsx_vsub_d(_in0, _in15); \
+ }
+
+#endif // LSX
+
+#ifdef __loongarch_asx
+#include <lasxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * Then these multiplied results of adjacent odd-even elements
+ * are added to the out vector
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_bu(in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed halfword
+ * Details : Signed byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * Then these multiplication results of adjacent odd-even elements
+ * are added to the out vector
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_b(in_h, in_l);
+ out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * Then these multiplied results of adjacent odd-even elements
+ * are added to the out vector.
+ * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ * out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of word vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed double
+ * Details : Signed word elements from in_h are multiplied with
+ * signed word elements from in_l producing a result
+ * twice the size of input i.e. signed double-word.
+ * Then these multiplied results of adjacent odd-even elements
+ * are added to the out vector.
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_d_w(in_h, in_l);
+ out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Unsigned halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * twice the size of input i.e. unsigned word.
+ * Multiplication result of adjacent odd-even elements
+ * are added to the out vector
+ * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Signed byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * Then these multiplied results of adjacent odd-even elements
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * Then these multiplied results of adjacent odd-even elements
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * Then these multiplied results of adjacent odd-even elements
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - per RTYPE
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * Multiplication result of adjacent odd-even elements
+ * are added to the in_c vector.
+ * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 1,2,3,4
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
+ * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
+ * out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Unsigned halfword elements from in_h are multiplied with
+ * unsigned halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * Multiplication result of adjacent odd-even elements
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Unsigned halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * Multiplication result of adjacent odd-even elements
+ * are added to the in_c vector
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Unsigned Dot Product and Subtract
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * Multiplication result of adjacent odd-even elements
+ * are added together and subtracted from double width elements
+ * in_c vector.
+ * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_bu(in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ out = __lasx_xvsub_h(in_c, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Signed Dot Product and Subtract
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * Signed halfword elements from in_l producing a result
+ * twice the size of input i.e. signed word.
+ * Multiplication result of adjacent odd-even elements
+ * are added together and subtracted from double width elements
+ * in_c vector.
+ * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * in_c : 0,0,0,0, 0,0,0,0
+ * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
+ * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ * out : -7,-3,0,0, 0,-1,0,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ out = __lasx_xvsub_w(in_c, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - signed word
+ * Details : Signed halfword elements from in_h are multiplied with
+ * signed halfword elements from in_l producing a result
+ * four times the size of input i.e. signed doubleword.
+ * Then these multiplication results of four adjacent elements
+ * are added together and stored to the out vector.
+ * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
+ * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
+ * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
+ * out : -2,0,1,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ out = __lasx_xvhaddw_d_w(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ * added after being doubled.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The in_h vector and the in_l vector are added after the
+ * higher half of the two-fold sign extension (signed byte
+ * to signed halfword) and stored to the out vector.
+ * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvh_b(in_h, in_l);
+ out = __lasx_xvhaddw_h_b(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ * added after being doubled.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The in_h vector and the in_l vector are added after the
+ * higher half of the two-fold sign extension (signed halfword
+ * to signed word) and stored to the out vector.
+ * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ * out : 1,0,0,-1, 1,0,0, 2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvh_h(in_h, in_l);
+ out = __lasx_xvhaddw_w_h(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ * added after being doubled.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The in_h vector and the in_l vector are added after the
+ * lower half of the two-fold sign extension (signed byte
+ * to signed halfword) and stored to the out vector.
+ * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvl_b(in_h, in_l);
+ out = __lasx_xvhaddw_h_b(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ * added after being doubled.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The in_h vector and the in_l vector are added after the
+ * lower half of the two-fold sign extension (signed halfword
+ * to signed word) and stored to the out vector.
+ * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ * out : 5,-1,4,2, 1,0,2,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvl_h(in_h, in_l);
+ out = __lasx_xvhaddw_w_h(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ * added after being doubled.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The out vector and the out vector are added after the
+ * lower half of the two-fold zero extension (unsigned byte
+ * to unsigned halfword) and stored to the out vector.
+ * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvilvl_b(in_h, in_l);
+ out = __lasx_xvhaddw_hu_bu(out, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ * added after being doubled.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The in_l vector after double zero extension (unsigned byte to
+ * signed halfword),added to the in_h vector.
+ * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvsllwil_hu_bu(in_l, 0);
+ out = __lasx_xvadd_h(in_h, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ * added after being doubled.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The in_l vector after double sign extension (signed halfword to
+ * signed word), added to the in_h vector.
+ * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * in_h : 0, 1,0,0, -1,0,0,1,
+ * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
+ * out : 2, 0,1,2, -1,0,1,1,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvsllwil_w_h(in_l, 0);
+ out = __lasx_xvadd_w(in_h, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ * of the lower half of the vector.
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Details : The in_h vector and the in_l vector are multiplied after
+ * the lower half of the two-fold sign extension (signed halfword
+ * to signed word), and the result is added to the vector in_c,
+ * then stored to the out vector.
+ * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * in_c : 1,2,3,4, 5,6,7,8
+ * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
+ * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
+ * -200,-300,-400,-500, -2000,-3000,-4000,-5000
+ * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+ tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+ tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+ out = __lasx_xvadd_w(tmp0, in_c);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ * of the higher half of the vector.
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Output - out
+ * Details : The in_h vector and the in_l vector are multiplied after
+ * the higher half of the two-fold sign extension (signed
+ * halfword to signed word), and the result is added to
+ * the vector in_c, then stored to the out vector.
+ * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvilvh_h(in_h, in_h);
+ tmp1 = __lasx_xvilvh_h(in_l, in_l);
+ tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+ out = __lasx_xvadd_w(tmp0, in_c);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ * half of the vector.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The in_h vector and the in_l vector are multiplied after
+ * the lower half of the two-fold sign extension (signed
+ * halfword to signed word), then stored to the out vector.
+ * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
+ * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ * out : 6,1,3,0, 0,0,1,0
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+ tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+ out = __lasx_xvmul_w(tmp0, tmp1);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ * half of the vector.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The in_h vector and the in_l vector are multiplied after
+ * the lower half of the two-fold sign extension (signed
+ * halfword to signed word), then stored to the out vector.
+ * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
+ * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ * out : 0,0,0,0, 0,0,0,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvilvh_h(in_h, in_h);
+ tmp1 = __lasx_xvilvh_h(in_l, in_l);
+ out = __lasx_xvmulwev_w_h(tmp0, tmp1);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are added to the high half
+ * after being doubled, then saturated.
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Details : The in_h vector adds the in_l vector after the lower half of
+ * the two-fold zero extension (unsigned byte to unsigned
+ * halfword) and then saturated. The results are stored to the out
+ * vector.
+ * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
+ * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
+ * 0,0,0,1
+ * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
+ __m256i tmp1, out;
+ __m256i zero = { 0 };
+
+ tmp1 = __lasx_xvilvl_b(zero, in_l);
+ out = __lasx_xvsadd_hu(in_h, tmp1);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
+ * Arguments : Inputs - in (input vector)
+ * - min (min threshold)
+ * - max (max threshold)
+ * Outputs - in (output vector with clipped elements)
+ * Return Type - signed halfword
+ * Example : out = __lasx_xvclip_h(in, min, max)
+ * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
+ * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
+ * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
+ * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
+ __m256i out;
+
+ out = __lasx_xvmax_h(min, in);
+ out = __lasx_xvmin_h(max, out);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all signed halfword elements of input vector
+ * between 0 & 255
+ * Arguments : Inputs - in (input vector)
+ * Outputs - out (output vector with clipped elements)
+ * Return Type - signed halfword
+ * Example : See out = __lasx_xvclip255_w(in)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_h(__m256i in) {
+ __m256i out;
+
+ out = __lasx_xvmaxi_h(in, 0);
+ out = __lasx_xvsat_hu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all signed word elements of input vector
+ * between 0 & 255
+ * Arguments : Inputs - in (input vector)
+ * Output - out (output vector with clipped elements)
+ * Return Type - signed word
+ * Example : out = __lasx_xvclip255_w(in)
+ * in : -8,255,280,249, -8,255,280,249
+ * out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_w(__m256i in) {
+ __m256i out;
+
+ out = __lasx_xvmaxi_w(in, 0);
+ out = __lasx_xvsat_wu(out, 7);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ * elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ * if 'idx >= 8' use xvsplati_h_*.
+ * Arguments : Inputs - in, idx
+ * Output - out
+ * Details : Idx element value from in vector is replicated to all
+ * elements in out vector.
+ * Valid index range for halfword operation is 0-7
+ * Example : out = __lasx_xvsplati_l_h(in, idx)
+ * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
+ * idx : 0x02
+ * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
+ __m256i out;
+
+ out = __lasx_xvpermi_q(in, in, 0x02);
+ out = __lasx_xvreplve_h(out, idx);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ * elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ * if 'idx >= 8' use xvsplati_h_*.
+ * Arguments : Inputs - in, idx
+ * Output - out
+ * Details : Idx element value from in vector is replicated to all
+ * elements in out vector.
+ * Valid index range for halfword operation is 0-7
+ * Example : out = __lasx_xvsplati_h_h(in, idx)
+ * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
+ * idx : 0x09
+ * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
+ __m256i out;
+
+ out = __lasx_xvpermi_q(in, in, 0x13);
+ out = __lasx_xvreplve_h(out, idx);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Example : LASX_TRANSPOSE4x4_D
+ * _in0 : 1,2,3,4
+ * _in1 : 1,2,3,4
+ * _in2 : 1,2,3,4
+ * _in3 : 1,2,3,4
+ *
+ * _out0 : 1,1,1,1
+ * _out1 : 2,2,2,2
+ * _out2 : 3,3,3,3
+ * _out3 : 4,4,4,4
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+ _out3) \
+ { \
+ __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
+ _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
+ _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
+ _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
+ _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
+ _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
+ _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
+ * Example : LASX_TRANSPOSE8x8_W
+ * _in0 : 1,2,3,4,5,6,7,8
+ * _in1 : 2,2,3,4,5,6,7,8
+ * _in2 : 3,2,3,4,5,6,7,8
+ * _in3 : 4,2,3,4,5,6,7,8
+ * _in4 : 5,2,3,4,5,6,7,8
+ * _in5 : 6,2,3,4,5,6,7,8
+ * _in6 : 7,2,3,4,5,6,7,8
+ * _in7 : 8,2,3,4,5,6,7,8
+ *
+ * _out0 : 1,2,3,4,5,6,7,8
+ * _out1 : 2,2,2,2,2,2,2,2
+ * _out2 : 3,3,3,3,3,3,3,3
+ * _out3 : 4,4,4,4,4,4,4,4
+ * _out4 : 5,5,5,5,5,5,5,5
+ * _out5 : 6,6,6,6,6,6,6,6
+ * _out6 : 7,7,7,7,7,7,7,7
+ * _out7 : 8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _s0_m = __lasx_xvilvl_w(_in2, _in0); \
+ _s1_m = __lasx_xvilvl_w(_in3, _in1); \
+ _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_w(_in2, _in0); \
+ _s1_m = __lasx_xvilvh_w(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvl_w(_in6, _in4); \
+ _s1_m = __lasx_xvilvl_w(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_w(_in6, _in4); \
+ _s1_m = __lasx_xvilvh_w(_in7, _in5); \
+ _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
+ _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
+ _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
+ _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
+ _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
+ _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ * (input 16x8 byte block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x16 byte block)
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : See LASX_TRANSPOSE16x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
+ _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
+ _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
+ _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
+ _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
+ _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
+ _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
+ _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
+ _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
+ _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
+ _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
+ _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
+ _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
+ _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
+ _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
+ _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
+ _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
+ _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
+ _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
+ _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
+ _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
+ _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
+ _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
+ _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ * (input 16x8 byte block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x16 byte block)
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : LASX_TRANSPOSE16x8_H
+ * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *
+ * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
+ * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
+ * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+ * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
+ * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+ * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+ * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+ * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
+ _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
+ _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
+ _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
+ _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
+ _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
+ _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
+ _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
+ _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
+ _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
+ _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
+ _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
+ _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
+ _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
+ _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
+ _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
+ _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
+ _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
+ \
+ _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
+ _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
+ _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
+ _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
+ _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
+ _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
+ _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
+ _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
+ _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
+ _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
+ _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
+ _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
+ _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
+ _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
+ _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
+ _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
+ _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
+ _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
+ _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
+ _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with halfword elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Return Type - signed halfword
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+ _out3) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ \
+ _s0_m = __lasx_xvilvl_h(_in1, _in0); \
+ _s1_m = __lasx_xvilvl_h(_in3, _in2); \
+ _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _out1 = __lasx_xvilvh_d(_out0, _out0); \
+ _out3 = __lasx_xvilvh_d(_out2, _out2); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x8 byte block
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * (input 8x8 byte block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x8 byte block)
+ * Example : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
+ _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
+ _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
+ _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
+ _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
+ _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
+ _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
+ _out1 = __lasx_xvbsrl_v(_out0, 8); \
+ _out3 = __lasx_xvbsrl_v(_out2, 8); \
+ _out5 = __lasx_xvbsrl_v(_out4, 8); \
+ _out7 = __lasx_xvbsrl_v(_out6, 8); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with halfword elements in vectors.
+ * Arguments : Inputs - _in0, _in1, ~
+ * Outputs - _out0, _out1, ~
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
+ * Example : LASX_TRANSPOSE8x8_H
+ * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *
+ * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
+ * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
+ * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
+ * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
+ * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
+ * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
+ * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _s0_m = __lasx_xvilvl_h(_in6, _in4); \
+ _s1_m = __lasx_xvilvl_h(_in7, _in5); \
+ _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_h(_in6, _in4); \
+ _s1_m = __lasx_xvilvh_h(_in7, _in5); \
+ _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ \
+ _s0_m = __lasx_xvilvl_h(_in2, _in0); \
+ _s1_m = __lasx_xvilvl_h(_in3, _in1); \
+ _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_h(_in2, _in0); \
+ _s1_m = __lasx_xvilvh_h(_in3, _in1); \
+ _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ \
+ _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
+ _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
+ _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
+ _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
+ _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
+ _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
+ _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
+ _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Details : Butterfly operation
+ * Example : LASX_BUTTERFLY_4
+ * _out0 = _in0 + _in3;
+ * _out1 = _in1 + _in2;
+ * _out2 = _in1 - _in2;
+ * _out3 = _in0 - _in3;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_b(_in0, _in3); \
+ _out1 = __lasx_xvadd_b(_in1, _in2); \
+ _out2 = __lasx_xvsub_b(_in1, _in2); \
+ _out3 = __lasx_xvsub_b(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_h(_in0, _in3); \
+ _out1 = __lasx_xvadd_h(_in1, _in2); \
+ _out2 = __lasx_xvsub_h(_in1, _in2); \
+ _out3 = __lasx_xvsub_h(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_w(_in0, _in3); \
+ _out1 = __lasx_xvadd_w(_in1, _in2); \
+ _out2 = __lasx_xvsub_w(_in1, _in2); \
+ _out3 = __lasx_xvsub_w(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_d(_in0, _in3); \
+ _out1 = __lasx_xvadd_d(_in1, _in2); \
+ _out2 = __lasx_xvsub_d(_in1, _in2); \
+ _out3 = __lasx_xvsub_d(_in0, _in3); \
+ }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
+ * Outputs - _out0, _out1, _out2, _out3, ~
+ * Details : Butterfly operation
+ * Example : LASX_BUTTERFLY_8
+ * _out0 = _in0 + _in7;
+ * _out1 = _in1 + _in6;
+ * _out2 = _in2 + _in5;
+ * _out3 = _in3 + _in4;
+ * _out4 = _in3 - _in4;
+ * _out5 = _in2 - _in5;
+ * _out6 = _in1 - _in6;
+ * _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_b(_in0, _in7); \
+ _out1 = __lasx_xvadd_b(_in1, _in6); \
+ _out2 = __lasx_xvadd_b(_in2, _in5); \
+ _out3 = __lasx_xvadd_b(_in3, _in4); \
+ _out4 = __lasx_xvsub_b(_in3, _in4); \
+ _out5 = __lasx_xvsub_b(_in2, _in5); \
+ _out6 = __lasx_xvsub_b(_in1, _in6); \
+ _out7 = __lasx_xvsub_b(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_h(_in0, _in7); \
+ _out1 = __lasx_xvadd_h(_in1, _in6); \
+ _out2 = __lasx_xvadd_h(_in2, _in5); \
+ _out3 = __lasx_xvadd_h(_in3, _in4); \
+ _out4 = __lasx_xvsub_h(_in3, _in4); \
+ _out5 = __lasx_xvsub_h(_in2, _in5); \
+ _out6 = __lasx_xvsub_h(_in1, _in6); \
+ _out7 = __lasx_xvsub_h(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_w(_in0, _in7); \
+ _out1 = __lasx_xvadd_w(_in1, _in6); \
+ _out2 = __lasx_xvadd_w(_in2, _in5); \
+ _out3 = __lasx_xvadd_w(_in3, _in4); \
+ _out4 = __lasx_xvsub_w(_in3, _in4); \
+ _out5 = __lasx_xvsub_w(_in2, _in5); \
+ _out6 = __lasx_xvsub_w(_in1, _in6); \
+ _out7 = __lasx_xvsub_w(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_d(_in0, _in7); \
+ _out1 = __lasx_xvadd_d(_in1, _in6); \
+ _out2 = __lasx_xvadd_d(_in2, _in5); \
+ _out3 = __lasx_xvadd_d(_in3, _in4); \
+ _out4 = __lasx_xvsub_d(_in3, _in4); \
+ _out5 = __lasx_xvsub_d(_in2, _in5); \
+ _out6 = __lasx_xvsub_d(_in1, _in6); \
+ _out7 = __lasx_xvsub_d(_in0, _in7); \
+ }
+
+#endif // LASX
+
+/*
+ * =============================================================================
+ * Description : Print out elements in vector.
+ * Arguments : Inputs - RTYPE, _element_num, _in0, _enter
+ * Outputs -
+ * Details : Print out '_element_num' elements in 'RTYPE' vector '_in0', if
+ * '_enter' is TRUE, prefix "\nVP:" will be added first.
+ * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
+ * VP:1,2,3,4,
+ * =============================================================================
+ */
+#define VECT_PRINT(RTYPE, element_num, in0, enter) \
+ { \
+ RTYPE _tmp0 = (RTYPE)in0; \
+ int _i = 0; \
+ if (enter) printf("\nVP:"); \
+ for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
+ }
+
+#endif /* LOONGSON_INTRINSICS_H */
+#endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */
diff --git a/media/libvpx/libvpx/vpx_util/vpx_atomics.h b/media/libvpx/libvpx/vpx_util/vpx_atomics.h
new file mode 100644
index 0000000000..23ad566851
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_atomics.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_UTIL_VPX_ATOMICS_H_
+#define VPX_VPX_UTIL_VPX_ATOMICS_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+// Look for built-in atomic support. We cannot use <stdatomic.h> or <atomic>
+// since neither is guaranteed to exist on both C and C++ platforms, and we need
+// to back the atomic type with the same type (g++ needs to be able to use
+// gcc-built code). g++ 6 doesn't support _Atomic as a keyword and can't use the
+// stdatomic.h header. Even if both <stdatomic.h> and <atomic> existed it's not
+// guaranteed that atomic_int is the same type as std::atomic_int.
+// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60932#c13.
+#if !defined(__has_builtin)
+#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
+#endif // !defined(__has_builtin)
+
+#if (__has_builtin(__atomic_load_n)) || \
+ (defined(__GNUC__) && \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
+// For GCC >= 4.7 and Clang versions that support __atomic builtins, use those.
+#define VPX_USE_ATOMIC_BUILTINS
+#else
+// Use platform-specific asm barriers.
+#if defined(_MSC_VER)
+// TODO(pbos): This assumes that newer versions of MSVC are building with the
+// default /volatile:ms (or older, where this is always true. Consider adding
+// support for using <atomic> instead of stdatomic.h when building C++11 under
+// MSVC. It's unclear what to do for plain C under /volatile:iso (inline asm?),
+// there're no explicit Interlocked* functions for only storing or loading
+// (presumably because volatile has historically implied that on MSVC).
+//
+// For earlier versions of MSVC or the default /volatile:ms volatile int are
+// acquire/release and require no barrier.
+#define vpx_atomic_memory_barrier() \
+ do { \
+ } while (0)
+#else
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
+// Use a compiler barrier on x86, no runtime penalty.
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("" ::: "memory")
+#elif VPX_ARCH_ARM
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("dmb ish" ::: "memory")
+#elif VPX_ARCH_MIPS
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("sync" ::: "memory")
+#else
+#error Unsupported architecture!
+#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64
+#endif // defined(_MSC_VER)
+#endif // atomic builtin availability check
+
+// These are wrapped in a struct so that they are not easily accessed directly
+// on any platform (to discourage programmer errors by setting values directly).
+// This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT
+// (NOT memset) and accessed through vpx_atomic_ functions.
+typedef struct vpx_atomic_int {
+ volatile int value;
+} vpx_atomic_int;
+
+#define VPX_ATOMIC_INIT(num) \
+ { num }
+
+// Initialization of an atomic int, not thread safe.
+static INLINE void vpx_atomic_init(vpx_atomic_int *atomic, int value) {
+ atomic->value = value;
+}
+
+static INLINE void vpx_atomic_store_release(vpx_atomic_int *atomic, int value) {
+#if defined(VPX_USE_ATOMIC_BUILTINS)
+ __atomic_store_n(&atomic->value, value, __ATOMIC_RELEASE);
+#else
+ vpx_atomic_memory_barrier();
+ atomic->value = value;
+#endif // defined(VPX_USE_ATOMIC_BUILTINS)
+}
+
+static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) {
+#if defined(VPX_USE_ATOMIC_BUILTINS)
+ return __atomic_load_n(&atomic->value, __ATOMIC_ACQUIRE);
+#else
+ int v = atomic->value;
+ vpx_atomic_memory_barrier();
+ return v;
+#endif // defined(VPX_USE_ATOMIC_BUILTINS)
+}
+
+#undef VPX_USE_ATOMIC_BUILTINS
+#undef vpx_atomic_memory_barrier
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // VPX_VPX_UTIL_VPX_ATOMICS_H_
diff --git a/media/libvpx/libvpx/vpx_util/vpx_debug_util.c b/media/libvpx/libvpx/vpx_util/vpx_debug_util.c
new file mode 100644
index 0000000000..3ce4065ba5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_debug_util.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include "vpx_util/vpx_debug_util.h"
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+static int frame_idx_w = 0;
+static int frame_idx_r = 0;
+
+void bitstream_queue_set_frame_write(int frame_idx) { frame_idx_w = frame_idx; }
+
+int bitstream_queue_get_frame_write(void) { return frame_idx_w; }
+
+void bitstream_queue_set_frame_read(int frame_idx) { frame_idx_r = frame_idx; }
+
+int bitstream_queue_get_frame_read(void) { return frame_idx_r; }
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+#define QUEUE_MAX_SIZE 2000000
+static int result_queue[QUEUE_MAX_SIZE];
+static int prob_queue[QUEUE_MAX_SIZE];
+
+static int queue_r = 0;
+static int queue_w = 0;
+static int queue_prev_w = -1;
+static int skip_r = 0;
+static int skip_w = 0;
+void bitstream_queue_set_skip_write(int skip) { skip_w = skip; }
+
+void bitstream_queue_set_skip_read(int skip) { skip_r = skip; }
+
+void bitstream_queue_record_write(void) { queue_prev_w = queue_w; }
+
+void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; }
+
+int bitstream_queue_get_write(void) { return queue_w; }
+
+int bitstream_queue_get_read(void) { return queue_r; }
+
+void bitstream_queue_pop(int *result, int *prob) {
+ if (!skip_r) {
+ if (queue_w == queue_r) {
+ printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r);
+ assert(0);
+ }
+ *result = result_queue[queue_r];
+ *prob = prob_queue[queue_r];
+ queue_r = (queue_r + 1) % QUEUE_MAX_SIZE;
+ }
+}
+
+void bitstream_queue_push(int result, const int prob) {
+ if (!skip_w) {
+ result_queue[queue_w] = result;
+ prob_queue[queue_w] = prob;
+ queue_w = (queue_w + 1) % QUEUE_MAX_SIZE;
+ if (queue_w == queue_r) {
+ printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r);
+ assert(0);
+ }
+ }
+}
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+static int frame_buf_idx_r = 0;
+static int frame_buf_idx_w = 0;
+#define MAX_FRAME_BUF_NUM 20
+#define MAX_FRAME_STRIDE 1920
+#define MAX_FRAME_HEIGHT 1080
+static uint16_t
+ frame_pre[MAX_FRAME_BUF_NUM][3]
+ [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only
+static uint16_t
+ frame_tx[MAX_FRAME_BUF_NUM][3]
+ [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm
+static int frame_stride = MAX_FRAME_STRIDE;
+static int frame_height = MAX_FRAME_HEIGHT;
+static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT;
+void mismatch_move_frame_idx_w(void) {
+ frame_buf_idx_w = (frame_buf_idx_w + 1) % MAX_FRAME_BUF_NUM;
+ if (frame_buf_idx_w == frame_buf_idx_r) {
+ printf("frame_buf overflow\n");
+ assert(0);
+ }
+}
+
+void mismatch_reset_frame(int num_planes) {
+ int plane;
+ for (plane = 0; plane < num_planes; ++plane) {
+ memset(frame_pre[frame_buf_idx_w][plane], 0,
+ sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size);
+ memset(frame_tx[frame_buf_idx_w][plane], 0,
+ sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size);
+ }
+}
+
+void mismatch_move_frame_idx_r(void) {
+ if (frame_buf_idx_w == frame_buf_idx_r) {
+ printf("frame_buf underflow\n");
+ assert(0);
+ }
+ frame_buf_idx_r = (frame_buf_idx_r + 1) % MAX_FRAME_BUF_NUM;
+}
+
+void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane,
+ int pixel_c, int pixel_r, int blk_w, int blk_h,
+ int highbd) {
+ const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+ int r, c;
+
+ if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+ printf("frame_buf undersized\n");
+ assert(0);
+ }
+
+ for (r = 0; r < blk_h; ++r) {
+ for (c = 0; c < blk_w; ++c) {
+ frame_pre[frame_buf_idx_w][plane]
+ [(r + pixel_r) * frame_stride + c + pixel_c] =
+ src16 ? src16[r * src_stride + c] : src[r * src_stride + c];
+ }
+ }
+#if 0
+ {
+ int ref_frame_idx = 3;
+ int ref_plane = 1;
+ int ref_pixel_c = 162;
+ int ref_pixel_r = 16;
+ if (frame_idx_w == ref_frame_idx && plane == ref_plane &&
+ ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w &&
+ ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) {
+ printf(
+ "\nrecord_block_pre frame_idx %d plane %d pixel_c %d pixel_r %d blk_w"
+ " %d blk_h %d\n",
+ frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h);
+ }
+ }
+#endif
+}
+void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane,
+ int pixel_c, int pixel_r, int blk_w, int blk_h,
+ int highbd) {
+ const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+ int r, c;
+ if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+ printf("frame_buf undersized\n");
+ assert(0);
+ }
+
+ for (r = 0; r < blk_h; ++r) {
+ for (c = 0; c < blk_w; ++c) {
+ frame_tx[frame_buf_idx_w][plane]
+ [(r + pixel_r) * frame_stride + c + pixel_c] =
+ src16 ? src16[r * src_stride + c] : src[r * src_stride + c];
+ }
+ }
+#if 0
+ {
+ int ref_frame_idx = 3;
+ int ref_plane = 1;
+ int ref_pixel_c = 162;
+ int ref_pixel_r = 16;
+ if (frame_idx_w == ref_frame_idx && plane == ref_plane &&
+ ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w &&
+ ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) {
+ printf(
+ "\nrecord_block_tx frame_idx %d plane %d pixel_c %d pixel_r %d blk_w "
+ "%d blk_h %d\n",
+ frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h);
+ }
+ }
+#endif
+}
+void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane,
+ int pixel_c, int pixel_r, int blk_w, int blk_h,
+ int highbd) {
+ const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+ int mismatch = 0;
+ int r, c;
+ if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+ printf("frame_buf undersized\n");
+ assert(0);
+ }
+
+ for (r = 0; r < blk_h; ++r) {
+ for (c = 0; c < blk_w; ++c) {
+ if (frame_pre[frame_buf_idx_r][plane]
+ [(r + pixel_r) * frame_stride + c + pixel_c] !=
+ (uint16_t)(src16 ? src16[r * src_stride + c]
+ : src[r * src_stride + c])) {
+ mismatch = 1;
+ }
+ }
+ }
+ if (mismatch) {
+ int rr, cc;
+ printf(
+ "\ncheck_block_pre failed frame_idx %d plane %d "
+ "pixel_c %d pixel_r "
+ "%d blk_w %d blk_h %d\n",
+ frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h);
+ printf("enc\n");
+ for (rr = 0; rr < blk_h; ++rr) {
+ for (cc = 0; cc < blk_w; ++cc) {
+ printf("%d ", frame_pre[frame_buf_idx_r][plane]
+ [(rr + pixel_r) * frame_stride + cc + pixel_c]);
+ }
+ printf("\n");
+ }
+
+ printf("dec\n");
+ for (rr = 0; rr < blk_h; ++rr) {
+ for (cc = 0; cc < blk_w; ++cc) {
+ printf("%d ",
+ src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+ }
+ printf("\n");
+ }
+ assert(0);
+ }
+}
+void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane,
+ int pixel_c, int pixel_r, int blk_w, int blk_h,
+ int highbd) {
+ const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+ int mismatch = 0;
+ int r, c;
+ if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+ printf("frame_buf undersized\n");
+ assert(0);
+ }
+
+ for (r = 0; r < blk_h; ++r) {
+ for (c = 0; c < blk_w; ++c) {
+ if (frame_tx[frame_buf_idx_r][plane]
+ [(r + pixel_r) * frame_stride + c + pixel_c] !=
+ (uint16_t)(src16 ? src16[r * src_stride + c]
+ : src[r * src_stride + c])) {
+ mismatch = 1;
+ }
+ }
+ }
+ if (mismatch) {
+ int rr, cc;
+ printf(
+ "\ncheck_block_tx failed frame_idx %d plane %d pixel_c "
+ "%d pixel_r "
+ "%d blk_w %d blk_h %d\n",
+ frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h);
+ printf("enc\n");
+ for (rr = 0; rr < blk_h; ++rr) {
+ for (cc = 0; cc < blk_w; ++cc) {
+ printf("%d ", frame_tx[frame_buf_idx_r][plane]
+ [(rr + pixel_r) * frame_stride + cc + pixel_c]);
+ }
+ printf("\n");
+ }
+
+ printf("dec\n");
+ for (rr = 0; rr < blk_h; ++rr) {
+ for (cc = 0; cc < blk_w; ++cc) {
+ printf("%d ",
+ src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+ }
+ printf("\n");
+ }
+ assert(0);
+ }
+}
+#endif // CONFIG_MISMATCH_DEBUG
diff --git a/media/libvpx/libvpx/vpx_util/vpx_debug_util.h b/media/libvpx/libvpx/vpx_util/vpx_debug_util.h
new file mode 100644
index 0000000000..df1a1aab2c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_debug_util.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_
+#define VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+void bitstream_queue_set_frame_write(int frame_idx);
+int bitstream_queue_get_frame_write(void);
+void bitstream_queue_set_frame_read(int frame_idx);
+int bitstream_queue_get_frame_read(void);
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+/* This is a debug tool used to detect bitstream error. On encoder side, it
+ * pushes each bit and probability into a queue before the bit is written into
+ * the Arithmetic coder. On decoder side, whenever a bit is read out from the
+ * Arithmetic coder, it pops out the reference bit and probability from the
+ * queue as well. If the two results do not match, this debug tool will report
+ * an error. This tool can be used to pin down the bitstream error precisely.
+ * By combining gdb's backtrace method, we can detect which module causes the
+ * bitstream error. */
+int bitstream_queue_get_write(void);
+int bitstream_queue_get_read(void);
+void bitstream_queue_record_write(void);
+void bitstream_queue_reset_write(void);
+void bitstream_queue_pop(int *result, int *prob);
+void bitstream_queue_push(int result, const int prob);
+void bitstream_queue_set_skip_write(int skip);
+void bitstream_queue_set_skip_read(int skip);
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+void mismatch_move_frame_idx_w(void);
+void mismatch_move_frame_idx_r(void);
+void mismatch_reset_frame(int num_planes);
+void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane,
+ int pixel_c, int pixel_r, int blk_w, int blk_h,
+ int highbd);
+void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane,
+ int pixel_c, int pixel_r, int blk_w, int blk_h,
+ int highbd);
+void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane,
+ int pixel_c, int pixel_r, int blk_w, int blk_h,
+ int highbd);
+void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane,
+ int pixel_c, int pixel_r, int blk_w, int blk_h,
+ int highbd);
+#endif // CONFIG_MISMATCH_DEBUG
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.c b/media/libvpx/libvpx/vpx_util/vpx_thread.c
new file mode 100644
index 0000000000..04c5fb6f26
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.c
@@ -0,0 +1,181 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Original source:
+// https://chromium.googlesource.com/webm/libwebp
+
+#include <assert.h>
+#include <string.h> // for memset()
+#include "./vpx_thread.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if CONFIG_MULTITHREAD
+
+struct VPxWorkerImpl {
+ pthread_mutex_t mutex_;
+ pthread_cond_t condition_;
+ pthread_t thread_;
+};
+
+//------------------------------------------------------------------------------
+
+static void execute(VPxWorker *const worker); // Forward declaration.
+
+static THREADFN thread_loop(void *ptr) {
+ VPxWorker *const worker = (VPxWorker *)ptr;
+ int done = 0;
+ while (!done) {
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ while (worker->status_ == OK) { // wait in idling mode
+ pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+ }
+ if (worker->status_ == WORK) {
+ execute(worker);
+ worker->status_ = OK;
+ } else if (worker->status_ == NOT_OK) { // finish the worker
+ done = 1;
+ }
+ // signal to the main thread that we're done (for sync())
+ pthread_cond_signal(&worker->impl_->condition_);
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ }
+ return THREAD_RETURN(NULL); // Thread is finished
+}
+
+// main thread state control
+static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
+ // No-op when attempting to change state on a thread that didn't come up.
+ // Checking status_ without acquiring the lock first would result in a data
+ // race.
+ if (worker->impl_ == NULL) return;
+
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ if (worker->status_ >= OK) {
+ // wait for the worker to finish
+ while (worker->status_ != OK) {
+ pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+ }
+ // assign new status and release the working thread if needed
+ if (new_status != OK) {
+ worker->status_ = new_status;
+ pthread_cond_signal(&worker->impl_->condition_);
+ }
+ }
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+}
+
+#endif // CONFIG_MULTITHREAD
+
+//------------------------------------------------------------------------------
+
+static void init(VPxWorker *const worker) {
+ memset(worker, 0, sizeof(*worker));
+ worker->status_ = NOT_OK;
+}
+
+static int sync(VPxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ change_state(worker, OK);
+#endif
+ assert(worker->status_ <= OK);
+ return !worker->had_error;
+}
+
+static int reset(VPxWorker *const worker) {
+ int ok = 1;
+ worker->had_error = 0;
+ if (worker->status_ < OK) {
+#if CONFIG_MULTITHREAD
+ worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_));
+ if (worker->impl_ == NULL) {
+ return 0;
+ }
+ if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+ goto Error;
+ }
+ if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ goto Error;
+ }
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
+ if (ok) worker->status_ = OK;
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ if (!ok) {
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ pthread_cond_destroy(&worker->impl_->condition_);
+ Error:
+ vpx_free(worker->impl_);
+ worker->impl_ = NULL;
+ return 0;
+ }
+#else
+ worker->status_ = OK;
+#endif
+ } else if (worker->status_ > OK) {
+ ok = sync(worker);
+ }
+ assert(!ok || (worker->status_ == OK));
+ return ok;
+}
+
+static void execute(VPxWorker *const worker) {
+ if (worker->hook != NULL) {
+ worker->had_error |= !worker->hook(worker->data1, worker->data2);
+ }
+}
+
+static void launch(VPxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ change_state(worker, WORK);
+#else
+ execute(worker);
+#endif
+}
+
+static void end(VPxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ if (worker->impl_ != NULL) {
+ change_state(worker, NOT_OK);
+ pthread_join(worker->impl_->thread_, NULL);
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ pthread_cond_destroy(&worker->impl_->condition_);
+ vpx_free(worker->impl_);
+ worker->impl_ = NULL;
+ }
+#else
+ worker->status_ = NOT_OK;
+ assert(worker->impl_ == NULL);
+#endif
+ assert(worker->status_ == NOT_OK);
+}
+
+//------------------------------------------------------------------------------
+
+static VPxWorkerInterface g_worker_interface = { init, reset, sync,
+ launch, execute, end };
+
+int vpx_set_worker_interface(const VPxWorkerInterface *const winterface) {
+ if (winterface == NULL || winterface->init == NULL ||
+ winterface->reset == NULL || winterface->sync == NULL ||
+ winterface->launch == NULL || winterface->execute == NULL ||
+ winterface->end == NULL) {
+ return 0;
+ }
+ g_worker_interface = *winterface;
+ return 1;
+}
+
+const VPxWorkerInterface *vpx_get_worker_interface(void) {
+ return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.h b/media/libvpx/libvpx/vpx_util/vpx_thread.h
new file mode 100644
index 0000000000..6d308e949b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.h
@@ -0,0 +1,438 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Original source:
+// https://chromium.googlesource.com/webm/libwebp
+
+#ifndef VPX_VPX_UTIL_VPX_THREAD_H_
+#define VPX_VPX_UTIL_VPX_THREAD_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Set maximum decode threads to be 8 due to the limit of frame buffers
+// and not enough semaphores in the emulation layer on windows.
+#define MAX_DECODE_THREADS 8
+
+#if CONFIG_MULTITHREAD
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+#include <errno.h> // NOLINT
+#include <process.h> // NOLINT
+#include <windows.h> // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+
+#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater
+#define USE_WINDOWS_CONDITION_VARIABLE
+typedef CONDITION_VARIABLE pthread_cond_t;
+#else
+typedef struct {
+ HANDLE waiting_sem_;
+ HANDLE received_sem_;
+ HANDLE signal_event_;
+} pthread_cond_t;
+#endif // _WIN32_WINNT >= 0x600
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif
+
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#if defined(__GNUC__) && \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
+#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
+#else
+#define THREADFN unsigned int __stdcall
+#endif
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+#if _WIN32_WINNT >= 0x0501 // Windows XP or greater
+#define WaitForSingleObject(obj, timeout) \
+ WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
+#endif
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+ unsigned int(__stdcall *start)(void *),
+ void *arg) {
+ (void)attr;
+#ifdef USE_CREATE_THREAD
+ *thread = CreateThread(NULL, /* lpThreadAttributes */
+ 0, /* dwStackSize */
+ start, arg, 0, /* dwStackSize */
+ NULL); /* lpThreadId */
+#else
+ *thread = (pthread_t)_beginthreadex(NULL, /* void *security */
+ 0, /* unsigned stack_size */
+ start, arg, 0, /* unsigned initflag */
+ NULL); /* unsigned *thrdaddr */
+#endif
+ if (*thread == NULL) return 1;
+ SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+ return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+ (void)value_ptr;
+ return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
+ CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+ void *mutexattr) {
+ (void)mutexattr;
+#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater
+ InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
+#else
+ InitializeCriticalSection(mutex);
+#endif
+ return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+ return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ EnterCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ LeaveCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ DeleteCriticalSection(mutex);
+ return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+ int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+ (void)condition;
+#else
+ ok &= (CloseHandle(condition->waiting_sem_) != 0);
+ ok &= (CloseHandle(condition->received_sem_) != 0);
+ ok &= (CloseHandle(condition->signal_event_) != 0);
+#endif
+ return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+ void *cond_attr) {
+ (void)cond_attr;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+ InitializeConditionVariable(condition);
+#else
+ condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+ condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+ condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
+ if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL ||
+ condition->signal_event_ == NULL) {
+ pthread_cond_destroy(condition);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+ int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+ WakeAllConditionVariable(condition);
+#else
+ while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+ // a thread is waiting in pthread_cond_wait: allow it to be notified
+ ok &= SetEvent(condition->signal_event_);
+ // wait until the event is consumed so the signaler cannot consume
+ // the event via its own pthread_cond_wait.
+ ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
+ WAIT_OBJECT_0);
+ }
+#endif
+ return !ok;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+ int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+ WakeConditionVariable(condition);
+#else
+ if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+ // a thread is waiting in pthread_cond_wait: allow it to be notified
+ ok = SetEvent(condition->signal_event_);
+ // wait until the event is consumed so the signaler cannot consume
+ // the event via its own pthread_cond_wait.
+ ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
+ WAIT_OBJECT_0);
+ }
+#endif
+ return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+ pthread_mutex_t *const mutex) {
+ int ok;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+ ok = SleepConditionVariableCS(condition, mutex, INFINITE);
+#else
+ // note that there is a consumer available so the signal isn't dropped in
+ // pthread_cond_signal
+ if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
+ // now unlock the mutex so pthread_cond_signal may be issued
+ pthread_mutex_unlock(mutex);
+ ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
+ WAIT_OBJECT_0);
+ ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
+ pthread_mutex_lock(mutex);
+#endif
+ return !ok;
+}
+
+#elif defined(__OS2__)
+#define INCL_DOS
+#include <os2.h> // NOLINT
+
+#include <errno.h> // NOLINT
+#include <stdlib.h> // NOLINT
+#include <sys/builtin.h> // NOLINT
+
+#if defined(__STRICT_ANSI__)
+// _beginthread() is not declared on __STRICT_ANSI__ mode. Declare here.
+int _beginthread(void (*)(void *), void *, unsigned, void *);
+#endif
+
+#define pthread_t TID
+#define pthread_mutex_t HMTX
+
+typedef struct {
+ HEV event_sem_;
+ HEV ack_sem_;
+ volatile unsigned wait_count_;
+} pthread_cond_t;
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+#define THREADFN void *
+#define THREAD_RETURN(val) (val)
+
+typedef struct {
+ void *(*start_)(void *);
+ void *arg_;
+} thread_arg;
+
+static void thread_start(void *arg) {
+ thread_arg targ = *(thread_arg *)arg;
+ free(arg);
+
+ targ.start_(targ.arg_);
+}
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+ void *(*start)(void *), void *arg) {
+ int tid;
+ thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
+ if (targ == NULL) return 1;
+
+ (void)attr;
+
+ targ->start_ = start;
+ targ->arg_ = arg;
+ tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
+ if (tid == -1) {
+ free(targ);
+ return 1;
+ }
+
+ *thread = tid;
+ return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+ (void)value_ptr;
+ return DosWaitThread(&thread, DCWW_WAIT) != 0;
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+ void *mutexattr) {
+ (void)mutexattr;
+ return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+ return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ return DosReleaseMutexSem(*mutex) != 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ return DosCloseMutexSem(*mutex) != 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+ int ok = 1;
+ ok &= DosCloseEventSem(condition->event_sem_) == 0;
+ ok &= DosCloseEventSem(condition->ack_sem_) == 0;
+ return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+ void *cond_attr) {
+ int ok = 1;
+ (void)cond_attr;
+
+ ok &=
+ DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0;
+ ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
+ if (!ok) {
+ pthread_cond_destroy(condition);
+ return 1;
+ }
+ condition->wait_count_ = 0;
+ return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+ int ok = 1;
+
+ if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
+ ok &= DosPostEventSem(condition->event_sem_) == 0;
+ ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
+ }
+
+ return !ok;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+ int ok = 1;
+
+ while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
+ ok &= pthread_cond_signal(condition) == 0;
+
+ return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+ pthread_mutex_t *const mutex) {
+ int ok = 1;
+
+ __atomic_increment(&condition->wait_count_);
+
+ ok &= pthread_mutex_unlock(mutex) == 0;
+
+ ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
+
+ __atomic_decrement(&condition->wait_count_);
+
+ ok &= DosPostEventSem(condition->ack_sem_) == 0;
+
+ pthread_mutex_lock(mutex);
+
+ return !ok;
+}
+#else // _WIN32
+#include <pthread.h> // NOLINT
+#define THREADFN void *
+#define THREAD_RETURN(val) val
+#endif
+
+#endif // CONFIG_MULTITHREAD
+
+// State of the worker thread object
+typedef enum {
+ NOT_OK = 0, // object is unusable
+ OK, // ready to work
+ WORK // busy finishing the current task
+} VPxWorkerStatus;
+
+// Function to be called by the worker thread. Takes two opaque pointers as
+// arguments (data1 and data2), and should return false in case of error.
+typedef int (*VPxWorkerHook)(void *, void *);
+
+// Platform-dependent implementation details for the worker.
+typedef struct VPxWorkerImpl VPxWorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
+typedef struct {
+ VPxWorkerImpl *impl_;
+ VPxWorkerStatus status_;
+ VPxWorkerHook hook; // hook to call
+ void *data1; // first argument passed to 'hook'
+ void *data2; // second argument passed to 'hook'
+ int had_error; // return value of the last call to 'hook'
+} VPxWorker;
+
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+ // Must be called first, before any other method.
+ void (*init)(VPxWorker *const worker);
+ // Must be called to initialize the object and spawn the thread. Re-entrant.
+ // Will potentially launch the thread. Returns false in case of error.
+ int (*reset)(VPxWorker *const worker);
+ // Makes sure the previous work is finished. Returns true if worker->had_error
+ // was not set and no error condition was triggered by the working thread.
+ int (*sync)(VPxWorker *const worker);
+ // Triggers the thread to call hook() with data1 and data2 arguments. These
+ // hook/data1/data2 values can be changed at any time before calling this
+ // function, but not be changed afterward until the next call to Sync().
+ void (*launch)(VPxWorker *const worker);
+ // This function is similar to launch() except that it calls the
+ // hook directly instead of using a thread. Convenient to bypass the thread
+ // mechanism while still using the VPxWorker structs. sync() must
+ // still be called afterward (for error reporting).
+ void (*execute)(VPxWorker *const worker);
+ // Kill the thread and terminate the object. To use the object again, one
+ // must call reset() again.
+ void (*end)(VPxWorker *const worker);
+} VPxWorkerInterface;
+
+// Install a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied, it
+// is safe to free the corresponding memory after this call. This function is
+// not thread-safe. Return false in case of invalid pointer or methods.
+int vpx_set_worker_interface(const VPxWorkerInterface *const winterface);
+
+// Retrieve the currently set thread worker interface.
+const VPxWorkerInterface *vpx_get_worker_interface(void);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_UTIL_VPX_THREAD_H_
diff --git a/media/libvpx/libvpx/vpx_util/vpx_timestamp.h b/media/libvpx/libvpx/vpx_util/vpx_timestamp.h
new file mode 100644
index 0000000000..5296458fad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_timestamp.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_UTIL_VPX_TIMESTAMP_H_
+#define VPX_VPX_UTIL_VPX_TIMESTAMP_H_
+
+#include <assert.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Rational Number with an int64 numerator
+typedef struct vpx_rational64 {
+ int64_t num; // fraction numerator
+ int den; // fraction denominator
+} vpx_rational64_t; // alias for struct vpx_rational64_t
+
+static INLINE int gcd(int64_t a, int b) {
+ int r; // remainder
+ assert(a >= 0);
+ assert(b > 0);
+ while (b != 0) {
+ r = (int)(a % b);
+ a = b;
+ b = r;
+ }
+
+ return (int)a;
+}
+
+static INLINE void reduce_ratio(vpx_rational64_t *ratio) {
+ const int denom = gcd(ratio->num, ratio->den);
+ ratio->num /= denom;
+ ratio->den /= denom;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // VPX_VPX_UTIL_VPX_TIMESTAMP_H_
diff --git a/media/libvpx/libvpx/vpx_util/vpx_util.mk b/media/libvpx/libvpx/vpx_util/vpx_util.mk
new file mode 100644
index 0000000000..1162714956
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_util.mk
@@ -0,0 +1,20 @@
+##
+## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+UTIL_SRCS-yes += vpx_atomics.h
+UTIL_SRCS-yes += vpx_util.mk
+UTIL_SRCS-yes += vpx_thread.c
+UTIL_SRCS-yes += vpx_thread.h
+UTIL_SRCS-yes += endian_inl.h
+UTIL_SRCS-yes += vpx_write_yuv_frame.h
+UTIL_SRCS-yes += vpx_write_yuv_frame.c
+UTIL_SRCS-yes += vpx_timestamp.h
+UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.h
+UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.c
diff --git a/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c
new file mode 100644
index 0000000000..4ef57a2fee
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/skin_detection.h"
+#include "vpx_util/vpx_write_yuv_frame.h"
+
+void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
+#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \
+ defined(OUTPUT_YUV_SKINMAP) || defined(OUTPUT_YUV_SVC_SRC)
+
+ unsigned char *src = s->y_buffer;
+ int h = s->y_crop_height;
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_crop_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_crop_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+
+#else
+ (void)yuv_file;
+ (void)s;
+#endif
+}
diff --git a/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h
new file mode 100644
index 0000000000..ce1102458e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_
+#define VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_
+
+#include <stdio.h>
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_