/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <altivec.h>

#include "config/av1_rtcd.h"

#include "av1/common/cfl.h"

// Byte offsets used by the VSX loads and stores below. One row of the CfL
// buffer holds CFL_BUF_LINE int16_t values (CFL_BUF_LINE_BYTES = 64 bytes),
// so OFF_0..OFF_3 select the four 16-byte vectors within a row and
// CFL_LINE_1..CFL_LINE_3 step one, two and three rows ahead.
#define OFF_0 0
#define OFF_1 16
#define OFF_2 32
#define OFF_3 48
#define CFL_BUF_LINE_BYTES 64
#define CFL_LINE_1 64
#define CFL_LINE_2 128
#define CFL_LINE_3 192

typedef vector signed char int8x16_t;          // NOLINT(runtime/int)
typedef vector unsigned char uint8x16_t;       // NOLINT(runtime/int)
typedef vector signed short int16x8_t;         // NOLINT(runtime/int)
typedef vector unsigned short uint16x8_t;      // NOLINT(runtime/int)
typedef vector signed int int32x4_t;           // NOLINT(runtime/int)
typedef vector unsigned int uint32x4_t;        // NOLINT(runtime/int)
typedef vector unsigned long long uint64x2_t;  // NOLINT(runtime/int)

// Computes the rounded average of the width x height block of 16-bit values
// at src_ptr (CFL_BUF_LINE entries per buffer row) and subtracts it from the
// corresponding block at dst, leaving the zero-mean AC values used for CfL
// prediction. round_offset must equal (width * height) / 2 and num_pel_log2
// must equal log2(width * height).
static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
                                        int width, int height, int round_offset,
                                        int num_pel_log2) {
  const int16_t *dst_end = dst + height * CFL_BUF_LINE;
  const int16_t *sum_buf = (const int16_t *)src_ptr;
  const int16_t *end = sum_buf + height * CFL_BUF_LINE;
  const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
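  // mask_64 and mask_32 are vec_perm control vectors used after the summation
  // loop to fold the four 32-bit partial sums into a single total in every
  // lane.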
  const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
  const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
                               0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };

  int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
  int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
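  // Accumulate the 16-bit values into 32-bit lanes, two buffer rows per
  // iteration (the second row goes through sum_32x4_1). round_offset was
  // preloaded into one lane of sum_32x4_0 so that the final shift rounds to
  // nearest.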
  do {
    sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
    sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
    if (width >= 16) {
      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
      sum_32x4_1 =
          vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
    }
    if (width == 32) {
      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
      sum_32x4_1 =
          vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
      sum_32x4_1 =
          vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1);
    }
  } while ((sum_buf += (CFL_BUF_LINE * 2)) < end);
  int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1);

  // Fold the four partial sums: swap the 64-bit halves, then the 32-bit words
  // within each half, adding after each step so every lane ends up holding
  // the full sum (including round_offset).
  const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64);
  sum_32x4 = vec_add(sum_32x4, perm_64);
  const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32);
  sum_32x4 = vec_add(sum_32x4, perm_32);
  // Rounded average: (sum + num_pel / 2) >> num_pel_log2, packed into all
  // eight 16-bit lanes of vec_avg.
  const int32x4_t avg = vec_sr(sum_32x4, div_shift);
  const int16x8_t vec_avg = vec_pack(avg, avg);
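  // Subtract the broadcast average from dst in place, four buffer rows per
  // iteration; the OFF_1/OFF_2/OFF_3 columns cover block widths 16 and 32.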
  do {
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst);
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg),
               OFF_0 + CFL_LINE_1, dst);
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg),
               OFF_0 + CFL_LINE_2, dst);
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg),
               OFF_0 + CFL_LINE_3, dst);
    if (width >= 16) {
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg),
                 OFF_1 + CFL_LINE_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg),
                 OFF_1 + CFL_LINE_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg),
                 OFF_1 + CFL_LINE_3, dst);
    }
    if (width == 32) {
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg),
                 OFF_2 + CFL_LINE_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg),
                 OFF_2 + CFL_LINE_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg),
                 OFF_2 + CFL_LINE_3, dst);

      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg),
                 OFF_3 + CFL_LINE_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg),
                 OFF_3 + CFL_LINE_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg),
                 OFF_3 + CFL_LINE_3, dst);
    }
  } while ((dst += CFL_BUF_LINE * 4) < dst_end);
}

// Define the subtract_average wrappers for the block sizes handled with VSX.
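// Each CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2)
// invocation (the macro comes from av1/common/cfl.h) defines a
// subtract_average_<width>x<height>_<arch> wrapper around
// subtract_average_vsx; round_offset is (width * height) / 2 and num_pel_log2
// is log2(width * height), giving a rounded division by the pixel count.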
CFL_SUB_AVG_X(vsx, 8, 4, 16, 5)
CFL_SUB_AVG_X(vsx, 8, 8, 32, 6)
CFL_SUB_AVG_X(vsx, 8, 16, 64, 7)
CFL_SUB_AVG_X(vsx, 8, 32, 128, 8)
CFL_SUB_AVG_X(vsx, 16, 4, 32, 6)
CFL_SUB_AVG_X(vsx, 16, 8, 64, 7)
CFL_SUB_AVG_X(vsx, 16, 16, 128, 8)
CFL_SUB_AVG_X(vsx, 16, 32, 256, 9)
CFL_SUB_AVG_X(vsx, 32, 8, 128, 8)
CFL_SUB_AVG_X(vsx, 32, 16, 256, 9)
CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)

// Based on observation, for small blocks VSX does not outperform the C code
// (there are no 64-bit load and store intrinsics), so blocks of width 4 fall
// back to the C implementations.
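// The table below is indexed by TX_SIZE in TX_SIZES_ALL order; transform
// sizes with a 64-pixel dimension are not valid for CfL, so they map to
// cfl_subtract_average_null.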
cfl_subtract_average_fn get_subtract_average_fn_vsx(TX_SIZE tx_size) {
  static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
    subtract_average_4x4_c,     /* 4x4 */
    subtract_average_8x8_vsx,   /* 8x8 */
    subtract_average_16x16_vsx, /* 16x16 */
    subtract_average_32x32_vsx, /* 32x32 */
    cfl_subtract_average_null,  /* 64x64 (invalid CFL size) */
    subtract_average_4x8_c,     /* 4x8 */
    subtract_average_8x4_vsx,   /* 8x4 */
    subtract_average_8x16_vsx,  /* 8x16 */
    subtract_average_16x8_vsx,  /* 16x8 */
    subtract_average_16x32_vsx, /* 16x32 */
    subtract_average_32x16_vsx, /* 32x16 */
    cfl_subtract_average_null,  /* 32x64 (invalid CFL size) */
    cfl_subtract_average_null,  /* 64x32 (invalid CFL size) */
    subtract_average_4x16_c,    /* 4x16 */
    subtract_average_16x4_vsx,  /* 16x4 */
    subtract_average_8x32_vsx,  /* 8x32 */
    subtract_average_32x8_vsx,  /* 32x8 */
    cfl_subtract_average_null,  /* 16x64 (invalid CFL size) */
    cfl_subtract_average_null,  /* 64x16 (invalid CFL size) */
  };
  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
  // index the function pointer array out of bounds.
  return sub_avg[tx_size % TX_SIZES_ALL];
}