1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
|
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/jxl/enc_entropy_coder.h"
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <utility>
#include <vector>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/jxl/enc_entropy_coder.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>
#include "lib/jxl/ac_context.h"
#include "lib/jxl/ac_strategy.h"
#include "lib/jxl/base/bits.h"
#include "lib/jxl/base/compiler_specific.h"
#include "lib/jxl/base/profiler.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/coeff_order.h"
#include "lib/jxl/coeff_order_fwd.h"
#include "lib/jxl/common.h"
#include "lib/jxl/dec_ans.h"
#include "lib/jxl/dec_bit_reader.h"
#include "lib/jxl/dec_context_map.h"
#include "lib/jxl/entropy_coder.h"
#include "lib/jxl/epf.h"
#include "lib/jxl/image.h"
#include "lib/jxl/image_ops.h"
HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {
// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::AndNot;
using hwy::HWY_NAMESPACE::Eq;
using hwy::HWY_NAMESPACE::GetLane;
// Returns number of non-zero coefficients (but skip LLF).
// We cannot rely on block[] being all-zero bits, so first truncate to integer.
// Also writes the per-8x8 block nzeros starting at nzeros_pos.
int32_t NumNonZeroExceptLLF(const size_t cx, const size_t cy,
const AcStrategy acs, const size_t covered_blocks,
const size_t log2_covered_blocks,
const int32_t* JXL_RESTRICT block,
const size_t nzeros_stride,
int32_t* JXL_RESTRICT nzeros_pos) {
const HWY_CAPPED(int32_t, kBlockDim) di;
const auto zero = Zero(di);
// Add FF..FF for every zero coefficient, negate to get #zeros.
auto neg_sum_zero = zero;
{
// Mask sufficient for one row of coefficients.
HWY_ALIGN const int32_t
llf_mask_lanes[AcStrategy::kMaxCoeffBlocks * (1 + kBlockDim)] = {
-1, -1, -1, -1};
// First cx=1,2,4 elements are FF..FF, others 0.
const int32_t* llf_mask_pos =
llf_mask_lanes + AcStrategy::kMaxCoeffBlocks - cx;
// Rows with LLF: mask out the LLF
for (size_t y = 0; y < cy; y++) {
for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) {
const auto llf_mask = LoadU(di, llf_mask_pos + x);
// LLF counts as zero so we don't include it in nzeros.
const auto coef =
AndNot(llf_mask, Load(di, &block[y * cx * kBlockDim + x]));
neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
}
}
}
// Remaining rows: no mask
for (size_t y = cy; y < cy * kBlockDim; y++) {
for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) {
const auto coef = Load(di, &block[y * cx * kBlockDim + x]);
neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
}
}
// We want area - sum_zero, add because neg_sum_zero is already negated.
const int32_t nzeros =
int32_t(cx * cy * kDCTBlockSize) + GetLane(SumOfLanes(di, neg_sum_zero));
const int32_t shifted_nzeros = static_cast<int32_t>(
(nzeros + covered_blocks - 1) >> log2_covered_blocks);
// Need non-canonicalized dimensions!
for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
nzeros_pos[x + y * nzeros_stride] = shifted_nzeros;
}
}
return nzeros;
}
// Specialization for 8x8, where only top-left is LLF/DC.
// About 1% overall speedup vs. NumNonZeroExceptLLF.
int32_t NumNonZero8x8ExceptDC(const int32_t* JXL_RESTRICT block,
int32_t* JXL_RESTRICT nzeros_pos) {
const HWY_CAPPED(int32_t, kBlockDim) di;
const auto zero = Zero(di);
// Add FF..FF for every zero coefficient, negate to get #zeros.
auto neg_sum_zero = zero;
{
// First row has DC, so mask
const size_t y = 0;
HWY_ALIGN const int32_t dc_mask_lanes[kBlockDim] = {-1};
for (size_t x = 0; x < kBlockDim; x += Lanes(di)) {
const auto dc_mask = Load(di, dc_mask_lanes + x);
// DC counts as zero so we don't include it in nzeros.
const auto coef = AndNot(dc_mask, Load(di, &block[y * kBlockDim + x]));
neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
}
}
// Remaining rows: no mask
for (size_t y = 1; y < kBlockDim; y++) {
for (size_t x = 0; x < kBlockDim; x += Lanes(di)) {
const auto coef = Load(di, &block[y * kBlockDim + x]);
neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
}
}
// We want 64 - sum_zero, add because neg_sum_zero is already negated.
const int32_t nzeros =
int32_t(kDCTBlockSize) + GetLane(SumOfLanes(di, neg_sum_zero));
*nzeros_pos = nzeros;
return nzeros;
}
// The number of nonzeros of each block is predicted from the top and the left
// blocks, with opportune scaling to take into account the number of blocks of
// each strategy. The predicted number of nonzeros divided by two is used as a
// context; if this number is above 63, a specific context is used. If the
// number of nonzeros of a strategy is above 63, it is written directly using a
// fixed number of bits (that depends on the size of the strategy).
void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
const Rect& rect,
const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
const AcStrategyImage& ac_strategy,
YCbCrChromaSubsampling cs,
Image3I* JXL_RESTRICT tmp_num_nzeroes,
std::vector<Token>* JXL_RESTRICT output,
const ImageB& qdc, const ImageI& qf,
const BlockCtxMap& block_ctx_map) {
const size_t xsize_blocks = rect.xsize();
const size_t ysize_blocks = rect.ysize();
// TODO(user): update the estimate: usually less coefficients are used.
output->reserve(output->size() +
3 * xsize_blocks * ysize_blocks * kDCTBlockSize);
size_t offset[3] = {};
const size_t nzeros_stride = tmp_num_nzeroes->PixelsPerRow();
for (size_t by = 0; by < ysize_blocks; ++by) {
size_t sby[3] = {by >> cs.VShift(0), by >> cs.VShift(1),
by >> cs.VShift(2)};
int32_t* JXL_RESTRICT row_nzeros[3] = {
tmp_num_nzeroes->PlaneRow(0, sby[0]),
tmp_num_nzeroes->PlaneRow(1, sby[1]),
tmp_num_nzeroes->PlaneRow(2, sby[2]),
};
const int32_t* JXL_RESTRICT row_nzeros_top[3] = {
sby[0] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(0, sby[0] - 1),
sby[1] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(1, sby[1] - 1),
sby[2] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(2, sby[2] - 1),
};
const uint8_t* JXL_RESTRICT row_qdc =
qdc.ConstRow(rect.y0() + by) + rect.x0();
const int32_t* JXL_RESTRICT row_qf = rect.ConstRow(qf, by);
AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
for (size_t bx = 0; bx < xsize_blocks; ++bx) {
AcStrategy acs = acs_row[bx];
if (!acs.IsFirstBlock()) continue;
size_t sbx[3] = {bx >> cs.HShift(0), bx >> cs.HShift(1),
bx >> cs.HShift(2)};
size_t cx = acs.covered_blocks_x();
size_t cy = acs.covered_blocks_y();
const size_t covered_blocks = cx * cy; // = #LLF coefficients
const size_t log2_covered_blocks =
Num0BitsBelowLS1Bit_Nonzero(covered_blocks);
const size_t size = covered_blocks * kDCTBlockSize;
CoefficientLayout(&cy, &cx); // swap cx/cy to canonical order
for (int c : {1, 0, 2}) {
if (sbx[c] << cs.HShift(c) != bx) continue;
if (sby[c] << cs.VShift(c) != by) continue;
const int32_t* JXL_RESTRICT block = ac_rows[c] + offset[c];
int32_t nzeros =
(covered_blocks == 1)
? NumNonZero8x8ExceptDC(block, row_nzeros[c] + sbx[c])
: NumNonZeroExceptLLF(cx, cy, acs, covered_blocks,
log2_covered_blocks, block, nzeros_stride,
row_nzeros[c] + sbx[c]);
int ord = kStrategyOrder[acs.RawStrategy()];
const coeff_order_t* JXL_RESTRICT order =
&orders[CoeffOrderOffset(ord, c)];
int32_t predicted_nzeros =
PredictFromTopAndLeft(row_nzeros_top[c], row_nzeros[c], sbx[c], 32);
size_t block_ctx =
block_ctx_map.Context(row_qdc[bx], row_qf[sbx[c]], ord, c);
const int32_t nzero_ctx =
block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx);
output->emplace_back(nzero_ctx, nzeros);
const size_t histo_offset =
block_ctx_map.ZeroDensityContextsOffset(block_ctx);
// Skip LLF.
size_t prev = (nzeros > static_cast<ssize_t>(size / 16) ? 0 : 1);
for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
int32_t coeff = block[order[k]];
size_t ctx =
histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
log2_covered_blocks, prev);
uint32_t u_coeff = PackSigned(coeff);
output->emplace_back(ctx, u_coeff);
prev = coeff != 0;
nzeros -= prev;
}
JXL_DASSERT(nzeros == 0);
offset[c] += size;
}
}
}
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace jxl
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace jxl {
// Makes the per-target HWY_NAMESPACE::TokenizeCoefficients variants available
// to HWY_DYNAMIC_DISPATCH below.
HWY_EXPORT(TokenizeCoefficients);
// Public entry point, compiled once (HWY_ONCE). It holds no logic of its own:
// every argument is forwarded unchanged to the implementation selected by
// HWY_DYNAMIC_DISPATCH.
void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
const Rect& rect,
const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
const AcStrategyImage& ac_strategy,
YCbCrChromaSubsampling cs,
Image3I* JXL_RESTRICT tmp_num_nzeroes,
std::vector<Token>* JXL_RESTRICT output,
const ImageB& qdc, const ImageI& qf,
const BlockCtxMap& block_ctx_map) {
// The callee returns void; `return` simply forwards that.
return HWY_DYNAMIC_DISPATCH(TokenizeCoefficients)(
orders, rect, ac_rows, ac_strategy, cs, tmp_num_nzeroes, output, qdc, qf,
block_ctx_map);
}
} // namespace jxl
#endif // HWY_ONCE
|