diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h')
-rw-r--r-- | third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h | 80 |
1 files changed, 80 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h new file mode 100644 index 0000000000..946ace4a0c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h @@ -0,0 +1,80 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* This file is automatically generated. Do not modify it directly. */ +#if HWY_TARGET != HWY_NEON +#error "only include this file from fast_dct-inl.h" +#endif + +constexpr size_t FastIDCTIntegerBits(FastDCTTag<8>) { return 1; } + +void FastIDCT(FastDCTTag<8>, const int16_t* in, size_t in_stride, int16_t* out, + size_t out_stride, size_t count) { + JXL_ASSERT(count % 8 == 0); + for (size_t i = 0; i < count; i += 8) { + int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); + int16x8_t v1 = vld1q_s16(in + in_stride * 4 + i); + int16x8_t v2 = vaddq_s16(v0, v1); + int16x8_t v3 = vld1q_s16(in + in_stride * 2 + i); + int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); + int16x8_t v4 = vaddq_s16(v4_tmp, v3); + int16x8_t v5 = vld1q_s16(in + in_stride * 6 + i); + int16x8_t v6 = vaddq_s16(v5, v3); + int16x8_t v7 = vaddq_s16(v4, v6); + int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); + int16x8_t v9 = vaddq_s16(v2, v8); + int16x8_t v10 = vld1q_s16(in + in_stride * 1 + i); + int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); + int16x8_t v11 = vaddq_s16(v11_tmp, v10); + int16x8_t v12 = vld1q_s16(in + in_stride * 5 + i); + int16x8_t v13 = vld1q_s16(in + in_stride * 3 + i); + int16x8_t v14 = vaddq_s16(v12, v13); + int16x8_t v15 = vaddq_s16(v11, v14); + int16x8_t v16 = vaddq_s16(v13, v10); + int16x8_t v17 = vqrdmulhq_n_s16(v16, 25080); + int16x8_t v18 = vld1q_s16(in + in_stride * 7 + i); + int16x8_t v19 = vaddq_s16(v18, v12); + int16x8_t v20 = vaddq_s16(v16, v19); + int16x8_t v21 = vqrdmulhq_n_s16(v20, 17734); + int16x8_t v22 = vaddq_s16(v17, v21); + int16x8_t v23 = vaddq_s16(v15, v22); + int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); + int16x8_t v25 = vaddq_s16(v9, v24); + int16x8_t v26 = vsubq_s16(v0, v1); + int16x8_t v27 = vsubq_s16(v4, v6); + int16x8_t v28_tmp = vqrdmulhq_n_s16(v27, 10045); + int16x8_t v28 = vaddq_s16(v28_tmp, v27); + int16x8_t v29 = vaddq_s16(v26, v28); + int16x8_t v30 = vsubq_s16(v11, v14); + int16x8_t v31 = vqrdmulhq_n_s16(v16, 17734); + int16x8_t v32_tmp = vqrdmulhq_n_s16(v19, 10045); + int16x8_t v32 = vaddq_s16(v32_tmp, v19); + int16x8_t v33 = vsubq_s16(v31, v32); + int16x8_t v34 = vaddq_s16(v30, v33); + int16x8_t v35 = vqrdmulhq_n_s16(v34, 19705); + int16x8_t v36 = vaddq_s16(v29, v35); + int16x8_t v37 = vsubq_s16(v26, v28); + int16x8_t v38 = vsubq_s16(v30, v33); + int16x8_t v39 = vqrdmulhq_n_s16(v38, 29490); + int16x8_t v40 = vaddq_s16(v37, v39); + int16x8_t v41 = vsubq_s16(v2, v8); + int16x8_t v42 = vsubq_s16(v15, v22); + int16x8_t v43_tmp = vqrdmulhq_n_s16(v42, 18446); + int16x8_t v43 = vmlaq_n_s16(v43_tmp, v42, 2); + int16x8_t v44 = vaddq_s16(v41, v43); + int16x8_t v45 = vsubq_s16(v41, v43); + int16x8_t v46 = vsubq_s16(v37, v39); + int16x8_t v47 = vsubq_s16(v29, v35); + int16x8_t v48 = vsubq_s16(v9, v24); + vst1q_s16(out + out_stride * 0 + i, v25); + vst1q_s16(out + out_stride * 1 + i, v36); + vst1q_s16(out + out_stride * 2 + i, v40); + vst1q_s16(out + out_stride * 3 + i, v44); + vst1q_s16(out + out_stride * 4 + i, v45); + vst1q_s16(out + out_stride * 5 + i, v46); + vst1q_s16(out + out_stride * 6 + i, v47); + vst1q_s16(out + out_stride * 7 + i, v48); + } +} |