From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- third_party/jpeg-xl/lib/jxl/ac_context.h | 149 + third_party/jpeg-xl/lib/jxl/ac_strategy.cc | 108 + third_party/jpeg-xl/lib/jxl/ac_strategy.h | 261 + third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc | 237 + third_party/jpeg-xl/lib/jxl/alpha.cc | 115 + third_party/jpeg-xl/lib/jxl/alpha.h | 66 + third_party/jpeg-xl/lib/jxl/alpha_test.cc | 134 + third_party/jpeg-xl/lib/jxl/ans_common.cc | 148 + third_party/jpeg-xl/lib/jxl/ans_common.h | 143 + third_party/jpeg-xl/lib/jxl/ans_common_test.cc | 43 + third_party/jpeg-xl/lib/jxl/ans_params.h | 36 + third_party/jpeg-xl/lib/jxl/ans_test.cc | 278 + third_party/jpeg-xl/lib/jxl/base/arch_macros.h | 33 + third_party/jpeg-xl/lib/jxl/base/bits.h | 147 + third_party/jpeg-xl/lib/jxl/base/byte_order.h | 274 + third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc | 157 + third_party/jpeg-xl/lib/jxl/base/cache_aligned.h | 74 + .../jpeg-xl/lib/jxl/base/compiler_specific.h | 157 + third_party/jpeg-xl/lib/jxl/base/data_parallel.cc | 23 + third_party/jpeg-xl/lib/jxl/base/data_parallel.h | 120 + third_party/jpeg-xl/lib/jxl/base/file_io.h | 153 + third_party/jpeg-xl/lib/jxl/base/float.h | 98 + third_party/jpeg-xl/lib/jxl/base/iaca.h | 65 + third_party/jpeg-xl/lib/jxl/base/os_macros.h | 50 + third_party/jpeg-xl/lib/jxl/base/override.h | 29 + third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc | 63 + third_party/jpeg-xl/lib/jxl/base/padded_bytes.h | 197 + third_party/jpeg-xl/lib/jxl/base/printf_macros.h | 34 + third_party/jpeg-xl/lib/jxl/base/profiler.cc | 540 ++ third_party/jpeg-xl/lib/jxl/base/profiler.h | 170 + third_party/jpeg-xl/lib/jxl/base/random.cc | 21 + third_party/jpeg-xl/lib/jxl/base/random.h | 95 + .../jpeg-xl/lib/jxl/base/sanitizer_definitions.h | 44 + third_party/jpeg-xl/lib/jxl/base/scope_guard.h | 48 + third_party/jpeg-xl/lib/jxl/base/span.h | 60 + third_party/jpeg-xl/lib/jxl/base/status.h | 326 ++ third_party/jpeg-xl/lib/jxl/base/tsc_timer.h | 172 + third_party/jpeg-xl/lib/jxl/bit_reader_test.cc | 262 + third_party/jpeg-xl/lib/jxl/bits_test.cc | 87 + third_party/jpeg-xl/lib/jxl/blending.cc | 152 + third_party/jpeg-xl/lib/jxl/blending.h | 24 + third_party/jpeg-xl/lib/jxl/blending_test.cc | 37 + third_party/jpeg-xl/lib/jxl/box_content_decoder.cc | 101 + third_party/jpeg-xl/lib/jxl/box_content_decoder.h | 49 + .../jpeg-xl/lib/jxl/butteraugli/butteraugli.cc | 1988 +++++++ .../jpeg-xl/lib/jxl/butteraugli/butteraugli.h | 209 + third_party/jpeg-xl/lib/jxl/butteraugli_test.cc | 103 + third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc | 203 + third_party/jpeg-xl/lib/jxl/byte_order_test.cc | 53 + third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc | 21 + third_party/jpeg-xl/lib/jxl/chroma_from_luma.h | 147 + third_party/jpeg-xl/lib/jxl/codec_in_out.h | 116 + third_party/jpeg-xl/lib/jxl/coeff_order.cc | 153 + third_party/jpeg-xl/lib/jxl/coeff_order.h | 64 + third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h | 47 + third_party/jpeg-xl/lib/jxl/coeff_order_test.cc | 97 + .../jpeg-xl/lib/jxl/color_encoding_internal.cc | 753 +++ .../jpeg-xl/lib/jxl/color_encoding_internal.h | 463 ++ .../lib/jxl/color_encoding_internal_test.cc | 157 + third_party/jpeg-xl/lib/jxl/color_management.cc | 682 +++ third_party/jpeg-xl/lib/jxl/color_management.h | 40 + .../jpeg-xl/lib/jxl/color_management_test.cc | 405 ++ third_party/jpeg-xl/lib/jxl/common.h | 245 + third_party/jpeg-xl/lib/jxl/compressed_dc.cc | 318 ++ third_party/jpeg-xl/lib/jxl/compressed_dc.h | 34 + third_party/jpeg-xl/lib/jxl/convolve-inl.h | 297 ++ third_party/jpeg-xl/lib/jxl/convolve.h | 105 + third_party/jpeg-xl/lib/jxl/convolve_separable5.cc | 261 + third_party/jpeg-xl/lib/jxl/convolve_separable7.cc | 285 + third_party/jpeg-xl/lib/jxl/convolve_slow.cc | 212 + third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc | 194 + third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc | 185 + third_party/jpeg-xl/lib/jxl/convolve_test.cc | 252 + third_party/jpeg-xl/lib/jxl/data_parallel_test.cc | 87 + third_party/jpeg-xl/lib/jxl/dct-inl.h | 334 ++ third_party/jpeg-xl/lib/jxl/dct_block-inl.h | 108 + third_party/jpeg-xl/lib/jxl/dct_for_test.h | 99 + third_party/jpeg-xl/lib/jxl/dct_scales.cc | 31 + third_party/jpeg-xl/lib/jxl/dct_scales.h | 379 ++ third_party/jpeg-xl/lib/jxl/dct_test.cc | 389 ++ third_party/jpeg-xl/lib/jxl/dct_util.h | 86 + third_party/jpeg-xl/lib/jxl/dec_ans.cc | 374 ++ third_party/jpeg-xl/lib/jxl/dec_ans.h | 462 ++ third_party/jpeg-xl/lib/jxl/dec_bit_reader.h | 354 ++ third_party/jpeg-xl/lib/jxl/dec_cache.cc | 229 + third_party/jpeg-xl/lib/jxl/dec_cache.h | 261 + third_party/jpeg-xl/lib/jxl/dec_context_map.cc | 86 + third_party/jpeg-xl/lib/jxl/dec_context_map.h | 30 + third_party/jpeg-xl/lib/jxl/dec_external_image.cc | 493 ++ third_party/jpeg-xl/lib/jxl/dec_external_image.h | 46 + .../jpeg-xl/lib/jxl/dec_external_image_gbench.cc | 56 + third_party/jpeg-xl/lib/jxl/dec_frame.cc | 878 ++++ third_party/jpeg-xl/lib/jxl/dec_frame.h | 329 ++ third_party/jpeg-xl/lib/jxl/dec_group.cc | 801 +++ third_party/jpeg-xl/lib/jxl/dec_group.h | 49 + third_party/jpeg-xl/lib/jxl/dec_group_border.cc | 184 + third_party/jpeg-xl/lib/jxl/dec_group_border.h | 47 + third_party/jpeg-xl/lib/jxl/dec_huffman.cc | 255 + third_party/jpeg-xl/lib/jxl/dec_huffman.h | 32 + third_party/jpeg-xl/lib/jxl/dec_modular.cc | 774 +++ third_party/jpeg-xl/lib/jxl/dec_modular.h | 140 + third_party/jpeg-xl/lib/jxl/dec_noise.cc | 131 + third_party/jpeg-xl/lib/jxl/dec_noise.h | 32 + .../jpeg-xl/lib/jxl/dec_patch_dictionary.cc | 347 ++ third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h | 151 + third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h | 234 + third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h | 853 +++ .../jpeg-xl/lib/jxl/dec_transforms_testonly.cc | 41 + .../jpeg-xl/lib/jxl/dec_transforms_testonly.h | 32 + third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h | 346 ++ third_party/jpeg-xl/lib/jxl/dec_xyb.cc | 329 ++ third_party/jpeg-xl/lib/jxl/dec_xyb.h | 89 + third_party/jpeg-xl/lib/jxl/decode.cc | 2809 ++++++++++ third_party/jpeg-xl/lib/jxl/decode_test.cc | 5507 ++++++++++++++++++++ third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc | 169 + third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h | 217 + third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc | 1168 +++++ third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h | 74 + .../jpeg-xl/lib/jxl/enc_adaptive_quantization.cc | 1145 ++++ .../jpeg-xl/lib/jxl/enc_adaptive_quantization.h | 66 + third_party/jpeg-xl/lib/jxl/enc_ans.cc | 1688 ++++++ third_party/jpeg-xl/lib/jxl/enc_ans.h | 143 + third_party/jpeg-xl/lib/jxl/enc_ans_params.h | 76 + .../jpeg-xl/lib/jxl/enc_ar_control_field.cc | 325 ++ third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h | 49 + third_party/jpeg-xl/lib/jxl/enc_aux_out.cc | 205 + third_party/jpeg-xl/lib/jxl/enc_aux_out.h | 163 + third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc | 201 + third_party/jpeg-xl/lib/jxl/enc_bit_writer.h | 129 + .../jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc | 99 + .../jpeg-xl/lib/jxl/enc_butteraugli_comparator.h | 59 + .../jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc | 211 + .../jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h | 25 + third_party/jpeg-xl/lib/jxl/enc_cache.cc | 218 + third_party/jpeg-xl/lib/jxl/enc_cache.h | 93 + .../jpeg-xl/lib/jxl/enc_chroma_from_luma.cc | 409 ++ third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h | 68 + third_party/jpeg-xl/lib/jxl/enc_cluster.cc | 295 ++ third_party/jpeg-xl/lib/jxl/enc_cluster.h | 63 + third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc | 291 ++ third_party/jpeg-xl/lib/jxl/enc_coeff_order.h | 54 + .../jpeg-xl/lib/jxl/enc_color_management.cc | 1293 +++++ third_party/jpeg-xl/lib/jxl/enc_color_management.h | 90 + third_party/jpeg-xl/lib/jxl/enc_comparator.cc | 130 + third_party/jpeg-xl/lib/jxl/enc_comparator.h | 52 + third_party/jpeg-xl/lib/jxl/enc_context_map.cc | 141 + third_party/jpeg-xl/lib/jxl/enc_context_map.h | 35 + third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc | 626 +++ third_party/jpeg-xl/lib/jxl/enc_detect_dots.h | 67 + third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc | 71 + third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h | 34 + third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc | 274 + third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h | 46 + third_party/jpeg-xl/lib/jxl/enc_external_image.cc | 183 + third_party/jpeg-xl/lib/jxl/enc_external_image.h | 45 + .../jpeg-xl/lib/jxl/enc_external_image_gbench.cc | 46 + .../jpeg-xl/lib/jxl/enc_external_image_test.cc | 79 + third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc | 3860 ++++++++++++++ third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h | 72 + third_party/jpeg-xl/lib/jxl/enc_fields.cc | 239 + third_party/jpeg-xl/lib/jxl/enc_fields.h | 37 + third_party/jpeg-xl/lib/jxl/enc_file.cc | 141 + third_party/jpeg-xl/lib/jxl/enc_file.h | 31 + third_party/jpeg-xl/lib/jxl/enc_frame.cc | 1745 +++++++ third_party/jpeg-xl/lib/jxl/enc_frame.h | 78 + third_party/jpeg-xl/lib/jxl/enc_gaborish.cc | 61 + third_party/jpeg-xl/lib/jxl/enc_gaborish.h | 26 + third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc | 77 + third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h | 36 + third_party/jpeg-xl/lib/jxl/enc_group.cc | 426 ++ third_party/jpeg-xl/lib/jxl/enc_group.h | 32 + third_party/jpeg-xl/lib/jxl/enc_heuristics.cc | 948 ++++ third_party/jpeg-xl/lib/jxl/enc_heuristics.h | 81 + third_party/jpeg-xl/lib/jxl/enc_huffman.cc | 214 + third_party/jpeg-xl/lib/jxl/enc_huffman.h | 22 + third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc | 328 ++ third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h | 52 + third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc | 406 ++ third_party/jpeg-xl/lib/jxl/enc_icc_codec.h | 33 + third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc | 154 + third_party/jpeg-xl/lib/jxl/enc_image_bundle.h | 25 + third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h | 54 + third_party/jpeg-xl/lib/jxl/enc_linalg.cc | 52 + third_party/jpeg-xl/lib/jxl/enc_linalg.h | 24 + third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc | 118 + third_party/jpeg-xl/lib/jxl/enc_modular.cc | 1762 +++++++ third_party/jpeg-xl/lib/jxl/enc_modular.h | 92 + third_party/jpeg-xl/lib/jxl/enc_noise.cc | 374 ++ third_party/jpeg-xl/lib/jxl/enc_noise.h | 34 + third_party/jpeg-xl/lib/jxl/enc_optimize.cc | 163 + third_party/jpeg-xl/lib/jxl/enc_optimize.h | 218 + third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc | 109 + third_party/jpeg-xl/lib/jxl/enc_params.h | 225 + .../jpeg-xl/lib/jxl/enc_patch_dictionary.cc | 813 +++ third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h | 109 + third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc | 89 + third_party/jpeg-xl/lib/jxl/enc_photon_noise.h | 22 + .../jpeg-xl/lib/jxl/enc_photon_noise_test.cc | 51 + .../jpeg-xl/lib/jxl/enc_progressive_split.cc | 82 + .../jpeg-xl/lib/jxl/enc_progressive_split.h | 131 + third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc | 214 + third_party/jpeg-xl/lib/jxl/enc_quant_weights.h | 37 + third_party/jpeg-xl/lib/jxl/enc_splines.cc | 98 + third_party/jpeg-xl/lib/jxl/enc_splines.h | 38 + third_party/jpeg-xl/lib/jxl/enc_toc.cc | 45 + third_party/jpeg-xl/lib/jxl/enc_toc.h | 31 + third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h | 827 +++ third_party/jpeg-xl/lib/jxl/enc_transforms.cc | 41 + third_party/jpeg-xl/lib/jxl/enc_transforms.h | 32 + third_party/jpeg-xl/lib/jxl/enc_xyb.cc | 520 ++ third_party/jpeg-xl/lib/jxl/enc_xyb.h | 56 + third_party/jpeg-xl/lib/jxl/encode.cc | 2128 ++++++++ third_party/jpeg-xl/lib/jxl/encode_internal.h | 275 + third_party/jpeg-xl/lib/jxl/encode_test.cc | 1405 +++++ third_party/jpeg-xl/lib/jxl/entropy_coder.cc | 70 + third_party/jpeg-xl/lib/jxl/entropy_coder.h | 45 + third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc | 68 + third_party/jpeg-xl/lib/jxl/epf.cc | 146 + third_party/jpeg-xl/lib/jxl/epf.h | 33 + third_party/jpeg-xl/lib/jxl/exif.h | 87 + .../lib/jxl/fake_parallel_runner_testonly.h | 79 + third_party/jpeg-xl/lib/jxl/fast_dct-inl.h | 238 + third_party/jpeg-xl/lib/jxl/fast_dct.cc | 37 + third_party/jpeg-xl/lib/jxl/fast_dct.h | 9 + third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h | 2137 ++++++++ third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h | 180 + third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h | 4811 +++++++++++++++++ third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h | 419 ++ third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h | 985 ++++ third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h | 80 + third_party/jpeg-xl/lib/jxl/fast_dct_test.cc | 378 ++ third_party/jpeg-xl/lib/jxl/fast_math-inl.h | 236 + third_party/jpeg-xl/lib/jxl/fast_math_test.cc | 288 + third_party/jpeg-xl/lib/jxl/field_encodings.h | 134 + third_party/jpeg-xl/lib/jxl/fields.cc | 642 +++ third_party/jpeg-xl/lib/jxl/fields.h | 377 ++ third_party/jpeg-xl/lib/jxl/fields_test.cc | 429 ++ third_party/jpeg-xl/lib/jxl/frame_header.cc | 494 ++ third_party/jpeg-xl/lib/jxl/frame_header.h | 503 ++ third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc | 37 + third_party/jpeg-xl/lib/jxl/gauss_blur.cc | 623 +++ third_party/jpeg-xl/lib/jxl/gauss_blur.h | 94 + third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc | 126 + third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc | 453 ++ third_party/jpeg-xl/lib/jxl/gradient_test.cc | 207 + third_party/jpeg-xl/lib/jxl/headers.cc | 194 + third_party/jpeg-xl/lib/jxl/headers.h | 97 + third_party/jpeg-xl/lib/jxl/huffman_table.cc | 161 + third_party/jpeg-xl/lib/jxl/huffman_table.h | 28 + third_party/jpeg-xl/lib/jxl/iaca_test.cc | 21 + third_party/jpeg-xl/lib/jxl/icc_codec.cc | 389 ++ third_party/jpeg-xl/lib/jxl/icc_codec.h | 57 + third_party/jpeg-xl/lib/jxl/icc_codec_common.cc | 190 + third_party/jpeg-xl/lib/jxl/icc_codec_common.h | 106 + third_party/jpeg-xl/lib/jxl/icc_codec_test.cc | 207 + third_party/jpeg-xl/lib/jxl/image.cc | 251 + third_party/jpeg-xl/lib/jxl/image.h | 497 ++ third_party/jpeg-xl/lib/jxl/image_bundle.cc | 125 + third_party/jpeg-xl/lib/jxl/image_bundle.h | 254 + third_party/jpeg-xl/lib/jxl/image_bundle_test.cc | 37 + third_party/jpeg-xl/lib/jxl/image_metadata.cc | 472 ++ third_party/jpeg-xl/lib/jxl/image_metadata.h | 425 ++ third_party/jpeg-xl/lib/jxl/image_ops.h | 805 +++ third_party/jpeg-xl/lib/jxl/image_ops_test.cc | 164 + third_party/jpeg-xl/lib/jxl/image_test_utils.h | 257 + third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h | 90 + third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc | 145 + third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h | 19 + .../jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc | 1050 ++++ .../jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h | 35 + .../jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h | 72 + .../lib/jxl/jpeg/dec_jpeg_serialization_state.h | 96 + third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc | 384 ++ third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h | 31 + .../jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc | 1053 ++++ .../jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h | 36 + .../lib/jxl/jpeg/enc_jpeg_huffman_decode.cc | 103 + .../jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h | 41 + third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc | 451 ++ third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h | 216 + third_party/jpeg-xl/lib/jxl/jxl.syms | 5 + third_party/jpeg-xl/lib/jxl/jxl.version | 17 + third_party/jpeg-xl/lib/jxl/jxl_inspection.h | 22 + third_party/jpeg-xl/lib/jxl/jxl_osx.syms | 1 + third_party/jpeg-xl/lib/jxl/jxl_test.cc | 1537 ++++++ third_party/jpeg-xl/lib/jxl/lehmer_code.h | 102 + third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc | 98 + third_party/jpeg-xl/lib/jxl/libjxl.pc.in | 13 + third_party/jpeg-xl/lib/jxl/loop_filter.cc | 98 + third_party/jpeg-xl/lib/jxl/loop_filter.h | 76 + third_party/jpeg-xl/lib/jxl/luminance.cc | 26 + third_party/jpeg-xl/lib/jxl/luminance.h | 22 + third_party/jpeg-xl/lib/jxl/matrix_ops.h | 84 + .../jpeg-xl/lib/jxl/memory_manager_internal.cc | 18 + .../jpeg-xl/lib/jxl/memory_manager_internal.h | 101 + .../lib/jxl/modular/encoding/context_predict.h | 626 +++ .../jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc | 107 + .../jpeg-xl/lib/jxl/modular/encoding/dec_ma.h | 66 + .../lib/jxl/modular/encoding/enc_debug_tree.cc | 124 + .../lib/jxl/modular/encoding/enc_debug_tree.h | 27 + .../lib/jxl/modular/encoding/enc_encoding.cc | 562 ++ .../lib/jxl/modular/encoding/enc_encoding.h | 47 + .../jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc | 1023 ++++ .../jpeg-xl/lib/jxl/modular/encoding/enc_ma.h | 157 + .../jpeg-xl/lib/jxl/modular/encoding/encoding.cc | 622 +++ .../jpeg-xl/lib/jxl/modular/encoding/encoding.h | 135 + .../jpeg-xl/lib/jxl/modular/encoding/ma_common.h | 28 + .../jpeg-xl/lib/jxl/modular/modular_image.cc | 77 + .../jpeg-xl/lib/jxl/modular/modular_image.h | 118 + third_party/jpeg-xl/lib/jxl/modular/options.h | 117 + .../lib/jxl/modular/transform/enc_palette.cc | 606 +++ .../lib/jxl/modular/transform/enc_palette.h | 22 + .../jpeg-xl/lib/jxl/modular/transform/enc_rct.cc | 73 + .../jpeg-xl/lib/jxl/modular/transform/enc_rct.h | 17 + .../lib/jxl/modular/transform/enc_squeeze.cc | 141 + .../lib/jxl/modular/transform/enc_squeeze.h | 20 + .../lib/jxl/modular/transform/enc_transform.cc | 46 + .../lib/jxl/modular/transform/enc_transform.h | 22 + .../jpeg-xl/lib/jxl/modular/transform/palette.cc | 176 + .../jpeg-xl/lib/jxl/modular/transform/palette.h | 129 + .../jpeg-xl/lib/jxl/modular/transform/rct.cc | 153 + .../jpeg-xl/lib/jxl/modular/transform/rct.h | 20 + .../jpeg-xl/lib/jxl/modular/transform/squeeze.cc | 478 ++ .../jpeg-xl/lib/jxl/modular/transform/squeeze.h | 90 + .../jpeg-xl/lib/jxl/modular/transform/transform.cc | 98 + .../jpeg-xl/lib/jxl/modular/transform/transform.h | 148 + third_party/jpeg-xl/lib/jxl/modular_test.cc | 541 ++ third_party/jpeg-xl/lib/jxl/noise.h | 60 + third_party/jpeg-xl/lib/jxl/opsin_image_test.cc | 123 + third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc | 57 + third_party/jpeg-xl/lib/jxl/opsin_params.cc | 44 + third_party/jpeg-xl/lib/jxl/opsin_params.h | 86 + third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc | 126 + third_party/jpeg-xl/lib/jxl/passes_state.cc | 70 + third_party/jpeg-xl/lib/jxl/passes_state.h | 133 + third_party/jpeg-xl/lib/jxl/passes_test.cc | 402 ++ .../jpeg-xl/lib/jxl/patch_dictionary_internal.h | 31 + .../jpeg-xl/lib/jxl/patch_dictionary_test.cc | 58 + third_party/jpeg-xl/lib/jxl/preview_test.cc | 68 + third_party/jpeg-xl/lib/jxl/quant_weights.cc | 1239 +++++ third_party/jpeg-xl/lib/jxl/quant_weights.h | 448 ++ third_party/jpeg-xl/lib/jxl/quant_weights_test.cc | 240 + third_party/jpeg-xl/lib/jxl/quantizer-inl.h | 74 + third_party/jpeg-xl/lib/jxl/quantizer.cc | 156 + third_party/jpeg-xl/lib/jxl/quantizer.h | 182 + third_party/jpeg-xl/lib/jxl/quantizer_test.cc | 81 + .../jpeg-xl/lib/jxl/rational_polynomial-inl.h | 98 + .../jpeg-xl/lib/jxl/rational_polynomial_test.cc | 238 + .../render_pipeline/low_memory_render_pipeline.cc | 865 +++ .../render_pipeline/low_memory_render_pipeline.h | 111 + .../lib/jxl/render_pipeline/render_pipeline.cc | 132 + .../lib/jxl/render_pipeline/render_pipeline.h | 139 + .../jxl/render_pipeline/render_pipeline_stage.h | 171 + .../jxl/render_pipeline/render_pipeline_test.cc | 562 ++ .../jxl/render_pipeline/simple_render_pipeline.cc | 266 + .../jxl/render_pipeline/simple_render_pipeline.h | 37 + .../lib/jxl/render_pipeline/stage_blending.cc | 247 + .../lib/jxl/render_pipeline/stage_blending.h | 24 + .../jxl/render_pipeline/stage_chroma_upsampling.cc | 129 + .../jxl/render_pipeline/stage_chroma_upsampling.h | 27 + .../jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc | 524 ++ .../jpeg-xl/lib/jxl/render_pipeline/stage_epf.h | 31 + .../lib/jxl/render_pipeline/stage_from_linear.cc | 191 + .../lib/jxl/render_pipeline/stage_from_linear.h | 20 + .../lib/jxl/render_pipeline/stage_gaborish.cc | 122 + .../lib/jxl/render_pipeline/stage_gaborish.h | 25 + .../jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc | 311 ++ .../jpeg-xl/lib/jxl/render_pipeline/stage_noise.h | 32 + .../lib/jxl/render_pipeline/stage_patches.cc | 48 + .../lib/jxl/render_pipeline/stage_patches.h | 22 + .../lib/jxl/render_pipeline/stage_splines.cc | 63 + .../lib/jxl/render_pipeline/stage_splines.h | 21 + .../jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc | 52 + .../jpeg-xl/lib/jxl/render_pipeline/stage_spot.h | 21 + .../lib/jxl/render_pipeline/stage_to_linear.cc | 202 + .../lib/jxl/render_pipeline/stage_to_linear.h | 21 + .../lib/jxl/render_pipeline/stage_tone_mapping.cc | 151 + .../lib/jxl/render_pipeline/stage_tone_mapping.h | 37 + .../lib/jxl/render_pipeline/stage_upsampling.cc | 187 + .../lib/jxl/render_pipeline/stage_upsampling.h | 26 + .../jpeg-xl/lib/jxl/render_pipeline/stage_write.cc | 601 +++ .../jpeg-xl/lib/jxl/render_pipeline/stage_write.h | 31 + .../jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc | 176 + .../jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h | 26 + .../jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc | 85 + .../jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h | 25 + .../render_pipeline/test_render_pipeline_stages.h | 101 + third_party/jpeg-xl/lib/jxl/roundtrip_test.cc | 839 +++ third_party/jpeg-xl/lib/jxl/sanitizers.h | 242 + third_party/jpeg-xl/lib/jxl/simd_util-inl.h | 349 ++ third_party/jpeg-xl/lib/jxl/simd_util_test.cc | 84 + third_party/jpeg-xl/lib/jxl/speed_tier_test.cc | 108 + third_party/jpeg-xl/lib/jxl/splines.cc | 694 +++ third_party/jpeg-xl/lib/jxl/splines.h | 148 + third_party/jpeg-xl/lib/jxl/splines_gbench.cc | 52 + third_party/jpeg-xl/lib/jxl/splines_test.cc | 348 ++ third_party/jpeg-xl/lib/jxl/test_image.cc | 453 ++ third_party/jpeg-xl/lib/jxl/test_image.h | 94 + third_party/jpeg-xl/lib/jxl/test_utils.cc | 673 +++ third_party/jpeg-xl/lib/jxl/test_utils.h | 175 + third_party/jpeg-xl/lib/jxl/testing.h | 73 + third_party/jpeg-xl/lib/jxl/tf_gbench.cc | 143 + third_party/jpeg-xl/lib/jxl/toc.cc | 105 + third_party/jpeg-xl/lib/jxl/toc.h | 55 + third_party/jpeg-xl/lib/jxl/toc_test.cc | 92 + .../jpeg-xl/lib/jxl/transfer_functions-inl.h | 413 ++ third_party/jpeg-xl/lib/jxl/transpose-inl.h | 203 + third_party/jpeg-xl/lib/jxl/version.h.in | 39 + third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h | 103 + .../jpeg-xl/lib/jxl/xorshift128plus_test.cc | 378 ++ 410 files changed, 109850 insertions(+) create mode 100644 third_party/jpeg-xl/lib/jxl/ac_context.h create mode 100644 third_party/jpeg-xl/lib/jxl/ac_strategy.cc create mode 100644 third_party/jpeg-xl/lib/jxl/ac_strategy.h create mode 100644 third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/alpha.cc create mode 100644 third_party/jpeg-xl/lib/jxl/alpha.h create mode 100644 third_party/jpeg-xl/lib/jxl/alpha_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/ans_common.cc create mode 100644 third_party/jpeg-xl/lib/jxl/ans_common.h create mode 100644 third_party/jpeg-xl/lib/jxl/ans_common_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/ans_params.h create mode 100644 third_party/jpeg-xl/lib/jxl/ans_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/base/arch_macros.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/bits.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/byte_order.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc create mode 100644 third_party/jpeg-xl/lib/jxl/base/cache_aligned.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/compiler_specific.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/data_parallel.cc create mode 100644 third_party/jpeg-xl/lib/jxl/base/data_parallel.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/file_io.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/float.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/iaca.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/os_macros.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/override.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc create mode 100644 third_party/jpeg-xl/lib/jxl/base/padded_bytes.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/printf_macros.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/profiler.cc create mode 100644 third_party/jpeg-xl/lib/jxl/base/profiler.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/random.cc create mode 100644 third_party/jpeg-xl/lib/jxl/base/random.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/scope_guard.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/span.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/status.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/tsc_timer.h create mode 100644 third_party/jpeg-xl/lib/jxl/bit_reader_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/bits_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/blending.cc create mode 100644 third_party/jpeg-xl/lib/jxl/blending.h create mode 100644 third_party/jpeg-xl/lib/jxl/blending_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/box_content_decoder.cc create mode 100644 third_party/jpeg-xl/lib/jxl/box_content_decoder.h create mode 100644 third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc create mode 100644 third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h create mode 100644 third_party/jpeg-xl/lib/jxl/butteraugli_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc create mode 100644 third_party/jpeg-xl/lib/jxl/byte_order_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc create mode 100644 third_party/jpeg-xl/lib/jxl/chroma_from_luma.h create mode 100644 third_party/jpeg-xl/lib/jxl/codec_in_out.h create mode 100644 third_party/jpeg-xl/lib/jxl/coeff_order.cc create mode 100644 third_party/jpeg-xl/lib/jxl/coeff_order.h create mode 100644 third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h create mode 100644 third_party/jpeg-xl/lib/jxl/coeff_order_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc create mode 100644 third_party/jpeg-xl/lib/jxl/color_encoding_internal.h create mode 100644 third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/color_management.cc create mode 100644 third_party/jpeg-xl/lib/jxl/color_management.h create mode 100644 third_party/jpeg-xl/lib/jxl/color_management_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/common.h create mode 100644 third_party/jpeg-xl/lib/jxl/compressed_dc.cc create mode 100644 third_party/jpeg-xl/lib/jxl/compressed_dc.h create mode 100644 third_party/jpeg-xl/lib/jxl/convolve-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/convolve.h create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_separable5.cc create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_separable7.cc create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_slow.cc create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/data_parallel_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dct-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/dct_block-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/dct_for_test.h create mode 100644 third_party/jpeg-xl/lib/jxl/dct_scales.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dct_scales.h create mode 100644 third_party/jpeg-xl/lib/jxl/dct_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dct_util.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_ans.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_ans.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_bit_reader.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_cache.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_cache.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_context_map.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_context_map.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_external_image.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_external_image.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_frame.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_frame.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_group.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_group.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_group_border.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_group_border.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_huffman.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_huffman.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_modular.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_modular.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_noise.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_noise.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/dec_xyb.cc create mode 100644 third_party/jpeg-xl/lib/jxl/dec_xyb.h create mode 100644 third_party/jpeg-xl/lib/jxl/decode.cc create mode 100644 third_party/jpeg-xl/lib/jxl/decode_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc create mode 100644 third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ans.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ans.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ans_params.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_aux_out.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_aux_out.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_bit_writer.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_cache.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_cache.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_cluster.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_cluster.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_coeff_order.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_color_management.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_color_management.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_comparator.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_comparator.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_context_map.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_context_map.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_detect_dots.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_external_image.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_external_image.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_fields.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_fields.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_file.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_file.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_frame.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_frame.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_gaborish.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_gaborish.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_group.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_group.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_heuristics.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_heuristics.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_huffman.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_huffman.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_icc_codec.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_image_bundle.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_linalg.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_linalg.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_modular.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_modular.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_noise.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_noise.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_optimize.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_optimize.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_params.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_photon_noise.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_progressive_split.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_quant_weights.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_splines.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_splines.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_toc.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_toc.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_transforms.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_transforms.h create mode 100644 third_party/jpeg-xl/lib/jxl/enc_xyb.cc create mode 100644 third_party/jpeg-xl/lib/jxl/enc_xyb.h create mode 100644 third_party/jpeg-xl/lib/jxl/encode.cc create mode 100644 third_party/jpeg-xl/lib/jxl/encode_internal.h create mode 100644 third_party/jpeg-xl/lib/jxl/encode_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/entropy_coder.cc create mode 100644 third_party/jpeg-xl/lib/jxl/entropy_coder.h create mode 100644 third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/epf.cc create mode 100644 third_party/jpeg-xl/lib/jxl/epf.h create mode 100644 third_party/jpeg-xl/lib/jxl/exif.h create mode 100644 third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct.cc create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct.h create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/fast_math-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/fast_math_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/field_encodings.h create mode 100644 third_party/jpeg-xl/lib/jxl/fields.cc create mode 100644 third_party/jpeg-xl/lib/jxl/fields.h create mode 100644 third_party/jpeg-xl/lib/jxl/fields_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/frame_header.cc create mode 100644 third_party/jpeg-xl/lib/jxl/frame_header.h create mode 100644 third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/gauss_blur.cc create mode 100644 third_party/jpeg-xl/lib/jxl/gauss_blur.h create mode 100644 third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc create mode 100644 third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/gradient_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/headers.cc create mode 100644 third_party/jpeg-xl/lib/jxl/headers.h create mode 100644 third_party/jpeg-xl/lib/jxl/huffman_table.cc create mode 100644 third_party/jpeg-xl/lib/jxl/huffman_table.h create mode 100644 third_party/jpeg-xl/lib/jxl/iaca_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/icc_codec.cc create mode 100644 third_party/jpeg-xl/lib/jxl/icc_codec.h create mode 100644 third_party/jpeg-xl/lib/jxl/icc_codec_common.cc create mode 100644 third_party/jpeg-xl/lib/jxl/icc_codec_common.h create mode 100644 third_party/jpeg-xl/lib/jxl/icc_codec_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/image.cc create mode 100644 third_party/jpeg-xl/lib/jxl/image.h create mode 100644 third_party/jpeg-xl/lib/jxl/image_bundle.cc create mode 100644 third_party/jpeg-xl/lib/jxl/image_bundle.h create mode 100644 third_party/jpeg-xl/lib/jxl/image_bundle_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/image_metadata.cc create mode 100644 third_party/jpeg-xl/lib/jxl/image_metadata.h create mode 100644 third_party/jpeg-xl/lib/jxl/image_ops.h create mode 100644 third_party/jpeg-xl/lib/jxl/image_ops_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/image_test_utils.h create mode 100644 third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h create mode 100644 third_party/jpeg-xl/lib/jxl/jxl.syms create mode 100644 third_party/jpeg-xl/lib/jxl/jxl.version create mode 100644 third_party/jpeg-xl/lib/jxl/jxl_inspection.h create mode 100644 third_party/jpeg-xl/lib/jxl/jxl_osx.syms create mode 100644 third_party/jpeg-xl/lib/jxl/jxl_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/lehmer_code.h create mode 100644 third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/libjxl.pc.in create mode 100644 third_party/jpeg-xl/lib/jxl/loop_filter.cc create mode 100644 third_party/jpeg-xl/lib/jxl/loop_filter.h create mode 100644 third_party/jpeg-xl/lib/jxl/luminance.cc create mode 100644 third_party/jpeg-xl/lib/jxl/luminance.h create mode 100644 third_party/jpeg-xl/lib/jxl/matrix_ops.h create mode 100644 third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc create mode 100644 third_party/jpeg-xl/lib/jxl/memory_manager_internal.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/modular_image.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/modular_image.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/options.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/palette.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/rct.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/transform.h create mode 100644 third_party/jpeg-xl/lib/jxl/modular_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/noise.h create mode 100644 third_party/jpeg-xl/lib/jxl/opsin_image_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/opsin_params.cc create mode 100644 third_party/jpeg-xl/lib/jxl/opsin_params.h create mode 100644 third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/passes_state.cc create mode 100644 third_party/jpeg-xl/lib/jxl/passes_state.h create mode 100644 third_party/jpeg-xl/lib/jxl/passes_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h create mode 100644 third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/preview_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/quant_weights.cc create mode 100644 third_party/jpeg-xl/lib/jxl/quant_weights.h create mode 100644 third_party/jpeg-xl/lib/jxl/quant_weights_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/quantizer-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/quantizer.cc create mode 100644 third_party/jpeg-xl/lib/jxl/quantizer.h create mode 100644 third_party/jpeg-xl/lib/jxl/quantizer_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h create mode 100644 third_party/jpeg-xl/lib/jxl/roundtrip_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/sanitizers.h create mode 100644 third_party/jpeg-xl/lib/jxl/simd_util-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/simd_util_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/speed_tier_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/splines.cc create mode 100644 third_party/jpeg-xl/lib/jxl/splines.h create mode 100644 third_party/jpeg-xl/lib/jxl/splines_gbench.cc create mode 100644 third_party/jpeg-xl/lib/jxl/splines_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/test_image.cc create mode 100644 third_party/jpeg-xl/lib/jxl/test_image.h create mode 100644 third_party/jpeg-xl/lib/jxl/test_utils.cc create mode 100644 third_party/jpeg-xl/lib/jxl/test_utils.h create mode 100644 third_party/jpeg-xl/lib/jxl/testing.h create mode 100644 third_party/jpeg-xl/lib/jxl/tf_gbench.cc create mode 100644 third_party/jpeg-xl/lib/jxl/toc.cc create mode 100644 third_party/jpeg-xl/lib/jxl/toc.h create mode 100644 third_party/jpeg-xl/lib/jxl/toc_test.cc create mode 100644 third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/transpose-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/version.h.in create mode 100644 third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h create mode 100644 third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc (limited to 'third_party/jpeg-xl/lib/jxl') diff --git a/third_party/jpeg-xl/lib/jxl/ac_context.h b/third_party/jpeg-xl/lib/jxl/ac_context.h new file mode 100644 index 0000000000..a2b9e046d1 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/ac_context.h @@ -0,0 +1,149 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_AC_CONTEXT_H_ +#define LIB_JXL_AC_CONTEXT_H_ + +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" + +namespace jxl { + +// Block context used for scanning order, number of non-zeros, AC coefficients. +// Equal to the channel. +constexpr uint32_t kDCTOrderContextStart = 0; + +// The number of predicted nonzeros goes from 0 to 1008. We use +// ceil(log2(predicted+1)) as a context for the number of nonzeros, so from 0 to +// 10, inclusive. +constexpr uint32_t kNonZeroBuckets = 37; + +static const uint16_t kCoeffFreqContext[64] = { + 0xBAD, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, + 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, + 27, 27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, +}; + +static const uint16_t kCoeffNumNonzeroContext[64] = { + 0xBAD, 0, 31, 62, 62, 93, 93, 93, 93, 123, 123, 123, 123, + 152, 152, 152, 152, 152, 152, 152, 152, 180, 180, 180, 180, 180, + 180, 180, 180, 180, 180, 180, 180, 206, 206, 206, 206, 206, 206, + 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, + 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, +}; + +// Supremum of ZeroDensityContext(x, y) + 1, when x + y < 64. +constexpr int kZeroDensityContextCount = 458; +// Supremum of ZeroDensityContext(x, y) + 1. +constexpr int kZeroDensityContextLimit = 474; + +/* This function is used for entropy-sources pre-clustering. + * + * Ideally, each combination of |nonzeros_left| and |k| should go to its own + * bucket; but it implies (64 * 63 / 2) == 2016 buckets. If there is other + * dimension (e.g. block context), then number of primary clusters becomes too + * big. + * + * To solve this problem, |nonzeros_left| and |k| values are clustered. It is + * known that their sum is at most 64, consequently, the total number buckets + * is at most A(64) * B(64). + */ +// TODO(user): investigate, why disabling pre-clustering makes entropy code +// less dense. Perhaps we would need to add HQ clustering algorithm that would +// be able to squeeze better by spending more CPU cycles. +static JXL_INLINE size_t ZeroDensityContext(size_t nonzeros_left, size_t k, + size_t covered_blocks, + size_t log2_covered_blocks, + size_t prev) { + JXL_DASSERT((1u << log2_covered_blocks) == covered_blocks); + nonzeros_left = (nonzeros_left + covered_blocks - 1) >> log2_covered_blocks; + k >>= log2_covered_blocks; + JXL_DASSERT(k > 0); + JXL_DASSERT(k < 64); + JXL_DASSERT(nonzeros_left > 0); + // Asserting nonzeros_left + k < 65 here causes crashes in debug mode with + // invalid input, since the (hot) decoding loop does not check this condition. + // As no out-of-bound memory reads are issued even if that condition is + // broken, we check this simpler condition which holds anyway. The decoder + // will still mark a file in which that condition happens as not valid at the + // end of the decoding loop, as `nzeros` will not be `0`. + JXL_DASSERT(nonzeros_left < 64); + return (kCoeffNumNonzeroContext[nonzeros_left] + kCoeffFreqContext[k]) * 2 + + prev; +} + +struct BlockCtxMap { + std::vector dc_thresholds[3]; + std::vector qf_thresholds; + std::vector ctx_map; + size_t num_ctxs, num_dc_ctxs; + + static constexpr uint8_t kDefaultCtxMap[] = { + // Default ctx map clusters all the large transforms together. + 0, 1, 2, 2, 3, 3, 4, 5, 6, 6, 6, 6, 6, // + 7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14, // + 7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14, // + }; + static_assert(3 * kNumOrders == + sizeof(kDefaultCtxMap) / sizeof *kDefaultCtxMap, + "Update default context map"); + + size_t Context(int dc_idx, uint32_t qf, size_t ord, size_t c) const { + size_t qf_idx = 0; + for (uint32_t t : qf_thresholds) { + if (qf > t) qf_idx++; + } + size_t idx = c < 2 ? c ^ 1 : 2; + idx = idx * kNumOrders + ord; + idx = idx * (qf_thresholds.size() + 1) + qf_idx; + idx = idx * num_dc_ctxs + dc_idx; + return ctx_map[idx]; + } + // Non-zero context is based on number of non-zeros and block context. + // For better clustering, contexts with same number of non-zeros are grouped. + constexpr uint32_t ZeroDensityContextsOffset(uint32_t block_ctx) const { + return num_ctxs * kNonZeroBuckets + kZeroDensityContextCount * block_ctx; + } + + // Context map for AC coefficients consists of 2 blocks: + // |num_ctxs x : context for number of non-zeros in the block + // kNonZeroBuckets| computed from block context and predicted + // value (based top and left values) + // |num_ctxs x : context for AC coefficient symbols, + // kZeroDensityContextCount| computed from block context, + // number of non-zeros left and + // index in scan order + constexpr uint32_t NumACContexts() const { + return num_ctxs * (kNonZeroBuckets + kZeroDensityContextCount); + } + + // Non-zero context is based on number of non-zeros and block context. + // For better clustering, contexts with same number of non-zeros are grouped. + inline uint32_t NonZeroContext(uint32_t non_zeros, uint32_t block_ctx) const { + uint32_t ctx; + if (non_zeros >= 64) non_zeros = 64; + if (non_zeros < 8) { + ctx = non_zeros; + } else { + ctx = 4 + non_zeros / 2; + } + return ctx * num_ctxs + block_ctx; + } + + BlockCtxMap() { + ctx_map.assign(std::begin(kDefaultCtxMap), std::end(kDefaultCtxMap)); + num_ctxs = *std::max_element(ctx_map.begin(), ctx_map.end()) + 1; + num_dc_ctxs = 1; + } +}; + +} // namespace jxl + +#endif // LIB_JXL_AC_CONTEXT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy.cc b/third_party/jpeg-xl/lib/jxl/ac_strategy.cc new file mode 100644 index 0000000000..ada3bcb6f5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/ac_strategy.cc @@ -0,0 +1,108 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/ac_strategy.h" + +#include + +#include +#include // iota +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +// Tries to generalize zig-zag order to non-square blocks. Surprisingly, in +// square block frequency along the (i + j == const) diagonals is roughly the +// same. For historical reasons, consecutive diagonals are traversed +// in alternating directions - so called "zig-zag" (or "snake") order. +template +static void CoeffOrderAndLut(AcStrategy acs, coeff_order_t* out) { + size_t cx = acs.covered_blocks_x(); + size_t cy = acs.covered_blocks_y(); + CoefficientLayout(&cy, &cx); + + // CoefficientLayout ensures cx >= cy. + // We compute the zigzag order for a cx x cx block, then discard all the + // lines that are not multiple of the ratio between cx and cy. + size_t xs = cx / cy; + size_t xsm = xs - 1; + size_t xss = CeilLog2Nonzero(xs); + // First half of the block + size_t cur = cx * cy; + for (size_t i = 0; i < cx * kBlockDim; i++) { + for (size_t j = 0; j <= i; j++) { + size_t x = j; + size_t y = i - j; + if (i % 2) std::swap(x, y); + if ((y & xsm) != 0) continue; + y >>= xss; + size_t val = 0; + if (x < cx && y < cy) { + val = y * cx + x; + } else { + val = cur++; + } + if (is_lut) { + out[y * cx * kBlockDim + x] = val; + } else { + out[val] = y * cx * kBlockDim + x; + } + } + } + // Second half + for (size_t ip = cx * kBlockDim - 1; ip > 0; ip--) { + size_t i = ip - 1; + for (size_t j = 0; j <= i; j++) { + size_t x = cx * kBlockDim - 1 - (i - j); + size_t y = cx * kBlockDim - 1 - j; + if (i % 2) std::swap(x, y); + if ((y & xsm) != 0) continue; + y >>= xss; + size_t val = cur++; + if (is_lut) { + out[y * cx * kBlockDim + x] = val; + } else { + out[val] = y * cx * kBlockDim + x; + } + } + } +} + +void AcStrategy::ComputeNaturalCoeffOrder(coeff_order_t* order) const { + CoeffOrderAndLut(*this, order); +} +void AcStrategy::ComputeNaturalCoeffOrderLut(coeff_order_t* lut) const { + CoeffOrderAndLut(*this, lut); +} + +// These definitions are needed before C++17. +constexpr size_t AcStrategy::kMaxCoeffBlocks; +constexpr size_t AcStrategy::kMaxBlockDim; +constexpr size_t AcStrategy::kMaxCoeffArea; + +AcStrategyImage::AcStrategyImage(size_t xsize, size_t ysize) + : layers_(xsize, ysize) { + row_ = layers_.Row(0); + stride_ = layers_.PixelsPerRow(); +} + +size_t AcStrategyImage::CountBlocks(AcStrategy::Type type) const { + size_t ret = 0; + for (size_t y = 0; y < layers_.ysize(); y++) { + const uint8_t* JXL_RESTRICT row = layers_.ConstRow(y); + for (size_t x = 0; x < layers_.xsize(); x++) { + if (row[x] == ((static_cast(type) << 1) | 1)) ret++; + } + } + return ret; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy.h b/third_party/jpeg-xl/lib/jxl/ac_strategy.h new file mode 100644 index 0000000000..7d21167e6e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/ac_strategy.h @@ -0,0 +1,261 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_AC_STRATEGY_H_ +#define LIB_JXL_AC_STRATEGY_H_ + +#include +#include + +#include // kMaxVectorSize + +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image_ops.h" + +// Defines the different kinds of transforms, and heuristics to choose between +// them. +// `AcStrategy` represents what transform should be used, and which sub-block of +// that transform we are currently in. Note that DCT4x4 is applied on all four +// 4x4 sub-blocks of an 8x8 block. +// `AcStrategyImage` defines which strategy should be used for each 8x8 block +// of the image. The highest 4 bits represent the strategy to be used, the +// lowest 4 represent the index of the block inside that strategy. + +namespace jxl { + +class AcStrategy { + public: + // Extremal values for the number of blocks/coefficients of a single strategy. + static constexpr size_t kMaxCoeffBlocks = 32; + static constexpr size_t kMaxBlockDim = kBlockDim * kMaxCoeffBlocks; + // Maximum number of coefficients in a block. Guaranteed to be a multiple of + // the vector size. + static constexpr size_t kMaxCoeffArea = kMaxBlockDim * kMaxBlockDim; + static_assert((kMaxCoeffArea * sizeof(float)) % hwy::kMaxVectorSize == 0, + "Coefficient area is not a multiple of vector size"); + + // Raw strategy types. + enum Type : uint32_t { + // Regular block size DCT + DCT = 0, + // Encode pixels without transforming + IDENTITY = 1, + // Use 2-by-2 DCT + DCT2X2 = 2, + // Use 4-by-4 DCT + DCT4X4 = 3, + // Use 16-by-16 DCT + DCT16X16 = 4, + // Use 32-by-32 DCT + DCT32X32 = 5, + // Use 16-by-8 DCT + DCT16X8 = 6, + // Use 8-by-16 DCT + DCT8X16 = 7, + // Use 32-by-8 DCT + DCT32X8 = 8, + // Use 8-by-32 DCT + DCT8X32 = 9, + // Use 32-by-16 DCT + DCT32X16 = 10, + // Use 16-by-32 DCT + DCT16X32 = 11, + // 4x8 and 8x4 DCT + DCT4X8 = 12, + DCT8X4 = 13, + // Corner-DCT. + AFV0 = 14, + AFV1 = 15, + AFV2 = 16, + AFV3 = 17, + // Larger DCTs + DCT64X64 = 18, + DCT64X32 = 19, + DCT32X64 = 20, + DCT128X128 = 21, + DCT128X64 = 22, + DCT64X128 = 23, + DCT256X256 = 24, + DCT256X128 = 25, + DCT128X256 = 26, + // Marker for num of valid strategies. + kNumValidStrategies + }; + + static constexpr uint32_t TypeBit(const Type type) { + return 1u << static_cast(type); + } + + // Returns true if this block is the first 8x8 block (i.e. top-left) of a + // possibly multi-block strategy. + JXL_INLINE bool IsFirstBlock() const { return is_first_; } + + JXL_INLINE bool IsMultiblock() const { + constexpr uint32_t bits = + TypeBit(Type::DCT16X16) | TypeBit(Type::DCT32X32) | + TypeBit(Type::DCT16X8) | TypeBit(Type::DCT8X16) | + TypeBit(Type::DCT32X8) | TypeBit(Type::DCT8X32) | + TypeBit(Type::DCT16X32) | TypeBit(Type::DCT32X16) | + TypeBit(Type::DCT32X64) | TypeBit(Type::DCT64X32) | + TypeBit(Type::DCT64X64) | TypeBit(DCT64X128) | TypeBit(DCT128X64) | + TypeBit(DCT128X128) | TypeBit(DCT128X256) | TypeBit(DCT256X128) | + TypeBit(DCT256X256); + JXL_DASSERT(Strategy() < kNumValidStrategies); + return ((1u << static_cast(Strategy())) & bits) != 0; + } + + // Returns the raw strategy value. Should only be used for tokenization. + JXL_INLINE uint8_t RawStrategy() const { + return static_cast(strategy_); + } + + JXL_INLINE Type Strategy() const { return strategy_; } + + // Inverse check + static JXL_INLINE constexpr bool IsRawStrategyValid(int raw_strategy) { + return raw_strategy < static_cast(kNumValidStrategies) && + raw_strategy >= 0; + } + static JXL_INLINE AcStrategy FromRawStrategy(uint8_t raw_strategy) { + return FromRawStrategy(static_cast(raw_strategy)); + } + static JXL_INLINE AcStrategy FromRawStrategy(Type raw_strategy) { + JXL_DASSERT(IsRawStrategyValid(static_cast(raw_strategy))); + return AcStrategy(raw_strategy, /*is_first=*/true); + } + + // "Natural order" means the order of increasing of "anisotropic" frequency of + // continuous version of DCT basis. + // Round-trip, for any given strategy s: + // X = NaturalCoeffOrder(s)[NaturalCoeffOrderLutN(s)[X]] + // X = NaturalCoeffOrderLut(s)[NaturalCoeffOrderN(s)[X]] + void ComputeNaturalCoeffOrder(coeff_order_t* order) const; + void ComputeNaturalCoeffOrderLut(coeff_order_t* lut) const; + + // Number of 8x8 blocks that this strategy will cover. 0 for non-top-left + // blocks inside a multi-block transform. + JXL_INLINE size_t covered_blocks_x() const { + static constexpr uint8_t kLut[] = {1, 1, 1, 1, 2, 4, 1, 2, 1, + 4, 2, 4, 1, 1, 1, 1, 1, 1, + 8, 4, 8, 16, 8, 16, 32, 16, 32}; + static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies, + "Update LUT"); + return kLut[size_t(strategy_)]; + } + + JXL_INLINE size_t covered_blocks_y() const { + static constexpr uint8_t kLut[] = {1, 1, 1, 1, 2, 4, 2, 1, 4, + 1, 4, 2, 1, 1, 1, 1, 1, 1, + 8, 8, 4, 16, 16, 8, 32, 32, 16}; + static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies, + "Update LUT"); + return kLut[size_t(strategy_)]; + } + + JXL_INLINE size_t log2_covered_blocks() const { + static constexpr uint8_t kLut[] = {0, 0, 0, 0, 2, 4, 1, 1, 2, + 2, 3, 3, 0, 0, 0, 0, 0, 0, + 6, 5, 5, 8, 7, 7, 10, 9, 9}; + static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies, + "Update LUT"); + return kLut[size_t(strategy_)]; + } + + private: + friend class AcStrategyRow; + JXL_INLINE AcStrategy(Type strategy, bool is_first) + : strategy_(strategy), is_first_(is_first) { + JXL_DASSERT(IsMultiblock() || is_first == true); + } + + Type strategy_; + bool is_first_; +}; + +// Class to use a certain row of the AC strategy. +class AcStrategyRow { + public: + explicit AcStrategyRow(const uint8_t* row) : row_(row) {} + AcStrategy operator[](size_t x) const { + return AcStrategy(static_cast(row_[x] >> 1), row_[x] & 1); + } + + private: + const uint8_t* JXL_RESTRICT row_; +}; + +class AcStrategyImage { + public: + AcStrategyImage() = default; + AcStrategyImage(size_t xsize, size_t ysize); + AcStrategyImage(AcStrategyImage&&) = default; + AcStrategyImage& operator=(AcStrategyImage&&) = default; + + void FillDCT8(const Rect& rect) { + FillPlane((static_cast(AcStrategy::Type::DCT) << 1) | 1, + &layers_, rect); + } + void FillDCT8() { FillDCT8(Rect(layers_)); } + + void FillInvalid() { FillImage(INVALID, &layers_); } + + void Set(size_t x, size_t y, AcStrategy::Type type) { +#if JXL_ENABLE_ASSERT + AcStrategy acs = AcStrategy::FromRawStrategy(type); +#endif // JXL_ENABLE_ASSERT + JXL_ASSERT(y + acs.covered_blocks_y() <= layers_.ysize()); + JXL_ASSERT(x + acs.covered_blocks_x() <= layers_.xsize()); + JXL_CHECK(SetNoBoundsCheck(x, y, type, /*check=*/false)); + } + + Status SetNoBoundsCheck(size_t x, size_t y, AcStrategy::Type type, + bool check = true) { + AcStrategy acs = AcStrategy::FromRawStrategy(type); + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + size_t pos = (y + iy) * stride_ + x + ix; + if (check && row_[pos] != INVALID) { + return JXL_FAILURE("Invalid AC strategy: block overlap"); + } + row_[pos] = + (static_cast(type) << 1) | ((iy | ix) == 0 ? 1 : 0); + } + } + return true; + } + + bool IsValid(size_t x, size_t y) { return row_[y * stride_ + x] != INVALID; } + + AcStrategyRow ConstRow(size_t y, size_t x_prefix = 0) const { + return AcStrategyRow(layers_.ConstRow(y) + x_prefix); + } + + AcStrategyRow ConstRow(const Rect& rect, size_t y) const { + return ConstRow(rect.y0() + y, rect.x0()); + } + + size_t PixelsPerRow() const { return layers_.PixelsPerRow(); } + + size_t xsize() const { return layers_.xsize(); } + size_t ysize() const { return layers_.ysize(); } + + // Count the number of blocks of a given type. + size_t CountBlocks(AcStrategy::Type type) const; + + private: + ImageB layers_; + uint8_t* JXL_RESTRICT row_; + size_t stride_; + + // A value that does not represent a valid combined AC strategy + // value. Used as a sentinel. + static constexpr uint8_t INVALID = 0xFF; +}; + +} // namespace jxl + +#endif // LIB_JXL_AC_STRATEGY_H_ diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc b/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc new file mode 100644 index 0000000000..d366aa3f82 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc @@ -0,0 +1,237 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/ac_strategy.h" + +#include + +#include +#include +#include // HWY_ALIGN_MAX +#include +#include + +#include "lib/jxl/base/random.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dec_transforms_testonly.h" +#include "lib/jxl/enc_transforms.h" + +namespace jxl { +namespace { + +// Test that DCT -> IDCT is a noop. +class AcStrategyRoundtrip : public ::hwy::TestWithParamTargetAndT { + protected: + void Run() { + const AcStrategy::Type type = static_cast(GetParam()); + const AcStrategy acs = AcStrategy::FromRawStrategy(type); + + auto mem = hwy::AllocateAligned(4 * AcStrategy::kMaxCoeffArea); + float* scratch_space = mem.get(); + float* coeffs = scratch_space + AcStrategy::kMaxCoeffArea; + float* idct = coeffs + AcStrategy::kMaxCoeffArea; + Rng rng(type * 65537 + 13); + + for (size_t j = 0; j < 64; j++) { + size_t i = (acs.log2_covered_blocks() + ? rng.UniformU(0, 64u << acs.log2_covered_blocks()) + : j); + float* input = idct + AcStrategy::kMaxCoeffArea; + std::fill_n(input, AcStrategy::kMaxCoeffArea, 0); + input[i] = 0.2f; + TransformFromPixels(type, input, acs.covered_blocks_x() * 8, coeffs, + scratch_space); + ASSERT_NEAR(coeffs[0], 0.2 / (64 << acs.log2_covered_blocks()), 1e-6) + << " i = " << i; + TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8, + scratch_space); + for (size_t j = 0; j < 64u << acs.log2_covered_blocks(); j++) { + ASSERT_NEAR(idct[j], j == i ? 0.2f : 0, 2e-6) + << "j = " << j << " i = " << i << " acs " << type; + } + } + // Test DC. + std::fill_n(idct, AcStrategy::kMaxCoeffArea, 0); + for (size_t y = 0; y < acs.covered_blocks_y(); y++) { + for (size_t x = 0; x < acs.covered_blocks_x(); x++) { + float* dc = idct + AcStrategy::kMaxCoeffArea; + std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0); + dc[y * acs.covered_blocks_x() * 8 + x] = 0.2; + LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs); + DCFromLowestFrequencies(type, coeffs, idct, acs.covered_blocks_x() * 8); + std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0); + dc[y * acs.covered_blocks_x() * 8 + x] = 0.2; + for (size_t j = 0; j < 64u << acs.log2_covered_blocks(); j++) { + ASSERT_NEAR(idct[j], dc[j], 1e-6) + << "j = " << j << " x = " << x << " y = " << y << " acs " << type; + } + } + } + } +}; + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T( + AcStrategyRoundtrip, + ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies))); + +TEST_P(AcStrategyRoundtrip, Test) { Run(); } + +// Test that DC(2x2) -> DCT coefficients -> IDCT -> downsampled IDCT is a noop. +class AcStrategyRoundtripDownsample + : public ::hwy::TestWithParamTargetAndT { + protected: + void Run() { + const AcStrategy::Type type = static_cast(GetParam()); + const AcStrategy acs = AcStrategy::FromRawStrategy(type); + + auto mem = hwy::AllocateAligned(4 * AcStrategy::kMaxCoeffArea); + float* scratch_space = mem.get(); + float* coeffs = scratch_space + AcStrategy::kMaxCoeffArea; + std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f); + float* idct = coeffs + AcStrategy::kMaxCoeffArea; + Rng rng(type * 65537 + 13); + + for (size_t y = 0; y < acs.covered_blocks_y(); y++) { + for (size_t x = 0; x < acs.covered_blocks_x(); x++) { + if (x > 4 || y > 4) { + if (rng.Bernoulli(0.9f)) continue; + } + float* dc = idct + AcStrategy::kMaxCoeffArea; + std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0); + dc[y * acs.covered_blocks_x() * 8 + x] = 0.2f; + LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs); + TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8, + scratch_space); + std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f); + std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0); + dc[y * acs.covered_blocks_x() * 8 + x] = 0.2f; + // Downsample + for (size_t dy = 0; dy < acs.covered_blocks_y(); dy++) { + for (size_t dx = 0; dx < acs.covered_blocks_x(); dx++) { + float sum = 0; + for (size_t iy = 0; iy < 8; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + sum += idct[(dy * 8 + iy) * 8 * acs.covered_blocks_x() + + dx * 8 + ix]; + } + } + sum /= 64.0f; + ASSERT_NEAR(sum, dc[dy * 8 * acs.covered_blocks_x() + dx], 1e-6) + << "acs " << type; + } + } + } + } + } +}; + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T( + AcStrategyRoundtripDownsample, + ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies))); + +TEST_P(AcStrategyRoundtripDownsample, Test) { Run(); } + +// Test that IDCT(block with zeros in the non-topleft corner) -> downsampled +// IDCT is the same as IDCT -> DC(2x2) of the same block. +class AcStrategyDownsample : public ::hwy::TestWithParamTargetAndT { + protected: + void Run() { + const AcStrategy::Type type = static_cast(GetParam()); + const AcStrategy acs = AcStrategy::FromRawStrategy(type); + size_t cx = acs.covered_blocks_y(); + size_t cy = acs.covered_blocks_x(); + CoefficientLayout(&cy, &cx); + + auto mem = hwy::AllocateAligned(4 * AcStrategy::kMaxCoeffArea); + float* scratch_space = mem.get(); + float* idct = scratch_space + AcStrategy::kMaxCoeffArea; + float* idct_acs_downsampled = idct + AcStrategy::kMaxCoeffArea; + Rng rng(type * 65537 + 13); + + for (size_t y = 0; y < cy; y++) { + for (size_t x = 0; x < cx; x++) { + if (x > 4 || y > 4) { + if (rng.Bernoulli(0.9f)) continue; + } + float* coeffs = idct + AcStrategy::kMaxCoeffArea; + std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0); + coeffs[y * cx * 8 + x] = 0.2f; + TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8, + scratch_space); + std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0); + coeffs[y * cx * 8 + x] = 0.2f; + DCFromLowestFrequencies(type, coeffs, idct_acs_downsampled, + acs.covered_blocks_x() * 8); + // Downsample + for (size_t dy = 0; dy < acs.covered_blocks_y(); dy++) { + for (size_t dx = 0; dx < acs.covered_blocks_x(); dx++) { + float sum = 0; + for (size_t iy = 0; iy < 8; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + sum += idct[(dy * 8 + iy) * 8 * acs.covered_blocks_x() + + dx * 8 + ix]; + } + } + sum /= 64; + ASSERT_NEAR( + sum, idct_acs_downsampled[dy * 8 * acs.covered_blocks_x() + dx], + 1e-6) + << " acs " << type; + } + } + } + } + } +}; + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T( + AcStrategyDownsample, + ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies))); + +TEST_P(AcStrategyDownsample, Test) { Run(); } + +class AcStrategyTargetTest : public ::hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(AcStrategyTargetTest); + +TEST_P(AcStrategyTargetTest, RoundtripAFVDCT) { + HWY_ALIGN_MAX float idct[16]; + for (size_t i = 0; i < 16; i++) { + HWY_ALIGN_MAX float pixels[16] = {}; + pixels[i] = 1; + HWY_ALIGN_MAX float coeffs[16] = {}; + + AFVDCT4x4(pixels, coeffs); + AFVIDCT4x4(coeffs, idct); + for (size_t j = 0; j < 16; j++) { + EXPECT_NEAR(idct[j], pixels[j], 1e-6); + } + } +} + +TEST_P(AcStrategyTargetTest, BenchmarkAFV) { + const AcStrategy::Type type = AcStrategy::Type::AFV0; + HWY_ALIGN_MAX float pixels[64] = {1}; + HWY_ALIGN_MAX float coeffs[64] = {}; + HWY_ALIGN_MAX float scratch_space[64] = {}; + for (size_t i = 0; i < 1 << 14; i++) { + TransformToPixels(type, coeffs, pixels, 8, scratch_space); + TransformFromPixels(type, pixels, 8, coeffs, scratch_space); + } + EXPECT_NEAR(pixels[0], 0.0, 1E-6); +} + +TEST_P(AcStrategyTargetTest, BenchmarkAFVDCT) { + HWY_ALIGN_MAX float pixels[64] = {1}; + HWY_ALIGN_MAX float coeffs[64] = {}; + for (size_t i = 0; i < 1 << 14; i++) { + AFVDCT4x4(pixels, coeffs); + AFVIDCT4x4(coeffs, pixels); + } + EXPECT_NEAR(pixels[0], 1.0, 1E-6); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/alpha.cc b/third_party/jpeg-xl/lib/jxl/alpha.cc new file mode 100644 index 0000000000..48d7e7ee92 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/alpha.cc @@ -0,0 +1,115 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/alpha.h" + +#include + +#include + +namespace jxl { + +static float Clamp(float x) { return std::max(std::min(1.0f, x), 0.0f); } + +void PerformAlphaBlending(const AlphaBlendingInputLayer& bg, + const AlphaBlendingInputLayer& fg, + const AlphaBlendingOutput& out, size_t num_pixels, + bool alpha_is_premultiplied, bool clamp) { + if (alpha_is_premultiplied) { + for (size_t x = 0; x < num_pixels; ++x) { + float fga = clamp ? Clamp(fg.a[x]) : fg.a[x]; + out.r[x] = (fg.r[x] + bg.r[x] * (1.f - fga)); + out.g[x] = (fg.g[x] + bg.g[x] * (1.f - fga)); + out.b[x] = (fg.b[x] + bg.b[x] * (1.f - fga)); + out.a[x] = (1.f - (1.f - fga) * (1.f - bg.a[x])); + } + } else { + for (size_t x = 0; x < num_pixels; ++x) { + float fga = clamp ? Clamp(fg.a[x]) : fg.a[x]; + const float new_a = 1.f - (1.f - fga) * (1.f - bg.a[x]); + const float rnew_a = (new_a > 0 ? 1.f / new_a : 0.f); + out.r[x] = (fg.r[x] * fga + bg.r[x] * bg.a[x] * (1.f - fga)) * rnew_a; + out.g[x] = (fg.g[x] * fga + bg.g[x] * bg.a[x] * (1.f - fga)) * rnew_a; + out.b[x] = (fg.b[x] * fga + bg.b[x] * bg.a[x] * (1.f - fga)) * rnew_a; + out.a[x] = new_a; + } + } +} +void PerformAlphaBlending(const float* bg, const float* bga, const float* fg, + const float* fga, float* out, size_t num_pixels, + bool alpha_is_premultiplied, bool clamp) { + if (bg == bga && fg == fga) { + for (size_t x = 0; x < num_pixels; ++x) { + float fa = clamp ? fga[x] : Clamp(fga[x]); + out[x] = (1.f - (1.f - fa) * (1.f - bga[x])); + } + } else { + if (alpha_is_premultiplied) { + for (size_t x = 0; x < num_pixels; ++x) { + float fa = clamp ? fga[x] : Clamp(fga[x]); + out[x] = (fg[x] + bg[x] * (1.f - fa)); + } + } else { + for (size_t x = 0; x < num_pixels; ++x) { + float fa = clamp ? fga[x] : Clamp(fga[x]); + const float new_a = 1.f - (1.f - fa) * (1.f - bga[x]); + const float rnew_a = (new_a > 0 ? 1.f / new_a : 0.f); + out[x] = (fg[x] * fa + bg[x] * bga[x] * (1.f - fa)) * rnew_a; + } + } + } +} + +void PerformAlphaWeightedAdd(const float* bg, const float* fg, const float* fga, + float* out, size_t num_pixels, bool clamp) { + if (fg == fga) { + memcpy(out, bg, num_pixels * sizeof(*out)); + } else if (clamp) { + for (size_t x = 0; x < num_pixels; ++x) { + out[x] = bg[x] + fg[x] * Clamp(fga[x]); + } + } else { + for (size_t x = 0; x < num_pixels; ++x) { + out[x] = bg[x] + fg[x] * fga[x]; + } + } +} + +void PerformMulBlending(const float* bg, const float* fg, float* out, + size_t num_pixels, bool clamp) { + if (clamp) { + for (size_t x = 0; x < num_pixels; ++x) { + out[x] = bg[x] * Clamp(fg[x]); + } + } else { + for (size_t x = 0; x < num_pixels; ++x) { + out[x] = bg[x] * fg[x]; + } + } +} + +void PremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g, + float* JXL_RESTRICT b, const float* JXL_RESTRICT a, + size_t num_pixels) { + for (size_t x = 0; x < num_pixels; ++x) { + const float multiplier = std::max(kSmallAlpha, a[x]); + r[x] *= multiplier; + g[x] *= multiplier; + b[x] *= multiplier; + } +} + +void UnpremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g, + float* JXL_RESTRICT b, const float* JXL_RESTRICT a, + size_t num_pixels) { + for (size_t x = 0; x < num_pixels; ++x) { + const float multiplier = 1.f / std::max(kSmallAlpha, a[x]); + r[x] *= multiplier; + g[x] *= multiplier; + b[x] *= multiplier; + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/alpha.h b/third_party/jpeg-xl/lib/jxl/alpha.h new file mode 100644 index 0000000000..efb76c800f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/alpha.h @@ -0,0 +1,66 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ALPHA_H_ +#define LIB_JXL_ALPHA_H_ + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +// A very small value to avoid divisions by zero when converting to +// unpremultiplied alpha. Page 21 of the technical introduction to OpenEXR +// (https://www.openexr.com/documentation/TechnicalIntroduction.pdf) recommends +// "a power of two" that is "less than half of the smallest positive 16-bit +// floating-point value". That smallest value happens to be the denormal number +// 2^-24, so 2^-26 should be a good choice. +static constexpr float kSmallAlpha = 1.f / (1u << 26u); + +struct AlphaBlendingInputLayer { + const float* r; + const float* g; + const float* b; + const float* a; +}; + +struct AlphaBlendingOutput { + float* r; + float* g; + float* b; + float* a; +}; + +// Note: The pointers in `out` are allowed to alias those in `bg` or `fg`. +// No pointer shall be null. +void PerformAlphaBlending(const AlphaBlendingInputLayer& bg, + const AlphaBlendingInputLayer& fg, + const AlphaBlendingOutput& out, size_t num_pixels, + bool alpha_is_premultiplied, bool clamp); +// Single plane alpha blending +void PerformAlphaBlending(const float* bg, const float* bga, const float* fg, + const float* fga, float* out, size_t num_pixels, + bool alpha_is_premultiplied, bool clamp); + +void PerformAlphaWeightedAdd(const float* bg, const float* fg, const float* fga, + float* out, size_t num_pixels, bool clamp); + +void PerformMulBlending(const float* bg, const float* fg, float* out, + size_t num_pixels, bool clamp); + +void PremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g, + float* JXL_RESTRICT b, const float* JXL_RESTRICT a, + size_t num_pixels); +void UnpremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g, + float* JXL_RESTRICT b, const float* JXL_RESTRICT a, + size_t num_pixels); + +} // namespace jxl + +#endif // LIB_JXL_ALPHA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/alpha_test.cc b/third_party/jpeg-xl/lib/jxl/alpha_test.cc new file mode 100644 index 0000000000..ddafd829ec --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/alpha_test.cc @@ -0,0 +1,134 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/alpha.h" + +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +using ::testing::_; +using ::testing::ElementsAre; +using ::testing::FloatNear; + +TEST(AlphaTest, BlendingWithNonPremultiplied) { + const float bg_rgb[3] = {100, 110, 120}; + const float bg_a = 180.f / 255; + const float fg_rgb[3] = {25, 21, 23}; + const float fg_a = 15420.f / 65535; + const float fg_a2 = 2.0f; + float out_rgb[3]; + float out_a; + PerformAlphaBlending( + /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a}, + /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a}, + /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1, + /*alpha_is_premultiplied=*/false, /*clamp=*/false); + EXPECT_THAT(out_rgb, + ElementsAre(FloatNear(77.2f, .05f), FloatNear(83.0f, .05f), + FloatNear(90.6f, .05f))); + EXPECT_NEAR(out_a, 3174.f / 4095, 1e-5); + PerformAlphaBlending( + /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a}, + /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a2}, + /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1, + /*alpha_is_premultiplied=*/false, /*clamp=*/true); + EXPECT_THAT(out_rgb, ElementsAre(FloatNear(fg_rgb[0], .05f), + FloatNear(fg_rgb[1], .05f), + FloatNear(fg_rgb[2], .05f))); + EXPECT_NEAR(out_a, 1.0f, 1e-5); +} + +TEST(AlphaTest, BlendingWithPremultiplied) { + const float bg_rgb[3] = {100, 110, 120}; + const float bg_a = 180.f / 255; + const float fg_rgb[3] = {25, 21, 23}; + const float fg_a = 15420.f / 65535; + const float fg_a2 = 2.0f; + float out_rgb[3]; + float out_a; + PerformAlphaBlending( + /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a}, + /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a}, + /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1, + /*alpha_is_premultiplied=*/true, /*clamp=*/false); + EXPECT_THAT(out_rgb, + ElementsAre(FloatNear(101.5f, .05f), FloatNear(105.1f, .05f), + FloatNear(114.8f, .05f))); + EXPECT_NEAR(out_a, 3174.f / 4095, 1e-5); + PerformAlphaBlending( + /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a}, + /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a2}, + /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1, + /*alpha_is_premultiplied=*/true, /*clamp=*/true); + EXPECT_THAT(out_rgb, ElementsAre(FloatNear(fg_rgb[0], .05f), + FloatNear(fg_rgb[1], .05f), + FloatNear(fg_rgb[2], .05f))); + EXPECT_NEAR(out_a, 1.0f, 1e-5); +} + +TEST(AlphaTest, Mul) { + const float bg = 100; + const float fg = 25; + float out; + PerformMulBlending(&bg, &fg, &out, 1, /*clamp=*/false); + EXPECT_THAT(out, FloatNear(fg * bg, .05f)); + PerformMulBlending(&bg, &fg, &out, 1, /*clamp=*/true); + EXPECT_THAT(out, FloatNear(bg, .05f)); +} + +TEST(AlphaTest, PremultiplyAndUnpremultiply) { + const float alpha[] = {0.f, 63.f / 255, 127.f / 255, 1.f}; + float r[] = {120, 130, 140, 150}; + float g[] = {124, 134, 144, 154}; + float b[] = {127, 137, 147, 157}; + + PremultiplyAlpha(r, g, b, alpha, 4); + EXPECT_THAT( + r, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(130 * 63.f / 255, 1e-5f), + FloatNear(140 * 127.f / 255, 1e-5f), 150)); + EXPECT_THAT( + g, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(134 * 63.f / 255, 1e-5f), + FloatNear(144 * 127.f / 255, 1e-5f), 154)); + EXPECT_THAT( + b, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(137 * 63.f / 255, 1e-5f), + FloatNear(147 * 127.f / 255, 1e-5f), 157)); + + UnpremultiplyAlpha(r, g, b, alpha, 4); + EXPECT_THAT(r, ElementsAre(FloatNear(120, 1e-4f), FloatNear(130, 1e-4f), + FloatNear(140, 1e-4f), FloatNear(150, 1e-4f))); + EXPECT_THAT(g, ElementsAre(FloatNear(124, 1e-4f), FloatNear(134, 1e-4f), + FloatNear(144, 1e-4f), FloatNear(154, 1e-4f))); + EXPECT_THAT(b, ElementsAre(FloatNear(127, 1e-4f), FloatNear(137, 1e-4f), + FloatNear(147, 1e-4f), FloatNear(157, 1e-4f))); +} + +TEST(AlphaTest, UnpremultiplyAndPremultiply) { + const float alpha[] = {0.f, 63.f / 255, 127.f / 255, 1.f}; + float r[] = {50, 60, 70, 80}; + float g[] = {54, 64, 74, 84}; + float b[] = {57, 67, 77, 87}; + + UnpremultiplyAlpha(r, g, b, alpha, 4); + EXPECT_THAT(r, ElementsAre(_, FloatNear(60 * 255.f / 63, 1e-4f), + FloatNear(70 * 255.f / 127, 1e-4f), 80)); + EXPECT_THAT(g, ElementsAre(_, FloatNear(64 * 255.f / 63, 1e-4f), + FloatNear(74 * 255.f / 127, 1e-4f), 84)); + EXPECT_THAT(b, ElementsAre(_, FloatNear(67 * 255.f / 63, 1e-4f), + FloatNear(77 * 255.f / 127, 1e-4f), 87)); + + PremultiplyAlpha(r, g, b, alpha, 4); + EXPECT_THAT(r, ElementsAre(FloatNear(50, 1e-4f), FloatNear(60, 1e-4f), + FloatNear(70, 1e-4f), FloatNear(80, 1e-4f))); + EXPECT_THAT(g, ElementsAre(FloatNear(54, 1e-4f), FloatNear(64, 1e-4f), + FloatNear(74, 1e-4f), FloatNear(84, 1e-4f))); + EXPECT_THAT(b, ElementsAre(FloatNear(57, 1e-4f), FloatNear(67, 1e-4f), + FloatNear(77, 1e-4f), FloatNear(87, 1e-4f))); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/ans_common.cc b/third_party/jpeg-xl/lib/jxl/ans_common.cc new file mode 100644 index 0000000000..d2cf897ec4 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/ans_common.cc @@ -0,0 +1,148 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/ans_common.h" + +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +std::vector CreateFlatHistogram(int length, int total_count) { + JXL_ASSERT(length > 0); + JXL_ASSERT(length <= total_count); + const int count = total_count / length; + std::vector result(length, count); + const int rem_counts = total_count % length; + for (int i = 0; i < rem_counts; ++i) { + ++result[i]; + } + return result; +} + +// First, all trailing non-occurring symbols are removed from the distribution; +// if this leaves the distribution empty, a dummy symbol with max weight is +// added. This ensures that the resulting distribution sums to total table size. +// Then, `entry_size` is chosen to be the largest power of two so that +// `table_size` = ANS_TAB_SIZE/`entry_size` is at least as big as the +// distribution size. +// Note that each entry will only ever contain two different symbols, and +// consecutive ranges of offsets, which allows us to use a compact +// representation. +// Each entry is initialized with only the (symbol=i, offset) pairs; then +// positions for which the entry overflows (i.e. distribution[i] > entry_size) +// or is not full are computed, and put into a stack in increasing order. +// Missing symbols in the distribution are padded with 0 (because `table_size` +// >= number of symbols). The `cutoff` value for each entry is initialized to +// the number of occupied slots in that entry (i.e. `distributions[i]`). While +// the overflowing-symbol stack is not empty (which implies that the +// underflowing-symbol stack also is not), the top overfull and underfull +// positions are popped from the stack; the empty slots in the underfull entry +// are then filled with as many slots as needed from the overfull entry; such +// slots are placed after the slots in the overfull entry, and `offsets[1]` is +// computed accordingly. The formerly underfull entry is thus now neither +// underfull nor overfull, and represents exactly two symbols. The overfull +// entry might be either overfull or underfull, and is pushed into the +// corresponding stack. +void InitAliasTable(std::vector distribution, uint32_t range, + size_t log_alpha_size, AliasTable::Entry* JXL_RESTRICT a) { + while (!distribution.empty() && distribution.back() == 0) { + distribution.pop_back(); + } + // Ensure that a valid table is always returned, even for an empty + // alphabet. Otherwise, a specially-crafted stream might crash the + // decoder. + if (distribution.empty()) { + distribution.emplace_back(range); + } + const size_t table_size = 1 << log_alpha_size; +#if JXL_ENABLE_ASSERT + int sum = std::accumulate(distribution.begin(), distribution.end(), 0); +#endif // JXL_ENABLE_ASSERT + JXL_ASSERT(static_cast(sum) == range); + // range must be a power of two + JXL_ASSERT((range & (range - 1)) == 0); + JXL_ASSERT(distribution.size() <= table_size); + JXL_ASSERT(table_size <= range); + const uint32_t entry_size = range >> log_alpha_size; // this is exact + // Special case for single-symbol distributions, that ensures that the state + // does not change when decoding from such a distribution. Note that, since we + // hardcode offset0 == 0, it is not straightforward (if at all possible) to + // fix the general case to produce this result. + for (size_t sym = 0; sym < distribution.size(); sym++) { + if (distribution[sym] == ANS_TAB_SIZE) { + for (size_t i = 0; i < table_size; i++) { + a[i].right_value = sym; + a[i].cutoff = 0; + a[i].offsets1 = entry_size * i; + a[i].freq0 = 0; + a[i].freq1_xor_freq0 = ANS_TAB_SIZE; + } + return; + } + } + std::vector underfull_posn; + std::vector overfull_posn; + std::vector cutoffs(1 << log_alpha_size); + // Initialize entries. + for (size_t i = 0; i < distribution.size(); i++) { + cutoffs[i] = distribution[i]; + if (cutoffs[i] > entry_size) { + overfull_posn.push_back(i); + } else if (cutoffs[i] < entry_size) { + underfull_posn.push_back(i); + } + } + for (uint32_t i = distribution.size(); i < table_size; i++) { + cutoffs[i] = 0; + underfull_posn.push_back(i); + } + // Reassign overflow/underflow values. + while (!overfull_posn.empty()) { + uint32_t overfull_i = overfull_posn.back(); + overfull_posn.pop_back(); + JXL_ASSERT(!underfull_posn.empty()); + uint32_t underfull_i = underfull_posn.back(); + underfull_posn.pop_back(); + uint32_t underfull_by = entry_size - cutoffs[underfull_i]; + cutoffs[overfull_i] -= underfull_by; + // overfull positions have their original symbols + a[underfull_i].right_value = overfull_i; + a[underfull_i].offsets1 = cutoffs[overfull_i]; + // Slots in the right part of entry underfull_i were taken from the end + // of the symbols in entry overfull_i. + if (cutoffs[overfull_i] < entry_size) { + underfull_posn.push_back(overfull_i); + } else if (cutoffs[overfull_i] > entry_size) { + overfull_posn.push_back(overfull_i); + } + } + for (uint32_t i = 0; i < table_size; i++) { + // cutoffs[i] is properly initialized but the clang-analyzer doesn't infer + // it since it is partially initialized across two for-loops. + // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult) + if (cutoffs[i] == entry_size) { + a[i].right_value = i; + a[i].offsets1 = 0; + a[i].cutoff = 0; + } else { + // Note that, if cutoff is not equal to entry_size, + // a[i].offsets1 was initialized with (overfull cutoff) - + // (entry_size - a[i].cutoff). Thus, subtracting + // a[i].cutoff cannot make it negative. + a[i].offsets1 -= cutoffs[i]; + a[i].cutoff = cutoffs[i]; + } + const size_t freq0 = i < distribution.size() ? distribution[i] : 0; + const size_t i1 = a[i].right_value; + const size_t freq1 = i1 < distribution.size() ? distribution[i1] : 0; + a[i].freq0 = static_cast(freq0); + a[i].freq1_xor_freq0 = static_cast(freq1 ^ freq0); + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/ans_common.h b/third_party/jpeg-xl/lib/jxl/ans_common.h new file mode 100644 index 0000000000..fb5058e310 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/ans_common.h @@ -0,0 +1,143 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ANS_COMMON_H_ +#define LIB_JXL_ANS_COMMON_H_ + +#include + +#include +#include // Prefetch +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Returns the precision (number of bits) that should be used to store +// a histogram count such that Log2Floor(count) == logcount. +static JXL_INLINE uint32_t GetPopulationCountPrecision(uint32_t logcount, + uint32_t shift) { + int32_t r = std::min( + logcount, int(shift) - int((ANS_LOG_TAB_SIZE - logcount) >> 1)); + if (r < 0) return 0; + return r; +} + +// Returns a histogram where the counts are positive, differ by at most 1, +// and add up to total_count. The bigger counts (if any) are at the beginning +// of the histogram. +std::vector CreateFlatHistogram(int length, int total_count); + +// An alias table implements a mapping from the [0, ANS_TAB_SIZE) range into +// the [0, ANS_MAX_ALPHABET_SIZE) range, satisfying the following conditions: +// - each symbol occurs as many times as specified by any valid distribution +// of frequencies of the symbols. A valid distribution here is an array of +// ANS_MAX_ALPHABET_SIZE that contains numbers in the range [0, ANS_TAB_SIZE], +// and whose sum is ANS_TAB_SIZE. +// - lookups can be done in constant time, and also return how many smaller +// input values map into the same symbol, according to some well-defined order +// of input values. +// - the space used by the alias table is given by a small constant times the +// index of the largest symbol with nonzero probability in the distribution. +// Each of the entries in the table covers a range of `entry_size` values in the +// [0, ANS_TAB_SIZE) range; consecutive entries represent consecutive +// sub-ranges. In the range covered by entry `i`, the first `cutoff` values map +// to symbol `i`, while the others map to symbol `right_value`. +// +// TODO(veluca): consider making the order used for computing offsets easier to +// define - it is currently defined by the algorithm to compute the alias table. +// Beware of breaking the implicit assumption that symbols that come after the +// cutoff value should have an offset at least as big as the cutoff. + +struct AliasTable { + struct Symbol { + size_t value; + size_t offset; + size_t freq; + }; + +// Working set size matters here (~64 tables x 256 entries). +// offsets0 is always zero (beginning of [0] side among the same symbol). +// offsets1 is an offset of (pos >= cutoff) side decremented by cutoff. +#pragma pack(push, 1) + struct Entry { + uint8_t cutoff; // < kEntrySizeMinus1 when used by ANS. + uint8_t right_value; // < alphabet size. + uint16_t freq0; + + // Only used if `greater` (see Lookup) + uint16_t offsets1; // <= ANS_TAB_SIZE + uint16_t freq1_xor_freq0; // for branchless ternary in Lookup + }; +#pragma pack(pop) + + // Dividing `value` by `entry_size` determines `i`, the entry which is + // responsible for the input. If the remainder is below `cutoff`, then the + // mapped symbol is `i`; since `offsets[0]` stores the number of occurrences + // of `i` "before" the start of this entry, the offset of the input will be + // `offsets[0] + remainder`. If the remainder is above cutoff, the mapped + // symbol is `right_value`; since `offsets[1]` stores the number of + // occurrences of `right_value` "before" this entry, minus the `cutoff` value, + // the input offset is then `remainder + offsets[1]`. + static JXL_INLINE Symbol Lookup(const Entry* JXL_RESTRICT table, size_t value, + size_t log_entry_size, + size_t entry_size_minus_1) { + const size_t i = value >> log_entry_size; + const size_t pos = value & entry_size_minus_1; + +#if JXL_BYTE_ORDER_LITTLE + uint64_t entry; + memcpy(&entry, &table[i].cutoff, sizeof(entry)); + const size_t cutoff = entry & 0xFF; // = MOVZX + const size_t right_value = (entry >> 8) & 0xFF; // = MOVZX + const size_t freq0 = (entry >> 16) & 0xFFFF; +#else + // Generates multiple loads with complex addressing. + const size_t cutoff = table[i].cutoff; + const size_t right_value = table[i].right_value; + const size_t freq0 = table[i].freq0; +#endif + + const bool greater = pos >= cutoff; + +#if JXL_BYTE_ORDER_LITTLE + const uint64_t conditional = greater ? entry : 0; // = CMOV + const size_t offsets1_or_0 = (conditional >> 32) & 0xFFFF; + const size_t freq1_xor_freq0_or_0 = conditional >> 48; +#else + const size_t offsets1_or_0 = greater ? table[i].offsets1 : 0; + const size_t freq1_xor_freq0_or_0 = greater ? table[i].freq1_xor_freq0 : 0; +#endif + + // WARNING: moving this code may interfere with CMOV heuristics. + Symbol s; + s.value = greater ? right_value : i; + s.offset = offsets1_or_0 + pos; + s.freq = freq0 ^ freq1_xor_freq0_or_0; // = greater ? freq1 : freq0 + // XOR avoids implementation-defined conversion from unsigned to signed. + // Alternatives considered: BEXTR is 2 cycles on HSW, SET+shift causes + // spills, simple ternary has a long dependency chain. + + return s; + } + + static HWY_INLINE void Prefetch(const Entry* JXL_RESTRICT table, size_t value, + size_t log_entry_size) { + const size_t i = value >> log_entry_size; + hwy::Prefetch(table + i); + } +}; + +// Computes an alias table for a given distribution. +void InitAliasTable(std::vector distribution, uint32_t range, + size_t log_alpha_size, AliasTable::Entry* JXL_RESTRICT a); + +} // namespace jxl + +#endif // LIB_JXL_ANS_COMMON_H_ diff --git a/third_party/jpeg-xl/lib/jxl/ans_common_test.cc b/third_party/jpeg-xl/lib/jxl/ans_common_test.cc new file mode 100644 index 0000000000..487b6cf5bd --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/ans_common_test.cc @@ -0,0 +1,43 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/ans_common.h" + +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +void VerifyAliasDistribution(const std::vector& distribution, + uint32_t range) { + constexpr size_t log_alpha_size = 8; + AliasTable::Entry table[1 << log_alpha_size]; + InitAliasTable(distribution, range, log_alpha_size, table); + std::vector> offsets(distribution.size()); + for (uint32_t i = 0; i < range; i++) { + AliasTable::Symbol s = AliasTable::Lookup( + table, i, ANS_LOG_TAB_SIZE - 8, (1 << (ANS_LOG_TAB_SIZE - 8)) - 1); + offsets[s.value].push_back(s.offset); + } + for (uint32_t i = 0; i < distribution.size(); i++) { + ASSERT_EQ(static_cast(distribution[i]), offsets[i].size()); + std::sort(offsets[i].begin(), offsets[i].end()); + for (uint32_t j = 0; j < offsets[i].size(); j++) { + ASSERT_EQ(offsets[i][j], j); + } + } +} + +TEST(ANSCommonTest, AliasDistributionSmoke) { + VerifyAliasDistribution({ANS_TAB_SIZE / 2, ANS_TAB_SIZE / 2}, ANS_TAB_SIZE); + VerifyAliasDistribution({ANS_TAB_SIZE}, ANS_TAB_SIZE); + VerifyAliasDistribution({0, 0, 0, ANS_TAB_SIZE, 0}, ANS_TAB_SIZE); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/ans_params.h b/third_party/jpeg-xl/lib/jxl/ans_params.h new file mode 100644 index 0000000000..4bbc284c0b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/ans_params.h @@ -0,0 +1,36 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ANS_PARAMS_H_ +#define LIB_JXL_ANS_PARAMS_H_ + +// Common parameters that are needed for both the ANS entropy encoding and +// decoding methods. + +#include +#include + +namespace jxl { + +// TODO(veluca): decide if 12 is the best constant here (valid range is up to +// 16). This requires recomputing the Huffman tables in {enc,dec}_ans.cc +// 14 gives a 0.2% improvement at d1 and makes d8 slightly worse. This is +// likely not worth the increase in encoder complexity. +#define ANS_LOG_TAB_SIZE 12u +#define ANS_TAB_SIZE (1 << ANS_LOG_TAB_SIZE) +#define ANS_TAB_MASK (ANS_TAB_SIZE - 1) + +// Largest possible symbol to be encoded by either ANS or prefix coding. +#define PREFIX_MAX_ALPHABET_SIZE 4096 +#define ANS_MAX_ALPHABET_SIZE 256 + +// Max number of bits for prefix coding. +#define PREFIX_MAX_BITS 15 + +#define ANS_SIGNATURE 0x13 // Initial state, used as CRC. + +} // namespace jxl + +#endif // LIB_JXL_ANS_PARAMS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/ans_test.cc b/third_party/jpeg-xl/lib/jxl/ans_test.cc new file mode 100644 index 0000000000..06bc46477f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/ans_test.cc @@ -0,0 +1,278 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/random.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +void RoundtripTestcase(int n_histograms, int alphabet_size, + const std::vector& input_values) { + constexpr uint16_t kMagic1 = 0x9e33; + constexpr uint16_t kMagic2 = 0x8b04; + + BitWriter writer; + // Space for magic bytes. + BitWriter::Allotment allotment_magic1(&writer, 16); + writer.Write(16, kMagic1); + allotment_magic1.ReclaimAndCharge(&writer, 0, nullptr); + + std::vector context_map; + EntropyEncodingData codes; + std::vector> input_values_vec; + input_values_vec.push_back(input_values); + + BuildAndEncodeHistograms(HistogramParams(), n_histograms, input_values_vec, + &codes, &context_map, &writer, 0, nullptr); + WriteTokens(input_values_vec[0], codes, context_map, &writer, 0, nullptr); + + // Magic bytes + padding + BitWriter::Allotment allotment_magic2(&writer, 24); + writer.Write(16, kMagic2); + writer.ZeroPadToByte(); + allotment_magic2.ReclaimAndCharge(&writer, 0, nullptr); + + // We do not truncate the output. Reading past the end reads out zeroes + // anyway. + BitReader br(writer.GetSpan()); + + ASSERT_EQ(br.ReadBits(16), kMagic1); + + std::vector dec_context_map; + ANSCode decoded_codes; + ASSERT_TRUE( + DecodeHistograms(&br, n_histograms, &decoded_codes, &dec_context_map)); + ASSERT_EQ(dec_context_map, context_map); + ANSSymbolReader reader(&decoded_codes, &br); + + for (const Token& symbol : input_values) { + uint32_t read_symbol = + reader.ReadHybridUint(symbol.context, &br, dec_context_map); + ASSERT_EQ(read_symbol, symbol.value); + } + ASSERT_TRUE(reader.CheckANSFinalState()); + + ASSERT_EQ(br.ReadBits(16), kMagic2); + EXPECT_TRUE(br.Close()); +} + +TEST(ANSTest, EmptyRoundtrip) { + RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE, std::vector()); +} + +TEST(ANSTest, SingleSymbolRoundtrip) { + for (uint32_t i = 0; i < ANS_MAX_ALPHABET_SIZE; i++) { + RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE, {{0, i}}); + } + for (uint32_t i = 0; i < ANS_MAX_ALPHABET_SIZE; i++) { + RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE, + std::vector(1024, {0, i})); + } +} + +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) +constexpr size_t kReps = 3; +#else +constexpr size_t kReps = 10; +#endif + +void RoundtripRandomStream(int alphabet_size, size_t reps = kReps, + size_t num = 1 << 18) { + constexpr int kNumHistograms = 3; + Rng rng(0); + for (size_t i = 0; i < reps; i++) { + std::vector symbols; + for (size_t j = 0; j < num; j++) { + int context = rng.UniformI(0, kNumHistograms); + int value = rng.UniformU(0, alphabet_size); + symbols.emplace_back(context, value); + } + RoundtripTestcase(kNumHistograms, alphabet_size, symbols); + } +} + +void RoundtripRandomUnbalancedStream(int alphabet_size) { + constexpr int kNumHistograms = 3; + constexpr int kPrecision = 1 << 10; + Rng rng(0); + for (size_t i = 0; i < kReps; i++) { + std::vector distributions[kNumHistograms] = {}; + for (int j = 0; j < kNumHistograms; j++) { + distributions[j].resize(kPrecision); + int symbol = 0; + int remaining = 1; + for (int k = 0; k < kPrecision; k++) { + if (remaining == 0) { + if (symbol < alphabet_size - 1) symbol++; + // There is no meaning behind this distribution: it's anything that + // will create a nonuniform distribution and won't have too few + // symbols usually. Also we want different distributions we get to be + // sufficiently dissimilar. + remaining = rng.UniformU(0, kPrecision - k + 1); + } + distributions[j][k] = symbol; + remaining--; + } + } + std::vector symbols; + for (int j = 0; j < 1 << 18; j++) { + int context = rng.UniformI(0, kNumHistograms); + int value = rng.UniformU(0, kPrecision); + symbols.emplace_back(context, value); + } + RoundtripTestcase(kNumHistograms + 1, alphabet_size, symbols); + } +} + +TEST(ANSTest, RandomStreamRoundtrip3Small) { RoundtripRandomStream(3, 1, 16); } + +TEST(ANSTest, RandomStreamRoundtrip3) { RoundtripRandomStream(3); } + +TEST(ANSTest, RandomStreamRoundtripBig) { + RoundtripRandomStream(ANS_MAX_ALPHABET_SIZE); +} + +TEST(ANSTest, RandomUnbalancedStreamRoundtrip3) { + RoundtripRandomUnbalancedStream(3); +} + +TEST(ANSTest, RandomUnbalancedStreamRoundtripBig) { + RoundtripRandomUnbalancedStream(ANS_MAX_ALPHABET_SIZE); +} + +TEST(ANSTest, UintConfigRoundtrip) { + for (size_t log_alpha_size = 5; log_alpha_size <= 8; log_alpha_size++) { + std::vector uint_config, uint_config_dec; + for (size_t i = 0; i < log_alpha_size; i++) { + for (size_t j = 0; j <= i; j++) { + for (size_t k = 0; k <= i - j; k++) { + uint_config.emplace_back(i, j, k); + } + } + } + uint_config.emplace_back(log_alpha_size, 0, 0); + uint_config_dec.resize(uint_config.size()); + BitWriter writer; + BitWriter::Allotment allotment(&writer, 10 * uint_config.size()); + EncodeUintConfigs(uint_config, &writer, log_alpha_size); + allotment.ReclaimAndCharge(&writer, 0, nullptr); + writer.ZeroPadToByte(); + BitReader br(writer.GetSpan()); + EXPECT_TRUE(DecodeUintConfigs(log_alpha_size, &uint_config_dec, &br)); + EXPECT_TRUE(br.Close()); + for (size_t i = 0; i < uint_config.size(); i++) { + EXPECT_EQ(uint_config[i].split_token, uint_config_dec[i].split_token); + EXPECT_EQ(uint_config[i].msb_in_token, uint_config_dec[i].msb_in_token); + EXPECT_EQ(uint_config[i].lsb_in_token, uint_config_dec[i].lsb_in_token); + } + } +} + +void TestCheckpointing(bool ans, bool lz77) { + std::vector> input_values(1); + for (size_t i = 0; i < 1024; i++) { + input_values[0].push_back(Token(0, i % 4)); + } + // up to lz77 window size. + for (size_t i = 0; i < (1 << 20) - 1022; i++) { + input_values[0].push_back(Token(0, (i % 5) + 4)); + } + // Ensure that when the window wraps around, new values are different. + input_values[0].push_back(Token(0, 0)); + for (size_t i = 0; i < 1024; i++) { + input_values[0].push_back(Token(0, i % 4)); + } + + std::vector context_map; + EntropyEncodingData codes; + HistogramParams params; + params.lz77_method = lz77 ? HistogramParams::LZ77Method::kLZ77 + : HistogramParams::LZ77Method::kNone; + params.force_huffman = !ans; + + BitWriter writer; + { + auto input_values_copy = input_values; + BuildAndEncodeHistograms(params, 1, input_values_copy, &codes, &context_map, + &writer, 0, nullptr); + WriteTokens(input_values_copy[0], codes, context_map, &writer, 0, nullptr); + writer.ZeroPadToByte(); + } + + // We do not truncate the output. Reading past the end reads out zeroes + // anyway. + BitReader br(writer.GetSpan()); + Status status = true; + { + BitReaderScopedCloser bc(&br, &status); + + std::vector dec_context_map; + ANSCode decoded_codes; + ASSERT_TRUE(DecodeHistograms(&br, 1, &decoded_codes, &dec_context_map)); + ASSERT_EQ(dec_context_map, context_map); + ANSSymbolReader reader(&decoded_codes, &br); + + ANSSymbolReader::Checkpoint checkpoint; + size_t br_pos = 0; + constexpr size_t kInterval = ANSSymbolReader::kMaxCheckpointInterval - 2; + for (size_t i = 0; i < input_values[0].size(); i++) { + if (i % kInterval == 0 && i > 0) { + reader.Restore(checkpoint); + ASSERT_TRUE(br.Close()); + br = BitReader(writer.GetSpan()); + br.SkipBits(br_pos); + for (size_t j = i - kInterval; j < i; j++) { + Token symbol = input_values[0][j]; + uint32_t read_symbol = + reader.ReadHybridUint(symbol.context, &br, dec_context_map); + ASSERT_EQ(read_symbol, symbol.value) << "j = " << j; + } + } + if (i % kInterval == 0) { + reader.Save(&checkpoint); + br_pos = br.TotalBitsConsumed(); + } + Token symbol = input_values[0][i]; + uint32_t read_symbol = + reader.ReadHybridUint(symbol.context, &br, dec_context_map); + ASSERT_EQ(read_symbol, symbol.value) << "i = " << i; + } + ASSERT_TRUE(reader.CheckANSFinalState()); + } + EXPECT_TRUE(status); +} + +TEST(ANSTest, TestCheckpointingANS) { + TestCheckpointing(/*ans=*/true, /*lz77=*/false); +} + +TEST(ANSTest, TestCheckpointingPrefix) { + TestCheckpointing(/*ans=*/false, /*lz77=*/false); +} + +TEST(ANSTest, TestCheckpointingANSLZ77) { + TestCheckpointing(/*ans=*/true, /*lz77=*/true); +} + +TEST(ANSTest, TestCheckpointingPrefixLZ77) { + TestCheckpointing(/*ans=*/false, /*lz77=*/true); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/base/arch_macros.h b/third_party/jpeg-xl/lib/jxl/base/arch_macros.h new file mode 100644 index 0000000000..a98301915e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/arch_macros.h @@ -0,0 +1,33 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_ARCH_MACROS_H_ +#define LIB_JXL_BASE_ARCH_MACROS_H_ + +// Defines the JXL_ARCH_* macros. + +namespace jxl { + +#if defined(__x86_64__) || defined(_M_X64) +#define JXL_ARCH_X64 1 +#else +#define JXL_ARCH_X64 0 +#endif + +#if defined(__powerpc64__) || defined(_M_PPC) +#define JXL_ARCH_PPC 1 +#else +#define JXL_ARCH_PPC 0 +#endif + +#if defined(__aarch64__) || defined(__arm__) +#define JXL_ARCH_ARM 1 +#else +#define JXL_ARCH_ARM 0 +#endif + +} // namespace jxl + +#endif // LIB_JXL_BASE_ARCH_MACROS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/bits.h b/third_party/jpeg-xl/lib/jxl/base/bits.h new file mode 100644 index 0000000000..9f86118e72 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/bits.h @@ -0,0 +1,147 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_BITS_H_ +#define LIB_JXL_BASE_BITS_H_ + +// Specialized instructions for processing register-sized bit arrays. + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +#if JXL_COMPILER_MSVC +#include +#endif + +#include +#include + +namespace jxl { + +// Empty struct used as a size tag type. +template +struct SizeTag {}; + +template +constexpr bool IsSigned() { + return T(0) > T(-1); +} + +// Undefined results for x == 0. +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsAboveMS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) { + JXL_DASSERT(x != 0); +#if JXL_COMPILER_MSVC + unsigned long index; + _BitScanReverse(&index, x); + return 31 - index; +#else + return static_cast(__builtin_clz(x)); +#endif +} +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsAboveMS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) { + JXL_DASSERT(x != 0); +#if JXL_COMPILER_MSVC +#if JXL_ARCH_X64 + unsigned long index; + _BitScanReverse64(&index, x); + return 63 - index; +#else // JXL_ARCH_X64 + // _BitScanReverse64 not available + uint32_t msb = static_cast(x >> 32u); + unsigned long index; + if (msb == 0) { + uint32_t lsb = static_cast(x & 0xFFFFFFFF); + _BitScanReverse(&index, lsb); + return 63 - index; + } else { + _BitScanReverse(&index, msb); + return 31 - index; + } +#endif // JXL_ARCH_X64 +#else + return static_cast(__builtin_clzll(x)); +#endif +} +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsAboveMS1Bit_Nonzero(const T x) { + static_assert(!IsSigned(), "Num0BitsAboveMS1Bit_Nonzero: use unsigned"); + return Num0BitsAboveMS1Bit_Nonzero(SizeTag(), x); +} + +// Undefined results for x == 0. +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsBelowLS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) { + JXL_DASSERT(x != 0); +#if JXL_COMPILER_MSVC + unsigned long index; + _BitScanForward(&index, x); + return index; +#else + return static_cast(__builtin_ctz(x)); +#endif +} +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsBelowLS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) { + JXL_DASSERT(x != 0); +#if JXL_COMPILER_MSVC +#if JXL_ARCH_X64 + unsigned long index; + _BitScanForward64(&index, x); + return index; +#else // JXL_ARCH_64 + // _BitScanForward64 not available + uint32_t lsb = static_cast(x & 0xFFFFFFFF); + unsigned long index; + if (lsb == 0) { + uint32_t msb = static_cast(x >> 32u); + _BitScanForward(&index, msb); + return 32 + index; + } else { + _BitScanForward(&index, lsb); + return index; + } +#endif // JXL_ARCH_X64 +#else + return static_cast(__builtin_ctzll(x)); +#endif +} +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit_Nonzero(T x) { + static_assert(!IsSigned(), "Num0BitsBelowLS1Bit_Nonzero: use unsigned"); + return Num0BitsBelowLS1Bit_Nonzero(SizeTag(), x); +} + +// Returns bit width for x == 0. +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsAboveMS1Bit(const T x) { + return (x == 0) ? sizeof(T) * 8 : Num0BitsAboveMS1Bit_Nonzero(x); +} + +// Returns bit width for x == 0. +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit(const T x) { + return (x == 0) ? sizeof(T) * 8 : Num0BitsBelowLS1Bit_Nonzero(x); +} + +// Returns base-2 logarithm, rounded down. +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t FloorLog2Nonzero(const T x) { + return (sizeof(T) * 8 - 1) ^ Num0BitsAboveMS1Bit_Nonzero(x); +} + +// Returns base-2 logarithm, rounded up. +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t CeilLog2Nonzero(const T x) { + const size_t floor_log2 = FloorLog2Nonzero(x); + if ((x & (x - 1)) == 0) return floor_log2; // power of two + return floor_log2 + 1; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_BITS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/byte_order.h b/third_party/jpeg-xl/lib/jxl/base/byte_order.h new file mode 100644 index 0000000000..8966834e08 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/byte_order.h @@ -0,0 +1,274 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_BYTE_ORDER_H_ +#define LIB_JXL_BASE_BYTE_ORDER_H_ + +#include +#include +#include // memcpy + +#include "lib/jxl/base/compiler_specific.h" + +#if JXL_COMPILER_MSVC +#include // _byteswap_* +#endif + +#if (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +#define JXL_BYTE_ORDER_LITTLE 1 +#else +// This means that we don't know that the byte order is little endian, in +// this case we use endian-neutral code that works for both little- and +// big-endian. +#define JXL_BYTE_ORDER_LITTLE 0 +#endif + +// Returns whether the system is little-endian (least-significant byte first). +#if JXL_BYTE_ORDER_LITTLE +static constexpr bool IsLittleEndian() { return true; } +#else +static inline bool IsLittleEndian() { + const uint32_t multibyte = 1; + uint8_t byte; + memcpy(&byte, &multibyte, 1); + return byte == 1; +} +#endif + +static inline bool SwapEndianness(JxlEndianness endianness) { + return ((endianness == JXL_BIG_ENDIAN && IsLittleEndian()) || + (endianness == JXL_LITTLE_ENDIAN && !IsLittleEndian())); +} + +#if JXL_COMPILER_MSVC +#define JXL_BSWAP16(x) _byteswap_ushort(x) +#define JXL_BSWAP32(x) _byteswap_ulong(x) +#define JXL_BSWAP64(x) _byteswap_uint64(x) +#else +#define JXL_BSWAP16(x) __builtin_bswap16(x) +#define JXL_BSWAP32(x) __builtin_bswap32(x) +#define JXL_BSWAP64(x) __builtin_bswap64(x) +#endif + +static JXL_INLINE uint32_t LoadBE16(const uint8_t* p) { + const uint32_t byte1 = p[0]; + const uint32_t byte0 = p[1]; + return (byte1 << 8) | byte0; +} + +static JXL_INLINE uint32_t LoadLE16(const uint8_t* p) { + const uint32_t byte0 = p[0]; + const uint32_t byte1 = p[1]; + return (byte1 << 8) | byte0; +} + +static JXL_INLINE uint32_t LoadBE32(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint32_t big; + memcpy(&big, p, 4); + return JXL_BSWAP32(big); +#else + // Byte-order-independent - can't assume this machine is big endian. + const uint32_t byte3 = p[0]; + const uint32_t byte2 = p[1]; + const uint32_t byte1 = p[2]; + const uint32_t byte0 = p[3]; + return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0; +#endif +} + +static JXL_INLINE uint64_t LoadBE64(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint64_t big; + memcpy(&big, p, 8); + return JXL_BSWAP64(big); +#else + // Byte-order-independent - can't assume this machine is big endian. + const uint64_t byte7 = p[0]; + const uint64_t byte6 = p[1]; + const uint64_t byte5 = p[2]; + const uint64_t byte4 = p[3]; + const uint64_t byte3 = p[4]; + const uint64_t byte2 = p[5]; + const uint64_t byte1 = p[6]; + const uint64_t byte0 = p[7]; + return (byte7 << 56ull) | (byte6 << 48ull) | (byte5 << 40ull) | + (byte4 << 32ull) | (byte3 << 24ull) | (byte2 << 16ull) | + (byte1 << 8ull) | byte0; +#endif +} + +static JXL_INLINE uint32_t LoadLE32(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint32_t little; + memcpy(&little, p, 4); + return little; +#else + // Byte-order-independent - can't assume this machine is big endian. + const uint32_t byte0 = p[0]; + const uint32_t byte1 = p[1]; + const uint32_t byte2 = p[2]; + const uint32_t byte3 = p[3]; + return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0; +#endif +} + +static JXL_INLINE uint64_t LoadLE64(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint64_t little; + memcpy(&little, p, 8); + return little; +#else + // Byte-order-independent - can't assume this machine is big endian. + const uint64_t byte0 = p[0]; + const uint64_t byte1 = p[1]; + const uint64_t byte2 = p[2]; + const uint64_t byte3 = p[3]; + const uint64_t byte4 = p[4]; + const uint64_t byte5 = p[5]; + const uint64_t byte6 = p[6]; + const uint64_t byte7 = p[7]; + return (byte7 << 56) | (byte6 << 48) | (byte5 << 40) | (byte4 << 32) | + (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0; +#endif +} + +// Loads a Big-Endian float +static JXL_INLINE float LoadBEFloat(const uint8_t* p) { + uint32_t u = LoadBE32(p); + float result; + memcpy(&result, &u, 4); + return result; +} + +// Loads a Little-Endian float +static JXL_INLINE float LoadLEFloat(const uint8_t* p) { + uint32_t u = LoadLE32(p); + float result; + memcpy(&result, &u, 4); + return result; +} + +static JXL_INLINE void StoreBE16(const uint32_t native, uint8_t* p) { + p[0] = (native >> 8) & 0xFF; + p[1] = native & 0xFF; +} + +static JXL_INLINE void StoreLE16(const uint32_t native, uint8_t* p) { + p[1] = (native >> 8) & 0xFF; + p[0] = native & 0xFF; +} + +static JXL_INLINE void StoreBE32(const uint32_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint32_t big = JXL_BSWAP32(native); + memcpy(p, &big, 4); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[0] = native >> 24; + p[1] = (native >> 16) & 0xFF; + p[2] = (native >> 8) & 0xFF; + p[3] = native & 0xFF; +#endif +} + +static JXL_INLINE void StoreBE64(const uint64_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint64_t big = JXL_BSWAP64(native); + memcpy(p, &big, 8); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[0] = native >> 56ull; + p[1] = (native >> 48ull) & 0xFF; + p[2] = (native >> 40ull) & 0xFF; + p[3] = (native >> 32ull) & 0xFF; + p[4] = (native >> 24ull) & 0xFF; + p[5] = (native >> 16ull) & 0xFF; + p[6] = (native >> 8ull) & 0xFF; + p[7] = native & 0xFF; +#endif +} + +static JXL_INLINE void StoreLE32(const uint32_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint32_t little = native; + memcpy(p, &little, 4); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[3] = native >> 24; + p[2] = (native >> 16) & 0xFF; + p[1] = (native >> 8) & 0xFF; + p[0] = native & 0xFF; +#endif +} + +static JXL_INLINE void StoreLE64(const uint64_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint64_t little = native; + memcpy(p, &little, 8); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[7] = native >> 56; + p[6] = (native >> 48) & 0xFF; + p[5] = (native >> 40) & 0xFF; + p[4] = (native >> 32) & 0xFF; + p[3] = (native >> 24) & 0xFF; + p[2] = (native >> 16) & 0xFF; + p[1] = (native >> 8) & 0xFF; + p[0] = native & 0xFF; +#endif +} + +static JXL_INLINE float BSwapFloat(float x) { + uint32_t u; + memcpy(&u, &x, 4); + uint32_t uswap = JXL_BSWAP32(u); + float xswap; + memcpy(&xswap, &uswap, 4); + return xswap; +} + +// Big/Little Endian order. +struct OrderBE {}; +struct OrderLE {}; + +// Wrappers for calling from generic code. +static JXL_INLINE void Store16(OrderBE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreBE16(native, p); +} + +static JXL_INLINE void Store16(OrderLE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreLE16(native, p); +} + +static JXL_INLINE void Store32(OrderBE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreBE32(native, p); +} + +static JXL_INLINE void Store32(OrderLE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreLE32(native, p); +} + +static JXL_INLINE uint32_t Load16(OrderBE /*tag*/, const uint8_t* p) { + return LoadBE16(p); +} + +static JXL_INLINE uint32_t Load16(OrderLE /*tag*/, const uint8_t* p) { + return LoadLE16(p); +} + +static JXL_INLINE uint32_t Load32(OrderBE /*tag*/, const uint8_t* p) { + return LoadBE32(p); +} + +static JXL_INLINE uint32_t Load32(OrderLE /*tag*/, const uint8_t* p) { + return LoadLE32(p); +} + +#endif // LIB_JXL_BASE_BYTE_ORDER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc new file mode 100644 index 0000000000..9a9cc585a1 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc @@ -0,0 +1,157 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/cache_aligned.h" + +#include +#include + +// Disabled: slower than malloc + alignment. +#define JXL_USE_MMAP 0 + +#if JXL_USE_MMAP +#include +#endif + +#include // std::max +#include +#include // kMaxVectorSize +#include + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" + +namespace jxl { +namespace { + +#pragma pack(push, 1) +struct AllocationHeader { + void* allocated; + size_t allocated_size; + uint8_t left_padding[hwy::kMaxVectorSize]; +}; +#pragma pack(pop) + +std::atomic num_allocations{0}; +std::atomic bytes_in_use{0}; +std::atomic max_bytes_in_use{0}; + +} // namespace + +// Avoids linker errors in pre-C++17 builds. +constexpr size_t CacheAligned::kPointerSize; +constexpr size_t CacheAligned::kCacheLineSize; +constexpr size_t CacheAligned::kAlignment; +constexpr size_t CacheAligned::kAlias; + +void CacheAligned::PrintStats() { + fprintf( + stderr, "Allocations: %" PRIuS " (max bytes in use: %E)\n", + static_cast(num_allocations.load(std::memory_order_relaxed)), + static_cast(max_bytes_in_use.load(std::memory_order_relaxed))); +} + +size_t CacheAligned::NextOffset() { + static std::atomic next{0}; + constexpr uint32_t kGroups = CacheAligned::kAlias / CacheAligned::kAlignment; + const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups; + return CacheAligned::kAlignment * group; +} + +void* CacheAligned::Allocate(const size_t payload_size, size_t offset) { + JXL_ASSERT(payload_size <= std::numeric_limits::max() / 2); + JXL_ASSERT((offset % kAlignment == 0) && offset <= kAlias); + + // What: | misalign | unused | AllocationHeader |payload + // Size: |<= kAlias | offset | |payload_size + // ^allocated.^aligned.^header............^payload + // The header must immediately precede payload, which must remain aligned. + // To avoid wasting space, the header resides at the end of `unused`, + // which therefore cannot be empty (offset == 0). + if (offset == 0) { + // SVE/RVV vectors can be large, so we cannot rely on them (including the + // padding at the end of AllocationHeader) to fit in kAlignment. + offset = hwy::RoundUpTo(sizeof(AllocationHeader), kAlignment); + } + +#if JXL_USE_MMAP + const size_t allocated_size = offset + payload_size; + const int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE; + void* allocated = + mmap(nullptr, allocated_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (allocated == MAP_FAILED) return nullptr; + const uintptr_t aligned = reinterpret_cast(allocated); +#else + const size_t allocated_size = kAlias + offset + payload_size; + void* allocated = malloc(allocated_size); + if (allocated == nullptr) return nullptr; + // Always round up even if already aligned - we already asked for kAlias + // extra bytes and there's no way to give them back. + uintptr_t aligned = reinterpret_cast(allocated) + kAlias; + static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2"); + static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias"); + aligned &= ~(kAlias - 1); +#endif + +#if 0 + // No effect. + uintptr_t page_aligned = reinterpret_cast(allocated); + page_aligned &= ~(4096 - 1); + if (madvise(reinterpret_cast(page_aligned), allocated_size, + MADV_WILLNEED) != 0) { + JXL_NOTIFY_ERROR("madvise failed"); + } +#elif 0 + // INCREASES both first and subsequent decode times. + if (mlock(allocated, allocated_size) != 0) { + JXL_NOTIFY_ERROR("mlock failed"); + } +#endif + + // Update statistics (#allocations and max bytes in use) + num_allocations.fetch_add(1, std::memory_order_relaxed); + const uint64_t prev_bytes = + bytes_in_use.fetch_add(allocated_size, std::memory_order_acq_rel); + uint64_t expected_max = max_bytes_in_use.load(std::memory_order_acquire); + for (;;) { + const uint64_t desired = + std::max(expected_max, prev_bytes + allocated_size); + if (max_bytes_in_use.compare_exchange_strong(expected_max, desired, + std::memory_order_acq_rel)) { + break; + } + } + + const uintptr_t payload = aligned + offset; // still aligned + + // Stash `allocated` and payload_size inside header for use by Free(). + AllocationHeader* header = reinterpret_cast(payload) - 1; + header->allocated = allocated; + header->allocated_size = allocated_size; + + return JXL_ASSUME_ALIGNED(reinterpret_cast(payload), 64); +} + +void CacheAligned::Free(const void* aligned_pointer) { + if (aligned_pointer == nullptr) { + return; + } + const uintptr_t payload = reinterpret_cast(aligned_pointer); + JXL_ASSERT(payload % kAlignment == 0); + const AllocationHeader* header = + reinterpret_cast(payload) - 1; + + // Subtract (2's complement negation). + bytes_in_use.fetch_add(~header->allocated_size + 1, + std::memory_order_acq_rel); + +#if JXL_USE_MMAP + munmap(header->allocated, header->allocated_size); +#else + free(header->allocated); +#endif +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h new file mode 100644 index 0000000000..e57df14837 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h @@ -0,0 +1,74 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_CACHE_ALIGNED_H_ +#define LIB_JXL_BASE_CACHE_ALIGNED_H_ + +// Memory allocator with support for alignment + misalignment. + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +// Functions that depend on the cache line size. +class CacheAligned { + public: + static void PrintStats(); + + static constexpr size_t kPointerSize = sizeof(void*); + static constexpr size_t kCacheLineSize = 64; + // To avoid RFOs, match L2 fill size (pairs of lines). + static constexpr size_t kAlignment = 2 * kCacheLineSize; + // Minimum multiple for which cache set conflicts and/or loads blocked by + // preceding stores can occur. + static constexpr size_t kAlias = 2048; + + // Returns a 'random' (cyclical) offset suitable for Allocate. + static size_t NextOffset(); + + // Returns null or memory whose address is congruent to `offset` (mod kAlias). + // This reduces cache conflicts and load/store stalls, especially with large + // allocations that would otherwise have similar alignments. At least + // `payload_size` (which can be zero) bytes will be accessible. + static void* Allocate(size_t payload_size, size_t offset); + + static void* Allocate(const size_t payload_size) { + return Allocate(payload_size, NextOffset()); + } + + static void Free(const void* aligned_pointer); +}; + +// Avoids the need for a function pointer (deleter) in CacheAlignedUniquePtr. +struct CacheAlignedDeleter { + void operator()(uint8_t* aligned_pointer) const { + return CacheAligned::Free(aligned_pointer); + } +}; + +using CacheAlignedUniquePtr = std::unique_ptr; + +// Does not invoke constructors. +static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes) { + return CacheAlignedUniquePtr( + static_cast(CacheAligned::Allocate(bytes)), + CacheAlignedDeleter()); +} + +static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes, + const size_t offset) { + return CacheAlignedUniquePtr( + static_cast(CacheAligned::Allocate(bytes, offset)), + CacheAlignedDeleter()); +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_CACHE_ALIGNED_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h new file mode 100644 index 0000000000..abe1261f48 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h @@ -0,0 +1,157 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_COMPILER_SPECIFIC_H_ +#define LIB_JXL_BASE_COMPILER_SPECIFIC_H_ + +// Macros for compiler version + nonstandard keywords, e.g. __builtin_expect. + +#include +#include + +#include "lib/jxl/base/sanitizer_definitions.h" + +// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected, +// otherwise 100 * major + minor version. Note that other packages check for +// #ifdef COMPILER_MSVC, so we cannot use that same name. + +#ifdef _MSC_VER +#define JXL_COMPILER_MSVC _MSC_VER +#else +#define JXL_COMPILER_MSVC 0 +#endif + +#ifdef __GNUC__ +#define JXL_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__) +#else +#define JXL_COMPILER_GCC 0 +#endif + +#ifdef __clang__ +#define JXL_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__) +// Clang pretends to be GCC for compatibility. +#undef JXL_COMPILER_GCC +#define JXL_COMPILER_GCC 0 +#else +#define JXL_COMPILER_CLANG 0 +#endif + +#if JXL_COMPILER_MSVC +#define JXL_RESTRICT __restrict +#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG +#define JXL_RESTRICT __restrict__ +#else +#define JXL_RESTRICT +#endif + +#if JXL_COMPILER_MSVC +#define JXL_INLINE __forceinline +#define JXL_NOINLINE __declspec(noinline) +#else +#define JXL_INLINE inline __attribute__((always_inline)) +#define JXL_NOINLINE __attribute__((noinline)) +#endif + +#if JXL_COMPILER_MSVC +#define JXL_NORETURN __declspec(noreturn) +#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG +#define JXL_NORETURN __attribute__((noreturn)) +#else +#define JXL_NORETURN +#endif + +#if JXL_COMPILER_MSVC +#define JXL_UNREACHABLE __assume(false) +#elif JXL_COMPILER_CLANG || JXL_COMPILER_GCC >= 405 +#define JXL_UNREACHABLE __builtin_unreachable() +#else +#define JXL_UNREACHABLE +#endif + +#if JXL_COMPILER_MSVC +#define JXL_MAYBE_UNUSED +#else +// Encountered "attribute list cannot appear here" when using the C++17 +// [[maybe_unused]], so only use the old style attribute for now. +#define JXL_MAYBE_UNUSED __attribute__((unused)) +#endif + +// MSAN execution won't hurt if some code it not inlined, but this can greatly +// improve compilation time. Unfortunately this macro can not be used just +// everywhere - inside header files it leads to "multiple definition" error; +// though it would be better not to have JXL_INLINE in header overall. +#if JXL_MEMORY_SANITIZER || JXL_ADDRESS_SANITIZER || JXL_THREAD_SANITIZER +#define JXL_MAYBE_INLINE JXL_MAYBE_UNUSED +#else +#define JXL_MAYBE_INLINE JXL_INLINE +#endif + +#if JXL_COMPILER_MSVC +// Unsupported, __assume is not the same. +#define JXL_LIKELY(expr) expr +#define JXL_UNLIKELY(expr) expr +#else +#define JXL_LIKELY(expr) __builtin_expect(!!(expr), 1) +#define JXL_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#endif + +// Returns a void* pointer which the compiler then assumes is N-byte aligned. +// Example: float* JXL_RESTRICT aligned = (float*)JXL_ASSUME_ALIGNED(in, 32); +// +// The assignment semantics are required by GCC/Clang. ICC provides an in-place +// __assume_aligned, whereas MSVC's __assume appears unsuitable. +#if JXL_COMPILER_CLANG +// Early versions of Clang did not support __builtin_assume_aligned. +#define JXL_HAS_ASSUME_ALIGNED __has_builtin(__builtin_assume_aligned) +#elif JXL_COMPILER_GCC +#define JXL_HAS_ASSUME_ALIGNED 1 +#else +#define JXL_HAS_ASSUME_ALIGNED 0 +#endif + +#if JXL_HAS_ASSUME_ALIGNED +#define JXL_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align)) +#else +#define JXL_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */ +#endif + +#ifdef __has_attribute +#define JXL_HAVE_ATTRIBUTE(x) __has_attribute(x) +#else +#define JXL_HAVE_ATTRIBUTE(x) 0 +#endif + +// Raises warnings if the function return value is unused. Should appear as the +// first part of a function definition/declaration. +#if JXL_HAVE_ATTRIBUTE(nodiscard) +#define JXL_MUST_USE_RESULT [[nodiscard]] +#elif JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(warn_unused_result) +#define JXL_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define JXL_MUST_USE_RESULT +#endif + +// Disable certain -fsanitize flags for functions that are expected to include +// things like unsigned integer overflow. For example use in the function +// declaration JXL_NO_SANITIZE("unsigned-integer-overflow") to silence unsigned +// integer overflow ubsan messages. +#if JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(no_sanitize) +#define JXL_NO_SANITIZE(X) __attribute__((no_sanitize(X))) +#else +#define JXL_NO_SANITIZE(X) +#endif + +#if JXL_HAVE_ATTRIBUTE(__format__) +#define JXL_FORMAT(idx_fmt, idx_arg) \ + __attribute__((__format__(__printf__, idx_fmt, idx_arg))) +#else +#define JXL_FORMAT(idx_fmt, idx_arg) +#endif + +#if JXL_COMPILER_MSVC +using ssize_t = intptr_t; +#endif + +#endif // LIB_JXL_BASE_COMPILER_SPECIFIC_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc b/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc new file mode 100644 index 0000000000..20a911255c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc @@ -0,0 +1,23 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/data_parallel.h" + +namespace jxl { + +// static +JxlParallelRetCode ThreadPool::SequentialRunnerStatic( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) { + JxlParallelRetCode init_ret = (*init)(jpegxl_opaque, 1); + if (init_ret != 0) return init_ret; + + for (uint32_t i = start_range; i < end_range; i++) { + (*func)(jpegxl_opaque, i, 0); + } + return 0; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/base/data_parallel.h b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h new file mode 100644 index 0000000000..ba7e7adfad --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h @@ -0,0 +1,120 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_DATA_PARALLEL_H_ +#define LIB_JXL_BASE_DATA_PARALLEL_H_ + +// Portable, low-overhead C++11 ThreadPool alternative to OpenMP for +// data-parallel computations. + +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" +#if JXL_COMPILER_MSVC +// suppress warnings about the const & applied to function types +#pragma warning(disable : 4180) +#endif + +namespace jxl { + +class ThreadPool { + public: + ThreadPool(JxlParallelRunner runner, void* runner_opaque) + : runner_(runner ? runner : &ThreadPool::SequentialRunnerStatic), + runner_opaque_(runner ? runner_opaque : static_cast(this)) {} + + ThreadPool(const ThreadPool&) = delete; + ThreadPool& operator&(const ThreadPool&) = delete; + + JxlParallelRunner runner() const { return runner_; } + void* runner_opaque() const { return runner_opaque_; } + + // Runs init_func(num_threads) followed by data_func(task, thread) on worker + // thread(s) for every task in [begin, end). init_func() must return a Status + // indicating whether the initialization succeeded. + // "thread" is an integer smaller than num_threads. + // Not thread-safe - no two calls to Run may overlap. + // Subsequent calls will reuse the same threads. + // + // Precondition: begin <= end. + template + Status Run(uint32_t begin, uint32_t end, const InitFunc& init_func, + const DataFunc& data_func, const char* caller = "") { + JXL_ASSERT(begin <= end); + if (begin == end) return true; + RunCallState call_state(init_func, data_func); + // The runner_ uses the C convention and returns 0 in case of error, so we + // convert it to a Status. + return (*runner_)(runner_opaque_, static_cast(&call_state), + &call_state.CallInitFunc, &call_state.CallDataFunc, begin, + end) == 0; + } + + // Use this as init_func when no initialization is needed. + static Status NoInit(size_t num_threads) { return true; } + + private: + // class holding the state of a Run() call to pass to the runner_ as an + // opaque_jpegxl pointer. + template + class RunCallState final { + public: + RunCallState(const InitFunc& init_func, const DataFunc& data_func) + : init_func_(init_func), data_func_(data_func) {} + + // JxlParallelRunInit interface. + static int CallInitFunc(void* jpegxl_opaque, size_t num_threads) { + const auto* self = + static_cast*>(jpegxl_opaque); + // Returns -1 when the internal init function returns false Status to + // indicate an error. + return self->init_func_(num_threads) ? 0 : -1; + } + + // JxlParallelRunFunction interface. + static void CallDataFunc(void* jpegxl_opaque, uint32_t value, + size_t thread_id) { + const auto* self = + static_cast*>(jpegxl_opaque); + return self->data_func_(value, thread_id); + } + + private: + const InitFunc& init_func_; + const DataFunc& data_func_; + }; + + // Default JxlParallelRunner used when no runner is provided by the + // caller. This runner doesn't use any threading and thread_id is always 0. + static JxlParallelRetCode SequentialRunnerStatic( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range); + + // The caller supplied runner function and its opaque void*. + const JxlParallelRunner runner_; + void* const runner_opaque_; +}; + +template +Status RunOnPool(ThreadPool* pool, const uint32_t begin, const uint32_t end, + const InitFunc& init_func, const DataFunc& data_func, + const char* caller) { + if (pool == nullptr) { + ThreadPool default_pool(nullptr, nullptr); + return default_pool.Run(begin, end, init_func, data_func, caller); + } else { + return pool->Run(begin, end, init_func, data_func, caller); + } +} + +} // namespace jxl +#if JXL_COMPILER_MSVC +#pragma warning(default : 4180) +#endif + +#endif // LIB_JXL_BASE_DATA_PARALLEL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/file_io.h b/third_party/jpeg-xl/lib/jxl/base/file_io.h new file mode 100644 index 0000000000..64d5860915 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/file_io.h @@ -0,0 +1,153 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_FILE_IO_H_ +#define LIB_JXL_BASE_FILE_IO_H_ + +// Helper functions for reading/writing files. + +#include +#include + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Returns extension including the dot, or empty string if none. Assumes +// filename is not a hidden file (e.g. ".bashrc"). May be called with a pathname +// if the filename contains a dot and/or no other path component does. +static inline std::string Extension(const std::string& filename) { + const size_t pos = filename.rfind('.'); + if (pos == std::string::npos) return std::string(); + return filename.substr(pos); +} + +// RAII, ensures files are closed even when returning early. +class FileWrapper { + public: + FileWrapper(const FileWrapper& other) = delete; + FileWrapper& operator=(const FileWrapper& other) = delete; + + explicit FileWrapper(const std::string& pathname, const char* mode) + : file_(pathname == "-" ? (mode[0] == 'r' ? stdin : stdout) + : fopen(pathname.c_str(), mode)), + close_on_delete_(pathname != "-") { +#ifdef _WIN32 + struct __stat64 s = {}; + const int err = _stat64(pathname.c_str(), &s); + const bool is_file = (s.st_mode & S_IFREG) != 0; +#else + struct stat s = {}; + const int err = stat(pathname.c_str(), &s); + const bool is_file = S_ISREG(s.st_mode); +#endif + if (err == 0 && is_file) { + size_ = s.st_size; + } + } + + ~FileWrapper() { + if (file_ != nullptr && close_on_delete_) { + const int err = fclose(file_); + JXL_CHECK(err == 0); + } + } + + // We intend to use FileWrapper as a replacement of FILE. + // NOLINTNEXTLINE(google-explicit-constructor) + operator FILE*() const { return file_; } + + int64_t size() { return size_; } + + private: + FILE* const file_; + bool close_on_delete_ = true; + int64_t size_ = -1; +}; + +template +static inline Status ReadFile(const std::string& pathname, + ContainerType* JXL_RESTRICT bytes) { + FileWrapper f(pathname, "rb"); + if (f == nullptr) + return JXL_FAILURE("Failed to open file for reading: %s", pathname.c_str()); + + // Get size of file in bytes + const int64_t size = f.size(); + if (size < 0) { + // Size is unknown, loop reading chunks until EOF. + bytes->clear(); + std::list> chunks; + + size_t total_size = 0; + while (true) { + std::vector chunk(16 * 1024); + const size_t bytes_read = fread(chunk.data(), 1, chunk.size(), f); + if (ferror(f) || bytes_read > chunk.size()) { + return JXL_FAILURE("Error reading %s", pathname.c_str()); + } + + chunk.resize(bytes_read); + total_size += bytes_read; + if (bytes_read != 0) { + chunks.emplace_back(std::move(chunk)); + } + if (feof(f)) { + break; + } + } + bytes->resize(total_size); + size_t pos = 0; + for (const auto& chunk : chunks) { + // Needed in case ContainerType is std::string, whose data() is const. + char* bytes_writable = reinterpret_cast(&(*bytes)[0]); + memcpy(bytes_writable + pos, chunk.data(), chunk.size()); + pos += chunk.size(); + } + } else { + // Size is known, read the file directly. + bytes->resize(static_cast(size)); + size_t pos = 0; + while (pos < bytes->size()) { + // Needed in case ContainerType is std::string, whose data() is const. + char* bytes_writable = reinterpret_cast(&(*bytes)[0]); + const size_t bytes_read = + fread(bytes_writable + pos, 1, bytes->size() - pos, f); + if (bytes_read == 0) return JXL_FAILURE("Failed to read"); + pos += bytes_read; + } + JXL_ASSERT(pos == bytes->size()); + } + return true; +} + +template +static inline Status WriteFile(const ContainerType& bytes, + const std::string& pathname) { + FileWrapper f(pathname, "wb"); + if (f == nullptr) + return JXL_FAILURE("Failed to open file for writing: %s", pathname.c_str()); + + size_t pos = 0; + while (pos < bytes.size()) { + const size_t bytes_written = + fwrite(bytes.data() + pos, 1, bytes.size() - pos, f); + if (bytes_written == 0) return JXL_FAILURE("Failed to write"); + pos += bytes_written; + } + JXL_ASSERT(pos == bytes.size()); + + return true; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_FILE_IO_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/float.h b/third_party/jpeg-xl/lib/jxl/base/float.h new file mode 100644 index 0000000000..90bdeedf54 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/float.h @@ -0,0 +1,98 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_FLOAT_H_ +#define LIB_JXL_BASE_FLOAT_H_ + +#include +#include +#include +#include + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +namespace { +// Based on highway scalar implementation, for testing +float LoadFloat16(uint16_t bits16) { + const uint32_t sign = bits16 >> 15; + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + // Subnormal or zero + if (biased_exp == 0) { + const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024)); + return sign ? -subnormal : subnormal; + } + + // Normalized: convert the representation directly (faster than ldexp/tables). + const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t mantissa32 = mantissa << (23 - 10); + const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; + + float result; + memcpy(&result, &bits32, 4); + return result; +} +} // namespace + +template +static Status JXL_INLINE LoadFloatRow(const uint8_t* src, size_t count, + size_t stride, JxlDataType type, + bool little_endian, float scale, + SaveFloatAtFn callback) { + switch (type) { + case JXL_TYPE_FLOAT: + if (little_endian) { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadLEFloat(src + stride * i)); + } + } else { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadBEFloat(src + stride * i)); + } + } + return true; + + case JXL_TYPE_UINT8: + for (size_t i = 0; i < count; ++i) { + callback(i, src[stride * i] * scale); + } + return true; + + case JXL_TYPE_UINT16: + if (little_endian) { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadLE16(src + stride * i) * scale); + } + } else { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadBE16(src + stride * i) * scale); + } + } + return true; + + case JXL_TYPE_FLOAT16: + if (little_endian) { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadFloat16(LoadLE16(src + stride * i))); + } + } else { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadFloat16(LoadBE16(src + stride * i))); + } + } + return true; + + default: + return JXL_FAILURE("Unsupported sample format"); + } +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_FLOAT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/iaca.h b/third_party/jpeg-xl/lib/jxl/base/iaca.h new file mode 100644 index 0000000000..e5732dae5c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/iaca.h @@ -0,0 +1,65 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_IACA_H_ +#define LIB_JXL_BASE_IACA_H_ + +#include "lib/jxl/base/compiler_specific.h" + +// IACA (Intel's Code Analyzer) analyzes instruction latencies, but only for +// code between special markers. These functions embed such markers in an +// executable, but only for reading via IACA - they deliberately trigger a +// crash if executed to ensure they are removed in normal builds. + +#ifndef JXL_IACA_ENABLED +#define JXL_IACA_ENABLED 0 +#endif + +namespace jxl { + +// Call before the region of interest. +static JXL_INLINE void BeginIACA() { +#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG) + asm volatile( + // UD2 "instruction" raises an invalid opcode exception. + ".byte 0x0F, 0x0B\n\t" + // Magic sequence recognized by IACA (MOV + addr32 fs:NOP). This actually + // clobbers EBX, but we don't care because the code won't be run, and we + // want IACA to observe the same code the compiler would have generated + // without this marker. + "movl $111, %%ebx\n\t" + ".byte 0x64, 0x67, 0x90\n\t" + : + : + // (Allegedly) clobbering memory may prevent reordering. + : "memory"); +#endif +} + +// Call after the region of interest. +static JXL_INLINE void EndIACA() { +#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG) + asm volatile( + // See above. + "movl $222, %%ebx\n\t" + ".byte 0x64, 0x67, 0x90\n\t" + // UD2 + ".byte 0x0F, 0x0B\n\t" + : + : + // (Allegedly) clobbering memory may prevent reordering. + : "memory"); +#endif +} + +// Add to a scope to mark a region. +struct ScopeIACA { + JXL_INLINE ScopeIACA() { BeginIACA(); } + JXL_INLINE ~ScopeIACA() { EndIACA(); } +}; + +} // namespace jxl + +#endif // LIB_JXL_BASE_IACA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/os_macros.h b/third_party/jpeg-xl/lib/jxl/base/os_macros.h new file mode 100644 index 0000000000..84d0b82bf5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/os_macros.h @@ -0,0 +1,50 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_OS_MACROS_H_ +#define LIB_JXL_BASE_OS_MACROS_H_ + +// Defines the JXL_OS_* macros. + +#if defined(_WIN32) || defined(_WIN64) +#define JXL_OS_WIN 1 +#else +#define JXL_OS_WIN 0 +#endif + +#ifdef __linux__ +#define JXL_OS_LINUX 1 +#else +#define JXL_OS_LINUX 0 +#endif + +#ifdef __APPLE__ +#define JXL_OS_MAC 1 +#else +#define JXL_OS_MAC 0 +#endif + +#define JXL_OS_IOS 0 +#ifdef __APPLE__ +#include +#if TARGET_OS_IPHONE +#undef JXL_OS_IOS +#define JXL_OS_IOS 1 +#endif +#endif + +#ifdef __FreeBSD__ +#define JXL_OS_FREEBSD 1 +#else +#define JXL_OS_FREEBSD 0 +#endif + +#ifdef __HAIKU__ +#define JXL_OS_HAIKU 1 +#else +#define JXL_OS_HAIKU 0 +#endif + +#endif // LIB_JXL_BASE_OS_MACROS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/override.h b/third_party/jpeg-xl/lib/jxl/base/override.h new file mode 100644 index 0000000000..1f8b657974 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/override.h @@ -0,0 +1,29 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_OVERRIDE_H_ +#define LIB_JXL_BASE_OVERRIDE_H_ + +// 'Trool' for command line arguments: force enable/disable, or use default. + +namespace jxl { + +// No effect if kDefault, otherwise forces a feature (typically a FrameHeader +// flag) on or off. +enum class Override : int { kOn = 1, kOff = 0, kDefault = -1 }; + +static inline Override OverrideFromBool(bool flag) { + return flag ? Override::kOn : Override::kOff; +} + +static inline bool ApplyOverride(Override o, bool default_condition) { + if (o == Override::kOn) return true; + if (o == Override::kOff) return false; + return default_condition; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_OVERRIDE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc new file mode 100644 index 0000000000..11e4bff6fe --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc @@ -0,0 +1,63 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/padded_bytes.h" + +namespace jxl { + +void PaddedBytes::IncreaseCapacityTo(size_t capacity) { + JXL_ASSERT(capacity > capacity_); + + size_t new_capacity = std::max(capacity, 3 * capacity_ / 2); + new_capacity = std::max(64, new_capacity); + + // BitWriter writes up to 7 bytes past the end. + CacheAlignedUniquePtr new_data = AllocateArray(new_capacity + 8); + if (new_data == nullptr) { + // Allocation failed, discard all data to ensure this is noticed. + size_ = capacity_ = 0; + return; + } + + if (data_ == nullptr) { + // First allocation: ensure first byte is initialized (won't be copied). + new_data[0] = 0; + } else { + // Subsequent resize: copy existing data to new location. + memcpy(new_data.get(), data_.get(), size_); + // Ensure that the first new byte is initialized, to allow write_bits to + // safely append to the newly-resized PaddedBytes. + new_data[size_] = 0; + } + + capacity_ = new_capacity; + std::swap(new_data, data_); +} + +void PaddedBytes::assign(const uint8_t* new_begin, const uint8_t* new_end) { + JXL_DASSERT(new_begin <= new_end); + const size_t new_size = static_cast(new_end - new_begin); + + // memcpy requires non-overlapping ranges, and resizing might invalidate the + // new range. Neither happens if the new range is completely to the left or + // right of the _allocated_ range (irrespective of size_). + const uint8_t* allocated_end = begin() + capacity_; + const bool outside = new_end <= begin() || new_begin >= allocated_end; + if (outside) { + resize(new_size); // grow or shrink + memcpy(data(), new_begin, new_size); + return; + } + + // There is overlap. The new size cannot be larger because we own the memory + // and the new range cannot include anything outside the allocated range. + JXL_ASSERT(new_size <= capacity_); + + // memmove allows overlap and capacity_ is sufficient. + memmove(data(), new_begin, new_size); + size_ = new_size; // shrink +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h new file mode 100644 index 0000000000..4534ddf863 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h @@ -0,0 +1,197 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_PADDED_BYTES_H_ +#define LIB_JXL_BASE_PADDED_BYTES_H_ + +// std::vector replacement with padding to reduce bounds checks in WriteBits + +#include +#include +#include // memcpy + +#include // max +#include +#include // swap + +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Provides a subset of the std::vector interface with some differences: +// - allows BitWriter to write 64 bits at a time without bounds checking; +// - ONLY zero-initializes the first byte (required by BitWriter); +// - ensures cache-line alignment. +class PaddedBytes { + public: + // Required for output params. + PaddedBytes() : size_(0), capacity_(0) {} + + explicit PaddedBytes(size_t size) : size_(size), capacity_(0) { + if (size != 0) IncreaseCapacityTo(size); + } + + PaddedBytes(size_t size, uint8_t value) : size_(size), capacity_(0) { + if (size != 0) { + IncreaseCapacityTo(size); + } + if (size_ != 0) { + memset(data(), value, size); + } + } + + PaddedBytes(const PaddedBytes& other) : size_(other.size_), capacity_(0) { + if (size_ != 0) IncreaseCapacityTo(size_); + if (data() != nullptr) memcpy(data(), other.data(), size_); + } + PaddedBytes& operator=(const PaddedBytes& other) { + // Self-assignment is safe. + resize(other.size()); + if (data() != nullptr) memmove(data(), other.data(), size_); + return *this; + } + + // default is not OK - need to set other.size_ to 0! + PaddedBytes(PaddedBytes&& other) noexcept + : size_(other.size_), + capacity_(other.capacity_), + data_(std::move(other.data_)) { + other.size_ = other.capacity_ = 0; + } + PaddedBytes& operator=(PaddedBytes&& other) noexcept { + size_ = other.size_; + capacity_ = other.capacity_; + data_ = std::move(other.data_); + + if (&other != this) { + other.size_ = other.capacity_ = 0; + } + return *this; + } + + void swap(PaddedBytes& other) { + std::swap(size_, other.size_); + std::swap(capacity_, other.capacity_); + std::swap(data_, other.data_); + } + + void reserve(size_t capacity) { + if (capacity > capacity_) IncreaseCapacityTo(capacity); + } + + // NOTE: unlike vector, this does not initialize the new data! + // However, we guarantee that write_bits can safely append after + // the resize, as we zero-initialize the first new byte of data. + // If size < capacity(), does not invalidate the memory. + void resize(size_t size) { + if (size > capacity_) IncreaseCapacityTo(size); + size_ = (data() == nullptr) ? 0 : size; + } + + // resize(size) plus explicit initialization of the new data with `value`. + void resize(size_t size, uint8_t value) { + size_t old_size = size_; + resize(size); + if (size_ > old_size) { + memset(data() + old_size, value, size_ - old_size); + } + } + + // Amortized constant complexity due to exponential growth. + void push_back(uint8_t x) { + if (size_ == capacity_) { + IncreaseCapacityTo(capacity_ + 1); + if (data() == nullptr) return; + } + + data_[size_++] = x; + } + + size_t size() const { return size_; } + size_t capacity() const { return capacity_; } + + uint8_t* data() { return data_.get(); } + const uint8_t* data() const { return data_.get(); } + + // std::vector operations implemented in terms of the public interface above. + + void clear() { resize(0); } + bool empty() const { return size() == 0; } + + void assign(std::initializer_list il) { + resize(il.size()); + memcpy(data(), il.begin(), il.size()); + } + + // Replaces data() with [new_begin, new_end); potentially reallocates. + void assign(const uint8_t* new_begin, const uint8_t* new_end); + + uint8_t* begin() { return data(); } + const uint8_t* begin() const { return data(); } + uint8_t* end() { return begin() + size(); } + const uint8_t* end() const { return begin() + size(); } + + uint8_t& operator[](const size_t i) { + BoundsCheck(i); + return data()[i]; + } + const uint8_t& operator[](const size_t i) const { + BoundsCheck(i); + return data()[i]; + } + + uint8_t& back() { + JXL_ASSERT(size() != 0); + return data()[size() - 1]; + } + const uint8_t& back() const { + JXL_ASSERT(size() != 0); + return data()[size() - 1]; + } + + template + void append(const T& other) { + append(reinterpret_cast(other.data()), + reinterpret_cast(other.data()) + other.size()); + } + + void append(const uint8_t* begin, const uint8_t* end) { + if (end - begin > 0) { + size_t old_size = size(); + resize(size() + (end - begin)); + memcpy(data() + old_size, begin, end - begin); + } + } + + private: + void BoundsCheck(size_t i) const { + // <= is safe due to padding and required by BitWriter. + JXL_ASSERT(i <= size()); + } + + // Copies existing data to newly allocated "data_". If allocation fails, + // data() == nullptr and size_ = capacity_ = 0. + // The new capacity will be at least 1.5 times the old capacity. This ensures + // that we avoid quadratic behaviour. + void IncreaseCapacityTo(size_t capacity); + + size_t size_; + size_t capacity_; + CacheAlignedUniquePtr data_; +}; + +template +static inline void Append(const T& s, PaddedBytes* out, + size_t* JXL_RESTRICT byte_pos) { + memcpy(out->data() + *byte_pos, s.data(), s.size()); + *byte_pos += s.size(); + JXL_CHECK(*byte_pos <= out->size()); +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_PADDED_BYTES_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/printf_macros.h b/third_party/jpeg-xl/lib/jxl/base/printf_macros.h new file mode 100644 index 0000000000..3215052afd --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/printf_macros.h @@ -0,0 +1,34 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_PRINTF_MACROS_H_ +#define LIB_JXL_BASE_PRINTF_MACROS_H_ + +// Format string macros. These should be included after any other system +// library since those may unconditionally define these, depending on the +// platform. + +// PRIuS and PRIdS macros to print size_t and ssize_t respectively. +#if !defined(PRIdS) +#if defined(_WIN64) +#define PRIdS "lld" +#elif defined(_WIN32) +#define PRIdS "d" +#else +#define PRIdS "zd" +#endif +#endif // PRIdS + +#if !defined(PRIuS) +#if defined(_WIN64) +#define PRIuS "llu" +#elif defined(_WIN32) +#define PRIuS "u" +#else +#define PRIuS "zu" +#endif +#endif // PRIuS + +#endif // LIB_JXL_BASE_PRINTF_MACROS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.cc b/third_party/jpeg-xl/lib/jxl/base/profiler.cc new file mode 100644 index 0000000000..a38d9b82b7 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/profiler.cc @@ -0,0 +1,540 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/profiler.h" + +#if JXL_PROFILER_ENABLED + +#include +#include +#include // memcpy + +#include // sort +#include +#include // PRIu64 +#include +#include +#include + +// Optionally use SIMD in StreamCacheLine if available. +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/base/profiler.cc" +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace profiler { +namespace HWY_NAMESPACE { + +// Overwrites `to` without loading it into cache (read-for-ownership). +// Copies 64 bytes from/to naturally aligned addresses. +void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) { +#if HWY_TARGET == HWY_SCALAR + hwy::CopyBytes<64>(from, to); +#else + const HWY_CAPPED(uint64_t, 2) d; + HWY_FENCE; + const uint64_t* HWY_RESTRICT from64 = reinterpret_cast(from); + const auto v0 = Load(d, from64 + 0); + const auto v1 = Load(d, from64 + 2); + const auto v2 = Load(d, from64 + 4); + const auto v3 = Load(d, from64 + 6); + // Fences prevent the compiler from reordering loads/stores, which may + // interfere with write-combining. + HWY_FENCE; + uint64_t* HWY_RESTRICT to64 = reinterpret_cast(to); + Stream(v0, d, to64 + 0); + Stream(v1, d, to64 + 2); + Stream(v2, d, to64 + 4); + Stream(v3, d, to64 + 6); + HWY_FENCE; +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace profiler +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +namespace profiler { + +HWY_EXPORT(StreamCacheLine); + +namespace { + +// How many mebibytes to allocate (if JXL_PROFILER_ENABLED) per thread that +// enters at least one zone. Once this buffer is full, the thread will analyze +// packets (two per zone), which introduces observer overhead. +#ifndef PROFILER_THREAD_STORAGE +#define PROFILER_THREAD_STORAGE 32ULL +#endif + +#define PROFILER_PRINT_OVERHEAD 0 + +// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT): +constexpr size_t kMaxDepth = 64; // Maximum nesting of zones. +constexpr size_t kMaxZones = 256; // Total number of zones. + +// Stack of active (entered but not exited) zones. POD, uninitialized. +// Used to deduct child duration from the parent's self time. +struct ActiveZone { + const char* name; + uint64_t entry_timestamp; + uint64_t child_total; +}; + +// Totals for all Zones with the same name. POD, must be zero-initialized. +struct ZoneTotals { + uint64_t total_duration; + const char* name; + uint64_t num_calls; +}; + +template +inline T ClampedSubtract(const T minuend, const T subtrahend) { + if (subtrahend > minuend) { + return 0; + } + return minuend - subtrahend; +} + +} // namespace + +// Per-thread call graph (stack) and ZoneTotals for each zone. +class Results { + public: + Results() { + // Zero-initialize all accumulators (avoids a check for num_zones_ == 0). + memset(zones_, 0, sizeof(zones_)); + } + + // Used for computing overhead when this thread encounters its first Zone. + // This has no observable effect apart from increasing "analyze_elapsed_". + uint64_t ZoneDuration(const Packet* packets) { + HWY_ASSERT(depth_ == 0); + HWY_ASSERT(num_zones_ == 0); + AnalyzePackets(packets, 2); + const uint64_t duration = zones_[0].total_duration; + zones_[0].num_calls = 0; + zones_[0].total_duration = 0; + HWY_ASSERT(depth_ == 0); + num_zones_ = 0; + return duration; + } + + void SetSelfOverhead(const uint64_t self_overhead) { + self_overhead_ = self_overhead; + } + + void SetChildOverhead(const uint64_t child_overhead) { + child_overhead_ = child_overhead; + } + + // Draw all required information from the packets, which can be discarded + // afterwards. Called whenever this thread's storage is full. + void AnalyzePackets(const Packet* HWY_RESTRICT packets, + const size_t num_packets) { + // Ensures prior weakly-ordered streaming stores are globally visible. + hwy::FlushStream(); + + const uint64_t t0 = TicksBefore(); + + for (size_t i = 0; i < num_packets; ++i) { + const uint64_t timestamp = packets[i].timestamp; + // Entering a zone + if (packets[i].name != nullptr) { + HWY_ASSERT(depth_ < kMaxDepth); + zone_stack_[depth_].name = packets[i].name; + zone_stack_[depth_].entry_timestamp = timestamp; + zone_stack_[depth_].child_total = 0; + ++depth_; + continue; + } + + HWY_ASSERT(depth_ != 0); + const ActiveZone& active = zone_stack_[depth_ - 1]; + const uint64_t duration = timestamp - active.entry_timestamp; + const uint64_t self_duration = ClampedSubtract( + duration, self_overhead_ + child_overhead_ + active.child_total); + + UpdateOrAdd(active.name, 1, self_duration); + --depth_; + + // "Deduct" the nested time from its parent's self_duration. + if (depth_ != 0) { + zone_stack_[depth_ - 1].child_total += duration + child_overhead_; + } + } + + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0; + } + + // Incorporates results from another thread. Call after all threads have + // exited any zones. + void Assimilate(const Results& other) { + const uint64_t t0 = TicksBefore(); + HWY_ASSERT(depth_ == 0); + HWY_ASSERT(other.depth_ == 0); + + for (size_t i = 0; i < other.num_zones_; ++i) { + const ZoneTotals& zone = other.zones_[i]; + UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration); + } + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_; + } + + // Single-threaded. + void Print() { + const uint64_t t0 = TicksBefore(); + MergeDuplicates(); + + // Sort by decreasing total (self) cost. + std::sort(zones_, zones_ + num_zones_, + [](const ZoneTotals& r1, const ZoneTotals& r2) { + return r1.total_duration > r2.total_duration; + }); + + uint64_t total_visible_duration = 0; + for (size_t i = 0; i < num_zones_; ++i) { + const ZoneTotals& r = zones_[i]; + if (r.name[0] != '@') { + total_visible_duration += r.total_duration; + printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name, + r.num_calls, r.total_duration / r.num_calls, r.total_duration); + } + } + + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0; + printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_); + printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration); + } + + // Single-threaded. Clears all results as if no zones had been recorded. + void Reset() { + analyze_elapsed_ = 0; + HWY_ASSERT(depth_ == 0); + num_zones_ = 0; + memset(zone_stack_, 0, sizeof(zone_stack_)); + memset(zones_, 0, sizeof(zones_)); + } + + private: + // Updates ZoneTotals of the same name, or inserts a new one if this thread + // has not yet seen that name. Uses a self-organizing list data structure, + // which avoids dynamic memory allocations and is faster than unordered_map. + void UpdateOrAdd(const char* name, const uint64_t num_calls, + const uint64_t duration) { + // Special case for first zone: (maybe) update, without swapping. + if (zones_[0].name == name) { + zones_[0].total_duration += duration; + zones_[0].num_calls += num_calls; + return; + } + + // Look for a zone with the same name. + for (size_t i = 1; i < num_zones_; ++i) { + if (zones_[i].name == name) { + zones_[i].total_duration += duration; + zones_[i].num_calls += num_calls; + // Swap with predecessor (more conservative than move to front, + // but at least as successful). + std::swap(zones_[i - 1], zones_[i]); + return; + } + } + + // Not found; create a new ZoneTotals. + HWY_ASSERT(num_zones_ < kMaxZones); + ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_; + zone->name = name; + zone->num_calls = num_calls; + zone->total_duration = duration; + ++num_zones_; + } + + // Each instantiation of a function template seems to get its own copy of + // __func__ and GCC doesn't merge them. An N^2 search for duplicates is + // acceptable because we only expect a few dozen zones. + void MergeDuplicates() { + for (size_t i = 0; i < num_zones_; ++i) { + // Add any subsequent duplicates to num_calls and total_duration. + for (size_t j = i + 1; j < num_zones_;) { + if (!strcmp(zones_[i].name, zones_[j].name)) { + zones_[i].num_calls += zones_[j].num_calls; + zones_[i].total_duration += zones_[j].total_duration; + // Fill hole with last item. + zones_[j] = zones_[--num_zones_]; + } else { // Name differed, try next ZoneTotals. + ++j; + } + } + } + } + + uint64_t analyze_elapsed_ = 0; + uint64_t self_overhead_ = 0; + uint64_t child_overhead_ = 0; + + size_t depth_ = 0; // Number of active zones <= kMaxDepth. + size_t num_zones_ = 0; // Number of unique zones <= kMaxZones. + + // After other members to avoid large pointer offsets. + alignas(64) ActiveZone zone_stack_[kMaxDepth]; // Last = newest + alignas(64) ZoneTotals zones_[kMaxZones]; // Self-organizing list +}; + +ThreadSpecific::ThreadSpecific() + : max_packets_(PROFILER_THREAD_STORAGE << 16), // MiB / sizeof(Packet) + packets_(hwy::AllocateAligned(max_packets_)), + num_packets_(0), + results_(hwy::MakeUniqueAligned()) {} + +ThreadSpecific::~ThreadSpecific() {} + +void ThreadSpecific::FlushBuffer() { + if (num_packets_ + kBufferCapacity > max_packets_) { + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; + } + // This buffering halves observer overhead and decreases the overall + // runtime by about 3%. + HWY_DYNAMIC_DISPATCH(StreamCacheLine) + (buffer_, packets_.get() + num_packets_); + num_packets_ += kBufferCapacity; + buffer_size_ = 0; +} + +void ThreadSpecific::AnalyzeRemainingPackets() { + // Storage full => empty it. + if (num_packets_ + buffer_size_ > max_packets_) { + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; + } + + // Move buffer to storage + memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet)); + num_packets_ += buffer_size_; + buffer_size_ = 0; + + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; +} + +namespace { + +class HalfSampleMode { + public: + // Returns mode. "sorted" must be in ascending order. + template + T operator()(const T* const HWY_RESTRICT sorted, + const size_t num_values) const { + int64_t center = num_values / 2; + int64_t width = num_values; + + // Zoom in on modal intervals of decreasing width. Stop before we reach + // width=1, i.e. single values, for which there is no "slope". + while (width > 2) { + // Round up so we can still reach the outer edges of odd widths. + width = (width + 1) / 2; + + center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width); + } + + return sorted[center]; // mode := middle value in modal interval. + } + + private: + // Returns center of the densest region [c-radius, c+radius]. + template + static HWY_INLINE int64_t CenterOfIntervalWithMinSlope( + const T* HWY_RESTRICT sorted, const int64_t total_values, + const int64_t center, const int64_t width) { + const int64_t radius = (width + 1) / 2; + + auto compute_slope = [radius, total_values, sorted]( + int64_t c, int64_t* actual_center = nullptr) { + // For symmetry, check 2*radius+1 values, i.e. [min, max]. + const int64_t min = std::max(c - radius, int64_t(0)); + const int64_t max = std::min(c + radius, total_values - 1); + HWY_ASSERT(min < max); + HWY_ASSERT(sorted[min] <= + sorted[max] + std::numeric_limits::epsilon()); + const float dx = max - min + 1; + const float slope = (sorted[max] - sorted[min]) / dx; + + if (actual_center != nullptr) { + // c may be out of bounds, so return center of the clamped bounds. + *actual_center = (min + max + 1) / 2; + } + return slope; + }; + + // First find min_slope for all centers. + float min_slope = std::numeric_limits::max(); + for (int64_t c = center - radius; c <= center + radius; ++c) { + min_slope = std::min(min_slope, compute_slope(c)); + } + + // Candidates := centers with slope ~= min_slope. + std::vector candidates; + for (int64_t c = center - radius; c <= center + radius; ++c) { + int64_t actual_center; + const float slope = compute_slope(c, &actual_center); + if (slope <= min_slope * 1.001f) { + candidates.push_back(actual_center); + } + } + + // Keep the median. + HWY_ASSERT(!candidates.empty()); + if (candidates.size() == 1) return candidates[0]; + std::nth_element(candidates.begin(), + candidates.begin() + candidates.size() / 2, + candidates.end()); + return candidates[candidates.size() / 2]; + } +}; + +} // namespace + +void ThreadSpecific::ComputeOverhead() { + // Delay after capturing timestamps before/after the actual zone runs. Even + // with frequency throttling disabled, this has a multimodal distribution, + // including 32, 34, 48, 52, 59, 62. + uint64_t self_overhead; + { + const size_t kNumSamples = 32; + uint32_t samples[kNumSamples]; + for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { + const size_t kNumDurations = 1024; + uint32_t durations[kNumDurations]; + + for (size_t idx_duration = 0; idx_duration < kNumDurations; + ++idx_duration) { + { // + PROFILER_ZONE("Dummy Zone (never shown)"); + } + const uint64_t duration = results_->ZoneDuration(buffer_); + buffer_size_ = 0; + durations[idx_duration] = static_cast(duration); + HWY_ASSERT(num_packets_ == 0); + } + std::sort(durations, durations + kNumDurations); + samples[idx_sample] = HalfSampleMode()(durations, kNumDurations); + } + // Median. + std::sort(samples, samples + kNumSamples); + self_overhead = samples[kNumSamples / 2]; +#if PROFILER_PRINT_OVERHEAD + printf("Overhead: %" PRIu64 "\n", static_cast(self_overhead)); +#endif + results_->SetSelfOverhead(self_overhead); + } + + // Delay before capturing start timestamp / after end timestamp. + const size_t kNumSamples = 32; + uint32_t samples[kNumSamples]; + for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { + const size_t kNumDurations = 16; + uint32_t durations[kNumDurations]; + for (size_t idx_duration = 0; idx_duration < kNumDurations; + ++idx_duration) { + const size_t kReps = 10000; + // Analysis time should not be included => must fit within buffer. + HWY_ASSERT(kReps * 2 < max_packets_); + hwy::FlushStream(); + const uint64_t t0 = TicksBefore(); + for (size_t i = 0; i < kReps; ++i) { + PROFILER_ZONE("Dummy"); + } + hwy::FlushStream(); + const uint64_t t1 = TicksAfter(); + HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2); + buffer_size_ = 0; + num_packets_ = 0; + const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps; + durations[idx_duration] = + static_cast(ClampedSubtract(avg_duration, self_overhead)); + } + std::sort(durations, durations + kNumDurations); + samples[idx_sample] = HalfSampleMode()(durations, kNumDurations); + } + std::sort(samples, samples + kNumSamples); + const uint64_t child_overhead = samples[9 * kNumSamples / 10]; +#if PROFILER_PRINT_OVERHEAD + printf("Child overhead: %" PRIu64 "\n", + static_cast(child_overhead)); +#endif + results_->SetChildOverhead(child_overhead); +} + +namespace { + +// Could be a static member of Zone, but that would expose in header. +std::atomic& GetHead() { + static std::atomic head_{nullptr}; // Owning + return head_; +} + +} // namespace + +// Thread-safe. +ThreadSpecific* Zone::InitThreadSpecific() { + ThreadSpecific* thread_specific = + hwy::MakeUniqueAligned().release(); + + // Insert into unordered list + std::atomic& head = GetHead(); + ThreadSpecific* old_head = head.load(std::memory_order_relaxed); + thread_specific->SetNext(old_head); + while (!head.compare_exchange_weak(old_head, thread_specific, + std::memory_order_release, + std::memory_order_relaxed)) { + thread_specific->SetNext(old_head); + // TODO(janwas): pause + } + + // ComputeOverhead also creates a Zone, so this needs to be set before that + // to prevent infinite recursion. + GetThreadSpecific() = thread_specific; + + thread_specific->ComputeOverhead(); + return thread_specific; +} + +// Single-threaded. +/*static*/ void Zone::PrintResults() { + ThreadSpecific* head = GetHead().load(std::memory_order_relaxed); + ThreadSpecific* p = head; + while (p) { + p->AnalyzeRemainingPackets(); + + // Combine all threads into a single Result. + if (p != head) { + head->GetResults().Assimilate(p->GetResults()); + p->GetResults().Reset(); + } + + p = p->GetNext(); + } + + if (head != nullptr) { + head->GetResults().Print(); + head->GetResults().Reset(); + } +} + +} // namespace profiler +} // namespace jxl + +#endif // HWY_ONCE +#endif // JXL_PROFILER_ENABLED diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.h b/third_party/jpeg-xl/lib/jxl/base/profiler.h new file mode 100644 index 0000000000..4c0efa4b3a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/profiler.h @@ -0,0 +1,170 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_PROFILER_H_ +#define LIB_JXL_BASE_PROFILER_H_ + +// High precision, low overhead time measurements. Returns exact call counts and +// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes). +// +// To use the profiler you must set the JPEGXL_ENABLE_PROFILER CMake flag, which +// defines JXL_PROFILER_ENABLED. +// +// Usage: instrument regions of interest: { PROFILER_ZONE("name"); /*code*/ } or +// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }. +// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to +// print call counts and average durations [CPU cycles] to stdout, sorted in +// descending order of total duration. + +// If zero, this file has no effect and no measurements will be recorded. +#ifndef JXL_PROFILER_ENABLED +#define JXL_PROFILER_ENABLED 0 +#endif +#if JXL_PROFILER_ENABLED + +#include +#include + +#include +#include + +#include "lib/jxl/base/tsc_timer.h" + +#if HWY_COMPILER_MSVC +#define PROFILER_PUBLIC +#else +#define PROFILER_PUBLIC __attribute__((visibility("default"))) +#endif + +namespace jxl { +namespace profiler { + +// Represents zone entry/exit events. POD. +#pragma pack(push, 1) +struct Packet { + // Computing a hash or string table is likely too expensive, and offsets + // from other libraries' string literals can be too large to combine them and + // a full-resolution timestamp into 64 bits. + uint64_t timestamp; + const char* name; // nullptr for exit packets +#if UINTPTR_MAX <= 0xFFFFFFFFu + uint32_t padding; +#endif +}; +#pragma pack(pop) +static_assert(sizeof(Packet) == 16, "Wrong Packet size"); + +class Results; // pImpl + +// Per-thread packet storage, dynamically allocated and aligned. +class ThreadSpecific { + static constexpr size_t kBufferCapacity = 64 / sizeof(Packet); + + public: + PROFILER_PUBLIC explicit ThreadSpecific(); + PROFILER_PUBLIC ~ThreadSpecific(); + + // Depends on Zone => defined out of line. + PROFILER_PUBLIC void ComputeOverhead(); + + HWY_INLINE void WriteEntry(const char* name) { Write(name, TicksBefore()); } + HWY_INLINE void WriteExit() { Write(nullptr, TicksAfter()); } + + PROFILER_PUBLIC void AnalyzeRemainingPackets(); + + // Accessors instead of public member for well-defined data layout. + void SetNext(ThreadSpecific* next) { next_ = next; } + ThreadSpecific* GetNext() const { return next_; } + + Results& GetResults() { return *results_; } + + private: + PROFILER_PUBLIC void FlushBuffer(); + + // Write packet to buffer/storage, emptying them as needed. + void Write(const char* name, const uint64_t timestamp) { + if (buffer_size_ == kBufferCapacity) { // Full + FlushBuffer(); + } + buffer_[buffer_size_].name = name; + buffer_[buffer_size_].timestamp = timestamp; + ++buffer_size_; + } + + // Write-combining buffer to avoid cache pollution. Must be the first + // non-static member to ensure cache-line alignment. + Packet buffer_[kBufferCapacity]; + size_t buffer_size_ = 0; + + // Contiguous storage for zone enter/exit packets. + const size_t max_packets_; + hwy::AlignedFreeUniquePtr packets_; + size_t num_packets_; + + // Linked list of all threads. + ThreadSpecific* next_ = nullptr; // Owned, never released. + + hwy::AlignedUniquePtr results_; +}; + +// RAII zone enter/exit recorder constructed by PROFILER_ZONE; also +// responsible for initializing ThreadSpecific. +class Zone { + public: + HWY_NOINLINE explicit Zone(const char* name) { + HWY_FENCE; + ThreadSpecific* HWY_RESTRICT thread_specific = GetThreadSpecific(); + if (HWY_UNLIKELY(thread_specific == nullptr)) { + thread_specific = InitThreadSpecific(); + } + + thread_specific->WriteEntry(name); + } + + HWY_NOINLINE ~Zone() { GetThreadSpecific()->WriteExit(); } + + // Call exactly once after all threads have exited all zones. + PROFILER_PUBLIC static void PrintResults(); + + private: + // Returns reference to the thread's ThreadSpecific pointer (initially null). + // Function-local static avoids needing a separate definition. + static ThreadSpecific*& GetThreadSpecific() { + static thread_local ThreadSpecific* thread_specific; + return thread_specific; + } + + // Non time-critical. + PROFILER_PUBLIC ThreadSpecific* InitThreadSpecific(); +}; + +// Creates a zone starting from here until the end of the current scope. +// Timestamps will be recorded when entering and exiting the zone. +// To ensure the name pointer remains valid, we require it to be a string +// literal (by merging with ""). We also compare strings by address. +#define PROFILER_ZONE(name) \ + HWY_FENCE; \ + const ::jxl::profiler::Zone zone("" name); \ + HWY_FENCE + +// Creates a zone for an entire function (when placed at its beginning). +// Shorter/more convenient than ZONE. +#define PROFILER_FUNC \ + HWY_FENCE; \ + const ::jxl::profiler::Zone zone(__func__); \ + HWY_FENCE + +#define PROFILER_PRINT_RESULTS ::jxl::profiler::Zone::PrintResults + +} // namespace profiler +} // namespace jxl + +#else // !JXL_PROFILER_ENABLED +#define PROFILER_ZONE(name) +#define PROFILER_FUNC +#define PROFILER_PRINT_RESULTS() +#endif + +#endif // LIB_JXL_BASE_PROFILER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/random.cc b/third_party/jpeg-xl/lib/jxl/base/random.cc new file mode 100644 index 0000000000..c99f88921c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/random.cc @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/random.h" + +#include + +namespace jxl { + +Rng::GeometricDistribution::GeometricDistribution(float p) + : inv_log_1mp(1.0 / std::log(1 - p)) {} + +uint32_t Rng::Geometric(const GeometricDistribution& dist) { + float f = UniformF(0, 1); + float log = std::log(1 - f) * dist.inv_log_1mp; + return static_cast(log); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/base/random.h b/third_party/jpeg-xl/lib/jxl/base/random.h new file mode 100644 index 0000000000..663b88c95d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/random.h @@ -0,0 +1,95 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_RANDOM_ +#define LIB_JXL_BASE_RANDOM_ + +// Random number generator + distributions. +// We don't use because the implementation (and thus results) differs +// between libstdc++ and libc++. + +#include +#include + +#include + +#include "lib/jxl/base/status.h" + +namespace jxl { +struct Rng { + explicit Rng(size_t seed) + : s{static_cast(0x94D049BB133111EBull), + static_cast(0xBF58476D1CE4E5B9ull) + seed} {} + + // Xorshift128+ adapted from xorshift128+-inl.h + uint64_t operator()() { + uint64_t s1 = s[0]; + const uint64_t s0 = s[1]; + const uint64_t bits = s1 + s0; // b, c + s[0] = s0; + s1 ^= s1 << 23; + s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5); + s[1] = s1; + return bits; + } + + // Uniformly distributed int64_t in [begin, end), under the assumption that + // `end-begin` is significantly smaller than 1<<64, otherwise there is some + // bias. + int64_t UniformI(int64_t begin, int64_t end) { + JXL_DASSERT(end > begin); + return static_cast((*this)() % + static_cast(end - begin)) + + begin; + } + + // Same as UniformI, but for uint64_t. + uint64_t UniformU(uint64_t begin, uint64_t end) { + JXL_DASSERT(end > begin); + return (*this)() % (end - begin) + begin; + } + + // Uniformly distributed float in [begin, end) range. Note: only 23 bits of + // randomness. + float UniformF(float begin, float end) { + float f; + // Bits of a random [1, 2) float. + uint32_t u = ((*this)() >> (64 - 23)) | 0x3F800000; + static_assert(sizeof(f) == sizeof(u), + "Float and U32 must have the same size"); + memcpy(&f, &u, sizeof(f)); + // Note: (end-begin) * f + (2*begin-end) may fail to return a number >= + // begin. + return (end - begin) * (f - 1.0f) + begin; + } + + // Bernoulli trial + bool Bernoulli(float p) { return UniformF(0, 1) < p; } + + // State for geometric distributions. + struct GeometricDistribution { + explicit GeometricDistribution(float p); + + private: + float inv_log_1mp; + friend struct Rng; + }; + + uint32_t Geometric(const GeometricDistribution& dist); + + template + void Shuffle(T* t, size_t n) { + for (size_t i = 0; i + 1 < n; i++) { + size_t a = UniformU(i, n); + std::swap(t[a], t[i]); + } + } + + private: + uint64_t s[2]; +}; + +} // namespace jxl +#endif // LIB_JXL_BASE_RANDOM_ diff --git a/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h b/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h new file mode 100644 index 0000000000..315f3bd003 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h @@ -0,0 +1,44 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_ +#define LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_ + +#ifdef MEMORY_SANITIZER +#define JXL_MEMORY_SANITIZER 1 +#elif defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define JXL_MEMORY_SANITIZER 1 +#else +#define JXL_MEMORY_SANITIZER 0 +#endif +#else +#define JXL_MEMORY_SANITIZER 0 +#endif + +#ifdef ADDRESS_SANITIZER +#define JXL_ADDRESS_SANITIZER 1 +#elif defined(__has_feature) +#if __has_feature(address_sanitizer) +#define JXL_ADDRESS_SANITIZER 1 +#else +#define JXL_ADDRESS_SANITIZER 0 +#endif +#else +#define JXL_ADDRESS_SANITIZER 0 +#endif + +#ifdef THREAD_SANITIZER +#define JXL_THREAD_SANITIZER 1 +#elif defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define JXL_THREAD_SANITIZER 1 +#else +#define JXL_THREAD_SANITIZER 0 +#endif +#else +#define JXL_THREAD_SANITIZER 0 +#endif +#endif // LIB_JXL_BASE_SANITIZER_DEFINITIONS_H diff --git a/third_party/jpeg-xl/lib/jxl/base/scope_guard.h b/third_party/jpeg-xl/lib/jxl/base/scope_guard.h new file mode 100644 index 0000000000..a18a44cb79 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/scope_guard.h @@ -0,0 +1,48 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_SCOPE_GUARD_H_ +#define LIB_JXL_BASE_SCOPE_GUARD_H_ + +#include + +namespace jxl { + +template +class ScopeGuard { + public: + // Discourage unnecessary moves / copies. + ScopeGuard(const ScopeGuard &) = delete; + ScopeGuard &operator=(const ScopeGuard &) = delete; + ScopeGuard &operator=(ScopeGuard &&) = delete; + + // Pre-C++17 does not guarantee RVO -> require move constructor. + ScopeGuard(ScopeGuard &&other) : callback_(std::move(other.callback_)) { + other.armed_ = false; + } + + template + explicit ScopeGuard(CallbackParam &&callback) + : callback_(std::forward(callback)), armed_(true) {} + + ~ScopeGuard() { + if (armed_) callback_(); + } + + void Disarm() { armed_ = false; } + + private: + Callback callback_; + bool armed_; +}; + +template +ScopeGuard MakeScopeGuard(Callback &&callback) { + return ScopeGuard{std::forward(callback)}; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_SCOPE_GUARD_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/span.h b/third_party/jpeg-xl/lib/jxl/base/span.h new file mode 100644 index 0000000000..41c3623a4b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/span.h @@ -0,0 +1,60 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_SPAN_H_ +#define LIB_JXL_BASE_SPAN_H_ + +// Span (array view) is a non-owning container that provides cheap "cut" +// operations and could be used as "ArrayLike" data source for PaddedBytes. + +#include + +#include "lib/jxl/base/status.h" + +namespace jxl { + +template +class Span { + public: + constexpr Span() noexcept : Span(nullptr, 0) {} + + constexpr Span(T* array, size_t length) noexcept + : ptr_(array), len_(length) {} + + template + explicit constexpr Span(T (&a)[N]) noexcept : Span(a, N) {} + + template + explicit constexpr Span(const ArrayLike& other) noexcept + : Span(reinterpret_cast(other.data()), other.size()) { + static_assert(sizeof(*other.data()) == sizeof(T), + "Incompatible type of source."); + } + + constexpr T* data() const noexcept { return ptr_; } + + constexpr size_t size() const noexcept { return len_; } + + constexpr bool empty() const noexcept { return len_ == 0; } + + constexpr T& operator[](size_t i) const noexcept { + // MSVC 2015 accepts this as constexpr, but not ptr_[i] + return *(data() + i); + } + + void remove_prefix(size_t n) noexcept { + JXL_ASSERT(size() >= n); + ptr_ += n; + len_ -= n; + } + + private: + T* ptr_; + size_t len_; +}; + +} // namespace jxl + +#endif // LIB_JXL_BASE_SPAN_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/status.h b/third_party/jpeg-xl/lib/jxl/base/status.h new file mode 100644 index 0000000000..f40be0c434 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/status.h @@ -0,0 +1,326 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_STATUS_H_ +#define LIB_JXL_BASE_STATUS_H_ + +// Error handling: Status return type + helper macros. + +#include +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/sanitizer_definitions.h" + +#if JXL_ADDRESS_SANITIZER || JXL_MEMORY_SANITIZER || JXL_THREAD_SANITIZER +#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace +#endif // defined(*_SANITIZER) + +namespace jxl { + +// Uncomment to abort when JXL_FAILURE or JXL_STATUS with a fatal error is +// reached: +// #define JXL_CRASH_ON_ERROR + +#ifndef JXL_ENABLE_ASSERT +#define JXL_ENABLE_ASSERT 1 +#endif + +#ifndef JXL_ENABLE_CHECK +#define JXL_ENABLE_CHECK 1 +#endif + +// Pass -DJXL_DEBUG_ON_ERROR at compile time to print debug messages when a +// function returns JXL_FAILURE or calls JXL_NOTIFY_ERROR. Note that this is +// irrelevant if you also pass -DJXL_CRASH_ON_ERROR. +#if defined(JXL_DEBUG_ON_ERROR) || defined(JXL_CRASH_ON_ERROR) +#undef JXL_DEBUG_ON_ERROR +#define JXL_DEBUG_ON_ERROR 1 +#else // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR +#ifdef NDEBUG +#define JXL_DEBUG_ON_ERROR 0 +#else // NDEBUG +#define JXL_DEBUG_ON_ERROR 1 +#endif // NDEBUG +#endif // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR + +// Pass -DJXL_DEBUG_ON_ALL_ERROR at compile time to print debug messages on +// all error (fatal and non-fatal) status. This implies JXL_DEBUG_ON_ERROR. +#if defined(JXL_DEBUG_ON_ALL_ERROR) +#undef JXL_DEBUG_ON_ALL_ERROR +#define JXL_DEBUG_ON_ALL_ERROR 1 +// JXL_DEBUG_ON_ALL_ERROR implies JXL_DEBUG_ON_ERROR too. +#undef JXL_DEBUG_ON_ERROR +#define JXL_DEBUG_ON_ERROR 1 +#else // JXL_DEBUG_ON_ALL_ERROR +#define JXL_DEBUG_ON_ALL_ERROR 0 +#endif // JXL_DEBUG_ON_ALL_ERROR + +// The Verbose level for the library +#ifndef JXL_DEBUG_V_LEVEL +#define JXL_DEBUG_V_LEVEL 0 +#endif // JXL_DEBUG_V_LEVEL + +// Pass -DJXL_DEBUG_ON_ABORT=0 to disable the debug messages on JXL_ASSERT, +// JXL_CHECK and JXL_ABORT. +#ifndef JXL_DEBUG_ON_ABORT +#define JXL_DEBUG_ON_ABORT 1 +#endif // JXL_DEBUG_ON_ABORT + +// Print a debug message on standard error. You should use the JXL_DEBUG macro +// instead of calling Debug directly. This function returns false, so it can be +// used as a return value in JXL_FAILURE. +JXL_FORMAT(1, 2) +inline JXL_NOINLINE bool Debug(const char* format, ...) { + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + return false; +} + +// Print a debug message on standard error if "enabled" is true. "enabled" is +// normally a macro that evaluates to 0 or 1 at compile time, so the Debug +// function is never called and optimized out in release builds. Note that the +// arguments are compiled but not evaluated when enabled is false. The format +// string must be a explicit string in the call, for example: +// JXL_DEBUG(JXL_DEBUG_MYMODULE, "my module message: %d", some_var); +// Add a header at the top of your module's .cc or .h file (depending on whether +// you have JXL_DEBUG calls from the .h as well) like this: +// #ifndef JXL_DEBUG_MYMODULE +// #define JXL_DEBUG_MYMODULE 0 +// #endif JXL_DEBUG_MYMODULE +#define JXL_DEBUG_TMP(format, ...) \ + ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__) + +#define JXL_DEBUG(enabled, format, ...) \ + do { \ + if (enabled) { \ + JXL_DEBUG_TMP(format, ##__VA_ARGS__); \ + } \ + } while (0) + +// JXL_DEBUG version that prints the debug message if the global verbose level +// defined at compile time by JXL_DEBUG_V_LEVEL is greater or equal than the +// passed level. +#define JXL_DEBUG_V(level, format, ...) \ + JXL_DEBUG(level <= JXL_DEBUG_V_LEVEL, format, ##__VA_ARGS__) + +// Warnings (via JXL_WARNING) are enabled by default in debug builds (opt and +// debug). +#ifdef JXL_DEBUG_WARNING +#undef JXL_DEBUG_WARNING +#define JXL_DEBUG_WARNING 1 +#else // JXL_DEBUG_WARNING +#ifdef NDEBUG +#define JXL_DEBUG_WARNING 0 +#else // JXL_DEBUG_WARNING +#define JXL_DEBUG_WARNING 1 +#endif // NDEBUG +#endif // JXL_DEBUG_WARNING +#define JXL_WARNING(format, ...) \ + JXL_DEBUG(JXL_DEBUG_WARNING, format, ##__VA_ARGS__) + +// Exits the program after printing a stack trace when possible. +JXL_NORETURN inline JXL_NOINLINE bool Abort() { +#if JXL_ADDRESS_SANITIZER || JXL_MEMORY_SANITIZER || JXL_THREAD_SANITIZER + // If compiled with any sanitizer print a stack trace. This call doesn't crash + // the program, instead the trap below will crash it also allowing gdb to + // break there. + __sanitizer_print_stack_trace(); +#endif // *_SANITIZER) + +#if JXL_COMPILER_MSVC + __debugbreak(); + abort(); +#else + __builtin_trap(); +#endif +} + +// Exits the program after printing file/line plus a formatted string. +#define JXL_ABORT(format, ...) \ + ((JXL_DEBUG_ON_ABORT) && ::jxl::Debug(("%s:%d: JXL_ABORT: " format "\n"), \ + __FILE__, __LINE__, ##__VA_ARGS__), \ + ::jxl::Abort()) + +// Does not guarantee running the code, use only for debug mode checks. +#if JXL_ENABLE_ASSERT +#define JXL_ASSERT(condition) \ + do { \ + if (!(condition)) { \ + JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_ASSERT: %s", #condition); \ + ::jxl::Abort(); \ + } \ + } while (0) +#else +#define JXL_ASSERT(condition) \ + do { \ + } while (0) +#endif + +// Define JXL_IS_DEBUG_BUILD that denotes asan, msan and other debug builds, +// but not opt or release. +#ifndef JXL_IS_DEBUG_BUILD +#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \ + defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER) || \ + defined(__clang_analyzer__) +#define JXL_IS_DEBUG_BUILD 1 +#else +#define JXL_IS_DEBUG_BUILD 0 +#endif +#endif // JXL_IS_DEBUG_BUILD + +// Same as above, but only runs in debug builds (builds where NDEBUG is not +// defined). This is useful for slower asserts that we want to run more rarely +// than usual. These will run on asan, msan and other debug builds, but not in +// opt or release. +#if JXL_IS_DEBUG_BUILD +#define JXL_DASSERT(condition) \ + do { \ + if (!(condition)) { \ + JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_DASSERT: %s", #condition); \ + ::jxl::Abort(); \ + } \ + } while (0) +#else +#define JXL_DASSERT(condition) \ + do { \ + } while (0) +#endif + +// Always runs the condition, so can be used for non-debug calls. +#if JXL_ENABLE_CHECK +#define JXL_CHECK(condition) \ + do { \ + if (!(condition)) { \ + JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_CHECK: %s", #condition); \ + ::jxl::Abort(); \ + } \ + } while (0) +#else +#define JXL_CHECK(condition) \ + do { \ + (void)(condition); \ + } while (0) +#endif + +// A jxl::Status value from a StatusCode or Status which prints a debug message +// when enabled. +#define JXL_STATUS(status, format, ...) \ + ::jxl::StatusMessage(::jxl::Status(status), "%s:%d: " format "\n", __FILE__, \ + __LINE__, ##__VA_ARGS__) + +// Notify of an error but discard the resulting Status value. This is only +// useful for debug builds or when building with JXL_CRASH_ON_ERROR. +#define JXL_NOTIFY_ERROR(format, ...) \ + (void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_ERROR: " format, \ + ##__VA_ARGS__) + +// An error Status with a message. The JXL_STATUS() macro will return a Status +// object with a kGenericError code, but the comma operator helps with +// clang-tidy inference and potentially with optimizations. +#define JXL_FAILURE(format, ...) \ + ((void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_FAILURE: " format, \ + ##__VA_ARGS__), \ + ::jxl::Status(::jxl::StatusCode::kGenericError)) + +// Always evaluates the status exactly once, so can be used for non-debug calls. +// Returns from the current context if the passed Status expression is an error +// (fatal or non-fatal). The return value is the passed Status. +#define JXL_RETURN_IF_ERROR(status) \ + do { \ + ::jxl::Status jxl_return_if_error_status = (status); \ + if (!jxl_return_if_error_status) { \ + (void)::jxl::StatusMessage( \ + jxl_return_if_error_status, \ + "%s:%d: JXL_RETURN_IF_ERROR code=%d: %s\n", __FILE__, __LINE__, \ + static_cast(jxl_return_if_error_status.code()), #status); \ + return jxl_return_if_error_status; \ + } \ + } while (0) + +// As above, but without calling StatusMessage. Intended for bundles (see +// fields.h), which have numerous call sites (-> relevant for code size) and do +// not want to generate excessive messages when decoding partial headers. +#define JXL_QUIET_RETURN_IF_ERROR(status) \ + do { \ + ::jxl::Status jxl_return_if_error_status = (status); \ + if (!jxl_return_if_error_status) { \ + return jxl_return_if_error_status; \ + } \ + } while (0) + +enum class StatusCode : int32_t { + // Non-fatal errors (negative values). + kNotEnoughBytes = -1, + + // The only non-error status code. + kOk = 0, + + // Fatal-errors (positive values) + kGenericError = 1, +}; + +// Drop-in replacement for bool that raises compiler warnings if not used +// after being returned from a function. Example: +// Status LoadFile(...) { return true; } is more compact than +// bool JXL_MUST_USE_RESULT LoadFile(...) { return true; } +// In case of error, the status can carry an extra error code in its value which +// is split between fatal and non-fatal error codes. +class JXL_MUST_USE_RESULT Status { + public: + // We want implicit constructor from bool to allow returning "true" or "false" + // on a function when using Status. "true" means kOk while "false" means a + // generic fatal error. + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr Status(bool ok) + : code_(ok ? StatusCode::kOk : StatusCode::kGenericError) {} + + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr Status(StatusCode code) : code_(code) {} + + // We also want implicit cast to bool to check for return values of functions. + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr operator bool() const { return code_ == StatusCode::kOk; } + + constexpr StatusCode code() const { return code_; } + + // Returns whether the status code is a fatal error. + constexpr bool IsFatalError() const { + return static_cast(code_) > 0; + } + + private: + StatusCode code_; +}; + +// Helper function to create a Status and print the debug message or abort when +// needed. +inline JXL_FORMAT(2, 3) Status + StatusMessage(const Status status, const char* format, ...) { + // This block will be optimized out when JXL_DEBUG_ON_ERROR and + // JXL_DEBUG_ON_ALL_ERROR are both disabled. + if ((JXL_DEBUG_ON_ERROR && status.IsFatalError()) || + (JXL_DEBUG_ON_ALL_ERROR && !status)) { + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + } +#ifdef JXL_CRASH_ON_ERROR + // JXL_CRASH_ON_ERROR means to Abort() only on non-fatal errors. + if (status.IsFatalError()) { + Abort(); + } +#endif // JXL_CRASH_ON_ERROR + return status; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_STATUS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h b/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h new file mode 100644 index 0000000000..74d51f72d1 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h @@ -0,0 +1,172 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_TSC_TIMER_H_ +#define LIB_JXL_BASE_TSC_TIMER_H_ + +// High-resolution (~10 ns) timestamps, using fences to prevent reordering and +// ensure exactly the desired regions are measured. + +#include +#include // clock_gettime + +#if defined(_WIN32) || defined(_WIN64) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif // WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif // NOMINMAX +#ifndef NOGDI +#define NOGDI +#endif // NOGDI +#include +// Undef macros to avoid collisions +#undef LoadFence +#endif + +#if defined(__APPLE__) +#include +#include +#endif + +#if defined(__HAIKU__) +#include +#endif + +#include +#include +#include // LoadFence + +namespace jxl { +namespace profiler { + +// Ticks := platform-specific timer values (CPU cycles on x86). Must be +// unsigned to guarantee wraparound on overflow. +using Ticks = uint64_t; + +// TicksBefore/After return absolute timestamps and must be placed immediately +// before and after the region to measure. We provide separate Before/After +// functions because they use different fences. +// +// Background: RDTSC is not 'serializing'; earlier instructions may complete +// after it, and/or later instructions may complete before it. 'Fences' ensure +// regions' elapsed times are independent of such reordering. The only +// documented unprivileged serializing instruction is CPUID, which acts as a +// full fence (no reordering across it in either direction). Unfortunately +// the latency of CPUID varies wildly (perhaps made worse by not initializing +// its EAX input). Because it cannot reliably be deducted from the region's +// elapsed time, it must not be included in the region to measure (i.e. +// between the two RDTSC). +// +// The newer RDTSCP is sometimes described as serializing, but it actually +// only serves as a half-fence with release semantics. Although all +// instructions in the region will complete before the final timestamp is +// captured, subsequent instructions may leak into the region and increase the +// elapsed time. Inserting another fence after the final RDTSCP would prevent +// such reordering without affecting the measured region. +// +// Fortunately, such a fence exists. The LFENCE instruction is only documented +// to delay later loads until earlier loads are visible. However, Intel's +// reference manual says it acts as a full fence (waiting until all earlier +// instructions have completed, and delaying later instructions until it +// completes). AMD assigns the same behavior to MFENCE. +// +// We need a fence before the initial RDTSC to prevent earlier instructions +// from leaking into the region, and arguably another after RDTSC to avoid +// region instructions from completing before the timestamp is recorded. +// When surrounded by fences, the additional RDTSCP half-fence provides no +// benefit, so the initial timestamp can be recorded via RDTSC, which has +// lower overhead than RDTSCP because it does not read TSC_AUX. In summary, +// we define Before = LFENCE/RDTSC/LFENCE; After = RDTSCP/LFENCE. +// +// Using Before+Before leads to higher variance and overhead than After+After. +// However, After+After includes an LFENCE in the region measurements, which +// adds a delay dependent on earlier loads. The combination of Before+After +// is faster than Before+Before and more consistent than After+After because +// the first LFENCE already delayed subsequent loads before the measured +// region. This combination seems not to have been considered in prior work: +// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c +// +// Note: performance counters can measure 'exact' instructions-retired or +// (unhalted) cycle counts. The RDPMC instruction is not serializing and also +// requires fences. Unfortunately, it is not accessible on all OSes and we +// prefer to avoid kernel-mode drivers. Performance counters are also affected +// by several under/over-count errata, so we use the TSC instead. + +// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, +// divide by InvariantTicksPerSecond. +static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksBefore() { + Ticks t; +#if HWY_ARCH_PPC && defined(__GLIBC__) + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC + hwy::LoadFence(); + HWY_FENCE; + t = __rdtsc(); + hwy::LoadFence(); + HWY_FENCE; +#elif HWY_ARCH_X86_64 + asm volatile( + "lfence\n\t" + "rdtsc\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rdx = TSC >> 32. + // "cc" = flags modified by SHL. + : "rdx", "memory", "cc"); +#elif HWY_ARCH_RVV + asm volatile("rdcycle %0" : "=r"(t)); +#elif defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER counter; + (void)QueryPerformanceCounter(&counter); + t = counter.QuadPart; +#elif defined(__APPLE__) + t = mach_absolute_time(); +#elif defined(__HAIKU__) + t = system_time_nsecs(); // since boot +#else // POSIX + timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + t = static_cast(ts.tv_sec * 1000000000LL + ts.tv_nsec); +#endif + return t; +} + +static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksAfter() { + Ticks t; +#if HWY_ARCH_PPC && defined(__GLIBC__) + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC + HWY_FENCE; + unsigned aux; + t = __rdtscp(&aux); + hwy::LoadFence(); + HWY_FENCE; +#elif HWY_ARCH_X86_64 + // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). + asm volatile( + "rdtscp\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. + // "cc" = flags modified by SHL. + : "rcx", "rdx", "memory", "cc"); +#else + t = TicksBefore(); // no difference on other platforms. +#endif + return t; +} + +} // namespace profiler +} // namespace jxl + +#endif // LIB_JXL_BASE_TSC_TIMER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/bit_reader_test.cc b/third_party/jpeg-xl/lib/jxl/bit_reader_test.cc new file mode 100644 index 0000000000..24cc9b64e8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/bit_reader_test.cc @@ -0,0 +1,262 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/random.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +TEST(BitReaderTest, ExtendsWithZeroes) { + for (size_t size = 4; size < 32; ++size) { + std::vector data(size, 0xff); + + for (size_t n_bytes = 0; n_bytes < size; n_bytes++) { + BitReader br(Span(data.data(), n_bytes)); + // Read all the bits + for (size_t i = 0; i < n_bytes * kBitsPerByte; i++) { + ASSERT_EQ(br.ReadBits(1), 1u) << "n_bytes=" << n_bytes << " i=" << i; + } + + // PEEK more than the declared size - all will be zero. Cannot consume. + for (size_t i = 0; i < BitReader::kMaxBitsPerCall; i++) { + ASSERT_EQ(br.PeekBits(i), 0u) + << "size=" << size << "n_bytes=" << n_bytes << " i=" << i; + } + + EXPECT_TRUE(br.Close()); + } + } +} + +struct Symbol { + uint32_t num_bits; + uint32_t value; +}; + +// Reading from output gives the same values. +TEST(BitReaderTest, TestRoundTrip) { + test::ThreadPoolForTests pool(8); + EXPECT_TRUE(RunOnPool( + &pool, 0, 1000, ThreadPool::NoInit, + [](const uint32_t task, size_t /* thread */) { + constexpr size_t kMaxBits = 8000; + BitWriter writer; + BitWriter::Allotment allotment(&writer, kMaxBits); + + std::vector symbols; + symbols.reserve(1000); + + Rng rng(55537 + 129 * task); + + for (;;) { + const uint32_t num_bits = rng.UniformU(1, 33); + if (writer.BitsWritten() + num_bits > kMaxBits) break; + const uint32_t value = rng.UniformU(0, 1ULL << num_bits); + symbols.push_back({num_bits, value}); + writer.Write(num_bits, value); + } + + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, 0, nullptr); + BitReader reader(writer.GetSpan()); + for (const Symbol& s : symbols) { + EXPECT_EQ(s.value, reader.ReadBits(s.num_bits)); + } + EXPECT_TRUE(reader.Close()); + }, + "TestTBitReaderRoundTrip")); +} + +// SkipBits is the same as reading that many bits. +TEST(BitReaderTest, TestSkip) { + test::ThreadPoolForTests pool(8); + EXPECT_TRUE(RunOnPool( + &pool, 0, 96, ThreadPool::NoInit, + [](const uint32_t task, size_t /* thread */) { + constexpr size_t kSize = 100; + + for (size_t skip = 0; skip < 128; ++skip) { + BitWriter writer; + BitWriter::Allotment allotment(&writer, kSize * kBitsPerByte); + // Start with "task" 1-bits. + for (size_t i = 0; i < task; ++i) { + writer.Write(1, 1); + } + + // Write 0-bits that we will skip over + for (size_t i = 0; i < skip; ++i) { + writer.Write(1, 0); + } + + // Write terminator bits '101' + writer.Write(3, 5); + EXPECT_EQ(task + skip + 3, writer.BitsWritten()); + writer.ZeroPadToByte(); + AuxOut aux_out; + allotment.ReclaimAndCharge(&writer, 0, &aux_out); + EXPECT_LT(aux_out.layers[0].total_bits, kSize * 8); + + BitReader reader1(writer.GetSpan()); + BitReader reader2(writer.GetSpan()); + // Verify initial 1-bits + for (size_t i = 0; i < task; ++i) { + EXPECT_EQ(1u, reader1.ReadBits(1)); + EXPECT_EQ(1u, reader2.ReadBits(1)); + } + + // SkipBits or manually read "skip" bits + reader1.SkipBits(skip); + for (size_t i = 0; i < skip; ++i) { + EXPECT_EQ(0u, reader2.ReadBits(1)) + << " skip=" << skip << " i=" << i; + } + EXPECT_EQ(reader1.TotalBitsConsumed(), reader2.TotalBitsConsumed()); + + // Ensure both readers see the terminator bits. + EXPECT_EQ(5u, reader1.ReadBits(3)); + EXPECT_EQ(5u, reader2.ReadBits(3)); + + EXPECT_TRUE(reader1.Close()); + EXPECT_TRUE(reader2.Close()); + } + }, + "TestSkip")); +} + +// Verifies byte order and different groupings of bits. +TEST(BitReaderTest, TestOrder) { + constexpr size_t kMaxBits = 16; + + // u(1) - bits written into LSBs of first byte + { + BitWriter writer; + BitWriter::Allotment allotment(&writer, kMaxBits); + for (size_t i = 0; i < 5; ++i) { + writer.Write(1, 1); + } + for (size_t i = 0; i < 5; ++i) { + writer.Write(1, 0); + } + for (size_t i = 0; i < 6; ++i) { + writer.Write(1, 1); + } + + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, 0, nullptr); + BitReader reader(writer.GetSpan()); + EXPECT_EQ(0x1Fu, reader.ReadFixedBits<8>()); + EXPECT_EQ(0xFCu, reader.ReadFixedBits<8>()); + EXPECT_TRUE(reader.Close()); + } + + // u(8) - get bytes in the same order + { + BitWriter writer; + BitWriter::Allotment allotment(&writer, kMaxBits); + writer.Write(8, 0xF8); + writer.Write(8, 0x3F); + + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, 0, nullptr); + BitReader reader(writer.GetSpan()); + EXPECT_EQ(0xF8u, reader.ReadFixedBits<8>()); + EXPECT_EQ(0x3Fu, reader.ReadFixedBits<8>()); + EXPECT_TRUE(reader.Close()); + } + + // u(16) - little-endian bytes + { + BitWriter writer; + BitWriter::Allotment allotment(&writer, kMaxBits); + writer.Write(16, 0xF83F); + + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, 0, nullptr); + BitReader reader(writer.GetSpan()); + EXPECT_EQ(0x3Fu, reader.ReadFixedBits<8>()); + EXPECT_EQ(0xF8u, reader.ReadFixedBits<8>()); + EXPECT_TRUE(reader.Close()); + } + + // Non-byte-aligned, mixed sizes + { + BitWriter writer; + BitWriter::Allotment allotment(&writer, kMaxBits); + writer.Write(1, 1); + writer.Write(3, 6); + writer.Write(8, 0xDB); + writer.Write(4, 8); + + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, 0, nullptr); + BitReader reader(writer.GetSpan()); + EXPECT_EQ(0xBDu, reader.ReadFixedBits<8>()); + EXPECT_EQ(0x8Du, reader.ReadFixedBits<8>()); + EXPECT_TRUE(reader.Close()); + } +} + +TEST(BitReaderTest, TotalCountersTest) { + uint8_t buf[8] = {1, 2, 3, 4}; + BitReader reader(Span(buf, sizeof(buf))); + + EXPECT_EQ(sizeof(buf), reader.TotalBytes()); + EXPECT_EQ(0u, reader.TotalBitsConsumed()); + reader.ReadFixedBits<1>(); + EXPECT_EQ(1u, reader.TotalBitsConsumed()); + + reader.ReadFixedBits<10>(); + EXPECT_EQ(11u, reader.TotalBitsConsumed()); + + reader.ReadFixedBits<4>(); + EXPECT_EQ(15u, reader.TotalBitsConsumed()); + + reader.ReadFixedBits<1>(); + EXPECT_EQ(16u, reader.TotalBitsConsumed()); + + reader.ReadFixedBits<16>(); + EXPECT_EQ(32u, reader.TotalBitsConsumed()); + + EXPECT_TRUE(reader.Close()); +} + +TEST(BitReaderTest, MoveTest) { + uint8_t buf[8] = {1, 2, 3, 4}; + BitReader reader2; + { + BitReader reader1(Span(buf, sizeof(buf))); + + EXPECT_EQ(0u, reader1.TotalBitsConsumed()); + reader1.ReadFixedBits<16>(); + EXPECT_EQ(16u, reader1.TotalBitsConsumed()); + + reader2 = std::move(reader1); + // From this point reader1 is invalid, but can continue to access reader2 + // and we don't need to call Close() on reader1. + } + + EXPECT_EQ(16u, reader2.TotalBitsConsumed()); + EXPECT_EQ(3U, reader2.ReadFixedBits<8>()); + EXPECT_EQ(24u, reader2.TotalBitsConsumed()); + + EXPECT_TRUE(reader2.Close()); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/bits_test.cc b/third_party/jpeg-xl/lib/jxl/bits_test.cc new file mode 100644 index 0000000000..bd7aa548c8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/bits_test.cc @@ -0,0 +1,87 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/bits.h" + +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +TEST(BitsTest, TestNumZeroBits) { + // Zero input is well-defined. + EXPECT_EQ(32u, Num0BitsAboveMS1Bit(0u)); + EXPECT_EQ(64u, Num0BitsAboveMS1Bit(0ull)); + EXPECT_EQ(32u, Num0BitsBelowLS1Bit(0u)); + EXPECT_EQ(64u, Num0BitsBelowLS1Bit(0ull)); + + EXPECT_EQ(31u, Num0BitsAboveMS1Bit(1u)); + EXPECT_EQ(30u, Num0BitsAboveMS1Bit(2u)); + EXPECT_EQ(63u, Num0BitsAboveMS1Bit(1ull)); + EXPECT_EQ(62u, Num0BitsAboveMS1Bit(2ull)); + + EXPECT_EQ(0u, Num0BitsBelowLS1Bit(1u)); + EXPECT_EQ(0u, Num0BitsBelowLS1Bit(1ull)); + EXPECT_EQ(1u, Num0BitsBelowLS1Bit(2u)); + EXPECT_EQ(1u, Num0BitsBelowLS1Bit(2ull)); + + EXPECT_EQ(0u, Num0BitsAboveMS1Bit(0x80000000u)); + EXPECT_EQ(0u, Num0BitsAboveMS1Bit(0x8000000000000000ull)); + EXPECT_EQ(31u, Num0BitsBelowLS1Bit(0x80000000u)); + EXPECT_EQ(63u, Num0BitsBelowLS1Bit(0x8000000000000000ull)); +} + +TEST(BitsTest, TestFloorLog2) { + // for input = [1, 7] + const size_t expected[7] = {0, 1, 1, 2, 2, 2, 2}; + for (uint32_t i = 1; i <= 7; ++i) { + EXPECT_EQ(expected[i - 1], FloorLog2Nonzero(i)) << " " << i; + EXPECT_EQ(expected[i - 1], FloorLog2Nonzero(uint64_t(i))) << " " << i; + } + + EXPECT_EQ(11u, FloorLog2Nonzero(0x00000fffu)); // 4095 + EXPECT_EQ(12u, FloorLog2Nonzero(0x00001000u)); // 4096 + EXPECT_EQ(12u, FloorLog2Nonzero(0x00001001u)); // 4097 + + EXPECT_EQ(31u, FloorLog2Nonzero(0x80000000u)); + EXPECT_EQ(31u, FloorLog2Nonzero(0x80000001u)); + EXPECT_EQ(31u, FloorLog2Nonzero(0xFFFFFFFFu)); + + EXPECT_EQ(31u, FloorLog2Nonzero(0x80000000ull)); + EXPECT_EQ(31u, FloorLog2Nonzero(0x80000001ull)); + EXPECT_EQ(31u, FloorLog2Nonzero(0xFFFFFFFFull)); + + EXPECT_EQ(63u, FloorLog2Nonzero(0x8000000000000000ull)); + EXPECT_EQ(63u, FloorLog2Nonzero(0x8000000000000001ull)); + EXPECT_EQ(63u, FloorLog2Nonzero(0xFFFFFFFFFFFFFFFFull)); +} + +TEST(BitsTest, TestCeilLog2) { + // for input = [1, 7] + const size_t expected[7] = {0, 1, 2, 2, 3, 3, 3}; + for (uint32_t i = 1; i <= 7; ++i) { + EXPECT_EQ(expected[i - 1], CeilLog2Nonzero(i)) << " " << i; + EXPECT_EQ(expected[i - 1], CeilLog2Nonzero(uint64_t(i))) << " " << i; + } + + EXPECT_EQ(12u, CeilLog2Nonzero(0x00000fffu)); // 4095 + EXPECT_EQ(12u, CeilLog2Nonzero(0x00001000u)); // 4096 + EXPECT_EQ(13u, CeilLog2Nonzero(0x00001001u)); // 4097 + + EXPECT_EQ(31u, CeilLog2Nonzero(0x80000000u)); + EXPECT_EQ(32u, CeilLog2Nonzero(0x80000001u)); + EXPECT_EQ(32u, CeilLog2Nonzero(0xFFFFFFFFu)); + + EXPECT_EQ(31u, CeilLog2Nonzero(0x80000000ull)); + EXPECT_EQ(32u, CeilLog2Nonzero(0x80000001ull)); + EXPECT_EQ(32u, CeilLog2Nonzero(0xFFFFFFFFull)); + + EXPECT_EQ(63u, CeilLog2Nonzero(0x8000000000000000ull)); + EXPECT_EQ(64u, CeilLog2Nonzero(0x8000000000000001ull)); + EXPECT_EQ(64u, CeilLog2Nonzero(0xFFFFFFFFFFFFFFFFull)); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/blending.cc b/third_party/jpeg-xl/lib/jxl/blending.cc new file mode 100644 index 0000000000..ab37fdabb5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/blending.cc @@ -0,0 +1,152 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/blending.h" + +#include "lib/jxl/alpha.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +bool NeedsBlending(PassesDecoderState* dec_state) { + const PassesSharedState& state = *dec_state->shared; + if (!(state.frame_header.frame_type == FrameType::kRegularFrame || + state.frame_header.frame_type == FrameType::kSkipProgressive)) { + return false; + } + const auto& info = state.frame_header.blending_info; + bool replace_all = (info.mode == BlendMode::kReplace); + for (const auto& ec_i : state.frame_header.extra_channel_blending_info) { + if (ec_i.mode != BlendMode::kReplace) { + replace_all = false; + } + } + // Replace the full frame: nothing to do. + if (!state.frame_header.custom_size_or_origin && replace_all) { + return false; + } + return true; +} + +void PerformBlending(const float* const* bg, const float* const* fg, + float* const* out, size_t x0, size_t xsize, + const PatchBlending& color_blending, + const PatchBlending* ec_blending, + const std::vector& extra_channel_info) { + bool has_alpha = false; + size_t num_ec = extra_channel_info.size(); + for (size_t i = 0; i < num_ec; i++) { + if (extra_channel_info[i].type == jxl::ExtraChannel::kAlpha) { + has_alpha = true; + break; + } + } + ImageF tmp(xsize, 3 + num_ec); + // Blend extra channels first so that we use the pre-blending alpha. + for (size_t i = 0; i < num_ec; i++) { + if (ec_blending[i].mode == PatchBlendMode::kAdd) { + for (size_t x = 0; x < xsize; x++) { + tmp.Row(3 + i)[x] = bg[3 + i][x + x0] + fg[3 + i][x + x0]; + } + } else if (ec_blending[i].mode == PatchBlendMode::kBlendAbove) { + size_t alpha = ec_blending[i].alpha_channel; + bool is_premultiplied = extra_channel_info[alpha].alpha_associated; + PerformAlphaBlending(bg[3 + i] + x0, bg[3 + alpha] + x0, fg[3 + i] + x0, + fg[3 + alpha] + x0, tmp.Row(3 + i), xsize, + is_premultiplied, ec_blending[i].clamp); + } else if (ec_blending[i].mode == PatchBlendMode::kBlendBelow) { + size_t alpha = ec_blending[i].alpha_channel; + bool is_premultiplied = extra_channel_info[alpha].alpha_associated; + PerformAlphaBlending(fg[3 + i] + x0, fg[3 + alpha] + x0, bg[3 + i] + x0, + bg[3 + alpha] + x0, tmp.Row(3 + i), xsize, + is_premultiplied, ec_blending[i].clamp); + } else if (ec_blending[i].mode == PatchBlendMode::kAlphaWeightedAddAbove) { + size_t alpha = ec_blending[i].alpha_channel; + PerformAlphaWeightedAdd(bg[3 + i] + x0, fg[3 + i] + x0, + fg[3 + alpha] + x0, tmp.Row(3 + i), xsize, + ec_blending[i].clamp); + } else if (ec_blending[i].mode == PatchBlendMode::kAlphaWeightedAddBelow) { + size_t alpha = ec_blending[i].alpha_channel; + PerformAlphaWeightedAdd(fg[3 + i] + x0, bg[3 + i] + x0, + bg[3 + alpha] + x0, tmp.Row(3 + i), xsize, + ec_blending[i].clamp); + } else if (ec_blending[i].mode == PatchBlendMode::kMul) { + PerformMulBlending(bg[3 + i] + x0, fg[3 + i] + x0, tmp.Row(3 + i), xsize, + ec_blending[i].clamp); + } else if (ec_blending[i].mode == PatchBlendMode::kReplace) { + memcpy(tmp.Row(3 + i), fg[3 + i] + x0, xsize * sizeof(**fg)); + } else if (ec_blending[i].mode == PatchBlendMode::kNone) { + if (xsize) memcpy(tmp.Row(3 + i), bg[3 + i] + x0, xsize * sizeof(**fg)); + } else { + JXL_ABORT("Unreachable"); + } + } + size_t alpha = color_blending.alpha_channel; + + if (color_blending.mode == PatchBlendMode::kAdd || + (color_blending.mode == PatchBlendMode::kAlphaWeightedAddAbove && + !has_alpha) || + (color_blending.mode == PatchBlendMode::kAlphaWeightedAddBelow && + !has_alpha)) { + for (int p = 0; p < 3; p++) { + float* out = tmp.Row(p); + for (size_t x = 0; x < xsize; x++) { + out[x] = bg[p][x + x0] + fg[p][x + x0]; + } + } + } else if (color_blending.mode == PatchBlendMode::kBlendAbove + // blend without alpha is just replace + && has_alpha) { + bool is_premultiplied = extra_channel_info[alpha].alpha_associated; + PerformAlphaBlending( + {bg[0] + x0, bg[1] + x0, bg[2] + x0, bg[3 + alpha] + x0}, + {fg[0] + x0, fg[1] + x0, fg[2] + x0, fg[3 + alpha] + x0}, + {tmp.Row(0), tmp.Row(1), tmp.Row(2), tmp.Row(3 + alpha)}, xsize, + is_premultiplied, color_blending.clamp); + } else if (color_blending.mode == PatchBlendMode::kBlendBelow + // blend without alpha is just replace + && has_alpha) { + bool is_premultiplied = extra_channel_info[alpha].alpha_associated; + PerformAlphaBlending( + {fg[0] + x0, fg[1] + x0, fg[2] + x0, fg[3 + alpha] + x0}, + {bg[0] + x0, bg[1] + x0, bg[2] + x0, bg[3 + alpha] + x0}, + {tmp.Row(0), tmp.Row(1), tmp.Row(2), tmp.Row(3 + alpha)}, xsize, + is_premultiplied, color_blending.clamp); + } else if (color_blending.mode == PatchBlendMode::kAlphaWeightedAddAbove) { + JXL_DASSERT(has_alpha); + for (size_t c = 0; c < 3; c++) { + PerformAlphaWeightedAdd(bg[c] + x0, fg[c] + x0, fg[3 + alpha] + x0, + tmp.Row(c), xsize, color_blending.clamp); + } + } else if (color_blending.mode == PatchBlendMode::kAlphaWeightedAddBelow) { + JXL_DASSERT(has_alpha); + for (size_t c = 0; c < 3; c++) { + PerformAlphaWeightedAdd(fg[c] + x0, bg[c] + x0, bg[3 + alpha] + x0, + tmp.Row(c), xsize, color_blending.clamp); + } + } else if (color_blending.mode == PatchBlendMode::kMul) { + for (int p = 0; p < 3; p++) { + PerformMulBlending(bg[p] + x0, fg[p] + x0, tmp.Row(p), xsize, + color_blending.clamp); + } + } else if (color_blending.mode == PatchBlendMode::kReplace || + color_blending.mode == PatchBlendMode::kBlendAbove || + color_blending.mode == PatchBlendMode::kBlendBelow) { // kReplace + for (size_t p = 0; p < 3; p++) { + memcpy(tmp.Row(p), fg[p] + x0, xsize * sizeof(**fg)); + } + } else if (color_blending.mode == PatchBlendMode::kNone) { + for (size_t p = 0; p < 3; p++) { + memcpy(tmp.Row(p), bg[p] + x0, xsize * sizeof(**fg)); + } + } else { + JXL_ABORT("Unreachable"); + } + for (size_t i = 0; i < 3 + num_ec; i++) { + if (xsize != 0) memcpy(out[i] + x0, tmp.Row(i), xsize * sizeof(**out)); + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/blending.h b/third_party/jpeg-xl/lib/jxl/blending.h new file mode 100644 index 0000000000..7eab7d50cd --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/blending.h @@ -0,0 +1,24 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BLENDING_H_ +#define LIB_JXL_BLENDING_H_ +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +bool NeedsBlending(PassesDecoderState* dec_state); + +void PerformBlending(const float* const* bg, const float* const* fg, + float* const* out, size_t x0, size_t xsize, + const PatchBlending& color_blending, + const PatchBlending* ec_blending, + const std::vector& extra_channel_info); + +} // namespace jxl + +#endif // LIB_JXL_BLENDING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/blending_test.cc b/third_party/jpeg-xl/lib/jxl/blending_test.cc new file mode 100644 index 0000000000..ff4c46c529 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/blending_test.cc @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/codec.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +using ::testing::SizeIs; + +TEST(BlendingTest, Crops) { + const PaddedBytes compressed = + jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl"); + CodecInOut decoded; + ASSERT_TRUE(test::DecodeFile({}, Span(compressed), &decoded)); + ASSERT_THAT(decoded.frames, SizeIs(4)); + + int i = 0; + for (const ImageBundle& ib : decoded.frames) { + std::ostringstream filename; + filename << "jxl/blending/cropped_traffic_light_frame-" << i << ".png"; + const PaddedBytes compressed_frame = + jxl::test::ReadTestData(filename.str()); + CodecInOut frame; + ASSERT_TRUE(SetFromBytes(Span(compressed_frame), &frame)); + JXL_EXPECT_OK(SamePixels(ib.color(), *frame.Main().color(), _)); + ++i; + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/box_content_decoder.cc b/third_party/jpeg-xl/lib/jxl/box_content_decoder.cc new file mode 100644 index 0000000000..c4cba3a31a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/box_content_decoder.cc @@ -0,0 +1,101 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/box_content_decoder.h" + +#include "lib/jxl/sanitizers.h" + +namespace jxl { + +JxlBoxContentDecoder::JxlBoxContentDecoder() {} + +JxlBoxContentDecoder::~JxlBoxContentDecoder() { + if (brotli_dec) { + BrotliDecoderDestroyInstance(brotli_dec); + } +} + +void JxlBoxContentDecoder::StartBox(bool brob_decode, bool box_until_eof, + size_t contents_size) { + if (brotli_dec) { + BrotliDecoderDestroyInstance(brotli_dec); + brotli_dec = nullptr; + } + header_done_ = false; + brob_decode_ = brob_decode; + box_until_eof_ = box_until_eof; + remaining_ = box_until_eof ? 0 : contents_size; + pos_ = 0; +} + +JxlDecoderStatus JxlBoxContentDecoder::Process(const uint8_t* next_in, + size_t avail_in, size_t box_pos, + uint8_t** next_out, + size_t* avail_out) { + next_in += pos_ - box_pos; + avail_in -= pos_ - box_pos; + + if (brob_decode_) { + if (!header_done_) { + if (avail_in < 4) return JXL_DEC_NEED_MORE_INPUT; + if (!box_until_eof_) { + if (remaining_ < 4) return JXL_DEC_ERROR; + remaining_ -= 4; + } + next_in += 4; + avail_in -= 4; + pos_ += 4; + header_done_ = true; + } + + if (!brotli_dec) { + brotli_dec = BrotliDecoderCreateInstance(nullptr, nullptr, nullptr); + } + + const uint8_t* next_in_before = next_in; + uint8_t* next_out_before = *next_out; + msan::MemoryIsInitialized(next_in, avail_in); + BrotliDecoderResult res = BrotliDecoderDecompressStream( + brotli_dec, &avail_in, &next_in, avail_out, next_out, nullptr); + size_t consumed = next_in - next_in_before; + size_t produced = *next_out - next_out_before; + if (res == BROTLI_DECODER_RESULT_ERROR) { + return JXL_DEC_ERROR; + } + msan::UnpoisonMemory(next_out_before, produced); + pos_ += consumed; + if (!box_until_eof_) remaining_ -= consumed; + if (res == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT) { + return JXL_DEC_NEED_MORE_INPUT; + } + if (res == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) { + return JXL_DEC_BOX_NEED_MORE_OUTPUT; + } + if (res == BROTLI_DECODER_RESULT_SUCCESS) { + return JXL_DEC_SUCCESS; + } + // unknown Brotli result + return JXL_DEC_ERROR; + } else { + // remaining box bytes as seen from dec->file_pos + size_t can_read = avail_in; + if (!box_until_eof_) can_read = std::min(can_read, remaining_); + size_t to_write = std::min(can_read, *avail_out); + memcpy(*next_out, next_in, to_write); + + *next_out += to_write; + *avail_out -= to_write; + if (!box_until_eof_) remaining_ -= to_write; + pos_ += to_write; + + if (to_write < can_read) return JXL_DEC_BOX_NEED_MORE_OUTPUT; + + if (!box_until_eof_ && remaining_ > 0) return JXL_DEC_NEED_MORE_INPUT; + + return JXL_DEC_SUCCESS; + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/box_content_decoder.h b/third_party/jpeg-xl/lib/jxl/box_content_decoder.h new file mode 100644 index 0000000000..6153360a8e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/box_content_decoder.h @@ -0,0 +1,49 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BOX_CONTENT_DECODER_H_ +#define LIB_JXL_BOX_CONTENT_DECODER_H_ + +#include +#include +#include +#include + +#include +#include + +namespace jxl { + +/** Outputs the contents of a box in a streaming fashion, either directly, or + * optionally decoding with Brotli, in case of a brob box. The input must be + * the contents of a box, excluding the box header. + */ +class JxlBoxContentDecoder { + public: + JxlBoxContentDecoder(); + ~JxlBoxContentDecoder(); + + void StartBox(bool brob_decode, bool box_until_eof, size_t contents_size); + + // Outputs decoded bytes from the box, decoding with brotli if needed. + // box_pos is the position in the box content which next_in points to. + // Returns success, whether more input or output bytes are needed, or error. + JxlDecoderStatus Process(const uint8_t* next_in, size_t avail_in, + size_t box_pos, uint8_t** next_out, + size_t* avail_out); + + private: + BrotliDecoderState* brotli_dec; + + bool header_done_; + bool brob_decode_; + bool box_until_eof_; + size_t remaining_; + size_t pos_; +}; + +} // namespace jxl + +#endif // LIB_JXL_BOX_CONTENT_DECODER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc new file mode 100644 index 0000000000..a412becd0d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc @@ -0,0 +1,1988 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com) +// +// The physical architecture of butteraugli is based on the following naming +// convention: +// * Opsin - dynamics of the photosensitive chemicals in the retina +// with their immediate electrical processing +// * Xyb - hybrid opponent/trichromatic color space +// x is roughly red-subtract-green. +// y is yellow. +// b is blue. +// Xyb values are computed from Opsin mixing, not directly from rgb. +// * Mask - for visual masking +// * Hf - color modeling for spatially high-frequency features +// * Lf - color modeling for spatially low-frequency features +// * Diffmap - to cluster and build an image of error between the images +// * Blur - to hold the smoothing code + +#include "lib/jxl/butteraugli/butteraugli.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#if JXL_PROFILER_ENABLED +#include +#endif // JXL_PROFILER_ENABLED + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/butteraugli/butteraugli.cc" +#include + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/gauss_blur.h" +#include "lib/jxl/image_ops.h" + +#ifndef JXL_BUTTERAUGLI_ONCE +#define JXL_BUTTERAUGLI_ONCE + +namespace jxl { + +std::vector ComputeKernel(float sigma) { + const float m = 2.25; // Accuracy increases when m is increased. + const double scaler = -1.0 / (2.0 * sigma * sigma); + const int diff = std::max(1, m * std::fabs(sigma)); + std::vector kernel(2 * diff + 1); + for (int i = -diff; i <= diff; ++i) { + kernel[i + diff] = std::exp(scaler * i * i); + } + return kernel; +} + +void ConvolveBorderColumn(const ImageF& in, const std::vector& kernel, + const size_t x, float* BUTTERAUGLI_RESTRICT row_out) { + const size_t offset = kernel.size() / 2; + int minx = x < offset ? 0 : x - offset; + int maxx = std::min(in.xsize() - 1, x + offset); + float weight = 0.0f; + for (int j = minx; j <= maxx; ++j) { + weight += kernel[j - x + offset]; + } + float scale = 1.0f / weight; + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y); + float sum = 0.0f; + for (int j = minx; j <= maxx; ++j) { + sum += row_in[j] * kernel[j - x + offset]; + } + row_out[y] = sum * scale; + } +} + +// Computes a horizontal convolution and transposes the result. +void ConvolutionWithTranspose(const ImageF& in, + const std::vector& kernel, + ImageF* BUTTERAUGLI_RESTRICT out) { + PROFILER_FUNC; + JXL_CHECK(out->xsize() == in.ysize()); + JXL_CHECK(out->ysize() == in.xsize()); + const size_t len = kernel.size(); + const size_t offset = len / 2; + float weight_no_border = 0.0f; + for (size_t j = 0; j < len; ++j) { + weight_no_border += kernel[j]; + } + const float scale_no_border = 1.0f / weight_no_border; + const size_t border1 = std::min(in.xsize(), offset); + const size_t border2 = in.xsize() > offset ? in.xsize() - offset : 0; + std::vector scaled_kernel(len / 2 + 1); + for (size_t i = 0; i <= len / 2; ++i) { + scaled_kernel[i] = kernel[i] * scale_no_border; + } + + // middle + switch (len) { + case 7: { + PROFILER_ZONE("conv7"); + const float sk0 = scaled_kernel[0]; + const float sk1 = scaled_kernel[1]; + const float sk2 = scaled_kernel[2]; + const float sk3 = scaled_kernel[3]; + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset; + for (size_t x = border1; x < border2; ++x, ++row_in) { + const float sum0 = (row_in[0] + row_in[6]) * sk0; + const float sum1 = (row_in[1] + row_in[5]) * sk1; + const float sum2 = (row_in[2] + row_in[4]) * sk2; + const float sum = (row_in[3]) * sk3 + sum0 + sum1 + sum2; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + row_out[y] = sum; + } + } + } break; + case 13: { + PROFILER_ZONE("conv15"); + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset; + for (size_t x = border1; x < border2; ++x, ++row_in) { + float sum0 = (row_in[0] + row_in[12]) * scaled_kernel[0]; + float sum1 = (row_in[1] + row_in[11]) * scaled_kernel[1]; + float sum2 = (row_in[2] + row_in[10]) * scaled_kernel[2]; + float sum3 = (row_in[3] + row_in[9]) * scaled_kernel[3]; + sum0 += (row_in[4] + row_in[8]) * scaled_kernel[4]; + sum1 += (row_in[5] + row_in[7]) * scaled_kernel[5]; + const float sum = (row_in[6]) * scaled_kernel[6]; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + row_out[y] = sum + sum0 + sum1 + sum2 + sum3; + } + } + break; + } + case 15: { + PROFILER_ZONE("conv15"); + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset; + for (size_t x = border1; x < border2; ++x, ++row_in) { + float sum0 = (row_in[0] + row_in[14]) * scaled_kernel[0]; + float sum1 = (row_in[1] + row_in[13]) * scaled_kernel[1]; + float sum2 = (row_in[2] + row_in[12]) * scaled_kernel[2]; + float sum3 = (row_in[3] + row_in[11]) * scaled_kernel[3]; + sum0 += (row_in[4] + row_in[10]) * scaled_kernel[4]; + sum1 += (row_in[5] + row_in[9]) * scaled_kernel[5]; + sum2 += (row_in[6] + row_in[8]) * scaled_kernel[6]; + const float sum = (row_in[7]) * scaled_kernel[7]; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + row_out[y] = sum + sum0 + sum1 + sum2 + sum3; + } + } + break; + } + case 33: { + PROFILER_ZONE("conv33"); + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset; + for (size_t x = border1; x < border2; ++x, ++row_in) { + float sum0 = (row_in[0] + row_in[32]) * scaled_kernel[0]; + float sum1 = (row_in[1] + row_in[31]) * scaled_kernel[1]; + float sum2 = (row_in[2] + row_in[30]) * scaled_kernel[2]; + float sum3 = (row_in[3] + row_in[29]) * scaled_kernel[3]; + sum0 += (row_in[4] + row_in[28]) * scaled_kernel[4]; + sum1 += (row_in[5] + row_in[27]) * scaled_kernel[5]; + sum2 += (row_in[6] + row_in[26]) * scaled_kernel[6]; + sum3 += (row_in[7] + row_in[25]) * scaled_kernel[7]; + sum0 += (row_in[8] + row_in[24]) * scaled_kernel[8]; + sum1 += (row_in[9] + row_in[23]) * scaled_kernel[9]; + sum2 += (row_in[10] + row_in[22]) * scaled_kernel[10]; + sum3 += (row_in[11] + row_in[21]) * scaled_kernel[11]; + sum0 += (row_in[12] + row_in[20]) * scaled_kernel[12]; + sum1 += (row_in[13] + row_in[19]) * scaled_kernel[13]; + sum2 += (row_in[14] + row_in[18]) * scaled_kernel[14]; + sum3 += (row_in[15] + row_in[17]) * scaled_kernel[15]; + const float sum = (row_in[16]) * scaled_kernel[16]; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + row_out[y] = sum + sum0 + sum1 + sum2 + sum3; + } + } + break; + } + default: + printf("Warning: Unexpected kernel size! %" PRIuS "\n", len); + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y); + for (size_t x = border1; x < border2; ++x) { + const int d = x - offset; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + float sum = 0.0f; + size_t j; + for (j = 0; j <= len / 2; ++j) { + sum += row_in[d + j] * scaled_kernel[j]; + } + for (; j < len; ++j) { + sum += row_in[d + j] * scaled_kernel[len - 1 - j]; + } + row_out[y] = sum; + } + } + } + // left border + for (size_t x = 0; x < border1; ++x) { + ConvolveBorderColumn(in, kernel, x, out->Row(x)); + } + + // right border + for (size_t x = border2; x < in.xsize(); ++x) { + ConvolveBorderColumn(in, kernel, x, out->Row(x)); + } +} + +// A blur somewhat similar to a 2D Gaussian blur. +// See: https://en.wikipedia.org/wiki/Gaussian_blur +// +// This is a bottleneck because the sigma can be quite large (>7). We can use +// gauss_blur.cc (runtime independent of sigma, closer to a 4*sigma truncated +// Gaussian and our 2.25 in ComputeKernel), but its boundary conditions are +// zero-valued. This leads to noticeable differences at the edges of diffmaps. +// We retain a special case for 5x5 kernels (even faster than gauss_blur), +// optionally use gauss_blur followed by fixup of the borders for large images, +// or fall back to the previous truncated FIR followed by a transpose. +void Blur(const ImageF& in, float sigma, const ButteraugliParams& params, + BlurTemp* temp, ImageF* out) { + std::vector kernel = ComputeKernel(sigma); + // Separable5 does an in-place convolution, so this fast path is not safe if + // in aliases out. + if (kernel.size() == 5 && &in != out) { + float sum_weights = 0.0f; + for (const float w : kernel) { + sum_weights += w; + } + const float scale = 1.0f / sum_weights; + const float w0 = kernel[2] * scale; + const float w1 = kernel[1] * scale; + const float w2 = kernel[0] * scale; + const WeightsSeparable5 weights = { + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + }; + Separable5(in, Rect(in), weights, /*pool=*/nullptr, out); + return; + } + + ImageF* JXL_RESTRICT temp_t = temp->GetTransposed(in); + ConvolutionWithTranspose(in, kernel, temp_t); + ConvolutionWithTranspose(*temp_t, kernel, out); +} + +// Allows PaddedMaltaUnit to call either function via overloading. +struct MaltaTagLF {}; +struct MaltaTag {}; + +} // namespace jxl + +#endif // JXL_BUTTERAUGLI_ONCE + +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Abs; +using hwy::HWY_NAMESPACE::Div; +using hwy::HWY_NAMESPACE::Gt; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::IfThenElseZero; +using hwy::HWY_NAMESPACE::Lt; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::MulSub; +using hwy::HWY_NAMESPACE::Neg; +using hwy::HWY_NAMESPACE::Sub; +using hwy::HWY_NAMESPACE::Vec; +using hwy::HWY_NAMESPACE::ZeroIfNegative; + +template +HWY_INLINE V MaximumClamp(D d, V v, double kMaxVal) { + static const double kMul = 0.724216145665; + const V mul = Set(d, kMul); + const V maxval = Set(d, kMaxVal); + // If greater than maxval or less than -maxval, replace with if_*. + const V if_pos = MulAdd(Sub(v, maxval), mul, maxval); + const V if_neg = MulSub(Add(v, maxval), mul, maxval); + const V pos_or_v = IfThenElse(Ge(v, maxval), if_pos, v); + return IfThenElse(Lt(v, Neg(maxval)), if_neg, pos_or_v); +} + +// Make area around zero less important (remove it). +template +HWY_INLINE V RemoveRangeAroundZero(const D d, const double kw, const V x) { + const auto w = Set(d, kw); + return IfThenElse(Gt(x, w), Sub(x, w), + IfThenElseZero(Lt(x, Neg(w)), Add(x, w))); +} + +// Make area around zero more important (2x it until the limit). +template +HWY_INLINE V AmplifyRangeAroundZero(const D d, const double kw, const V x) { + const auto w = Set(d, kw); + return IfThenElse(Gt(x, w), Add(x, w), + IfThenElse(Lt(x, Neg(w)), Sub(x, w), Add(x, x))); +} + +// XybLowFreqToVals converts from low-frequency XYB space to the 'vals' space. +// Vals space can be converted to L2-norm space (Euclidean and normalized) +// through visual masking. +template +HWY_INLINE void XybLowFreqToVals(const D d, const V& x, const V& y, + const V& b_arg, V* HWY_RESTRICT valx, + V* HWY_RESTRICT valy, V* HWY_RESTRICT valb) { + static const double xmul_scalar = 33.832837186260; + static const double ymul_scalar = 14.458268100570; + static const double bmul_scalar = 49.87984651440; + static const double y_to_b_mul_scalar = -0.362267051518; + const V xmul = Set(d, xmul_scalar); + const V ymul = Set(d, ymul_scalar); + const V bmul = Set(d, bmul_scalar); + const V y_to_b_mul = Set(d, y_to_b_mul_scalar); + const V b = MulAdd(y_to_b_mul, y, b_arg); + *valb = Mul(b, bmul); + *valx = Mul(x, xmul); + *valy = Mul(y, ymul); +} + +void SuppressXByY(const ImageF& in_x, const ImageF& in_y, const double yw, + ImageF* HWY_RESTRICT out) { + JXL_DASSERT(SameSize(in_x, in_y) && SameSize(in_x, *out)); + const size_t xsize = in_x.xsize(); + const size_t ysize = in_x.ysize(); + + const HWY_FULL(float) d; + static const double s = 0.653020556257; + const auto sv = Set(d, s); + const auto one_minus_s = Set(d, 1.0 - s); + const auto ywv = Set(d, yw); + + for (size_t y = 0; y < ysize; ++y) { + const float* HWY_RESTRICT row_x = in_x.ConstRow(y); + const float* HWY_RESTRICT row_y = in_y.ConstRow(y); + float* HWY_RESTRICT row_out = out->Row(y); + + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto vx = Load(d, row_x + x); + const auto vy = Load(d, row_y + x); + const auto scaler = + MulAdd(Div(ywv, MulAdd(vy, vy, ywv)), one_minus_s, sv); + Store(Mul(scaler, vx), d, row_out + x); + } + } +} + +static void SeparateFrequencies(size_t xsize, size_t ysize, + const ButteraugliParams& params, + BlurTemp* blur_temp, const Image3F& xyb, + PsychoImage& ps) { + PROFILER_FUNC; + const HWY_FULL(float) d; + + // Extract lf ... + static const double kSigmaLf = 7.15593339443; + static const double kSigmaHf = 3.22489901262; + static const double kSigmaUhf = 1.56416327805; + ps.mf = Image3F(xsize, ysize); + ps.hf[0] = ImageF(xsize, ysize); + ps.hf[1] = ImageF(xsize, ysize); + ps.lf = Image3F(xyb.xsize(), xyb.ysize()); + ps.mf = Image3F(xyb.xsize(), xyb.ysize()); + for (int i = 0; i < 3; ++i) { + Blur(xyb.Plane(i), kSigmaLf, params, blur_temp, &ps.lf.Plane(i)); + + // ... and keep everything else in mf. + for (size_t y = 0; y < ysize; ++y) { + const float* BUTTERAUGLI_RESTRICT row_xyb = xyb.PlaneRow(i, y); + const float* BUTTERAUGLI_RESTRICT row_lf = ps.lf.ConstPlaneRow(i, y); + float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto mf = Sub(Load(d, row_xyb + x), Load(d, row_lf + x)); + Store(mf, d, row_mf + x); + } + } + if (i == 2) { + Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i)); + break; + } + // Divide mf into mf and hf. + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + Store(Load(d, row_mf + x), d, row_hf + x); + } + } + Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i)); + static const double kRemoveMfRange = 0.29; + static const double kAddMfRange = 0.1; + if (i == 0) { + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(0, y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + auto mf = Load(d, row_mf + x); + auto hf = Sub(Load(d, row_hf + x), mf); + mf = RemoveRangeAroundZero(d, kRemoveMfRange, mf); + Store(mf, d, row_mf + x); + Store(hf, d, row_hf + x); + } + } + } else { + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(1, y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + auto mf = Load(d, row_mf + x); + auto hf = Sub(Load(d, row_hf + x), mf); + + mf = AmplifyRangeAroundZero(d, kAddMfRange, mf); + Store(mf, d, row_mf + x); + Store(hf, d, row_hf + x); + } + } + } + } + + // Temporarily used as output of SuppressXByY + ps.uhf[0] = ImageF(xsize, ysize); + ps.uhf[1] = ImageF(xsize, ysize); + + // Suppress red-green by intensity change in the high freq channels. + static const double suppress = 46.0; + SuppressXByY(ps.hf[0], ps.hf[1], suppress, &ps.uhf[0]); + // hf is the SuppressXByY output, uhf will be written below. + ps.hf[0].Swap(ps.uhf[0]); + + for (int i = 0; i < 2; ++i) { + // Divide hf into hf and uhf. + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[i].Row(y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_uhf[x] = row_hf[x]; + } + } + Blur(ps.hf[i], kSigmaUhf, params, blur_temp, &ps.hf[i]); + static const double kRemoveHfRange = 1.5; + static const double kAddHfRange = 0.132; + static const double kRemoveUhfRange = 0.04; + static const double kMaxclampHf = 28.4691806922; + static const double kMaxclampUhf = 5.19175294647; + static double kMulYHf = 2.155; + static double kMulYUhf = 2.69313763794; + if (i == 0) { + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[0].Row(y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + auto hf = Load(d, row_hf + x); + auto uhf = Sub(Load(d, row_uhf + x), hf); + hf = RemoveRangeAroundZero(d, kRemoveHfRange, hf); + uhf = RemoveRangeAroundZero(d, kRemoveUhfRange, uhf); + Store(hf, d, row_hf + x); + Store(uhf, d, row_uhf + x); + } + } + } else { + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[1].Row(y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + auto hf = Load(d, row_hf + x); + hf = MaximumClamp(d, hf, kMaxclampHf); + + auto uhf = Sub(Load(d, row_uhf + x), hf); + uhf = MaximumClamp(d, uhf, kMaxclampUhf); + uhf = Mul(uhf, Set(d, kMulYUhf)); + Store(uhf, d, row_uhf + x); + + hf = Mul(hf, Set(d, kMulYHf)); + hf = AmplifyRangeAroundZero(d, kAddHfRange, hf); + Store(hf, d, row_hf + x); + } + } + } + } + // Modify range around zero code only concerns the high frequency + // planes and only the X and Y channels. + // Convert low freq xyb to vals space so that we can do a simple squared sum + // diff on the low frequencies later. + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_x = ps.lf.PlaneRow(0, y); + float* BUTTERAUGLI_RESTRICT row_y = ps.lf.PlaneRow(1, y); + float* BUTTERAUGLI_RESTRICT row_b = ps.lf.PlaneRow(2, y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + auto valx = Undefined(d); + auto valy = Undefined(d); + auto valb = Undefined(d); + XybLowFreqToVals(d, Load(d, row_x + x), Load(d, row_y + x), + Load(d, row_b + x), &valx, &valy, &valb); + Store(valx, d, row_x + x); + Store(valy, d, row_y + x); + Store(valb, d, row_b + x); + } + } +} + +namespace { +template +BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d) { + return Add(Add(a, b), Add(c, d)); +} +template +BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e) { + return Sum(a, b, c, Add(d, e)); +} +template +BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g) { + return Sum(a, b, c, Sum(d, e, f, g)); +} +template +BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g, V h, V i) { + return Add(Add(Sum(a, b, c, d), Sum(e, f, g, h)), i); +} +} // namespace + +template +Vec MaltaUnit(MaltaTagLF /*tag*/, const D df, + const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) { + const intptr_t xs3 = 3 * xs; + + const auto center = LoadU(df, d); + + // x grows, y constant + const auto sum_yconst = Sum(LoadU(df, d - 4), LoadU(df, d - 2), center, + LoadU(df, d + 2), LoadU(df, d + 4)); + // Will return this, sum of all line kernels + auto retval = Mul(sum_yconst, sum_yconst); + { + // y grows, x constant + auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs - xs), center, + LoadU(df, d + xs + xs), LoadU(df, d + xs3 + xs)); + retval = MulAdd(sum, sum, retval); + } + { + // both grow + auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2), center, + LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3)); + retval = MulAdd(sum, sum, retval); + } + { + // y grows, x shrinks + auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2), center, + LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3)); + retval = MulAdd(sum, sum, retval); + } + { + // y grows -4 to 4, x shrinks 1 -> -1 + auto sum = + Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs - xs + 1), center, + LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 1)); + retval = MulAdd(sum, sum, retval); + } + { + // y grows -4 to 4, x grows -1 -> 1 + auto sum = + Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs - xs - 1), center, + LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 1)); + retval = MulAdd(sum, sum, retval); + } + { + // x grows -4 to 4, y grows -1 to 1 + auto sum = Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 2 - xs), center, + LoadU(df, d + 2 + xs), LoadU(df, d + 4 + xs)); + retval = MulAdd(sum, sum, retval); + } + { + // x grows -4 to 4, y shrinks 1 to -1 + auto sum = Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 2 + xs), center, + LoadU(df, d + 2 - xs), LoadU(df, d + 4 - xs)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1__*______ + 2___*_____ + 3_________ + 4____0____ + 5_________ + 6_____*___ + 7______*__ + 8_________ */ + auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1), center, + LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1______*__ + 2_____*___ + 3_________ + 4____0____ + 5_________ + 6___*_____ + 7__*______ + 8_________ */ + auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1), center, + LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_*_______ + 3__*______ + 4____0____ + 5______*__ + 6_______*_ + 7_________ + 8_________ */ + auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2), center, + LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_______*_ + 3______*__ + 4____0____ + 5__*______ + 6_*_______ + 7_________ + 8_________ */ + auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2), center, + LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2________* + 3______*__ + 4____0____ + 5__*______ + 6*________ + 7_________ + 8_________ */ + + auto sum = Sum(LoadU(df, d + xs + xs - 4), LoadU(df, d + xs - 2), center, + LoadU(df, d - xs + 2), LoadU(df, d - xs - xs + 4)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2*________ + 3__*______ + 4____0____ + 5______*__ + 6________* + 7_________ + 8_________ */ + auto sum = Sum(LoadU(df, d - xs - xs - 4), LoadU(df, d - xs - 2), center, + LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 4)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0__*______ + 1_________ + 2___*_____ + 3_________ + 4____0____ + 5_________ + 6_____*___ + 7_________ + 8______*__ */ + auto sum = + Sum(LoadU(df, d - xs3 - xs - 2), LoadU(df, d - xs - xs - 1), center, + LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 2)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0______*__ + 1_________ + 2_____*___ + 3_________ + 4____0____ + 5_________ + 6___*_____ + 7_________ + 8__*______ */ + auto sum = + Sum(LoadU(df, d - xs3 - xs + 2), LoadU(df, d - xs - xs + 1), center, + LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 2)); + retval = MulAdd(sum, sum, retval); + } + return retval; +} + +template +Vec MaltaUnit(MaltaTag /*tag*/, const D df, + const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) { + const intptr_t xs3 = 3 * xs; + + const auto center = LoadU(df, d); + + // x grows, y constant + const auto sum_yconst = + Sum(LoadU(df, d - 4), LoadU(df, d - 3), LoadU(df, d - 2), + LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2), + LoadU(df, d + 3), LoadU(df, d + 4)); + // Will return this, sum of all line kernels + auto retval = Mul(sum_yconst, sum_yconst); + + { + // y grows, x constant + auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs3), + LoadU(df, d - xs - xs), LoadU(df, d - xs), center, + LoadU(df, d + xs), LoadU(df, d + xs + xs), + LoadU(df, d + xs3), LoadU(df, d + xs3 + xs)); + retval = MulAdd(sum, sum, retval); + } + { + // both grow + auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2), + LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1), + LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3)); + retval = MulAdd(sum, sum, retval); + } + { + // y grows, x shrinks + auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2), + LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1), + LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3)); + retval = MulAdd(sum, sum, retval); + } + { + // y grows -4 to 4, x shrinks 1 -> -1 + auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1), + LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center, + LoadU(df, d + xs), LoadU(df, d + xs + xs - 1), + LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1)); + retval = MulAdd(sum, sum, retval); + } + { + // y grows -4 to 4, x grows -1 -> 1 + auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1), + LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center, + LoadU(df, d + xs), LoadU(df, d + xs + xs + 1), + LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1)); + retval = MulAdd(sum, sum, retval); + } + { + // x grows -4 to 4, y grows -1 to 1 + auto sum = + Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 3 - xs), LoadU(df, d - 2 - xs), + LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 + xs), + LoadU(df, d + 3 + xs), LoadU(df, d + 4 + xs)); + retval = MulAdd(sum, sum, retval); + } + { + // x grows -4 to 4, y shrinks 1 to -1 + auto sum = + Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 3 + xs), LoadU(df, d - 2 + xs), + LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 - xs), + LoadU(df, d + 3 - xs), LoadU(df, d + 4 - xs)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1__*______ + 2___*_____ + 3___*_____ + 4____0____ + 5_____*___ + 6_____*___ + 7______*__ + 8_________ */ + auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1), + LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1), + LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1______*__ + 2_____*___ + 3_____*___ + 4____0____ + 5___*_____ + 6___*_____ + 7__*______ + 8_________ */ + auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1), + LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1), + LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_*_______ + 3__**_____ + 4____0____ + 5_____**__ + 6_______*_ + 7_________ + 8_________ */ + auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2), + LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1), + LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_______*_ + 3_____**__ + 4____0____ + 5__**_____ + 6_*_______ + 7_________ + 8_________ */ + auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2), + LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1), + LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_________ + 3______*** + 4___*0*___ + 5***______ + 6_________ + 7_________ + 8_________ */ + + auto sum = + Sum(LoadU(df, d + xs - 4), LoadU(df, d + xs - 3), LoadU(df, d + xs - 2), + LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d - xs + 2), + LoadU(df, d - xs + 3), LoadU(df, d - xs + 4)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_________ + 3***______ + 4___*0*___ + 5______*** + 6_________ + 7_________ + 8_________ */ + auto sum = + Sum(LoadU(df, d - xs - 4), LoadU(df, d - xs - 3), LoadU(df, d - xs - 2), + LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + xs + 2), + LoadU(df, d + xs + 3), LoadU(df, d + xs + 4)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0___*_____ + 1___*_____ + 2___*_____ + 3____*____ + 4____0____ + 5____*____ + 6_____*___ + 7_____*___ + 8_____*___ */ + auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1), + LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center, + LoadU(df, d + xs), LoadU(df, d + xs + xs + 1), + LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1)); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_____*___ + 1_____*___ + 2____ *___ + 3____*____ + 4____0____ + 5____*____ + 6___*_____ + 7___*_____ + 8___*_____ */ + auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1), + LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center, + LoadU(df, d + xs), LoadU(df, d + xs + xs - 1), + LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1)); + retval = MulAdd(sum, sum, retval); + } + return retval; +} + +// Returns MaltaUnit. Avoids bounds-checks when x0 and y0 are known +// to be far enough from the image borders. "diffs" is a packed image. +template +static BUTTERAUGLI_INLINE float PaddedMaltaUnit(const ImageF& diffs, + const size_t x0, + const size_t y0) { + const float* BUTTERAUGLI_RESTRICT d = diffs.ConstRow(y0) + x0; + const HWY_CAPPED(float, 1) df; + if ((x0 >= 4 && y0 >= 4 && x0 < (diffs.xsize() - 4) && + y0 < (diffs.ysize() - 4))) { + return GetLane(MaltaUnit(Tag(), df, d, diffs.PixelsPerRow())); + } + + PROFILER_ZONE("Padded Malta"); + float borderimage[12 * 9]; // round up to 4 + for (int dy = 0; dy < 9; ++dy) { + int y = y0 + dy - 4; + if (y < 0 || static_cast(y) >= diffs.ysize()) { + for (int dx = 0; dx < 12; ++dx) { + borderimage[dy * 12 + dx] = 0.0f; + } + continue; + } + + const float* row_diffs = diffs.ConstRow(y); + for (int dx = 0; dx < 9; ++dx) { + int x = x0 + dx - 4; + if (x < 0 || static_cast(x) >= diffs.xsize()) { + borderimage[dy * 12 + dx] = 0.0f; + } else { + borderimage[dy * 12 + dx] = row_diffs[x]; + } + } + std::fill(borderimage + dy * 12 + 9, borderimage + dy * 12 + 12, 0.0f); + } + return GetLane(MaltaUnit(Tag(), df, &borderimage[4 * 12 + 4], 12)); +} + +template +static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1, + const double w_0gt1, const double w_0lt1, + const double norm1, const double len, + const double mulli, ImageF* HWY_RESTRICT diffs, + Image3F* HWY_RESTRICT block_diff_ac, size_t c) { + JXL_DASSERT(SameSize(lum0, lum1) && SameSize(lum0, *diffs)); + const size_t xsize_ = lum0.xsize(); + const size_t ysize_ = lum0.ysize(); + + const float kWeight0 = 0.5; + const float kWeight1 = 0.33; + + const double w_pre0gt1 = mulli * std::sqrt(kWeight0 * w_0gt1) / (len * 2 + 1); + const double w_pre0lt1 = mulli * std::sqrt(kWeight1 * w_0lt1) / (len * 2 + 1); + const float norm2_0gt1 = w_pre0gt1 * norm1; + const float norm2_0lt1 = w_pre0lt1 * norm1; + + for (size_t y = 0; y < ysize_; ++y) { + const float* HWY_RESTRICT row0 = lum0.ConstRow(y); + const float* HWY_RESTRICT row1 = lum1.ConstRow(y); + float* HWY_RESTRICT row_diffs = diffs->Row(y); + for (size_t x = 0; x < xsize_; ++x) { + const float absval = 0.5f * (std::abs(row0[x]) + std::abs(row1[x])); + const float diff = row0[x] - row1[x]; + const float scaler = norm2_0gt1 / (static_cast(norm1) + absval); + + // Primary symmetric quadratic objective. + row_diffs[x] = scaler * diff; + + const float scaler2 = norm2_0lt1 / (static_cast(norm1) + absval); + const double fabs0 = std::fabs(row0[x]); + + // Secondary half-open quadratic objectives. + const double too_small = 0.55 * fabs0; + const double too_big = 1.05 * fabs0; + + if (row0[x] < 0) { + if (row1[x] > -too_small) { + double impact = scaler2 * (row1[x] + too_small); + row_diffs[x] -= impact; + } else if (row1[x] < -too_big) { + double impact = scaler2 * (-row1[x] - too_big); + row_diffs[x] += impact; + } + } else { + if (row1[x] < too_small) { + double impact = scaler2 * (too_small - row1[x]); + row_diffs[x] += impact; + } else if (row1[x] > too_big) { + double impact = scaler2 * (row1[x] - too_big); + row_diffs[x] -= impact; + } + } + } + } + + size_t y0 = 0; + // Top + for (; y0 < 4; ++y0) { + float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0); + for (size_t x0 = 0; x0 < xsize_; ++x0) { + row_diff[x0] += PaddedMaltaUnit(*diffs, x0, y0); + } + } + + const HWY_FULL(float) df; + const size_t aligned_x = std::max(size_t(4), Lanes(df)); + const intptr_t stride = diffs->PixelsPerRow(); + + // Middle + for (; y0 < ysize_ - 4; ++y0) { + const float* BUTTERAUGLI_RESTRICT row_in = diffs->ConstRow(y0); + float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0); + size_t x0 = 0; + for (; x0 < aligned_x; ++x0) { + row_diff[x0] += PaddedMaltaUnit(*diffs, x0, y0); + } + for (; x0 + Lanes(df) + 4 <= xsize_; x0 += Lanes(df)) { + auto diff = Load(df, row_diff + x0); + diff = Add(diff, MaltaUnit(Tag(), df, row_in + x0, stride)); + Store(diff, df, row_diff + x0); + } + + for (; x0 < xsize_; ++x0) { + row_diff[x0] += PaddedMaltaUnit(*diffs, x0, y0); + } + } + + // Bottom + for (; y0 < ysize_; ++y0) { + float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0); + for (size_t x0 = 0; x0 < xsize_; ++x0) { + row_diff[x0] += PaddedMaltaUnit(*diffs, x0, y0); + } + } +} + +// Need non-template wrapper functions for HWY_EXPORT. +void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1, + const double w_0lt1, const double norm1, const double len, + const double mulli, ImageF* HWY_RESTRICT diffs, + Image3F* HWY_RESTRICT block_diff_ac, size_t c) { + MaltaDiffMapT(MaltaTag(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, + diffs, block_diff_ac, c); +} + +void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1, + const double w_0lt1, const double norm1, const double len, + const double mulli, ImageF* HWY_RESTRICT diffs, + Image3F* HWY_RESTRICT block_diff_ac, size_t c) { + MaltaDiffMapT(MaltaTagLF(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, + diffs, block_diff_ac, c); +} + +void DiffPrecompute(const ImageF& xyb, float mul, float bias_arg, ImageF* out) { + PROFILER_FUNC; + const size_t xsize = xyb.xsize(); + const size_t ysize = xyb.ysize(); + const float bias = mul * bias_arg; + const float sqrt_bias = sqrt(bias); + for (size_t y = 0; y < ysize; ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = xyb.Row(y); + float* BUTTERAUGLI_RESTRICT row_out = out->Row(y); + for (size_t x = 0; x < xsize; ++x) { + // kBias makes sqrt behave more linearly. + row_out[x] = sqrt(mul * std::abs(row_in[x]) + bias) - sqrt_bias; + } + } +} + +// std::log(80.0) / std::log(255.0); +constexpr float kIntensityTargetNormalizationHack = 0.79079917404f; +static const float kInternalGoodQualityThreshold = + 17.83f * kIntensityTargetNormalizationHack; +static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +void StoreMin3(const float v, float& min0, float& min1, float& min2) { + if (v < min2) { + if (v < min0) { + min2 = min1; + min1 = min0; + min0 = v; + } else if (v < min1) { + min2 = min1; + min1 = v; + } else { + min2 = v; + } + } +} + +// Look for smooth areas near the area of degradation. +// If the areas area generally smooth, don't do masking. +void FuzzyErosion(const ImageF& from, ImageF* to) { + const size_t xsize = from.xsize(); + const size_t ysize = from.ysize(); + static const int kStep = 3; + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + float min0 = from.Row(y)[x]; + float min1 = 2 * min0; + float min2 = min1; + if (x >= kStep) { + float v = from.Row(y)[x - kStep]; + StoreMin3(v, min0, min1, min2); + if (y >= kStep) { + float v = from.Row(y - kStep)[x - kStep]; + StoreMin3(v, min0, min1, min2); + } + if (y < ysize - kStep) { + float v = from.Row(y + kStep)[x - kStep]; + StoreMin3(v, min0, min1, min2); + } + } + if (x < xsize - kStep) { + float v = from.Row(y)[x + kStep]; + StoreMin3(v, min0, min1, min2); + if (y >= kStep) { + float v = from.Row(y - kStep)[x + kStep]; + StoreMin3(v, min0, min1, min2); + } + if (y < ysize - kStep) { + float v = from.Row(y + kStep)[x + kStep]; + StoreMin3(v, min0, min1, min2); + } + } + if (y >= kStep) { + float v = from.Row(y - kStep)[x]; + StoreMin3(v, min0, min1, min2); + } + if (y < ysize - kStep) { + float v = from.Row(y + kStep)[x]; + StoreMin3(v, min0, min1, min2); + } + to->Row(y)[x] = (0.45f * min0 + 0.3f * min1 + 0.25f * min2); + } + } +} + +// Compute values of local frequency and dc masking based on the activity +// in the two images. img_diff_ac may be null. +void Mask(const ImageF& mask0, const ImageF& mask1, + const ButteraugliParams& params, BlurTemp* blur_temp, + ImageF* BUTTERAUGLI_RESTRICT mask, + ImageF* BUTTERAUGLI_RESTRICT diff_ac) { + // Only X and Y components are involved in masking. B's influence + // is considered less important in the high frequency area, and we + // don't model masking from lower frequency signals. + PROFILER_FUNC; + const size_t xsize = mask0.xsize(); + const size_t ysize = mask0.ysize(); + *mask = ImageF(xsize, ysize); + static const float kMul = 6.19424080439; + static const float kBias = 12.61050594197; + static const float kRadius = 2.7; + ImageF diff0(xsize, ysize); + ImageF diff1(xsize, ysize); + ImageF blurred0(xsize, ysize); + ImageF blurred1(xsize, ysize); + DiffPrecompute(mask0, kMul, kBias, &diff0); + DiffPrecompute(mask1, kMul, kBias, &diff1); + Blur(diff0, kRadius, params, blur_temp, &blurred0); + FuzzyErosion(blurred0, &diff0); + Blur(diff1, kRadius, params, blur_temp, &blurred1); + FuzzyErosion(blurred1, &diff1); + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + mask->Row(y)[x] = diff0.Row(y)[x]; + if (diff_ac != nullptr) { + static const float kMaskToErrorMul = 10.0; + float diff = blurred0.Row(y)[x] - blurred1.Row(y)[x]; + diff_ac->Row(y)[x] += kMaskToErrorMul * diff * diff; + } + } + } +} + +// `diff_ac` may be null. +void MaskPsychoImage(const PsychoImage& pi0, const PsychoImage& pi1, + const size_t xsize, const size_t ysize, + const ButteraugliParams& params, Image3F* temp, + BlurTemp* blur_temp, ImageF* BUTTERAUGLI_RESTRICT mask, + ImageF* BUTTERAUGLI_RESTRICT diff_ac) { + ImageF mask0(xsize, ysize); + ImageF mask1(xsize, ysize); + static const float muls[3] = { + 2.5f, + 0.4f, + 0.4f, + }; + // Silly and unoptimized approach here. TODO(jyrki): rework this. + for (size_t y = 0; y < ysize; ++y) { + const float* BUTTERAUGLI_RESTRICT row_y_hf0 = pi0.hf[1].Row(y); + const float* BUTTERAUGLI_RESTRICT row_y_hf1 = pi1.hf[1].Row(y); + const float* BUTTERAUGLI_RESTRICT row_y_uhf0 = pi0.uhf[1].Row(y); + const float* BUTTERAUGLI_RESTRICT row_y_uhf1 = pi1.uhf[1].Row(y); + const float* BUTTERAUGLI_RESTRICT row_x_hf0 = pi0.hf[0].Row(y); + const float* BUTTERAUGLI_RESTRICT row_x_hf1 = pi1.hf[0].Row(y); + const float* BUTTERAUGLI_RESTRICT row_x_uhf0 = pi0.uhf[0].Row(y); + const float* BUTTERAUGLI_RESTRICT row_x_uhf1 = pi1.uhf[0].Row(y); + float* BUTTERAUGLI_RESTRICT row0 = mask0.Row(y); + float* BUTTERAUGLI_RESTRICT row1 = mask1.Row(y); + for (size_t x = 0; x < xsize; ++x) { + float xdiff0 = (row_x_uhf0[x] + row_x_hf0[x]) * muls[0]; + float xdiff1 = (row_x_uhf1[x] + row_x_hf1[x]) * muls[0]; + float ydiff0 = row_y_uhf0[x] * muls[1] + row_y_hf0[x] * muls[2]; + float ydiff1 = row_y_uhf1[x] * muls[1] + row_y_hf1[x] * muls[2]; + row0[x] = xdiff0 * xdiff0 + ydiff0 * ydiff0; + row0[x] = sqrt(row0[x]); + row1[x] = xdiff1 * xdiff1 + ydiff1 * ydiff1; + row1[x] = sqrt(row1[x]); + } + } + Mask(mask0, mask1, params, blur_temp, mask, diff_ac); +} + +double MaskY(double delta) { + static const double offset = 0.829591754942; + static const double scaler = 0.451936922203; + static const double mul = 2.5485944793; + const double c = mul / ((scaler * delta) + offset); + const double retval = kGlobalScale * (1.0 + c); + return retval * retval; +} + +double MaskDcY(double delta) { + static const double offset = 0.20025578522; + static const double scaler = 3.87449418804; + static const double mul = 0.505054525019; + const double c = mul / ((scaler * delta) + offset); + const double retval = kGlobalScale * (1.0 + c); + return retval * retval; +} + +inline float MaskColor(const float color[3], const float mask) { + return color[0] * mask + color[1] * mask + color[2] * mask; +} + +// Diffmap := sqrt of sum{diff images by multiplied by X and Y/B masks} +void CombineChannelsToDiffmap(const ImageF& mask, const Image3F& block_diff_dc, + const Image3F& block_diff_ac, float xmul, + ImageF* result) { + PROFILER_FUNC; + JXL_CHECK(SameSize(mask, *result)); + size_t xsize = mask.xsize(); + size_t ysize = mask.ysize(); + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_out = result->Row(y); + for (size_t x = 0; x < xsize; ++x) { + float val = mask.Row(y)[x]; + float maskval = MaskY(val); + float dc_maskval = MaskDcY(val); + float diff_dc[3]; + float diff_ac[3]; + for (int i = 0; i < 3; ++i) { + diff_dc[i] = block_diff_dc.PlaneRow(i, y)[x]; + diff_ac[i] = block_diff_ac.PlaneRow(i, y)[x]; + } + diff_ac[0] *= xmul; + diff_dc[0] *= xmul; + row_out[x] = + sqrt(MaskColor(diff_dc, dc_maskval) + MaskColor(diff_ac, maskval)); + } + } +} + +// Adds weighted L2 difference between i0 and i1 to diffmap. +static void L2Diff(const ImageF& i0, const ImageF& i1, const float w, + Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) { + if (w == 0) return; + + const HWY_FULL(float) d; + const auto weight = Set(d, w); + + for (size_t y = 0; y < i0.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y); + const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y); + float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y); + + for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) { + const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x)); + const auto diff2 = Mul(diff, diff); + const auto prev = Load(d, row_diff + x); + Store(MulAdd(diff2, weight, prev), d, row_diff + x); + } + } +} + +// Initializes diffmap to the weighted L2 difference between i0 and i1. +static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w, + Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) { + if (w == 0) return; + + const HWY_FULL(float) d; + const auto weight = Set(d, w); + + for (size_t y = 0; y < i0.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y); + const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y); + float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y); + + for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) { + const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x)); + const auto diff2 = Mul(diff, diff); + Store(Mul(diff2, weight), d, row_diff + x); + } + } +} + +// i0 is the original image. +// i1 is the deformed copy. +static void L2DiffAsymmetric(const ImageF& i0, const ImageF& i1, float w_0gt1, + float w_0lt1, + Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) { + if (w_0gt1 == 0 && w_0lt1 == 0) { + return; + } + + const HWY_FULL(float) d; + const auto vw_0gt1 = Set(d, w_0gt1 * 0.8); + const auto vw_0lt1 = Set(d, w_0lt1 * 0.8); + + for (size_t y = 0; y < i0.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row0 = i0.Row(y); + const float* BUTTERAUGLI_RESTRICT row1 = i1.Row(y); + float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y); + + for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) { + const auto val0 = Load(d, row0 + x); + const auto val1 = Load(d, row1 + x); + + // Primary symmetric quadratic objective. + const auto diff = Sub(val0, val1); + auto total = MulAdd(Mul(diff, diff), vw_0gt1, Load(d, row_diff + x)); + + // Secondary half-open quadratic objectives. + const auto fabs0 = Abs(val0); + const auto too_small = Mul(Set(d, 0.4), fabs0); + const auto too_big = fabs0; + + const auto if_neg = IfThenElse( + Gt(val1, Neg(too_small)), Add(val1, too_small), + IfThenElseZero(Lt(val1, Neg(too_big)), Sub(Neg(val1), too_big))); + const auto if_pos = + IfThenElse(Lt(val1, too_small), Sub(too_small, val1), + IfThenElseZero(Gt(val1, too_big), Sub(val1, too_big))); + const auto v = IfThenElse(Lt(val0, Zero(d)), if_neg, if_pos); + total = MulAdd(vw_0lt1, Mul(v, v), total); + Store(total, d, row_diff + x); + } + } +} + +// A simple HDR compatible gamma function. +template +V Gamma(const DF df, V v) { + // ln(2) constant folded in because we want std::log but have FastLog2f. + const auto kRetMul = Set(df, 19.245013259874995f * 0.693147180559945f); + const auto kRetAdd = Set(df, -23.16046239805755); + // This should happen rarely, but may lead to a NaN in log, which is + // undesirable. Since negative photons don't exist we solve the NaNs by + // clamping here. + v = ZeroIfNegative(v); + + const auto biased = Add(v, Set(df, 9.9710635769299145)); + const auto log = FastLog2f(df, biased); + // We could fold this into a custom Log2 polynomial, but there would be + // relatively little gain. + return MulAdd(kRetMul, log, kRetAdd); +} + +template +BUTTERAUGLI_INLINE void OpsinAbsorbance(const DF df, const V& in0, const V& in1, + const V& in2, V* JXL_RESTRICT out0, + V* JXL_RESTRICT out1, + V* JXL_RESTRICT out2) { + // https://en.wikipedia.org/wiki/Photopsin absorbance modeling. + static const double mixi0 = 0.29956550340058319; + static const double mixi1 = 0.63373087833825936; + static const double mixi2 = 0.077705617820981968; + static const double mixi3 = 1.7557483643287353; + static const double mixi4 = 0.22158691104574774; + static const double mixi5 = 0.69391388044116142; + static const double mixi6 = 0.0987313588422; + static const double mixi7 = 1.7557483643287353; + static const double mixi8 = 0.02; + static const double mixi9 = 0.02; + static const double mixi10 = 0.20480129041026129; + static const double mixi11 = 12.226454707163354; + + const V mix0 = Set(df, mixi0); + const V mix1 = Set(df, mixi1); + const V mix2 = Set(df, mixi2); + const V mix3 = Set(df, mixi3); + const V mix4 = Set(df, mixi4); + const V mix5 = Set(df, mixi5); + const V mix6 = Set(df, mixi6); + const V mix7 = Set(df, mixi7); + const V mix8 = Set(df, mixi8); + const V mix9 = Set(df, mixi9); + const V mix10 = Set(df, mixi10); + const V mix11 = Set(df, mixi11); + + *out0 = MulAdd(mix0, in0, MulAdd(mix1, in1, MulAdd(mix2, in2, mix3))); + *out1 = MulAdd(mix4, in0, MulAdd(mix5, in1, MulAdd(mix6, in2, mix7))); + *out2 = MulAdd(mix8, in0, MulAdd(mix9, in1, MulAdd(mix10, in2, mix11))); + + if (Clamp) { + *out0 = Max(*out0, mix3); + *out1 = Max(*out1, mix7); + *out2 = Max(*out2, mix11); + } +} + +// `blurred` is a temporary image used inside this function and not returned. +Image3F OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params, + Image3F* blurred, BlurTemp* blur_temp) { + PROFILER_FUNC; + Image3F xyb(rgb.xsize(), rgb.ysize()); + const double kSigma = 1.2; + Blur(rgb.Plane(0), kSigma, params, blur_temp, &blurred->Plane(0)); + Blur(rgb.Plane(1), kSigma, params, blur_temp, &blurred->Plane(1)); + Blur(rgb.Plane(2), kSigma, params, blur_temp, &blurred->Plane(2)); + const HWY_FULL(float) df; + const auto intensity_target_multiplier = Set(df, params.intensity_target); + for (size_t y = 0; y < rgb.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_r = rgb.ConstPlaneRow(0, y); + const float* BUTTERAUGLI_RESTRICT row_g = rgb.ConstPlaneRow(1, y); + const float* BUTTERAUGLI_RESTRICT row_b = rgb.ConstPlaneRow(2, y); + const float* BUTTERAUGLI_RESTRICT row_blurred_r = + blurred->ConstPlaneRow(0, y); + const float* BUTTERAUGLI_RESTRICT row_blurred_g = + blurred->ConstPlaneRow(1, y); + const float* BUTTERAUGLI_RESTRICT row_blurred_b = + blurred->ConstPlaneRow(2, y); + float* BUTTERAUGLI_RESTRICT row_out_x = xyb.PlaneRow(0, y); + float* BUTTERAUGLI_RESTRICT row_out_y = xyb.PlaneRow(1, y); + float* BUTTERAUGLI_RESTRICT row_out_b = xyb.PlaneRow(2, y); + const auto min = Set(df, 1e-4f); + for (size_t x = 0; x < rgb.xsize(); x += Lanes(df)) { + auto sensitivity0 = Undefined(df); + auto sensitivity1 = Undefined(df); + auto sensitivity2 = Undefined(df); + { + // Calculate sensitivity based on the smoothed image gamma derivative. + auto pre_mixed0 = Undefined(df); + auto pre_mixed1 = Undefined(df); + auto pre_mixed2 = Undefined(df); + OpsinAbsorbance( + df, Mul(Load(df, row_blurred_r + x), intensity_target_multiplier), + Mul(Load(df, row_blurred_g + x), intensity_target_multiplier), + Mul(Load(df, row_blurred_b + x), intensity_target_multiplier), + &pre_mixed0, &pre_mixed1, &pre_mixed2); + pre_mixed0 = Max(pre_mixed0, min); + pre_mixed1 = Max(pre_mixed1, min); + pre_mixed2 = Max(pre_mixed2, min); + sensitivity0 = Div(Gamma(df, pre_mixed0), pre_mixed0); + sensitivity1 = Div(Gamma(df, pre_mixed1), pre_mixed1); + sensitivity2 = Div(Gamma(df, pre_mixed2), pre_mixed2); + sensitivity0 = Max(sensitivity0, min); + sensitivity1 = Max(sensitivity1, min); + sensitivity2 = Max(sensitivity2, min); + } + auto cur_mixed0 = Undefined(df); + auto cur_mixed1 = Undefined(df); + auto cur_mixed2 = Undefined(df); + OpsinAbsorbance( + df, Mul(Load(df, row_r + x), intensity_target_multiplier), + Mul(Load(df, row_g + x), intensity_target_multiplier), + Mul(Load(df, row_b + x), intensity_target_multiplier), &cur_mixed0, + &cur_mixed1, &cur_mixed2); + cur_mixed0 = Mul(cur_mixed0, sensitivity0); + cur_mixed1 = Mul(cur_mixed1, sensitivity1); + cur_mixed2 = Mul(cur_mixed2, sensitivity2); + // This is a kludge. The negative values should be zeroed away before + // blurring. Ideally there would be no negative values in the first place. + const auto min01 = Set(df, 1.7557483643287353f); + const auto min2 = Set(df, 12.226454707163354f); + cur_mixed0 = Max(cur_mixed0, min01); + cur_mixed1 = Max(cur_mixed1, min01); + cur_mixed2 = Max(cur_mixed2, min2); + + Store(Sub(cur_mixed0, cur_mixed1), df, row_out_x + x); + Store(Add(cur_mixed0, cur_mixed1), df, row_out_y + x); + Store(cur_mixed2, df, row_out_b + x); + } + } + return xyb; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(SeparateFrequencies); // Local function. +HWY_EXPORT(MaskPsychoImage); // Local function. +HWY_EXPORT(L2DiffAsymmetric); // Local function. +HWY_EXPORT(L2Diff); // Local function. +HWY_EXPORT(SetL2Diff); // Local function. +HWY_EXPORT(CombineChannelsToDiffmap); // Local function. +HWY_EXPORT(MaltaDiffMap); // Local function. +HWY_EXPORT(MaltaDiffMapLF); // Local function. +HWY_EXPORT(OpsinDynamicsImage); // Local function. + +#if BUTTERAUGLI_ENABLE_CHECKS + +static inline bool IsNan(const float x) { + uint32_t bits; + memcpy(&bits, &x, sizeof(bits)); + const uint32_t bitmask_exp = 0x7F800000; + return (bits & bitmask_exp) == bitmask_exp && (bits & 0x7FFFFF); +} + +static inline bool IsNan(const double x) { + uint64_t bits; + memcpy(&bits, &x, sizeof(bits)); + return (0x7ff0000000000001ULL <= bits && bits <= 0x7fffffffffffffffULL) || + (0xfff0000000000001ULL <= bits && bits <= 0xffffffffffffffffULL); +} + +static inline void CheckImage(const ImageF& image, const char* name) { + PROFILER_FUNC; + for (size_t y = 0; y < image.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row = image.Row(y); + for (size_t x = 0; x < image.xsize(); ++x) { + if (IsNan(row[x])) { + printf("NAN: Image %s @ %" PRIuS ",%" PRIuS " (of %" PRIuS ",%" PRIuS + ")\n", + name, x, y, image.xsize(), image.ysize()); + exit(1); + } + } + } +} + +#define CHECK_NAN(x, str) \ + do { \ + if (IsNan(x)) { \ + printf("%d: %s\n", __LINE__, str); \ + abort(); \ + } \ + } while (0) + +#define CHECK_IMAGE(image, name) CheckImage(image, name) + +#else // BUTTERAUGLI_ENABLE_CHECKS + +#define CHECK_NAN(x, str) +#define CHECK_IMAGE(image, name) + +#endif // BUTTERAUGLI_ENABLE_CHECKS + +// Calculate a 2x2 subsampled image for purposes of recursive butteraugli at +// multiresolution. +static Image3F SubSample2x(const Image3F& in) { + size_t xs = (in.xsize() + 1) / 2; + size_t ys = (in.ysize() + 1) / 2; + Image3F retval(xs, ys); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ys; ++y) { + for (size_t x = 0; x < xs; ++x) { + retval.PlaneRow(c, y)[x] = 0; + } + } + } + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < in.ysize(); ++y) { + for (size_t x = 0; x < in.xsize(); ++x) { + retval.PlaneRow(c, y / 2)[x / 2] += 0.25f * in.PlaneRow(c, y)[x]; + } + } + if ((in.xsize() & 1) != 0) { + for (size_t y = 0; y < retval.ysize(); ++y) { + size_t last_column = retval.xsize() - 1; + retval.PlaneRow(c, y)[last_column] *= 2.0f; + } + } + if ((in.ysize() & 1) != 0) { + for (size_t x = 0; x < retval.xsize(); ++x) { + size_t last_row = retval.ysize() - 1; + retval.PlaneRow(c, last_row)[x] *= 2.0f; + } + } + } + return retval; +} + +// Supersample src by 2x and add it to dest. +static void AddSupersampled2x(const ImageF& src, float w, ImageF& dest) { + for (size_t y = 0; y < dest.ysize(); ++y) { + for (size_t x = 0; x < dest.xsize(); ++x) { + // There will be less errors from the more averaged images. + // We take it into account to some extent using a scaler. + static const double kHeuristicMixingValue = 0.3; + dest.Row(y)[x] *= 1.0 - kHeuristicMixingValue * w; + dest.Row(y)[x] += w * src.Row(y / 2)[x / 2]; + } + } +} + +Image3F* ButteraugliComparator::Temp() const { + bool was_in_use = temp_in_use_.test_and_set(std::memory_order_acq_rel); + JXL_ASSERT(!was_in_use); + (void)was_in_use; + return &temp_; +} + +void ButteraugliComparator::ReleaseTemp() const { temp_in_use_.clear(); } + +ButteraugliComparator::ButteraugliComparator(const Image3F& rgb0, + const ButteraugliParams& params) + : xsize_(rgb0.xsize()), + ysize_(rgb0.ysize()), + params_(params), + temp_(xsize_, ysize_) { + if (xsize_ < 8 || ysize_ < 8) { + return; + } + + Image3F xyb0 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(rgb0, params, Temp(), + &blur_temp_); + ReleaseTemp(); + HWY_DYNAMIC_DISPATCH(SeparateFrequencies) + (xsize_, ysize_, params_, &blur_temp_, xyb0, pi0_); + + // Awful recursive construction of samples of different resolution. + // This is an after-thought and possibly somewhat parallel in + // functionality with the PsychoImage multi-resolution approach. + sub_.reset(new ButteraugliComparator(SubSample2x(rgb0), params)); +} + +void ButteraugliComparator::Mask(ImageF* BUTTERAUGLI_RESTRICT mask) const { + HWY_DYNAMIC_DISPATCH(MaskPsychoImage) + (pi0_, pi0_, xsize_, ysize_, params_, Temp(), &blur_temp_, mask, nullptr); + ReleaseTemp(); +} + +void ButteraugliComparator::Diffmap(const Image3F& rgb1, ImageF& result) const { + PROFILER_FUNC; + if (xsize_ < 8 || ysize_ < 8) { + ZeroFillImage(&result); + return; + } + const Image3F xyb1 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)( + rgb1, params_, Temp(), &blur_temp_); + ReleaseTemp(); + DiffmapOpsinDynamicsImage(xyb1, result); + if (sub_) { + if (sub_->xsize_ < 8 || sub_->ysize_ < 8) { + return; + } + const Image3F sub_xyb = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)( + SubSample2x(rgb1), params_, sub_->Temp(), &sub_->blur_temp_); + sub_->ReleaseTemp(); + ImageF subresult; + sub_->DiffmapOpsinDynamicsImage(sub_xyb, subresult); + AddSupersampled2x(subresult, 0.5, result); + } +} + +void ButteraugliComparator::DiffmapOpsinDynamicsImage(const Image3F& xyb1, + ImageF& result) const { + PROFILER_FUNC; + if (xsize_ < 8 || ysize_ < 8) { + ZeroFillImage(&result); + return; + } + PsychoImage pi1; + HWY_DYNAMIC_DISPATCH(SeparateFrequencies) + (xsize_, ysize_, params_, &blur_temp_, xyb1, pi1); + result = ImageF(xsize_, ysize_); + DiffmapPsychoImage(pi1, result); +} + +namespace { + +void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1, + const double w_0lt1, const double norm1, + ImageF* HWY_RESTRICT diffs, + Image3F* HWY_RESTRICT block_diff_ac, size_t c) { + PROFILER_FUNC; + const double len = 3.75; + static const double mulli = 0.39905817637; + HWY_DYNAMIC_DISPATCH(MaltaDiffMap) + (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c); +} + +void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1, + const double w_0lt1, const double norm1, + ImageF* HWY_RESTRICT diffs, + Image3F* HWY_RESTRICT block_diff_ac, size_t c) { + PROFILER_FUNC; + const double len = 3.75; + static const double mulli = 0.611612573796; + HWY_DYNAMIC_DISPATCH(MaltaDiffMapLF) + (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c); +} + +} // namespace + +void ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1, + ImageF& diffmap) const { + PROFILER_FUNC; + if (xsize_ < 8 || ysize_ < 8) { + ZeroFillImage(&diffmap); + return; + } + + const float hf_asymmetry_ = params_.hf_asymmetry; + const float xmul_ = params_.xmul; + + ImageF diffs(xsize_, ysize_); + Image3F block_diff_ac(xsize_, ysize_); + ZeroFillImage(&block_diff_ac); + static const double wUhfMalta = 1.10039032555; + static const double norm1Uhf = 71.7800275169; + MaltaDiffMap(pi0_.uhf[1], pi1.uhf[1], wUhfMalta * hf_asymmetry_, + wUhfMalta / hf_asymmetry_, norm1Uhf, &diffs, &block_diff_ac, 1); + + static const double wUhfMaltaX = 173.5; + static const double norm1UhfX = 5.0; + MaltaDiffMap(pi0_.uhf[0], pi1.uhf[0], wUhfMaltaX * hf_asymmetry_, + wUhfMaltaX / hf_asymmetry_, norm1UhfX, &diffs, &block_diff_ac, + 0); + + static const double wHfMalta = 18.7237414387; + static const double norm1Hf = 4498534.45232; + MaltaDiffMapLF(pi0_.hf[1], pi1.hf[1], wHfMalta * std::sqrt(hf_asymmetry_), + wHfMalta / std::sqrt(hf_asymmetry_), norm1Hf, &diffs, + &block_diff_ac, 1); + + static const double wHfMaltaX = 6923.99476109; + static const double norm1HfX = 8051.15833247; + MaltaDiffMapLF(pi0_.hf[0], pi1.hf[0], wHfMaltaX * std::sqrt(hf_asymmetry_), + wHfMaltaX / std::sqrt(hf_asymmetry_), norm1HfX, &diffs, + &block_diff_ac, 0); + + static const double wMfMalta = 37.0819870399; + static const double norm1Mf = 130262059.556; + MaltaDiffMapLF(pi0_.mf.Plane(1), pi1.mf.Plane(1), wMfMalta, wMfMalta, norm1Mf, + &diffs, &block_diff_ac, 1); + + static const double wMfMaltaX = 8246.75321353; + static const double norm1MfX = 1009002.70582; + MaltaDiffMapLF(pi0_.mf.Plane(0), pi1.mf.Plane(0), wMfMaltaX, wMfMaltaX, + norm1MfX, &diffs, &block_diff_ac, 0); + + static const double wmul[9] = { + 400.0, 1.50815703118, 0, + 2150.0, 10.6195433239, 16.2176043152, + 29.2353797994, 0.844626970982, 0.703646627719, + }; + Image3F block_diff_dc(xsize_, ysize_); + for (size_t c = 0; c < 3; ++c) { + if (c < 2) { // No blue channel error accumulated at HF. + HWY_DYNAMIC_DISPATCH(L2DiffAsymmetric) + (pi0_.hf[c], pi1.hf[c], wmul[c] * hf_asymmetry_, wmul[c] / hf_asymmetry_, + &block_diff_ac, c); + } + HWY_DYNAMIC_DISPATCH(L2Diff) + (pi0_.mf.Plane(c), pi1.mf.Plane(c), wmul[3 + c], &block_diff_ac, c); + HWY_DYNAMIC_DISPATCH(SetL2Diff) + (pi0_.lf.Plane(c), pi1.lf.Plane(c), wmul[6 + c], &block_diff_dc, c); + } + + ImageF mask; + HWY_DYNAMIC_DISPATCH(MaskPsychoImage) + (pi0_, pi1, xsize_, ysize_, params_, Temp(), &blur_temp_, &mask, + &block_diff_ac.Plane(1)); + ReleaseTemp(); + + HWY_DYNAMIC_DISPATCH(CombineChannelsToDiffmap) + (mask, block_diff_dc, block_diff_ac, xmul_, &diffmap); +} + +double ButteraugliScoreFromDiffmap(const ImageF& diffmap, + const ButteraugliParams* params) { + PROFILER_FUNC; + float retval = 0.0f; + for (size_t y = 0; y < diffmap.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row = diffmap.ConstRow(y); + for (size_t x = 0; x < diffmap.xsize(); ++x) { + retval = std::max(retval, row[x]); + } + } + return retval; +} + +bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1, + double hf_asymmetry, double xmul, ImageF& diffmap) { + ButteraugliParams params; + params.hf_asymmetry = hf_asymmetry; + params.xmul = xmul; + return ButteraugliDiffmap(rgb0, rgb1, params, diffmap); +} + +bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1, + const ButteraugliParams& params, ImageF& diffmap) { + PROFILER_FUNC; + const size_t xsize = rgb0.xsize(); + const size_t ysize = rgb0.ysize(); + if (xsize < 1 || ysize < 1) { + return JXL_FAILURE("Zero-sized image"); + } + if (!SameSize(rgb0, rgb1)) { + return JXL_FAILURE("Size mismatch"); + } + static const int kMax = 8; + if (xsize < kMax || ysize < kMax) { + // Butteraugli values for small (where xsize or ysize is smaller + // than 8 pixels) images are non-sensical, but most likely it is + // less disruptive to try to compute something than just give up. + // Temporarily extend the borders of the image to fit 8 x 8 size. + size_t xborder = xsize < kMax ? (kMax - xsize) / 2 : 0; + size_t yborder = ysize < kMax ? (kMax - ysize) / 2 : 0; + size_t xscaled = std::max(kMax, xsize); + size_t yscaled = std::max(kMax, ysize); + Image3F scaled0(xscaled, yscaled); + Image3F scaled1(xscaled, yscaled); + for (int i = 0; i < 3; ++i) { + for (size_t y = 0; y < yscaled; ++y) { + for (size_t x = 0; x < xscaled; ++x) { + size_t x2 = + std::min(xsize - 1, x > xborder ? x - xborder : 0); + size_t y2 = + std::min(ysize - 1, y > yborder ? y - yborder : 0); + scaled0.PlaneRow(i, y)[x] = rgb0.PlaneRow(i, y2)[x2]; + scaled1.PlaneRow(i, y)[x] = rgb1.PlaneRow(i, y2)[x2]; + } + } + } + ImageF diffmap_scaled; + const bool ok = + ButteraugliDiffmap(scaled0, scaled1, params, diffmap_scaled); + diffmap = ImageF(xsize, ysize); + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + diffmap.Row(y)[x] = diffmap_scaled.Row(y + yborder)[x + xborder]; + } + } + return ok; + } + ButteraugliComparator butteraugli(rgb0, params); + butteraugli.Diffmap(rgb1, diffmap); + return true; +} + +bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1, + float hf_asymmetry, float xmul, ImageF& diffmap, + double& diffvalue) { + ButteraugliParams params; + params.hf_asymmetry = hf_asymmetry; + params.xmul = xmul; + return ButteraugliInterface(rgb0, rgb1, params, diffmap, diffvalue); +} + +bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1, + const ButteraugliParams& params, ImageF& diffmap, + double& diffvalue) { +#if JXL_PROFILER_ENABLED + auto trace_start = std::chrono::steady_clock::now(); +#endif + if (!ButteraugliDiffmap(rgb0, rgb1, params, diffmap)) { + return false; + } +#if JXL_PROFILER_ENABLED + auto trace_end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = trace_end - trace_start; + const size_t mp = rgb0.xsize() * rgb0.ysize(); + printf("diff MP/s %f\n", mp / elapsed.count() * 1E-6); +#endif + diffvalue = ButteraugliScoreFromDiffmap(diffmap, ¶ms); + return true; +} + +double ButteraugliFuzzyClass(double score) { + static const double fuzzy_width_up = 4.8; + static const double fuzzy_width_down = 4.8; + static const double m0 = 2.0; + static const double scaler = 0.7777; + double val; + if (score < 1.0) { + // val in [scaler .. 2.0] + val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_down)); + val -= 1.0; // from [1 .. 2] to [0 .. 1] + val *= 2.0 - scaler; // from [0 .. 1] to [0 .. 2.0 - scaler] + val += scaler; // from [0 .. 2.0 - scaler] to [scaler .. 2.0] + } else { + // val in [0 .. scaler] + val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_up)); + val *= scaler; + } + return val; +} + +// #define PRINT_OUT_NORMALIZATION + +double ButteraugliFuzzyInverse(double seek) { + double pos = 0; + // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter) + for (double range = 1.0; range >= 1e-10; range *= 0.5) { + double cur = ButteraugliFuzzyClass(pos); + if (cur < seek) { + pos -= range; + } else { + pos += range; + } + } +#ifdef PRINT_OUT_NORMALIZATION + if (seek == 1.0) { + fprintf(stderr, "Fuzzy inverse %g\n", pos); + } +#endif + return pos; +} + +#ifdef PRINT_OUT_NORMALIZATION +static double print_out_normalization = ButteraugliFuzzyInverse(1.0); +#endif + +namespace { + +void ScoreToRgb(double score, double good_threshold, double bad_threshold, + float rgb[3]) { + double heatmap[12][3] = { + {0, 0, 0}, {0, 0, 1}, + {0, 1, 1}, {0, 1, 0}, // Good level + {1, 1, 0}, {1, 0, 0}, // Bad level + {1, 0, 1}, {0.5, 0.5, 1.0}, + {1.0, 0.5, 0.5}, // Pastel colors for the very bad quality range. + {1.0, 1.0, 0.5}, {1, 1, 1}, + {1, 1, 1}, // Last color repeated to have a solid range of white. + }; + if (score < good_threshold) { + score = (score / good_threshold) * 0.3; + } else if (score < bad_threshold) { + score = 0.3 + + (score - good_threshold) / (bad_threshold - good_threshold) * 0.15; + } else { + score = 0.45 + (score - bad_threshold) / (bad_threshold * 12) * 0.5; + } + static const int kTableSize = sizeof(heatmap) / sizeof(heatmap[0]); + score = std::min(std::max(score * (kTableSize - 1), 0.0), + kTableSize - 2); + int ix = static_cast(score); + ix = std::min(std::max(0, ix), kTableSize - 2); // Handle NaN + double mix = score - ix; + for (int i = 0; i < 3; ++i) { + double v = mix * heatmap[ix + 1][i] + (1 - mix) * heatmap[ix][i]; + rgb[i] = pow(v, 0.5); + } +} + +} // namespace + +Image3F CreateHeatMapImage(const ImageF& distmap, double good_threshold, + double bad_threshold) { + Image3F heatmap(distmap.xsize(), distmap.ysize()); + for (size_t y = 0; y < distmap.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_distmap = distmap.ConstRow(y); + float* BUTTERAUGLI_RESTRICT row_h0 = heatmap.PlaneRow(0, y); + float* BUTTERAUGLI_RESTRICT row_h1 = heatmap.PlaneRow(1, y); + float* BUTTERAUGLI_RESTRICT row_h2 = heatmap.PlaneRow(2, y); + for (size_t x = 0; x < distmap.xsize(); ++x) { + const float d = row_distmap[x]; + float rgb[3]; + ScoreToRgb(d, good_threshold, bad_threshold, rgb); + row_h0[x] = rgb[0]; + row_h1[x] = rgb[1]; + row_h2[x] = rgb[2]; + } + } + return heatmap; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h new file mode 100644 index 0000000000..652b9528c4 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h @@ -0,0 +1,209 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com) + +#ifndef LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_ +#define LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +#define BUTTERAUGLI_ENABLE_CHECKS 0 +#define BUTTERAUGLI_RESTRICT JXL_RESTRICT + +// This is the main interface to butteraugli image similarity +// analysis function. + +namespace jxl { + +struct ButteraugliParams { + // Multiplier for penalizing new HF artifacts more than blurring away + // features. 1.0=neutral. + float hf_asymmetry = 1.0f; + + // Multiplier for the psychovisual difference in the X channel. + float xmul = 1.0f; + + // Number of nits that correspond to 1.0f input values. + float intensity_target = 80.0f; +}; + +// ButteraugliInterface defines the public interface for butteraugli. +// +// It calculates the difference between rgb0 and rgb1. +// +// rgb0 and rgb1 contain the images. rgb0[c][px] and rgb1[c][px] contains +// the red image for c == 0, green for c == 1, blue for c == 2. Location index +// px is calculated as y * xsize + x. +// +// Value of pixels of images rgb0 and rgb1 need to be represented as raw +// intensity. Most image formats store gamma corrected intensity in pixel +// values. This gamma correction has to be removed, by applying the following +// function to values in the 0-1 range: +// butteraugli_val = pow(input_val, gamma); +// A typical value of gamma is 2.2. It is usually stored in the image header. +// Take care not to confuse that value with its inverse. The gamma value should +// be always greater than one. +// Butteraugli does not work as intended if the caller does not perform +// gamma correction. +// +// hf_asymmetry is a multiplier for penalizing new HF artifacts more than +// blurring away features (1.0 -> neutral). +// +// diffmap will contain an image of the size xsize * ysize, containing +// localized differences for values px (indexed with the px the same as rgb0 +// and rgb1). diffvalue will give a global score of similarity. +// +// A diffvalue smaller than kButteraugliGood indicates that images can be +// observed as the same image. +// diffvalue larger than kButteraugliBad indicates that a difference between +// the images can be observed. +// A diffvalue between kButteraugliGood and kButteraugliBad indicates that +// a subtle difference can be observed between the images. +// +// Returns true on success. +bool ButteraugliInterface(const Image3F &rgb0, const Image3F &rgb1, + const ButteraugliParams ¶ms, ImageF &diffmap, + double &diffvalue); + +// Deprecated (calls the previous function) +bool ButteraugliInterface(const Image3F &rgb0, const Image3F &rgb1, + float hf_asymmetry, float xmul, ImageF &diffmap, + double &diffvalue); + +// Converts the butteraugli score into fuzzy class values that are continuous +// at the class boundary. The class boundary location is based on human +// raters, but the slope is arbitrary. Particularly, it does not reflect +// the expectation value of probabilities of the human raters. It is just +// expected that a smoother class boundary will allow for higher-level +// optimization algorithms to work faster. +// +// Returns 2.0 for a perfect match, and 1.0 for 'ok', 0.0 for bad. Because the +// scoring is fuzzy, a butteraugli score of 0.96 would return a class of +// around 1.9. +double ButteraugliFuzzyClass(double score); + +// Input values should be in range 0 (bad) to 2 (good). Use +// kButteraugliNormalization as normalization. +double ButteraugliFuzzyInverse(double seek); + +// Implementation details, don't use anything below or your code will +// break in the future. + +#ifdef _MSC_VER +#define BUTTERAUGLI_INLINE __forceinline +#else +#define BUTTERAUGLI_INLINE inline +#endif + +#ifdef __clang__ +// Early versions of Clang did not support __builtin_assume_aligned. +#define BUTTERAUGLI_HAS_ASSUME_ALIGNED __has_builtin(__builtin_assume_aligned) +#elif defined(__GNUC__) +#define BUTTERAUGLI_HAS_ASSUME_ALIGNED 1 +#else +#define BUTTERAUGLI_HAS_ASSUME_ALIGNED 0 +#endif + +// Returns a void* pointer which the compiler then assumes is N-byte aligned. +// Example: float* JXL_RESTRICT aligned = (float*)JXL_ASSUME_ALIGNED(in, 32); +// +// The assignment semantics are required by GCC/Clang. ICC provides an in-place +// __assume_aligned, whereas MSVC's __assume appears unsuitable. +#if BUTTERAUGLI_HAS_ASSUME_ALIGNED +#define BUTTERAUGLI_ASSUME_ALIGNED(ptr, align) \ + __builtin_assume_aligned((ptr), (align)) +#else +#define BUTTERAUGLI_ASSUME_ALIGNED(ptr, align) (ptr) +#endif // BUTTERAUGLI_HAS_ASSUME_ALIGNED + +struct PsychoImage { + ImageF uhf[2]; // XY + ImageF hf[2]; // XY + Image3F mf; // XYB + Image3F lf; // XYB +}; + +// Blur needs a transposed image. +// Hold it here and only allocate on demand to reduce memory usage. +struct BlurTemp { + ImageF *GetTransposed(const ImageF &in) { + if (transposed_temp.xsize() == 0) { + transposed_temp = ImageF(in.ysize(), in.xsize()); + } + return &transposed_temp; + } + + ImageF transposed_temp; +}; + +class ButteraugliComparator { + public: + // Butteraugli is calibrated at xmul = 1.0. We add a multiplier here so that + // we can test the hypothesis that a higher weighing of the X channel would + // improve results at higher Butteraugli values. + ButteraugliComparator(const Image3F &rgb0, const ButteraugliParams ¶ms); + virtual ~ButteraugliComparator() = default; + + // Computes the butteraugli map between the original image given in the + // constructor and the distorted image give here. + void Diffmap(const Image3F &rgb1, ImageF &result) const; + + // Same as above, but OpsinDynamicsImage() was already applied. + void DiffmapOpsinDynamicsImage(const Image3F &xyb1, ImageF &result) const; + + // Same as above, but the frequency decomposition was already applied. + void DiffmapPsychoImage(const PsychoImage &pi1, ImageF &diffmap) const; + + void Mask(ImageF *BUTTERAUGLI_RESTRICT mask) const; + + private: + Image3F *Temp() const; + void ReleaseTemp() const; + + const size_t xsize_; + const size_t ysize_; + ButteraugliParams params_; + PsychoImage pi0_; + + // Shared temporary image storage to reduce the number of allocations; + // obtained via Temp(), must call ReleaseTemp when no longer needed. + mutable Image3F temp_; + mutable std::atomic_flag temp_in_use_ = ATOMIC_FLAG_INIT; + + mutable BlurTemp blur_temp_; + std::unique_ptr sub_; +}; + +// Deprecated. +bool ButteraugliDiffmap(const Image3F &rgb0, const Image3F &rgb1, + double hf_asymmetry, double xmul, ImageF &diffmap); + +bool ButteraugliDiffmap(const Image3F &rgb0, const Image3F &rgb1, + const ButteraugliParams ¶ms, ImageF &diffmap); + +double ButteraugliScoreFromDiffmap(const ImageF &diffmap, + const ButteraugliParams *params = nullptr); + +// Generate rgb-representation of the distance between two images. +Image3F CreateHeatMapImage(const ImageF &distmap, double good_threshold, + double bad_threshold); + +} // namespace jxl + +#endif // LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_ diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli_test.cc b/third_party/jpeg-xl/lib/jxl/butteraugli_test.cc new file mode 100644 index 0000000000..3fdec09725 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/butteraugli_test.cc @@ -0,0 +1,103 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include "lib/jxl/test_image.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +TEST(ButteraugliTest, Lossless) { + uint32_t xsize = 171; + uint32_t ysize = 219; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr)); + JxlButteraugliResultPtr result(JxlButteraugliCompute( + api.get(), xsize, ysize, &pixel_format, pixels.data(), pixels.size(), + &pixel_format, pixels.data(), pixels.size())); + EXPECT_EQ(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0)); +} + +TEST(ButteraugliTest, Distmap) { + uint32_t xsize = 171; + uint32_t ysize = 219; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr)); + JxlButteraugliResultPtr result(JxlButteraugliCompute( + api.get(), xsize, ysize, &pixel_format, pixels.data(), pixels.size(), + &pixel_format, pixels.data(), pixels.size())); + EXPECT_EQ(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0)); + const float* distmap; + uint32_t row_stride; + JxlButteraugliResultGetDistmap(result.get(), &distmap, &row_stride); + for (uint32_t y = 0; y < ysize; y++) { + for (uint32_t x = 0; x < xsize; x++) { + EXPECT_EQ(0.0, distmap[y * row_stride + x]); + } + } +} + +TEST(ButteraugliTest, Distorted) { + uint32_t xsize = 171; + uint32_t ysize = 219; + std::vector orig_pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + std::vector dist_pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + dist_pixels[0] += 128; + + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr)); + JxlButteraugliResultPtr result(JxlButteraugliCompute( + api.get(), xsize, ysize, &pixel_format, orig_pixels.data(), + orig_pixels.size(), &pixel_format, dist_pixels.data(), + dist_pixels.size())); + EXPECT_NE(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0)); +} + +TEST(ButteraugliTest, Api) { + uint32_t xsize = 171; + uint32_t ysize = 219; + std::vector orig_pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + std::vector dist_pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + dist_pixels[0] += 128; + + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr)); + JxlButteraugliApiSetHFAsymmetry(api.get(), 1.0f); + JxlButteraugliApiSetIntensityTarget(api.get(), 250.0f); + JxlButteraugliResultPtr result(JxlButteraugliCompute( + api.get(), xsize, ysize, &pixel_format, orig_pixels.data(), + orig_pixels.size(), &pixel_format, dist_pixels.data(), + dist_pixels.size())); + double distance0 = JxlButteraugliResultGetDistance(result.get(), 8.0); + + JxlButteraugliApiSetHFAsymmetry(api.get(), 2.0f); + result.reset(JxlButteraugliCompute(api.get(), xsize, ysize, &pixel_format, + orig_pixels.data(), orig_pixels.size(), + &pixel_format, dist_pixels.data(), + dist_pixels.size())); + double distance1 = JxlButteraugliResultGetDistance(result.get(), 8.0); + + EXPECT_NE(distance0, distance1); + + JxlButteraugliApiSetIntensityTarget(api.get(), 80.0f); + result.reset(JxlButteraugliCompute(api.get(), xsize, ysize, &pixel_format, + orig_pixels.data(), orig_pixels.size(), + &pixel_format, dist_pixels.data(), + dist_pixels.size())); + double distance2 = JxlButteraugliResultGetDistance(result.get(), 8.0); + + EXPECT_NE(distance1, distance2); +} diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc b/third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc new file mode 100644 index 0000000000..c5a1a8e506 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc @@ -0,0 +1,203 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include +#include + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_butteraugli_pnorm.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/memory_manager_internal.h" + +namespace { + +void SetMetadataFromPixelFormat(const JxlPixelFormat* pixel_format, + jxl::ImageMetadata* metadata) { + uint32_t potential_alpha_bits = 0; + switch (pixel_format->data_type) { + case JXL_TYPE_FLOAT: + metadata->SetFloat32Samples(); + potential_alpha_bits = 16; + break; + case JXL_TYPE_FLOAT16: + metadata->SetFloat16Samples(); + potential_alpha_bits = 16; + break; + case JXL_TYPE_UINT16: + metadata->SetUintSamples(16); + potential_alpha_bits = 16; + break; + case JXL_TYPE_UINT8: + metadata->SetUintSamples(8); + potential_alpha_bits = 8; + break; + default: + JXL_ABORT("Unhandled JxlDataType"); + } + if (pixel_format->num_channels == 2 || pixel_format->num_channels == 4) { + metadata->SetAlphaBits(potential_alpha_bits); + } +} + +} // namespace + +struct JxlButteraugliResultStruct { + JxlMemoryManager memory_manager; + + jxl::ImageF distmap; + jxl::ButteraugliParams params; +}; + +struct JxlButteraugliApiStruct { + // Multiplier for penalizing new HF artifacts more than blurring away + // features. 1.0=neutral. + float hf_asymmetry = 1.0f; + + // Multiplier for the psychovisual difference in the X channel. + float xmul = 1.0f; + + // Number of nits that correspond to 1.0f input values. + float intensity_target = jxl::kDefaultIntensityTarget; + + JxlCmsInterface cms; + JxlMemoryManager memory_manager; + std::unique_ptr thread_pool{nullptr}; +}; + +JxlButteraugliApi* JxlButteraugliApiCreate( + const JxlMemoryManager* memory_manager) { + JxlMemoryManager local_memory_manager; + if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager)) + return nullptr; + + void* alloc = + jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlButteraugliApi)); + if (!alloc) return nullptr; + // Placement new constructor on allocated memory + JxlButteraugliApi* ret = new (alloc) JxlButteraugliApi(); + ret->cms = jxl::GetJxlCms(); + ret->memory_manager = local_memory_manager; + return ret; +} + +void JxlButteraugliApiSetParallelRunner(JxlButteraugliApi* api, + JxlParallelRunner parallel_runner, + void* parallel_runner_opaque) { + api->thread_pool = jxl::make_unique(parallel_runner, + parallel_runner_opaque); +} + +void JxlButteraugliApiSetHFAsymmetry(JxlButteraugliApi* api, float v) { + api->hf_asymmetry = v; +} + +void JxlButteraugliApiSetIntensityTarget(JxlButteraugliApi* api, float v) { + api->intensity_target = v; +} + +void JxlButteraugliApiDestroy(JxlButteraugliApi* api) { + if (api) { + JxlMemoryManager local_memory_manager = api->memory_manager; + // Call destructor directly since custom free function is used. + api->~JxlButteraugliApi(); + jxl::MemoryManagerFree(&local_memory_manager, api); + } +} + +JxlButteraugliResult* JxlButteraugliCompute( + const JxlButteraugliApi* api, uint32_t xsize, uint32_t ysize, + const JxlPixelFormat* pixel_format_orig, const void* buffer_orig, + size_t size_orig, const JxlPixelFormat* pixel_format_dist, + const void* buffer_dist, size_t size_dist) { + jxl::ImageMetadata orig_metadata; + SetMetadataFromPixelFormat(pixel_format_orig, &orig_metadata); + jxl::ImageBundle orig_ib(&orig_metadata); + jxl::ColorEncoding c_current; + if (pixel_format_orig->data_type == JXL_TYPE_FLOAT) { + c_current = + jxl::ColorEncoding::LinearSRGB(pixel_format_orig->num_channels < 3); + } else { + c_current = jxl::ColorEncoding::SRGB(pixel_format_orig->num_channels < 3); + } + if (!jxl::BufferToImageBundle(*pixel_format_orig, xsize, ysize, buffer_orig, + size_orig, api->thread_pool.get(), c_current, + &orig_ib)) { + return nullptr; + } + + jxl::ImageMetadata dist_metadata; + SetMetadataFromPixelFormat(pixel_format_dist, &dist_metadata); + jxl::ImageBundle dist_ib(&dist_metadata); + if (pixel_format_dist->data_type == JXL_TYPE_FLOAT) { + c_current = + jxl::ColorEncoding::LinearSRGB(pixel_format_dist->num_channels < 3); + } else { + c_current = jxl::ColorEncoding::SRGB(pixel_format_dist->num_channels < 3); + } + if (!jxl::BufferToImageBundle(*pixel_format_dist, xsize, ysize, buffer_dist, + size_dist, api->thread_pool.get(), c_current, + &dist_ib)) { + return nullptr; + } + + void* alloc = jxl::MemoryManagerAlloc(&api->memory_manager, + sizeof(JxlButteraugliResult)); + if (!alloc) return nullptr; + // Placement new constructor on allocated memory + JxlButteraugliResult* result = new (alloc) JxlButteraugliResult(); + result->memory_manager = api->memory_manager; + result->params.hf_asymmetry = api->hf_asymmetry; + result->params.xmul = api->xmul; + result->params.intensity_target = api->intensity_target; + jxl::ButteraugliDistance(orig_ib, dist_ib, result->params, api->cms, + &result->distmap, api->thread_pool.get()); + + return result; +} + +float JxlButteraugliResultGetDistance(const JxlButteraugliResult* result, + float pnorm) { + return static_cast( + jxl::ComputeDistanceP(result->distmap, result->params, pnorm)); +} + +void JxlButteraugliResultGetDistmap(const JxlButteraugliResult* result, + const float** buffer, + uint32_t* row_stride) { + *buffer = result->distmap.Row(0); + *row_stride = result->distmap.PixelsPerRow(); +} + +float JxlButteraugliResultGetMaxDistance(const JxlButteraugliResult* result) { + float max_distance = 0.0; + for (uint32_t y = 0; y < result->distmap.ysize(); y++) { + for (uint32_t x = 0; x < result->distmap.xsize(); x++) { + if (result->distmap.ConstRow(y)[x] > max_distance) { + max_distance = result->distmap.ConstRow(y)[x]; + } + } + } + return max_distance; +} + +void JxlButteraugliResultDestroy(JxlButteraugliResult* result) { + if (result) { + JxlMemoryManager local_memory_manager = result->memory_manager; + // Call destructor directly since custom free function is used. + result->~JxlButteraugliResult(); + jxl::MemoryManagerFree(&local_memory_manager, result); + } +} diff --git a/third_party/jpeg-xl/lib/jxl/byte_order_test.cc b/third_party/jpeg-xl/lib/jxl/byte_order_test.cc new file mode 100644 index 0000000000..17d7ef6643 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/byte_order_test.cc @@ -0,0 +1,53 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/byte_order.h" + +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +TEST(ByteOrderTest, TestRoundTripBE16) { + const uint32_t in = 0x1234; + uint8_t buf[2]; + StoreBE16(in, buf); + EXPECT_EQ(in, LoadBE16(buf)); + EXPECT_NE(in, LoadLE16(buf)); +} + +TEST(ByteOrderTest, TestRoundTripLE16) { + const uint32_t in = 0x1234; + uint8_t buf[2]; + StoreLE16(in, buf); + EXPECT_EQ(in, LoadLE16(buf)); + EXPECT_NE(in, LoadBE16(buf)); +} + +TEST(ByteOrderTest, TestRoundTripBE32) { + const uint32_t in = 0xFEDCBA98u; + uint8_t buf[4]; + StoreBE32(in, buf); + EXPECT_EQ(in, LoadBE32(buf)); + EXPECT_NE(in, LoadLE32(buf)); +} + +TEST(ByteOrderTest, TestRoundTripLE32) { + const uint32_t in = 0xFEDCBA98u; + uint8_t buf[4]; + StoreLE32(in, buf); + EXPECT_EQ(in, LoadLE32(buf)); + EXPECT_NE(in, LoadBE32(buf)); +} + +TEST(ByteOrderTest, TestRoundTripLE64) { + const uint64_t in = 0xFEDCBA9876543210ull; + uint8_t buf[8]; + StoreLE64(in, buf); + EXPECT_EQ(in, LoadLE64(buf)); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc new file mode 100644 index 0000000000..63d21cbb4b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/chroma_from_luma.h" + +namespace jxl { + +ColorCorrelationMap::ColorCorrelationMap(size_t xsize, size_t ysize, bool XYB) + : ytox_map(DivCeil(xsize, kColorTileDim), DivCeil(ysize, kColorTileDim)), + ytob_map(DivCeil(xsize, kColorTileDim), DivCeil(ysize, kColorTileDim)) { + ZeroFillImage(&ytox_map); + ZeroFillImage(&ytob_map); + if (!XYB) { + base_correlation_b_ = 0; + } + RecomputeDCFactors(); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/chroma_from_luma.h b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.h new file mode 100644 index 0000000000..9a7f3d45bc --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.h @@ -0,0 +1,147 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_CHROMA_FROM_LUMA_H_ +#define LIB_JXL_CHROMA_FROM_LUMA_H_ + +// Chroma-from-luma, computed using heuristics to determine the best linear +// model for the X and B channels from the Y channel. + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quant_weights.h" + +namespace jxl { + +// Tile is the rectangular grid of blocks that share color correlation +// parameters ("factor_x/b" such that residual_b = blue - Y * factor_b). +static constexpr size_t kColorTileDim = 64; + +static_assert(kColorTileDim % kBlockDim == 0, + "Color tile dim should be divisible by block dim"); +static constexpr size_t kColorTileDimInBlocks = kColorTileDim / kBlockDim; + +static_assert(kGroupDimInBlocks % kColorTileDimInBlocks == 0, + "Group dim should be divisible by color tile dim"); + +static constexpr uint8_t kDefaultColorFactor = 84; + +// JPEG DCT coefficients are at most 1024. CfL constants are at most 127, and +// the ratio of two entries in a JPEG quantization table is at most 255. Thus, +// since the CfL denominator is 84, this leaves 12 bits of mantissa to be used. +// For extra caution, we use 11. +static constexpr uint8_t kCFLFixedPointPrecision = 11; + +static constexpr U32Enc kColorFactorDist(Val(kDefaultColorFactor), Val(256), + BitsOffset(8, 2), BitsOffset(16, 258)); + +struct ColorCorrelationMap { + ColorCorrelationMap() = default; + // xsize/ysize are in pixels + // set XYB=false to do something close to no-op cmap (needed for now since + // cmap is mandatory) + ColorCorrelationMap(size_t xsize, size_t ysize, bool XYB = true); + + float YtoXRatio(int32_t x_factor) const { + return base_correlation_x_ + x_factor * color_scale_; + } + + float YtoBRatio(int32_t b_factor) const { + return base_correlation_b_ + b_factor * color_scale_; + } + + Status DecodeDC(BitReader* br) { + if (br->ReadFixedBits<1>() == 1) { + // All default. + return true; + } + SetColorFactor(U32Coder::Read(kColorFactorDist, br)); + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &base_correlation_x_)); + if (std::abs(base_correlation_x_) > 4.0f) { + return JXL_FAILURE("Base X correlation is out of range"); + } + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &base_correlation_b_)); + if (std::abs(base_correlation_b_) > 4.0f) { + return JXL_FAILURE("Base B correlation is out of range"); + } + ytox_dc_ = static_cast(br->ReadFixedBits()) + + std::numeric_limits::min(); + ytob_dc_ = static_cast(br->ReadFixedBits()) + + std::numeric_limits::min(); + RecomputeDCFactors(); + return true; + } + + // We consider a CfL map to be JPEG-reconstruction-compatible if base + // correlation is 0, no DC correlation is used, and we use the default color + // factor. + bool IsJPEGCompatible() const { + return base_correlation_x_ == 0 && base_correlation_b_ == 0 && + ytob_dc_ == 0 && ytox_dc_ == 0 && + color_factor_ == kDefaultColorFactor; + } + + int32_t RatioJPEG(int32_t factor) const { + return factor * (1 << kCFLFixedPointPrecision) / kDefaultColorFactor; + } + + void SetColorFactor(uint32_t factor) { + color_factor_ = factor; + color_scale_ = 1.0f / color_factor_; + RecomputeDCFactors(); + } + + void SetYToBDC(int32_t ytob_dc) { + ytob_dc_ = ytob_dc; + RecomputeDCFactors(); + } + void SetYToXDC(int32_t ytox_dc) { + ytox_dc_ = ytox_dc; + RecomputeDCFactors(); + } + + int32_t GetYToXDC() const { return ytox_dc_; } + int32_t GetYToBDC() const { return ytob_dc_; } + float GetColorFactor() const { return color_factor_; } + float GetBaseCorrelationX() const { return base_correlation_x_; } + float GetBaseCorrelationB() const { return base_correlation_b_; } + + const float* DCFactors() const { return dc_factors_; } + + void RecomputeDCFactors() { + dc_factors_[0] = YtoXRatio(ytox_dc_); + dc_factors_[2] = YtoBRatio(ytob_dc_); + } + + ImageSB ytox_map; + ImageSB ytob_map; + + private: + float dc_factors_[4] = {}; + // range of factor: -1.51 to +1.52 + uint32_t color_factor_ = kDefaultColorFactor; + float color_scale_ = 1.0f / color_factor_; + float base_correlation_x_ = 0.0f; + float base_correlation_b_ = kYToBRatio; + int32_t ytox_dc_ = 0; + int32_t ytob_dc_ = 0; +}; + +} // namespace jxl + +#endif // LIB_JXL_CHROMA_FROM_LUMA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/codec_in_out.h b/third_party/jpeg-xl/lib/jxl/codec_in_out.h new file mode 100644 index 0000000000..9e48b5e937 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/codec_in_out.h @@ -0,0 +1,116 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_CODEC_IN_OUT_H_ +#define LIB_JXL_CODEC_IN_OUT_H_ + +// Holds inputs/outputs for decoding/encoding images. + +#include +#include + +#include +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/luminance.h" + +namespace jxl { + +// Optional text/EXIF metadata. +struct Blobs { + std::vector exif; + std::vector iptc; + std::vector jumbf; + std::vector xmp; +}; + +// Holds a preview, a main image or one or more frames, plus the inputs/outputs +// to/from decoding/encoding. +class CodecInOut { + public: + CodecInOut() : preview_frame(&metadata.m) { + frames.reserve(1); + frames.emplace_back(&metadata.m); + } + + // Move-only. + CodecInOut(CodecInOut&&) = default; + CodecInOut& operator=(CodecInOut&&) = default; + + size_t LastStillFrame() const { + JXL_DASSERT(!frames.empty()); + size_t last = 0; + for (size_t i = 0; i < frames.size(); i++) { + last = i; + if (frames[i].duration > 0) break; + } + return last; + } + + ImageBundle& Main() { return frames[LastStillFrame()]; } + const ImageBundle& Main() const { return frames[LastStillFrame()]; } + + // If c_current.IsGray(), all planes must be identical. + void SetFromImage(Image3F&& color, const ColorEncoding& c_current) { + Main().SetFromImage(std::move(color), c_current); + SetIntensityTarget(&this->metadata.m); + SetSize(Main().xsize(), Main().ysize()); + } + + void SetSize(size_t xsize, size_t ysize) { + JXL_CHECK(metadata.size.Set(xsize, ysize)); + } + + void CheckMetadata() const { + JXL_CHECK(metadata.m.bit_depth.bits_per_sample != 0); + JXL_CHECK(!metadata.m.color_encoding.ICC().empty()); + + if (preview_frame.xsize() != 0) preview_frame.VerifyMetadata(); + JXL_CHECK(preview_frame.metadata() == &metadata.m); + + for (const ImageBundle& ib : frames) { + ib.VerifyMetadata(); + JXL_CHECK(ib.metadata() == &metadata.m); + } + } + + size_t xsize() const { return metadata.size.xsize(); } + size_t ysize() const { return metadata.size.ysize(); } + void ShrinkTo(size_t xsize, size_t ysize) { + // preview is unaffected. + for (ImageBundle& ib : frames) { + ib.ShrinkTo(xsize, ysize); + } + SetSize(xsize, ysize); + } + + // -- DECODER OUTPUT, ENCODER INPUT: + + // Metadata stored into / retrieved from bitstreams. + + Blobs blobs; + + CodecMetadata metadata; // applies to preview and all frames + + // If metadata.have_preview: + ImageBundle preview_frame; + + std::vector frames; // size=1 if !metadata.have_animation + + // If the image should be written to a JPEG, use this quality for encoding. + size_t jpeg_quality; +}; + +} // namespace jxl + +#endif // LIB_JXL_CODEC_IN_OUT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order.cc b/third_party/jpeg-xl/lib/jxl/coeff_order.cc new file mode 100644 index 0000000000..43adafd82a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/coeff_order.cc @@ -0,0 +1,153 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/coeff_order.h" + +#include + +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/lehmer_code.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +uint32_t CoeffOrderContext(uint32_t val) { + uint32_t token, nbits, bits; + HybridUintConfig(0, 0, 0).Encode(val, &token, &nbits, &bits); + return std::min(token, kPermutationContexts - 1); +} + +namespace { +Status ReadPermutation(size_t skip, size_t size, coeff_order_t* order, + BitReader* br, ANSSymbolReader* reader, + const std::vector& context_map) { + std::vector lehmer(size); + // temp space needs to be as large as the next power of 2, so doubling the + // allocated size is enough. + std::vector temp(size * 2); + uint32_t end = + reader->ReadHybridUint(CoeffOrderContext(size), br, context_map) + skip; + if (end > size) { + return JXL_FAILURE("Invalid permutation size"); + } + uint32_t last = 0; + for (size_t i = skip; i < end; ++i) { + lehmer[i] = + reader->ReadHybridUint(CoeffOrderContext(last), br, context_map); + last = lehmer[i]; + if (lehmer[i] + i >= size) { + return JXL_FAILURE("Invalid lehmer code"); + } + } + if (order == nullptr) return true; + DecodeLehmerCode(lehmer.data(), temp.data(), size, order); + return true; +} + +} // namespace + +Status DecodePermutation(size_t skip, size_t size, coeff_order_t* order, + BitReader* br) { + std::vector context_map; + ANSCode code; + JXL_RETURN_IF_ERROR( + DecodeHistograms(br, kPermutationContexts, &code, &context_map)); + ANSSymbolReader reader(&code, br); + JXL_RETURN_IF_ERROR( + ReadPermutation(skip, size, order, br, &reader, context_map)); + if (!reader.CheckANSFinalState()) { + return JXL_FAILURE("Invalid ANS stream"); + } + return true; +} + +namespace { + +Status DecodeCoeffOrder(AcStrategy acs, coeff_order_t* order, BitReader* br, + ANSSymbolReader* reader, + std::vector& natural_order, + const std::vector& context_map) { + PROFILER_FUNC; + const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y(); + const size_t size = kDCTBlockSize * llf; + + JXL_RETURN_IF_ERROR( + ReadPermutation(llf, size, order, br, reader, context_map)); + if (order == nullptr) return true; + for (size_t k = 0; k < size; ++k) { + order[k] = natural_order[order[k]]; + } + return true; +} + +} // namespace + +Status DecodeCoeffOrders(uint16_t used_orders, uint32_t used_acs, + coeff_order_t* order, BitReader* br) { + uint16_t computed = 0; + std::vector context_map; + ANSCode code; + std::unique_ptr reader; + std::vector natural_order; + // Bitstream does not have histograms if no coefficient order is used. + if (used_orders != 0) { + JXL_RETURN_IF_ERROR( + DecodeHistograms(br, kPermutationContexts, &code, &context_map)); + reader = make_unique(&code, br); + } + uint32_t acs_mask = 0; + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + if ((used_acs & (1 << o)) == 0) continue; + acs_mask |= 1 << kStrategyOrder[o]; + } + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + uint8_t ord = kStrategyOrder[o]; + if (computed & (1 << ord)) continue; + computed |= 1 << ord; + AcStrategy acs = AcStrategy::FromRawStrategy(o); + bool used = (acs_mask & (1 << ord)) != 0; + + const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y(); + const size_t size = kDCTBlockSize * llf; + + if (used || (used_orders & (1 << ord))) { + if (natural_order.size() < size) natural_order.resize(size); + acs.ComputeNaturalCoeffOrder(natural_order.data()); + } + + if ((used_orders & (1 << ord)) == 0) { + // No need to set the default order if no ACS uses this order. + if (used) { + for (size_t c = 0; c < 3; c++) { + memcpy(&order[CoeffOrderOffset(ord, c)], natural_order.data(), + size * sizeof(*order)); + } + } + } else { + for (size_t c = 0; c < 3; c++) { + coeff_order_t* dest = used ? &order[CoeffOrderOffset(ord, c)] : nullptr; + JXL_RETURN_IF_ERROR(DecodeCoeffOrder(acs, dest, br, reader.get(), + natural_order, context_map)); + } + } + } + if (used_orders && !reader->CheckANSFinalState()) { + return JXL_FAILURE("Invalid ANS stream"); + } + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order.h b/third_party/jpeg-xl/lib/jxl/coeff_order.h new file mode 100644 index 0000000000..fb32499f2f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/coeff_order.h @@ -0,0 +1,64 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COEFF_ORDER_H_ +#define LIB_JXL_COEFF_ORDER_H_ + +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" + +namespace jxl { + +class BitReader; + +// Those offsets get multiplied by kDCTBlockSize. +static constexpr size_t kCoeffOrderOffset[] = { + 0, 1, 2, 3, 4, 5, 6, 10, 14, 18, + 34, 50, 66, 68, 70, 72, 76, 80, 84, 92, + 100, 108, 172, 236, 300, 332, 364, 396, 652, 908, + 1164, 1292, 1420, 1548, 2572, 3596, 4620, 5132, 5644, 6156, +}; +static_assert(3 * kNumOrders + 1 == + sizeof(kCoeffOrderOffset) / sizeof(*kCoeffOrderOffset), + "Update this array when adding or removing order types."); + +static constexpr size_t CoeffOrderOffset(size_t order, size_t c) { + return kCoeffOrderOffset[3 * order + c] * kDCTBlockSize; +} + +static constexpr size_t kCoeffOrderMaxSize = + kCoeffOrderOffset[3 * kNumOrders] * kDCTBlockSize; + +// Mapping from AC strategy to order bucket. Strategies with different natural +// orders must have different buckets. +constexpr uint8_t kStrategyOrder[] = { + 0, 1, 1, 1, 2, 3, 4, 4, 5, 5, 6, 6, 1, 1, + 1, 1, 1, 1, 7, 8, 8, 9, 10, 10, 11, 12, 12, +}; + +static_assert(AcStrategy::kNumValidStrategies == + sizeof(kStrategyOrder) / sizeof(*kStrategyOrder), + "Update this array when adding or removing AC strategies."); + +constexpr uint32_t kPermutationContexts = 8; + +uint32_t CoeffOrderContext(uint32_t val); + +Status DecodeCoeffOrders(uint16_t used_orders, uint32_t used_acs, + coeff_order_t* order, BitReader* br); + +Status DecodePermutation(size_t skip, size_t size, coeff_order_t* order, + BitReader* br); + +} // namespace jxl + +#endif // LIB_JXL_COEFF_ORDER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h b/third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h new file mode 100644 index 0000000000..26306575c1 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h @@ -0,0 +1,47 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COEFF_ORDER_FWD_H_ +#define LIB_JXL_COEFF_ORDER_FWD_H_ + +// Breaks circular dependency between ac_strategy and coeff_order. + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +// Needs at least 16 bits. A 32-bit type speeds up DecodeAC by 2% at the cost of +// more memory. +using coeff_order_t = uint32_t; + +// Maximum number of orders to be used. Note that this needs to be multiplied by +// the number of channels. One per "size class" (plus one extra for DCT8), +// shared between transforms of size XxY and of size YxX. +constexpr uint8_t kNumOrders = 13; + +// DCT coefficients are laid out in such a way that the number of rows of +// coefficients is always the smaller coordinate. +JXL_INLINE constexpr size_t CoefficientRows(size_t rows, size_t columns) { + return rows < columns ? rows : columns; +} + +JXL_INLINE constexpr size_t CoefficientColumns(size_t rows, size_t columns) { + return rows < columns ? columns : rows; +} + +JXL_INLINE void CoefficientLayout(size_t* JXL_RESTRICT rows, + size_t* JXL_RESTRICT columns) { + size_t r = *rows; + size_t c = *columns; + *rows = CoefficientRows(r, c); + *columns = CoefficientColumns(r, c); +} + +} // namespace jxl + +#endif // LIB_JXL_COEFF_ORDER_FWD_H_ diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order_test.cc b/third_party/jpeg-xl/lib/jxl/coeff_order_test.cc new file mode 100644 index 0000000000..6fa0775697 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/coeff_order_test.cc @@ -0,0 +1,97 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/coeff_order.h" + +#include + +#include +#include // iota +#include +#include + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/random.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +void RoundtripPermutation(coeff_order_t* perm, coeff_order_t* out, size_t len, + size_t* size) { + BitWriter writer; + EncodePermutation(perm, 0, len, &writer, 0, nullptr); + writer.ZeroPadToByte(); + Status status = true; + { + BitReader reader(writer.GetSpan()); + BitReaderScopedCloser closer(&reader, &status); + ASSERT_TRUE(DecodePermutation(0, len, out, &reader)); + } + ASSERT_TRUE(status); + *size = writer.GetSpan().size(); +} + +enum Permutation { kIdentity, kFewSwaps, kFewSlides, kRandom }; + +constexpr size_t kSwaps = 32; + +void TestPermutation(Permutation kind, size_t len) { + std::vector perm(len); + std::iota(perm.begin(), perm.end(), 0); + Rng rng(0); + if (kind == kFewSwaps) { + for (size_t i = 0; i < kSwaps; i++) { + size_t a = rng.UniformU(0, len - 1); + size_t b = rng.UniformU(0, len - 1); + std::swap(perm[a], perm[b]); + } + } + if (kind == kFewSlides) { + for (size_t i = 0; i < kSwaps; i++) { + size_t a = rng.UniformU(0, len - 1); + size_t b = rng.UniformU(0, len - 1); + size_t from = std::min(a, b); + size_t to = std::max(a, b); + size_t start = perm[from]; + for (size_t j = from; j < to; j++) { + perm[j] = perm[j + 1]; + } + perm[to] = start; + } + } + if (kind == kRandom) { + rng.Shuffle(perm.data(), perm.size()); + } + std::vector out(len); + size_t size = 0; + RoundtripPermutation(perm.data(), out.data(), len, &size); + for (size_t idx = 0; idx < len; idx++) { + EXPECT_EQ(perm[idx], out[idx]); + } + printf("Encoded size: %" PRIuS "\n", size); +} + +TEST(CoeffOrderTest, IdentitySmall) { TestPermutation(kIdentity, 256); } +TEST(CoeffOrderTest, FewSlidesSmall) { TestPermutation(kFewSlides, 256); } +TEST(CoeffOrderTest, FewSwapsSmall) { TestPermutation(kFewSwaps, 256); } +TEST(CoeffOrderTest, RandomSmall) { TestPermutation(kRandom, 256); } + +TEST(CoeffOrderTest, IdentityMedium) { TestPermutation(kIdentity, 1 << 12); } +TEST(CoeffOrderTest, FewSlidesMedium) { TestPermutation(kFewSlides, 1 << 12); } +TEST(CoeffOrderTest, FewSwapsMedium) { TestPermutation(kFewSwaps, 1 << 12); } +TEST(CoeffOrderTest, RandomMedium) { TestPermutation(kRandom, 1 << 12); } + +TEST(CoeffOrderTest, IdentityBig) { TestPermutation(kIdentity, 1 << 16); } +TEST(CoeffOrderTest, FewSlidesBig) { TestPermutation(kFewSlides, 1 << 16); } +TEST(CoeffOrderTest, FewSwapsBig) { TestPermutation(kFewSwaps, 1 << 16); } +TEST(CoeffOrderTest, RandomBig) { TestPermutation(kRandom, 1 << 16); } + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc new file mode 100644 index 0000000000..e496accfed --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc @@ -0,0 +1,753 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/color_encoding_internal.h" + +#include + +#include +#include + +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/matrix_ops.h" + +namespace jxl { +namespace { + +// Highest reasonable value for the gamma of a transfer curve. +constexpr uint32_t kMaxGamma = 8192; + +// These strings are baked into Description - do not change. + +std::string ToString(ColorSpace color_space) { + switch (color_space) { + case ColorSpace::kRGB: + return "RGB"; + case ColorSpace::kGray: + return "Gra"; + case ColorSpace::kXYB: + return "XYB"; + case ColorSpace::kUnknown: + return "CS?"; + } + // Should not happen - visitor fails if enum is invalid. + JXL_ABORT("Invalid ColorSpace %u", static_cast(color_space)); +} + +std::string ToString(WhitePoint white_point) { + switch (white_point) { + case WhitePoint::kD65: + return "D65"; + case WhitePoint::kCustom: + return "Cst"; + case WhitePoint::kE: + return "EER"; + case WhitePoint::kDCI: + return "DCI"; + } + // Should not happen - visitor fails if enum is invalid. + JXL_ABORT("Invalid WhitePoint %u", static_cast(white_point)); +} + +std::string ToString(Primaries primaries) { + switch (primaries) { + case Primaries::kSRGB: + return "SRG"; + case Primaries::k2100: + return "202"; + case Primaries::kP3: + return "DCI"; + case Primaries::kCustom: + return "Cst"; + } + // Should not happen - visitor fails if enum is invalid. + JXL_ABORT("Invalid Primaries %u", static_cast(primaries)); +} + +std::string ToString(TransferFunction transfer_function) { + switch (transfer_function) { + case TransferFunction::kSRGB: + return "SRG"; + case TransferFunction::kLinear: + return "Lin"; + case TransferFunction::k709: + return "709"; + case TransferFunction::kPQ: + return "PeQ"; + case TransferFunction::kHLG: + return "HLG"; + case TransferFunction::kDCI: + return "DCI"; + case TransferFunction::kUnknown: + return "TF?"; + } + // Should not happen - visitor fails if enum is invalid. + JXL_ABORT("Invalid TransferFunction %u", + static_cast(transfer_function)); +} + +std::string ToString(RenderingIntent rendering_intent) { + switch (rendering_intent) { + case RenderingIntent::kPerceptual: + return "Per"; + case RenderingIntent::kRelative: + return "Rel"; + case RenderingIntent::kSaturation: + return "Sat"; + case RenderingIntent::kAbsolute: + return "Abs"; + } + // Should not happen - visitor fails if enum is invalid. + JXL_ABORT("Invalid RenderingIntent %u", + static_cast(rendering_intent)); +} + +static double F64FromCustomxyI32(const int32_t i) { return i * 1E-6; } +static Status F64ToCustomxyI32(const double f, int32_t* JXL_RESTRICT i) { + if (!(-4 <= f && f <= 4)) { + return JXL_FAILURE("F64 out of bounds for CustomxyI32"); + } + *i = static_cast(roundf(f * 1E6)); + return true; +} + +Status ConvertExternalToInternalWhitePoint(const JxlWhitePoint external, + WhitePoint* internal) { + switch (external) { + case JXL_WHITE_POINT_D65: + *internal = WhitePoint::kD65; + return true; + case JXL_WHITE_POINT_CUSTOM: + *internal = WhitePoint::kCustom; + return true; + case JXL_WHITE_POINT_E: + *internal = WhitePoint::kE; + return true; + case JXL_WHITE_POINT_DCI: + *internal = WhitePoint::kDCI; + return true; + } + return JXL_FAILURE("Invalid WhitePoint enum value"); +} + +Status ConvertExternalToInternalPrimaries(const JxlPrimaries external, + Primaries* internal) { + switch (external) { + case JXL_PRIMARIES_SRGB: + *internal = Primaries::kSRGB; + return true; + case JXL_PRIMARIES_CUSTOM: + *internal = Primaries::kCustom; + return true; + case JXL_PRIMARIES_2100: + *internal = Primaries::k2100; + return true; + case JXL_PRIMARIES_P3: + *internal = Primaries::kP3; + return true; + } + return JXL_FAILURE("Invalid Primaries enum value"); +} + +Status ConvertExternalToInternalTransferFunction( + const JxlTransferFunction external, TransferFunction* internal) { + switch (external) { + case JXL_TRANSFER_FUNCTION_709: + *internal = TransferFunction::k709; + return true; + case JXL_TRANSFER_FUNCTION_UNKNOWN: + *internal = TransferFunction::kUnknown; + return true; + case JXL_TRANSFER_FUNCTION_LINEAR: + *internal = TransferFunction::kLinear; + return true; + case JXL_TRANSFER_FUNCTION_SRGB: + *internal = TransferFunction::kSRGB; + return true; + case JXL_TRANSFER_FUNCTION_PQ: + *internal = TransferFunction::kPQ; + return true; + case JXL_TRANSFER_FUNCTION_DCI: + *internal = TransferFunction::kDCI; + return true; + case JXL_TRANSFER_FUNCTION_HLG: + *internal = TransferFunction::kHLG; + return true; + case JXL_TRANSFER_FUNCTION_GAMMA: + return JXL_FAILURE("Gamma should be handled separately"); + } + return JXL_FAILURE("Invalid TransferFunction enum value"); +} + +Status ConvertExternalToInternalRenderingIntent( + const JxlRenderingIntent external, RenderingIntent* internal) { + switch (external) { + case JXL_RENDERING_INTENT_PERCEPTUAL: + *internal = RenderingIntent::kPerceptual; + return true; + case JXL_RENDERING_INTENT_RELATIVE: + *internal = RenderingIntent::kRelative; + return true; + case JXL_RENDERING_INTENT_SATURATION: + *internal = RenderingIntent::kSaturation; + return true; + case JXL_RENDERING_INTENT_ABSOLUTE: + *internal = RenderingIntent::kAbsolute; + return true; + } + return JXL_FAILURE("Invalid RenderingIntent enum value"); +} + +} // namespace + +CIExy Customxy::Get() const { + CIExy xy; + xy.x = F64FromCustomxyI32(x); + xy.y = F64FromCustomxyI32(y); + return xy; +} + +Status Customxy::Set(const CIExy& xy) { + JXL_RETURN_IF_ERROR(F64ToCustomxyI32(xy.x, &x)); + JXL_RETURN_IF_ERROR(F64ToCustomxyI32(xy.y, &y)); + size_t extension_bits, total_bits; + if (!Bundle::CanEncode(*this, &extension_bits, &total_bits)) { + return JXL_FAILURE("Unable to encode XY %f %f", xy.x, xy.y); + } + return true; +} + +bool CustomTransferFunction::SetImplicit() { + if (nonserialized_color_space == ColorSpace::kXYB) { + if (!SetGamma(1.0 / 3)) JXL_ASSERT(false); + return true; + } + return false; +} + +Status CustomTransferFunction::SetGamma(double gamma) { + if (gamma < (1.0f / kMaxGamma) || gamma > 1.0) { + return JXL_FAILURE("Invalid gamma %f", gamma); + } + + have_gamma_ = false; + if (ApproxEq(gamma, 1.0)) { + transfer_function_ = TransferFunction::kLinear; + return true; + } + if (ApproxEq(gamma, 1.0 / 2.6)) { + transfer_function_ = TransferFunction::kDCI; + return true; + } + // Don't translate 0.45.. to kSRGB nor k709 - that might change pixel + // values because those curves also have a linear part. + + have_gamma_ = true; + gamma_ = roundf(gamma * kGammaMul); + transfer_function_ = TransferFunction::kUnknown; + return true; +} + +namespace { + +std::array CreateC2(const Primaries pr, + const TransferFunction tf) { + std::array c2; + + { + ColorEncoding* c_rgb = c2.data() + 0; + c_rgb->SetColorSpace(ColorSpace::kRGB); + c_rgb->white_point = WhitePoint::kD65; + c_rgb->primaries = pr; + c_rgb->tf.SetTransferFunction(tf); + JXL_CHECK(c_rgb->CreateICC()); + } + + { + ColorEncoding* c_gray = c2.data() + 1; + c_gray->SetColorSpace(ColorSpace::kGray); + c_gray->white_point = WhitePoint::kD65; + c_gray->primaries = pr; + c_gray->tf.SetTransferFunction(tf); + JXL_CHECK(c_gray->CreateICC()); + } + + return c2; +} + +} // namespace + +const ColorEncoding& ColorEncoding::SRGB(bool is_gray) { + static std::array c2 = + CreateC2(Primaries::kSRGB, TransferFunction::kSRGB); + return c2[is_gray]; +} +const ColorEncoding& ColorEncoding::LinearSRGB(bool is_gray) { + static std::array c2 = + CreateC2(Primaries::kSRGB, TransferFunction::kLinear); + return c2[is_gray]; +} + +CIExy ColorEncoding::GetWhitePoint() const { + JXL_DASSERT(have_fields_); + CIExy xy; + switch (white_point) { + case WhitePoint::kCustom: + return white_.Get(); + + case WhitePoint::kD65: + xy.x = 0.3127; + xy.y = 0.3290; + return xy; + + case WhitePoint::kDCI: + // From https://ieeexplore.ieee.org/document/7290729 C.2 page 11 + xy.x = 0.314; + xy.y = 0.351; + return xy; + + case WhitePoint::kE: + xy.x = xy.y = 1.0 / 3; + return xy; + } + JXL_ABORT("Invalid WhitePoint %u", static_cast(white_point)); +} + +Status ColorEncoding::SetWhitePoint(const CIExy& xy) { + JXL_DASSERT(have_fields_); + if (xy.x == 0.0 || xy.y == 0.0) { + return JXL_FAILURE("Invalid white point %f %f", xy.x, xy.y); + } + if (ApproxEq(xy.x, 0.3127) && ApproxEq(xy.y, 0.3290)) { + white_point = WhitePoint::kD65; + return true; + } + if (ApproxEq(xy.x, 1.0 / 3) && ApproxEq(xy.y, 1.0 / 3)) { + white_point = WhitePoint::kE; + return true; + } + if (ApproxEq(xy.x, 0.314) && ApproxEq(xy.y, 0.351)) { + white_point = WhitePoint::kDCI; + return true; + } + white_point = WhitePoint::kCustom; + return white_.Set(xy); +} + +PrimariesCIExy ColorEncoding::GetPrimaries() const { + JXL_DASSERT(have_fields_); + JXL_ASSERT(HasPrimaries()); + PrimariesCIExy xy; + switch (primaries) { + case Primaries::kCustom: + xy.r = red_.Get(); + xy.g = green_.Get(); + xy.b = blue_.Get(); + return xy; + + case Primaries::kSRGB: + xy.r.x = 0.639998686; + xy.r.y = 0.330010138; + xy.g.x = 0.300003784; + xy.g.y = 0.600003357; + xy.b.x = 0.150002046; + xy.b.y = 0.059997204; + return xy; + + case Primaries::k2100: + xy.r.x = 0.708; + xy.r.y = 0.292; + xy.g.x = 0.170; + xy.g.y = 0.797; + xy.b.x = 0.131; + xy.b.y = 0.046; + return xy; + + case Primaries::kP3: + xy.r.x = 0.680; + xy.r.y = 0.320; + xy.g.x = 0.265; + xy.g.y = 0.690; + xy.b.x = 0.150; + xy.b.y = 0.060; + return xy; + } + JXL_ABORT("Invalid Primaries %u", static_cast(primaries)); +} + +Status ColorEncoding::SetPrimaries(const PrimariesCIExy& xy) { + JXL_DASSERT(have_fields_); + JXL_ASSERT(HasPrimaries()); + if (xy.r.x == 0.0 || xy.r.y == 0.0 || xy.g.x == 0.0 || xy.g.y == 0.0 || + xy.b.x == 0.0 || xy.b.y == 0.0) { + return JXL_FAILURE("Invalid primaries %f %f %f %f %f %f", xy.r.x, xy.r.y, + xy.g.x, xy.g.y, xy.b.x, xy.b.y); + } + + if (ApproxEq(xy.r.x, 0.64) && ApproxEq(xy.r.y, 0.33) && + ApproxEq(xy.g.x, 0.30) && ApproxEq(xy.g.y, 0.60) && + ApproxEq(xy.b.x, 0.15) && ApproxEq(xy.b.y, 0.06)) { + primaries = Primaries::kSRGB; + return true; + } + + if (ApproxEq(xy.r.x, 0.708) && ApproxEq(xy.r.y, 0.292) && + ApproxEq(xy.g.x, 0.170) && ApproxEq(xy.g.y, 0.797) && + ApproxEq(xy.b.x, 0.131) && ApproxEq(xy.b.y, 0.046)) { + primaries = Primaries::k2100; + return true; + } + if (ApproxEq(xy.r.x, 0.680) && ApproxEq(xy.r.y, 0.320) && + ApproxEq(xy.g.x, 0.265) && ApproxEq(xy.g.y, 0.690) && + ApproxEq(xy.b.x, 0.150) && ApproxEq(xy.b.y, 0.060)) { + primaries = Primaries::kP3; + return true; + } + + primaries = Primaries::kCustom; + JXL_RETURN_IF_ERROR(red_.Set(xy.r)); + JXL_RETURN_IF_ERROR(green_.Set(xy.g)); + JXL_RETURN_IF_ERROR(blue_.Set(xy.b)); + return true; +} + +Status ColorEncoding::CreateICC() { + InternalRemoveICC(); + if (!MaybeCreateProfile(*this, &icc_)) { + return JXL_FAILURE("Failed to create profile from fields"); + } + return true; +} + +std::string Description(const ColorEncoding& c_in) { + // Copy required for Implicit* + ColorEncoding c = c_in; + + std::string d = ToString(c.GetColorSpace()); + + if (!c.ImplicitWhitePoint()) { + d += '_'; + if (c.white_point == WhitePoint::kCustom) { + const CIExy wp = c.GetWhitePoint(); + d += ToString(wp.x) + ';'; + d += ToString(wp.y); + } else { + d += ToString(c.white_point); + } + } + + if (c.HasPrimaries()) { + d += '_'; + if (c.primaries == Primaries::kCustom) { + const PrimariesCIExy pr = c.GetPrimaries(); + d += ToString(pr.r.x) + ';'; + d += ToString(pr.r.y) + ';'; + d += ToString(pr.g.x) + ';'; + d += ToString(pr.g.y) + ';'; + d += ToString(pr.b.x) + ';'; + d += ToString(pr.b.y); + } else { + d += ToString(c.primaries); + } + } + + d += '_'; + d += ToString(c.rendering_intent); + + if (!c.tf.SetImplicit()) { + d += '_'; + if (c.tf.IsGamma()) { + d += 'g'; + d += ToString(c.tf.GetGamma()); + } else { + d += ToString(c.tf.GetTransferFunction()); + } + } + + return d; +} + +Customxy::Customxy() { Bundle::Init(this); } +Status Customxy::VisitFields(Visitor* JXL_RESTRICT visitor) { + uint32_t ux = PackSigned(x); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(19), BitsOffset(19, 524288), + BitsOffset(20, 1048576), + BitsOffset(21, 2097152), 0, &ux)); + x = UnpackSigned(ux); + uint32_t uy = PackSigned(y); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(19), BitsOffset(19, 524288), + BitsOffset(20, 1048576), + BitsOffset(21, 2097152), 0, &uy)); + y = UnpackSigned(uy); + return true; +} + +CustomTransferFunction::CustomTransferFunction() { Bundle::Init(this); } +Status CustomTransferFunction::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->Conditional(!SetImplicit())) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_gamma_)); + + if (visitor->Conditional(have_gamma_)) { + // Gamma is represented as a 24-bit int, the exponent used is + // gamma_ / 1e7. Valid values are (0, 1]. On the low end side, we also + // limit it to kMaxGamma/1e7. + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(24, kGammaMul, &gamma_)); + if (gamma_ > kGammaMul || + static_cast(gamma_) * kMaxGamma < kGammaMul) { + return JXL_FAILURE("Invalid gamma %u", gamma_); + } + } + + if (visitor->Conditional(!have_gamma_)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->Enum(TransferFunction::kSRGB, &transfer_function_)); + } + } + + return true; +} + +ColorEncoding::ColorEncoding() { Bundle::Init(this); } +Status ColorEncoding::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &want_icc_)); + + // Always send even if want_icc_ because this affects decoding. + // We can skip the white point/primaries because they do not. + JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(ColorSpace::kRGB, &color_space_)); + + if (visitor->Conditional(!WantICC())) { + // Serialize enums. NOTE: we set the defaults to the most common values so + // ImageMetadata.all_default is true in the common case. + + if (visitor->Conditional(!ImplicitWhitePoint())) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(WhitePoint::kD65, &white_point)); + if (visitor->Conditional(white_point == WhitePoint::kCustom)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&white_)); + } + } + + if (visitor->Conditional(HasPrimaries())) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(Primaries::kSRGB, &primaries)); + if (visitor->Conditional(primaries == Primaries::kCustom)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&red_)); + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&green_)); + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&blue_)); + } + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&tf)); + + JXL_QUIET_RETURN_IF_ERROR( + visitor->Enum(RenderingIntent::kRelative, &rendering_intent)); + + // We didn't have ICC, so all fields should be known. + if (color_space_ == ColorSpace::kUnknown || tf.IsUnknown()) { + return JXL_FAILURE( + "No ICC but cs %u and tf %u%s", + static_cast(color_space_), + tf.IsGamma() ? 0 + : static_cast(tf.GetTransferFunction()), + tf.IsGamma() ? "(gamma)" : ""); + } + + JXL_RETURN_IF_ERROR(CreateICC()); + } + + if (WantICC() && visitor->IsReading()) { + // Haven't called SetICC() yet, do nothing. + } else { + if (ICC().empty()) return JXL_FAILURE("Empty ICC"); + } + + return true; +} + +void ConvertInternalToExternalColorEncoding(const ColorEncoding& internal, + JxlColorEncoding* external) { + external->color_space = static_cast(internal.GetColorSpace()); + + external->white_point = static_cast(internal.white_point); + + jxl::CIExy whitepoint = internal.GetWhitePoint(); + external->white_point_xy[0] = whitepoint.x; + external->white_point_xy[1] = whitepoint.y; + + if (external->color_space == JXL_COLOR_SPACE_RGB || + external->color_space == JXL_COLOR_SPACE_UNKNOWN) { + external->primaries = static_cast(internal.primaries); + jxl::PrimariesCIExy primaries = internal.GetPrimaries(); + external->primaries_red_xy[0] = primaries.r.x; + external->primaries_red_xy[1] = primaries.r.y; + external->primaries_green_xy[0] = primaries.g.x; + external->primaries_green_xy[1] = primaries.g.y; + external->primaries_blue_xy[0] = primaries.b.x; + external->primaries_blue_xy[1] = primaries.b.y; + } + + if (internal.tf.IsGamma()) { + external->transfer_function = JXL_TRANSFER_FUNCTION_GAMMA; + external->gamma = internal.tf.GetGamma(); + } else { + external->transfer_function = + static_cast(internal.tf.GetTransferFunction()); + external->gamma = 0; + } + + external->rendering_intent = + static_cast(internal.rendering_intent); +} + +Status ConvertExternalToInternalColorEncoding(const JxlColorEncoding& external, + ColorEncoding* internal) { + internal->SetColorSpace(static_cast(external.color_space)); + + JXL_RETURN_IF_ERROR(ConvertExternalToInternalWhitePoint( + external.white_point, &internal->white_point)); + if (external.white_point == JXL_WHITE_POINT_CUSTOM) { + CIExy wp; + wp.x = external.white_point_xy[0]; + wp.y = external.white_point_xy[1]; + JXL_RETURN_IF_ERROR(internal->SetWhitePoint(wp)); + } + + if (external.color_space == JXL_COLOR_SPACE_RGB || + external.color_space == JXL_COLOR_SPACE_UNKNOWN) { + JXL_RETURN_IF_ERROR(ConvertExternalToInternalPrimaries( + external.primaries, &internal->primaries)); + if (external.primaries == JXL_PRIMARIES_CUSTOM) { + PrimariesCIExy primaries; + primaries.r.x = external.primaries_red_xy[0]; + primaries.r.y = external.primaries_red_xy[1]; + primaries.g.x = external.primaries_green_xy[0]; + primaries.g.y = external.primaries_green_xy[1]; + primaries.b.x = external.primaries_blue_xy[0]; + primaries.b.y = external.primaries_blue_xy[1]; + JXL_RETURN_IF_ERROR(internal->SetPrimaries(primaries)); + } + } + CustomTransferFunction tf; + tf.nonserialized_color_space = internal->GetColorSpace(); + if (external.transfer_function == JXL_TRANSFER_FUNCTION_GAMMA) { + JXL_RETURN_IF_ERROR(tf.SetGamma(external.gamma)); + } else { + TransferFunction tf_enum; + // JXL_TRANSFER_FUNCTION_GAMMA is not handled by this function since there's + // no internal enum value for it. + JXL_RETURN_IF_ERROR(ConvertExternalToInternalTransferFunction( + external.transfer_function, &tf_enum)); + tf.SetTransferFunction(tf_enum); + } + internal->tf = tf; + + JXL_RETURN_IF_ERROR(ConvertExternalToInternalRenderingIntent( + external.rendering_intent, &internal->rendering_intent)); + + // The ColorEncoding caches an ICC profile it created earlier that may no + // longer match the profile with the changed fields, so re-create it. + if (!(internal->CreateICC())) { + // This is not an error: for example, it doesn't have ICC profile creation + // implemented for XYB. This should not be returned as error, since + // ConvertExternalToInternalColorEncoding still worked correctly, and what + // matters is that internal->ICC() will not return the wrong profile. + } + + return true; +} + +/* Chromatic adaptation matrices*/ +static const float kBradford[9] = { + 0.8951f, 0.2664f, -0.1614f, -0.7502f, 1.7135f, + 0.0367f, 0.0389f, -0.0685f, 1.0296f, +}; + +static const float kBradfordInv[9] = { + 0.9869929f, -0.1470543f, 0.1599627f, 0.4323053f, 0.5183603f, + 0.0492912f, -0.0085287f, 0.0400428f, 0.9684867f, +}; + +// Adapts whitepoint x, y to D50 +Status AdaptToXYZD50(float wx, float wy, float matrix[9]) { + if (wx < 0 || wx > 1 || wy <= 0 || wy > 1) { + // Out of range values can cause division through zero + // further down with the bradford adaptation too. + return JXL_FAILURE("Invalid white point"); + } + float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy}; + // 1 / tiny float can still overflow + JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2])); + float w50[3] = {0.96422f, 1.0f, 0.82521f}; + + float lms[3]; + float lms50[3]; + + Mul3x3Vector(kBradford, w, lms); + Mul3x3Vector(kBradford, w50, lms50); + + if (lms[0] == 0 || lms[1] == 0 || lms[2] == 0) { + return JXL_FAILURE("Invalid white point"); + } + float a[9] = { + // /----> 0, 1, 2, 3, /----> 4, 5, 6, 7, /----> 8, + lms50[0] / lms[0], 0, 0, 0, lms50[1] / lms[1], 0, 0, 0, lms50[2] / lms[2], + }; + if (!std::isfinite(a[0]) || !std::isfinite(a[4]) || !std::isfinite(a[8])) { + return JXL_FAILURE("Invalid white point"); + } + + float b[9]; + Mul3x3Matrix(a, kBradford, b); + Mul3x3Matrix(kBradfordInv, b, matrix); + + return true; +} + +Status PrimariesToXYZ(float rx, float ry, float gx, float gy, float bx, + float by, float wx, float wy, float matrix[9]) { + if (wx < 0 || wx > 1 || wy <= 0 || wy > 1) { + return JXL_FAILURE("Invalid white point"); + } + // TODO(lode): also require rx, ry, gx, gy, bx, to be in range 0-1? ICC + // profiles in theory forbid negative XYZ values, but in practice the ACES P0 + // color space uses a negative y for the blue primary. + float primaries[9] = { + rx, gx, bx, ry, gy, by, 1.0f - rx - ry, 1.0f - gx - gy, 1.0f - bx - by}; + float primaries_inv[9]; + memcpy(primaries_inv, primaries, sizeof(float) * 9); + JXL_RETURN_IF_ERROR(Inv3x3Matrix(primaries_inv)); + + float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy}; + // 1 / tiny float can still overflow + JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2])); + float xyz[3]; + Mul3x3Vector(primaries_inv, w, xyz); + + float a[9] = { + xyz[0], 0, 0, 0, xyz[1], 0, 0, 0, xyz[2], + }; + + Mul3x3Matrix(primaries, a, matrix); + return true; +} + +Status PrimariesToXYZD50(float rx, float ry, float gx, float gy, float bx, + float by, float wx, float wy, float matrix[9]) { + float toXYZ[9]; + JXL_RETURN_IF_ERROR(PrimariesToXYZ(rx, ry, gx, gy, bx, by, wx, wy, toXYZ)); + float d50[9]; + JXL_RETURN_IF_ERROR(AdaptToXYZD50(wx, wy, d50)); + + Mul3x3Matrix(d50, toXYZ, matrix); + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h new file mode 100644 index 0000000000..713f216538 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h @@ -0,0 +1,463 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COLOR_ENCODING_INTERNAL_H_ +#define LIB_JXL_COLOR_ENCODING_INTERNAL_H_ + +// Metadata for color space conversions. + +#include +#include +#include +#include + +#include // std::abs +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +// (All CIE units are for the standard 1931 2 degree observer) + +// Color space the color pixel data is encoded in. The color pixel data is +// 3-channel in all cases except in case of kGray, where it uses only 1 channel. +// This also determines the amount of channels used in modular encoding. +enum class ColorSpace : uint32_t { + // Trichromatic color data. This also includes CMYK if a kBlack + // ExtraChannelInfo is present. This implies, if there is an ICC profile, that + // the ICC profile uses a 3-channel color space if no kBlack extra channel is + // present, or uses color space 'CMYK' if a kBlack extra channel is present. + kRGB, + // Single-channel data. This implies, if there is an ICC profile, that the ICC + // profile also represents single-channel data and has the appropriate color + // space ('GRAY'). + kGray, + // Like kRGB, but implies fixed values for primaries etc. + kXYB, + // For non-RGB/gray data, e.g. from non-electro-optical sensors. Otherwise + // the same conditions as kRGB apply. + kUnknown +}; + +static inline const char* EnumName(ColorSpace /*unused*/) { + return "ColorSpace"; +} +static inline constexpr uint64_t EnumBits(ColorSpace /*unused*/) { + using CS = ColorSpace; + return MakeBit(CS::kRGB) | MakeBit(CS::kGray) | MakeBit(CS::kXYB) | + MakeBit(CS::kUnknown); +} + +// Values from CICP ColourPrimaries. +enum class WhitePoint : uint32_t { + kD65 = 1, // sRGB/BT.709/Display P3/BT.2020 + kCustom = 2, // Actual values encoded in separate fields + kE = 10, // XYZ + kDCI = 11, // DCI-P3 +}; + +static inline const char* EnumName(WhitePoint /*unused*/) { + return "WhitePoint"; +} +static inline constexpr uint64_t EnumBits(WhitePoint /*unused*/) { + return MakeBit(WhitePoint::kD65) | MakeBit(WhitePoint::kCustom) | + MakeBit(WhitePoint::kE) | MakeBit(WhitePoint::kDCI); +} + +// Values from CICP ColourPrimaries +enum class Primaries : uint32_t { + kSRGB = 1, // Same as BT.709 + kCustom = 2, // Actual values encoded in separate fields + k2100 = 9, // Same as BT.2020 + kP3 = 11, +}; + +static inline const char* EnumName(Primaries /*unused*/) { return "Primaries"; } +static inline constexpr uint64_t EnumBits(Primaries /*unused*/) { + using Pr = Primaries; + return MakeBit(Pr::kSRGB) | MakeBit(Pr::kCustom) | MakeBit(Pr::k2100) | + MakeBit(Pr::kP3); +} + +// Values from CICP TransferCharacteristics +enum class TransferFunction : uint32_t { + k709 = 1, + kUnknown = 2, + kLinear = 8, + kSRGB = 13, + kPQ = 16, // from BT.2100 + kDCI = 17, // from SMPTE RP 431-2 reference projector + kHLG = 18, // from BT.2100 +}; + +static inline const char* EnumName(TransferFunction /*unused*/) { + return "TransferFunction"; +} +static inline constexpr uint64_t EnumBits(TransferFunction /*unused*/) { + using TF = TransferFunction; + return MakeBit(TF::k709) | MakeBit(TF::kLinear) | MakeBit(TF::kSRGB) | + MakeBit(TF::kPQ) | MakeBit(TF::kDCI) | MakeBit(TF::kHLG) | + MakeBit(TF::kUnknown); +} + +enum class RenderingIntent : uint32_t { + // Values match ICC sRGB encodings. + kPerceptual = 0, // good for photos, requires a profile with LUT. + kRelative, // good for logos. + kSaturation, // perhaps useful for CG with fully saturated colors. + kAbsolute, // leaves white point unchanged; good for proofing. +}; + +static inline const char* EnumName(RenderingIntent /*unused*/) { + return "RenderingIntent"; +} +static inline constexpr uint64_t EnumBits(RenderingIntent /*unused*/) { + using RI = RenderingIntent; + return MakeBit(RI::kPerceptual) | MakeBit(RI::kRelative) | + MakeBit(RI::kSaturation) | MakeBit(RI::kAbsolute); +} + +// Chromaticity (Y is omitted because it is 1 for primaries/white points) +struct CIExy { + double x = 0.0; + double y = 0.0; +}; + +struct PrimariesCIExy { + CIExy r; + CIExy g; + CIExy b; +}; + +// Serializable form of CIExy. +struct Customxy : public Fields { + Customxy(); + JXL_FIELDS_NAME(Customxy) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + CIExy Get() const; + // Returns false if x or y do not fit in the encoding. + Status Set(const CIExy& xy); + + int32_t x; + int32_t y; +}; + +struct CustomTransferFunction : public Fields { + CustomTransferFunction(); + JXL_FIELDS_NAME(CustomTransferFunction) + + // Sets fields and returns true if nonserialized_color_space has an implicit + // transfer function, otherwise leaves fields unchanged and returns false. + bool SetImplicit(); + + // Gamma: only used for PNG inputs + bool IsGamma() const { return have_gamma_; } + double GetGamma() const { + JXL_ASSERT(IsGamma()); + return gamma_ * 1E-7; // (0, 1) + } + Status SetGamma(double gamma); + + TransferFunction GetTransferFunction() const { + JXL_ASSERT(!IsGamma()); + return transfer_function_; + } + void SetTransferFunction(const TransferFunction tf) { + have_gamma_ = false; + transfer_function_ = tf; + } + + bool IsUnknown() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kUnknown); + } + bool IsSRGB() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kSRGB); + } + bool IsLinear() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kLinear); + } + bool IsPQ() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kPQ); + } + bool IsHLG() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kHLG); + } + bool Is709() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::k709); + } + bool IsDCI() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kDCI); + } + bool IsSame(const CustomTransferFunction& other) const { + if (have_gamma_ != other.have_gamma_) return false; + if (have_gamma_) { + if (gamma_ != other.gamma_) return false; + } else { + if (transfer_function_ != other.transfer_function_) return false; + } + return true; + } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Must be set before calling VisitFields! + ColorSpace nonserialized_color_space = ColorSpace::kRGB; + + private: + static constexpr uint32_t kGammaMul = 10000000; + + bool have_gamma_; + + // OETF exponent to go from linear to gamma-compressed. + uint32_t gamma_; // Only used if have_gamma_. + + // Can be kUnknown. + TransferFunction transfer_function_; // Only used if !have_gamma_. +}; + +// Compact encoding of data required to interpret and translate pixels to a +// known color space. Stored in Metadata. Thread-compatible. +struct ColorEncoding : public Fields { + ColorEncoding(); + JXL_FIELDS_NAME(ColorEncoding) + + // Returns ready-to-use color encodings (initialized on-demand). + static const ColorEncoding& SRGB(bool is_gray = false); + static const ColorEncoding& LinearSRGB(bool is_gray = false); + + // Returns true if an ICC profile was successfully created from fields. + // Must be called after modifying fields. Defined in color_management.cc. + Status CreateICC(); + + // Returns non-empty and valid ICC profile, unless: + // - between calling InternalRemoveICC() and CreateICC() in tests; + // - WantICC() == true and SetICC() was not yet called; + // - after a failed call to SetSRGB(), SetICC(), or CreateICC(). + const PaddedBytes& ICC() const { return icc_; } + + // Internal only, do not call except from tests. + void InternalRemoveICC() { icc_.clear(); } + + // Returns true if `icc` is assigned and decoded successfully. If so, + // subsequent WantICC() will return true until DecideIfWantICC() changes it. + // Returning false indicates data has been lost. + Status SetICC(PaddedBytes&& icc) { + if (icc.empty()) return false; + icc_ = std::move(icc); + + if (!SetFieldsFromICC()) { + InternalRemoveICC(); + return false; + } + + want_icc_ = true; + return true; + } + + // Sets the raw ICC profile bytes, without parsing the ICC, and without + // updating the direct fields such as whitepoint, primaries and color + // space. Functions to get and set fields, such as SetWhitePoint, cannot be + // used anymore after this and functions such as IsSRGB return false no matter + // what the contents of the icc profile. + Status SetICCRaw(PaddedBytes&& icc) { + if (icc.empty()) return false; + icc_ = std::move(icc); + + want_icc_ = true; + have_fields_ = false; + return true; + } + + // Returns whether to send the ICC profile in the codestream. + bool WantICC() const { return want_icc_; } + + // Return whether the direct fields are set, if false but ICC is set, only + // raw ICC bytes are known. + bool HaveFields() const { return have_fields_; } + + // Causes WantICC() to return false if ICC() can be reconstructed from fields. + // Defined in color_management.cc. + void DecideIfWantICC(); + + bool IsGray() const { return color_space_ == ColorSpace::kGray; } + bool IsCMYK() const { return cmyk_; } + size_t Channels() const { return IsGray() ? 1 : 3; } + + // Returns false if the field is invalid and unusable. + bool HasPrimaries() const { + return !IsGray() && color_space_ != ColorSpace::kXYB; + } + + // Returns true after setting the field to a value defined by color_space, + // otherwise false and leaves the field unchanged. + bool ImplicitWhitePoint() { + if (color_space_ == ColorSpace::kXYB) { + white_point = WhitePoint::kD65; + return true; + } + return false; + } + + // Returns whether the color space is known to be sRGB. If a raw unparsed ICC + // profile is set without the fields being set, this returns false, even if + // the content of the ICC profile would match sRGB. + bool IsSRGB() const { + if (!have_fields_) return false; + if (!IsGray() && color_space_ != ColorSpace::kRGB) return false; + if (white_point != WhitePoint::kD65) return false; + if (primaries != Primaries::kSRGB) return false; + if (!tf.IsSRGB()) return false; + return true; + } + + // Returns whether the color space is known to be linear sRGB. If a raw + // unparsed ICC profile is set without the fields being set, this returns + // false, even if the content of the ICC profile would match linear sRGB. + bool IsLinearSRGB() const { + if (!have_fields_) return false; + if (!IsGray() && color_space_ != ColorSpace::kRGB) return false; + if (white_point != WhitePoint::kD65) return false; + if (primaries != Primaries::kSRGB) return false; + if (!tf.IsLinear()) return false; + return true; + } + + Status SetSRGB(const ColorSpace cs, + const RenderingIntent ri = RenderingIntent::kRelative) { + InternalRemoveICC(); + JXL_ASSERT(cs == ColorSpace::kGray || cs == ColorSpace::kRGB); + color_space_ = cs; + white_point = WhitePoint::kD65; + primaries = Primaries::kSRGB; + tf.SetTransferFunction(TransferFunction::kSRGB); + rendering_intent = ri; + return CreateICC(); + } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Accessors ensure tf.nonserialized_color_space is updated at the same time. + ColorSpace GetColorSpace() const { return color_space_; } + void SetColorSpace(const ColorSpace cs) { + color_space_ = cs; + tf.nonserialized_color_space = cs; + } + + CIExy GetWhitePoint() const; + Status SetWhitePoint(const CIExy& xy); + + PrimariesCIExy GetPrimaries() const; + Status SetPrimaries(const PrimariesCIExy& xy); + + // Checks if the color spaces (including white point / primaries) are the + // same, but ignores the transfer function, rendering intent and ICC bytes. + bool SameColorSpace(const ColorEncoding& other) const { + if (color_space_ != other.color_space_) return false; + + if (white_point != other.white_point) return false; + if (white_point == WhitePoint::kCustom) { + if (white_.x != other.white_.x || white_.y != other.white_.y) + return false; + } + + if (HasPrimaries() != other.HasPrimaries()) return false; + if (HasPrimaries()) { + if (primaries != other.primaries) return false; + if (primaries == Primaries::kCustom) { + if (red_.x != other.red_.x || red_.y != other.red_.y) return false; + if (green_.x != other.green_.x || green_.y != other.green_.y) + return false; + if (blue_.x != other.blue_.x || blue_.y != other.blue_.y) return false; + } + } + return true; + } + + // Checks if the color space and transfer function are the same, ignoring + // rendering intent and ICC bytes + bool SameColorEncoding(const ColorEncoding& other) const { + return SameColorSpace(other) && tf.IsSame(other.tf); + } + + mutable bool all_default; + + // Only valid if HaveFields() + WhitePoint white_point; + Primaries primaries; // Only valid if HasPrimaries() + CustomTransferFunction tf; + RenderingIntent rendering_intent; + + private: + // Returns true if all fields have been initialized (possibly to kUnknown). + // Returns false if the ICC profile is invalid or decoding it fails. + // Defined in enc_color_management.cc. + Status SetFieldsFromICC(); + + // If true, the codestream contains an ICC profile and we do not serialize + // fields. Otherwise, fields are serialized and we create an ICC profile. + bool want_icc_; + + // When false, fields such as white_point and tf are invalid and must not be + // used. This occurs after setting a raw bytes-only ICC profile, only the + // ICC bytes may be used. The color_space_ field is still valid. + bool have_fields_ = true; + + PaddedBytes icc_; // Valid ICC profile + + ColorSpace color_space_; // Can be kUnknown + bool cmyk_ = false; + + // Only used if white_point == kCustom. + Customxy white_; + + // Only used if primaries == kCustom. + Customxy red_; + Customxy green_; + Customxy blue_; +}; + +// Returns whether the two inputs are approximately equal. +static inline bool ApproxEq(const double a, const double b, +#if JPEGXL_ENABLE_SKCMS + double max_l1 = 1E-3) { +#else + double max_l1 = 8E-5) { +#endif + // Threshold should be sufficient for ICC's 15-bit fixed-point numbers. + // We have seen differences of 7.1E-5 with lcms2 and 1E-3 with skcms. + return std::abs(a - b) <= max_l1; +} + +// Returns a representation of the ColorEncoding fields (not icc). +// Example description: "RGB_D65_SRG_Rel_Lin" +std::string Description(const ColorEncoding& c); +static inline std::ostream& operator<<(std::ostream& os, + const ColorEncoding& c) { + return os << Description(c); +} + +void ConvertInternalToExternalColorEncoding(const jxl::ColorEncoding& internal, + JxlColorEncoding* external); + +Status ConvertExternalToInternalColorEncoding(const JxlColorEncoding& external, + jxl::ColorEncoding* internal); + +Status PrimariesToXYZ(float rx, float ry, float gx, float gy, float bx, + float by, float wx, float wy, float matrix[9]); +Status PrimariesToXYZD50(float rx, float ry, float gx, float gy, float bx, + float by, float wx, float wy, float matrix[9]); +Status AdaptToXYZD50(float wx, float wy, float matrix[9]); + +} // namespace jxl + +#endif // LIB_JXL_COLOR_ENCODING_INTERNAL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc b/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc new file mode 100644 index 0000000000..6ad47e1923 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc @@ -0,0 +1,157 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/color_encoding_internal.h" + +#include + +#include "lib/jxl/encode_internal.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +TEST(ColorEncodingTest, RoundTripAll) { + for (const test::ColorEncodingDescriptor& cdesc : test::AllEncodings()) { + const ColorEncoding c_original = test::ColorEncodingFromDescriptor(cdesc); + // Verify Set(Get) yields the same white point/primaries/gamma. + { + ColorEncoding c; + EXPECT_TRUE(c.SetWhitePoint(c_original.GetWhitePoint())); + EXPECT_EQ(c_original.white_point, c.white_point); + } + { + ColorEncoding c; + EXPECT_TRUE(c.SetPrimaries(c_original.GetPrimaries())); + EXPECT_EQ(c_original.primaries, c.primaries); + } + if (c_original.tf.IsGamma()) { + ColorEncoding c; + EXPECT_TRUE(c.tf.SetGamma(c_original.tf.GetGamma())); + EXPECT_TRUE(c_original.tf.IsSame(c.tf)); + } + } +} + +TEST(ColorEncodingTest, CustomWhitePoint) { + ColorEncoding c; + // Nonsensical values + CIExy xy_in; + xy_in.x = 0.8; + xy_in.y = 0.01; + EXPECT_TRUE(c.SetWhitePoint(xy_in)); + const CIExy xy = c.GetWhitePoint(); + + ColorEncoding c2; + EXPECT_TRUE(c2.SetWhitePoint(xy)); + EXPECT_TRUE(c.SameColorSpace(c2)); +} + +TEST(ColorEncodingTest, CustomPrimaries) { + ColorEncoding c; + PrimariesCIExy xy_in; + // Nonsensical values + xy_in.r.x = -0.01; + xy_in.r.y = 0.2; + xy_in.g.x = 0.4; + xy_in.g.y = 0.401; + xy_in.b.x = 1.1; + xy_in.b.y = -1.2; + EXPECT_TRUE(c.SetPrimaries(xy_in)); + const PrimariesCIExy xy = c.GetPrimaries(); + + ColorEncoding c2; + EXPECT_TRUE(c2.SetPrimaries(xy)); + EXPECT_TRUE(c.SameColorSpace(c2)); +} + +TEST(ColorEncodingTest, CustomGamma) { + ColorEncoding c; +#ifndef JXL_CRASH_ON_ERROR + EXPECT_FALSE(c.tf.SetGamma(0.0)); + EXPECT_FALSE(c.tf.SetGamma(-1E-6)); + EXPECT_FALSE(c.tf.SetGamma(1.001)); +#endif + EXPECT_TRUE(c.tf.SetGamma(1.0)); + EXPECT_FALSE(c.tf.IsGamma()); + EXPECT_TRUE(c.tf.IsLinear()); + + EXPECT_TRUE(c.tf.SetGamma(0.123)); + EXPECT_TRUE(c.tf.IsGamma()); + const double gamma = c.tf.GetGamma(); + + ColorEncoding c2; + EXPECT_TRUE(c2.tf.SetGamma(gamma)); + EXPECT_TRUE(c.SameColorEncoding(c2)); + EXPECT_TRUE(c2.tf.IsGamma()); +} + +TEST(ColorEncodingTest, InternalExternalConversion) { + ColorEncoding source_internal; + JxlColorEncoding external; + ColorEncoding destination_internal; + + for (int i = 0; i < 100; i++) { + source_internal.SetColorSpace(static_cast(rand() % 4)); + CIExy wp; + wp.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + wp.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + EXPECT_TRUE(source_internal.SetWhitePoint(wp)); + if (source_internal.HasPrimaries()) { + PrimariesCIExy primaries; + primaries.r.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + primaries.r.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + primaries.g.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + primaries.g.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + primaries.b.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + primaries.b.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + EXPECT_TRUE(source_internal.SetPrimaries(primaries)); + } + CustomTransferFunction tf; + EXPECT_TRUE(tf.SetGamma((float(rand()) / float((RAND_MAX)) * 0.5) + 0.25)); + source_internal.tf = tf; + source_internal.rendering_intent = static_cast(rand() % 4); + + ConvertInternalToExternalColorEncoding(source_internal, &external); + EXPECT_TRUE(ConvertExternalToInternalColorEncoding(external, + &destination_internal)); + + EXPECT_EQ(source_internal.GetColorSpace(), + destination_internal.GetColorSpace()); + EXPECT_EQ(source_internal.white_point, destination_internal.white_point); + EXPECT_EQ(source_internal.GetWhitePoint().x, + destination_internal.GetWhitePoint().x); + EXPECT_EQ(source_internal.GetWhitePoint().y, + destination_internal.GetWhitePoint().y); + if (source_internal.HasPrimaries()) { + EXPECT_EQ(source_internal.GetPrimaries().r.x, + destination_internal.GetPrimaries().r.x); + EXPECT_EQ(source_internal.GetPrimaries().r.y, + destination_internal.GetPrimaries().r.y); + EXPECT_EQ(source_internal.GetPrimaries().g.x, + destination_internal.GetPrimaries().g.x); + EXPECT_EQ(source_internal.GetPrimaries().g.y, + destination_internal.GetPrimaries().g.y); + EXPECT_EQ(source_internal.GetPrimaries().b.x, + destination_internal.GetPrimaries().b.x); + EXPECT_EQ(source_internal.GetPrimaries().b.y, + destination_internal.GetPrimaries().b.y); + } + EXPECT_EQ(source_internal.tf.IsGamma(), destination_internal.tf.IsGamma()); + if (source_internal.tf.IsGamma()) { + EXPECT_EQ(source_internal.tf.GetGamma(), + destination_internal.tf.GetGamma()); + } else { + EXPECT_EQ(source_internal.tf.GetTransferFunction(), + destination_internal.tf.GetTransferFunction()); + } + EXPECT_EQ(source_internal.rendering_intent, + destination_internal.rendering_intent); + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/color_management.cc b/third_party/jpeg-xl/lib/jxl/color_management.cc new file mode 100644 index 0000000000..d656888a8b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/color_management.cc @@ -0,0 +1,682 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/color_management.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/color_management.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// NOTE: this is only used to provide a reasonable ICC profile that other +// software can read. Our own transforms use ExtraTF instead because that is +// more precise and supports unbounded mode. +std::vector CreateTableCurve(uint32_t N, const ExtraTF tf) { + JXL_ASSERT(N <= 4096); // ICC MFT2 only allows 4K entries + JXL_ASSERT(tf == ExtraTF::kPQ || tf == ExtraTF::kHLG); + // No point using float - LCMS converts to 16-bit for A2B/MFT. + std::vector table(N); + for (uint32_t i = 0; i < N; ++i) { + const float x = static_cast(i) / (N - 1); // 1.0 at index N - 1. + const double dx = static_cast(x); + // LCMS requires EOTF (e.g. 2.4 exponent). + double y = (tf == ExtraTF::kHLG) ? TF_HLG().DisplayFromEncoded(dx) + : TF_PQ().DisplayFromEncoded(dx); + JXL_ASSERT(y >= 0.0); + // Clamp to table range - necessary for HLG. + if (y > 1.0) y = 1.0; + // 1.0 corresponds to table value 0xFFFF. + table[i] = static_cast(roundf(y * 65535.0)); + } + return table; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(CreateTableCurve); // Local function. + +Status CIEXYZFromWhiteCIExy(const CIExy& xy, float XYZ[3]) { + // Target Y = 1. + if (std::abs(xy.y) < 1e-12) return JXL_FAILURE("Y value is too small"); + const float factor = 1 / xy.y; + XYZ[0] = xy.x * factor; + XYZ[1] = 1; + XYZ[2] = (1 - xy.x - xy.y) * factor; + return true; +} + +namespace { + +// NOTE: this is only used to provide a reasonable ICC profile that other +// software can read. Our own transforms use ExtraTF instead because that is +// more precise and supports unbounded mode. +template +std::vector CreateTableCurve(uint32_t N, const Func& func) { + JXL_ASSERT(N <= 4096); // ICC MFT2 only allows 4K entries + // No point using float - LCMS converts to 16-bit for A2B/MFT. + std::vector table(N); + for (uint32_t i = 0; i < N; ++i) { + const float x = static_cast(i) / (N - 1); // 1.0 at index N - 1. + // LCMS requires EOTF (e.g. 2.4 exponent). + double y = func.DisplayFromEncoded(static_cast(x)); + JXL_ASSERT(y >= 0.0); + // Clamp to table range - necessary for HLG. + if (y > 1.0) y = 1.0; + // 1.0 corresponds to table value 0xFFFF. + table[i] = static_cast(roundf(y * 65535.0)); + } + return table; +} + +void ICCComputeMD5(const PaddedBytes& data, uint8_t sum[16]) + JXL_NO_SANITIZE("unsigned-integer-overflow") { + PaddedBytes data64 = data; + data64.push_back(128); + // Add bytes such that ((size + 8) & 63) == 0. + size_t extra = ((64 - ((data64.size() + 8) & 63)) & 63); + data64.resize(data64.size() + extra, 0); + for (uint64_t i = 0; i < 64; i += 8) { + data64.push_back(static_cast(data.size() << 3u) >> i); + } + + static const uint32_t sineparts[64] = { + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, + 0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, + 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340, + 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, + 0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, + 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa, + 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, + 0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, + 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391, + }; + static const uint32_t shift[64] = { + 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, + 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, + 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, + 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, + }; + + uint32_t a0 = 0x67452301, b0 = 0xefcdab89, c0 = 0x98badcfe, d0 = 0x10325476; + + for (size_t i = 0; i < data64.size(); i += 64) { + uint32_t a = a0, b = b0, c = c0, d = d0, f, g; + for (size_t j = 0; j < 64; j++) { + if (j < 16) { + f = (b & c) | ((~b) & d); + g = j; + } else if (j < 32) { + f = (d & b) | ((~d) & c); + g = (5 * j + 1) & 0xf; + } else if (j < 48) { + f = b ^ c ^ d; + g = (3 * j + 5) & 0xf; + } else { + f = c ^ (b | (~d)); + g = (7 * j) & 0xf; + } + uint32_t dg0 = data64[i + g * 4 + 0], dg1 = data64[i + g * 4 + 1], + dg2 = data64[i + g * 4 + 2], dg3 = data64[i + g * 4 + 3]; + uint32_t u = dg0 | (dg1 << 8u) | (dg2 << 16u) | (dg3 << 24u); + f += a + sineparts[j] + u; + a = d; + d = c; + c = b; + b += (f << shift[j]) | (f >> (32u - shift[j])); + } + a0 += a; + b0 += b; + c0 += c; + d0 += d; + } + sum[0] = a0; + sum[1] = a0 >> 8u; + sum[2] = a0 >> 16u; + sum[3] = a0 >> 24u; + sum[4] = b0; + sum[5] = b0 >> 8u; + sum[6] = b0 >> 16u; + sum[7] = b0 >> 24u; + sum[8] = c0; + sum[9] = c0 >> 8u; + sum[10] = c0 >> 16u; + sum[11] = c0 >> 24u; + sum[12] = d0; + sum[13] = d0 >> 8u; + sum[14] = d0 >> 16u; + sum[15] = d0 >> 24u; +} + +Status CreateICCChadMatrix(CIExy w, float result[9]) { + float m[9]; + if (w.y == 0) { // WhitePoint can not be pitch-black. + return JXL_FAILURE("Invalid WhitePoint"); + } + JXL_RETURN_IF_ERROR(AdaptToXYZD50(w.x, w.y, m)); + memcpy(result, m, sizeof(float) * 9); + return true; +} + +// Creates RGB to XYZ matrix given RGB primaries and whitepoint in xy. +Status CreateICCRGBMatrix(CIExy r, CIExy g, CIExy b, CIExy w, float result[9]) { + float m[9]; + JXL_RETURN_IF_ERROR( + PrimariesToXYZD50(r.x, r.y, g.x, g.y, b.x, b.y, w.x, w.y, m)); + memcpy(result, m, sizeof(float) * 9); + return true; +} + +void WriteICCUint32(uint32_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) { + if (icc->size() < pos + 4) icc->resize(pos + 4); + (*icc)[pos + 0] = (value >> 24u) & 255; + (*icc)[pos + 1] = (value >> 16u) & 255; + (*icc)[pos + 2] = (value >> 8u) & 255; + (*icc)[pos + 3] = value & 255; +} + +void WriteICCUint16(uint16_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) { + if (icc->size() < pos + 2) icc->resize(pos + 2); + (*icc)[pos + 0] = (value >> 8u) & 255; + (*icc)[pos + 1] = value & 255; +} + +void WriteICCUint8(uint8_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) { + if (icc->size() < pos + 1) icc->resize(pos + 1); + (*icc)[pos] = value; +} + +// Writes a 4-character tag +void WriteICCTag(const char* value, size_t pos, PaddedBytes* JXL_RESTRICT icc) { + if (icc->size() < pos + 4) icc->resize(pos + 4); + memcpy(icc->data() + pos, value, 4); +} + +Status WriteICCS15Fixed16(float value, size_t pos, + PaddedBytes* JXL_RESTRICT icc) { + // "nextafterf" for 32768.0f towards zero are: + // 32767.998046875, 32767.99609375, 32767.994140625 + // Even the first value works well,... + bool ok = (-32767.995f <= value) && (value <= 32767.995f); + if (!ok) return JXL_FAILURE("ICC value is out of range / NaN"); + int32_t i = value * 65536.0f + 0.5f; + // Use two's complement + uint32_t u = static_cast(i); + WriteICCUint32(u, pos, icc); + return true; +} + +Status CreateICCHeader(const ColorEncoding& c, + PaddedBytes* JXL_RESTRICT header) { + // TODO(lode): choose color management engine name, e.g. "skia" if + // integrated in skia. + static const char* kCmm = "jxl "; + + header->resize(128, 0); + + WriteICCUint32(0, 0, header); // size, correct value filled in at end + WriteICCTag(kCmm, 4, header); + WriteICCUint32(0x04400000u, 8, header); + const char* profile_type = + c.GetColorSpace() == ColorSpace::kXYB ? "scnr" : "mntr"; + WriteICCTag(profile_type, 12, header); + WriteICCTag(c.IsGray() ? "GRAY" : "RGB ", 16, header); + WriteICCTag("XYZ ", 20, header); + + // Three uint32_t's date/time encoding. + // TODO(lode): encode actual date and time, this is a placeholder + uint32_t year = 2019, month = 12, day = 1; + uint32_t hour = 0, minute = 0, second = 0; + WriteICCUint16(year, 24, header); + WriteICCUint16(month, 26, header); + WriteICCUint16(day, 28, header); + WriteICCUint16(hour, 30, header); + WriteICCUint16(minute, 32, header); + WriteICCUint16(second, 34, header); + + WriteICCTag("acsp", 36, header); + WriteICCTag("APPL", 40, header); + WriteICCUint32(0, 44, header); // flags + WriteICCUint32(0, 48, header); // device manufacturer + WriteICCUint32(0, 52, header); // device model + WriteICCUint32(0, 56, header); // device attributes + WriteICCUint32(0, 60, header); // device attributes + WriteICCUint32(static_cast(c.rendering_intent), 64, header); + + // Mandatory D50 white point of profile connection space + WriteICCUint32(0x0000f6d6, 68, header); + WriteICCUint32(0x00010000, 72, header); + WriteICCUint32(0x0000d32d, 76, header); + + WriteICCTag(kCmm, 80, header); + + return true; +} + +void AddToICCTagTable(const char* tag, size_t offset, size_t size, + PaddedBytes* JXL_RESTRICT tagtable, + std::vector* offsets) { + WriteICCTag(tag, tagtable->size(), tagtable); + // writing true offset deferred to later + WriteICCUint32(0, tagtable->size(), tagtable); + offsets->push_back(offset); + WriteICCUint32(size, tagtable->size(), tagtable); +} + +void FinalizeICCTag(PaddedBytes* JXL_RESTRICT tags, size_t* offset, + size_t* size) { + while ((tags->size() & 3) != 0) { + tags->push_back(0); + } + *offset += *size; + *size = tags->size() - *offset; +} + +// The input text must be ASCII, writing other characters to UTF-16 is not +// implemented. +void CreateICCMlucTag(const std::string& text, PaddedBytes* JXL_RESTRICT tags) { + WriteICCTag("mluc", tags->size(), tags); + WriteICCUint32(0, tags->size(), tags); + WriteICCUint32(1, tags->size(), tags); + WriteICCUint32(12, tags->size(), tags); + WriteICCTag("enUS", tags->size(), tags); + WriteICCUint32(text.size() * 2, tags->size(), tags); + WriteICCUint32(28, tags->size(), tags); + for (size_t i = 0; i < text.size(); i++) { + tags->push_back(0); // prepend 0 for UTF-16 + tags->push_back(text[i]); + } +} + +Status CreateICCXYZTag(float xyz[3], PaddedBytes* JXL_RESTRICT tags) { + WriteICCTag("XYZ ", tags->size(), tags); + WriteICCUint32(0, tags->size(), tags); + for (size_t i = 0; i < 3; ++i) { + JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(xyz[i], tags->size(), tags)); + } + return true; +} + +Status CreateICCChadTag(float chad[9], PaddedBytes* JXL_RESTRICT tags) { + WriteICCTag("sf32", tags->size(), tags); + WriteICCUint32(0, tags->size(), tags); + for (size_t i = 0; i < 9; i++) { + JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(chad[i], tags->size(), tags)); + } + return true; +} + +void MaybeCreateICCCICPTag(const ColorEncoding& c, + PaddedBytes* JXL_RESTRICT tags, size_t* offset, + size_t* size, PaddedBytes* JXL_RESTRICT tagtable, + std::vector* offsets) { + if (c.GetColorSpace() != ColorSpace::kRGB) { + return; + } + uint8_t primaries = 0; + if (c.primaries == Primaries::kP3) { + if (c.white_point == WhitePoint::kD65) { + primaries = 12; + } else if (c.white_point == WhitePoint::kDCI) { + primaries = 11; + } else { + return; + } + } else if (c.primaries != Primaries::kCustom && + c.white_point == WhitePoint::kD65) { + primaries = static_cast(c.primaries); + } else { + return; + } + if (c.tf.IsUnknown() || c.tf.IsGamma()) { + return; + } + WriteICCTag("cicp", tags->size(), tags); + WriteICCUint32(0, tags->size(), tags); + WriteICCUint8(primaries, tags->size(), tags); + WriteICCUint8(static_cast(c.tf.GetTransferFunction()), tags->size(), + tags); + // Matrix + WriteICCUint8(0, tags->size(), tags); + // Full range + WriteICCUint8(1, tags->size(), tags); + FinalizeICCTag(tags, offset, size); + AddToICCTagTable("cicp", *offset, *size, tagtable, offsets); +} + +void CreateICCCurvCurvTag(const std::vector& curve, + PaddedBytes* JXL_RESTRICT tags) { + size_t pos = tags->size(); + tags->resize(tags->size() + 12 + curve.size() * 2, 0); + WriteICCTag("curv", pos, tags); + WriteICCUint32(0, pos + 4, tags); + WriteICCUint32(curve.size(), pos + 8, tags); + for (size_t i = 0; i < curve.size(); i++) { + WriteICCUint16(curve[i], pos + 12 + i * 2, tags); + } +} + +// Writes 12 + 4*params.size() bytes +Status CreateICCCurvParaTag(std::vector params, size_t curve_type, + PaddedBytes* JXL_RESTRICT tags) { + WriteICCTag("para", tags->size(), tags); + WriteICCUint32(0, tags->size(), tags); + WriteICCUint16(curve_type, tags->size(), tags); + WriteICCUint16(0, tags->size(), tags); + for (size_t i = 0; i < params.size(); i++) { + JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(params[i], tags->size(), tags)); + } + return true; +} + +Status CreateICCLutAtoBTagForXYB(PaddedBytes* JXL_RESTRICT tags) { + WriteICCTag("mAB ", tags->size(), tags); + // 4 reserved bytes set to 0 + WriteICCUint32(0, tags->size(), tags); + // number of input channels + WriteICCUint8(3, tags->size(), tags); + // number of output channels + WriteICCUint8(3, tags->size(), tags); + // 2 reserved bytes for padding + WriteICCUint16(0, tags->size(), tags); + // offset to first B curve + WriteICCUint32(32, tags->size(), tags); + // offset to matrix + WriteICCUint32(244, tags->size(), tags); + // offset to first M curve + WriteICCUint32(148, tags->size(), tags); + // offset to CLUT + WriteICCUint32(80, tags->size(), tags); + // offset to first A curve + // (reuse linear B curves) + WriteICCUint32(32, tags->size(), tags); + + // offset = 32 + // no-op curves + JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags)); + JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags)); + JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags)); + // offset = 80 + // number of grid points for each input channel + for (int i = 0; i < 16; ++i) { + WriteICCUint8(i < 3 ? 2 : 0, tags->size(), tags); + } + // precision = 2 + WriteICCUint8(2, tags->size(), tags); + // 3 bytes of padding + WriteICCUint8(0, tags->size(), tags); + WriteICCUint16(0, tags->size(), tags); + const float kOffsets[3] = { + kScaledXYBOffset[0] + kScaledXYBOffset[1], + kScaledXYBOffset[1] - kScaledXYBOffset[0] + 1.0f / kScaledXYBScale[0], + kScaledXYBOffset[1] + kScaledXYBOffset[2]}; + const float kScaling[3] = { + 1.0f / (1.0f / kScaledXYBScale[0] + 1.0f / kScaledXYBScale[1]), + 1.0f / (1.0f / kScaledXYBScale[0] + 1.0f / kScaledXYBScale[1]), + 1.0f / (1.0f / kScaledXYBScale[1] + 1.0f / kScaledXYBScale[2])}; + // 2*2*2*3 entries of 2 bytes each = 48 bytes + for (size_t ix = 0; ix < 2; ++ix) { + for (size_t iy = 0; iy < 2; ++iy) { + for (size_t ib = 0; ib < 2; ++ib) { + float in_f[3] = {ix * 1.0f, iy * 1.0f, ib * 1.0f}; + for (size_t c = 0; c < 3; ++c) { + in_f[c] /= kScaledXYBScale[c]; + in_f[c] -= kScaledXYBOffset[c]; + } + float out_f[3]; + out_f[0] = in_f[1] + in_f[0]; + out_f[1] = in_f[1] - in_f[0]; + out_f[2] = in_f[2] + in_f[1]; + for (int i = 0; i < 3; ++i) { + out_f[i] += kOffsets[i]; + out_f[i] *= kScaling[i]; + } + for (int i = 0; i < 3; ++i) { + JXL_RETURN_IF_ERROR(out_f[i] >= 0.f && out_f[i] <= 1.f); + uint16_t val = static_cast( + 0.5f + 65535 * std::max(0.f, std::min(1.f, out_f[i]))); + WriteICCUint16(val, tags->size(), tags); + } + } + } + } + // offset = 148 + // 3 curves with 5 parameters = 3 * (12 + 5 * 4) = 96 bytes + for (size_t i = 0; i < 3; ++i) { + const float b = + -kOffsets[i] - std::cbrt(jxl::kNegOpsinAbsorbanceBiasRGB[i]); + std::vector params = { + 3, + 1.0f / kScaling[i], + b, + 0, // unused + std::max(0.f, -b * kScaling[i]), // make skcms happy + }; + JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(params, 3, tags)); + } + // offset = 244 + const double matrix[] = {1.5170095, -1.1065225, 0.071623, + -0.050022, 0.5683655, -0.018344, + -1.387676, 1.1145555, 0.6857255}; + // 12 * 4 = 48 bytes + for (size_t i = 0; i < 9; ++i) { + JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(matrix[i], tags->size(), tags)); + } + for (size_t i = 0; i < 3; ++i) { + float intercept = 0; + for (size_t j = 0; j < 3; ++j) { + intercept += matrix[i * 3 + j] * jxl::kNegOpsinAbsorbanceBiasRGB[j]; + } + JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(intercept, tags->size(), tags)); + } + return true; +} +} // namespace + +Status MaybeCreateProfile(const ColorEncoding& c, + PaddedBytes* JXL_RESTRICT icc) { + PaddedBytes header, tagtable, tags; + + if (c.GetColorSpace() == ColorSpace::kUnknown || c.tf.IsUnknown()) { + return false; // Not an error + } + + switch (c.GetColorSpace()) { + case ColorSpace::kRGB: + case ColorSpace::kGray: + case ColorSpace::kXYB: + break; // OK + default: + return JXL_FAILURE("Invalid CS %u", + static_cast(c.GetColorSpace())); + } + + if (c.GetColorSpace() == ColorSpace::kXYB && + c.rendering_intent != RenderingIntent::kPerceptual) { + return JXL_FAILURE( + "Only perceptual rendering intent implemented for XYB " + "ICC profile."); + } + + JXL_RETURN_IF_ERROR(CreateICCHeader(c, &header)); + + std::vector offsets; + // tag count, deferred to later + WriteICCUint32(0, tagtable.size(), &tagtable); + + size_t tag_offset = 0, tag_size = 0; + + CreateICCMlucTag(Description(c), &tags); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("desc", tag_offset, tag_size, &tagtable, &offsets); + + const std::string copyright = "CC0"; + CreateICCMlucTag(copyright, &tags); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("cprt", tag_offset, tag_size, &tagtable, &offsets); + + // TODO(eustas): isn't it the other way round: gray image has d50 WhitePoint? + if (c.IsGray()) { + float wtpt[3]; + JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(c.GetWhitePoint(), wtpt)); + JXL_RETURN_IF_ERROR(CreateICCXYZTag(wtpt, &tags)); + } else { + float d50[3] = {0.964203, 1.0, 0.824905}; + JXL_RETURN_IF_ERROR(CreateICCXYZTag(d50, &tags)); + } + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("wtpt", tag_offset, tag_size, &tagtable, &offsets); + + if (!c.IsGray()) { + // Chromatic adaptation matrix + float chad[9]; + JXL_RETURN_IF_ERROR(CreateICCChadMatrix(c.GetWhitePoint(), chad)); + + JXL_RETURN_IF_ERROR(CreateICCChadTag(chad, &tags)); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("chad", tag_offset, tag_size, &tagtable, &offsets); + } + + if (c.GetColorSpace() == ColorSpace::kRGB) { + MaybeCreateICCCICPTag(c, &tags, &tag_offset, &tag_size, &tagtable, + &offsets); + + const PrimariesCIExy primaries = c.GetPrimaries(); + float m[9]; + JXL_RETURN_IF_ERROR(CreateICCRGBMatrix(primaries.r, primaries.g, + primaries.b, c.GetWhitePoint(), m)); + float r[3] = {m[0], m[3], m[6]}; + float g[3] = {m[1], m[4], m[7]}; + float b[3] = {m[2], m[5], m[8]}; + + JXL_RETURN_IF_ERROR(CreateICCXYZTag(r, &tags)); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("rXYZ", tag_offset, tag_size, &tagtable, &offsets); + + JXL_RETURN_IF_ERROR(CreateICCXYZTag(g, &tags)); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("gXYZ", tag_offset, tag_size, &tagtable, &offsets); + + JXL_RETURN_IF_ERROR(CreateICCXYZTag(b, &tags)); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("bXYZ", tag_offset, tag_size, &tagtable, &offsets); + } + + if (c.GetColorSpace() == ColorSpace::kXYB) { + JXL_RETURN_IF_ERROR(CreateICCLutAtoBTagForXYB(&tags)); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("A2B0", tag_offset, tag_size, &tagtable, &offsets); + } else { + if (c.tf.IsGamma()) { + float gamma = 1.0 / c.tf.GetGamma(); + JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({gamma}, 0, &tags)); + } else if (c.GetColorSpace() != ColorSpace::kXYB) { + switch (c.tf.GetTransferFunction()) { + case TransferFunction::kHLG: + CreateICCCurvCurvTag( + HWY_DYNAMIC_DISPATCH(CreateTableCurve)(4096, ExtraTF::kHLG), + &tags); + break; + case TransferFunction::kPQ: + CreateICCCurvCurvTag( + HWY_DYNAMIC_DISPATCH(CreateTableCurve)(4096, ExtraTF::kPQ), + &tags); + break; + case TransferFunction::kSRGB: + JXL_RETURN_IF_ERROR(CreateICCCurvParaTag( + {2.4, 1.0 / 1.055, 0.055 / 1.055, 1.0 / 12.92, 0.04045}, 3, + &tags)); + break; + case TransferFunction::k709: + JXL_RETURN_IF_ERROR(CreateICCCurvParaTag( + {1.0 / 0.45, 1.0 / 1.099, 0.099 / 1.099, 1.0 / 4.5, 0.081}, 3, + &tags)); + break; + case TransferFunction::kLinear: + JXL_RETURN_IF_ERROR( + CreateICCCurvParaTag({1.0, 1.0, 0.0, 1.0, 0.0}, 3, &tags)); + break; + case TransferFunction::kDCI: + JXL_RETURN_IF_ERROR( + CreateICCCurvParaTag({2.6, 1.0, 0.0, 1.0, 0.0}, 3, &tags)); + break; + default: + JXL_ABORT("Unknown TF %u", + static_cast(c.tf.GetTransferFunction())); + } + } + FinalizeICCTag(&tags, &tag_offset, &tag_size); + if (c.IsGray()) { + AddToICCTagTable("kTRC", tag_offset, tag_size, &tagtable, &offsets); + } else { + AddToICCTagTable("rTRC", tag_offset, tag_size, &tagtable, &offsets); + AddToICCTagTable("gTRC", tag_offset, tag_size, &tagtable, &offsets); + AddToICCTagTable("bTRC", tag_offset, tag_size, &tagtable, &offsets); + } + } + + // Tag count + WriteICCUint32(offsets.size(), 0, &tagtable); + for (size_t i = 0; i < offsets.size(); i++) { + WriteICCUint32(offsets[i] + header.size() + tagtable.size(), 4 + 12 * i + 4, + &tagtable); + } + + // ICC profile size + WriteICCUint32(header.size() + tagtable.size() + tags.size(), 0, &header); + + *icc = header; + icc->append(tagtable); + icc->append(tags); + + // The MD5 checksum must be computed on the profile with profile flags, + // rendering intent, and region of the checksum itself, set to 0. + // TODO(lode): manually verify with a reliable tool that this creates correct + // signature (profile id) for ICC profiles. + PaddedBytes icc_sum = *icc; + if (icc_sum.size() >= 64 + 4) { + memset(icc_sum.data() + 44, 0, 4); + memset(icc_sum.data() + 64, 0, 4); + } + uint8_t checksum[16]; + ICCComputeMD5(icc_sum, checksum); + + memcpy(icc->data() + 84, checksum, sizeof(checksum)); + + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/color_management.h b/third_party/jpeg-xl/lib/jxl/color_management.h new file mode 100644 index 0000000000..f623aa1c90 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/color_management.h @@ -0,0 +1,40 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COLOR_MANAGEMENT_H_ +#define LIB_JXL_COLOR_MANAGEMENT_H_ + +// ICC profiles and color space conversions. + +#include +#include + +#include + +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" + +namespace jxl { + +enum class ExtraTF { + kNone, + kPQ, + kHLG, + kSRGB, +}; + +// NOTE: for XYB colorspace, the created profile can be used to transform a +// *scaled* XYB image (created by ScaleXYB()) to another colorspace. +Status MaybeCreateProfile(const ColorEncoding& c, + PaddedBytes* JXL_RESTRICT icc); + +Status CIEXYZFromWhiteCIExy(const CIExy& xy, float XYZ[3]); + +} // namespace jxl + +#endif // LIB_JXL_COLOR_MANAGEMENT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/color_management_test.cc b/third_party/jpeg-xl/lib/jxl/color_management_test.cc new file mode 100644 index 0000000000..fc7a1c57ff --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/color_management_test.cc @@ -0,0 +1,405 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/color_management.h" + +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/file_io.h" +#include "lib/jxl/base/random.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { + +std::ostream& operator<<(std::ostream& os, const CIExy& xy) { + return os << "{x=" << xy.x << ", y=" << xy.y << "}"; +} + +std::ostream& operator<<(std::ostream& os, const PrimariesCIExy& primaries) { + return os << "{r=" << primaries.r << ", g=" << primaries.g + << ", b=" << primaries.b << "}"; +} + +namespace { + +using ::testing::ElementsAre; +using ::testing::FloatNear; + +// Small enough to be fast. If changed, must update Generate*. +static constexpr size_t kWidth = 16; + +static constexpr size_t kNumThreads = 1; // only have a single row. + +struct Globals { + // TODO(deymo): Make this a const. + static Globals* GetInstance() { + static Globals ret; + return &ret; + } + + private: + Globals() { + in_gray = GenerateGray(); + in_color = GenerateColor(); + out_gray = ImageF(kWidth, 1); + out_color = ImageF(kWidth * 3, 1); + + c_native = ColorEncoding::LinearSRGB(/*is_gray=*/false); + c_gray = ColorEncoding::LinearSRGB(/*is_gray=*/true); + } + + static ImageF GenerateGray() { + ImageF gray(kWidth, 1); + float* JXL_RESTRICT row = gray.Row(0); + // Increasing left to right + for (uint32_t x = 0; x < kWidth; ++x) { + row[x] = x * 1.0f / (kWidth - 1); // [0, 1] + } + return gray; + } + + static ImageF GenerateColor() { + ImageF image(kWidth * 3, 1); + float* JXL_RESTRICT interleaved = image.Row(0); + std::fill(interleaved, interleaved + kWidth * 3, 0.0f); + + // [0, 4): neutral + for (int32_t x = 0; x < 4; ++x) { + interleaved[3 * x + 0] = x * 1.0f / 3; // [0, 1] + interleaved[3 * x + 2] = interleaved[3 * x + 1] = interleaved[3 * x + 0]; + } + + // [4, 13): pure RGB with low/medium/high saturation + for (int32_t c = 0; c < 3; ++c) { + interleaved[3 * (4 + c) + c] = 0.08f + c * 0.01f; + interleaved[3 * (7 + c) + c] = 0.75f + c * 0.01f; + interleaved[3 * (10 + c) + c] = 1.0f; + } + + // [13, 16): impure, not quite saturated RGB + interleaved[3 * 13 + 0] = 0.86f; + interleaved[3 * 13 + 2] = interleaved[3 * 13 + 1] = 0.16f; + interleaved[3 * 14 + 1] = 0.87f; + interleaved[3 * 14 + 2] = interleaved[3 * 14 + 0] = 0.16f; + interleaved[3 * 15 + 2] = 0.88f; + interleaved[3 * 15 + 1] = interleaved[3 * 15 + 0] = 0.16f; + + return image; + } + + public: + // ImageF so we can use VerifyRelativeError; all are interleaved RGB. + ImageF in_gray; + ImageF in_color; + ImageF out_gray; + ImageF out_color; + ColorEncoding c_native; + ColorEncoding c_gray; +}; + +class ColorManagementTest + : public ::testing::TestWithParam { + public: + static void VerifySameFields(const ColorEncoding& c, + const ColorEncoding& c2) { + ASSERT_EQ(c.rendering_intent, c2.rendering_intent); + ASSERT_EQ(c.GetColorSpace(), c2.GetColorSpace()); + ASSERT_EQ(c.white_point, c2.white_point); + if (c.HasPrimaries()) { + ASSERT_EQ(c.primaries, c2.primaries); + } + ASSERT_TRUE(c.tf.IsSame(c2.tf)); + } + + // "Same" pixels after converting g->c_native -> c -> g->c_native. + static void VerifyPixelRoundTrip(const ColorEncoding& c) { + Globals* g = Globals::GetInstance(); + const ColorEncoding& c_native = c.IsGray() ? g->c_gray : g->c_native; + const JxlCmsInterface& cms = GetJxlCms(); + ColorSpaceTransform xform_fwd(cms); + ColorSpaceTransform xform_rev(cms); + const float intensity_target = + c.tf.IsHLG() ? 1000 : kDefaultIntensityTarget; + ASSERT_TRUE( + xform_fwd.Init(c_native, c, intensity_target, kWidth, kNumThreads)); + ASSERT_TRUE( + xform_rev.Init(c, c_native, intensity_target, kWidth, kNumThreads)); + + const size_t thread = 0; + const ImageF& in = c.IsGray() ? g->in_gray : g->in_color; + ImageF* JXL_RESTRICT out = c.IsGray() ? &g->out_gray : &g->out_color; + ASSERT_TRUE(xform_fwd.Run(thread, in.Row(0), xform_fwd.BufDst(thread))); + ASSERT_TRUE(xform_rev.Run(thread, xform_fwd.BufDst(thread), out->Row(0))); + +#if JPEGXL_ENABLE_SKCMS + double max_l1 = 7E-4; + double max_rel = 4E-7; +#else + double max_l1 = 5E-5; + // Most are lower; reached 3E-7 with D60 AP0. + double max_rel = 4E-7; +#endif + if (c.IsGray()) max_rel = 2E-5; + JXL_ASSERT_OK(VerifyRelativeError(in, *out, max_l1, max_rel, _)); + } +}; +JXL_GTEST_INSTANTIATE_TEST_SUITE_P(ColorManagementTestInstantiation, + ColorManagementTest, + ::testing::ValuesIn(test::AllEncodings())); + +// Exercises the ColorManagement interface for ALL ColorEncoding synthesizable +// via enums. +TEST_P(ColorManagementTest, VerifyAllProfiles) { + ColorEncoding c = ColorEncodingFromDescriptor(GetParam()); + printf("%s\n", Description(c).c_str()); + + // Can create profile. + ASSERT_TRUE(c.CreateICC()); + + // Can set an equivalent ColorEncoding from the generated ICC profile. + ColorEncoding c3; + ASSERT_TRUE(c3.SetICC(PaddedBytes(c.ICC()))); + VerifySameFields(c, c3); + + VerifyPixelRoundTrip(c); +} + +testing::Matcher CIExyIs(const double x, const double y) { + static constexpr double kMaxError = 1e-4; + return testing::AllOf( + testing::Field(&CIExy::x, testing::DoubleNear(x, kMaxError)), + testing::Field(&CIExy::y, testing::DoubleNear(y, kMaxError))); +} + +testing::Matcher PrimariesAre( + const testing::Matcher& r, const testing::Matcher& g, + const testing::Matcher& b) { + return testing::AllOf(testing::Field(&PrimariesCIExy::r, r), + testing::Field(&PrimariesCIExy::g, g), + testing::Field(&PrimariesCIExy::b, b)); +} + +TEST_F(ColorManagementTest, sRGBChromaticity) { + const ColorEncoding sRGB = ColorEncoding::SRGB(); + EXPECT_THAT(sRGB.GetWhitePoint(), CIExyIs(0.3127, 0.3290)); + EXPECT_THAT(sRGB.GetPrimaries(), + PrimariesAre(CIExyIs(0.64, 0.33), CIExyIs(0.30, 0.60), + CIExyIs(0.15, 0.06))); +} + +TEST_F(ColorManagementTest, D2700Chromaticity) { + PaddedBytes icc = + jxl::test::ReadTestData("jxl/color_management/sRGB-D2700.icc"); + ColorEncoding sRGB_D2700; + ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc))); + + EXPECT_THAT(sRGB_D2700.GetWhitePoint(), CIExyIs(0.45986, 0.41060)); + // The illuminant-relative chromaticities of this profile's primaries are the + // same as for sRGB. It is the PCS-relative chromaticities that would be + // different. + EXPECT_THAT(sRGB_D2700.GetPrimaries(), + PrimariesAre(CIExyIs(0.64, 0.33), CIExyIs(0.30, 0.60), + CIExyIs(0.15, 0.06))); +} + +TEST_F(ColorManagementTest, D2700ToSRGB) { + PaddedBytes icc = + jxl::test::ReadTestData("jxl/color_management/sRGB-D2700.icc"); + ColorEncoding sRGB_D2700; + ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc))); + + ColorSpaceTransform transform(GetJxlCms()); + ASSERT_TRUE(transform.Init(sRGB_D2700, ColorEncoding::SRGB(), + kDefaultIntensityTarget, 1, 1)); + const float sRGB_D2700_values[3] = {0.863, 0.737, 0.490}; + float sRGB_values[3]; + ASSERT_TRUE(transform.Run(0, sRGB_D2700_values, sRGB_values)); + EXPECT_THAT(sRGB_values, + ElementsAre(FloatNear(0.914, 1e-3), FloatNear(0.745, 1e-3), + FloatNear(0.601, 1e-3))); +} + +TEST_F(ColorManagementTest, P3HlgTo2020Hlg) { + ColorEncoding p3_hlg; + p3_hlg.SetColorSpace(ColorSpace::kRGB); + p3_hlg.white_point = WhitePoint::kD65; + p3_hlg.primaries = Primaries::kP3; + p3_hlg.tf.SetTransferFunction(TransferFunction::kHLG); + ASSERT_TRUE(p3_hlg.CreateICC()); + + ColorEncoding rec2020_hlg = p3_hlg; + rec2020_hlg.primaries = Primaries::k2100; + ASSERT_TRUE(rec2020_hlg.CreateICC()); + + ColorSpaceTransform transform(GetJxlCms()); + ASSERT_TRUE(transform.Init(p3_hlg, rec2020_hlg, 1000, 1, 1)); + const float p3_hlg_values[3] = {0., 0.75, 0.}; + float rec2020_hlg_values[3]; + ASSERT_TRUE(transform.Run(0, p3_hlg_values, rec2020_hlg_values)); + EXPECT_THAT(rec2020_hlg_values, + ElementsAre(FloatNear(0.3973, 1e-4), FloatNear(0.7382, 1e-4), + FloatNear(0.1183, 1e-4))); +} + +TEST_F(ColorManagementTest, HlgOotf) { + ColorEncoding p3_hlg; + p3_hlg.SetColorSpace(ColorSpace::kRGB); + p3_hlg.white_point = WhitePoint::kD65; + p3_hlg.primaries = Primaries::kP3; + p3_hlg.tf.SetTransferFunction(TransferFunction::kHLG); + ASSERT_TRUE(p3_hlg.CreateICC()); + + ColorSpaceTransform transform_to_1000(GetJxlCms()); + ASSERT_TRUE( + transform_to_1000.Init(p3_hlg, ColorEncoding::LinearSRGB(), 1000, 1, 1)); + // HDR reference white: https://www.itu.int/pub/R-REP-BT.2408-4-2021 + float p3_hlg_values[3] = {0.75, 0.75, 0.75}; + float linear_srgb_values[3]; + ASSERT_TRUE(transform_to_1000.Run(0, p3_hlg_values, linear_srgb_values)); + // On a 1000-nit display, HDR reference white should be 203 cd/m² which is + // 0.203 times the maximum. + EXPECT_THAT(linear_srgb_values, + ElementsAre(FloatNear(0.203, 1e-3), FloatNear(0.203, 1e-3), + FloatNear(0.203, 1e-3))); + + ColorSpaceTransform transform_to_400(GetJxlCms()); + ASSERT_TRUE( + transform_to_400.Init(p3_hlg, ColorEncoding::LinearSRGB(), 400, 1, 1)); + ASSERT_TRUE(transform_to_400.Run(0, p3_hlg_values, linear_srgb_values)); + // On a 400-nit display, it should be 100 cd/m². + EXPECT_THAT(linear_srgb_values, + ElementsAre(FloatNear(0.250, 1e-3), FloatNear(0.250, 1e-3), + FloatNear(0.250, 1e-3))); + + p3_hlg_values[2] = 0.50; + ASSERT_TRUE(transform_to_1000.Run(0, p3_hlg_values, linear_srgb_values)); + EXPECT_THAT(linear_srgb_values, + ElementsAre(FloatNear(0.201, 1e-3), FloatNear(0.201, 1e-3), + FloatNear(0.050, 1e-3))); + + ColorSpaceTransform transform_from_400(GetJxlCms()); + ASSERT_TRUE( + transform_from_400.Init(ColorEncoding::LinearSRGB(), p3_hlg, 400, 1, 1)); + linear_srgb_values[0] = linear_srgb_values[1] = linear_srgb_values[2] = 0.250; + ASSERT_TRUE(transform_from_400.Run(0, linear_srgb_values, p3_hlg_values)); + EXPECT_THAT(p3_hlg_values, + ElementsAre(FloatNear(0.75, 1e-3), FloatNear(0.75, 1e-3), + FloatNear(0.75, 1e-3))); + + ColorEncoding grayscale_hlg; + grayscale_hlg.SetColorSpace(ColorSpace::kGray); + grayscale_hlg.white_point = WhitePoint::kD65; + grayscale_hlg.tf.SetTransferFunction(TransferFunction::kHLG); + ASSERT_TRUE(grayscale_hlg.CreateICC()); + + ColorSpaceTransform grayscale_transform(GetJxlCms()); + ASSERT_TRUE(grayscale_transform.Init( + grayscale_hlg, ColorEncoding::LinearSRGB(/*is_gray=*/true), 1000, 1, 1)); + const float grayscale_hlg_value = 0.75; + float linear_grayscale_value; + ASSERT_TRUE(grayscale_transform.Run(0, &grayscale_hlg_value, + &linear_grayscale_value)); + EXPECT_THAT(linear_grayscale_value, FloatNear(0.203, 1e-3)); +} + +TEST_F(ColorManagementTest, XYBProfile) { + ColorEncoding c_xyb; + c_xyb.SetColorSpace(ColorSpace::kXYB); + c_xyb.rendering_intent = RenderingIntent::kPerceptual; + ASSERT_TRUE(c_xyb.CreateICC()); + ColorEncoding c_native = ColorEncoding::LinearSRGB(false); + + static const size_t kGridDim = 17; + static const size_t kNumColors = kGridDim * kGridDim * kGridDim; + const JxlCmsInterface& cms = GetJxlCms(); + ColorSpaceTransform xform(cms); + ASSERT_TRUE( + xform.Init(c_xyb, c_native, kDefaultIntensityTarget, kNumColors, 1)); + + ImageMetadata metadata; + metadata.color_encoding = c_native; + ImageBundle ib(&metadata); + Image3F native(kNumColors, 1); + float mul = 1.0f / (kGridDim - 1); + for (size_t ir = 0, x = 0; ir < kGridDim; ++ir) { + for (size_t ig = 0; ig < kGridDim; ++ig) { + for (size_t ib = 0; ib < kGridDim; ++ib, ++x) { + native.PlaneRow(0, 0)[x] = ir * mul; + native.PlaneRow(1, 0)[x] = ig * mul; + native.PlaneRow(2, 0)[x] = ib * mul; + } + } + } + ib.SetFromImage(std::move(native), c_native); + const Image3F& in = *ib.color(); + Image3F opsin(kNumColors, 1); + ToXYB(ib, nullptr, &opsin, cms, nullptr); + + Image3F opsin2 = CopyImage(opsin); + ScaleXYB(&opsin2); + + float* src = xform.BufSrc(0); + for (size_t i = 0; i < kNumColors; ++i) { + for (size_t c = 0; c < 3; ++c) { + src[3 * i + c] = opsin2.PlaneRow(c, 0)[i]; + } + } + + float* dst = xform.BufDst(0); + ASSERT_TRUE(xform.Run(0, src, dst)); + + Image3F out(kNumColors, 1); + for (size_t i = 0; i < kNumColors; ++i) { + for (size_t c = 0; c < 3; ++c) { + out.PlaneRow(c, 0)[i] = dst[3 * i + c]; + } + } + + auto debug_print_color = [&](size_t i) { + printf( + "(%f, %f, %f) -> (%9.6f, %f, %f) -> (%f, %f, %f) -> " + "(%9.6f, %9.6f, %9.6f)", + in.PlaneRow(0, 0)[i], in.PlaneRow(1, 0)[i], in.PlaneRow(2, 0)[i], + opsin.PlaneRow(0, 0)[i], opsin.PlaneRow(1, 0)[i], + opsin.PlaneRow(2, 0)[i], opsin2.PlaneRow(0, 0)[i], + opsin2.PlaneRow(1, 0)[i], opsin2.PlaneRow(2, 0)[i], + out.PlaneRow(0, 0)[i], out.PlaneRow(1, 0)[i], out.PlaneRow(2, 0)[i]); + }; + + float max_err[3] = {}; + size_t max_err_i[3] = {}; + for (size_t i = 0; i < kNumColors; ++i) { + for (size_t c = 0; c < 3; ++c) { + // debug_print_color(i); printf("\n"); + float err = std::abs(in.PlaneRow(c, 0)[i] - out.PlaneRow(c, 0)[i]); + if (err > max_err[c]) { + max_err[c] = err; + max_err_i[c] = i; + } + } + } + static float kMaxError[3] = {9e-4, 4e-4, 5e-4}; + printf("Maximum errors:\n"); + for (size_t c = 0; c < 3; ++c) { + debug_print_color(max_err_i[c]); + printf(" %f\n", max_err[c]); + EXPECT_LT(max_err[c], kMaxError[c]); + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/common.h b/third_party/jpeg-xl/lib/jxl/common.h new file mode 100644 index 0000000000..c2ebe029a8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/common.h @@ -0,0 +1,245 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COMMON_H_ +#define LIB_JXL_COMMON_H_ + +// Shared constants and helper functions. + +#include +#include +#include + +#include // numeric_limits +#include // unique_ptr +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" + +#ifndef JXL_HIGH_PRECISION +#define JXL_HIGH_PRECISION 1 +#endif + +// Macro that defines whether support for decoding JXL files to JPEG is enabled. +#ifndef JPEGXL_ENABLE_TRANSCODE_JPEG +#define JPEGXL_ENABLE_TRANSCODE_JPEG 1 +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +// Macro that defines whether support for decoding boxes is enabled. +#ifndef JPEGXL_ENABLE_BOXES +#define JPEGXL_ENABLE_BOXES 1 +#endif // JPEGXL_ENABLE_BOXES + +namespace jxl { +// Some enums and typedefs used by more than one header file. + +constexpr size_t kBitsPerByte = 8; // more clear than CHAR_BIT + +constexpr inline size_t RoundUpBitsToByteMultiple(size_t bits) { + return (bits + 7) & ~size_t(7); +} + +constexpr inline size_t RoundUpToBlockDim(size_t dim) { + return (dim + 7) & ~size_t(7); +} + +static inline bool JXL_MAYBE_UNUSED SafeAdd(const uint64_t a, const uint64_t b, + uint64_t& sum) { + sum = a + b; + return sum >= a; // no need to check b - either sum >= both or < both. +} + +template +constexpr inline T1 DivCeil(T1 a, T2 b) { + return (a + b - 1) / b; +} + +// Works for any `align`; if a power of two, compiler emits ADD+AND. +constexpr inline size_t RoundUpTo(size_t what, size_t align) { + return DivCeil(what, align) * align; +} + +constexpr double kPi = 3.14159265358979323846264338327950288; + +// Reasonable default for sRGB, matches common monitors. We map white to this +// many nits (cd/m^2) by default. Butteraugli was tuned for 250 nits, which is +// very close. +static constexpr float kDefaultIntensityTarget = 255; + +template +constexpr T Pi(T multiplier) { + return static_cast(multiplier * kPi); +} + +// Block is the square grid of pixels to which an "energy compaction" +// transformation (e.g. DCT) is applied. Each block has its own AC quantizer. +constexpr size_t kBlockDim = 8; + +constexpr size_t kDCTBlockSize = kBlockDim * kBlockDim; + +constexpr size_t kGroupDim = 256; +static_assert(kGroupDim % kBlockDim == 0, + "Group dim should be divisible by block dim"); +constexpr size_t kGroupDimInBlocks = kGroupDim / kBlockDim; + +// Maximum number of passes in an image. +constexpr size_t kMaxNumPasses = 11; + +// Maximum number of reference frames. +constexpr size_t kMaxNumReferenceFrames = 4; + +// Dimensions of a frame, in pixels, and other derived dimensions. +// Computed from FrameHeader. +// TODO(veluca): add extra channels. +struct FrameDimensions { + void Set(size_t xsize, size_t ysize, size_t group_size_shift, + size_t max_hshift, size_t max_vshift, bool modular_mode, + size_t upsampling) { + group_dim = (kGroupDim >> 1) << group_size_shift; + dc_group_dim = group_dim * kBlockDim; + xsize_upsampled = xsize; + ysize_upsampled = ysize; + this->xsize = DivCeil(xsize, upsampling); + this->ysize = DivCeil(ysize, upsampling); + xsize_blocks = DivCeil(this->xsize, kBlockDim << max_hshift) << max_hshift; + ysize_blocks = DivCeil(this->ysize, kBlockDim << max_vshift) << max_vshift; + xsize_padded = xsize_blocks * kBlockDim; + ysize_padded = ysize_blocks * kBlockDim; + if (modular_mode) { + // Modular mode doesn't have any padding. + xsize_padded = this->xsize; + ysize_padded = this->ysize; + } + xsize_upsampled_padded = xsize_padded * upsampling; + ysize_upsampled_padded = ysize_padded * upsampling; + xsize_groups = DivCeil(this->xsize, group_dim); + ysize_groups = DivCeil(this->ysize, group_dim); + xsize_dc_groups = DivCeil(xsize_blocks, group_dim); + ysize_dc_groups = DivCeil(ysize_blocks, group_dim); + num_groups = xsize_groups * ysize_groups; + num_dc_groups = xsize_dc_groups * ysize_dc_groups; + } + + // Image size without any upsampling, i.e. original_size / upsampling. + size_t xsize; + size_t ysize; + // Original image size. + size_t xsize_upsampled; + size_t ysize_upsampled; + // Image size after upsampling the padded image. + size_t xsize_upsampled_padded; + size_t ysize_upsampled_padded; + // Image size after padding to a multiple of kBlockDim (if VarDCT mode). + size_t xsize_padded; + size_t ysize_padded; + // Image size in kBlockDim blocks. + size_t xsize_blocks; + size_t ysize_blocks; + // Image size in number of groups. + size_t xsize_groups; + size_t ysize_groups; + // Image size in number of DC groups. + size_t xsize_dc_groups; + size_t ysize_dc_groups; + // Number of AC or DC groups. + size_t num_groups; + size_t num_dc_groups; + // Size of a group. + size_t group_dim; + size_t dc_group_dim; +}; + +// Prior to C++14 (i.e. C++11): provide our own make_unique +#if __cplusplus < 201402L +template +std::unique_ptr make_unique(Args&&... args) { + return std::unique_ptr(new T(std::forward(args)...)); +} +#else +using std::make_unique; +#endif + +template +JXL_INLINE T Clamp1(T val, T low, T hi) { + return val < low ? low : val > hi ? hi : val; +} + +// Encodes non-negative (X) into (2 * X), negative (-X) into (2 * X - 1) +constexpr uint32_t PackSigned(int32_t value) + JXL_NO_SANITIZE("unsigned-integer-overflow") { + return (static_cast(value) << 1) ^ + ((static_cast(~value) >> 31) - 1); +} + +// Reverse to PackSigned, i.e. UnpackSigned(PackSigned(X)) == X. +// (((~value) & 1) - 1) is either 0 or 0xFF...FF and it will have an expected +// unsigned-integer-overflow. +constexpr intptr_t UnpackSigned(size_t value) + JXL_NO_SANITIZE("unsigned-integer-overflow") { + return static_cast((value >> 1) ^ (((~value) & 1) - 1)); +} + +// conversion from integer to string. +template +std::string ToString(T n) { + char data[32] = {}; + if (T(0.1) != T(0)) { + // float + snprintf(data, sizeof(data), "%g", static_cast(n)); + } else if (T(-1) > T(0)) { + // unsigned + snprintf(data, sizeof(data), "%llu", static_cast(n)); + } else { + // signed + snprintf(data, sizeof(data), "%lld", static_cast(n)); + } + return data; +} + +static inline JXL_MAYBE_UNUSED uint64_t DecodeVarInt(const uint8_t* input, + size_t inputSize, + size_t* pos) { + size_t i; + uint64_t ret = 0; + for (i = 0; *pos + i < inputSize && i < 10; ++i) { + ret |= uint64_t(input[*pos + i] & 127) << uint64_t(7 * i); + // If the next-byte flag is not set, stop + if ((input[*pos + i] & 128) == 0) break; + } + // TODO: Return a decoding error if i == 10. + *pos += i + 1; + return ret; +} + +static inline JXL_MAYBE_UNUSED bool EncodeVarInt(uint64_t value, + size_t output_size, + size_t* output_pos, + uint8_t* output) { + // While more than 7 bits of data are left, + // store 7 bits and set the next byte flag + while (value > 127) { + if (*output_pos > output_size) return false; + // |128: Set the next byte flag + output[(*output_pos)++] = ((uint8_t)(value & 127)) | 128; + // Remove the seven bits we just wrote + value >>= 7; + } + if (*output_pos > output_size) return false; + output[(*output_pos)++] = ((uint8_t)value) & 127; + return true; +} + +static inline JXL_MAYBE_UNUSED void EncodeVarInt(uint64_t value, + PaddedBytes* data) { + size_t pos = data->size(); + data->resize(data->size() + 9); + JXL_CHECK(EncodeVarInt(value, data->size(), &pos, data->data())); + data->resize(pos); +} + +} // namespace jxl + +#endif // LIB_JXL_COMMON_H_ diff --git a/third_party/jpeg-xl/lib/jxl/compressed_dc.cc b/third_party/jpeg-xl/lib/jxl/compressed_dc.cc new file mode 100644 index 0000000000..f9a8f149dd --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/compressed_dc.cc @@ -0,0 +1,318 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/compressed_dc.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/compressed_dc.cc" +#include +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/image.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +using D = HWY_FULL(float); +using DScalar = HWY_CAPPED(float, 1); + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Abs; +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Div; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::Sub; +using hwy::HWY_NAMESPACE::Vec; +using hwy::HWY_NAMESPACE::ZeroIfNegative; + +// TODO(veluca): optimize constants. +const float w1 = 0.20345139757231578f; +const float w2 = 0.0334829185968739f; +const float w0 = 1.0f - 4.0f * (w1 + w2); + +template +V MaxWorkaround(V a, V b) { +#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800 + // Prevents "Do not know how to split the result of this operator" error + return IfThenElse(a > b, a, b); +#else + return Max(a, b); +#endif +} + +template +JXL_INLINE void ComputePixelChannel(const D d, const float dc_factor, + const float* JXL_RESTRICT row_top, + const float* JXL_RESTRICT row, + const float* JXL_RESTRICT row_bottom, + Vec* JXL_RESTRICT mc, + Vec* JXL_RESTRICT sm, + Vec* JXL_RESTRICT gap, size_t x) { + const auto tl = LoadU(d, row_top + x - 1); + const auto tc = Load(d, row_top + x); + const auto tr = LoadU(d, row_top + x + 1); + + const auto ml = LoadU(d, row + x - 1); + *mc = Load(d, row + x); + const auto mr = LoadU(d, row + x + 1); + + const auto bl = LoadU(d, row_bottom + x - 1); + const auto bc = Load(d, row_bottom + x); + const auto br = LoadU(d, row_bottom + x + 1); + + const auto w_center = Set(d, w0); + const auto w_side = Set(d, w1); + const auto w_corner = Set(d, w2); + + const auto corner = Add(Add(tl, tr), Add(bl, br)); + const auto side = Add(Add(ml, mr), Add(tc, bc)); + *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center))); + + const auto dc_quant = Set(d, dc_factor); + *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant))); +} + +template +JXL_INLINE void ComputePixel( + const float* JXL_RESTRICT dc_factors, + const float* JXL_RESTRICT* JXL_RESTRICT rows_top, + const float* JXL_RESTRICT* JXL_RESTRICT rows, + const float* JXL_RESTRICT* JXL_RESTRICT rows_bottom, + float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) { + const D d; + auto mc_x = Undefined(d); + auto mc_y = Undefined(d); + auto mc_b = Undefined(d); + auto sm_x = Undefined(d); + auto sm_y = Undefined(d); + auto sm_b = Undefined(d); + auto gap = Set(d, 0.5f); + ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0], + &mc_x, &sm_x, &gap, x); + ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1], + &mc_y, &sm_y, &gap, x); + ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2], + &mc_b, &sm_b, &gap, x); + auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f)); + factor = ZeroIfNegative(factor); + + auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x); + Store(out, d, out_rows[0] + x); + out = MulAdd(Sub(sm_y, mc_y), factor, mc_y); + Store(out, d, out_rows[1] + x); + out = MulAdd(Sub(sm_b, mc_b), factor, mc_b); + Store(out, d, out_rows[2] + x); +} + +void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc, + ThreadPool* pool) { + const size_t xsize = dc->xsize(); + const size_t ysize = dc->ysize(); + if (ysize <= 2 || xsize <= 2) return; + + // TODO(veluca): use tile-based processing? + // TODO(veluca): decide if changes to the y channel should be propagated to + // the x and b channels through color correlation. + JXL_ASSERT(w1 + w2 < 0.25f); + + PROFILER_FUNC; + + Image3F smoothed(xsize, ysize); + // Fill in borders that the loop below will not. First and last are unused. + for (size_t c = 0; c < 3; c++) { + for (size_t y : {size_t(0), ysize - 1}) { + memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y), + xsize * sizeof(float)); + } + } + auto process_row = [&](const uint32_t y, size_t /*thread*/) { + const float* JXL_RESTRICT rows_top[3]{ + dc->ConstPlaneRow(0, y - 1), + dc->ConstPlaneRow(1, y - 1), + dc->ConstPlaneRow(2, y - 1), + }; + const float* JXL_RESTRICT rows[3] = { + dc->ConstPlaneRow(0, y), + dc->ConstPlaneRow(1, y), + dc->ConstPlaneRow(2, y), + }; + const float* JXL_RESTRICT rows_bottom[3] = { + dc->ConstPlaneRow(0, y + 1), + dc->ConstPlaneRow(1, y + 1), + dc->ConstPlaneRow(2, y + 1), + }; + float* JXL_RESTRICT rows_out[3] = { + smoothed.PlaneRow(0, y), + smoothed.PlaneRow(1, y), + smoothed.PlaneRow(2, y), + }; + for (size_t x : {size_t(0), xsize - 1}) { + for (size_t c = 0; c < 3; c++) { + rows_out[c][x] = rows[c][x]; + } + } + + size_t x = 1; + // First pixels + const size_t N = Lanes(D()); + for (; x < std::min(N, xsize - 1); x++) { + ComputePixel(dc_factors, rows_top, rows, rows_bottom, rows_out, + x); + } + // Full vectors. + for (; x + N <= xsize - 1; x += N) { + ComputePixel(dc_factors, rows_top, rows, rows_bottom, rows_out, x); + } + // Last pixels. + for (; x < xsize - 1; x++) { + ComputePixel(dc_factors, rows_top, rows, rows_bottom, rows_out, + x); + } + }; + JXL_CHECK(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit, process_row, + "DCSmoothingRow")); + dc->Swap(smoothed); +} + +// DC dequantization. +void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in, + const float* dc_factors, float mul, const float* cfl_factors, + YCbCrChromaSubsampling chroma_subsampling, + const BlockCtxMap& bctx) { + const HWY_FULL(float) df; + const Rebind di; // assumes pixel_type <= float + if (chroma_subsampling.Is444()) { + const auto fac_x = Set(df, dc_factors[0] * mul); + const auto fac_y = Set(df, dc_factors[1] * mul); + const auto fac_b = Set(df, dc_factors[2] * mul); + const auto cfl_fac_x = Set(df, cfl_factors[0]); + const auto cfl_fac_b = Set(df, cfl_factors[2]); + for (size_t y = 0; y < r.ysize(); y++) { + float* dec_row_x = r.PlaneRow(dc, 0, y); + float* dec_row_y = r.PlaneRow(dc, 1, y); + float* dec_row_b = r.PlaneRow(dc, 2, y); + const int32_t* quant_row_x = in.channel[1].plane.Row(y); + const int32_t* quant_row_y = in.channel[0].plane.Row(y); + const int32_t* quant_row_b = in.channel[2].plane.Row(y); + for (size_t x = 0; x < r.xsize(); x += Lanes(di)) { + const auto in_q_x = Load(di, quant_row_x + x); + const auto in_q_y = Load(di, quant_row_y + x); + const auto in_q_b = Load(di, quant_row_b + x); + const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x); + const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y); + const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b); + Store(in_y, df, dec_row_y + x); + Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x); + Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x); + } + } + } else { + for (size_t c : {1, 0, 2}) { + Rect rect(r.x0() >> chroma_subsampling.HShift(c), + r.y0() >> chroma_subsampling.VShift(c), + r.xsize() >> chroma_subsampling.HShift(c), + r.ysize() >> chroma_subsampling.VShift(c)); + const auto fac = Set(df, dc_factors[c] * mul); + const Channel& ch = in.channel[c < 2 ? c ^ 1 : c]; + for (size_t y = 0; y < rect.ysize(); y++) { + const int32_t* quant_row = ch.plane.Row(y); + float* row = rect.PlaneRow(dc, c, y); + for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) { + const auto in_q = Load(di, quant_row + x); + const auto in = Mul(ConvertTo(df, in_q), fac); + Store(in, df, row + x); + } + } + } + } + if (bctx.num_dc_ctxs <= 1) { + for (size_t y = 0; y < r.ysize(); y++) { + uint8_t* qdc_row = r.Row(quant_dc, y); + memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize()); + } + } else { + for (size_t y = 0; y < r.ysize(); y++) { + uint8_t* qdc_row_val = r.Row(quant_dc, y); + const int32_t* quant_row_x = + in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0)); + const int32_t* quant_row_y = + in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1)); + const int32_t* quant_row_b = + in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2)); + for (size_t x = 0; x < r.xsize(); x++) { + int bucket_x = 0, bucket_y = 0, bucket_b = 0; + for (int t : bctx.dc_thresholds[0]) { + if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++; + } + for (int t : bctx.dc_thresholds[1]) { + if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++; + } + for (int t : bctx.dc_thresholds[2]) { + if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++; + } + int bucket = bucket_x; + bucket *= bctx.dc_thresholds[2].size() + 1; + bucket += bucket_b; + bucket *= bctx.dc_thresholds[1].size() + 1; + bucket += bucket_y; + qdc_row_val[x] = bucket; + } + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(DequantDC); +HWY_EXPORT(AdaptiveDCSmoothing); +void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc, + ThreadPool* pool) { + return HWY_DYNAMIC_DISPATCH(AdaptiveDCSmoothing)(dc_factors, dc, pool); +} + +void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in, + const float* dc_factors, float mul, const float* cfl_factors, + YCbCrChromaSubsampling chroma_subsampling, + const BlockCtxMap& bctx) { + return HWY_DYNAMIC_DISPATCH(DequantDC)(r, dc, quant_dc, in, dc_factors, mul, + cfl_factors, chroma_subsampling, bctx); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/compressed_dc.h b/third_party/jpeg-xl/lib/jxl/compressed_dc.h new file mode 100644 index 0000000000..b06e5931f0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/compressed_dc.h @@ -0,0 +1,34 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COMPRESSED_DC_H_ +#define LIB_JXL_COMPRESSED_DC_H_ + +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/modular/modular_image.h" + +// DC handling functions: encoding and decoding of DC to and from bitstream, and +// related function to initialize the per-group decoder cache. + +namespace jxl { + +// Smooth DC in already-smooth areas, to counteract banding. +void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc, + ThreadPool* pool); + +void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in, + const float* dc_factors, float mul, const float* cfl_factors, + YCbCrChromaSubsampling chroma_subsampling, + const BlockCtxMap& bctx); + +} // namespace jxl + +#endif // LIB_JXL_COMPRESSED_DC_H_ diff --git a/third_party/jpeg-xl/lib/jxl/convolve-inl.h b/third_party/jpeg-xl/lib/jxl/convolve-inl.h new file mode 100644 index 0000000000..054c9c6f0d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/convolve-inl.h @@ -0,0 +1,297 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JXL_CONVOLVE_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_CONVOLVE_INL_H_ +#undef LIB_JXL_CONVOLVE_INL_H_ +#else +#define LIB_JXL_CONVOLVE_INL_H_ +#endif + +#include + +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image_ops.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Broadcast; +#if HWY_TARGET != HWY_SCALAR +using hwy::HWY_NAMESPACE::CombineShiftRightBytes; +#endif +using hwy::HWY_NAMESPACE::TableLookupLanes; +using hwy::HWY_NAMESPACE::Vec; + +// Synthesizes left/right neighbors from a vector of center pixels. +class Neighbors { + public: + using D = HWY_CAPPED(float, 16); + using V = Vec; + + // Returns l[i] == c[Mirror(i - 1)]. + HWY_INLINE HWY_MAYBE_UNUSED static V FirstL1(const V c) { +#if HWY_CAP_GE256 + const D d; + HWY_ALIGN constexpr int32_t lanes[16] = {0, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14}; + const auto indices = SetTableIndices(d, lanes); + // c = PONM'LKJI + return TableLookupLanes(c, indices); // ONML'KJII +#elif HWY_TARGET == HWY_SCALAR + return c; // Same (the first mirrored value is the last valid one) +#else // 128 bit + // c = LKJI +#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86) + return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(2, 1, 0, 0))}; // KJII +#else + const D d; + // TODO(deymo): Figure out if this can be optimized using a single vsri + // instruction to convert LKJI to KJII. + HWY_ALIGN constexpr int lanes[4] = {0, 0, 1, 2}; // KJII + const auto indices = SetTableIndices(d, lanes); + return TableLookupLanes(c, indices); +#endif +#endif + } + + // Returns l[i] == c[Mirror(i - 2)]. + HWY_INLINE HWY_MAYBE_UNUSED static V FirstL2(const V c) { +#if HWY_CAP_GE256 + const D d; + HWY_ALIGN constexpr int32_t lanes[16] = {1, 0, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13}; + const auto indices = SetTableIndices(d, lanes); + // c = PONM'LKJI + return TableLookupLanes(c, indices); // NMLK'JIIJ +#elif HWY_TARGET == HWY_SCALAR + const D d; + JXL_ASSERT(false); // unsupported, avoid calling this. + return Zero(d); +#else // 128 bit + // c = LKJI +#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86) + return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(1, 0, 0, 1))}; // JIIJ +#else + const D d; + HWY_ALIGN constexpr int lanes[4] = {1, 0, 0, 1}; // JIIJ + const auto indices = SetTableIndices(d, lanes); + return TableLookupLanes(c, indices); +#endif +#endif + } + + // Returns l[i] == c[Mirror(i - 3)]. + HWY_INLINE HWY_MAYBE_UNUSED static V FirstL3(const V c) { +#if HWY_CAP_GE256 + const D d; + HWY_ALIGN constexpr int32_t lanes[16] = {2, 1, 0, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12}; + const auto indices = SetTableIndices(d, lanes); + // c = PONM'LKJI + return TableLookupLanes(c, indices); // MLKJ'IIJK +#elif HWY_TARGET == HWY_SCALAR + const D d; + JXL_ASSERT(false); // unsupported, avoid calling this. + return Zero(d); +#else // 128 bit + // c = LKJI +#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86) + return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(0, 0, 1, 2))}; // IIJK +#else + const D d; + HWY_ALIGN constexpr int lanes[4] = {2, 1, 0, 0}; // IIJK + const auto indices = SetTableIndices(d, lanes); + return TableLookupLanes(c, indices); +#endif +#endif + } +}; + +#if HWY_TARGET != HWY_SCALAR + +// Returns indices for SetTableIndices such that TableLookupLanes on the +// rightmost unaligned vector (rightmost sample in its most-significant lane) +// returns the mirrored values, with the mirror outside the last valid sample. +static inline const int32_t* MirrorLanes(const size_t mod) { + const HWY_CAPPED(float, 16) d; + constexpr size_t kN = MaxLanes(d); + + // For mod = `image width mod 16` 0..15: + // last full vec mirrored (mem order) loadedVec mirrorVec idxVec + // 0123456789abcdef| fedcba9876543210 fed..210 012..def 012..def + // 0123456789abcdef|0 0fedcba98765432 0fe..321 234..f00 123..eff + // 0123456789abcdef|01 10fedcba987654 10f..432 456..110 234..ffe + // 0123456789abcdef|012 210fedcba9876 210..543 67..2210 34..ffed + // 0123456789abcdef|0123 3210fedcba98 321..654 8..33210 4..ffedc + // 0123456789abcdef|01234 43210fedcba + // 0123456789abcdef|012345 543210fedc + // 0123456789abcdef|0123456 6543210fe + // 0123456789abcdef|01234567 76543210 + // 0123456789abcdef|012345678 8765432 + // 0123456789abcdef|0123456789 987654 + // 0123456789abcdef|0123456789A A9876 + // 0123456789abcdef|0123456789AB BA98 + // 0123456789abcdef|0123456789ABC CBA + // 0123456789abcdef|0123456789ABCD DC + // 0123456789abcdef|0123456789ABCDE E EDC..10f EED..210 ffe..321 +#if HWY_CAP_GE512 + HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, // + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; +#elif HWY_CAP_GE256 + HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = { + 1, 2, 3, 4, 5, 6, 7, 7, // + 6, 5, 4, 3, 2, 1, 0}; +#else // 128-bit + HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {1, 2, 3, 3, // + 2, 1, 0}; +#endif + return idx_lanes + kN - 1 - mod; +} + +#endif // HWY_TARGET != HWY_SCALAR + +// Single entry point for convolution. +// "Strategy" (Direct*/Separable*) decides kernel size and how to evaluate it. +template +class ConvolveT { + static constexpr int64_t kRadius = Strategy::kRadius; + using Simd = HWY_CAPPED(float, 16); + + public: + static size_t MinWidth() { +#if HWY_TARGET == HWY_SCALAR + // First/Last use mirrored loads of up to +/- kRadius. + return 2 * kRadius; +#else + return Lanes(Simd()) + kRadius; +#endif + } + + // "Image" is ImageF or Image3F. + template + static void Run(const Image& in, const Rect& rect, const Weights& weights, + ThreadPool* pool, Image* out) { + PROFILER_ZONE("ConvolveT::Run"); + JXL_CHECK(SameSize(rect, *out)); + JXL_CHECK(rect.xsize() >= MinWidth()); + + static_assert(int64_t(kRadius) <= 3, + "Must handle [0, kRadius) and >= kRadius"); + switch (rect.xsize() % Lanes(Simd())) { + case 0: + return RunRows<0>(in, rect, weights, pool, out); + case 1: + return RunRows<1>(in, rect, weights, pool, out); + case 2: + return RunRows<2>(in, rect, weights, pool, out); + default: + return RunRows<3>(in, rect, weights, pool, out); + } + } + + private: + template + static JXL_INLINE void RunRow(const float* JXL_RESTRICT in, + const size_t xsize, const int64_t stride, + const WrapRow& wrap_row, const Weights& weights, + float* JXL_RESTRICT out) { + Strategy::template ConvolveRow(in, xsize, stride, wrap_row, + weights, out); + } + + template + static JXL_INLINE void RunBorderRows(const ImageF& in, const Rect& rect, + const int64_t ybegin, const int64_t yend, + const Weights& weights, ImageF* out) { + const int64_t stride = in.PixelsPerRow(); + const WrapRowMirror wrap_row(in, rect.ysize()); + for (int64_t y = ybegin; y < yend; ++y) { + RunRow(rect.ConstRow(in, y), rect.xsize(), stride, wrap_row, + weights, out->Row(y)); + } + } + + // Image3F. + template + static JXL_INLINE void RunBorderRows(const Image3F& in, const Rect& rect, + const int64_t ybegin, const int64_t yend, + const Weights& weights, Image3F* out) { + const int64_t stride = in.PixelsPerRow(); + for (int64_t y = ybegin; y < yend; ++y) { + for (size_t c = 0; c < 3; ++c) { + const WrapRowMirror wrap_row(in.Plane(c), rect.ysize()); + RunRow(rect.ConstPlaneRow(in, c, y), rect.xsize(), stride, + wrap_row, weights, out->PlaneRow(c, y)); + } + } + } + + template + static JXL_INLINE void RunInteriorRows(const ImageF& in, const Rect& rect, + const int64_t ybegin, + const int64_t yend, + const Weights& weights, + ThreadPool* pool, ImageF* out) { + const int64_t stride = in.PixelsPerRow(); + JXL_CHECK(RunOnPool( + pool, ybegin, yend, ThreadPool::NoInit, + [&](const uint32_t y, size_t /*thread*/) HWY_ATTR { + RunRow(rect.ConstRow(in, y), rect.xsize(), stride, + WrapRowUnchanged(), weights, out->Row(y)); + }, + "Convolve")); + } + + // Image3F. + template + static JXL_INLINE void RunInteriorRows(const Image3F& in, const Rect& rect, + const int64_t ybegin, + const int64_t yend, + const Weights& weights, + ThreadPool* pool, Image3F* out) { + const int64_t stride = in.PixelsPerRow(); + JXL_CHECK(RunOnPool( + pool, ybegin, yend, ThreadPool::NoInit, + [&](const uint32_t y, size_t /*thread*/) HWY_ATTR { + for (size_t c = 0; c < 3; ++c) { + RunRow(rect.ConstPlaneRow(in, c, y), rect.xsize(), + stride, WrapRowUnchanged(), weights, + out->PlaneRow(c, y)); + } + }, + "Convolve3")); + } + + template + static JXL_INLINE void RunRows(const Image& in, const Rect& rect, + const Weights& weights, ThreadPool* pool, + Image* out) { + const int64_t ysize = rect.ysize(); + RunBorderRows(in, rect, 0, std::min(int64_t(kRadius), ysize), + weights, out); + if (ysize > 2 * int64_t(kRadius)) { + RunInteriorRows(in, rect, int64_t(kRadius), + ysize - int64_t(kRadius), weights, pool, out); + } + if (ysize > int64_t(kRadius)) { + RunBorderRows(in, rect, ysize - int64_t(kRadius), ysize, + weights, out); + } + } +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_CONVOLVE_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/convolve.h b/third_party/jpeg-xl/lib/jxl/convolve.h new file mode 100644 index 0000000000..2fcd2d0980 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/convolve.h @@ -0,0 +1,105 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_CONVOLVE_H_ +#define LIB_JXL_CONVOLVE_H_ + +// 2D convolution. + +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/image.h" + +namespace jxl { + +// No valid values outside [0, xsize), but the strategy may still safely load +// the preceding vector, and/or round xsize up to the vector lane count. This +// avoids needing PadImage. +// Requires xsize >= kConvolveLanes + kConvolveMaxRadius. +static constexpr size_t kConvolveMaxRadius = 3; + +// Weights must already be normalized. + +struct WeightsSymmetric3 { + // d r d (each replicated 4x) + // r c r + // d r d + float c[4]; + float r[4]; + float d[4]; +}; + +struct WeightsSymmetric5 { + // The lower-right quadrant is: c r R (each replicated 4x) + // r d L + // R L D + float c[4]; + float r[4]; + float R[4]; + float d[4]; + float D[4]; + float L[4]; +}; + +// Weights for separable 5x5 filters (typically but not necessarily the same +// values for horizontal and vertical directions). The kernel must already be +// normalized, but note that values for negative offsets are omitted, so the +// given values do not sum to 1. +struct WeightsSeparable5 { + // Horizontal 1D, distances 0..2 (each replicated 4x) + float horz[3 * 4]; + float vert[3 * 4]; +}; + +// Weights for separable 7x7 filters (typically but not necessarily the same +// values for horizontal and vertical directions). The kernel must already be +// normalized, but note that values for negative offsets are omitted, so the +// given values do not sum to 1. +// +// NOTE: for >= 7x7 Gaussian kernels, it is faster to use FastGaussian instead, +// at least when images exceed the L1 cache size. +struct WeightsSeparable7 { + // Horizontal 1D, distances 0..3 (each replicated 4x) + float horz[4 * 4]; + float vert[4 * 4]; +}; + +const WeightsSymmetric3& WeightsSymmetric3Lowpass(); +const WeightsSeparable5& WeightsSeparable5Lowpass(); +const WeightsSymmetric5& WeightsSymmetric5Lowpass(); + +void SlowSymmetric3(const ImageF& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + ImageF* JXL_RESTRICT out); + +void SlowSeparable5(const ImageF& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + ImageF* out); + +void SlowSeparable7(const ImageF& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + ImageF* out); + +void Symmetric3(const ImageF& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + ImageF* out); + +void Symmetric5(const ImageF& in, const Rect& rect, + const WeightsSymmetric5& weights, ThreadPool* pool, + ImageF* JXL_RESTRICT out); + +void Separable5(const ImageF& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + ImageF* out); + +void Separable7(const ImageF& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + ImageF* out); + +} // namespace jxl + +#endif // LIB_JXL_CONVOLVE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc b/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc new file mode 100644 index 0000000000..b26ff54bbc --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc @@ -0,0 +1,261 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/convolve.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable5.cc" +#include +#include + +#include "lib/jxl/convolve-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Vec; + +// 5x5 convolution by separable kernel with a single scan through the input. +// This is more cache-efficient than separate horizontal/vertical passes, and +// possibly faster (given enough registers) than tiling and/or transposing. +// +// Overview: imagine a 5x5 window around a central pixel. First convolve the +// rows by multiplying the pixels with the corresponding weights from +// WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these +// intermediate results by the corresponding vertical weight, i.e. +// vert[abs(y_offset) * 4]. Finally, store the sum of these values as the +// convolution result at the position of the central pixel in the output. +// +// Each of these operations uses SIMD vectors. The central pixel and most +// importantly the output are aligned, so neighnoring pixels (e.g. x_offset=1) +// require unaligned loads. Because weights are supplied in identical groups of +// 4, we can use LoadDup128 to load them (slightly faster). +// +// Uses mirrored boundary handling. Until x >= kRadius, the horizontal +// convolution uses Neighbors class to shuffle vectors as if each of its lanes +// had been loaded from the mirrored offset. Similarly, the last full vector to +// write uses mirroring. In the case of scalar vectors, Neighbors is not usable +// and the value is loaded directly. Otherwise, the number of valid pixels +// modulo the vector size enables a small optimization: for smaller offsets, +// a non-mirrored load is sufficient. +class Separable5Strategy { + using D = HWY_CAPPED(float, 16); + using V = Vec; + + public: + static constexpr int64_t kRadius = 2; + + template + static JXL_MAYBE_INLINE void ConvolveRow( + const float* const JXL_RESTRICT row_m, const size_t xsize, + const int64_t stride, const WrapRow& wrap_row, + const WeightsSeparable5& weights, float* const JXL_RESTRICT row_out) { + const D d; + const int64_t neg_stride = -stride; // allows LEA addressing. + const float* const JXL_RESTRICT row_t2 = + wrap_row(row_m + 2 * neg_stride, stride); + const float* const JXL_RESTRICT row_t1 = + wrap_row(row_m + 1 * neg_stride, stride); + const float* const JXL_RESTRICT row_b1 = + wrap_row(row_m + 1 * stride, stride); + const float* const JXL_RESTRICT row_b2 = + wrap_row(row_m + 2 * stride, stride); + + const V wh0 = LoadDup128(d, weights.horz + 0 * 4); + const V wh1 = LoadDup128(d, weights.horz + 1 * 4); + const V wh2 = LoadDup128(d, weights.horz + 2 * 4); + const V wv0 = LoadDup128(d, weights.vert + 0 * 4); + const V wv1 = LoadDup128(d, weights.vert + 1 * 4); + const V wv2 = LoadDup128(d, weights.vert + 2 * 4); + + size_t x = 0; + + // More than one iteration for scalars. + for (; x < kRadius; x += Lanes(d)) { + const V conv0 = + Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2), wv0); + + const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2); + const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2); + const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); + + const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2); + const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2); + const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); + Store(conv2, d, row_out + x); + } + + // Main loop: load inputs without padding + for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) { + const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2), wv0); + + const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2); + const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2); + const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); + + const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2); + const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2); + const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); + Store(conv2, d, row_out + x); + } + + // Last full vector to write (the above loop handled mod >= kRadius) +#if HWY_TARGET == HWY_SCALAR + while (x < xsize) { +#else + if (kSizeModN < kRadius) { +#endif + const V conv0 = + Mul(HorzConvolveLast(row_m, x, xsize, wh0, wh1, wh2), wv0); + + const V conv1t = + HorzConvolveLast(row_t1, x, xsize, wh0, wh1, wh2); + const V conv1b = + HorzConvolveLast(row_b1, x, xsize, wh0, wh1, wh2); + const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); + + const V conv2t = + HorzConvolveLast(row_t2, x, xsize, wh0, wh1, wh2); + const V conv2b = + HorzConvolveLast(row_b2, x, xsize, wh0, wh1, wh2); + const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); + Store(conv2, d, row_out + x); + x += Lanes(d); + } + + // If mod = 0, the above vector was the last. + if (kSizeModN != 0) { + for (; x < xsize; ++x) { + float mul = 0.0f; + for (int64_t dy = -kRadius; dy <= kRadius; ++dy) { + const float wy = weights.vert[std::abs(dy) * 4]; + const float* clamped_row = wrap_row(row_m + dy * stride, stride); + for (int64_t dx = -kRadius; dx <= kRadius; ++dx) { + const float wx = weights.horz[std::abs(dx) * 4]; + const int64_t clamped_x = Mirror(x + dx, xsize); + mul += clamped_row[clamped_x] * wx * wy; + } + } + row_out[x] = mul; + } + } + } + + private: + // Same as HorzConvolve for the first/last vector in a row. + static JXL_MAYBE_INLINE V HorzConvolveFirst( + const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize, + const V wh0, const V wh1, const V wh2) { + const D d; + const V c = LoadU(d, row + x); + const V mul0 = Mul(c, wh0); + +#if HWY_TARGET == HWY_SCALAR + const V l1 = LoadU(d, row + Mirror(x - 1, xsize)); + const V l2 = LoadU(d, row + Mirror(x - 2, xsize)); +#else + (void)xsize; + const V l1 = Neighbors::FirstL1(c); + const V l2 = Neighbors::FirstL2(c); +#endif + + const V r1 = LoadU(d, row + x + 1); + const V r2 = LoadU(d, row + x + 2); + + const V mul1 = MulAdd(Add(l1, r1), wh1, mul0); + const V mul2 = MulAdd(Add(l2, r2), wh2, mul1); + return mul2; + } + + template + static JXL_MAYBE_INLINE V + HorzConvolveLast(const float* const JXL_RESTRICT row, const int64_t x, + const int64_t xsize, const V wh0, const V wh1, const V wh2) { + const D d; + const V c = LoadU(d, row + x); + const V mul0 = Mul(c, wh0); + + const V l1 = LoadU(d, row + x - 1); + const V l2 = LoadU(d, row + x - 2); + + V r1, r2; +#if HWY_TARGET == HWY_SCALAR + r1 = LoadU(d, row + Mirror(x + 1, xsize)); + r2 = LoadU(d, row + Mirror(x + 2, xsize)); +#else + const size_t N = Lanes(d); + if (kSizeModN == 0) { + r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2))); + r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1))); + } else { // == 1 + const auto last = LoadU(d, row + xsize - N); + r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1))); + r1 = last; + } +#endif + + // Sum of pixels with Manhattan distance i, multiplied by weights[i]. + const V sum1 = Add(l1, r1); + const V mul1 = MulAdd(sum1, wh1, mul0); + const V sum2 = Add(l2, r2); + const V mul2 = MulAdd(sum2, wh2, mul1); + return mul2; + } + + // Requires kRadius valid pixels before/after pos. + static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos, + const V wh0, const V wh1, + const V wh2) { + const D d; + const V c = LoadU(d, pos); + const V mul0 = Mul(c, wh0); + + // Loading anew is faster than combining vectors. + const V l1 = LoadU(d, pos - 1); + const V r1 = LoadU(d, pos + 1); + const V l2 = LoadU(d, pos - 2); + const V r2 = LoadU(d, pos + 2); + // Sum of pixels with Manhattan distance i, multiplied by weights[i]. + const V sum1 = Add(l1, r1); + const V mul1 = MulAdd(sum1, wh1, mul0); + const V sum2 = Add(l2, r2); + const V mul2 = MulAdd(sum2, wh2, mul1); + return mul2; + } +}; + +void Separable5(const ImageF& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + ImageF* out) { + using Conv = ConvolveT; + if (rect.xsize() >= Conv::MinWidth()) { + return Conv::Run(in, rect, weights, pool, out); + } + + return SlowSeparable5(in, rect, weights, pool, out); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(Separable5); +void Separable5(const ImageF& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + ImageF* out) { + return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc b/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc new file mode 100644 index 0000000000..086dfd22b5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc @@ -0,0 +1,285 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/convolve.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable7.cc" +#include +#include + +#include "lib/jxl/convolve-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Vec; + +// 7x7 convolution by separable kernel with a single scan through the input. +// Extended version of Separable5, see documentation there. +class Separable7Strategy { + using D = HWY_CAPPED(float, 16); + using V = Vec; + + public: + static constexpr int64_t kRadius = 3; + + template + static JXL_MAYBE_INLINE void ConvolveRow( + const float* const JXL_RESTRICT row_m, const size_t xsize, + const int64_t stride, const WrapRow& wrap_row, + const WeightsSeparable7& weights, float* const JXL_RESTRICT row_out) { + const D d; + const int64_t neg_stride = -stride; // allows LEA addressing. + const float* const JXL_RESTRICT row_t3 = + wrap_row(row_m + 3 * neg_stride, stride); + const float* const JXL_RESTRICT row_t2 = + wrap_row(row_m + 2 * neg_stride, stride); + const float* const JXL_RESTRICT row_t1 = + wrap_row(row_m + 1 * neg_stride, stride); + const float* const JXL_RESTRICT row_b1 = + wrap_row(row_m + 1 * stride, stride); + const float* const JXL_RESTRICT row_b2 = + wrap_row(row_m + 2 * stride, stride); + const float* const JXL_RESTRICT row_b3 = + wrap_row(row_m + 3 * stride, stride); + + const V wh0 = LoadDup128(d, weights.horz + 0 * 4); + const V wh1 = LoadDup128(d, weights.horz + 1 * 4); + const V wh2 = LoadDup128(d, weights.horz + 2 * 4); + const V wh3 = LoadDup128(d, weights.horz + 3 * 4); + const V wv0 = LoadDup128(d, weights.vert + 0 * 4); + const V wv1 = LoadDup128(d, weights.vert + 1 * 4); + const V wv2 = LoadDup128(d, weights.vert + 2 * 4); + const V wv3 = LoadDup128(d, weights.vert + 3 * 4); + + size_t x = 0; + + // More than one iteration for scalars. + for (; x < kRadius; x += Lanes(d)) { + const V conv0 = + Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2, wh3), wv0); + + const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2, wh3); + const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2, wh3); + const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); + + const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2, wh3); + const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2, wh3); + const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); + + const V conv3t = HorzConvolveFirst(row_t3, x, xsize, wh0, wh1, wh2, wh3); + const V conv3b = HorzConvolveFirst(row_b3, x, xsize, wh0, wh1, wh2, wh3); + const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2); + + Store(conv3, d, row_out + x); + } + + // Main loop: load inputs without padding + for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) { + const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2, wh3), wv0); + + const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2, wh3); + const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2, wh3); + const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); + + const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2, wh3); + const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2, wh3); + const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); + + const V conv3t = HorzConvolve(row_t3 + x, wh0, wh1, wh2, wh3); + const V conv3b = HorzConvolve(row_b3 + x, wh0, wh1, wh2, wh3); + const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2); + + Store(conv3, d, row_out + x); + } + + // Last full vector to write (the above loop handled mod >= kRadius) +#if HWY_TARGET == HWY_SCALAR + while (x < xsize) { +#else + if (kSizeModN < kRadius) { +#endif + const V conv0 = + Mul(HorzConvolveLast(row_m, x, xsize, wh0, wh1, wh2, wh3), + wv0); + + const V conv1t = + HorzConvolveLast(row_t1, x, xsize, wh0, wh1, wh2, wh3); + const V conv1b = + HorzConvolveLast(row_b1, x, xsize, wh0, wh1, wh2, wh3); + const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); + + const V conv2t = + HorzConvolveLast(row_t2, x, xsize, wh0, wh1, wh2, wh3); + const V conv2b = + HorzConvolveLast(row_b2, x, xsize, wh0, wh1, wh2, wh3); + const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); + + const V conv3t = + HorzConvolveLast(row_t3, x, xsize, wh0, wh1, wh2, wh3); + const V conv3b = + HorzConvolveLast(row_b3, x, xsize, wh0, wh1, wh2, wh3); + const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2); + + Store(conv3, d, row_out + x); + x += Lanes(d); + } + + // If mod = 0, the above vector was the last. + if (kSizeModN != 0) { + for (; x < xsize; ++x) { + float mul = 0.0f; + for (int64_t dy = -kRadius; dy <= kRadius; ++dy) { + const float wy = weights.vert[std::abs(dy) * 4]; + const float* clamped_row = wrap_row(row_m + dy * stride, stride); + for (int64_t dx = -kRadius; dx <= kRadius; ++dx) { + const float wx = weights.horz[std::abs(dx) * 4]; + const int64_t clamped_x = Mirror(x + dx, xsize); + mul += clamped_row[clamped_x] * wx * wy; + } + } + row_out[x] = mul; + } + } + } + + private: + // Same as HorzConvolve for the first/last vector in a row. + static JXL_MAYBE_INLINE V HorzConvolveFirst( + const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize, + const V wh0, const V wh1, const V wh2, const V wh3) { + const D d; + const V c = LoadU(d, row + x); + const V mul0 = Mul(c, wh0); + +#if HWY_TARGET == HWY_SCALAR + const V l1 = LoadU(d, row + Mirror(x - 1, xsize)); + const V l2 = LoadU(d, row + Mirror(x - 2, xsize)); + const V l3 = LoadU(d, row + Mirror(x - 3, xsize)); +#else + (void)xsize; + const V l1 = Neighbors::FirstL1(c); + const V l2 = Neighbors::FirstL2(c); + const V l3 = Neighbors::FirstL3(c); +#endif + + const V r1 = LoadU(d, row + x + 1); + const V r2 = LoadU(d, row + x + 2); + const V r3 = LoadU(d, row + x + 3); + + const V mul1 = MulAdd(Add(l1, r1), wh1, mul0); + const V mul2 = MulAdd(Add(l2, r2), wh2, mul1); + const V mul3 = MulAdd(Add(l3, r3), wh3, mul2); + return mul3; + } + + template + static JXL_MAYBE_INLINE V HorzConvolveLast( + const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize, + const V wh0, const V wh1, const V wh2, const V wh3) { + const D d; + const V c = LoadU(d, row + x); + const V mul0 = Mul(c, wh0); + + const V l1 = LoadU(d, row + x - 1); + const V l2 = LoadU(d, row + x - 2); + const V l3 = LoadU(d, row + x - 3); + + V r1, r2, r3; +#if HWY_TARGET == HWY_SCALAR + r1 = LoadU(d, row + Mirror(x + 1, xsize)); + r2 = LoadU(d, row + Mirror(x + 2, xsize)); + r3 = LoadU(d, row + Mirror(x + 3, xsize)); +#else + const size_t N = Lanes(d); + if (kSizeModN == 0) { + r3 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 3))); + r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2))); + r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1))); + } else if (kSizeModN == 1) { + const auto last = LoadU(d, row + xsize - N); + r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 2))); + r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1))); + r1 = last; + } else /* kSizeModN >= 2 */ { + const auto last = LoadU(d, row + xsize - N); + r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1))); + r2 = last; + r1 = LoadU(d, row + x + 1); + } +#endif + + // Sum of pixels with Manhattan distance i, multiplied by weights[i]. + const V sum1 = Add(l1, r1); + const V mul1 = MulAdd(sum1, wh1, mul0); + const V sum2 = Add(l2, r2); + const V mul2 = MulAdd(sum2, wh2, mul1); + const V sum3 = Add(l3, r3); + const V mul3 = MulAdd(sum3, wh3, mul2); + return mul3; + } + + // Returns one vector of horizontal convolution results; lane i is the result + // for pixel pos + i. This is the fast path for interior pixels, i.e. kRadius + // valid pixels before/after pos. + static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos, + const V wh0, const V wh1, const V wh2, + const V wh3) { + const D d; + const V c = LoadU(d, pos); + const V mul0 = Mul(c, wh0); + + // TODO(janwas): better to Combine + const V l1 = LoadU(d, pos - 1); + const V r1 = LoadU(d, pos + 1); + const V l2 = LoadU(d, pos - 2); + const V r2 = LoadU(d, pos + 2); + const V l3 = LoadU(d, pos - 3); + const V r3 = LoadU(d, pos + 3); + // Sum of pixels with Manhattan distance i, multiplied by weights[i]. + const V sum1 = Add(l1, r1); + const V mul1 = MulAdd(sum1, wh1, mul0); + const V sum2 = Add(l2, r2); + const V mul2 = MulAdd(sum2, wh2, mul1); + const V sum3 = Add(l3, r3); + const V mul3 = MulAdd(sum3, wh3, mul2); + return mul3; + } +}; + +void Separable7(const ImageF& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + ImageF* out) { + using Conv = ConvolveT; + if (rect.xsize() >= Conv::MinWidth()) { + return Conv::Run(in, rect, weights, pool, out); + } + + return SlowSeparable7(in, rect, weights, pool, out); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(Separable7); +void Separable7(const ImageF& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + ImageF* out) { + return HWY_DYNAMIC_DISPATCH(Separable7)(in, rect, weights, pool, out); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/convolve_slow.cc b/third_party/jpeg-xl/lib/jxl/convolve_slow.cc new file mode 100644 index 0000000000..fffe5f74c8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/convolve_slow.cc @@ -0,0 +1,212 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/convolve.h" + +#include "lib/jxl/convolve-inl.h" + +namespace jxl { + +//------------------------------------------------------------------------------ +// Kernels + +// 4 instances of a given literal value, useful as input to LoadDup128. +#define JXL_REP4(literal) literal, literal, literal, literal + +// Concentrates energy in low-frequency components (e.g. for antialiasing). +const WeightsSymmetric3& WeightsSymmetric3Lowpass() { + // Computed by research/convolve_weights.py's cubic spline approximations of + // prolate spheroidal wave functions. + constexpr float w0 = 0.36208932f; + constexpr float w1 = 0.12820096f; + constexpr float w2 = 0.03127668f; + static constexpr WeightsSymmetric3 weights = { + {JXL_REP4(w0)}, {JXL_REP4(w1)}, {JXL_REP4(w2)}}; + return weights; +} + +const WeightsSeparable5& WeightsSeparable5Lowpass() { + constexpr float w0 = 0.41714928f; + constexpr float w1 = 0.25539268f; + constexpr float w2 = 0.03603267f; + static constexpr WeightsSeparable5 weights = { + {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)}, + {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)}}; + return weights; +} + +const WeightsSymmetric5& WeightsSymmetric5Lowpass() { + static constexpr WeightsSymmetric5 weights = { + {JXL_REP4(0.1740135f)}, {JXL_REP4(0.1065369f)}, {JXL_REP4(0.0150310f)}, + {JXL_REP4(0.0652254f)}, {JXL_REP4(0.0012984f)}, {JXL_REP4(0.0092025f)}}; + return weights; +} + +const WeightsSeparable5& WeightsSeparable5Gaussian1() { + constexpr float w0 = 0.38774f; + constexpr float w1 = 0.24477f; + constexpr float w2 = 0.06136f; + static constexpr WeightsSeparable5 weights = { + {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)}, + {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)}}; + return weights; +} + +const WeightsSeparable5& WeightsSeparable5Gaussian2() { + constexpr float w0 = 0.250301f; + constexpr float w1 = 0.221461f; + constexpr float w2 = 0.153388f; + static constexpr WeightsSeparable5 weights = { + {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)}, + {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)}}; + return weights; +} + +#undef JXL_REP4 + +//------------------------------------------------------------------------------ +// Slow + +namespace { + +template +float SlowSymmetric3Pixel(const ImageF& in, const int64_t ix, const int64_t iy, + const int64_t xsize, const int64_t ysize, + const WeightsSymmetric3& weights) { + float sum = 0.0f; + + // ix: image; kx: kernel + for (int64_t ky = -1; ky <= 1; ky++) { + const int64_t y = WrapY()(iy + ky, ysize); + const float* JXL_RESTRICT row_in = in.ConstRow(static_cast(y)); + + const float wc = ky == 0 ? weights.c[0] : weights.r[0]; + const float wlr = ky == 0 ? weights.r[0] : weights.d[0]; + + const int64_t xm1 = WrapX()(ix - 1, xsize); + const int64_t xp1 = WrapX()(ix + 1, xsize); + sum += row_in[ix] * wc + (row_in[xm1] + row_in[xp1]) * wlr; + } + return sum; +} + +template +void SlowSymmetric3Row(const ImageF& in, const int64_t iy, const int64_t xsize, + const int64_t ysize, const WeightsSymmetric3& weights, + float* JXL_RESTRICT row_out) { + row_out[0] = + SlowSymmetric3Pixel(in, 0, iy, xsize, ysize, weights); + for (int64_t ix = 1; ix < xsize - 1; ix++) { + row_out[ix] = SlowSymmetric3Pixel(in, ix, iy, xsize, + ysize, weights); + } + { + const int64_t ix = xsize - 1; + row_out[ix] = SlowSymmetric3Pixel(in, ix, iy, xsize, + ysize, weights); + } +} + +} // namespace + +void SlowSymmetric3(const ImageF& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + ImageF* JXL_RESTRICT out) { + PROFILER_FUNC; + + const int64_t xsize = static_cast(rect.xsize()); + const int64_t ysize = static_cast(rect.ysize()); + const int64_t kRadius = 1; + + JXL_CHECK(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t iy = task; + float* JXL_RESTRICT out_row = out->Row(static_cast(iy)); + + if (iy < kRadius || iy >= ysize - kRadius) { + SlowSymmetric3Row(in, iy, xsize, ysize, weights, out_row); + } else { + SlowSymmetric3Row(in, iy, xsize, ysize, weights, + out_row); + } + }, + "SlowSymmetric3")); +} + +namespace { + +// Separable kernels, any radius. +float SlowSeparablePixel(const ImageF& in, const Rect& rect, const int64_t x, + const int64_t y, const int64_t radius, + const float* JXL_RESTRICT horz_weights, + const float* JXL_RESTRICT vert_weights) { + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + const WrapMirror wrap; + + float mul = 0.0f; + for (int dy = -radius; dy <= radius; ++dy) { + const float wy = vert_weights[std::abs(dy) * 4]; + const size_t sy = wrap(y + dy, ysize); + JXL_CHECK(sy < ysize); + const float* const JXL_RESTRICT row = rect.ConstRow(in, sy); + for (int dx = -radius; dx <= radius; ++dx) { + const float wx = horz_weights[std::abs(dx) * 4]; + const size_t sx = wrap(x + dx, xsize); + JXL_CHECK(sx < xsize); + mul += row[sx] * wx * wy; + } + } + return mul; +} + +} // namespace + +void SlowSeparable5(const ImageF& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + ImageF* out) { + PROFILER_FUNC; + const float* horz_weights = &weights.horz[0]; + const float* vert_weights = &weights.vert[0]; + + const size_t ysize = rect.ysize(); + JXL_CHECK(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t y = task; + + float* const JXL_RESTRICT row_out = out->Row(y); + for (size_t x = 0; x < rect.xsize(); ++x) { + row_out[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/2, + horz_weights, vert_weights); + } + }, + "SlowSeparable5")); +} + +void SlowSeparable7(const ImageF& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + ImageF* out) { + PROFILER_FUNC; + const float* horz_weights = &weights.horz[0]; + const float* vert_weights = &weights.vert[0]; + + const size_t ysize = rect.ysize(); + JXL_CHECK(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t y = task; + + float* const JXL_RESTRICT row_out = out->Row(y); + for (size_t x = 0; x < rect.xsize(); ++x) { + row_out[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/3, + horz_weights, vert_weights); + } + }, + "SlowSeparable7")); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc b/third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc new file mode 100644 index 0000000000..06b59dfb60 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc @@ -0,0 +1,194 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/convolve.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric3.cc" +#include +#include + +#include "lib/jxl/convolve-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Vec; + +template +static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix, + const int64_t iy, const size_t ysize, const V wx0, + const V wx1, const V wx2) { + const HWY_FULL(float) d; + const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix; + const auto in_m2 = LoadU(d, center - 2); + const auto in_p2 = LoadU(d, center + 2); + const auto in_m1 = LoadU(d, center - 1); + const auto in_p1 = LoadU(d, center + 1); + const auto in_00 = Load(d, center); + const auto sum_2 = Mul(wx2, Add(in_m2, in_p2)); + const auto sum_1 = Mul(wx1, Add(in_m1, in_p1)); + const auto sum_0 = Mul(wx0, in_00); + return Add(sum_2, Add(sum_1, sum_0)); +} + +// 3x3 convolution by symmetric kernel with a single scan through the input. +class Symmetric3Strategy { + using D = HWY_CAPPED(float, 16); + using V = Vec; + + public: + static constexpr int64_t kRadius = 1; + + // Only accesses pixels in [0, xsize). + template + static JXL_MAYBE_INLINE void ConvolveRow( + const float* const JXL_RESTRICT row_m, const size_t xsize, + const int64_t stride, const WrapRow& wrap_row, + const WeightsSymmetric3& weights, float* const JXL_RESTRICT row_out) { + const D d; + // t, m, b = top, middle, bottom row; + const float* const JXL_RESTRICT row_t = wrap_row(row_m - stride, stride); + const float* const JXL_RESTRICT row_b = wrap_row(row_m + stride, stride); + + // Must load in advance - compiler doesn't understand LoadDup128 and + // schedules them too late. + const V w0 = LoadDup128(d, weights.c); + const V w1 = LoadDup128(d, weights.r); + const V w2 = LoadDup128(d, weights.d); + + // l, c, r = left, center, right. Leftmost vector: need FirstL1. + { + const V tc = LoadU(d, row_t + 0); + const V mc = LoadU(d, row_m + 0); + const V bc = LoadU(d, row_b + 0); + const V tl = Neighbors::FirstL1(tc); + const V tr = LoadU(d, row_t + 0 + 1); + const V ml = Neighbors::FirstL1(mc); + const V mr = LoadU(d, row_m + 0 + 1); + const V bl = Neighbors::FirstL1(bc); + const V br = LoadU(d, row_b + 0 + 1); + const V conv = + WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2); + Store(conv, d, row_out + 0); + } + + // Loop as long as we can load enough new values: + const size_t N = Lanes(d); + size_t x = N; + for (; x + N + kRadius <= xsize; x += N) { + const auto conv = ConvolveValid(row_t, row_m, row_b, x, w0, w1, w2); + Store(conv, d, row_out + x); + } + + // For final (partial) vector: + const V tc = LoadU(d, row_t + x); + const V mc = LoadU(d, row_m + x); + const V bc = LoadU(d, row_b + x); + + V tr, mr, br; +#if HWY_TARGET == HWY_SCALAR + tr = tc; // Single-lane => mirrored right neighbor = center value. + mr = mc; + br = bc; +#else + if (kSizeModN == 0) { + // The above loop didn't handle the last vector because it needs an + // additional right neighbor (generated via mirroring). + auto mirror = SetTableIndices(d, MirrorLanes(N - 1)); + tr = TableLookupLanes(tc, mirror); + mr = TableLookupLanes(mc, mirror); + br = TableLookupLanes(bc, mirror); + } else { + auto mirror = SetTableIndices(d, MirrorLanes((xsize % N) - 1)); + // Loads last valid value into uppermost lane and mirrors. + tr = TableLookupLanes(LoadU(d, row_t + xsize - N), mirror); + mr = TableLookupLanes(LoadU(d, row_m + xsize - N), mirror); + br = TableLookupLanes(LoadU(d, row_b + xsize - N), mirror); + } +#endif + + const V tl = LoadU(d, row_t + x - 1); + const V ml = LoadU(d, row_m + x - 1); + const V bl = LoadU(d, row_b + x - 1); + const V conv = WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2); + Store(conv, d, row_out + x); + } + + private: + // Returns sum{x_i * w_i}. + template + static JXL_MAYBE_INLINE V WeightedSum(const V tl, const V tc, const V tr, + const V ml, const V mc, const V mr, + const V bl, const V bc, const V br, + const V w0, const V w1, const V w2) { + const V sum_tb = Add(tc, bc); + + // Faster than 5 mul + 4 FMA. + const V mul0 = Mul(mc, w0); + const V sum_lr = Add(ml, mr); + + const V x1 = Add(sum_tb, sum_lr); + const V mul1 = MulAdd(x1, w1, mul0); + + const V sum_t2 = Add(tl, tr); + const V sum_b2 = Add(bl, br); + const V x2 = Add(sum_t2, sum_b2); + const V mul2 = MulAdd(x2, w2, mul1); + return mul2; + } + + static JXL_MAYBE_INLINE V ConvolveValid(const float* JXL_RESTRICT row_t, + const float* JXL_RESTRICT row_m, + const float* JXL_RESTRICT row_b, + const int64_t x, const V w0, + const V w1, const V w2) { + const D d; + const V tc = LoadU(d, row_t + x); + const V mc = LoadU(d, row_m + x); + const V bc = LoadU(d, row_b + x); + const V tl = LoadU(d, row_t + x - 1); + const V tr = LoadU(d, row_t + x + 1); + const V ml = LoadU(d, row_m + x - 1); + const V mr = LoadU(d, row_m + x + 1); + const V bl = LoadU(d, row_b + x - 1); + const V br = LoadU(d, row_b + x + 1); + return WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2); + } +}; + +void Symmetric3(const ImageF& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + ImageF* out) { + using Conv = ConvolveT; + if (rect.xsize() >= Conv::MinWidth()) { + return Conv::Run(in, rect, weights, pool, out); + } + + return SlowSymmetric3(in, rect, weights, pool, out); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(Symmetric3); +void Symmetric3(const ImageF& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + ImageF* out) { + return HWY_DYNAMIC_DISPATCH(Symmetric3)(in, rect, weights, pool, out); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc b/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc new file mode 100644 index 0000000000..55a16899c3 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc @@ -0,0 +1,185 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/convolve.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc" +#include +#include + +#include "lib/jxl/common.h" // RoundUpTo +#include "lib/jxl/convolve-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::Vec; + +// Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2]. +template +static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y, + const int64_t ix, const int64_t iy, + const size_t xsize, const size_t ysize, + const float wx0, const float wx1, + const float wx2) { + const WrapMirror wrap_x; + const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize)); + const float in_m2 = row[wrap_x(ix - 2, xsize)]; + const float in_p2 = row[wrap_x(ix + 2, xsize)]; + const float in_m1 = row[wrap_x(ix - 1, xsize)]; + const float in_p1 = row[wrap_x(ix + 1, xsize)]; + const float in_00 = row[ix]; + const float sum_2 = wx2 * (in_m2 + in_p2); + const float sum_1 = wx1 * (in_m1 + in_p1); + const float sum_0 = wx0 * in_00; + return sum_2 + sum_1 + sum_0; +} + +template +static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix, + const int64_t iy, const size_t ysize, const V wx0, + const V wx1, const V wx2) { + const HWY_FULL(float) d; + const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix; + const auto in_m2 = LoadU(d, center - 2); + const auto in_p2 = LoadU(d, center + 2); + const auto in_m1 = LoadU(d, center - 1); + const auto in_p1 = LoadU(d, center + 1); + const auto in_00 = Load(d, center); + const auto sum_2 = Mul(wx2, Add(in_m2, in_p2)); + const auto sum_1 = Mul(wx1, Add(in_m1, in_p1)); + const auto sum_0 = Mul(wx0, in_00); + return Add(sum_2, Add(sum_1, sum_0)); +} + +// Produces result for one pixel +template +float Symmetric5Border(const ImageF& in, const Rect& rect, const int64_t ix, + const int64_t iy, const WeightsSymmetric5& weights) { + const float w0 = weights.c[0]; + const float w1 = weights.r[0]; + const float w2 = weights.R[0]; + const float w4 = weights.d[0]; + const float w5 = weights.L[0]; + const float w8 = weights.D[0]; + + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + const WrapY wrap_y; + // Unrolled loop over all 5 rows of the kernel. + float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2); + + sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8); + float sum1 = + WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8); + + sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5); + sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5); + + return sum0 + sum1; +} + +// Produces result for one vector's worth of pixels +template +static void Symmetric5Interior(const ImageF& in, const Rect& rect, + const int64_t ix, const int64_t iy, + const WeightsSymmetric5& weights, + float* JXL_RESTRICT row_out) { + const HWY_FULL(float) d; + + const auto w0 = LoadDup128(d, weights.c); + const auto w1 = LoadDup128(d, weights.r); + const auto w2 = LoadDup128(d, weights.R); + const auto w4 = LoadDup128(d, weights.d); + const auto w5 = LoadDup128(d, weights.L); + const auto w8 = LoadDup128(d, weights.D); + + const size_t ysize = rect.ysize(); + const WrapY wrap_y; + // Unrolled loop over all 5 rows of the kernel. + auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2); + + sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8)); + auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8); + + sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5)); + sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5)); + + Store(Add(sum0, sum1), d, row_out + ix); +} + +template +static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy, + const WeightsSymmetric5& weights, + float* JXL_RESTRICT row_out) { + const int64_t kRadius = 2; + const size_t xsize = rect.xsize(); + + size_t ix = 0; + const HWY_FULL(float) d; + const size_t N = Lanes(d); + const size_t aligned_x = RoundUpTo(kRadius, N); + for (; ix < std::min(aligned_x, xsize); ++ix) { + row_out[ix] = Symmetric5Border(in, rect, ix, iy, weights); + } + for (; ix + N + kRadius <= xsize; ix += N) { + Symmetric5Interior(in, rect, ix, iy, weights, row_out); + } + for (; ix < xsize; ++ix) { + row_out[ix] = Symmetric5Border(in, rect, ix, iy, weights); + } +} + +static JXL_NOINLINE void Symmetric5BorderRow(const ImageF& in, const Rect& rect, + const int64_t iy, + const WeightsSymmetric5& weights, + float* JXL_RESTRICT row_out) { + return Symmetric5Row(in, rect, iy, weights, row_out); +} + +// Semi-vectorized (interior pixels Fonly); called directly like slow::, unlike +// the fully vectorized strategies below. +void Symmetric5(const ImageF& in, const Rect& rect, + const WeightsSymmetric5& weights, ThreadPool* pool, + ImageF* JXL_RESTRICT out) { + PROFILER_FUNC; + + const size_t ysize = rect.ysize(); + JXL_CHECK(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t iy = task; + + if (iy < 2 || iy >= static_cast(ysize) - 2) { + Symmetric5BorderRow(in, rect, iy, weights, out->Row(iy)); + } else { + Symmetric5Row(in, rect, iy, weights, out->Row(iy)); + } + }, + "Symmetric5x5Convolution")); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(Symmetric5); +void Symmetric5(const ImageF& in, const Rect& rect, + const WeightsSymmetric5& weights, ThreadPool* pool, + ImageF* JXL_RESTRICT out) { + return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, rect, weights, pool, out); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/convolve_test.cc b/third_party/jpeg-xl/lib/jxl/convolve_test.cc new file mode 100644 index 0000000000..e86d637114 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/convolve_test.cc @@ -0,0 +1,252 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/convolve.h" + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/convolve_test.cc" +#include +#include +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/random.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +#ifndef JXL_DEBUG_CONVOLVE +#define JXL_DEBUG_CONVOLVE 0 +#endif + +#include "lib/jxl/convolve-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +void TestNeighbors() { + const Neighbors::D d; + const Neighbors::V v = Iota(d, 0); + HWY_ALIGN float actual[hwy::kTestMaxVectorSize / sizeof(float)] = {0}; + + HWY_ALIGN float first_l1[hwy::kTestMaxVectorSize / sizeof(float)] = { + 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; + Store(Neighbors::FirstL1(v), d, actual); + const size_t N = Lanes(d); + EXPECT_EQ(std::vector(first_l1, first_l1 + N), + std::vector(actual, actual + N)); + +#if HWY_TARGET != HWY_SCALAR + HWY_ALIGN float first_l2[hwy::kTestMaxVectorSize / sizeof(float)] = { + 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}; + Store(Neighbors::FirstL2(v), d, actual); + EXPECT_EQ(std::vector(first_l2, first_l2 + N), + std::vector(actual, actual + N)); + + HWY_ALIGN float first_l3[hwy::kTestMaxVectorSize / sizeof(float)] = { + 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + Store(Neighbors::FirstL3(v), d, actual); + EXPECT_EQ(std::vector(first_l3, first_l3 + N), + std::vector(actual, actual + N)); +#endif // HWY_TARGET != HWY_SCALAR +} + +void VerifySymmetric3(const size_t xsize, const size_t ysize, ThreadPool* pool, + Rng* rng) { + const Rect rect(0, 0, xsize, ysize); + + ImageF in(xsize, ysize); + GenerateImage(*rng, &in, 0.0f, 1.0f); + + ImageF out_expected(xsize, ysize); + ImageF out_actual(xsize, ysize); + + const WeightsSymmetric3& weights = WeightsSymmetric3Lowpass(); + Symmetric3(in, rect, weights, pool, &out_expected); + SlowSymmetric3(in, rect, weights, pool, &out_actual); + + JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _)); +} + +// Ensures Symmetric and Separable give the same result. +void VerifySymmetric5(const size_t xsize, const size_t ysize, ThreadPool* pool, + Rng* rng) { + const Rect rect(0, 0, xsize, ysize); + + ImageF in(xsize, ysize); + GenerateImage(*rng, &in, 0.0f, 1.0f); + + ImageF out_expected(xsize, ysize); + ImageF out_actual(xsize, ysize); + + Separable5(in, Rect(in), WeightsSeparable5Lowpass(), pool, &out_expected); + Symmetric5(in, rect, WeightsSymmetric5Lowpass(), pool, &out_actual); + + JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _)); +} + +void VerifySeparable5(const size_t xsize, const size_t ysize, ThreadPool* pool, + Rng* rng) { + const Rect rect(0, 0, xsize, ysize); + + ImageF in(xsize, ysize); + GenerateImage(*rng, &in, 0.0f, 1.0f); + + ImageF out_expected(xsize, ysize); + ImageF out_actual(xsize, ysize); + + const WeightsSeparable5& weights = WeightsSeparable5Lowpass(); + Separable5(in, Rect(in), weights, pool, &out_expected); + SlowSeparable5(in, rect, weights, pool, &out_actual); + + JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _)); +} + +void VerifySeparable7(const size_t xsize, const size_t ysize, ThreadPool* pool, + Rng* rng) { + const Rect rect(0, 0, xsize, ysize); + + ImageF in(xsize, ysize); + GenerateImage(*rng, &in, 0.0f, 1.0f); + + ImageF out_expected(xsize, ysize); + ImageF out_actual(xsize, ysize); + + // Gaussian sigma 1.0 + const WeightsSeparable7 weights = {{HWY_REP4(0.383103f), HWY_REP4(0.241843f), + HWY_REP4(0.060626f), HWY_REP4(0.00598f)}, + {HWY_REP4(0.383103f), HWY_REP4(0.241843f), + HWY_REP4(0.060626f), HWY_REP4(0.00598f)}}; + + SlowSeparable7(in, rect, weights, pool, &out_expected); + Separable7(in, Rect(in), weights, pool, &out_actual); + + JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _)); +} + +// For all xsize/ysize and kernels: +void TestConvolve() { + TestNeighbors(); + + test::ThreadPoolForTests pool(4); + EXPECT_EQ(true, + RunOnPool( + &pool, kConvolveMaxRadius, 40, ThreadPool::NoInit, + [](const uint32_t task, size_t /*thread*/) { + const size_t xsize = task; + Rng rng(129 + 13 * xsize); + + ThreadPool* null_pool = nullptr; + test::ThreadPoolForTests pool3(3); + for (size_t ysize = kConvolveMaxRadius; ysize < 16; ++ysize) { + JXL_DEBUG(JXL_DEBUG_CONVOLVE, + "%" PRIuS " x %" PRIuS " (target %" PRIx64 + ")===============================", + xsize, ysize, static_cast(HWY_TARGET)); + + JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sym3------------------"); + VerifySymmetric3(xsize, ysize, null_pool, &rng); + VerifySymmetric3(xsize, ysize, &pool3, &rng); + + JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sym5------------------"); + VerifySymmetric5(xsize, ysize, null_pool, &rng); + VerifySymmetric5(xsize, ysize, &pool3, &rng); + + JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sep5------------------"); + VerifySeparable5(xsize, ysize, null_pool, &rng); + VerifySeparable5(xsize, ysize, &pool3, &rng); + + JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sep7------------------"); + VerifySeparable7(xsize, ysize, null_pool, &rng); + VerifySeparable7(xsize, ysize, &pool3, &rng); + } + }, + "TestConvolve")); +} + +// Measures durations, verifies results, prints timings. `unpredictable1` +// must have value 1 (unknown to the compiler to prevent elision). +template +void BenchmarkConv(const char* caption, const Conv& conv, + const hwy::FuncInput unpredictable1) { + const size_t kNumInputs = 1; + const hwy::FuncInput inputs[kNumInputs] = {unpredictable1}; + hwy::Result results[kNumInputs]; + + const size_t kDim = 160; // in+out fit in L2 + ImageF in(kDim, kDim); + ZeroFillImage(&in); + in.Row(kDim / 2)[kDim / 2] = unpredictable1; + ImageF out(kDim, kDim); + + hwy::Params p; + p.verbose = false; + p.max_evals = 7; + p.target_rel_mad = 0.002; + const size_t num_results = MeasureClosure( + [&in, &conv, &out](const hwy::FuncInput input) { + conv(in, &out); + return out.Row(input)[0]; + }, + inputs, kNumInputs, results, p); + if (num_results != kNumInputs) { + fprintf(stderr, "MeasureClosure failed.\n"); + } + for (size_t i = 0; i < num_results; ++i) { + const double seconds = static_cast(results[i].ticks) / + hwy::platform::InvariantTicksPerSecond(); + printf("%12s: %7.2f MP/s (MAD=%4.2f%%)\n", caption, + kDim * kDim * 1E-6 / seconds, + static_cast(results[i].variability) * 100.0); + } +} + +struct ConvSymmetric3 { + void operator()(const ImageF& in, ImageF* JXL_RESTRICT out) const { + ThreadPool* null_pool = nullptr; + Symmetric3(in, Rect(in), WeightsSymmetric3Lowpass(), null_pool, out); + } +}; + +struct ConvSeparable5 { + void operator()(const ImageF& in, ImageF* JXL_RESTRICT out) const { + ThreadPool* null_pool = nullptr; + Separable5(in, Rect(in), WeightsSeparable5Lowpass(), null_pool, out); + } +}; + +void BenchmarkAll() { +#if 0 // disabled to avoid test timeouts, run manually on demand + const hwy::FuncInput unpredictable1 = time(nullptr) != 1234; + BenchmarkConv("Symmetric3", ConvSymmetric3(), unpredictable1); + BenchmarkConv("Separable5", ConvSeparable5(), unpredictable1); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class ConvolveTest : public hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(ConvolveTest); + +HWY_EXPORT_AND_TEST_P(ConvolveTest, TestConvolve); + +HWY_EXPORT_AND_TEST_P(ConvolveTest, BenchmarkAll); + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/data_parallel_test.cc b/third_party/jpeg-xl/lib/jxl/data_parallel_test.cc new file mode 100644 index 0000000000..ee2a97f93a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/data_parallel_test.cc @@ -0,0 +1,87 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/data_parallel.h" + +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +class DataParallelTest : public ::testing::Test { + protected: + // A fake class to verify that DataParallel is properly calling the + // client-provided runner functions. + static int FakeRunner(void* runner_opaque, void* jpegxl_opaque, + JxlParallelRunInit init, JxlParallelRunFunction func, + uint32_t start_range, uint32_t end_range) { + DataParallelTest* self = static_cast(runner_opaque); + self->runner_called_++; + self->jpegxl_opaque_ = jpegxl_opaque; + self->init_ = init; + self->func_ = func; + self->start_range_ = start_range; + self->end_range_ = end_range; + return self->runner_return_; + } + + ThreadPool pool_{&DataParallelTest::FakeRunner, this}; + + // Number of times FakeRunner() was called. + int runner_called_ = 0; + + // Parameters passed to FakeRunner. + void* jpegxl_opaque_ = nullptr; + JxlParallelRunInit init_ = nullptr; + JxlParallelRunFunction func_ = nullptr; + uint32_t start_range_ = -1; + uint32_t end_range_ = -1; + + // Return value that FakeRunner will return. + int runner_return_ = 0; +}; + +// JxlParallelRunInit interface. +typedef int (*JxlParallelRunInit)(); + +} // namespace + +TEST_F(DataParallelTest, RunnerCalledParameters) { + EXPECT_TRUE(pool_.Run( + 1234, 5678, [](size_t /* num_threads */) { return true; }, + [](uint32_t /* task */, size_t /* thread */) { return; })); + EXPECT_EQ(1, runner_called_); + EXPECT_NE(nullptr, init_); + EXPECT_NE(nullptr, func_); + EXPECT_NE(nullptr, jpegxl_opaque_); + EXPECT_EQ(1234u, start_range_); + EXPECT_EQ(5678u, end_range_); +} + +TEST_F(DataParallelTest, RunnerFailurePropagates) { + runner_return_ = -1; // FakeRunner return value. + EXPECT_FALSE(pool_.Run( + 1234, 5678, [](size_t /* num_threads */) { return false; }, + [](uint32_t /* task */, size_t /* thread */) { return; })); + EXPECT_FALSE(RunOnPool( + nullptr, 1234, 5678, [](size_t /* num_threads */) { return false; }, + [](uint32_t /* task */, size_t /* thread */) { return; }, "Test")); +} + +TEST_F(DataParallelTest, RunnerNotCalledOnEmptyRange) { + runner_return_ = -1; // FakeRunner return value. + EXPECT_TRUE(pool_.Run( + 123, 123, [](size_t /* num_threads */) { return false; }, + [](uint32_t /* task */, size_t /* thread */) { return; })); + EXPECT_TRUE(RunOnPool( + nullptr, 123, 123, [](size_t /* num_threads */) { return false; }, + [](uint32_t /* task */, size_t /* thread */) { return; }, "Test")); + // We don't call the external runner when the range is empty. We don't even + // need to call the init function. + EXPECT_EQ(0, runner_called_); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dct-inl.h b/third_party/jpeg-xl/lib/jxl/dct-inl.h new file mode 100644 index 0000000000..532606075e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dct-inl.h @@ -0,0 +1,334 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fast SIMD floating-point (I)DCT, any power of two. + +#if defined(LIB_JXL_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_DCT_INL_H_ +#undef LIB_JXL_DCT_INL_H_ +#else +#define LIB_JXL_DCT_INL_H_ +#endif + +#include + +#include + +#include "lib/jxl/dct_block-inl.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/transpose-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::NegMulAdd; +using hwy::HWY_NAMESPACE::Sub; + +template +struct FVImpl { + using type = HWY_CAPPED(float, SZ); +}; + +template <> +struct FVImpl<0> { + using type = HWY_FULL(float); +}; + +template +using FV = typename FVImpl::type; + +// Implementation of Lowest Complexity Self Recursive Radix-2 DCT II/III +// Algorithms, by Siriani M. Perera and Jianhua Liu. + +template +struct CoeffBundle { + static void AddReverse(const float* JXL_RESTRICT ain1, + const float* JXL_RESTRICT ain2, + float* JXL_RESTRICT aout) { + for (size_t i = 0; i < N; i++) { + auto in1 = Load(FV(), ain1 + i * SZ); + auto in2 = Load(FV(), ain2 + (N - i - 1) * SZ); + Store(Add(in1, in2), FV(), aout + i * SZ); + } + } + static void SubReverse(const float* JXL_RESTRICT ain1, + const float* JXL_RESTRICT ain2, + float* JXL_RESTRICT aout) { + for (size_t i = 0; i < N; i++) { + auto in1 = Load(FV(), ain1 + i * SZ); + auto in2 = Load(FV(), ain2 + (N - i - 1) * SZ); + Store(Sub(in1, in2), FV(), aout + i * SZ); + } + } + static void B(float* JXL_RESTRICT coeff) { + auto sqrt2 = Set(FV(), kSqrt2); + auto in1 = Load(FV(), coeff); + auto in2 = Load(FV(), coeff + SZ); + Store(MulAdd(in1, sqrt2, in2), FV(), coeff); + for (size_t i = 1; i + 1 < N; i++) { + auto in1 = Load(FV(), coeff + i * SZ); + auto in2 = Load(FV(), coeff + (i + 1) * SZ); + Store(Add(in1, in2), FV(), coeff + i * SZ); + } + } + static void BTranspose(float* JXL_RESTRICT coeff) { + for (size_t i = N - 1; i > 0; i--) { + auto in1 = Load(FV(), coeff + i * SZ); + auto in2 = Load(FV(), coeff + (i - 1) * SZ); + Store(Add(in1, in2), FV(), coeff + i * SZ); + } + auto sqrt2 = Set(FV(), kSqrt2); + auto in1 = Load(FV(), coeff); + Store(Mul(in1, sqrt2), FV(), coeff); + } + // Ideally optimized away by compiler (except the multiply). + static void InverseEvenOdd(const float* JXL_RESTRICT ain, + float* JXL_RESTRICT aout) { + for (size_t i = 0; i < N / 2; i++) { + auto in1 = Load(FV(), ain + i * SZ); + Store(in1, FV(), aout + 2 * i * SZ); + } + for (size_t i = N / 2; i < N; i++) { + auto in1 = Load(FV(), ain + i * SZ); + Store(in1, FV(), aout + (2 * (i - N / 2) + 1) * SZ); + } + } + // Ideally optimized away by compiler. + static void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride, + float* JXL_RESTRICT aout) { + for (size_t i = 0; i < N / 2; i++) { + auto in1 = LoadU(FV(), ain + 2 * i * ain_stride); + Store(in1, FV(), aout + i * SZ); + } + for (size_t i = N / 2; i < N; i++) { + auto in1 = LoadU(FV(), ain + (2 * (i - N / 2) + 1) * ain_stride); + Store(in1, FV(), aout + i * SZ); + } + } + // Invoked on full vector. + static void Multiply(float* JXL_RESTRICT coeff) { + for (size_t i = 0; i < N / 2; i++) { + auto in1 = Load(FV(), coeff + (N / 2 + i) * SZ); + auto mul = Set(FV(), WcMultipliers::kMultipliers[i]); + Store(Mul(in1, mul), FV(), coeff + (N / 2 + i) * SZ); + } + } + static void MultiplyAndAdd(const float* JXL_RESTRICT coeff, + float* JXL_RESTRICT out, size_t out_stride) { + for (size_t i = 0; i < N / 2; i++) { + auto mul = Set(FV(), WcMultipliers::kMultipliers[i]); + auto in1 = Load(FV(), coeff + i * SZ); + auto in2 = Load(FV(), coeff + (N / 2 + i) * SZ); + auto out1 = MulAdd(mul, in2, in1); + auto out2 = NegMulAdd(mul, in2, in1); + StoreU(out1, FV(), out + i * out_stride); + StoreU(out2, FV(), out + (N - i - 1) * out_stride); + } + } + template + static void LoadFromBlock(const Block& in, size_t off, + float* JXL_RESTRICT coeff) { + for (size_t i = 0; i < N; i++) { + Store(in.LoadPart(FV(), i, off), FV(), coeff + i * SZ); + } + } + template + static void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, + const Block& out, size_t off) { + auto mul = Set(FV(), 1.0f / N); + for (size_t i = 0; i < N; i++) { + out.StorePart(FV(), Mul(mul, Load(FV(), coeff + i * SZ)), i, off); + } + } +}; + +template +struct DCT1DImpl; + +template +struct DCT1DImpl<1, SZ> { + JXL_INLINE void operator()(float* JXL_RESTRICT mem) {} +}; + +template +struct DCT1DImpl<2, SZ> { + JXL_INLINE void operator()(float* JXL_RESTRICT mem) { + auto in1 = Load(FV(), mem); + auto in2 = Load(FV(), mem + SZ); + Store(Add(in1, in2), FV(), mem); + Store(Sub(in1, in2), FV(), mem + SZ); + } +}; + +template +struct DCT1DImpl { + void operator()(float* JXL_RESTRICT mem) { + // This is relatively small (4kB with 64-DCT and AVX-512) + HWY_ALIGN float tmp[N * SZ]; + CoeffBundle::AddReverse(mem, mem + N / 2 * SZ, tmp); + DCT1DImpl()(tmp); + CoeffBundle::SubReverse(mem, mem + N / 2 * SZ, tmp + N / 2 * SZ); + CoeffBundle::Multiply(tmp); + DCT1DImpl()(tmp + N / 2 * SZ); + CoeffBundle::B(tmp + N / 2 * SZ); + CoeffBundle::InverseEvenOdd(tmp, mem); + } +}; + +template +struct IDCT1DImpl; + +template +struct IDCT1DImpl<1, SZ> { + JXL_INLINE void operator()(const float* from, size_t from_stride, float* to, + size_t to_stride) { + StoreU(LoadU(FV(), from), FV(), to); + } +}; + +template +struct IDCT1DImpl<2, SZ> { + JXL_INLINE void operator()(const float* from, size_t from_stride, float* to, + size_t to_stride) { + JXL_DASSERT(from_stride >= SZ); + JXL_DASSERT(to_stride >= SZ); + auto in1 = LoadU(FV(), from); + auto in2 = LoadU(FV(), from + from_stride); + StoreU(Add(in1, in2), FV(), to); + StoreU(Sub(in1, in2), FV(), to + to_stride); + } +}; + +template +struct IDCT1DImpl { + void operator()(const float* from, size_t from_stride, float* to, + size_t to_stride) { + JXL_DASSERT(from_stride >= SZ); + JXL_DASSERT(to_stride >= SZ); + // This is relatively small (4kB with 64-DCT and AVX-512) + HWY_ALIGN float tmp[N * SZ]; + CoeffBundle::ForwardEvenOdd(from, from_stride, tmp); + IDCT1DImpl()(tmp, SZ, tmp, SZ); + CoeffBundle::BTranspose(tmp + N / 2 * SZ); + IDCT1DImpl()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ); + CoeffBundle::MultiplyAndAdd(tmp, to, to_stride); + } +}; + +template +void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) { + size_t M = M_or_0 != 0 ? M_or_0 : Mp; + constexpr size_t SZ = MaxLanes(FV()); + HWY_ALIGN float tmp[N * SZ]; + for (size_t i = 0; i < M; i += Lanes(FV())) { + // TODO(veluca): consider removing the temporary memory here (as is done in + // IDCT), if it turns out that some compilers don't optimize away the loads + // and this is performance-critical. + CoeffBundle::LoadFromBlock(from, i, tmp); + DCT1DImpl()(tmp); + CoeffBundle::StoreToBlockAndScale(tmp, to, i); + } +} + +template +void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) { + size_t M = M_or_0 != 0 ? M_or_0 : Mp; + constexpr size_t SZ = MaxLanes(FV()); + for (size_t i = 0; i < M; i += Lanes(FV())) { + IDCT1DImpl()(from.Address(0, i), from.Stride(), to.Address(0, i), + to.Stride()); + } +} + +template +struct DCT1D { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return DCT1DWrapper(from, to, M); + } +}; + +template +struct DCT1D MaxLanes(FV<0>()))>::type> { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return NoInlineWrapper(DCT1DWrapper, from, to, M); + } +}; + +template +struct IDCT1D { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return IDCT1DWrapper(from, to, M); + } +}; + +template +struct IDCT1D MaxLanes(FV<0>()))>::type> { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return NoInlineWrapper(IDCT1DWrapper, from, to, + M); + } +}; + +// Computes the maybe-transposed, scaled DCT of a block, that needs to be +// HWY_ALIGN'ed. +template +struct ComputeScaledDCT { + // scratch_space must be aligned, and should have space for ROWS*COLS + // floats. + template + HWY_MAYBE_UNUSED void operator()(const From& from, float* to, + float* JXL_RESTRICT scratch_space) { + float* JXL_RESTRICT block = scratch_space; + if (ROWS < COLS) { + DCT1D()(from, DCTTo(block, COLS)); + Transpose::Run(DCTFrom(block, COLS), DCTTo(to, ROWS)); + DCT1D()(DCTFrom(to, ROWS), DCTTo(block, ROWS)); + Transpose::Run(DCTFrom(block, ROWS), DCTTo(to, COLS)); + } else { + DCT1D()(from, DCTTo(to, COLS)); + Transpose::Run(DCTFrom(to, COLS), DCTTo(block, ROWS)); + DCT1D()(DCTFrom(block, ROWS), DCTTo(to, ROWS)); + } + } +}; +// Computes the maybe-transposed, scaled IDCT of a block, that needs to be +// HWY_ALIGN'ed. +template +struct ComputeScaledIDCT { + // scratch_space must be aligned, and should have space for ROWS*COLS + // floats. + template + HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to, + float* JXL_RESTRICT scratch_space) { + float* JXL_RESTRICT block = scratch_space; + // Reverse the steps done in ComputeScaledDCT. + if (ROWS < COLS) { + Transpose::Run(DCTFrom(from, COLS), DCTTo(block, ROWS)); + IDCT1D()(DCTFrom(block, ROWS), DCTTo(from, ROWS)); + Transpose::Run(DCTFrom(from, ROWS), DCTTo(block, COLS)); + IDCT1D()(DCTFrom(block, COLS), to); + } else { + IDCT1D()(DCTFrom(from, ROWS), DCTTo(block, ROWS)); + Transpose::Run(DCTFrom(block, ROWS), DCTTo(from, COLS)); + IDCT1D()(DCTFrom(from, COLS), to); + } + } +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); +#endif // LIB_JXL_DCT_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dct_block-inl.h b/third_party/jpeg-xl/lib/jxl/dct_block-inl.h new file mode 100644 index 0000000000..50646a737f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dct_block-inl.h @@ -0,0 +1,108 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Adapters for DCT input/output: from/to contiguous blocks or image rows. + +#if defined(LIB_JXL_DCT_BLOCK_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_DCT_BLOCK_INL_H_ +#undef LIB_JXL_DCT_BLOCK_INL_H_ +#else +#define LIB_JXL_DCT_BLOCK_INL_H_ +#endif + +#include + +#include + +#include "lib/jxl/base/status.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Vec; + +// Block: (x, y) <-> (N * y + x) +// Lines: (x, y) <-> (stride * y + x) +// +// I.e. Block is a specialization of Lines with fixed stride. +// +// FromXXX should implement Read and Load (Read vector). +// ToXXX should implement Write and Store (Write vector). + +template +using BlockDesc = HWY_CAPPED(float, N); + +// Here and in the following, the SZ template parameter specifies the number of +// values to load/store. Needed because we want to handle 4x4 sub-blocks of +// 16x16 blocks. +class DCTFrom { + public: + DCTFrom(const float* data, size_t stride) : stride_(stride), data_(data) {} + + template + HWY_INLINE Vec LoadPart(D, const size_t row, size_t i) const { + JXL_DASSERT(Lanes(D()) <= stride_); + // Since these functions are used also for DC, no alignment at all is + // guaranteed in the case of floating blocks. + // TODO(veluca): consider using a different class for DC-to-LF and + // DC-from-LF, or copying DC values to/from a temporary aligned location. + return LoadU(D(), Address(row, i)); + } + + HWY_INLINE float Read(const size_t row, const size_t i) const { + return *Address(row, i); + } + + constexpr HWY_INLINE const float* Address(const size_t row, + const size_t i) const { + return data_ + row * stride_ + i; + } + + size_t Stride() const { return stride_; } + + private: + size_t stride_; + const float* JXL_RESTRICT data_; +}; + +class DCTTo { + public: + DCTTo(float* data, size_t stride) : stride_(stride), data_(data) {} + + template + HWY_INLINE void StorePart(D, const Vec& v, const size_t row, + size_t i) const { + JXL_DASSERT(Lanes(D()) <= stride_); + // Since these functions are used also for DC, no alignment at all is + // guaranteed in the case of floating blocks. + // TODO(veluca): consider using a different class for DC-to-LF and + // DC-from-LF, or copying DC values to/from a temporary aligned location. + StoreU(v, D(), Address(row, i)); + } + + HWY_INLINE void Write(float v, const size_t row, const size_t i) const { + *Address(row, i) = v; + } + + constexpr HWY_INLINE float* Address(const size_t row, const size_t i) const { + return data_ + row * stride_ + i; + } + + size_t Stride() const { return stride_; } + + private: + size_t stride_; + float* JXL_RESTRICT data_; +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_DCT_BLOCK_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dct_for_test.h b/third_party/jpeg-xl/lib/jxl/dct_for_test.h new file mode 100644 index 0000000000..8e32aa7eff --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dct_for_test.h @@ -0,0 +1,99 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DCT_FOR_TEST_H_ +#define LIB_JXL_DCT_FOR_TEST_H_ + +// Unoptimized DCT only for use in tests. + +#include // memcpy + +#include +#include + +#include "lib/jxl/common.h" // Pi + +namespace jxl { + +namespace test { +static inline double alpha(int u) { return u == 0 ? 0.7071067811865475 : 1.0; } + +// N-DCT on M columns, divided by sqrt(N). Matches the definition in the spec. +template +void DCT1D(double block[N * M], double out[N * M]) { + std::vector matrix(N * N); + const double scale = std::sqrt(2.0) / N; + for (size_t y = 0; y < N; y++) { + for (size_t u = 0; u < N; u++) { + matrix[N * u + y] = alpha(u) * cos((y + 0.5) * u * Pi(1.0 / N)) * scale; + } + } + for (size_t x = 0; x < M; x++) { + for (size_t u = 0; u < N; u++) { + out[M * u + x] = 0; + for (size_t y = 0; y < N; y++) { + out[M * u + x] += matrix[N * u + y] * block[M * y + x]; + } + } + } +} + +// N-IDCT on M columns, multiplied by sqrt(N). Matches the definition in the +// spec. +template +void IDCT1D(double block[N * M], double out[N * M]) { + std::vector matrix(N * N); + const double scale = std::sqrt(2.0); + for (size_t y = 0; y < N; y++) { + for (size_t u = 0; u < N; u++) { + // Transpose of DCT matrix. + matrix[N * y + u] = alpha(u) * cos((y + 0.5) * u * Pi(1.0 / N)) * scale; + } + } + for (size_t x = 0; x < M; x++) { + for (size_t u = 0; u < N; u++) { + out[M * u + x] = 0; + for (size_t y = 0; y < N; y++) { + out[M * u + x] += matrix[N * u + y] * block[M * y + x]; + } + } + } +} + +template +void TransposeBlock(double in[N * M], double out[M * N]) { + for (size_t x = 0; x < N; x++) { + for (size_t y = 0; y < M; y++) { + out[y * N + x] = in[x * M + y]; + } + } +} +} // namespace test + +// Untransposed DCT. +template +void DCTSlow(double block[N * N]) { + constexpr size_t kBlockSize = N * N; + std::vector g(kBlockSize); + test::DCT1D(block, g.data()); + test::TransposeBlock(g.data(), block); + test::DCT1D(block, g.data()); + test::TransposeBlock(g.data(), block); +} + +// Untransposed IDCT. +template +void IDCTSlow(double block[N * N]) { + constexpr size_t kBlockSize = N * N; + std::vector g(kBlockSize); + test::IDCT1D(block, g.data()); + test::TransposeBlock(g.data(), block); + test::IDCT1D(block, g.data()); + test::TransposeBlock(g.data(), block); +} + +} // namespace jxl + +#endif // LIB_JXL_DCT_FOR_TEST_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dct_scales.cc b/third_party/jpeg-xl/lib/jxl/dct_scales.cc new file mode 100644 index 0000000000..f9e89a6014 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dct_scales.cc @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dct_scales.h" + +namespace jxl { + +// Definition of constexpr arrays. +constexpr float DCTResampleScales<1, 8>::kScales[]; +constexpr float DCTResampleScales<2, 16>::kScales[]; +constexpr float DCTResampleScales<4, 32>::kScales[]; +constexpr float DCTResampleScales<8, 64>::kScales[]; +constexpr float DCTResampleScales<16, 128>::kScales[]; +constexpr float DCTResampleScales<32, 256>::kScales[]; +constexpr float DCTResampleScales<8, 1>::kScales[]; +constexpr float DCTResampleScales<16, 2>::kScales[]; +constexpr float DCTResampleScales<32, 4>::kScales[]; +constexpr float DCTResampleScales<64, 8>::kScales[]; +constexpr float DCTResampleScales<128, 16>::kScales[]; +constexpr float DCTResampleScales<256, 32>::kScales[]; +constexpr float WcMultipliers<4>::kMultipliers[]; +constexpr float WcMultipliers<8>::kMultipliers[]; +constexpr float WcMultipliers<16>::kMultipliers[]; +constexpr float WcMultipliers<32>::kMultipliers[]; +constexpr float WcMultipliers<64>::kMultipliers[]; +constexpr float WcMultipliers<128>::kMultipliers[]; +constexpr float WcMultipliers<256>::kMultipliers[]; + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dct_scales.h b/third_party/jpeg-xl/lib/jxl/dct_scales.h new file mode 100644 index 0000000000..23af03d60f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dct_scales.h @@ -0,0 +1,379 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DCT_SCALES_H_ +#define LIB_JXL_DCT_SCALES_H_ + +// Scaling factors. + +#include + +namespace jxl { + +static constexpr float kSqrt2 = 1.41421356237f; +static constexpr float kSqrt0_5 = 0.70710678118f; + +// For n != 0, the n-th basis function of a N-DCT, evaluated in pixel k, has a +// value of cos((k+1/2) n/(2N) pi). When downsampling by 2x, we average +// the values for pixel k and k+1 to get the value for pixel (k/2), thus we get +// +// [cos((k+1/2) n/N pi) + cos((k+3/2) n/N pi)]/2 = +// cos(n/(2N) pi) cos((k+1) n/N pi) = +// cos(n/(2N) pi) cos(((k/2)+1/2) n/(N/2) pi) +// +// which is exactly the same as the value of pixel k/2 of a N/2-sized DCT, +// except for the cos(n/(2N) pi) scaling factor (which does *not* +// depend on the pixel). Thus, when using the lower-frequency coefficients of a +// DCT-N to compute a DCT-(N/2), they should be scaled by this constant. Scaling +// factors for a DCT-(N/4) etc can then be obtained by successive +// multiplications. The structs below contain the above-mentioned scaling +// factors. +// +// Python code for the tables below: +// +// for i in range(N // 8): +// v = math.cos(i / (2 * N) * math.pi) +// v *= math.cos(i / (N) * math.pi) +// v *= math.cos(i / (N / 2) * math.pi) +// print(v, end=", ") + +template +struct DCTResampleScales; + +template <> +struct DCTResampleScales<8, 1> { + static constexpr float kScales[] = { + 1.000000000000000000, + }; +}; + +template <> +struct DCTResampleScales<16, 2> { + static constexpr float kScales[] = { + 1.000000000000000000, + 0.901764195028874394, + }; +}; + +template <> +struct DCTResampleScales<32, 4> { + static constexpr float kScales[] = { + 1.000000000000000000, + 0.974886821136879522, + 0.901764195028874394, + 0.787054918159101335, + }; +}; + +template <> +struct DCTResampleScales<64, 8> { + static constexpr float kScales[] = { + 1.0000000000000000, 0.9936866130906366, 0.9748868211368796, + 0.9440180941651672, 0.9017641950288744, 0.8490574973847023, + 0.7870549181591013, 0.7171081282466044, + }; +}; + +template <> +struct DCTResampleScales<128, 16> { + static constexpr float kScales[] = { + 1.0, + 0.9984194528776054, + 0.9936866130906366, + 0.9858278282666936, + 0.9748868211368796, + 0.9609244059440204, + 0.9440180941651672, + 0.9242615922757944, + 0.9017641950288744, + 0.8766500784429904, + 0.8490574973847023, + 0.8191378932865928, + 0.7870549181591013, + 0.7529833816270532, + 0.7171081282466044, + 0.6796228528314651, + }; +}; + +template <> +struct DCTResampleScales<256, 32> { + static constexpr float kScales[] = { + 1.0, + 0.9996047255830407, + 0.9984194528776054, + 0.9964458326264695, + 0.9936866130906366, + 0.9901456355893141, + 0.9858278282666936, + 0.9807391980963174, + 0.9748868211368796, + 0.9682788310563117, + 0.9609244059440204, + 0.9528337534340876, + 0.9440180941651672, + 0.9344896436056892, + 0.9242615922757944, + 0.913348084400198, + 0.9017641950288744, + 0.8895259056651056, + 0.8766500784429904, + 0.8631544288990163, + 0.8490574973847023, + 0.8343786191696513, + 0.8191378932865928, + 0.8033561501721485, + 0.7870549181591013, + 0.7702563888779096, + 0.7529833816270532, + 0.7352593067735488, + 0.7171081282466044, + 0.6985543251889097, + 0.6796228528314651, + 0.6603391026591464, + }; +}; + +// Inverses of the above. +template <> +struct DCTResampleScales<1, 8> { + static constexpr float kScales[] = { + 1.000000000000000000, + }; +}; + +template <> +struct DCTResampleScales<2, 16> { + static constexpr float kScales[] = { + 1.000000000000000000, + 1.108937353592731823, + }; +}; + +template <> +struct DCTResampleScales<4, 32> { + static constexpr float kScales[] = { + 1.000000000000000000, + 1.025760096781116015, + 1.108937353592731823, + 1.270559368765487251, + }; +}; + +template <> +struct DCTResampleScales<8, 64> { + static constexpr float kScales[] = { + 1.0000000000000000, 1.0063534990068217, 1.0257600967811158, + 1.0593017296817173, 1.1089373535927318, 1.1777765381970435, + 1.2705593687654873, 1.3944898413647777, + }; +}; + +template <> +struct DCTResampleScales<16, 128> { + static constexpr float kScales[] = { + 1.0, + 1.0015830492062623, + 1.0063534990068217, + 1.0143759095928793, + 1.0257600967811158, + 1.0406645869480142, + 1.0593017296817173, + 1.0819447744633812, + 1.1089373535927318, + 1.1407059950032632, + 1.1777765381970435, + 1.2207956782315876, + 1.2705593687654873, + 1.3280505578213306, + 1.3944898413647777, + 1.4714043176061107, + }; +}; + +template <> +struct DCTResampleScales<32, 256> { + static constexpr float kScales[] = { + 1.0, + 1.0003954307206069, + 1.0015830492062623, + 1.0035668445360069, + 1.0063534990068217, + 1.009952439375063, + 1.0143759095928793, + 1.0196390660647288, + 1.0257600967811158, + 1.0327603660498115, + 1.0406645869480142, + 1.049501024072585, + 1.0593017296817173, + 1.0701028169146336, + 1.0819447744633812, + 1.0948728278734026, + 1.1089373535927318, + 1.124194353004584, + 1.1407059950032632, + 1.158541237256391, + 1.1777765381970435, + 1.1984966740820495, + 1.2207956782315876, + 1.244777922949508, + 1.2705593687654873, + 1.2982690107339132, + 1.3280505578213306, + 1.3600643892400104, + 1.3944898413647777, + 1.4315278911623237, + 1.4714043176061107, + 1.5143734423314616, + }; +}; + +// Constants for DCT implementation. Generated by the following snippet: +// for i in range(N // 2): +// print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ") +template +struct WcMultipliers; + +template <> +struct WcMultipliers<4> { + static constexpr float kMultipliers[] = { + 0.541196100146197, + 1.3065629648763764, + }; +}; + +template <> +struct WcMultipliers<8> { + static constexpr float kMultipliers[] = { + 0.5097955791041592, + 0.6013448869350453, + 0.8999762231364156, + 2.5629154477415055, + }; +}; + +template <> +struct WcMultipliers<16> { + static constexpr float kMultipliers[] = { + 0.5024192861881557, 0.5224986149396889, 0.5669440348163577, + 0.6468217833599901, 0.7881546234512502, 1.060677685990347, + 1.7224470982383342, 5.101148618689155, + }; +}; + +template <> +struct WcMultipliers<32> { + static constexpr float kMultipliers[] = { + 0.5006029982351963, 0.5054709598975436, 0.5154473099226246, + 0.5310425910897841, 0.5531038960344445, 0.5829349682061339, + 0.6225041230356648, 0.6748083414550057, 0.7445362710022986, + 0.8393496454155268, 0.9725682378619608, 1.1694399334328847, + 1.4841646163141662, 2.057781009953411, 3.407608418468719, + 10.190008123548033, + }; +}; +template <> +struct WcMultipliers<64> { + static constexpr float kMultipliers[] = { + 0.500150636020651, 0.5013584524464084, 0.5037887256810443, + 0.5074711720725553, 0.5124514794082247, 0.5187927131053328, + 0.52657731515427, 0.535909816907992, 0.5469204379855088, + 0.5597698129470802, 0.57465518403266, 0.5918185358574165, + 0.6115573478825099, 0.6342389366884031, 0.6603198078137061, + 0.6903721282002123, 0.7251205223771985, 0.7654941649730891, + 0.8127020908144905, 0.8683447152233481, 0.9345835970364075, + 1.0144082649970547, 1.1120716205797176, 1.233832737976571, + 1.3892939586328277, 1.5939722833856311, 1.8746759800084078, + 2.282050068005162, 2.924628428158216, 4.084611078129248, + 6.796750711673633, 20.373878167231453, + }; +}; +template <> +struct WcMultipliers<128> { + static constexpr float kMultipliers[] = { + 0.5000376519155477, 0.5003390374428216, 0.5009427176380873, + 0.5018505174842379, 0.5030651913013697, 0.5045904432216454, + 0.5064309549285542, 0.5085924210498143, 0.5110815927066812, + 0.5139063298475396, 0.5170756631334912, 0.5205998663018917, + 0.524490540114724, 0.5287607092074876, 0.5334249333971333, + 0.538499435291984, 0.5440022463817783, 0.549953374183236, + 0.5563749934898856, 0.5632916653417023, 0.5707305880121454, + 0.5787218851348208, 0.5872989370937893, 0.5964987630244563, + 0.606362462272146, 0.6169357260050706, 0.6282694319707711, + 0.6404203382416639, 0.6534518953751283, 0.6674352009263413, + 0.6824501259764195, 0.6985866506472291, 0.7159464549705746, + 0.7346448236478627, 0.7548129391165311, 0.776600658233963, + 0.8001798956216941, 0.8257487738627852, 0.8535367510066064, + 0.8838110045596234, 0.9168844461846523, 0.9531258743921193, + 0.9929729612675466, 1.036949040910389, 1.0856850642580145, + 1.1399486751015042, 1.2006832557294167, 1.2690611716991191, + 1.346557628206286, 1.4350550884414341, 1.5369941008524954, + 1.6555965242641195, 1.7952052190778898, 1.961817848571166, + 2.163957818751979, 2.4141600002500763, 2.7316450287739396, + 3.147462191781909, 3.7152427383269746, 4.5362909369693565, + 5.827688377844654, 8.153848602466814, 13.58429025728446, + 40.744688103351834, + }; +}; + +template <> +struct WcMultipliers<256> { + static constexpr float kMultipliers[128] = { + 0.5000094125358878, 0.500084723455784, 0.5002354020255269, + 0.5004615618093246, 0.5007633734146156, 0.5011410648064231, + 0.5015949217281668, 0.502125288230386, 0.5027325673091954, + 0.5034172216566842, 0.5041797745258774, 0.5050208107132756, + 0.5059409776624396, 0.5069409866925212, 0.5080216143561264, + 0.509183703931388, 0.5104281670536573, 0.5117559854927805, + 0.5131682130825206, 0.5146659778093218, 0.516250484068288, + 0.5179230150949777, 0.5196849355823947, 0.5215376944933958, + 0.5234828280796439, 0.52552196311921, 0.5276568203859896, + 0.5298892183652453, 0.5322210772308335, 0.5346544231010253, + 0.537191392591309, 0.5398342376841637, 0.5425853309375497, + 0.545447171055775, 0.5484223888484947, 0.551513753605893, + 0.554724179920619, 0.5580567349898085, 0.5615146464335654, + 0.5651013106696203, 0.5688203018875696, 0.5726753816701664, + 0.5766705093136241, 0.5808098529038624, 0.5850978012111273, + 0.58953897647151, 0.5941382481306648, 0.5989007476325463, + 0.6038318843443582, 0.6089373627182432, 0.614223200800649, + 0.6196957502119484, 0.6253617177319102, 0.6312281886412079, + 0.6373026519855411, 0.6435930279473415, 0.6501076975307724, + 0.6568555347890955, 0.6638459418498757, 0.6710888870233562, + 0.6785949463131795, 0.6863753486870501, 0.6944420255086364, + 0.7028076645818034, 0.7114857693151208, 0.7204907235796304, + 0.7298378629074134, 0.7395435527641373, 0.749625274727372, + 0.7601017215162176, 0.7709929019493761, 0.7823202570613161, + 0.7941067887834509, 0.8063772028037925, 0.8191580674598145, + 0.83247799080191, 0.8463678182968619, 0.860860854031955, + 0.8759931087426972, 0.8918035785352535, 0.9083345588266809, + 0.9256319988042384, 0.9437459026371479, 0.962730784794803, + 0.9826461881778968, 1.0035572754078206, 1.0255355056139732, + 1.048659411496106, 1.0730154944316674, 1.0986992590905857, + 1.1258164135986009, 1.1544842669978943, 1.184833362908442, + 1.217009397314603, 1.2511754798461228, 1.287514812536712, + 1.326233878832723, 1.3675662599582539, 1.411777227500661, + 1.459169302866857, 1.5100890297227016, 1.5649352798258847, + 1.6241695131835794, 1.6883285509131505, 1.7580406092704062, + 1.8340456094306077, 1.9172211551275689, 2.0086161135167564, + 2.1094945286246385, 2.22139377701127, 2.346202662531156, + 2.486267909203593, 2.644541877144861, 2.824791402350551, + 3.0318994541759925, 3.2723115884254845, 3.5547153325075804, + 3.891107790700307, 4.298537526449054, 4.802076008665048, + 5.440166215091329, 6.274908408039339, 7.413566756422303, + 9.058751453879703, 11.644627325175037, 16.300023088031555, + 27.163977662448232, 81.48784219222516, + }; +}; + +// Apply the DCT algorithm-intrinsic constants to DCTResampleScale. +template +constexpr float DCTTotalResampleScale(size_t x) { + return DCTResampleScales::kScales[x]; +} + +} // namespace jxl + +#endif // LIB_JXL_DCT_SCALES_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dct_test.cc b/third_party/jpeg-xl/lib/jxl/dct_test.cc new file mode 100644 index 0000000000..9f5eff41e9 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dct_test.cc @@ -0,0 +1,389 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dct_test.cc" +#include +#include +#include + +#include "lib/jxl/common.h" +#include "lib/jxl/dct-inl.h" +#include "lib/jxl/dct_for_test.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/image.h" +#include "lib/jxl/test_utils.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// Computes the in-place NxN DCT of block. +// Requires that block is HWY_ALIGN'ed. +// +// Performs ComputeTransposedScaledDCT and then transposes and scales it to +// obtain "vanilla" DCT. +template +void ComputeDCT(float block[N * N]) { + HWY_ALIGN float tmp_block[N * N]; + HWY_ALIGN float scratch_space[N * N]; + ComputeScaledDCT()(DCTFrom(block, N), tmp_block, scratch_space); + + // Untranspose. + Transpose::Run(DCTFrom(tmp_block, N), DCTTo(block, N)); +} + +// Computes the in-place 8x8 iDCT of block. +// Requires that block is HWY_ALIGN'ed. +template +void ComputeIDCT(float block[N * N]) { + HWY_ALIGN float tmp_block[N * N]; + HWY_ALIGN float scratch_space[N * N]; + // Untranspose. + Transpose::Run(DCTFrom(block, N), DCTTo(tmp_block, N)); + + ComputeScaledIDCT()(tmp_block, DCTTo(block, N), scratch_space); +} + +template +void TransposeTestT(float accuracy) { + constexpr size_t kBlockSize = N * N; + HWY_ALIGN float src[kBlockSize]; + DCTTo to_src(src, N); + for (size_t y = 0; y < N; ++y) { + for (size_t x = 0; x < N; ++x) { + to_src.Write(y * N + x, y, x); + } + } + HWY_ALIGN float dst[kBlockSize]; + Transpose::Run(DCTFrom(src, N), DCTTo(dst, N)); + DCTFrom from_dst(dst, N); + for (size_t y = 0; y < N; ++y) { + for (size_t x = 0; x < N; ++x) { + float expected = x * N + y; + float actual = from_dst.Read(y, x); + EXPECT_NEAR(expected, actual, accuracy) << "x = " << x << ", y = " << y; + } + } +} + +void TransposeTest() { + TransposeTestT<8>(1e-7f); + TransposeTestT<16>(1e-7f); + TransposeTestT<32>(1e-7f); +} + +template +void ColumnDctRoundtripT(float accuracy) { + constexpr size_t kBlockSize = N * N; + // Though we are only interested in single column result, dct.h has built-in + // limit on minimal number of columns processed. So, to be safe, we do + // regular 8x8 block transformation. On the bright side - we could check all + // 8 basis vectors at once. + HWY_ALIGN float block[kBlockSize]; + DCTTo to(block, N); + DCTFrom from(block, N); + for (size_t i = 0; i < N; ++i) { + for (size_t j = 0; j < N; ++j) { + to.Write((i == j) ? 1.0f : 0.0f, i, j); + } + } + + // Running (I)DCT on the same memory block seems to trigger a compiler bug on + // ARMv7 with clang6. + HWY_ALIGN float tmp[kBlockSize]; + DCTTo to_tmp(tmp, N); + DCTFrom from_tmp(tmp, N); + + DCT1D()(from, to_tmp); + IDCT1D()(from_tmp, to); + + for (size_t i = 0; i < N; ++i) { + for (size_t j = 0; j < N; ++j) { + float expected = (i == j) ? 1.0f : 0.0f; + float actual = from.Read(i, j); + EXPECT_NEAR(expected, actual, accuracy) << " i=" << i << ", j=" << j; + } + } +} + +void ColumnDctRoundtrip() { + ColumnDctRoundtripT<8>(1e-6f); + ColumnDctRoundtripT<16>(1e-6f); + ColumnDctRoundtripT<32>(1e-6f); +} + +template +void TestDctAccuracy(float accuracy, size_t start = 0, size_t end = N * N) { + constexpr size_t kBlockSize = N * N; + for (size_t i = start; i < end; i++) { + HWY_ALIGN float fast[kBlockSize] = {0.0f}; + double slow[kBlockSize] = {0.0}; + fast[i] = 1.0; + slow[i] = 1.0; + DCTSlow(slow); + ComputeDCT(fast); + for (size_t k = 0; k < kBlockSize; ++k) { + EXPECT_NEAR(fast[k], slow[k], accuracy / N) + << "i = " << i << ", k = " << k << ", N = " << N; + } + } +} + +template +void TestIdctAccuracy(float accuracy, size_t start = 0, size_t end = N * N) { + constexpr size_t kBlockSize = N * N; + for (size_t i = start; i < end; i++) { + HWY_ALIGN float fast[kBlockSize] = {0.0f}; + double slow[kBlockSize] = {0.0}; + fast[i] = 1.0; + slow[i] = 1.0; + IDCTSlow(slow); + ComputeIDCT(fast); + for (size_t k = 0; k < kBlockSize; ++k) { + EXPECT_NEAR(fast[k], slow[k], accuracy * N) + << "i = " << i << ", k = " << k << ", N = " << N; + } + } +} + +template +void TestInverseT(float accuracy) { + test::ThreadPoolForTests pool(N < 32 ? 0 : 8); + enum { kBlockSize = N * N }; + EXPECT_TRUE(RunOnPool( + &pool, 0, kBlockSize, ThreadPool::NoInit, + [accuracy](const uint32_t task, size_t /*thread*/) { + const size_t i = static_cast(task); + HWY_ALIGN float x[kBlockSize] = {0.0f}; + x[i] = 1.0; + + ComputeIDCT(x); + ComputeDCT(x); + + for (size_t k = 0; k < kBlockSize; ++k) { + EXPECT_NEAR(x[k], (k == i) ? 1.0f : 0.0f, accuracy) + << "i = " << i << ", k = " << k; + } + }, + "TestInverse")); +} + +void InverseTest() { + TestInverseT<8>(1e-6f); + TestInverseT<16>(1e-6f); + TestInverseT<32>(3e-6f); +} + +template +void TestDctTranspose(float accuracy, size_t start = 0, size_t end = N * N) { + constexpr size_t kBlockSize = N * N; + for (size_t i = start; i < end; i++) { + for (size_t j = 0; j < kBlockSize; ++j) { + // We check that = . + // That means (Me_j)_i = (M^\dagger{}e_i)_j + + // x := Me_j + HWY_ALIGN float x[kBlockSize] = {0.0f}; + x[j] = 1.0; + ComputeIDCT(x); + // y := M^\dagger{}e_i + HWY_ALIGN float y[kBlockSize] = {0.0f}; + y[i] = 1.0; + ComputeDCT(y); + + EXPECT_NEAR(x[i] / N, y[j] * N, accuracy) << "i = " << i << ", j = " << j; + } + } +} + +template +void TestSlowInverse(float accuracy, size_t start = 0, size_t end = N * N) { + constexpr size_t kBlockSize = N * N; + for (size_t i = start; i < end; i++) { + double x[kBlockSize] = {0.0f}; + x[i] = 1.0; + + DCTSlow(x); + IDCTSlow(x); + + for (size_t k = 0; k < kBlockSize; ++k) { + EXPECT_NEAR(x[k], (k == i) ? 1.0f : 0.0f, accuracy) + << "i = " << i << ", k = " << k; + } + } +} + +template +void TestRectInverseT(float accuracy) { + constexpr size_t kBlockSize = ROWS * COLS; + for (size_t i = 0; i < kBlockSize; ++i) { + HWY_ALIGN float x[kBlockSize] = {0.0f}; + HWY_ALIGN float out[kBlockSize] = {0.0f}; + x[i] = 1.0; + HWY_ALIGN float coeffs[kBlockSize] = {0.0f}; + HWY_ALIGN float scratch_space[kBlockSize * 2]; + + ComputeScaledDCT()(DCTFrom(x, COLS), coeffs, scratch_space); + ComputeScaledIDCT()(coeffs, DCTTo(out, COLS), scratch_space); + + for (size_t k = 0; k < kBlockSize; ++k) { + EXPECT_NEAR(out[k], (k == i) ? 1.0f : 0.0f, accuracy) + << "i = " << i << ", k = " << k << " ROWS = " << ROWS + << " COLS = " << COLS; + } + } +} + +void TestRectInverse() { + TestRectInverseT<16, 32>(1e-6f); + TestRectInverseT<8, 32>(1e-6f); + TestRectInverseT<8, 16>(1e-6f); + TestRectInverseT<4, 8>(1e-6f); + TestRectInverseT<2, 4>(1e-6f); + TestRectInverseT<1, 4>(1e-6f); + TestRectInverseT<1, 2>(1e-6f); + + TestRectInverseT<32, 16>(1e-6f); + TestRectInverseT<32, 8>(1e-6f); + TestRectInverseT<16, 8>(1e-6f); + TestRectInverseT<8, 4>(1e-6f); + TestRectInverseT<4, 2>(1e-6f); + TestRectInverseT<4, 1>(1e-6f); + TestRectInverseT<2, 1>(1e-6f); +} + +template +void TestRectTransposeT(float accuracy) { + constexpr size_t kBlockSize = ROWS * COLS; + HWY_ALIGN float scratch_space[kBlockSize * 2]; + for (size_t px = 0; px < COLS; ++px) { + for (size_t py = 0; py < ROWS; ++py) { + HWY_ALIGN float x1[kBlockSize] = {0.0f}; + HWY_ALIGN float x2[kBlockSize] = {0.0f}; + HWY_ALIGN float coeffs1[kBlockSize] = {0.0f}; + HWY_ALIGN float coeffs2[kBlockSize] = {0.0f}; + x1[py * COLS + px] = 1; + x2[px * ROWS + py] = 1; + + constexpr size_t OUT_ROWS = ROWS < COLS ? ROWS : COLS; + constexpr size_t OUT_COLS = ROWS < COLS ? COLS : ROWS; + + ComputeScaledDCT()(DCTFrom(x1, COLS), coeffs1, scratch_space); + ComputeScaledDCT()(DCTFrom(x2, ROWS), coeffs2, scratch_space); + + for (size_t x = 0; x < OUT_COLS; ++x) { + for (size_t y = 0; y < OUT_ROWS; ++y) { + EXPECT_NEAR(coeffs1[y * OUT_COLS + x], coeffs2[y * OUT_COLS + x], + accuracy) + << " px = " << px << ", py = " << py << ", x = " << x + << ", y = " << y; + } + } + } + } +} + +void TestRectTranspose() { + TestRectTransposeT<16, 32>(1e-6f); + TestRectTransposeT<8, 32>(1e-6f); + TestRectTransposeT<8, 16>(1e-6f); + TestRectTransposeT<4, 8>(1e-6f); + TestRectTransposeT<2, 4>(1e-6f); + TestRectTransposeT<1, 4>(1e-6f); + TestRectTransposeT<1, 2>(1e-6f); + + // Identical to 8, 16 + // TestRectTranspose<16, 8>(1e-6f); +} + +void TestDctAccuracyShard(size_t shard) { + if (shard == 0) { + TestDctAccuracy<1>(1.1E-7f); + TestDctAccuracy<2>(1.1E-7f); + TestDctAccuracy<4>(1.1E-7f); + TestDctAccuracy<8>(1.1E-7f); + TestDctAccuracy<16>(1.3E-7f); + } + TestDctAccuracy<32>(1.1E-7f, 32 * shard, 32 * (shard + 1)); +} + +void TestIdctAccuracyShard(size_t shard) { + if (shard == 0) { + TestIdctAccuracy<1>(1E-7f); + TestIdctAccuracy<2>(1E-7f); + TestIdctAccuracy<4>(1E-7f); + TestIdctAccuracy<8>(1E-7f); + TestIdctAccuracy<16>(1E-7f); + } + TestIdctAccuracy<32>(1E-7f, 32 * shard, 32 * (shard + 1)); +} + +void TestDctTransposeShard(size_t shard) { + if (shard == 0) { + TestDctTranspose<8>(1E-6f); + TestDctTranspose<16>(1E-6f); + } + TestDctTranspose<32>(3E-6f, 32 * shard, 32 * (shard + 1)); +} + +void TestSlowInverseShard(size_t shard) { + if (shard == 0) { + TestSlowInverse<1>(1E-5f); + TestSlowInverse<2>(1E-5f); + TestSlowInverse<4>(1E-5f); + TestSlowInverse<8>(1E-5f); + TestSlowInverse<16>(1E-5f); + } + TestSlowInverse<32>(1E-5f, 32 * shard, 32 * (shard + 1)); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class TransposeTest : public hwy::TestWithParamTarget {}; + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(TransposeTest); + +HWY_EXPORT_AND_TEST_P(TransposeTest, TransposeTest); +HWY_EXPORT_AND_TEST_P(TransposeTest, InverseTest); +HWY_EXPORT_AND_TEST_P(TransposeTest, ColumnDctRoundtrip); +HWY_EXPORT_AND_TEST_P(TransposeTest, TestRectInverse); +HWY_EXPORT_AND_TEST_P(TransposeTest, TestRectTranspose); + +// Tests in the DctShardedTest class are sharded for N=32. +class DctShardedTest : public ::hwy::TestWithParamTargetAndT {}; + +std::vector ShardRange(uint32_t n) { +#ifdef JXL_DISABLE_SLOW_TESTS + JXL_ASSERT(n > 6); + std::vector ret = {0, 1, 3, 5, n - 1}; +#else + std::vector ret(n); + std::iota(ret.begin(), ret.end(), 0); +#endif // JXL_DISABLE_SLOW_TESTS + return ret; +} + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(DctShardedTest, + ::testing::ValuesIn(ShardRange(32))); + +HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestDctAccuracyShard); +HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestIdctAccuracyShard); +HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestDctTransposeShard); +HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestSlowInverseShard); + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/dct_util.h b/third_party/jpeg-xl/lib/jxl/dct_util.h new file mode 100644 index 0000000000..fb6ce3b971 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dct_util.h @@ -0,0 +1,86 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DCT_UTIL_H_ +#define LIB_JXL_DCT_UTIL_H_ + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +union ACPtr { + int32_t* ptr32; + int16_t* ptr16; + ACPtr() = default; + explicit ACPtr(int16_t* p) : ptr16(p) {} + explicit ACPtr(int32_t* p) : ptr32(p) {} +}; + +union ConstACPtr { + const int32_t* ptr32; + const int16_t* ptr16; + ConstACPtr() = default; + explicit ConstACPtr(const int16_t* p) : ptr16(p) {} + explicit ConstACPtr(const int32_t* p) : ptr32(p) {} +}; + +enum class ACType { k16 = 0, k32 = 1 }; + +class ACImage { + public: + virtual ~ACImage() = default; + virtual ACType Type() const = 0; + virtual ACPtr PlaneRow(size_t c, size_t y, size_t xbase) = 0; + virtual ConstACPtr PlaneRow(size_t c, size_t y, size_t xbase) const = 0; + virtual size_t PixelsPerRow() const = 0; + virtual void ZeroFill() = 0; + virtual void ZeroFillPlane(size_t c) = 0; + virtual bool IsEmpty() const = 0; +}; + +template +class ACImageT final : public ACImage { + public: + ACImageT() = default; + ACImageT(size_t xsize, size_t ysize) { + static_assert( + std::is_same::value || std::is_same::value, + "ACImage must be either 32- or 16- bit"); + img_ = Image3(xsize, ysize); + } + ACType Type() const override { + return sizeof(T) == 2 ? ACType::k16 : ACType::k32; + } + ACPtr PlaneRow(size_t c, size_t y, size_t xbase) override { + return ACPtr(img_.PlaneRow(c, y) + xbase); + } + ConstACPtr PlaneRow(size_t c, size_t y, size_t xbase) const override { + return ConstACPtr(img_.PlaneRow(c, y) + xbase); + } + + size_t PixelsPerRow() const override { return img_.PixelsPerRow(); } + + void ZeroFill() override { ZeroFillImage(&img_); } + + void ZeroFillPlane(size_t c) override { ZeroFillImage(&img_.Plane(c)); } + + bool IsEmpty() const override { + return img_.xsize() == 0 || img_.ysize() == 0; + } + + private: + Image3 img_; +}; + +} // namespace jxl + +#endif // LIB_JXL_DCT_UTIL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_ans.cc b/third_party/jpeg-xl/lib/jxl/dec_ans.cc new file mode 100644 index 0000000000..c9145472e0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_ans.cc @@ -0,0 +1,374 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_ans.h" + +#include + +#include + +#include "lib/jxl/ans_common.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_context_map.h" +#include "lib/jxl/fields.h" + +namespace jxl { +namespace { + +// Decodes a number in the range [0..255], by reading 1 - 11 bits. +inline int DecodeVarLenUint8(BitReader* input) { + if (input->ReadFixedBits<1>()) { + int nbits = static_cast(input->ReadFixedBits<3>()); + if (nbits == 0) { + return 1; + } else { + return static_cast(input->ReadBits(nbits)) + (1 << nbits); + } + } + return 0; +} + +// Decodes a number in the range [0..65535], by reading 1 - 21 bits. +inline int DecodeVarLenUint16(BitReader* input) { + if (input->ReadFixedBits<1>()) { + int nbits = static_cast(input->ReadFixedBits<4>()); + if (nbits == 0) { + return 1; + } else { + return static_cast(input->ReadBits(nbits)) + (1 << nbits); + } + } + return 0; +} + +Status ReadHistogram(int precision_bits, std::vector* counts, + BitReader* input) { + int simple_code = input->ReadBits(1); + if (simple_code == 1) { + int i; + int symbols[2] = {0}; + int max_symbol = 0; + const int num_symbols = input->ReadBits(1) + 1; + for (i = 0; i < num_symbols; ++i) { + symbols[i] = DecodeVarLenUint8(input); + if (symbols[i] > max_symbol) max_symbol = symbols[i]; + } + counts->resize(max_symbol + 1); + if (num_symbols == 1) { + (*counts)[symbols[0]] = 1 << precision_bits; + } else { + if (symbols[0] == symbols[1]) { // corrupt data + return false; + } + (*counts)[symbols[0]] = input->ReadBits(precision_bits); + (*counts)[symbols[1]] = (1 << precision_bits) - (*counts)[symbols[0]]; + } + } else { + int is_flat = input->ReadBits(1); + if (is_flat == 1) { + int alphabet_size = DecodeVarLenUint8(input) + 1; + *counts = CreateFlatHistogram(alphabet_size, 1 << precision_bits); + return true; + } + + uint32_t shift; + { + // TODO(veluca): speed up reading with table lookups. + int upper_bound_log = FloorLog2Nonzero(ANS_LOG_TAB_SIZE + 1); + int log = 0; + for (; log < upper_bound_log; log++) { + if (input->ReadFixedBits<1>() == 0) break; + } + shift = (input->ReadBits(log) | (1 << log)) - 1; + if (shift > ANS_LOG_TAB_SIZE + 1) { + return JXL_FAILURE("Invalid shift value"); + } + } + + int length = DecodeVarLenUint8(input) + 3; + counts->resize(length); + int total_count = 0; + + static const uint8_t huff[128][2] = { + {3, 10}, {7, 12}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5}, + {3, 10}, {4, 4}, {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2}, + {3, 10}, {5, 0}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5}, + {3, 10}, {4, 4}, {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2}, + {3, 10}, {6, 11}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5}, + {3, 10}, {4, 4}, {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2}, + {3, 10}, {5, 0}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5}, + {3, 10}, {4, 4}, {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2}, + {3, 10}, {7, 13}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5}, + {3, 10}, {4, 4}, {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2}, + {3, 10}, {5, 0}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5}, + {3, 10}, {4, 4}, {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2}, + {3, 10}, {6, 11}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5}, + {3, 10}, {4, 4}, {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2}, + {3, 10}, {5, 0}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5}, + {3, 10}, {4, 4}, {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2}, + }; + + std::vector logcounts(counts->size()); + int omit_log = -1; + int omit_pos = -1; + // This array remembers which symbols have an RLE length. + std::vector same(counts->size(), 0); + for (size_t i = 0; i < logcounts.size(); ++i) { + input->Refill(); // for PeekFixedBits + Advance + int idx = input->PeekFixedBits<7>(); + input->Consume(huff[idx][0]); + logcounts[i] = huff[idx][1]; + // The RLE symbol. + if (logcounts[i] == ANS_LOG_TAB_SIZE + 1) { + int rle_length = DecodeVarLenUint8(input); + same[i] = rle_length + 5; + i += rle_length + 3; + continue; + } + if (logcounts[i] > omit_log) { + omit_log = logcounts[i]; + omit_pos = i; + } + } + // Invalid input, e.g. due to invalid usage of RLE. + if (omit_pos < 0) return JXL_FAILURE("Invalid histogram."); + if (static_cast(omit_pos) + 1 < logcounts.size() && + logcounts[omit_pos + 1] == ANS_TAB_SIZE + 1) { + return JXL_FAILURE("Invalid histogram."); + } + int prev = 0; + int numsame = 0; + for (size_t i = 0; i < logcounts.size(); ++i) { + if (same[i]) { + // RLE sequence, let this loop output the same count for the next + // iterations. + numsame = same[i] - 1; + prev = i > 0 ? (*counts)[i - 1] : 0; + } + if (numsame > 0) { + (*counts)[i] = prev; + numsame--; + } else { + int code = logcounts[i]; + // omit_pos may not be negative at this point (checked before). + if (i == static_cast(omit_pos)) { + continue; + } else if (code == 0) { + continue; + } else if (code == 1) { + (*counts)[i] = 1; + } else { + int bitcount = GetPopulationCountPrecision(code - 1, shift); + (*counts)[i] = (1 << (code - 1)) + + (input->ReadBits(bitcount) << (code - 1 - bitcount)); + } + } + total_count += (*counts)[i]; + } + (*counts)[omit_pos] = (1 << precision_bits) - total_count; + if ((*counts)[omit_pos] <= 0) { + // The histogram we've read sums to more than total_count (including at + // least 1 for the omitted value). + return JXL_FAILURE("Invalid histogram count."); + } + } + return true; +} + +} // namespace + +Status DecodeANSCodes(const size_t num_histograms, + const size_t max_alphabet_size, BitReader* in, + ANSCode* result) { + result->degenerate_symbols.resize(num_histograms, -1); + if (result->use_prefix_code) { + JXL_ASSERT(max_alphabet_size <= 1 << PREFIX_MAX_BITS); + result->huffman_data.resize(num_histograms); + std::vector alphabet_sizes(num_histograms); + for (size_t c = 0; c < num_histograms; c++) { + alphabet_sizes[c] = DecodeVarLenUint16(in) + 1; + if (alphabet_sizes[c] > max_alphabet_size) { + return JXL_FAILURE("Alphabet size is too long: %u", alphabet_sizes[c]); + } + } + for (size_t c = 0; c < num_histograms; c++) { + if (alphabet_sizes[c] > 1) { + if (!result->huffman_data[c].ReadFromBitStream(alphabet_sizes[c], in)) { + if (!in->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for huffman code"); + } + return JXL_FAILURE("Invalid huffman tree number %" PRIuS + ", alphabet size %u", + c, alphabet_sizes[c]); + } + } else { + // 0-bit codes does not require extension tables. + result->huffman_data[c].table_.clear(); + result->huffman_data[c].table_.resize(1u << kHuffmanTableBits); + } + for (const auto& h : result->huffman_data[c].table_) { + if (h.bits <= kHuffmanTableBits) { + result->UpdateMaxNumBits(c, h.value); + } + } + } + } else { + JXL_ASSERT(max_alphabet_size <= ANS_MAX_ALPHABET_SIZE); + result->alias_tables = + AllocateArray(num_histograms * (1 << result->log_alpha_size) * + sizeof(AliasTable::Entry)); + AliasTable::Entry* alias_tables = + reinterpret_cast(result->alias_tables.get()); + for (size_t c = 0; c < num_histograms; ++c) { + std::vector counts; + if (!ReadHistogram(ANS_LOG_TAB_SIZE, &counts, in)) { + return JXL_FAILURE("Invalid histogram bitstream."); + } + if (counts.size() > max_alphabet_size) { + return JXL_FAILURE("Alphabet size is too long: %" PRIuS, counts.size()); + } + while (!counts.empty() && counts.back() == 0) { + counts.pop_back(); + } + for (size_t s = 0; s < counts.size(); s++) { + if (counts[s] != 0) { + result->UpdateMaxNumBits(c, s); + } + } + // InitAliasTable "fixes" empty counts to contain degenerate "0" symbol. + int degenerate_symbol = counts.empty() ? 0 : (counts.size() - 1); + for (int s = 0; s < degenerate_symbol; ++s) { + if (counts[s] != 0) { + degenerate_symbol = -1; + break; + } + } + result->degenerate_symbols[c] = degenerate_symbol; + InitAliasTable(counts, ANS_TAB_SIZE, result->log_alpha_size, + alias_tables + c * (1 << result->log_alpha_size)); + } + } + return true; +} +Status DecodeUintConfig(size_t log_alpha_size, HybridUintConfig* uint_config, + BitReader* br) { + br->Refill(); + size_t split_exponent = br->ReadBits(CeilLog2Nonzero(log_alpha_size + 1)); + size_t msb_in_token = 0, lsb_in_token = 0; + if (split_exponent != log_alpha_size) { + // otherwise, msb/lsb don't matter. + size_t nbits = CeilLog2Nonzero(split_exponent + 1); + msb_in_token = br->ReadBits(nbits); + if (msb_in_token > split_exponent) { + // This could be invalid here already and we need to check this before + // we use its value to read more bits. + return JXL_FAILURE("Invalid HybridUintConfig"); + } + nbits = CeilLog2Nonzero(split_exponent - msb_in_token + 1); + lsb_in_token = br->ReadBits(nbits); + } + if (lsb_in_token + msb_in_token > split_exponent) { + return JXL_FAILURE("Invalid HybridUintConfig"); + } + *uint_config = HybridUintConfig(split_exponent, msb_in_token, lsb_in_token); + return true; +} + +Status DecodeUintConfigs(size_t log_alpha_size, + std::vector* uint_config, + BitReader* br) { + // TODO(veluca): RLE? + for (size_t i = 0; i < uint_config->size(); i++) { + JXL_RETURN_IF_ERROR( + DecodeUintConfig(log_alpha_size, &(*uint_config)[i], br)); + } + return true; +} + +LZ77Params::LZ77Params() { Bundle::Init(this); } +Status LZ77Params::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &enabled)); + if (!visitor->Conditional(enabled)) return true; + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(224), Val(512), Val(4096), + BitsOffset(15, 8), 224, &min_symbol)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(3), Val(4), BitsOffset(2, 5), + BitsOffset(8, 9), 3, &min_length)); + return true; +} + +void ANSCode::UpdateMaxNumBits(size_t ctx, size_t symbol) { + HybridUintConfig* cfg = &uint_config[ctx]; + // LZ77 symbols use a different uint config. + if (lz77.enabled && lz77.nonserialized_distance_context != ctx && + symbol >= lz77.min_symbol) { + symbol -= lz77.min_symbol; + cfg = &lz77.length_uint_config; + } + size_t split_token = cfg->split_token; + size_t msb_in_token = cfg->msb_in_token; + size_t lsb_in_token = cfg->lsb_in_token; + size_t split_exponent = cfg->split_exponent; + if (symbol < split_token) { + max_num_bits = std::max(max_num_bits, split_exponent); + return; + } + uint32_t n_extra_bits = + split_exponent - (msb_in_token + lsb_in_token) + + ((symbol - split_token) >> (msb_in_token + lsb_in_token)); + size_t total_bits = msb_in_token + lsb_in_token + n_extra_bits + 1; + max_num_bits = std::max(max_num_bits, total_bits); +} + +Status DecodeHistograms(BitReader* br, size_t num_contexts, ANSCode* code, + std::vector* context_map, bool disallow_lz77) { + PROFILER_FUNC; + JXL_RETURN_IF_ERROR(Bundle::Read(br, &code->lz77)); + if (code->lz77.enabled) { + num_contexts++; + JXL_RETURN_IF_ERROR(DecodeUintConfig(/*log_alpha_size=*/8, + &code->lz77.length_uint_config, br)); + } + if (code->lz77.enabled && disallow_lz77) { + return JXL_FAILURE("Using LZ77 when explicitly disallowed"); + } + size_t num_histograms = 1; + context_map->resize(num_contexts); + if (num_contexts > 1) { + JXL_RETURN_IF_ERROR(DecodeContextMap(context_map, &num_histograms, br)); + } + code->lz77.nonserialized_distance_context = context_map->back(); + code->use_prefix_code = br->ReadFixedBits<1>(); + if (code->use_prefix_code) { + code->log_alpha_size = PREFIX_MAX_BITS; + } else { + code->log_alpha_size = br->ReadFixedBits<2>() + 5; + } + code->uint_config.resize(num_histograms); + JXL_RETURN_IF_ERROR( + DecodeUintConfigs(code->log_alpha_size, &code->uint_config, br)); + const size_t max_alphabet_size = 1 << code->log_alpha_size; + JXL_RETURN_IF_ERROR( + DecodeANSCodes(num_histograms, max_alphabet_size, br, code)); + // When using LZ77, flat codes might result in valid codestreams with + // histograms that potentially allow very large bit counts. + // TODO(veluca): in principle, a valid codestream might contain a histogram + // that could allow very large numbers of bits that is never used during ANS + // decoding. There's no benefit to doing that, though. + if (!code->lz77.enabled && code->max_num_bits > 32) { + // Just emit a warning as there are many opportunities for false positives. + JXL_WARNING("Histogram can represent numbers that are too large: %" PRIuS + "\n", + code->max_num_bits); + } + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dec_ans.h b/third_party/jpeg-xl/lib/jxl/dec_ans.h new file mode 100644 index 0000000000..0f4406745a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_ans.h @@ -0,0 +1,462 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_ANS_H_ +#define LIB_JXL_DEC_ANS_H_ + +// Library to decode the ANS population counts from the bit-stream and build a +// decoding table from them. + +#include +#include + +#include +#include + +#include "lib/jxl/ans_common.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_huffman.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +class ANSSymbolReader; + +// Experiments show that best performance is typically achieved for a +// split-exponent of 3 or 4. Trend seems to be that '4' is better +// for large-ish pictures, and '3' better for rather small-ish pictures. +// This is plausible - the more special symbols we have, the better +// statistics we need to get a benefit out of them. + +// Our hybrid-encoding scheme has dedicated tokens for the smallest +// (1 << split_exponents) numbers, and for the rest +// encodes (number of bits) + (msb_in_token sub-leading binary digits) + +// (lsb_in_token lowest binary digits) in the token, with the remaining bits +// then being encoded as data. +// +// Example with split_exponent = 4, msb_in_token = 2, lsb_in_token = 0. +// +// Numbers N in [0 .. 15]: +// These get represented as (token=N, bits=''). +// Numbers N >= 16: +// If n is such that 2**n <= N < 2**(n+1), +// and m = N - 2**n is the 'mantissa', +// these get represented as: +// (token=split_token + +// ((n - split_exponent) * 4) + +// (m >> (n - msb_in_token)), +// bits=m & (1 << (n - msb_in_token)) - 1) +// Specifically, we would get: +// N = 0 - 15: (token=N, nbits=0, bits='') +// N = 16 (10000): (token=16, nbits=2, bits='00') +// N = 17 (10001): (token=16, nbits=2, bits='01') +// N = 20 (10100): (token=17, nbits=2, bits='00') +// N = 24 (11000): (token=18, nbits=2, bits='00') +// N = 28 (11100): (token=19, nbits=2, bits='00') +// N = 32 (100000): (token=20, nbits=3, bits='000') +// N = 65535: (token=63, nbits=13, bits='1111111111111') +struct HybridUintConfig { + uint32_t split_exponent; + uint32_t split_token; + uint32_t msb_in_token; + uint32_t lsb_in_token; + JXL_INLINE void Encode(uint32_t value, uint32_t* JXL_RESTRICT token, + uint32_t* JXL_RESTRICT nbits, + uint32_t* JXL_RESTRICT bits) const { + if (value < split_token) { + *token = value; + *nbits = 0; + *bits = 0; + } else { + uint32_t n = FloorLog2Nonzero(value); + uint32_t m = value - (1 << n); + *token = split_token + + ((n - split_exponent) << (msb_in_token + lsb_in_token)) + + ((m >> (n - msb_in_token)) << lsb_in_token) + + (m & ((1 << lsb_in_token) - 1)); + *nbits = n - msb_in_token - lsb_in_token; + *bits = (value >> lsb_in_token) & ((1UL << *nbits) - 1); + } + } + + explicit HybridUintConfig(uint32_t split_exponent = 4, + uint32_t msb_in_token = 2, + uint32_t lsb_in_token = 0) + : split_exponent(split_exponent), + split_token(1 << split_exponent), + msb_in_token(msb_in_token), + lsb_in_token(lsb_in_token) { + JXL_DASSERT(split_exponent >= msb_in_token + lsb_in_token); + } +}; + +struct LZ77Params : public Fields { + LZ77Params(); + JXL_FIELDS_NAME(LZ77Params) + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + bool enabled; + + // Symbols above min_symbol use a special hybrid uint encoding and + // represent a length, to be added to min_length. + uint32_t min_symbol; + uint32_t min_length; + + // Not serialized by VisitFields. + HybridUintConfig length_uint_config{0, 0, 0}; + + size_t nonserialized_distance_context; +}; + +static constexpr size_t kWindowSize = 1 << 20; +static constexpr size_t kNumSpecialDistances = 120; +// Table of special distance codes from WebP lossless. +static constexpr int8_t kSpecialDistances[kNumSpecialDistances][2] = { + {0, 1}, {1, 0}, {1, 1}, {-1, 1}, {0, 2}, {2, 0}, {1, 2}, {-1, 2}, + {2, 1}, {-2, 1}, {2, 2}, {-2, 2}, {0, 3}, {3, 0}, {1, 3}, {-1, 3}, + {3, 1}, {-3, 1}, {2, 3}, {-2, 3}, {3, 2}, {-3, 2}, {0, 4}, {4, 0}, + {1, 4}, {-1, 4}, {4, 1}, {-4, 1}, {3, 3}, {-3, 3}, {2, 4}, {-2, 4}, + {4, 2}, {-4, 2}, {0, 5}, {3, 4}, {-3, 4}, {4, 3}, {-4, 3}, {5, 0}, + {1, 5}, {-1, 5}, {5, 1}, {-5, 1}, {2, 5}, {-2, 5}, {5, 2}, {-5, 2}, + {4, 4}, {-4, 4}, {3, 5}, {-3, 5}, {5, 3}, {-5, 3}, {0, 6}, {6, 0}, + {1, 6}, {-1, 6}, {6, 1}, {-6, 1}, {2, 6}, {-2, 6}, {6, 2}, {-6, 2}, + {4, 5}, {-4, 5}, {5, 4}, {-5, 4}, {3, 6}, {-3, 6}, {6, 3}, {-6, 3}, + {0, 7}, {7, 0}, {1, 7}, {-1, 7}, {5, 5}, {-5, 5}, {7, 1}, {-7, 1}, + {4, 6}, {-4, 6}, {6, 4}, {-6, 4}, {2, 7}, {-2, 7}, {7, 2}, {-7, 2}, + {3, 7}, {-3, 7}, {7, 3}, {-7, 3}, {5, 6}, {-5, 6}, {6, 5}, {-6, 5}, + {8, 0}, {4, 7}, {-4, 7}, {7, 4}, {-7, 4}, {8, 1}, {8, 2}, {6, 6}, + {-6, 6}, {8, 3}, {5, 7}, {-5, 7}, {7, 5}, {-7, 5}, {8, 4}, {6, 7}, + {-6, 7}, {7, 6}, {-7, 6}, {8, 5}, {7, 7}, {-7, 7}, {8, 6}, {8, 7}}; + +struct ANSCode { + CacheAlignedUniquePtr alias_tables; + std::vector huffman_data; + std::vector uint_config; + std::vector degenerate_symbols; + bool use_prefix_code; + uint8_t log_alpha_size; // for ANS. + LZ77Params lz77; + // Maximum number of bits necessary to represent the result of a + // ReadHybridUint call done with this ANSCode. + size_t max_num_bits = 0; + void UpdateMaxNumBits(size_t ctx, size_t symbol); +}; + +class ANSSymbolReader { + public: + // Invalid symbol reader, to be overwritten. + ANSSymbolReader() = default; + ANSSymbolReader(const ANSCode* code, BitReader* JXL_RESTRICT br, + size_t distance_multiplier = 0) + : alias_tables_( + reinterpret_cast(code->alias_tables.get())), + huffman_data_(code->huffman_data.data()), + use_prefix_code_(code->use_prefix_code), + configs(code->uint_config.data()) { + if (!use_prefix_code_) { + state_ = static_cast(br->ReadFixedBits<32>()); + log_alpha_size_ = code->log_alpha_size; + log_entry_size_ = ANS_LOG_TAB_SIZE - code->log_alpha_size; + entry_size_minus_1_ = (1 << log_entry_size_) - 1; + } else { + state_ = (ANS_SIGNATURE << 16u); + } + if (!code->lz77.enabled) return; + // a std::vector incurs unacceptable decoding speed loss because of + // initialization. + lz77_window_storage_ = AllocateArray(kWindowSize * sizeof(uint32_t)); + lz77_window_ = reinterpret_cast(lz77_window_storage_.get()); + lz77_ctx_ = code->lz77.nonserialized_distance_context; + lz77_length_uint_ = code->lz77.length_uint_config; + lz77_threshold_ = code->lz77.min_symbol; + lz77_min_length_ = code->lz77.min_length; + num_special_distances_ = + distance_multiplier == 0 ? 0 : kNumSpecialDistances; + for (size_t i = 0; i < num_special_distances_; i++) { + int dist = kSpecialDistances[i][0]; + dist += static_cast(distance_multiplier) * kSpecialDistances[i][1]; + if (dist < 1) dist = 1; + special_distances_[i] = dist; + } + } + + JXL_INLINE size_t ReadSymbolANSWithoutRefill(const size_t histo_idx, + BitReader* JXL_RESTRICT br) { + const uint32_t res = state_ & (ANS_TAB_SIZE - 1u); + + const AliasTable::Entry* table = + &alias_tables_[histo_idx << log_alpha_size_]; + const AliasTable::Symbol symbol = + AliasTable::Lookup(table, res, log_entry_size_, entry_size_minus_1_); + state_ = symbol.freq * (state_ >> ANS_LOG_TAB_SIZE) + symbol.offset; + +#if 1 + // Branchless version is about equally fast on SKX. + const uint32_t new_state = + (state_ << 16u) | static_cast(br->PeekFixedBits<16>()); + const bool normalize = state_ < (1u << 16u); + state_ = normalize ? new_state : state_; + br->Consume(normalize ? 16 : 0); +#else + if (JXL_UNLIKELY(state_ < (1u << 16u))) { + state_ = (state_ << 16u) | br->PeekFixedBits<16>(); + br->Consume(16); + } +#endif + const uint32_t next_res = state_ & (ANS_TAB_SIZE - 1u); + AliasTable::Prefetch(table, next_res, log_entry_size_); + + return symbol.value; + } + + JXL_INLINE size_t ReadSymbolHuffWithoutRefill(const size_t histo_idx, + BitReader* JXL_RESTRICT br) { + return huffman_data_[histo_idx].ReadSymbol(br); + } + + JXL_INLINE size_t ReadSymbolWithoutRefill(const size_t histo_idx, + BitReader* JXL_RESTRICT br) { + // TODO(veluca): hoist if in hotter loops. + if (JXL_UNLIKELY(use_prefix_code_)) { + return ReadSymbolHuffWithoutRefill(histo_idx, br); + } + return ReadSymbolANSWithoutRefill(histo_idx, br); + } + + JXL_INLINE size_t ReadSymbol(const size_t histo_idx, + BitReader* JXL_RESTRICT br) { + br->Refill(); + return ReadSymbolWithoutRefill(histo_idx, br); + } + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + bool CheckANSFinalState() const { return true; } +#else + bool CheckANSFinalState() const { return state_ == (ANS_SIGNATURE << 16u); } +#endif + + template + static JXL_INLINE uint32_t ReadHybridUintConfig( + const HybridUintConfig& config, size_t token, BitReader* br) { + size_t split_token = config.split_token; + size_t msb_in_token = config.msb_in_token; + size_t lsb_in_token = config.lsb_in_token; + size_t split_exponent = config.split_exponent; + // Fast-track version of hybrid integer decoding. + if (token < split_token) return token; + uint32_t nbits = split_exponent - (msb_in_token + lsb_in_token) + + ((token - split_token) >> (msb_in_token + lsb_in_token)); + // Max amount of bits for ReadBits is 32 and max valid left shift is 29 + // bits. However, for speed no error is propagated here, instead limit the + // nbits size. If nbits > 29, the code stream is invalid, but no error is + // returned. + // Note that in most cases we will emit an error if the histogram allows + // representing numbers that would cause invalid shifts, but we need to + // keep this check as when LZ77 is enabled it might make sense to have an + // histogram that could in principle cause invalid shifts. + nbits &= 31u; + uint32_t low = token & ((1 << lsb_in_token) - 1); + token >>= lsb_in_token; + const size_t bits = br->PeekBits(nbits); + br->Consume(nbits); + size_t ret = (((((1 << msb_in_token) | (token & ((1 << msb_in_token) - 1))) + << nbits) | + bits) + << lsb_in_token) | + low; + // TODO(eustas): mark BitReader as unhealthy if nbits > 29 or ret does not + // fit uint32_t + return static_cast(ret); + } + + // Takes a *clustered* idx. Can only use if HuffRleOnly() is true. + void ReadHybridUintClusteredHuffRleOnly(size_t ctx, + BitReader* JXL_RESTRICT br, + uint32_t* value, uint32_t* run) { + JXL_DASSERT(HuffRleOnly()); + br->Refill(); // covers ReadSymbolWithoutRefill + PeekBits + size_t token = ReadSymbolHuffWithoutRefill(ctx, br); + if (JXL_UNLIKELY(token >= lz77_threshold_)) { + *run = + ReadHybridUintConfig(lz77_length_uint_, token - lz77_threshold_, br) + + lz77_min_length_ - 1; + return; + } + *value = ReadHybridUintConfig(configs[ctx], token, br); + } + bool HuffRleOnly() { + if (lz77_window_ == nullptr) return false; + if (!use_prefix_code_) return false; + for (size_t i = 0; i < kHuffmanTableBits; i++) { + if (huffman_data_[lz77_ctx_].table_[i].bits) return false; + if (huffman_data_[lz77_ctx_].table_[i].value != 1) return false; + } + if (configs[lz77_ctx_].split_token > 1) return false; + return true; + } + + // Takes a *clustered* idx. + size_t ReadHybridUintClustered(size_t ctx, BitReader* JXL_RESTRICT br) { + if (JXL_UNLIKELY(num_to_copy_ > 0)) { + size_t ret = lz77_window_[(copy_pos_++) & kWindowMask]; + num_to_copy_--; + lz77_window_[(num_decoded_++) & kWindowMask] = ret; + return ret; + } + br->Refill(); // covers ReadSymbolWithoutRefill + PeekBits + size_t token = ReadSymbolWithoutRefill(ctx, br); + if (JXL_UNLIKELY(token >= lz77_threshold_)) { + num_to_copy_ = + ReadHybridUintConfig(lz77_length_uint_, token - lz77_threshold_, br) + + lz77_min_length_; + br->Refill(); // covers ReadSymbolWithoutRefill + PeekBits + // Distance code. + size_t token = ReadSymbolWithoutRefill(lz77_ctx_, br); + size_t distance = ReadHybridUintConfig(configs[lz77_ctx_], token, br); + if (JXL_LIKELY(distance < num_special_distances_)) { + distance = special_distances_[distance]; + } else { + distance = distance + 1 - num_special_distances_; + } + if (JXL_UNLIKELY(distance > num_decoded_)) { + distance = num_decoded_; + } + if (JXL_UNLIKELY(distance > kWindowSize)) { + distance = kWindowSize; + } + copy_pos_ = num_decoded_ - distance; + if (JXL_UNLIKELY(distance == 0)) { + JXL_DASSERT(lz77_window_ != nullptr); + // distance 0 -> num_decoded_ == copy_pos_ == 0 + size_t to_fill = std::min(num_to_copy_, kWindowSize); + memset(lz77_window_, 0, to_fill * sizeof(lz77_window_[0])); + } + // TODO(eustas): overflow; mark BitReader as unhealthy + if (num_to_copy_ < lz77_min_length_) return 0; + return ReadHybridUintClustered(ctx, br); // will trigger a copy. + } + size_t ret = ReadHybridUintConfig(configs[ctx], token, br); + if (lz77_window_) lz77_window_[(num_decoded_++) & kWindowMask] = ret; + return ret; + } + + JXL_INLINE size_t ReadHybridUint(size_t ctx, BitReader* JXL_RESTRICT br, + const std::vector& context_map) { + return ReadHybridUintClustered(context_map[ctx], br); + } + + // ctx is a *clustered* context! + // This function will modify the ANS state as if `count` symbols have been + // decoded. + bool IsSingleValueAndAdvance(size_t ctx, uint32_t* value, size_t count) { + // TODO(veluca): No optimization for Huffman mode yet. + if (use_prefix_code_) return false; + // TODO(eustas): propagate "degenerate_symbol" to simplify this method. + const uint32_t res = state_ & (ANS_TAB_SIZE - 1u); + const AliasTable::Entry* table = &alias_tables_[ctx << log_alpha_size_]; + AliasTable::Symbol symbol = + AliasTable::Lookup(table, res, log_entry_size_, entry_size_minus_1_); + if (symbol.freq != ANS_TAB_SIZE) return false; + if (configs[ctx].split_token <= symbol.value) return false; + if (symbol.value >= lz77_threshold_) return false; + *value = symbol.value; + if (lz77_window_) { + for (size_t i = 0; i < count; i++) { + lz77_window_[(num_decoded_++) & kWindowMask] = symbol.value; + } + } + return true; + } + + static constexpr size_t kMaxCheckpointInterval = 512; + struct Checkpoint { + uint32_t state; + uint32_t num_to_copy; + uint32_t copy_pos; + uint32_t num_decoded; + uint32_t lz77_window[kMaxCheckpointInterval]; + }; + void Save(Checkpoint* checkpoint) { + checkpoint->state = state_; + checkpoint->num_decoded = num_decoded_; + checkpoint->num_to_copy = num_to_copy_; + checkpoint->copy_pos = copy_pos_; + if (lz77_window_) { + size_t win_start = num_decoded_ & kWindowMask; + size_t win_end = (num_decoded_ + kMaxCheckpointInterval) & kWindowMask; + if (win_end > win_start) { + memcpy(checkpoint->lz77_window, lz77_window_ + win_start, + (win_end - win_start) * sizeof(*lz77_window_)); + } else { + memcpy(checkpoint->lz77_window, lz77_window_ + win_start, + (kWindowSize - win_start) * sizeof(*lz77_window_)); + memcpy(checkpoint->lz77_window + (kWindowSize - win_start), + lz77_window_, win_end * sizeof(*lz77_window_)); + } + } + } + void Restore(const Checkpoint& checkpoint) { + state_ = checkpoint.state; + JXL_DASSERT(num_decoded_ <= + checkpoint.num_decoded + kMaxCheckpointInterval); + num_decoded_ = checkpoint.num_decoded; + num_to_copy_ = checkpoint.num_to_copy; + copy_pos_ = checkpoint.copy_pos; + if (lz77_window_) { + size_t win_start = num_decoded_ & kWindowMask; + size_t win_end = (num_decoded_ + kMaxCheckpointInterval) & kWindowMask; + if (win_end > win_start) { + memcpy(lz77_window_ + win_start, checkpoint.lz77_window, + (win_end - win_start) * sizeof(*lz77_window_)); + } else { + memcpy(lz77_window_ + win_start, checkpoint.lz77_window, + (kWindowSize - win_start) * sizeof(*lz77_window_)); + memcpy(lz77_window_, checkpoint.lz77_window + (kWindowSize - win_start), + win_end * sizeof(*lz77_window_)); + } + } + } + + private: + const AliasTable::Entry* JXL_RESTRICT alias_tables_; // not owned + const HuffmanDecodingData* huffman_data_; + bool use_prefix_code_; + uint32_t state_ = ANS_SIGNATURE << 16u; + const HybridUintConfig* JXL_RESTRICT configs; + uint32_t log_alpha_size_{}; + uint32_t log_entry_size_{}; + uint32_t entry_size_minus_1_{}; + + // LZ77 structures and constants. + static constexpr size_t kWindowMask = kWindowSize - 1; + CacheAlignedUniquePtr lz77_window_storage_; + uint32_t* lz77_window_ = nullptr; + uint32_t num_decoded_ = 0; + uint32_t num_to_copy_ = 0; + uint32_t copy_pos_ = 0; + uint32_t lz77_ctx_ = 0; + uint32_t lz77_min_length_ = 0; + uint32_t lz77_threshold_ = 1 << 20; // bigger than any symbol. + HybridUintConfig lz77_length_uint_; + uint32_t special_distances_[kNumSpecialDistances]{}; + uint32_t num_special_distances_{}; +}; + +Status DecodeHistograms(BitReader* br, size_t num_contexts, ANSCode* code, + std::vector* context_map, + bool disallow_lz77 = false); + +// Exposed for tests. +Status DecodeUintConfigs(size_t log_alpha_size, + std::vector* uint_config, + BitReader* br); + +} // namespace jxl + +#endif // LIB_JXL_DEC_ANS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_bit_reader.h b/third_party/jpeg-xl/lib/jxl/dec_bit_reader.h new file mode 100644 index 0000000000..df70284e3b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_bit_reader.h @@ -0,0 +1,354 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_BIT_READER_H_ +#define LIB_JXL_DEC_BIT_READER_H_ + +// Bounds-checked bit reader; 64-bit buffer with support for deferred refills +// and switching to reading byte-aligned words. + +#include +#include +#include // memcpy + +#ifdef __BMI2__ +#include +#endif + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" + +namespace jxl { + +// Reads bits previously written to memory by BitWriter. Uses unaligned 8-byte +// little-endian loads. +class BitReader { + public: + static constexpr size_t kMaxBitsPerCall = 56; + + // Constructs an invalid BitReader, to be overwritten before usage. + BitReader() + : buf_(0), + bits_in_buf_(0), + next_byte_{nullptr}, + end_minus_8_{nullptr}, + first_byte_(nullptr) {} + BitReader(const BitReader&) = delete; + + // bytes need not be aligned nor padded! + template + explicit BitReader(const ArrayLike& bytes) + : buf_(0), + bits_in_buf_(0), + next_byte_(bytes.data()), + // Assumes first_byte_ >= 8. + end_minus_8_(bytes.data() - 8 + bytes.size()), + first_byte_(bytes.data()) { + Refill(); + } + ~BitReader() { + // Close() must be called before destroying an initialized bit reader. + // Invalid bit readers will have a nullptr in first_byte_. + JXL_ASSERT(close_called_ || !first_byte_); + } + + // Move operator needs to invalidate the other BitReader such that it is + // irrelevant if we call Close() on it or not. + BitReader& operator=(BitReader&& other) noexcept { + // Ensure the current instance was already closed, before we overwrite it + // with other. + JXL_ASSERT(close_called_ || !first_byte_); + + JXL_DASSERT(!other.close_called_); + buf_ = other.buf_; + bits_in_buf_ = other.bits_in_buf_; + next_byte_ = other.next_byte_; + end_minus_8_ = other.end_minus_8_; + first_byte_ = other.first_byte_; + overread_bytes_ = other.overread_bytes_; + close_called_ = other.close_called_; + + other.first_byte_ = nullptr; + other.next_byte_ = nullptr; + return *this; + } + BitReader& operator=(const BitReader& other) = delete; + + // For time-critical reads, refills can be shared by multiple reads. + // Based on variant 4 (plus bounds-checking), see + // fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/ + JXL_INLINE void Refill() { + if (JXL_UNLIKELY(next_byte_ > end_minus_8_)) { + BoundsCheckedRefill(); + } else { + // It's safe to load 64 bits; insert valid (possibly nonzero) bits above + // bits_in_buf_. The shift requires bits_in_buf_ < 64. + buf_ |= LoadLE64(next_byte_) << bits_in_buf_; + + // Advance by bytes fully absorbed into the buffer. + next_byte_ += (63 - bits_in_buf_) >> 3; + + // We absorbed a multiple of 8 bits, so the lower 3 bits of bits_in_buf_ + // must remain unchanged, otherwise the next refill's shifted bits will + // not align with buf_. Set the three upper bits so the result >= 56. + bits_in_buf_ |= 56; + JXL_DASSERT(56 <= bits_in_buf_ && bits_in_buf_ < 64); + } + } + + // Returns the bits that would be returned by Read without calling Advance(). + // It is legal to PEEK at more bits than present in the bitstream (required + // by Huffman), and those bits will be zero. + template + JXL_INLINE uint64_t PeekFixedBits() const { + static_assert(N <= kMaxBitsPerCall, "Reading too many bits in one call."); + JXL_DASSERT(!close_called_); + return buf_ & ((1ULL << N) - 1); + } + + JXL_INLINE uint64_t PeekBits(size_t nbits) const { + JXL_DASSERT(nbits <= kMaxBitsPerCall); + JXL_DASSERT(!close_called_); + + // Slightly faster but requires BMI2. It is infeasible to make the many + // callers reside between begin/end_target, especially because only the + // callers in dec_ans are time-critical. Therefore only enabled if the + // entire binary is compiled for (and thus requires) BMI2. +#if defined(__BMI2__) && defined(__x86_64__) + return _bzhi_u64(buf_, nbits); +#else + const uint64_t mask = (1ULL << nbits) - 1; + return buf_ & mask; +#endif + } + + // Removes bits from the buffer. Need not match the previous Peek size, but + // the buffer must contain at least num_bits (this prevents consuming more + // than the total number of bits). + JXL_INLINE void Consume(size_t num_bits) { + JXL_DASSERT(!close_called_); + JXL_DASSERT(bits_in_buf_ >= num_bits); +#ifdef JXL_CRASH_ON_ERROR + // When JXL_CRASH_ON_ERROR is defined, it is a fatal error to read more bits + // than available in the stream. A non-zero overread_bytes_ implies that + // next_byte_ is already at the end of the stream, so we don't need to + // check that. + JXL_ASSERT(bits_in_buf_ >= num_bits + overread_bytes_ * kBitsPerByte); +#endif + bits_in_buf_ -= num_bits; + buf_ >>= num_bits; + } + + JXL_INLINE uint64_t ReadBits(size_t nbits) { + JXL_DASSERT(!close_called_); + Refill(); + const uint64_t bits = PeekBits(nbits); + Consume(nbits); + return bits; + } + + template + JXL_INLINE uint64_t ReadFixedBits() { + JXL_DASSERT(!close_called_); + Refill(); + const uint64_t bits = PeekFixedBits(); + Consume(N); + return bits; + } + + // Equivalent to calling ReadFixedBits(1) `skip` times, but much faster. + // `skip` is typically large. + void SkipBits(size_t skip) { + JXL_DASSERT(!close_called_); + // Buffer is large enough - don't zero buf_ below. + if (JXL_UNLIKELY(skip <= bits_in_buf_)) { + Consume(skip); + return; + } + + // First deduct what we can satisfy from the buffer + skip -= bits_in_buf_; + bits_in_buf_ = 0; + // Not enough to call Advance - that may leave some bits in the buffer + // which were previously ABOVE bits_in_buf. + buf_ = 0; + + // Skip whole bytes + const size_t whole_bytes = skip / kBitsPerByte; + skip %= kBitsPerByte; + if (JXL_UNLIKELY(whole_bytes > + static_cast(end_minus_8_ + 8 - next_byte_))) { + // This is already an overflow condition (skipping past the end of the bit + // stream). However if we increase next_byte_ too much we risk overflowing + // that value and potentially making it valid again (next_byte_ < end). + // This will set next_byte_ to the end of the stream and still consume + // some bits in overread_bytes_, however the TotalBitsConsumed() will be + // incorrect (still larger than the TotalBytes()). + next_byte_ = end_minus_8_ + 8; + skip += kBitsPerByte; + } else { + next_byte_ += whole_bytes; + } + + Refill(); + Consume(skip); + } + + size_t TotalBitsConsumed() const { + const size_t bytes_read = static_cast(next_byte_ - first_byte_); + return (bytes_read + overread_bytes_) * kBitsPerByte - bits_in_buf_; + } + + Status JumpToByteBoundary() { + const size_t remainder = TotalBitsConsumed() % kBitsPerByte; + if (remainder == 0) return true; + if (JXL_UNLIKELY(ReadBits(kBitsPerByte - remainder) != 0)) { + return JXL_FAILURE("Non-zero padding bits"); + } + return true; + } + + // For interoperability with other bitreaders (for resuming at + // non-byte-aligned positions). + const uint8_t* FirstByte() const { return first_byte_; } + size_t TotalBytes() const { + return static_cast(end_minus_8_ + 8 - first_byte_); + } + + // Returns span of the remaining (unconsumed) bytes, e.g. for passing to + // external decoders such as Brotli. + Span GetSpan() const { + JXL_DASSERT(first_byte_ != nullptr); + JXL_ASSERT(TotalBitsConsumed() % kBitsPerByte == 0); + const size_t offset = TotalBitsConsumed() / kBitsPerByte; // no remainder + JXL_ASSERT(offset <= TotalBytes()); + return Span(first_byte_ + offset, TotalBytes() - offset); + } + + // Returns whether all the bits read so far have been within the input bounds. + // When reading past the EOF, the Read*() and Consume() functions return zeros + // but flag a failure when calling Close() without checking this function. + Status AllReadsWithinBounds() { + // Mark up to which point the user checked the out of bounds condition. If + // the user handles the condition at higher level (e.g. fetch more bytes + // from network, return a custom JXL_FAILURE, ...), Close() should not + // output a debug error (which would break tests with JXL_CRASH_ON_ERROR + // even when legitimately handling the situation at higher level). This is + // used by Bundle::CanRead. + checked_out_of_bounds_bits_ = TotalBitsConsumed(); + if (TotalBitsConsumed() > TotalBytes() * kBitsPerByte) { + return false; + } + return true; + } + + // Close the bit reader and return whether all the previous reads were + // successful. Close must be called once. + Status Close() { + JXL_DASSERT(!close_called_); + close_called_ = true; + if (!first_byte_) return true; + if (TotalBitsConsumed() > checked_out_of_bounds_bits_ && + TotalBitsConsumed() > TotalBytes() * kBitsPerByte) { + return JXL_FAILURE("Read more bits than available in the bit_reader"); + } + return true; + } + + private: + // Separate function avoids inlining this relatively cold code into callers. + JXL_NOINLINE void BoundsCheckedRefill() { + PROFILER_FUNC; + const uint8_t* end = end_minus_8_ + 8; + + // Read whole bytes until we have [56, 64) bits (same as LoadLE64) + for (; bits_in_buf_ < 64 - kBitsPerByte; bits_in_buf_ += kBitsPerByte) { + if (next_byte_ >= end) break; + buf_ |= static_cast(*next_byte_++) << bits_in_buf_; + } + JXL_DASSERT(bits_in_buf_ < 64); + + // Add extra bytes as 0 at the end of the stream in the bit_buffer_. If + // these bits are read, Close() will return a failure. + size_t extra_bytes = (63 - bits_in_buf_) / kBitsPerByte; + overread_bytes_ += extra_bytes; + bits_in_buf_ += extra_bytes * kBitsPerByte; + + JXL_DASSERT(bits_in_buf_ < 64); + JXL_DASSERT(bits_in_buf_ >= 56); + } + + JXL_NOINLINE uint32_t BoundsCheckedReadByteAlignedWord() { + if (next_byte_ + 1 < end_minus_8_ + 8) { + uint32_t ret = LoadLE16(next_byte_); + next_byte_ += 2; + return ret; + } + overread_bytes_ += 2; + return 0; + } + + uint64_t buf_; + size_t bits_in_buf_; // [0, 64) + const uint8_t* JXL_RESTRICT next_byte_; + const uint8_t* end_minus_8_; // for refill bounds check + const uint8_t* first_byte_; // for GetSpan + + // Number of bytes past the end that were loaded into the buf_. These bytes + // are not read from memory, but instead assumed 0. It is an error (likely due + // to an invalid stream) to Consume() more bits than specified in the range + // passed to the constructor. + uint64_t overread_bytes_{0}; + bool close_called_{false}; + + uint64_t checked_out_of_bounds_bits_{0}; +}; + +// Closes a BitReader when the BitReaderScopedCloser goes out of scope. When +// closing the bit reader, if the status result was failure it sets this failure +// to the passed variable pointer. Typical usage. +// +// Status ret = true; +// { +// BitReader reader(...); +// BitReaderScopedCloser reader_closer(&reader, &ret); +// +// // ... code that can return errors here ... +// } +// // ... more code that doesn't use the BitReader. +// return ret; + +class BitReaderScopedCloser { + public: + BitReaderScopedCloser(BitReader* reader, Status* status) + : reader_(reader), status_(status) { + JXL_DASSERT(reader_ != nullptr); + JXL_DASSERT(status_ != nullptr); + } + ~BitReaderScopedCloser() { + if (reader_ != nullptr) { + Status close_ret = reader_->Close(); + if (!close_ret) *status_ = close_ret; + } + } + void CloseAndSuppressError() { + JXL_ASSERT(reader_ != nullptr); + (void)reader_->Close(); + reader_ = nullptr; + } + BitReaderScopedCloser(const BitReaderScopedCloser&) = delete; + + private: + BitReader* reader_; + Status* status_; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_BIT_READER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_cache.cc b/third_party/jpeg-xl/lib/jxl/dec_cache.cc new file mode 100644 index 0000000000..4db6f1d9a5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_cache.cc @@ -0,0 +1,229 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_cache.h" + +#include "lib/jxl/blending.h" +#include "lib/jxl/render_pipeline/stage_blending.h" +#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h" +#include "lib/jxl/render_pipeline/stage_epf.h" +#include "lib/jxl/render_pipeline/stage_from_linear.h" +#include "lib/jxl/render_pipeline/stage_gaborish.h" +#include "lib/jxl/render_pipeline/stage_noise.h" +#include "lib/jxl/render_pipeline/stage_patches.h" +#include "lib/jxl/render_pipeline/stage_splines.h" +#include "lib/jxl/render_pipeline/stage_spot.h" +#include "lib/jxl/render_pipeline/stage_to_linear.h" +#include "lib/jxl/render_pipeline/stage_tone_mapping.h" +#include "lib/jxl/render_pipeline/stage_upsampling.h" +#include "lib/jxl/render_pipeline/stage_write.h" +#include "lib/jxl/render_pipeline/stage_xyb.h" +#include "lib/jxl/render_pipeline/stage_ycbcr.h" + +namespace jxl { + +Status PassesDecoderState::PreparePipeline(ImageBundle* decoded, + PipelineOptions options) { + const FrameHeader& frame_header = shared->frame_header; + size_t num_c = 3 + frame_header.nonserialized_metadata->m.num_extra_channels; + if ((frame_header.flags & FrameHeader::kNoise) != 0) { + num_c += 3; + } + + if (frame_header.CanBeReferenced()) { + // Necessary so that SetInputSizes() can allocate output buffers as needed. + frame_storage_for_referencing = ImageBundle(decoded->metadata()); + } + + RenderPipeline::Builder builder(num_c); + + if (options.use_slow_render_pipeline) { + builder.UseSimpleImplementation(); + } + + if (!frame_header.chroma_subsampling.Is444()) { + for (size_t c = 0; c < 3; c++) { + if (frame_header.chroma_subsampling.HShift(c) != 0) { + builder.AddStage(GetChromaUpsamplingStage(c, /*horizontal=*/true)); + } + if (frame_header.chroma_subsampling.VShift(c) != 0) { + builder.AddStage(GetChromaUpsamplingStage(c, /*horizontal=*/false)); + } + } + } + + if (frame_header.loop_filter.gab) { + builder.AddStage(GetGaborishStage(frame_header.loop_filter)); + } + + { + const LoopFilter& lf = frame_header.loop_filter; + if (lf.epf_iters >= 3) { + builder.AddStage(GetEPFStage(lf, sigma, 0)); + } + if (lf.epf_iters >= 1) { + builder.AddStage(GetEPFStage(lf, sigma, 1)); + } + if (lf.epf_iters >= 2) { + builder.AddStage(GetEPFStage(lf, sigma, 2)); + } + } + + bool late_ec_upsample = frame_header.upsampling != 1; + for (auto ecups : frame_header.extra_channel_upsampling) { + if (ecups != frame_header.upsampling) { + // If patches are applied, either frame_header.upsampling == 1 or + // late_ec_upsample is true. + late_ec_upsample = false; + } + } + + if (!late_ec_upsample) { + for (size_t ec = 0; ec < frame_header.extra_channel_upsampling.size(); + ec++) { + if (frame_header.extra_channel_upsampling[ec] != 1) { + builder.AddStage(GetUpsamplingStage( + frame_header.nonserialized_metadata->transform_data, 3 + ec, + CeilLog2Nonzero(frame_header.extra_channel_upsampling[ec]))); + } + } + } + + if ((frame_header.flags & FrameHeader::kPatches) != 0) { + builder.AddStage( + GetPatchesStage(&shared->image_features.patches, + 3 + shared->metadata->m.num_extra_channels)); + } + if ((frame_header.flags & FrameHeader::kSplines) != 0) { + builder.AddStage(GetSplineStage(&shared->image_features.splines)); + } + + if (frame_header.upsampling != 1) { + size_t nb_channels = + 3 + + (late_ec_upsample ? frame_header.extra_channel_upsampling.size() : 0); + for (size_t c = 0; c < nb_channels; c++) { + builder.AddStage(GetUpsamplingStage( + frame_header.nonserialized_metadata->transform_data, c, + CeilLog2Nonzero(frame_header.upsampling))); + } + } + + if ((frame_header.flags & FrameHeader::kNoise) != 0) { + builder.AddStage(GetConvolveNoiseStage(num_c - 3)); + builder.AddStage(GetAddNoiseStage(shared->image_features.noise_params, + shared->cmap, num_c - 3)); + } + if (frame_header.dc_level != 0) { + builder.AddStage(GetWriteToImage3FStage( + &shared_storage.dc_frames[frame_header.dc_level - 1])); + } + + if (frame_header.CanBeReferenced() && + frame_header.save_before_color_transform) { + builder.AddStage(GetWriteToImageBundleStage( + &frame_storage_for_referencing, output_encoding_info.color_encoding)); + } + + bool has_alpha = false; + size_t alpha_c = 0; + for (size_t i = 0; i < decoded->metadata()->extra_channel_info.size(); i++) { + if (decoded->metadata()->extra_channel_info[i].type == + ExtraChannel::kAlpha) { + has_alpha = true; + alpha_c = 3 + i; + break; + } + } + + if (fast_xyb_srgb8_conversion) { + JXL_ASSERT(!NeedsBlending(this)); + JXL_ASSERT(!frame_header.CanBeReferenced() || + frame_header.save_before_color_transform); + JXL_ASSERT(!options.render_spotcolors || + !decoded->metadata()->Find(ExtraChannel::kSpotColor)); + bool is_rgba = (main_output.format.num_channels == 4); + uint8_t* rgb_output = reinterpret_cast(main_output.buffer); + builder.AddStage(GetFastXYBTosRGB8Stage(rgb_output, main_output.stride, + width, height, is_rgba, has_alpha, + alpha_c)); + } else { + bool linear = false; + if (frame_header.color_transform == ColorTransform::kYCbCr) { + builder.AddStage(GetYCbCrStage()); + } else if (frame_header.color_transform == ColorTransform::kXYB) { + builder.AddStage(GetXYBStage(output_encoding_info)); + if (output_encoding_info.color_encoding.GetColorSpace() != + ColorSpace::kXYB) { + linear = true; + } + } // Nothing to do for kNone. + + if (options.coalescing && NeedsBlending(this)) { + if (linear) { + builder.AddStage(GetFromLinearStage(output_encoding_info)); + linear = false; + } + builder.AddStage( + GetBlendingStage(this, output_encoding_info.color_encoding)); + } + + if (options.coalescing && frame_header.CanBeReferenced() && + !frame_header.save_before_color_transform) { + if (linear) { + builder.AddStage(GetFromLinearStage(output_encoding_info)); + linear = false; + } + builder.AddStage(GetWriteToImageBundleStage( + &frame_storage_for_referencing, output_encoding_info.color_encoding)); + } + + if (options.render_spotcolors && + frame_header.nonserialized_metadata->m.Find(ExtraChannel::kSpotColor)) { + for (size_t i = 0; i < decoded->metadata()->extra_channel_info.size(); + i++) { + // Don't use Find() because there may be multiple spot color channels. + const ExtraChannelInfo& eci = + decoded->metadata()->extra_channel_info[i]; + if (eci.type == ExtraChannel::kSpotColor) { + builder.AddStage(GetSpotColorStage(3 + i, eci.spot_color)); + } + } + } + + auto tone_mapping_stage = GetToneMappingStage(output_encoding_info); + if (tone_mapping_stage) { + if (!linear) { + auto to_linear_stage = GetToLinearStage(output_encoding_info); + if (!to_linear_stage) { + return JXL_FAILURE( + "attempting to perform tone mapping on colorspace not " + "convertible to linear"); + } + builder.AddStage(std::move(to_linear_stage)); + linear = true; + } + builder.AddStage(std::move(tone_mapping_stage)); + } + + if (linear) { + builder.AddStage(GetFromLinearStage(output_encoding_info)); + linear = false; + } + + if (main_output.callback.IsPresent() || main_output.buffer) { + builder.AddStage(GetWriteToOutputStage(main_output, width, height, + has_alpha, unpremul_alpha, alpha_c, + undo_orientation, extra_output)); + } else { + builder.AddStage(GetWriteToImageBundleStage( + decoded, output_encoding_info.color_encoding)); + } + } + render_pipeline = std::move(builder).Finalize(shared->frame_dim); + return render_pipeline->IsInitialized(); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dec_cache.h b/third_party/jpeg-xl/lib/jxl/dec_cache.h new file mode 100644 index 0000000000..7c9fe9a6c3 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_cache.h @@ -0,0 +1,261 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_CACHE_H_ +#define LIB_JXL_DEC_CACHE_H_ + +#include +#include + +#include +#include // HWY_ALIGN_MAX + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dec_group_border.h" +#include "lib/jxl/dec_noise.h" +#include "lib/jxl/image.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/render_pipeline/render_pipeline.h" +#include "lib/jxl/render_pipeline/stage_upsampling.h" +#include "lib/jxl/sanitizers.h" + +namespace jxl { + +constexpr size_t kSigmaBorder = 1; +constexpr size_t kSigmaPadding = 2; + +struct PixelCallback { + PixelCallback() = default; + PixelCallback(JxlImageOutInitCallback init, JxlImageOutRunCallback run, + JxlImageOutDestroyCallback destroy, void* init_opaque) + : init(init), run(run), destroy(destroy), init_opaque(init_opaque) { +#if JXL_ENABLE_ASSERT + const bool has_init = init != nullptr; + const bool has_run = run != nullptr; + const bool has_destroy = destroy != nullptr; + JXL_ASSERT(has_init == has_run && has_run == has_destroy); +#endif + } + + bool IsPresent() const { return run != nullptr; } + + void* Init(size_t num_threads, size_t num_pixels) const { + return init(init_opaque, num_threads, num_pixels); + } + + JxlImageOutInitCallback init = nullptr; + JxlImageOutRunCallback run = nullptr; + JxlImageOutDestroyCallback destroy = nullptr; + void* init_opaque = nullptr; +}; + +struct ImageOutput { + // Pixel format of the output pixels, used for buffer and callback output. + JxlPixelFormat format; + // Output bit depth for unsigned data types, used for float to int conversion. + size_t bits_per_sample; + // Callback for line-by-line output. + PixelCallback callback; + // Pixel buffer for image output. + void* buffer; + size_t buffer_size; + // Length of a row of image_buffer in bytes (based on oriented width). + size_t stride; +}; + +// Per-frame decoder state. All the images here should be accessed through a +// group rect (either with block units or pixel units). +struct PassesDecoderState { + PassesSharedState shared_storage; + // Allows avoiding copies for encoder loop. + const PassesSharedState* JXL_RESTRICT shared = &shared_storage; + + // 8x upsampling stage for DC. + std::unique_ptr upsampler8x; + + // For ANS decoding. + std::vector code; + std::vector> context_map; + + // Multiplier to be applied to the quant matrices of the x channel. + float x_dm_multiplier; + float b_dm_multiplier; + + // Sigma values for EPF. + ImageF sigma; + + // Image dimensions before applying undo_orientation. + size_t width; + size_t height; + ImageOutput main_output; + std::vector extra_output; + + // Whether to use int16 float-XYB-to-uint8-srgb conversion. + bool fast_xyb_srgb8_conversion; + + // If true, the RGBA output will be unpremultiplied before writing to the + // output. + bool unpremul_alpha; + + // The render pipeline will apply this orientation to bring the image to the + // intended display orientation. + Orientation undo_orientation; + + // Used for seeding noise. + size_t visible_frame_index = 0; + size_t nonvisible_frame_index = 0; + + // Keep track of the transform types used. + std::atomic used_acs{0}; + + // Storage for coefficients if in "accumulate" mode. + std::unique_ptr coefficients = make_unique>(0, 0); + + // Rendering pipeline. + std::unique_ptr render_pipeline; + + // Storage for the current frame if it can be referenced by future frames. + ImageBundle frame_storage_for_referencing; + + struct PipelineOptions { + bool use_slow_render_pipeline; + bool coalescing; + bool render_spotcolors; + }; + + Status PreparePipeline(ImageBundle* decoded, PipelineOptions options); + + // Information for colour conversions. + OutputEncodingInfo output_encoding_info; + + // Initializes decoder-specific structures using information from *shared. + Status Init() { + x_dm_multiplier = + std::pow(1 / (1.25f), shared->frame_header.x_qm_scale - 2.0f); + b_dm_multiplier = + std::pow(1 / (1.25f), shared->frame_header.b_qm_scale - 2.0f); + + main_output.callback = PixelCallback(); + main_output.buffer = nullptr; + extra_output.clear(); + + fast_xyb_srgb8_conversion = false; + unpremul_alpha = false; + undo_orientation = Orientation::kIdentity; + + used_acs = 0; + + upsampler8x = GetUpsamplingStage(shared->metadata->transform_data, 0, 3); + if (shared->frame_header.loop_filter.epf_iters > 0) { + sigma = ImageF(shared->frame_dim.xsize_blocks + 2 * kSigmaPadding, + shared->frame_dim.ysize_blocks + 2 * kSigmaPadding); + } + return true; + } + + // Initialize the decoder state after all of DC is decoded. + Status InitForAC(ThreadPool* pool) { + shared_storage.coeff_order_size = 0; + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + if (((1 << o) & used_acs) == 0) continue; + uint8_t ord = kStrategyOrder[o]; + shared_storage.coeff_order_size = + std::max(kCoeffOrderOffset[3 * (ord + 1)] * kDCTBlockSize, + shared_storage.coeff_order_size); + } + size_t sz = shared_storage.frame_header.passes.num_passes * + shared_storage.coeff_order_size; + if (sz > shared_storage.coeff_orders.size()) { + shared_storage.coeff_orders.resize(sz); + } + return true; + } + + // Fills the `state->filter_weights.sigma` image with the precomputed sigma + // values in the area inside `block_rect`. Accesses the AC strategy, quant + // field and epf_sharpness fields in the corresponding positions. + void ComputeSigma(const Rect& block_rect, PassesDecoderState* state); +}; + +// Temp images required for decoding a single group. Reduces memory allocations +// for large images because we only initialize min(#threads, #groups) instances. +struct GroupDecCache { + void InitOnce(size_t num_passes, size_t used_acs) { + PROFILER_FUNC; + + for (size_t i = 0; i < num_passes; i++) { + if (num_nzeroes[i].xsize() == 0) { + // Allocate enough for a whole group - partial groups on the + // right/bottom border just use a subset. The valid size is passed via + // Rect. + + num_nzeroes[i] = Image3I(kGroupDimInBlocks, kGroupDimInBlocks); + } + } + size_t max_block_area = 0; + + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + AcStrategy acs = AcStrategy::FromRawStrategy(o); + if ((used_acs & (1 << o)) == 0) continue; + size_t area = + acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize; + max_block_area = std::max(area, max_block_area); + } + + if (max_block_area > max_block_area_) { + max_block_area_ = max_block_area; + // We need 3x float blocks for dequantized coefficients and 1x for scratch + // space for transforms. + float_memory_ = hwy::AllocateAligned(max_block_area_ * 4); + // We need 3x int32 or int16 blocks for quantized coefficients. + int32_memory_ = hwy::AllocateAligned(max_block_area_ * 3); + int16_memory_ = hwy::AllocateAligned(max_block_area_ * 3); + } + + dec_group_block = float_memory_.get(); + scratch_space = dec_group_block + max_block_area_ * 3; + dec_group_qblock = int32_memory_.get(); + dec_group_qblock16 = int16_memory_.get(); + } + + void InitDCBufferOnce() { + if (dc_buffer.xsize() == 0) { + dc_buffer = ImageF(kGroupDimInBlocks + kRenderPipelineXOffset * 2, + kGroupDimInBlocks + 4); + } + } + + // Scratch space used by DecGroupImpl(). + float* dec_group_block; + int32_t* dec_group_qblock; + int16_t* dec_group_qblock16; + + // For TransformToPixels. + float* scratch_space; + // Note that scratch_space is never used at the same time as dec_group_qblock. + // Moreover, only one of dec_group_qblock16 is ever used. + // TODO(veluca): figure out if we can save allocations. + + // AC decoding + Image3I num_nzeroes[kMaxNumPasses]; + + // Buffer for DC upsampling. + ImageF dc_buffer; + + private: + hwy::AlignedFreeUniquePtr float_memory_; + hwy::AlignedFreeUniquePtr int32_memory_; + hwy::AlignedFreeUniquePtr int16_memory_; + size_t max_block_area_ = 0; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_CACHE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_context_map.cc b/third_party/jpeg-xl/lib/jxl/dec_context_map.cc new file mode 100644 index 0000000000..1b291650d7 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_context_map.cc @@ -0,0 +1,86 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_context_map.h" + +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/inverse_mtf-inl.h" + +namespace jxl { + +namespace { + +Status VerifyContextMap(const std::vector& context_map, + const size_t num_htrees) { + std::vector have_htree(num_htrees); + size_t num_found = 0; + for (const uint8_t htree : context_map) { + if (htree >= num_htrees) { + return JXL_FAILURE("Invalid histogram index in context map."); + } + if (!have_htree[htree]) { + have_htree[htree] = true; + ++num_found; + } + } + if (num_found != num_htrees) { + return JXL_FAILURE("Incomplete context map."); + } + return true; +} + +} // namespace + +Status DecodeContextMap(std::vector* context_map, size_t* num_htrees, + BitReader* input) { + bool is_simple = input->ReadFixedBits<1>(); + if (is_simple) { + int bits_per_entry = input->ReadFixedBits<2>(); + if (bits_per_entry != 0) { + for (size_t i = 0; i < context_map->size(); i++) { + (*context_map)[i] = input->ReadBits(bits_per_entry); + } + } else { + std::fill(context_map->begin(), context_map->end(), 0); + } + } else { + bool use_mtf = input->ReadFixedBits<1>(); + ANSCode code; + std::vector dummy_ctx_map; + // Usage of LZ77 is disallowed if decoding only two symbols. This doesn't + // make sense in non-malicious bitstreams, and could cause a stack overflow + // in malicious bitstreams by making every context map require its own + // context map. + JXL_RETURN_IF_ERROR( + DecodeHistograms(input, 1, &code, &dummy_ctx_map, + /*disallow_lz77=*/context_map->size() <= 2)); + ANSSymbolReader reader(&code, input); + size_t i = 0; + while (i < context_map->size()) { + uint32_t sym = reader.ReadHybridUint(0, input, dummy_ctx_map); + if (sym >= kMaxClusters) { + return JXL_FAILURE("Invalid cluster ID"); + } + (*context_map)[i] = sym; + i++; + } + if (!reader.CheckANSFinalState()) { + return JXL_FAILURE("Invalid context map"); + } + if (use_mtf) { + InverseMoveToFrontTransform(context_map->data(), context_map->size()); + } + } + *num_htrees = *std::max_element(context_map->begin(), context_map->end()) + 1; + return VerifyContextMap(*context_map, *num_htrees); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dec_context_map.h b/third_party/jpeg-xl/lib/jxl/dec_context_map.h new file mode 100644 index 0000000000..95b8a0ca92 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_context_map.h @@ -0,0 +1,30 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_CONTEXT_MAP_H_ +#define LIB_JXL_DEC_CONTEXT_MAP_H_ + +#include +#include + +#include + +#include "lib/jxl/dec_bit_reader.h" + +namespace jxl { + +// Context map uses uint8_t. +constexpr size_t kMaxClusters = 256; + +// Reads the context map from the bit stream. On calling this function, +// context_map->size() must be the number of possible context ids. +// Sets *num_htrees to the number of different histogram ids in +// *context_map. +Status DecodeContextMap(std::vector* context_map, size_t* num_htrees, + BitReader* input); + +} // namespace jxl + +#endif // LIB_JXL_DEC_CONTEXT_MAP_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_external_image.cc b/third_party/jpeg-xl/lib/jxl/dec_external_image.cc new file mode 100644 index 0000000000..bbf457ba91 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_external_image.cc @@ -0,0 +1,493 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_external_image.h" + +#include + +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_external_image.cc" +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Clamp; +using hwy::HWY_NAMESPACE::NearestInt; + +// TODO(jon): check if this can be replaced by a FloatToU16 function +void FloatToU32(const float* in, uint32_t* out, size_t num, float mul, + size_t bits_per_sample) { + const HWY_FULL(float) d; + const hwy::HWY_NAMESPACE::Rebind du; + + // Unpoison accessing partially-uninitialized vectors with memory sanitizer. + // This is because we run NearestInt() on the vector, which triggers msan even + // it it safe to do so since the values are not mixed between lanes. + const size_t num_round_up = RoundUpTo(num, Lanes(d)); + msan::UnpoisonMemory(in + num, sizeof(in[0]) * (num_round_up - num)); + + const auto one = Set(d, 1.0f); + const auto scale = Set(d, mul); + for (size_t x = 0; x < num; x += Lanes(d)) { + auto v = Load(d, in + x); + // Clamp turns NaN to 'min'. + v = Clamp(v, Zero(d), one); + auto i = NearestInt(Mul(v, scale)); + Store(BitCast(du, i), du, out + x); + } + + // Poison back the output. + msan::PoisonMemory(out + num, sizeof(out[0]) * (num_round_up - num)); +} + +void FloatToF16(const float* in, hwy::float16_t* out, size_t num) { + const HWY_FULL(float) d; + const hwy::HWY_NAMESPACE::Rebind du; + + // Unpoison accessing partially-uninitialized vectors with memory sanitizer. + // This is because we run DemoteTo() on the vector which triggers msan. + const size_t num_round_up = RoundUpTo(num, Lanes(d)); + msan::UnpoisonMemory(in + num, sizeof(in[0]) * (num_round_up - num)); + + for (size_t x = 0; x < num; x += Lanes(d)) { + auto v = Load(d, in + x); + auto v16 = DemoteTo(du, v); + Store(v16, du, out + x); + } + + // Poison back the output. + msan::PoisonMemory(out + num, sizeof(out[0]) * (num_round_up - num)); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace jxl { +namespace { + +// Stores a float in big endian +void StoreBEFloat(float value, uint8_t* p) { + uint32_t u; + memcpy(&u, &value, 4); + StoreBE32(u, p); +} + +// Stores a float in little endian +void StoreLEFloat(float value, uint8_t* p) { + uint32_t u; + memcpy(&u, &value, 4); + StoreLE32(u, p); +} + +// The orientation may not be identity. +// TODO(lode): SIMDify where possible +template +Status UndoOrientation(jxl::Orientation undo_orientation, const Plane& image, + Plane& out, jxl::ThreadPool* pool) { + const size_t xsize = image.xsize(); + const size_t ysize = image.ysize(); + + if (undo_orientation == Orientation::kFlipHorizontal) { + out = Plane(xsize, ysize); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + T* JXL_RESTRICT row_out = out.Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[xsize - x - 1] = row_in[x]; + } + }, + "UndoOrientation")); + } else if (undo_orientation == Orientation::kRotate180) { + out = Plane(xsize, ysize); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + T* JXL_RESTRICT row_out = out.Row(ysize - y - 1); + for (size_t x = 0; x < xsize; ++x) { + row_out[xsize - x - 1] = row_in[x]; + } + }, + "UndoOrientation")); + } else if (undo_orientation == Orientation::kFlipVertical) { + out = Plane(xsize, ysize); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + T* JXL_RESTRICT row_out = out.Row(ysize - y - 1); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row_in[x]; + } + }, + "UndoOrientation")); + } else if (undo_orientation == Orientation::kTranspose) { + out = Plane(ysize, xsize); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + for (size_t x = 0; x < xsize; ++x) { + out.Row(x)[y] = row_in[x]; + } + }, + "UndoOrientation")); + } else if (undo_orientation == Orientation::kRotate90) { + out = Plane(ysize, xsize); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + for (size_t x = 0; x < xsize; ++x) { + out.Row(x)[ysize - y - 1] = row_in[x]; + } + }, + "UndoOrientation")); + } else if (undo_orientation == Orientation::kAntiTranspose) { + out = Plane(ysize, xsize); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + for (size_t x = 0; x < xsize; ++x) { + out.Row(xsize - x - 1)[ysize - y - 1] = row_in[x]; + } + }, + "UndoOrientation")); + } else if (undo_orientation == Orientation::kRotate270) { + out = Plane(ysize, xsize); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + for (size_t x = 0; x < xsize; ++x) { + out.Row(xsize - x - 1)[y] = row_in[x]; + } + }, + "UndoOrientation")); + } + return true; +} +} // namespace + +HWY_EXPORT(FloatToU32); +HWY_EXPORT(FloatToF16); + +namespace { + +using StoreFuncType = void(uint32_t value, uint8_t* dest); +template +void StoreUintRow(uint32_t* JXL_RESTRICT* rows_u32, size_t num_channels, + size_t xsize, size_t bytes_per_sample, + uint8_t* JXL_RESTRICT out) { + for (size_t x = 0; x < xsize; ++x) { + for (size_t c = 0; c < num_channels; c++) { + StoreFunc(rows_u32[c][x], + out + (num_channels * x + c) * bytes_per_sample); + } + } +} + +template +void StoreFloatRow(const float* JXL_RESTRICT* rows_in, size_t num_channels, + size_t xsize, uint8_t* JXL_RESTRICT out) { + for (size_t x = 0; x < xsize; ++x) { + for (size_t c = 0; c < num_channels; c++) { + StoreFunc(rows_in[c][x], out + (num_channels * x + c) * sizeof(float)); + } + } +} + +void JXL_INLINE Store8(uint32_t value, uint8_t* dest) { *dest = value & 0xff; } + +// Maximum number of channels for the ConvertChannelsToExternal function. +const size_t kConvertMaxChannels = 4; + +// Converts a list of channels to an interleaved image, applying transformations +// when needed. +// The input channels are given as a (non-const!) array of channel pointers and +// interleaved in that order. +// +// Note: if a pointer in channels[] is nullptr, a 1.0 value will be used +// instead. This is useful for handling when a user requests an alpha channel +// from an image that doesn't have one. The first channel in the list may not +// be nullptr, since it is used to determine the image size. +Status ConvertChannelsToExternal(const ImageF* channels[], size_t num_channels, + size_t bits_per_sample, bool float_out, + JxlEndianness endianness, size_t stride, + jxl::ThreadPool* pool, void* out_image, + size_t out_size, + const PixelCallback& out_callback, + jxl::Orientation undo_orientation) { + JXL_DASSERT(num_channels != 0 && num_channels <= kConvertMaxChannels); + JXL_DASSERT(channels[0] != nullptr); + JXL_CHECK(float_out ? bits_per_sample == 16 || bits_per_sample == 32 + : bits_per_sample > 0 && bits_per_sample <= 16); + if (!!out_image == out_callback.IsPresent()) { + return JXL_FAILURE( + "Must provide either an out_image or an out_callback, but not both."); + } + + const size_t bytes_per_channel = DivCeil(bits_per_sample, jxl::kBitsPerByte); + const size_t bytes_per_pixel = num_channels * bytes_per_channel; + + std::vector> row_out_callback; + const auto FreeCallbackOpaque = [&out_callback](void* p) { + out_callback.destroy(p); + }; + std::unique_ptr out_run_opaque( + nullptr, FreeCallbackOpaque); + auto InitOutCallback = [&](size_t num_threads) -> Status { + if (out_callback.IsPresent()) { + out_run_opaque.reset(out_callback.Init(num_threads, stride)); + JXL_RETURN_IF_ERROR(out_run_opaque != nullptr); + row_out_callback.resize(num_threads); + for (size_t i = 0; i < num_threads; ++i) { + row_out_callback[i].resize(stride); + } + } + return true; + }; + + // Channels used to store the transformed original channels if needed. + ImageF temp_channels[kConvertMaxChannels]; + if (undo_orientation != Orientation::kIdentity) { + for (size_t c = 0; c < num_channels; ++c) { + if (channels[c]) { + JXL_RETURN_IF_ERROR(UndoOrientation(undo_orientation, *channels[c], + temp_channels[c], pool)); + channels[c] = &(temp_channels[c]); + } + } + } + + // First channel may not be nullptr. + size_t xsize = channels[0]->xsize(); + size_t ysize = channels[0]->ysize(); + if (stride < bytes_per_pixel * xsize) { + return JXL_FAILURE("stride is smaller than scanline width in bytes: %" PRIuS + " vs %" PRIuS, + stride, bytes_per_pixel * xsize); + } + if (!out_callback.IsPresent() && + out_size < (ysize - 1) * stride + bytes_per_pixel * xsize) { + return JXL_FAILURE("out_size is too small to store image"); + } + + const bool little_endian = + endianness == JXL_LITTLE_ENDIAN || + (endianness == JXL_NATIVE_ENDIAN && IsLittleEndian()); + + // Handle the case where a channel is nullptr by creating a single row with + // ones to use instead. + ImageF ones; + for (size_t c = 0; c < num_channels; ++c) { + if (!channels[c]) { + ones = ImageF(xsize, 1); + FillImage(1.0f, &ones); + break; + } + } + + if (float_out) { + if (bits_per_sample == 16) { + bool swap_endianness = little_endian != IsLittleEndian(); + Plane f16_cache; + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, static_cast(ysize), + [&](size_t num_threads) { + f16_cache = + Plane(xsize, num_channels * num_threads); + return InitOutCallback(num_threads); + }, + [&](const uint32_t task, const size_t thread) { + const int64_t y = task; + const float* JXL_RESTRICT row_in[kConvertMaxChannels]; + for (size_t c = 0; c < num_channels; c++) { + row_in[c] = channels[c] ? channels[c]->Row(y) : ones.Row(0); + } + hwy::float16_t* JXL_RESTRICT row_f16[kConvertMaxChannels]; + for (size_t c = 0; c < num_channels; c++) { + row_f16[c] = f16_cache.Row(c + thread * num_channels); + HWY_DYNAMIC_DISPATCH(FloatToF16) + (row_in[c], row_f16[c], xsize); + } + uint8_t* row_out = + out_callback.IsPresent() + ? row_out_callback[thread].data() + : &(reinterpret_cast(out_image))[stride * y]; + // interleave the one scanline + hwy::float16_t* row_f16_out = + reinterpret_cast(row_out); + for (size_t x = 0; x < xsize; x++) { + for (size_t c = 0; c < num_channels; c++) { + row_f16_out[x * num_channels + c] = row_f16[c][x]; + } + } + if (swap_endianness) { + size_t size = xsize * num_channels * 2; + for (size_t i = 0; i < size; i += 2) { + std::swap(row_out[i + 0], row_out[i + 1]); + } + } + if (out_callback.IsPresent()) { + out_callback.run(out_run_opaque.get(), thread, 0, y, xsize, + row_out); + } + }, + "ConvertF16")); + } else if (bits_per_sample == 32) { + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, static_cast(ysize), + [&](size_t num_threads) { return InitOutCallback(num_threads); }, + [&](const uint32_t task, const size_t thread) { + const int64_t y = task; + uint8_t* row_out = + out_callback.IsPresent() + ? row_out_callback[thread].data() + : &(reinterpret_cast(out_image))[stride * y]; + const float* JXL_RESTRICT row_in[kConvertMaxChannels]; + for (size_t c = 0; c < num_channels; c++) { + row_in[c] = channels[c] ? channels[c]->Row(y) : ones.Row(0); + } + if (little_endian) { + StoreFloatRow(row_in, num_channels, xsize, row_out); + } else { + StoreFloatRow(row_in, num_channels, xsize, row_out); + } + if (out_callback.IsPresent()) { + out_callback.run(out_run_opaque.get(), thread, 0, y, xsize, + row_out); + } + }, + "ConvertFloat")); + } else { + return JXL_FAILURE("float other than 16-bit and 32-bit not supported"); + } + } else { + // Multiplier to convert from floating point 0-1 range to the integer + // range. + float mul = (1ull << bits_per_sample) - 1; + Plane u32_cache; + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, static_cast(ysize), + [&](size_t num_threads) { + u32_cache = Plane(xsize, num_channels * num_threads); + return InitOutCallback(num_threads); + }, + [&](const uint32_t task, const size_t thread) { + const int64_t y = task; + uint8_t* row_out = + out_callback.IsPresent() + ? row_out_callback[thread].data() + : &(reinterpret_cast(out_image))[stride * y]; + const float* JXL_RESTRICT row_in[kConvertMaxChannels]; + for (size_t c = 0; c < num_channels; c++) { + row_in[c] = channels[c] ? channels[c]->Row(y) : ones.Row(0); + } + uint32_t* JXL_RESTRICT row_u32[kConvertMaxChannels]; + for (size_t c = 0; c < num_channels; c++) { + row_u32[c] = u32_cache.Row(c + thread * num_channels); + // row_u32[] is a per-thread temporary row storage, this isn't + // intended to be initialized on a previous run. + msan::PoisonMemory(row_u32[c], xsize * sizeof(row_u32[c][0])); + HWY_DYNAMIC_DISPATCH(FloatToU32) + (row_in[c], row_u32[c], xsize, mul, bits_per_sample); + } + if (bits_per_sample <= 8) { + StoreUintRow(row_u32, num_channels, xsize, 1, row_out); + } else { + if (little_endian) { + StoreUintRow(row_u32, num_channels, xsize, 2, row_out); + } else { + StoreUintRow(row_u32, num_channels, xsize, 2, row_out); + } + } + if (out_callback.IsPresent()) { + out_callback.run(out_run_opaque.get(), thread, 0, y, xsize, + row_out); + } + }, + "ConvertUint")); + } + return true; +} + +} // namespace + +Status ConvertToExternal(const jxl::ImageBundle& ib, size_t bits_per_sample, + bool float_out, size_t num_channels, + JxlEndianness endianness, size_t stride, + jxl::ThreadPool* pool, void* out_image, + size_t out_size, const PixelCallback& out_callback, + jxl::Orientation undo_orientation, + bool unpremul_alpha) { + bool want_alpha = num_channels == 2 || num_channels == 4; + size_t color_channels = num_channels <= 2 ? 1 : 3; + + const Image3F* color = &ib.color(); + // Undo premultiplied alpha. + Image3F unpremul; + if (ib.AlphaIsPremultiplied() && ib.HasAlpha() && unpremul_alpha) { + unpremul = Image3F(color->xsize(), color->ysize()); + CopyImageTo(*color, &unpremul); + for (size_t y = 0; y < unpremul.ysize(); y++) { + UnpremultiplyAlpha(unpremul.PlaneRow(0, y), unpremul.PlaneRow(1, y), + unpremul.PlaneRow(2, y), ib.alpha().Row(y), + unpremul.xsize()); + } + color = &unpremul; + } + + const ImageF* channels[kConvertMaxChannels]; + size_t c = 0; + for (; c < color_channels; c++) { + channels[c] = &color->Plane(c); + } + if (want_alpha) { + channels[c++] = ib.HasAlpha() ? &ib.alpha() : nullptr; + } + JXL_ASSERT(num_channels == c); + + return ConvertChannelsToExternal( + channels, num_channels, bits_per_sample, float_out, endianness, stride, + pool, out_image, out_size, out_callback, undo_orientation); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/dec_external_image.h b/third_party/jpeg-xl/lib/jxl/dec_external_image.h new file mode 100644 index 0000000000..6ca7abff62 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_external_image.h @@ -0,0 +1,46 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_EXTERNAL_IMAGE_H_ +#define LIB_JXL_DEC_EXTERNAL_IMAGE_H_ + +// Interleaved image for color transforms and Codec. + +#include +#include +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Converts ib to interleaved void* pixel buffer with the given format. +// bits_per_sample: must be 16 or 32 if float_out is true, and at most 16 +// if it is false. No bit packing is done. +// num_channels: must be 1, 2, 3 or 4 for gray, gray+alpha, RGB, RGB+alpha. +// This supports the features needed for the C API and does not perform +// color space conversion. +// TODO(lode): support rectangle crop. +// stride_out is output scanline size in bytes, must be >= +// output_xsize * output_bytes_per_pixel. +// undo_orientation is an EXIF orientation to undo. Depending on the +// orientation, the output xsize and ysize are swapped compared to input +// xsize and ysize. +Status ConvertToExternal(const jxl::ImageBundle& ib, size_t bits_per_sample, + bool float_out, size_t num_channels, + JxlEndianness endianness, size_t stride_out, + jxl::ThreadPool* thread_pool, void* out_image, + size_t out_size, const PixelCallback& out_callback, + jxl::Orientation undo_orientation, + bool unpremul_alpha = false); + +} // namespace jxl + +#endif // LIB_JXL_DEC_EXTERNAL_IMAGE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc b/third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc new file mode 100644 index 0000000000..c87a4d5f36 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc @@ -0,0 +1,56 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "benchmark/benchmark.h" +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { +namespace { + +// Decoder case, interleaves an internal float image. +void BM_DecExternalImage_ConvertImageRGBA(benchmark::State& state) { + const size_t kNumIter = 5; + size_t xsize = state.range(); + size_t ysize = state.range(); + size_t num_channels = 4; + + ImageMetadata im; + im.SetAlphaBits(8); + ImageBundle ib(&im); + Image3F color(xsize, ysize); + ZeroFillImage(&color); + ib.SetFromImage(std::move(color), ColorEncoding::SRGB()); + ImageF alpha(xsize, ysize); + ZeroFillImage(&alpha); + ib.SetAlpha(std::move(alpha)); + + const size_t bytes_per_row = xsize * num_channels; + std::vector interleaved(bytes_per_row * ysize); + + for (auto _ : state) { + for (size_t i = 0; i < kNumIter; ++i) { + JXL_CHECK(ConvertToExternal( + ib, + /*bits_per_sample=*/8, + /*float_out=*/false, num_channels, JXL_NATIVE_ENDIAN, + /*stride*/ bytes_per_row, + /*thread_pool=*/nullptr, interleaved.data(), interleaved.size(), + /*out_callback=*/{}, + /*undo_orientation=*/jxl::Orientation::kIdentity)); + } + } + + // Pixels per second. + state.SetItemsProcessed(kNumIter * state.iterations() * xsize * ysize); + state.SetBytesProcessed(kNumIter * state.iterations() * interleaved.size()); +} + +BENCHMARK(BM_DecExternalImage_ConvertImageRGBA) + ->RangeMultiplier(2) + ->Range(256, 2048); + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dec_frame.cc b/third_party/jpeg-xl/lib/jxl/dec_frame.cc new file mode 100644 index 0000000000..98508e431b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_frame.cc @@ -0,0 +1,878 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_frame.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_group.h" +#include "lib/jxl/dec_modular.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/jpeg/jpeg_data.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" + +namespace jxl { + +namespace { +Status DecodeGlobalDCInfo(BitReader* reader, bool is_jpeg, + PassesDecoderState* state, ThreadPool* pool) { + PROFILER_FUNC; + JXL_RETURN_IF_ERROR(state->shared_storage.quantizer.Decode(reader)); + + JXL_RETURN_IF_ERROR( + DecodeBlockCtxMap(reader, &state->shared_storage.block_ctx_map)); + + JXL_RETURN_IF_ERROR(state->shared_storage.cmap.DecodeDC(reader)); + + // Pre-compute info for decoding a group. + if (is_jpeg) { + state->shared_storage.quantizer.ClearDCMul(); // Don't dequant DC + } + + state->shared_storage.ac_strategy.FillInvalid(); + return true; +} +} // namespace + +Status DecodeFrame(PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool, + const uint8_t* next_in, size_t avail_in, + ImageBundle* decoded, const CodecMetadata& metadata, + bool use_slow_rendering_pipeline) { + FrameDecoder frame_decoder(dec_state, metadata, pool, + use_slow_rendering_pipeline); + + BitReader reader(Span(next_in, avail_in)); + JXL_RETURN_IF_ERROR(frame_decoder.InitFrame(&reader, decoded, + /*is_preview=*/false)); + JXL_RETURN_IF_ERROR(frame_decoder.InitFrameOutput()); + + JXL_RETURN_IF_ERROR(reader.AllReadsWithinBounds()); + size_t header_bytes = reader.TotalBitsConsumed() / kBitsPerByte; + JXL_RETURN_IF_ERROR(reader.Close()); + + size_t processed_bytes = header_bytes; + Status close_ok = true; + std::vector> section_readers; + { + std::vector> section_closers; + std::vector section_info; + std::vector section_status; + size_t pos = header_bytes; + size_t index = 0; + for (auto toc_entry : frame_decoder.Toc()) { + JXL_RETURN_IF_ERROR(pos + toc_entry.size <= avail_in); + auto br = make_unique( + Span(next_in + pos, toc_entry.size)); + section_info.emplace_back( + FrameDecoder::SectionInfo{br.get(), toc_entry.id, index++}); + section_closers.emplace_back( + make_unique(br.get(), &close_ok)); + section_readers.emplace_back(std::move(br)); + pos += toc_entry.size; + } + section_status.resize(section_info.size()); + JXL_RETURN_IF_ERROR(frame_decoder.ProcessSections( + section_info.data(), section_info.size(), section_status.data())); + for (size_t i = 0; i < section_status.size(); i++) { + JXL_RETURN_IF_ERROR(section_status[i] == FrameDecoder::kDone); + processed_bytes += frame_decoder.Toc()[i].size; + } + } + JXL_RETURN_IF_ERROR(close_ok); + JXL_RETURN_IF_ERROR(frame_decoder.FinalizeFrame()); + decoded->SetDecodedBytes(processed_bytes); + return true; +} + +Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded, + bool is_preview) { + PROFILER_FUNC; + decoded_ = decoded; + JXL_ASSERT(is_finalized_); + + // Reset the dequantization matrices to their default values. + dec_state_->shared_storage.matrices = DequantMatrices(); + + frame_header_.nonserialized_is_preview = is_preview; + JXL_ASSERT(frame_header_.nonserialized_metadata != nullptr); + JXL_RETURN_IF_ERROR(ReadFrameHeader(br, &frame_header_)); + frame_dim_ = frame_header_.ToFrameDimensions(); + JXL_DEBUG_V(2, "FrameHeader: %s", frame_header_.DebugString().c_str()); + + const size_t num_passes = frame_header_.passes.num_passes; + const size_t num_groups = frame_dim_.num_groups; + + // If the previous frame was not a kRegularFrame, `decoded` may have different + // dimensions; must reset to avoid errors. + decoded->RemoveColor(); + decoded->ClearExtraChannels(); + + decoded->duration = frame_header_.animation_frame.duration; + + if (!frame_header_.nonserialized_is_preview && + (frame_header_.is_last || frame_header_.animation_frame.duration > 0) && + (frame_header_.frame_type == kRegularFrame || + frame_header_.frame_type == kSkipProgressive)) { + ++dec_state_->visible_frame_index; + dec_state_->nonvisible_frame_index = 0; + } else { + ++dec_state_->nonvisible_frame_index; + } + + // Read TOC. + const bool has_ac_global = true; + const size_t toc_entries = NumTocEntries(num_groups, frame_dim_.num_dc_groups, + num_passes, has_ac_global); + std::vector sizes; + std::vector permutation; + JXL_RETURN_IF_ERROR(ReadToc(toc_entries, br, &sizes, &permutation)); + bool have_permutation = !permutation.empty(); + toc_.resize(toc_entries); + section_sizes_sum_ = 0; + for (size_t i = 0; i < toc_entries; ++i) { + toc_[i].size = sizes[i]; + size_t index = have_permutation ? permutation[i] : i; + toc_[index].id = i; + if (section_sizes_sum_ + toc_[i].size < section_sizes_sum_) { + return JXL_FAILURE("group offset overflow"); + } + section_sizes_sum_ += toc_[i].size; + } + + JXL_DASSERT((br->TotalBitsConsumed() % kBitsPerByte) == 0); + const size_t group_codes_begin = br->TotalBitsConsumed() / kBitsPerByte; + JXL_DASSERT(!toc_.empty()); + + // Overflow check. + if (group_codes_begin + section_sizes_sum_ < group_codes_begin) { + return JXL_FAILURE("Invalid group codes"); + } + + if (!frame_header_.chroma_subsampling.Is444() && + !(frame_header_.flags & FrameHeader::kSkipAdaptiveDCSmoothing) && + frame_header_.encoding == FrameEncoding::kVarDCT) { + return JXL_FAILURE( + "Non-444 chroma subsampling is not allowed when adaptive DC " + "smoothing is enabled"); + } + return true; +} + +Status FrameDecoder::InitFrameOutput() { + JXL_RETURN_IF_ERROR( + InitializePassesSharedState(frame_header_, &dec_state_->shared_storage)); + JXL_RETURN_IF_ERROR(dec_state_->Init()); + modular_frame_decoder_.Init(frame_dim_); + + if (decoded_->IsJPEG()) { + if (frame_header_.encoding == FrameEncoding::kModular) { + return JXL_FAILURE("Cannot output JPEG from Modular"); + } + jpeg::JPEGData* jpeg_data = decoded_->jpeg_data.get(); + size_t num_components = jpeg_data->components.size(); + if (num_components != 1 && num_components != 3) { + return JXL_FAILURE("Invalid number of components"); + } + if (frame_header_.nonserialized_metadata->m.xyb_encoded) { + return JXL_FAILURE("Cannot decode to JPEG an XYB image"); + } + auto jpeg_c_map = JpegOrder(ColorTransform::kYCbCr, num_components == 1); + decoded_->jpeg_data->width = frame_dim_.xsize; + decoded_->jpeg_data->height = frame_dim_.ysize; + for (size_t c = 0; c < num_components; c++) { + auto& component = jpeg_data->components[jpeg_c_map[c]]; + component.width_in_blocks = + frame_dim_.xsize_blocks >> frame_header_.chroma_subsampling.HShift(c); + component.height_in_blocks = + frame_dim_.ysize_blocks >> frame_header_.chroma_subsampling.VShift(c); + component.h_samp_factor = + 1 << frame_header_.chroma_subsampling.RawHShift(c); + component.v_samp_factor = + 1 << frame_header_.chroma_subsampling.RawVShift(c); + component.coeffs.resize(component.width_in_blocks * + component.height_in_blocks * jxl::kDCTBlockSize); + } + } + + // Clear the state. + decoded_dc_global_ = false; + decoded_ac_global_ = false; + is_finalized_ = false; + finalized_dc_ = false; + num_sections_done_ = 0; + decoded_dc_groups_.clear(); + decoded_dc_groups_.resize(frame_dim_.num_dc_groups); + decoded_passes_per_ac_group_.clear(); + decoded_passes_per_ac_group_.resize(frame_dim_.num_groups, 0); + processed_section_.clear(); + processed_section_.resize(toc_.size()); + allocated_ = false; + return true; +} + +Status FrameDecoder::ProcessDCGlobal(BitReader* br) { + PROFILER_FUNC; + PassesSharedState& shared = dec_state_->shared_storage; + if (shared.frame_header.flags & FrameHeader::kPatches) { + bool uses_extra_channels = false; + JXL_RETURN_IF_ERROR(shared.image_features.patches.Decode( + br, frame_dim_.xsize_padded, frame_dim_.ysize_padded, + &uses_extra_channels)); + if (uses_extra_channels && frame_header_.upsampling != 1) { + for (size_t ecups : frame_header_.extra_channel_upsampling) { + if (ecups != frame_header_.upsampling) { + return JXL_FAILURE( + "Cannot use extra channels in patches if color channels are " + "subsampled differently from extra channels"); + } + } + } + } else { + shared.image_features.patches.Clear(); + } + shared.image_features.splines.Clear(); + if (shared.frame_header.flags & FrameHeader::kSplines) { + JXL_RETURN_IF_ERROR(shared.image_features.splines.Decode( + br, frame_dim_.xsize * frame_dim_.ysize)); + } + if (shared.frame_header.flags & FrameHeader::kNoise) { + JXL_RETURN_IF_ERROR(DecodeNoise(br, &shared.image_features.noise_params)); + } + JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.DecodeDC(br)); + + if (frame_header_.encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR( + jxl::DecodeGlobalDCInfo(br, decoded_->IsJPEG(), dec_state_, pool_)); + } + // Splines' draw cache uses the color correlation map. + if (shared.frame_header.flags & FrameHeader::kSplines) { + JXL_RETURN_IF_ERROR(shared.image_features.splines.InitializeDrawCache( + frame_dim_.xsize_upsampled, frame_dim_.ysize_upsampled, + dec_state_->shared->cmap)); + } + Status dec_status = modular_frame_decoder_.DecodeGlobalInfo( + br, frame_header_, /*allow_truncated_group=*/false); + if (dec_status.IsFatalError()) return dec_status; + if (dec_status) { + decoded_dc_global_ = true; + } + return dec_status; +} + +Status FrameDecoder::ProcessDCGroup(size_t dc_group_id, BitReader* br) { + PROFILER_FUNC; + const size_t gx = dc_group_id % frame_dim_.xsize_dc_groups; + const size_t gy = dc_group_id / frame_dim_.xsize_dc_groups; + const LoopFilter& lf = dec_state_->shared->frame_header.loop_filter; + if (frame_header_.encoding == FrameEncoding::kVarDCT && + !(frame_header_.flags & FrameHeader::kUseDcFrame)) { + JXL_RETURN_IF_ERROR( + modular_frame_decoder_.DecodeVarDCTDC(dc_group_id, br, dec_state_)); + } + const Rect mrect(gx * frame_dim_.dc_group_dim, gy * frame_dim_.dc_group_dim, + frame_dim_.dc_group_dim, frame_dim_.dc_group_dim); + JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup( + mrect, br, 3, 1000, ModularStreamId::ModularDC(dc_group_id), + /*zerofill=*/false, nullptr, nullptr, + /*allow_truncated=*/false)); + if (frame_header_.encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR( + modular_frame_decoder_.DecodeAcMetadata(dc_group_id, br, dec_state_)); + } else if (lf.epf_iters > 0) { + FillImage(kInvSigmaNum / lf.epf_sigma_for_modular, &dec_state_->sigma); + } + decoded_dc_groups_[dc_group_id] = uint8_t{true}; + return true; +} + +void FrameDecoder::FinalizeDC() { + // Do Adaptive DC smoothing if enabled. This *must* happen between all the + // ProcessDCGroup and ProcessACGroup. + if (frame_header_.encoding == FrameEncoding::kVarDCT && + !(frame_header_.flags & FrameHeader::kSkipAdaptiveDCSmoothing) && + !(frame_header_.flags & FrameHeader::kUseDcFrame)) { + AdaptiveDCSmoothing(dec_state_->shared->quantizer.MulDC(), + &dec_state_->shared_storage.dc_storage, pool_); + } + + finalized_dc_ = true; +} + +Status FrameDecoder::AllocateOutput() { + if (allocated_) return true; + modular_frame_decoder_.MaybeDropFullImage(); + decoded_->origin = dec_state_->shared->frame_header.frame_origin; + JXL_RETURN_IF_ERROR(dec_state_->InitForAC(nullptr)); + allocated_ = true; + return true; +} + +Status FrameDecoder::ProcessACGlobal(BitReader* br) { + JXL_CHECK(finalized_dc_); + + // Decode AC group. + if (frame_header_.encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.Decode( + br, &modular_frame_decoder_)); + JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.EnsureComputed( + dec_state_->used_acs)); + + size_t num_histo_bits = + CeilLog2Nonzero(dec_state_->shared->frame_dim.num_groups); + dec_state_->shared_storage.num_histograms = + 1 + br->ReadBits(num_histo_bits); + + dec_state_->code.resize(kMaxNumPasses); + dec_state_->context_map.resize(kMaxNumPasses); + // Read coefficient orders and histograms. + size_t max_num_bits_ac = 0; + for (size_t i = 0; + i < dec_state_->shared_storage.frame_header.passes.num_passes; i++) { + uint16_t used_orders = U32Coder::Read(kOrderEnc, br); + JXL_RETURN_IF_ERROR(DecodeCoeffOrders( + used_orders, dec_state_->used_acs, + &dec_state_->shared_storage + .coeff_orders[i * dec_state_->shared_storage.coeff_order_size], + br)); + size_t num_contexts = + dec_state_->shared->num_histograms * + dec_state_->shared_storage.block_ctx_map.NumACContexts(); + JXL_RETURN_IF_ERROR(DecodeHistograms( + br, num_contexts, &dec_state_->code[i], &dec_state_->context_map[i])); + // Add extra values to enable the cheat in hot loop of DecodeACVarBlock. + dec_state_->context_map[i].resize( + num_contexts + kZeroDensityContextLimit - kZeroDensityContextCount); + max_num_bits_ac = + std::max(max_num_bits_ac, dec_state_->code[i].max_num_bits); + } + max_num_bits_ac += CeilLog2Nonzero( + dec_state_->shared_storage.frame_header.passes.num_passes); + // 16-bit buffer for decoding to JPEG are not implemented. + // TODO(veluca): figure out the exact limit - 16 should still work with + // 16-bit buffers, but we are excluding it for safety. + bool use_16_bit = max_num_bits_ac < 16 && !decoded_->IsJPEG(); + bool store = frame_header_.passes.num_passes > 1; + size_t xs = store ? kGroupDim * kGroupDim : 0; + size_t ys = store ? frame_dim_.num_groups : 0; + if (use_16_bit) { + dec_state_->coefficients = make_unique>(xs, ys); + } else { + dec_state_->coefficients = make_unique>(xs, ys); + } + if (store) { + dec_state_->coefficients->ZeroFill(); + } + } + + // Set JPEG decoding data. + if (decoded_->IsJPEG()) { + decoded_->color_transform = frame_header_.color_transform; + decoded_->chroma_subsampling = frame_header_.chroma_subsampling; + const std::vector& qe = + dec_state_->shared_storage.matrices.encodings(); + if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW || + std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) { + return JXL_FAILURE( + "Quantization table is not a JPEG quantization table."); + } + jpeg::JPEGData* jpeg_data = decoded_->jpeg_data.get(); + size_t num_components = jpeg_data->components.size(); + bool is_gray = (num_components == 1); + auto jpeg_c_map = JpegOrder(frame_header_.color_transform, is_gray); + size_t qt_set = 0; + for (size_t c = 0; c < num_components; c++) { + // TODO(eustas): why 1-st quant table for gray? + size_t quant_c = is_gray ? 1 : c; + size_t qpos = jpeg_data->components[jpeg_c_map[c]].quant_idx; + JXL_CHECK(qpos != jpeg_data->quant.size()); + qt_set |= 1 << qpos; + for (size_t x = 0; x < 8; x++) { + for (size_t y = 0; y < 8; y++) { + jpeg_data->quant[qpos].values[x * 8 + y] = + (*qe[0].qraw.qtable)[quant_c * 64 + y * 8 + x]; + } + } + } + for (size_t i = 0; i < jpeg_data->quant.size(); i++) { + if (qt_set & (1 << i)) continue; + if (i == 0) return JXL_FAILURE("First quant table unused."); + // Unused quant table is set to copy of previous quant table + for (size_t j = 0; j < 64; j++) { + jpeg_data->quant[i].values[j] = jpeg_data->quant[i - 1].values[j]; + } + } + } + decoded_ac_global_ = true; + return true; +} + +Status FrameDecoder::ProcessACGroup(size_t ac_group_id, + BitReader* JXL_RESTRICT* br, + size_t num_passes, size_t thread, + bool force_draw, bool dc_only) { + PROFILER_ZONE("process_group"); + size_t group_dim = frame_dim_.group_dim; + const size_t gx = ac_group_id % frame_dim_.xsize_groups; + const size_t gy = ac_group_id / frame_dim_.xsize_groups; + const size_t x = gx * group_dim; + const size_t y = gy * group_dim; + JXL_DEBUG_V(3, + "Processing AC group %" PRIuS "(%" PRIuS ",%" PRIuS + ") group_dim: %" PRIuS " decoded passes: %u new passes: %" PRIuS, + ac_group_id, gx, gy, group_dim, + decoded_passes_per_ac_group_[ac_group_id], num_passes); + + RenderPipelineInput render_pipeline_input = + dec_state_->render_pipeline->GetInputBuffers(ac_group_id, thread); + + bool should_run_pipeline = true; + + if (frame_header_.encoding == FrameEncoding::kVarDCT) { + group_dec_caches_[thread].InitOnce(frame_header_.passes.num_passes, + dec_state_->used_acs); + JXL_RETURN_IF_ERROR(DecodeGroup(br, num_passes, ac_group_id, dec_state_, + &group_dec_caches_[thread], thread, + render_pipeline_input, decoded_, + decoded_passes_per_ac_group_[ac_group_id], + force_draw, dc_only, &should_run_pipeline)); + } + + // don't limit to image dimensions here (is done in DecodeGroup) + const Rect mrect(x, y, group_dim, group_dim); + bool modular_ready = false; + size_t pass0 = decoded_passes_per_ac_group_[ac_group_id]; + size_t pass1 = + force_draw ? frame_header_.passes.num_passes : pass0 + num_passes; + for (size_t i = pass0; i < pass1; ++i) { + int minShift, maxShift; + frame_header_.passes.GetDownsamplingBracket(i, minShift, maxShift); + bool modular_pass_ready = true; + if (i < pass0 + num_passes) { + JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup( + mrect, br[i - pass0], minShift, maxShift, + ModularStreamId::ModularAC(ac_group_id, i), + /*zerofill=*/false, dec_state_, &render_pipeline_input, + /*allow_truncated=*/false, &modular_pass_ready)); + } else { + JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup( + mrect, nullptr, minShift, maxShift, + ModularStreamId::ModularAC(ac_group_id, i), /*zerofill=*/true, + dec_state_, &render_pipeline_input, + /*allow_truncated=*/false, &modular_pass_ready)); + } + if (modular_pass_ready) modular_ready = true; + } + decoded_passes_per_ac_group_[ac_group_id] += num_passes; + + if ((frame_header_.flags & FrameHeader::kNoise) != 0) { + PROFILER_ZONE("GenerateNoise"); + size_t noise_c_start = + 3 + frame_header_.nonserialized_metadata->m.num_extra_channels; + // When the color channels are downsampled, we need to generate more noise + // input for the current group than just the group dimensions. + std::pair rects[3]; + for (size_t iy = 0; iy < frame_header_.upsampling; iy++) { + for (size_t ix = 0; ix < frame_header_.upsampling; ix++) { + for (size_t c = 0; c < 3; c++) { + auto r = render_pipeline_input.GetBuffer(noise_c_start + c); + rects[c].first = r.first; + size_t x1 = r.second.x0() + r.second.xsize(); + size_t y1 = r.second.y0() + r.second.ysize(); + rects[c].second = Rect(r.second.x0() + ix * group_dim, + r.second.y0() + iy * group_dim, group_dim, + group_dim, x1, y1); + } + Random3Planes(dec_state_->visible_frame_index, + dec_state_->nonvisible_frame_index, + (gx * frame_header_.upsampling + ix) * group_dim, + (gy * frame_header_.upsampling + iy) * group_dim, + rects[0], rects[1], rects[2]); + } + } + } + + if (!modular_frame_decoder_.UsesFullImage() && !decoded_->IsJPEG()) { + if (should_run_pipeline && modular_ready) { + render_pipeline_input.Done(); + } else if (force_draw) { + return JXL_FAILURE("Modular group decoding failed."); + } + } + return true; +} + +void FrameDecoder::MarkSections(const SectionInfo* sections, size_t num, + SectionStatus* section_status) { + num_sections_done_ += num; + for (size_t i = 0; i < num; i++) { + if (section_status[i] != SectionStatus::kDone) { + processed_section_[sections[i].id] = false; + num_sections_done_--; + } + } +} + +Status FrameDecoder::ProcessSections(const SectionInfo* sections, size_t num, + SectionStatus* section_status) { + if (num == 0) return true; // Nothing to process + std::fill(section_status, section_status + num, SectionStatus::kSkipped); + size_t dc_global_sec = num; + size_t ac_global_sec = num; + std::vector dc_group_sec(frame_dim_.num_dc_groups, num); + std::vector> ac_group_sec( + frame_dim_.num_groups, + std::vector(frame_header_.passes.num_passes, num)); + // This keeps track of the number of ac passes we want to process during this + // call of ProcessSections. + std::vector desired_num_ac_passes(frame_dim_.num_groups); + bool single_section = + frame_dim_.num_groups == 1 && frame_header_.passes.num_passes == 1; + if (single_section) { + JXL_ASSERT(num == 1); + JXL_ASSERT(sections[0].id == 0); + if (processed_section_[0] == false) { + processed_section_[0] = true; + ac_group_sec[0].resize(1); + dc_global_sec = ac_global_sec = dc_group_sec[0] = ac_group_sec[0][0] = 0; + desired_num_ac_passes[0] = 1; + } else { + section_status[0] = SectionStatus::kDuplicate; + } + } else { + size_t ac_global_index = frame_dim_.num_dc_groups + 1; + for (size_t i = 0; i < num; i++) { + JXL_ASSERT(sections[i].id < processed_section_.size()); + if (processed_section_[sections[i].id]) { + section_status[i] = SectionStatus::kDuplicate; + continue; + } + if (sections[i].id == 0) { + dc_global_sec = i; + } else if (sections[i].id < ac_global_index) { + dc_group_sec[sections[i].id - 1] = i; + } else if (sections[i].id == ac_global_index) { + ac_global_sec = i; + } else { + size_t ac_idx = sections[i].id - ac_global_index - 1; + size_t acg = ac_idx % frame_dim_.num_groups; + size_t acp = ac_idx / frame_dim_.num_groups; + if (acp >= frame_header_.passes.num_passes) { + return JXL_FAILURE("Invalid section ID"); + } + ac_group_sec[acg][acp] = i; + } + processed_section_[sections[i].id] = true; + } + // Count number of new passes per group. + for (size_t g = 0; g < ac_group_sec.size(); g++) { + size_t j = 0; + for (; j + decoded_passes_per_ac_group_[g] < + frame_header_.passes.num_passes; + j++) { + if (ac_group_sec[g][j + decoded_passes_per_ac_group_[g]] == num) { + break; + } + } + desired_num_ac_passes[g] = j; + } + } + if (dc_global_sec != num) { + Status dc_global_status = ProcessDCGlobal(sections[dc_global_sec].br); + if (dc_global_status.IsFatalError()) return dc_global_status; + if (dc_global_status) { + section_status[dc_global_sec] = SectionStatus::kDone; + } else { + section_status[dc_global_sec] = SectionStatus::kPartial; + } + } + + std::atomic has_error{false}; + if (decoded_dc_global_) { + JXL_RETURN_IF_ERROR(RunOnPool( + pool_, 0, dc_group_sec.size(), ThreadPool::NoInit, + [this, &dc_group_sec, &num, §ions, §ion_status, &has_error]( + size_t i, size_t thread) { + if (dc_group_sec[i] != num) { + if (!ProcessDCGroup(i, sections[dc_group_sec[i]].br)) { + has_error = true; + } else { + section_status[dc_group_sec[i]] = SectionStatus::kDone; + } + } + }, + "DecodeDCGroup")); + } + if (has_error) return JXL_FAILURE("Error in DC group"); + + if (*std::min_element(decoded_dc_groups_.begin(), decoded_dc_groups_.end()) && + !finalized_dc_) { + PassesDecoderState::PipelineOptions pipeline_options; + pipeline_options.use_slow_render_pipeline = use_slow_rendering_pipeline_; + pipeline_options.coalescing = coalescing_; + pipeline_options.render_spotcolors = render_spotcolors_; + JXL_RETURN_IF_ERROR( + dec_state_->PreparePipeline(decoded_, pipeline_options)); + FinalizeDC(); + JXL_RETURN_IF_ERROR(AllocateOutput()); + if (progressive_detail_ >= JxlProgressiveDetail::kDC) { + MarkSections(sections, num, section_status); + return true; + } + } + + if (finalized_dc_ && ac_global_sec != num && !decoded_ac_global_) { + JXL_RETURN_IF_ERROR(ProcessACGlobal(sections[ac_global_sec].br)); + section_status[ac_global_sec] = SectionStatus::kDone; + } + + if (progressive_detail_ >= JxlProgressiveDetail::kLastPasses) { + // Mark that we only want the next progression pass. + size_t target_complete_passes = NextNumPassesToPause(); + for (size_t i = 0; i < ac_group_sec.size(); i++) { + desired_num_ac_passes[i] = + std::min(desired_num_ac_passes[i], + target_complete_passes - decoded_passes_per_ac_group_[i]); + } + } + + if (decoded_ac_global_) { + // Mark all the AC groups that we received as not complete yet. + for (size_t i = 0; i < ac_group_sec.size(); i++) { + if (desired_num_ac_passes[i] != 0) { + dec_state_->render_pipeline->ClearDone(i); + } + } + + JXL_RETURN_IF_ERROR(RunOnPool( + pool_, 0, ac_group_sec.size(), + [this](size_t num_threads) { + return PrepareStorage(num_threads, + decoded_passes_per_ac_group_.size()); + }, + [this, &ac_group_sec, &desired_num_ac_passes, &num, §ions, + §ion_status, &has_error](size_t g, size_t thread) { + if (desired_num_ac_passes[g] == 0) { + // no new AC pass, nothing to do + return; + } + (void)num; + size_t first_pass = decoded_passes_per_ac_group_[g]; + BitReader* JXL_RESTRICT readers[kMaxNumPasses]; + for (size_t i = 0; i < desired_num_ac_passes[g]; i++) { + JXL_ASSERT(ac_group_sec[g][first_pass + i] != num); + readers[i] = sections[ac_group_sec[g][first_pass + i]].br; + } + if (!ProcessACGroup(g, readers, desired_num_ac_passes[g], + GetStorageLocation(thread, g), + /*force_draw=*/false, /*dc_only=*/false)) { + has_error = true; + } else { + for (size_t i = 0; i < desired_num_ac_passes[g]; i++) { + section_status[ac_group_sec[g][first_pass + i]] = + SectionStatus::kDone; + } + } + }, + "DecodeGroup")); + } + if (has_error) return JXL_FAILURE("Error in AC group"); + + MarkSections(sections, num, section_status); + return true; +} + +Status FrameDecoder::Flush() { + bool has_blending = frame_header_.blending_info.mode != BlendMode::kReplace || + frame_header_.custom_size_or_origin; + for (const auto& blending_info_ec : + frame_header_.extra_channel_blending_info) { + if (blending_info_ec.mode != BlendMode::kReplace) has_blending = true; + } + // No early Flush() if blending is enabled. + if (has_blending && !is_finalized_) { + return false; + } + // No early Flush() - nothing to do - if the frame is a kSkipProgressive + // frame. + if (frame_header_.frame_type == FrameType::kSkipProgressive && + !is_finalized_) { + return true; + } + if (decoded_->IsJPEG()) { + // Nothing to do. + return true; + } + JXL_RETURN_IF_ERROR(AllocateOutput()); + + uint32_t completely_decoded_ac_pass = *std::min_element( + decoded_passes_per_ac_group_.begin(), decoded_passes_per_ac_group_.end()); + if (completely_decoded_ac_pass < frame_header_.passes.num_passes) { + // We don't have all AC yet: force a draw of all the missing areas. + // Mark all sections as not complete. + for (size_t i = 0; i < decoded_passes_per_ac_group_.size(); i++) { + if (decoded_passes_per_ac_group_[i] < frame_header_.passes.num_passes) { + dec_state_->render_pipeline->ClearDone(i); + } + } + std::atomic has_error{false}; + JXL_RETURN_IF_ERROR(RunOnPool( + pool_, 0, decoded_passes_per_ac_group_.size(), + [this](const size_t num_threads) { + return PrepareStorage(num_threads, + decoded_passes_per_ac_group_.size()); + }, + [this, &has_error](const uint32_t g, size_t thread) { + if (decoded_passes_per_ac_group_[g] == + frame_header_.passes.num_passes) { + // This group was drawn already, nothing to do. + return; + } + BitReader* JXL_RESTRICT readers[kMaxNumPasses] = {}; + bool ok = ProcessACGroup( + g, readers, /*num_passes=*/0, GetStorageLocation(thread, g), + /*force_draw=*/true, /*dc_only=*/!decoded_ac_global_); + if (!ok) has_error = true; + }, + "ForceDrawGroup")); + if (has_error) { + return JXL_FAILURE("Drawing groups failed"); + } + } + + // undo global modular transforms and copy int pixel buffers to float ones + JXL_RETURN_IF_ERROR(modular_frame_decoder_.FinalizeDecoding(dec_state_, pool_, + is_finalized_)); + + return true; +} + +int FrameDecoder::SavedAs(const FrameHeader& header) { + if (header.frame_type == FrameType::kDCFrame) { + // bits 16, 32, 64, 128 for DC level + return 16 << (header.dc_level - 1); + } else if (header.CanBeReferenced()) { + // bits 1, 2, 4 and 8 for the references + return 1 << header.save_as_reference; + } + + return 0; +} + +bool FrameDecoder::HasEverything() const { + if (!decoded_dc_global_) return false; + if (!decoded_ac_global_) return false; + for (auto& have_dc_group : decoded_dc_groups_) { + if (!have_dc_group) return false; + } + for (auto& nb_passes : decoded_passes_per_ac_group_) { + if (nb_passes < frame_header_.passes.num_passes) return false; + } + return true; +} + +int FrameDecoder::References() const { + if (is_finalized_) { + return 0; + } + if (!HasEverything()) return 0; + + int result = 0; + + // Blending + if (frame_header_.frame_type == FrameType::kRegularFrame || + frame_header_.frame_type == FrameType::kSkipProgressive) { + bool cropped = frame_header_.custom_size_or_origin; + if (cropped || frame_header_.blending_info.mode != BlendMode::kReplace) { + result |= (1 << frame_header_.blending_info.source); + } + const auto& extra = frame_header_.extra_channel_blending_info; + for (size_t i = 0; i < extra.size(); ++i) { + if (cropped || extra[i].mode != BlendMode::kReplace) { + result |= (1 << extra[i].source); + } + } + } + + // Patches + if (frame_header_.flags & FrameHeader::kPatches) { + result |= dec_state_->shared->image_features.patches.GetReferences(); + } + + // DC Level + if (frame_header_.flags & FrameHeader::kUseDcFrame) { + // Reads from the next dc level + int dc_level = frame_header_.dc_level + 1; + // bits 16, 32, 64, 128 for DC level + result |= (16 << (dc_level - 1)); + } + + return result; +} + +Status FrameDecoder::FinalizeFrame() { + if (is_finalized_) { + return JXL_FAILURE("FinalizeFrame called multiple times"); + } + is_finalized_ = true; + if (decoded_->IsJPEG()) { + // Nothing to do. + return true; + } + + // undo global modular transforms and copy int pixel buffers to float ones + JXL_RETURN_IF_ERROR( + modular_frame_decoder_.FinalizeDecoding(dec_state_, pool_, + /*inplace=*/true)); + + if (frame_header_.CanBeReferenced()) { + auto& info = dec_state_->shared_storage + .reference_frames[frame_header_.save_as_reference]; + info.frame = std::move(dec_state_->frame_storage_for_referencing); + info.ib_is_in_xyb = frame_header_.save_before_color_transform; + } + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dec_frame.h b/third_party/jpeg-xl/lib/jxl/dec_frame.h new file mode 100644 index 0000000000..6b54ac631f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_frame.h @@ -0,0 +1,329 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_FRAME_H_ +#define LIB_JXL_DEC_FRAME_H_ + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/blending.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_modular.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Decodes a frame. Groups may be processed in parallel by `pool`. +// `metadata` is the metadata that applies to all frames of the codestream +// `decoded->metadata` must already be set and must match metadata.m. +// Used in the encoder to model decoder behaviour, and in tests. +Status DecodeFrame(PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool, + const uint8_t* next_in, size_t avail_in, + ImageBundle* decoded, const CodecMetadata& metadata, + bool use_slow_rendering_pipeline = false); + +// TODO(veluca): implement "forced drawing". +class FrameDecoder { + public: + // All parameters must outlive the FrameDecoder. + FrameDecoder(PassesDecoderState* dec_state, const CodecMetadata& metadata, + ThreadPool* pool, bool use_slow_rendering_pipeline) + : dec_state_(dec_state), + pool_(pool), + frame_header_(&metadata), + use_slow_rendering_pipeline_(use_slow_rendering_pipeline) {} + + void SetRenderSpotcolors(bool rsc) { render_spotcolors_ = rsc; } + void SetCoalescing(bool c) { coalescing_ = c; } + + // Read FrameHeader and table of contents from the given BitReader. + Status InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded, + bool is_preview); + + // Checks frame dimensions for their limits, and sets the output + // image buffer. + Status InitFrameOutput(); + + struct SectionInfo { + BitReader* JXL_RESTRICT br; + // Logical index of the section, regardless of any permutation that may be + // applied in the table of contents or of the physical position in the file. + size_t id; + // Index of the section in the order of the bytes inside the frame. + size_t index; + }; + + struct TocEntry { + size_t size; + size_t id; + }; + + enum SectionStatus { + // Processed correctly. + kDone = 0, + // Skipped because other required sections were not yet processed. + kSkipped = 1, + // Skipped because the section was already processed. + kDuplicate = 2, + // Only partially decoded: the section will need to be processed again. + kPartial = 3, + }; + + // Processes `num` sections; each SectionInfo contains the index + // of the section and a BitReader that only contains the data of the section. + // `section_status` should point to `num` elements, and will be filled with + // information about whether each section was processed or not. + // A section is a part of the encoded file that is indexed by the TOC. + Status ProcessSections(const SectionInfo* sections, size_t num, + SectionStatus* section_status); + + // Flushes all the data decoded so far to pixels. + Status Flush(); + + // Runs final operations once a frame data is decoded. + // Must be called exactly once per frame, after all calls to ProcessSections. + Status FinalizeFrame(); + + // Returns dependencies of this frame on reference ids as a bit mask: bits 0-3 + // indicate reference frame 0-3 for patches and blending, bits 4-7 indicate DC + // frames this frame depends on. Only returns a valid result after all calls + // to ProcessSections are finished and before FinalizeFrame. + int References() const; + + // Returns reference id of storage location where this frame is stored as a + // bit flag, or 0 if not stored. + // Matches the bit mask used for GetReferences: bits 0-3 indicate it is stored + // for patching or blending, bits 4-7 indicate DC frame. + // Unlike References, can be ran at any time as + // soon as the frame header is known. + static int SavedAs(const FrameHeader& header); + + uint64_t SumSectionSizes() const { return section_sizes_sum_; } + const std::vector& Toc() const { return toc_; } + + const FrameHeader& GetFrameHeader() const { return frame_header_; } + + // Returns whether a DC image has been decoded, accessible at low resolution + // at passes.shared_storage.dc_storage + bool HasDecodedDC() const { return finalized_dc_; } + bool HasDecodedAll() const { return toc_.size() == num_sections_done_; } + + size_t NumCompletePasses() const { + return *std::min_element(decoded_passes_per_ac_group_.begin(), + decoded_passes_per_ac_group_.end()); + } + + // If enabled, ProcessSections will stop and return true when the DC + // sections have been processed, instead of starting the AC sections. This + // will only occur if supported (that is, flushing will produce a valid + // 1/8th*1/8th resolution image). The return value of true then does not mean + // all sections have been processed, use HasDecodedDC and HasDecodedAll + // to check the true finished state. + // Returns the progressive detail that will be effective for the frame. + JxlProgressiveDetail SetPauseAtProgressive(JxlProgressiveDetail prog_detail) { + bool single_section = + frame_dim_.num_groups == 1 && frame_header_.passes.num_passes == 1; + if (frame_header_.frame_type != kSkipProgressive && + // If there's only one group and one pass, there is no separate section + // for DC and the entire full resolution image is available at once. + !single_section && + // If extra channels are encoded with modular without squeeze, they + // don't support DC. If the are encoded with squeeze, DC works in theory + // but the implementation may not yet correctly support this for Flush. + // Therefore, can't correctly pause for a progressive step if there is + // an extra channel (including alpha channel) + // TODO(firsching): Check if this is still the case. + decoded_->metadata()->extra_channel_info.empty() && + // DC is not guaranteed to be available in modular mode and may be a + // black image. If squeeze is used, it may be available depending on the + // current implementation. + // TODO(lode): do return DC if it's known that flushing at this point + // will produce a valid 1/8th downscaled image with modular encoding. + frame_header_.encoding == FrameEncoding::kVarDCT) { + progressive_detail_ = prog_detail; + } else { + progressive_detail_ = JxlProgressiveDetail::kFrames; + } + if (progressive_detail_ >= JxlProgressiveDetail::kPasses) { + for (size_t i = 1; i < frame_header_.passes.num_passes; ++i) { + passes_to_pause_.push_back(i); + } + } else if (progressive_detail_ >= JxlProgressiveDetail::kLastPasses) { + for (size_t i = 0; i < frame_header_.passes.num_downsample; ++i) { + passes_to_pause_.push_back(frame_header_.passes.last_pass[i] + 1); + } + // The format does not guarantee that these values are sorted. + std::sort(passes_to_pause_.begin(), passes_to_pause_.end()); + } + return progressive_detail_; + } + + size_t NextNumPassesToPause() const { + auto it = std::upper_bound(passes_to_pause_.begin(), passes_to_pause_.end(), + NumCompletePasses()); + return (it != passes_to_pause_.end() ? *it + : std::numeric_limits::max()); + } + + // Sets the pixel callback or image buffer where the pixels will be decoded. + // + // @param undo_orientation: if true, indicates the frame decoder should apply + // the exif orientation to bring the image to the intended display + // orientation. + void SetImageOutput(const PixelCallback& pixel_callback, void* image_buffer, + size_t image_buffer_size, size_t xsize, size_t ysize, + JxlPixelFormat format, size_t bits_per_sample, + bool unpremul_alpha, bool undo_orientation) const { + dec_state_->width = xsize; + dec_state_->height = ysize; + dec_state_->main_output.format = format; + dec_state_->main_output.bits_per_sample = bits_per_sample; + dec_state_->main_output.callback = pixel_callback; + dec_state_->main_output.buffer = image_buffer; + dec_state_->main_output.buffer_size = image_buffer_size; + dec_state_->main_output.stride = GetStride(xsize, format); + const jxl::ExtraChannelInfo* alpha = + decoded_->metadata()->Find(jxl::ExtraChannel::kAlpha); + if (alpha && alpha->alpha_associated && unpremul_alpha) { + dec_state_->unpremul_alpha = true; + } + if (undo_orientation) { + dec_state_->undo_orientation = decoded_->metadata()->GetOrientation(); + if (static_cast(dec_state_->undo_orientation) > 4) { + std::swap(dec_state_->width, dec_state_->height); + } + } + dec_state_->extra_output.clear(); +#if !JXL_HIGH_PRECISION + if (dec_state_->main_output.buffer && + (format.data_type == JXL_TYPE_UINT8) && (format.num_channels >= 3) && + !dec_state_->unpremul_alpha && + (dec_state_->undo_orientation == Orientation::kIdentity) && + decoded_->metadata()->xyb_encoded && + dec_state_->output_encoding_info.color_encoding.IsSRGB() && + dec_state_->output_encoding_info.all_default_opsin && + (dec_state_->output_encoding_info.desired_intensity_target == + dec_state_->output_encoding_info.orig_intensity_target) && + HasFastXYBTosRGB8() && frame_header_.needs_color_transform()) { + dec_state_->fast_xyb_srgb8_conversion = true; + } +#endif + } + + void AddExtraChannelOutput(void* buffer, size_t buffer_size, size_t xsize, + JxlPixelFormat format, size_t bits_per_sample) { + ImageOutput out; + out.format = format; + out.bits_per_sample = bits_per_sample; + out.buffer = buffer; + out.buffer_size = buffer_size; + out.stride = GetStride(xsize, format); + dec_state_->extra_output.push_back(out); + } + + private: + Status ProcessDCGlobal(BitReader* br); + Status ProcessDCGroup(size_t dc_group_id, BitReader* br); + void FinalizeDC(); + Status AllocateOutput(); + Status ProcessACGlobal(BitReader* br); + Status ProcessACGroup(size_t ac_group_id, BitReader* JXL_RESTRICT* br, + size_t num_passes, size_t thread, bool force_draw, + bool dc_only); + void MarkSections(const SectionInfo* sections, size_t num, + SectionStatus* section_status); + + // Allocates storage for parallel decoding using up to `num_threads` threads + // of up to `num_tasks` tasks. The value of `thread` passed to + // `GetStorageLocation` must be smaller than the `num_threads` value passed + // here. The value of `task` passed to `GetStorageLocation` must be smaller + // than the value of `num_tasks` passed here. + Status PrepareStorage(size_t num_threads, size_t num_tasks) { + size_t storage_size = std::min(num_threads, num_tasks); + if (storage_size > group_dec_caches_.size()) { + group_dec_caches_.resize(storage_size); + } + use_task_id_ = num_threads > num_tasks; + bool use_group_ids = (modular_frame_decoder_.UsesFullImage() && + (frame_header_.encoding == FrameEncoding::kVarDCT || + (frame_header_.flags & FrameHeader::kNoise))); + if (dec_state_->render_pipeline) { + JXL_RETURN_IF_ERROR(dec_state_->render_pipeline->PrepareForThreads( + storage_size, use_group_ids)); + } + return true; + } + + size_t GetStorageLocation(size_t thread, size_t task) { + if (use_task_id_) return task; + return thread; + } + + static size_t BytesPerChannel(JxlDataType data_type) { + return (data_type == JXL_TYPE_UINT8 ? 1u + : data_type == JXL_TYPE_FLOAT ? 4u + : 2u); + } + + static size_t GetStride(const size_t xsize, JxlPixelFormat format) { + size_t stride = + (xsize * BytesPerChannel(format.data_type) * format.num_channels); + if (format.align > 1) { + stride = (jxl::DivCeil(stride, format.align) * format.align); + } + return stride; + } + + PassesDecoderState* dec_state_; + ThreadPool* pool_; + std::vector toc_; + uint64_t section_sizes_sum_; + // TODO(veluca): figure out the duplication between these and dec_state_. + FrameHeader frame_header_; + FrameDimensions frame_dim_; + ImageBundle* decoded_; + ModularFrameDecoder modular_frame_decoder_; + bool render_spotcolors_ = true; + bool coalescing_ = true; + + std::vector processed_section_; + std::vector decoded_passes_per_ac_group_; + std::vector decoded_dc_groups_; + bool decoded_dc_global_; + bool decoded_ac_global_; + bool HasEverything() const; + bool finalized_dc_ = true; + size_t num_sections_done_ = 0; + bool is_finalized_ = true; + bool allocated_ = false; + + std::vector group_dec_caches_; + + // Whether or not the task id should be used for storage indexing, instead of + // the thread id. + bool use_task_id_ = false; + + // Testing setting: whether or not to use the slow rendering pipeline. + bool use_slow_rendering_pipeline_; + + JxlProgressiveDetail progressive_detail_ = kFrames; + // Number of completed passes where section decoding should pause. + // Used for progressive details at least kLastPasses. + std::vector passes_to_pause_; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_FRAME_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_group.cc b/third_party/jpeg-xl/lib/jxl/dec_group.cc new file mode 100644 index 0000000000..be8df9b062 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_group.cc @@ -0,0 +1,801 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_group.h" + +#include +#include + +#include +#include +#include + +#include "lib/jxl/frame_header.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_group.cc" +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_transforms-inl.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer-inl.h" +#include "lib/jxl/quantizer.h" + +#ifndef LIB_JXL_DEC_GROUP_CC +#define LIB_JXL_DEC_GROUP_CC +namespace jxl { + +struct AuxOut; + +// Interface for reading groups for DecodeGroupImpl. +class GetBlock { + public: + virtual void StartRow(size_t by) = 0; + virtual Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, + size_t size, size_t log2_covered_blocks, + ACPtr block[3], ACType ac_type) = 0; + virtual ~GetBlock() {} +}; + +// Controls whether DecodeGroupImpl renders to pixels or not. +enum DrawMode { + // Render to pixels. + kDraw = 0, + // Don't render to pixels. + kDontDraw = 1, +}; + +} // namespace jxl +#endif // LIB_JXL_DEC_GROUP_CC + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::ShiftRight; + +using D = HWY_FULL(float); +using DU = HWY_FULL(uint32_t); +using DI = HWY_FULL(int32_t); +using DI16 = Rebind; +constexpr D d; +constexpr DI di; +constexpr DI16 di16; + +// TODO(veluca): consider SIMDfying. +void Transpose8x8InPlace(int32_t* JXL_RESTRICT block) { + for (size_t x = 0; x < 8; x++) { + for (size_t y = x + 1; y < 8; y++) { + std::swap(block[y * 8 + x], block[x * 8 + y]); + } + } +} + +template +void DequantLane(Vec scaled_dequant_x, Vec scaled_dequant_y, + Vec scaled_dequant_b, + const float* JXL_RESTRICT dequant_matrices, size_t size, + size_t k, Vec x_cc_mul, Vec b_cc_mul, + const float* JXL_RESTRICT biases, ACPtr qblock[3], + float* JXL_RESTRICT block) { + const auto x_mul = Mul(Load(d, dequant_matrices + k), scaled_dequant_x); + const auto y_mul = + Mul(Load(d, dequant_matrices + size + k), scaled_dequant_y); + const auto b_mul = + Mul(Load(d, dequant_matrices + 2 * size + k), scaled_dequant_b); + + Vec quantized_x_int; + Vec quantized_y_int; + Vec quantized_b_int; + if (ac_type == ACType::k16) { + Rebind di16; + quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k)); + quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k)); + quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k)); + } else { + quantized_x_int = Load(di, qblock[0].ptr32 + k); + quantized_y_int = Load(di, qblock[1].ptr32 + k); + quantized_b_int = Load(di, qblock[2].ptr32 + k); + } + + const auto dequant_x_cc = + Mul(AdjustQuantBias(di, 0, quantized_x_int, biases), x_mul); + const auto dequant_y = + Mul(AdjustQuantBias(di, 1, quantized_y_int, biases), y_mul); + const auto dequant_b_cc = + Mul(AdjustQuantBias(di, 2, quantized_b_int, biases), b_mul); + + const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc); + const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc); + Store(dequant_x, d, block + k); + Store(dequant_y, d, block + size + k); + Store(dequant_b, d, block + 2 * size + k); +} + +template +void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant, + float x_dm_multiplier, float b_dm_multiplier, Vec x_cc_mul, + Vec b_cc_mul, size_t kind, size_t size, + const Quantizer& quantizer, size_t covered_blocks, + const size_t* sbx, + const float* JXL_RESTRICT* JXL_RESTRICT dc_row, + size_t dc_stride, const float* JXL_RESTRICT biases, + ACPtr qblock[3], float* JXL_RESTRICT block) { + PROFILER_FUNC; + + const auto scaled_dequant_s = inv_global_scale / quant; + + const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier); + const auto scaled_dequant_y = Set(d, scaled_dequant_s); + const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier); + + const float* dequant_matrices = quantizer.DequantMatrix(kind, 0); + + for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) { + DequantLane(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b, + dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases, + qblock, block); + } + for (size_t c = 0; c < 3; c++) { + LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride, + block + c * size); + } +} + +Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block, + GroupDecCache* JXL_RESTRICT group_dec_cache, + PassesDecoderState* JXL_RESTRICT dec_state, + size_t thread, size_t group_idx, + RenderPipelineInput& render_pipeline_input, + ImageBundle* decoded, DrawMode draw) { + // TODO(veluca): investigate cache usage in this function. + PROFILER_FUNC; + const Rect block_rect = dec_state->shared->BlockGroupRect(group_idx); + const AcStrategyImage& ac_strategy = dec_state->shared->ac_strategy; + + const size_t xsize_blocks = block_rect.xsize(); + const size_t ysize_blocks = block_rect.ysize(); + + const size_t dc_stride = dec_state->shared->dc->PixelsPerRow(); + + const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale(); + + const YCbCrChromaSubsampling& cs = + dec_state->shared->frame_header.chroma_subsampling; + + size_t idct_stride[3]; + for (size_t c = 0; c < 3; c++) { + idct_stride[c] = render_pipeline_input.GetBuffer(c).first->PixelsPerRow(); + } + + HWY_ALIGN int32_t scaled_qtable[64 * 3]; + + ACType ac_type = dec_state->coefficients->Type(); + auto dequant_block = ac_type == ACType::k16 ? DequantBlock + : DequantBlock; + // Whether or not coefficients should be stored for future usage, and/or read + // from past usage. + bool accumulate = !dec_state->coefficients->IsEmpty(); + // Offset of the current block in the group. + size_t offset = 0; + + std::array jpeg_c_map; + bool jpeg_is_gray = false; + std::array dcoff = {}; + + // TODO(veluca): all of this should be done only once per image. + if (decoded->IsJPEG()) { + if (!dec_state->shared->cmap.IsJPEGCompatible()) { + return JXL_FAILURE("The CfL map is not JPEG-compatible"); + } + jpeg_is_gray = (decoded->jpeg_data->components.size() == 1); + jpeg_c_map = JpegOrder(dec_state->shared->frame_header.color_transform, + jpeg_is_gray); + const std::vector& qe = + dec_state->shared->matrices.encodings(); + if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW || + std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) { + return JXL_FAILURE( + "Quantization table is not a JPEG quantization table."); + } + for (size_t c = 0; c < 3; c++) { + if (dec_state->shared->frame_header.color_transform == + ColorTransform::kNone) { + dcoff[c] = 1024 / (*qe[0].qraw.qtable)[64 * c]; + } + for (size_t i = 0; i < 64; i++) { + // Transpose the matrix, as it will be used on the transposed block. + int n = qe[0].qraw.qtable->at(64 + i); + int d = qe[0].qraw.qtable->at(64 * c + i); + if (n <= 0 || d <= 0 || n >= 65536 || d >= 65536) { + return JXL_FAILURE("Invalid JPEG quantization table"); + } + scaled_qtable[64 * c + (i % 8) * 8 + (i / 8)] = + (1 << kCFLFixedPointPrecision) * n / d; + } + } + } + + size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)}; + size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)}; + Rect r[3]; + for (size_t i = 0; i < 3; i++) { + r[i] = + Rect(block_rect.x0() >> hshift[i], block_rect.y0() >> vshift[i], + block_rect.xsize() >> hshift[i], block_rect.ysize() >> vshift[i]); + if (!r[i].IsInside({0, 0, dec_state->shared->dc->Plane(i).xsize(), + dec_state->shared->dc->Plane(i).ysize()})) { + return JXL_FAILURE("Frame dimensions are too big for the image."); + } + } + + for (size_t by = 0; by < ysize_blocks; ++by) { + get_block->StartRow(by); + size_t sby[3] = {by >> vshift[0], by >> vshift[1], by >> vshift[2]}; + + const int32_t* JXL_RESTRICT row_quant = + block_rect.ConstRow(dec_state->shared->raw_quant_field, by); + + const float* JXL_RESTRICT dc_rows[3] = { + r[0].ConstPlaneRow(*dec_state->shared->dc, 0, sby[0]), + r[1].ConstPlaneRow(*dec_state->shared->dc, 1, sby[1]), + r[2].ConstPlaneRow(*dec_state->shared->dc, 2, sby[2]), + }; + + const size_t ty = (block_rect.y0() + by) / kColorTileDimInBlocks; + AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by); + + const int8_t* JXL_RESTRICT row_cmap[3] = { + dec_state->shared->cmap.ytox_map.ConstRow(ty), + nullptr, + dec_state->shared->cmap.ytob_map.ConstRow(ty), + }; + + float* JXL_RESTRICT idct_row[3]; + int16_t* JXL_RESTRICT jpeg_row[3]; + for (size_t c = 0; c < 3; c++) { + idct_row[c] = render_pipeline_input.GetBuffer(c).second.Row( + render_pipeline_input.GetBuffer(c).first, sby[c] * kBlockDim); + if (decoded->IsJPEG()) { + auto& component = decoded->jpeg_data->components[jpeg_c_map[c]]; + jpeg_row[c] = + component.coeffs.data() + + (component.width_in_blocks * (r[c].y0() + sby[c]) + r[c].x0()) * + kDCTBlockSize; + } + } + + size_t bx = 0; + for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks); + tx++) { + size_t abs_tx = tx + block_rect.x0() / kColorTileDimInBlocks; + auto x_cc_mul = + Set(d, dec_state->shared->cmap.YtoXRatio(row_cmap[0][abs_tx])); + auto b_cc_mul = + Set(d, dec_state->shared->cmap.YtoBRatio(row_cmap[2][abs_tx])); + // Increment bx by llf_x because those iterations would otherwise + // immediately continue (!IsFirstBlock). Reduces mispredictions. + for (; bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks;) { + size_t sbx[3] = {bx >> hshift[0], bx >> hshift[1], bx >> hshift[2]}; + AcStrategy acs = acs_row[bx]; + const size_t llf_x = acs.covered_blocks_x(); + + // Can only happen in the second or lower rows of a varblock. + if (JXL_UNLIKELY(!acs.IsFirstBlock())) { + bx += llf_x; + continue; + } + PROFILER_ZONE("DecodeGroupImpl inner"); + const size_t log2_covered_blocks = acs.log2_covered_blocks(); + + const size_t covered_blocks = 1 << log2_covered_blocks; + const size_t size = covered_blocks * kDCTBlockSize; + + ACPtr qblock[3]; + if (accumulate) { + for (size_t c = 0; c < 3; c++) { + qblock[c] = dec_state->coefficients->PlaneRow(c, group_idx, offset); + } + } else { + // No point in reading from bitstream without accumulating and not + // drawing. + JXL_ASSERT(draw == kDraw); + if (ac_type == ACType::k16) { + memset(group_dec_cache->dec_group_qblock16, 0, + size * 3 * sizeof(int16_t)); + for (size_t c = 0; c < 3; c++) { + qblock[c].ptr16 = group_dec_cache->dec_group_qblock16 + c * size; + } + } else { + memset(group_dec_cache->dec_group_qblock, 0, + size * 3 * sizeof(int32_t)); + for (size_t c = 0; c < 3; c++) { + qblock[c].ptr32 = group_dec_cache->dec_group_qblock + c * size; + } + } + } + JXL_RETURN_IF_ERROR(get_block->LoadBlock( + bx, by, acs, size, log2_covered_blocks, qblock, ac_type)); + offset += size; + if (draw == kDontDraw) { + bx += llf_x; + continue; + } + + if (JXL_UNLIKELY(decoded->IsJPEG())) { + if (acs.Strategy() != AcStrategy::Type::DCT) { + return JXL_FAILURE( + "Can only decode to JPEG if only DCT-8 is used."); + } + + HWY_ALIGN int32_t transposed_dct_y[64]; + for (size_t c : {1, 0, 2}) { + // Propagate only Y for grayscale. + if (jpeg_is_gray && c != 1) { + continue; + } + if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) { + continue; + } + int16_t* JXL_RESTRICT jpeg_pos = + jpeg_row[c] + sbx[c] * kDCTBlockSize; + // JPEG XL is transposed, JPEG is not. + auto transposed_dct = qblock[c].ptr32; + Transpose8x8InPlace(transposed_dct); + // No CfL - no need to store the y block converted to integers. + if (!cs.Is444() || + (row_cmap[0][abs_tx] == 0 && row_cmap[2][abs_tx] == 0)) { + for (size_t i = 0; i < 64; i += Lanes(d)) { + const auto ini = Load(di, transposed_dct + i); + const auto ini16 = DemoteTo(di16, ini); + StoreU(ini16, di16, jpeg_pos + i); + } + } else if (c == 1) { + // Y channel: save for restoring X/B, but nothing else to do. + for (size_t i = 0; i < 64; i += Lanes(d)) { + const auto ini = Load(di, transposed_dct + i); + Store(ini, di, transposed_dct_y + i); + const auto ini16 = DemoteTo(di16, ini); + StoreU(ini16, di16, jpeg_pos + i); + } + } else { + // transposed_dct_y contains the y channel block, transposed. + const auto scale = Set( + di, dec_state->shared->cmap.RatioJPEG(row_cmap[c][abs_tx])); + const auto round = Set(di, 1 << (kCFLFixedPointPrecision - 1)); + for (int i = 0; i < 64; i += Lanes(d)) { + auto in = Load(di, transposed_dct + i); + auto in_y = Load(di, transposed_dct_y + i); + auto qt = Load(di, scaled_qtable + c * size + i); + auto coeff_scale = ShiftRight( + Add(Mul(qt, scale), round)); + auto cfl_factor = ShiftRight( + Add(Mul(in_y, coeff_scale), round)); + StoreU(DemoteTo(di16, Add(in, cfl_factor)), di16, jpeg_pos + i); + } + } + jpeg_pos[0] = + Clamp1(dc_rows[c][sbx[c]] - dcoff[c], -2047, 2047); + } + } else { + HWY_ALIGN float* const block = group_dec_cache->dec_group_block; + // Dequantize and add predictions. + dequant_block( + acs, inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier, + dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.RawStrategy(), + size, dec_state->shared->quantizer, + acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows, + dc_stride, + dec_state->output_encoding_info.opsin_params.quant_biases, qblock, + block); + + for (size_t c : {1, 0, 2}) { + if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) { + continue; + } + // IDCT + float* JXL_RESTRICT idct_pos = idct_row[c] + sbx[c] * kBlockDim; + TransformToPixels(acs.Strategy(), block + c * size, idct_pos, + idct_stride[c], group_dec_cache->scratch_space); + } + } + bx += llf_x; + } + } + } + if (draw == kDontDraw) { + return true; + } + return true; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +namespace { +// Decode quantized AC coefficients of DCT blocks. +// LLF components in the output block will not be modified. +template +Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks, + int32_t* JXL_RESTRICT row_nzeros, + const int32_t* JXL_RESTRICT row_nzeros_top, + size_t nzeros_stride, size_t c, size_t bx, size_t by, + size_t lbx, AcStrategy acs, + const coeff_order_t* JXL_RESTRICT coeff_order, + BitReader* JXL_RESTRICT br, + ANSSymbolReader* JXL_RESTRICT decoder, + const std::vector& context_map, + const uint8_t* qdc_row, const int32_t* qf_row, + const BlockCtxMap& block_ctx_map, ACPtr block, + size_t shift = 0) { + PROFILER_FUNC; + // Equal to number of LLF coefficients. + const size_t covered_blocks = 1 << log2_covered_blocks; + const size_t size = covered_blocks * kDCTBlockSize; + int32_t predicted_nzeros = + PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32); + + size_t ord = kStrategyOrder[acs.RawStrategy()]; + const coeff_order_t* JXL_RESTRICT order = + &coeff_order[CoeffOrderOffset(ord, c)]; + + size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c); + const int32_t nzero_ctx = + block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset; + + size_t nzeros = decoder->ReadHybridUint(nzero_ctx, br, context_map); + if (nzeros + covered_blocks > size) { + return JXL_FAILURE("Invalid AC: nzeros too large"); + } + for (size_t y = 0; y < acs.covered_blocks_y(); y++) { + for (size_t x = 0; x < acs.covered_blocks_x(); x++) { + row_nzeros[bx + x + y * nzeros_stride] = + (nzeros + covered_blocks - 1) >> log2_covered_blocks; + } + } + + const size_t histo_offset = + ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx); + + // Skip LLF + { + PROFILER_ZONE("AcDecSkipLLF, reader"); + size_t prev = (nzeros > size / 16 ? 0 : 1); + for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) { + const size_t ctx = + histo_offset + ZeroDensityContext(nzeros, k, covered_blocks, + log2_covered_blocks, prev); + const size_t u_coeff = decoder->ReadHybridUint(ctx, br, context_map); + // Hand-rolled version of UnpackSigned, shifting before the conversion to + // signed integer to avoid undefined behavior of shifting negative + // numbers. + const size_t magnitude = u_coeff >> 1; + const size_t neg_sign = (~u_coeff) & 1; + const intptr_t coeff = + static_cast((magnitude ^ (neg_sign - 1)) << shift); + if (ac_type == ACType::k16) { + block.ptr16[order[k]] += coeff; + } else { + block.ptr32[order[k]] += coeff; + } + prev = static_cast(u_coeff != 0); + nzeros -= prev; + } + if (JXL_UNLIKELY(nzeros != 0)) { + return JXL_FAILURE("Invalid AC: nzeros not 0. Block (%" PRIuS ", %" PRIuS + "), channel %" PRIuS, + bx, by, c); + } + } + return true; +} + +// Structs used by DecodeGroupImpl to get a quantized block. +// GetBlockFromBitstream uses ANS decoding (and thus keeps track of row +// pointers in row_nzeros), GetBlockFromEncoder simply reads the coefficient +// image provided by the encoder. + +struct GetBlockFromBitstream : public GetBlock { + void StartRow(size_t by) override { + qf_row = rect.ConstRow(*qf, by); + for (size_t c = 0; c < 3; c++) { + size_t sby = by >> vshift[c]; + quant_dc_row = quant_dc->ConstRow(rect.y0() + by) + rect.x0(); + for (size_t i = 0; i < num_passes; i++) { + row_nzeros[i][c] = group_dec_cache->num_nzeroes[i].PlaneRow(c, sby); + row_nzeros_top[i][c] = + sby == 0 + ? nullptr + : group_dec_cache->num_nzeroes[i].ConstPlaneRow(c, sby - 1); + } + } + } + + Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size, + size_t log2_covered_blocks, ACPtr block[3], + ACType ac_type) override { + auto decode_ac_varblock = ac_type == ACType::k16 + ? DecodeACVarBlock + : DecodeACVarBlock; + for (size_t c : {1, 0, 2}) { + size_t sbx = bx >> hshift[c]; + size_t sby = by >> vshift[c]; + if (JXL_UNLIKELY((sbx << hshift[c] != bx) || (sby << vshift[c] != by))) { + continue; + } + + for (size_t pass = 0; JXL_UNLIKELY(pass < num_passes); pass++) { + JXL_RETURN_IF_ERROR(decode_ac_varblock( + ctx_offset[pass], log2_covered_blocks, row_nzeros[pass][c], + row_nzeros_top[pass][c], nzeros_stride, c, sbx, sby, bx, acs, + &coeff_orders[pass * coeff_order_size], readers[pass], + &decoders[pass], context_map[pass], quant_dc_row, qf_row, + *block_ctx_map, block[c], shift_for_pass[pass])); + } + } + return true; + } + + Status Init(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, size_t num_passes, + size_t group_idx, size_t histo_selector_bits, const Rect& rect, + GroupDecCache* JXL_RESTRICT group_dec_cache, + PassesDecoderState* dec_state, size_t first_pass) { + for (size_t i = 0; i < 3; i++) { + hshift[i] = dec_state->shared->frame_header.chroma_subsampling.HShift(i); + vshift[i] = dec_state->shared->frame_header.chroma_subsampling.VShift(i); + } + this->coeff_order_size = dec_state->shared->coeff_order_size; + this->coeff_orders = + dec_state->shared->coeff_orders.data() + first_pass * coeff_order_size; + this->context_map = dec_state->context_map.data() + first_pass; + this->readers = readers; + this->num_passes = num_passes; + this->shift_for_pass = + dec_state->shared->frame_header.passes.shift + first_pass; + this->group_dec_cache = group_dec_cache; + this->rect = rect; + block_ctx_map = &dec_state->shared->block_ctx_map; + qf = &dec_state->shared->raw_quant_field; + quant_dc = &dec_state->shared->quant_dc; + + for (size_t pass = 0; pass < num_passes; pass++) { + // Select which histogram set to use among those of the current pass. + size_t cur_histogram = 0; + if (histo_selector_bits != 0) { + cur_histogram = readers[pass]->ReadBits(histo_selector_bits); + } + if (cur_histogram >= dec_state->shared->num_histograms) { + return JXL_FAILURE("Invalid histogram selector"); + } + ctx_offset[pass] = cur_histogram * block_ctx_map->NumACContexts(); + + decoders[pass] = + ANSSymbolReader(&dec_state->code[pass + first_pass], readers[pass]); + } + nzeros_stride = group_dec_cache->num_nzeroes[0].PixelsPerRow(); + for (size_t i = 0; i < num_passes; i++) { + JXL_ASSERT( + nzeros_stride == + static_cast(group_dec_cache->num_nzeroes[i].PixelsPerRow())); + } + return true; + } + + const uint32_t* shift_for_pass = nullptr; // not owned + const coeff_order_t* JXL_RESTRICT coeff_orders; + size_t coeff_order_size; + const std::vector* JXL_RESTRICT context_map; + ANSSymbolReader decoders[kMaxNumPasses]; + BitReader* JXL_RESTRICT* JXL_RESTRICT readers; + size_t num_passes; + size_t ctx_offset[kMaxNumPasses]; + size_t nzeros_stride; + int32_t* JXL_RESTRICT row_nzeros[kMaxNumPasses][3]; + const int32_t* JXL_RESTRICT row_nzeros_top[kMaxNumPasses][3]; + GroupDecCache* JXL_RESTRICT group_dec_cache; + const BlockCtxMap* block_ctx_map; + const ImageI* qf; + const ImageB* quant_dc; + const int32_t* qf_row; + const uint8_t* quant_dc_row; + Rect rect; + size_t hshift[3], vshift[3]; +}; + +struct GetBlockFromEncoder : public GetBlock { + void StartRow(size_t by) override {} + + Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size, + size_t log2_covered_blocks, ACPtr block[3], + ACType ac_type) override { + JXL_DASSERT(ac_type == ACType::k32); + for (size_t c = 0; c < 3; c++) { + // for each pass + for (size_t i = 0; i < quantized_ac->size(); i++) { + for (size_t k = 0; k < size; k++) { + // TODO(veluca): SIMD. + block[c].ptr32[k] += + rows[i][c][offset + k] * (1 << shift_for_pass[i]); + } + } + } + offset += size; + return true; + } + + GetBlockFromEncoder(const std::vector>& ac, + size_t group_idx, const uint32_t* shift_for_pass) + : quantized_ac(&ac), shift_for_pass(shift_for_pass) { + // TODO(veluca): not supported with chroma subsampling. + for (size_t i = 0; i < quantized_ac->size(); i++) { + JXL_CHECK((*quantized_ac)[i]->Type() == ACType::k32); + for (size_t c = 0; c < 3; c++) { + rows[i][c] = (*quantized_ac)[i]->PlaneRow(c, group_idx, 0).ptr32; + } + } + } + + const std::vector>* JXL_RESTRICT quantized_ac; + size_t offset = 0; + const int32_t* JXL_RESTRICT rows[kMaxNumPasses][3]; + const uint32_t* shift_for_pass = nullptr; // not owned +}; + +HWY_EXPORT(DecodeGroupImpl); + +} // namespace + +Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, + size_t num_passes, size_t group_idx, + PassesDecoderState* JXL_RESTRICT dec_state, + GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread, + RenderPipelineInput& render_pipeline_input, + ImageBundle* JXL_RESTRICT decoded, size_t first_pass, + bool force_draw, bool dc_only, bool* should_run_pipeline) { + PROFILER_FUNC; + + DrawMode draw = (num_passes + first_pass == + dec_state->shared->frame_header.passes.num_passes) || + force_draw + ? kDraw + : kDontDraw; + + if (should_run_pipeline) { + *should_run_pipeline = draw != kDontDraw; + } + + if (draw == kDraw && num_passes == 0 && first_pass == 0) { + group_dec_cache->InitDCBufferOnce(); + const YCbCrChromaSubsampling& cs = + dec_state->shared->frame_header.chroma_subsampling; + for (size_t c : {0, 1, 2}) { + size_t hs = cs.HShift(c); + size_t vs = cs.VShift(c); + // We reuse filter_input_storage here as it is not currently in use. + const Rect src_rect_precs = dec_state->shared->BlockGroupRect(group_idx); + const Rect src_rect = + Rect(src_rect_precs.x0() >> hs, src_rect_precs.y0() >> vs, + src_rect_precs.xsize() >> hs, src_rect_precs.ysize() >> vs); + const Rect copy_rect(kRenderPipelineXOffset, 2, src_rect.xsize(), + src_rect.ysize()); + CopyImageToWithPadding(src_rect, dec_state->shared->dc->Plane(c), 2, + copy_rect, &group_dec_cache->dc_buffer); + // Mirrorpad. Interleaving left and right padding ensures that padding + // works out correctly even for images with DC size of 1. + for (size_t y = 0; y < src_rect.ysize() + 4; y++) { + size_t xend = kRenderPipelineXOffset + + (dec_state->shared->dc->Plane(c).xsize() >> hs) - + src_rect.x0(); + for (size_t ix = 0; ix < 2; ix++) { + if (src_rect.x0() == 0) { + group_dec_cache->dc_buffer.Row(y)[kRenderPipelineXOffset - ix - 1] = + group_dec_cache->dc_buffer.Row(y)[kRenderPipelineXOffset + ix]; + } + if (src_rect.x0() + src_rect.xsize() + 2 >= + (dec_state->shared->dc->xsize() >> hs)) { + group_dec_cache->dc_buffer.Row(y)[xend + ix] = + group_dec_cache->dc_buffer.Row(y)[xend - ix - 1]; + } + } + } + Rect dst_rect = render_pipeline_input.GetBuffer(c).second; + ImageF* upsampling_dst = render_pipeline_input.GetBuffer(c).first; + JXL_ASSERT(dst_rect.IsInside(*upsampling_dst)); + + RenderPipelineStage::RowInfo input_rows(1, std::vector(5)); + RenderPipelineStage::RowInfo output_rows(1, std::vector(8)); + for (size_t y = src_rect.y0(); y < src_rect.y0() + src_rect.ysize(); + y++) { + for (ssize_t iy = 0; iy < 5; iy++) { + input_rows[0][iy] = group_dec_cache->dc_buffer.Row( + Mirror(ssize_t(y) + iy - 2, + dec_state->shared->dc->Plane(c).ysize() >> vs) + + 2 - src_rect.y0()); + } + for (size_t iy = 0; iy < 8; iy++) { + output_rows[0][iy] = + dst_rect.Row(upsampling_dst, ((y - src_rect.y0()) << 3) + iy) - + kRenderPipelineXOffset; + } + // Arguments set to 0/nullptr are not used. + dec_state->upsampler8x->ProcessRow(input_rows, output_rows, + /*xextra=*/0, src_rect.xsize(), 0, 0, + thread); + } + } + return true; + } + + size_t histo_selector_bits = 0; + if (dc_only) { + JXL_ASSERT(num_passes == 0); + } else { + JXL_ASSERT(dec_state->shared->num_histograms > 0); + histo_selector_bits = CeilLog2Nonzero(dec_state->shared->num_histograms); + } + + GetBlockFromBitstream get_block; + JXL_RETURN_IF_ERROR( + get_block.Init(readers, num_passes, group_idx, histo_selector_bits, + dec_state->shared->BlockGroupRect(group_idx), + group_dec_cache, dec_state, first_pass)); + + JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)( + &get_block, group_dec_cache, dec_state, thread, group_idx, + render_pipeline_input, decoded, draw)); + + for (size_t pass = 0; pass < num_passes; pass++) { + if (!get_block.decoders[pass].CheckANSFinalState()) { + return JXL_FAILURE("ANS checksum failure."); + } + } + return true; +} + +Status DecodeGroupForRoundtrip(const std::vector>& ac, + size_t group_idx, + PassesDecoderState* JXL_RESTRICT dec_state, + GroupDecCache* JXL_RESTRICT group_dec_cache, + size_t thread, + RenderPipelineInput& render_pipeline_input, + ImageBundle* JXL_RESTRICT decoded, + AuxOut* aux_out) { + PROFILER_FUNC; + + GetBlockFromEncoder get_block(ac, group_idx, + dec_state->shared->frame_header.passes.shift); + group_dec_cache->InitOnce( + /*num_passes=*/0, + /*used_acs=*/(1u << AcStrategy::kNumValidStrategies) - 1); + + return HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)( + &get_block, group_dec_cache, dec_state, thread, group_idx, + render_pipeline_input, decoded, kDraw); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/dec_group.h b/third_party/jpeg-xl/lib/jxl/dec_group.h new file mode 100644 index 0000000000..e32ea67b5f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_group.h @@ -0,0 +1,49 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_GROUP_H_ +#define LIB_JXL_DEC_GROUP_H_ + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +struct AuxOut; + +Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, + size_t num_passes, size_t group_idx, + PassesDecoderState* JXL_RESTRICT dec_state, + GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread, + RenderPipelineInput& render_pipeline_input, + ImageBundle* JXL_RESTRICT decoded, size_t first_pass, + bool force_draw, bool dc_only, bool* should_run_pipeline); + +Status DecodeGroupForRoundtrip(const std::vector>& ac, + size_t group_idx, + PassesDecoderState* JXL_RESTRICT dec_state, + GroupDecCache* JXL_RESTRICT group_dec_cache, + size_t thread, + RenderPipelineInput& render_pipeline_input, + ImageBundle* JXL_RESTRICT decoded, + AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_DEC_GROUP_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_group_border.cc b/third_party/jpeg-xl/lib/jxl/dec_group_border.cc new file mode 100644 index 0000000000..4bee3ae6ef --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_group_border.cc @@ -0,0 +1,184 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_group_border.h" + +#include + +namespace jxl { + +void GroupBorderAssigner::Init(const FrameDimensions& frame_dim) { + frame_dim_ = frame_dim; + size_t num_corners = + (frame_dim_.xsize_groups + 1) * (frame_dim_.ysize_groups + 1); + counters_.reset(new std::atomic[num_corners]); + // Initialize counters. + for (size_t y = 0; y < frame_dim_.ysize_groups + 1; y++) { + for (size_t x = 0; x < frame_dim_.xsize_groups + 1; x++) { + // Counters at image borders don't have anything on the other side, we + // pre-fill their value to have more uniform handling afterwards. + uint8_t init_value = 0; + if (x == 0) { + init_value |= kTopLeft | kBottomLeft; + } + if (x == frame_dim_.xsize_groups) { + init_value |= kTopRight | kBottomRight; + } + if (y == 0) { + init_value |= kTopLeft | kTopRight; + } + if (y == frame_dim_.ysize_groups) { + init_value |= kBottomLeft | kBottomRight; + } + counters_[y * (frame_dim_.xsize_groups + 1) + x] = init_value; + } + } +} + +void GroupBorderAssigner::ClearDone(size_t group_id) { + size_t x = group_id % frame_dim_.xsize_groups; + size_t y = group_id / frame_dim_.xsize_groups; + size_t top_left_idx = y * (frame_dim_.xsize_groups + 1) + x; + size_t top_right_idx = y * (frame_dim_.xsize_groups + 1) + x + 1; + size_t bottom_right_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x + 1; + size_t bottom_left_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x; + counters_[top_left_idx].fetch_and(~kBottomRight); + counters_[top_right_idx].fetch_and(~kBottomLeft); + counters_[bottom_left_idx].fetch_and(~kTopRight); + counters_[bottom_right_idx].fetch_and(~kTopLeft); +} + +// Looking at each corner between groups, we can guarantee that the four +// involved groups will agree between each other regarding the order in which +// each of the four groups terminated. Thus, the last of the four groups +// gets the responsibility of handling the corner. For borders, every border +// is assigned to its top corner (for vertical borders) or to its left corner +// (for horizontal borders): the order as seen on those corners will decide who +// handles that border. + +void GroupBorderAssigner::GroupDone(size_t group_id, size_t padx, size_t pady, + Rect* rects_to_finalize, + size_t* num_to_finalize) { + size_t x = group_id % frame_dim_.xsize_groups; + size_t y = group_id / frame_dim_.xsize_groups; + Rect block_rect(x * frame_dim_.group_dim / kBlockDim, + y * frame_dim_.group_dim / kBlockDim, + frame_dim_.group_dim / kBlockDim, + frame_dim_.group_dim / kBlockDim, frame_dim_.xsize_blocks, + frame_dim_.ysize_blocks); + + size_t top_left_idx = y * (frame_dim_.xsize_groups + 1) + x; + size_t top_right_idx = y * (frame_dim_.xsize_groups + 1) + x + 1; + size_t bottom_right_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x + 1; + size_t bottom_left_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x; + + auto fetch_status = [this](size_t idx, uint8_t bit) { + // Note that the acq-rel semantics of this fetch are actually needed to + // ensure that the pixel data of the group is already written to memory. + size_t status = counters_[idx].fetch_or(bit); + JXL_DASSERT((bit & status) == 0); + return bit | status; + }; + + size_t top_left_status = fetch_status(top_left_idx, kBottomRight); + size_t top_right_status = fetch_status(top_right_idx, kBottomLeft); + size_t bottom_right_status = fetch_status(bottom_right_idx, kTopLeft); + size_t bottom_left_status = fetch_status(bottom_left_idx, kTopRight); + + size_t x1 = block_rect.x0() + block_rect.xsize(); + size_t y1 = block_rect.y0() + block_rect.ysize(); + + bool is_last_group_x = frame_dim_.xsize_groups == x + 1; + bool is_last_group_y = frame_dim_.ysize_groups == y + 1; + + // Start of border of neighbouring group, end of border of this group, start + // of border of this group (on the other side), end of border of next group. + size_t xpos[4] = { + block_rect.x0() == 0 ? 0 : block_rect.x0() * kBlockDim - padx, + block_rect.x0() == 0 + ? 0 + : std::min(frame_dim_.xsize, block_rect.x0() * kBlockDim + padx), + is_last_group_x ? frame_dim_.xsize : x1 * kBlockDim - padx, + std::min(frame_dim_.xsize, x1 * kBlockDim + padx)}; + size_t ypos[4] = { + block_rect.y0() == 0 ? 0 : block_rect.y0() * kBlockDim - pady, + block_rect.y0() == 0 + ? 0 + : std::min(frame_dim_.ysize, block_rect.y0() * kBlockDim + pady), + is_last_group_y ? frame_dim_.ysize : y1 * kBlockDim - pady, + std::min(frame_dim_.ysize, y1 * kBlockDim + pady)}; + + *num_to_finalize = 0; + auto append_rect = [&](size_t x0, size_t x1, size_t y0, size_t y1) { + Rect rect(xpos[x0], ypos[y0], xpos[x1] - xpos[x0], ypos[y1] - ypos[y0]); + if (rect.xsize() == 0 || rect.ysize() == 0) return; + JXL_DASSERT(*num_to_finalize < kMaxToFinalize); + rects_to_finalize[(*num_to_finalize)++] = rect; + }; + + // Because of how group borders are assigned, it is impossible that we need to + // process the left and right side of some area but not the center area. Thus, + // we compute the first/last part to process in every horizontal strip and + // merge them together. We first collect a mask of what parts should be + // processed. + // We do this horizontally rather than vertically because horizontal borders + // are larger. + bool available_parts_mask[3][3] = {}; // [x][y] + // Center + available_parts_mask[1][1] = true; + // Corners + if (top_left_status == 0xF) available_parts_mask[0][0] = true; + if (top_right_status == 0xF) available_parts_mask[2][0] = true; + if (bottom_right_status == 0xF) available_parts_mask[2][2] = true; + if (bottom_left_status == 0xF) available_parts_mask[0][2] = true; + // Other borders + if (top_left_status & kTopRight) available_parts_mask[1][0] = true; + if (top_left_status & kBottomLeft) available_parts_mask[0][1] = true; + if (top_right_status & kBottomRight) available_parts_mask[2][1] = true; + if (bottom_left_status & kBottomRight) available_parts_mask[1][2] = true; + + // Collect horizontal ranges. + constexpr size_t kNoSegment = 3; + std::pair horizontal_segments[3] = {{kNoSegment, kNoSegment}, + {kNoSegment, kNoSegment}, + {kNoSegment, kNoSegment}}; + for (size_t y = 0; y < 3; y++) { + for (size_t x = 0; x < 3; x++) { + if (!available_parts_mask[x][y]) continue; + JXL_DASSERT(horizontal_segments[y].second == kNoSegment || + horizontal_segments[y].second == x); + JXL_DASSERT((horizontal_segments[y].first == kNoSegment) == + (horizontal_segments[y].second == kNoSegment)); + if (horizontal_segments[y].first == kNoSegment) { + horizontal_segments[y].first = x; + } + horizontal_segments[y].second = x + 1; + } + } + if (horizontal_segments[0] == horizontal_segments[1] && + horizontal_segments[0] == horizontal_segments[2]) { + append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0, + 3); + } else if (horizontal_segments[0] == horizontal_segments[1]) { + append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0, + 2); + append_rect(horizontal_segments[2].first, horizontal_segments[2].second, 2, + 3); + } else if (horizontal_segments[1] == horizontal_segments[2]) { + append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0, + 1); + append_rect(horizontal_segments[1].first, horizontal_segments[1].second, 1, + 3); + } else { + append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0, + 1); + append_rect(horizontal_segments[1].first, horizontal_segments[1].second, 1, + 2); + append_rect(horizontal_segments[2].first, horizontal_segments[2].second, 2, + 3); + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dec_group_border.h b/third_party/jpeg-xl/lib/jxl/dec_group_border.h new file mode 100644 index 0000000000..2d974c9987 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_group_border.h @@ -0,0 +1,47 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_GROUP_BORDER_H_ +#define LIB_JXL_DEC_GROUP_BORDER_H_ + +#include + +#include + +#include "lib/jxl/base/arch_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" + +namespace jxl { + +class GroupBorderAssigner { + public: + // Prepare the GroupBorderAssigner to handle a given frame. + void Init(const FrameDimensions& frame_dim); + // Marks a group as done, and returns the (at most 3) rects to run + // FinalizeImageRect on. `block_rect` must be the rect corresponding + // to the given `group_id`, measured in blocks. + void GroupDone(size_t group_id, size_t padx, size_t pady, + Rect* rects_to_finalize, size_t* num_to_finalize); + // Marks a group as not-done, for running re-paints. + void ClearDone(size_t group_id); + + static constexpr size_t kMaxToFinalize = 3; + + private: + FrameDimensions frame_dim_; + std::unique_ptr[]> counters_; + + // Constants to identify group positions relative to the corners. + static constexpr uint8_t kTopLeft = 0x01; + static constexpr uint8_t kTopRight = 0x02; + static constexpr uint8_t kBottomRight = 0x04; + static constexpr uint8_t kBottomLeft = 0x08; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_GROUP_BORDER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_huffman.cc b/third_party/jpeg-xl/lib/jxl/dec_huffman.cc new file mode 100644 index 0000000000..05b275773a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_huffman.cc @@ -0,0 +1,255 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_huffman.h" + +#include /* for memset */ + +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/huffman_table.h" + +namespace jxl { + +static const int kCodeLengthCodes = 18; +static const uint8_t kCodeLengthCodeOrder[kCodeLengthCodes] = { + 1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; +static const uint8_t kDefaultCodeLength = 8; +static const uint8_t kCodeLengthRepeatCode = 16; + +int ReadHuffmanCodeLengths(const uint8_t* code_length_code_lengths, + int num_symbols, uint8_t* code_lengths, + BitReader* br) { + int symbol = 0; + uint8_t prev_code_len = kDefaultCodeLength; + int repeat = 0; + uint8_t repeat_code_len = 0; + int space = 32768; + HuffmanCode table[32]; + + uint16_t counts[16] = {0}; + for (int i = 0; i < kCodeLengthCodes; ++i) { + ++counts[code_length_code_lengths[i]]; + } + if (!BuildHuffmanTable(table, 5, code_length_code_lengths, kCodeLengthCodes, + &counts[0])) { + return 0; + } + + while (symbol < num_symbols && space > 0) { + const HuffmanCode* p = table; + uint8_t code_len; + br->Refill(); + p += br->PeekFixedBits<5>(); + br->Consume(p->bits); + code_len = (uint8_t)p->value; + if (code_len < kCodeLengthRepeatCode) { + repeat = 0; + code_lengths[symbol++] = code_len; + if (code_len != 0) { + prev_code_len = code_len; + space -= 32768u >> code_len; + } + } else { + const int extra_bits = code_len - 14; + int old_repeat; + int repeat_delta; + uint8_t new_len = 0; + if (code_len == kCodeLengthRepeatCode) { + new_len = prev_code_len; + } + if (repeat_code_len != new_len) { + repeat = 0; + repeat_code_len = new_len; + } + old_repeat = repeat; + if (repeat > 0) { + repeat -= 2; + repeat <<= extra_bits; + } + repeat += (int)br->ReadBits(extra_bits) + 3; + repeat_delta = repeat - old_repeat; + if (symbol + repeat_delta > num_symbols) { + return 0; + } + memset(&code_lengths[symbol], repeat_code_len, (size_t)repeat_delta); + symbol += repeat_delta; + if (repeat_code_len != 0) { + space -= repeat_delta << (15 - repeat_code_len); + } + } + } + if (space != 0) { + return 0; + } + memset(&code_lengths[symbol], 0, (size_t)(num_symbols - symbol)); + return true; +} + +static JXL_INLINE bool ReadSimpleCode(size_t alphabet_size, BitReader* br, + HuffmanCode* table) { + size_t max_bits = + (alphabet_size > 1u) ? FloorLog2Nonzero(alphabet_size - 1u) + 1 : 0; + + size_t num_symbols = br->ReadFixedBits<2>() + 1; + + uint16_t symbols[4] = {0}; + for (size_t i = 0; i < num_symbols; ++i) { + uint16_t symbol = br->ReadBits(max_bits); + if (symbol >= alphabet_size) { + return false; + } + symbols[i] = symbol; + } + + for (size_t i = 0; i < num_symbols - 1; ++i) { + for (size_t j = i + 1; j < num_symbols; ++j) { + if (symbols[i] == symbols[j]) return false; + } + } + + // 4 symbols have to option to encode. + if (num_symbols == 4) num_symbols += br->ReadFixedBits<1>(); + + const auto swap_symbols = [&symbols](size_t i, size_t j) { + uint16_t t = symbols[j]; + symbols[j] = symbols[i]; + symbols[i] = t; + }; + + size_t table_size = 1; + switch (num_symbols) { + case 1: + table[0] = {0, symbols[0]}; + break; + case 2: + if (symbols[0] > symbols[1]) swap_symbols(0, 1); + table[0] = {1, symbols[0]}; + table[1] = {1, symbols[1]}; + table_size = 2; + break; + case 3: + if (symbols[1] > symbols[2]) swap_symbols(1, 2); + table[0] = {1, symbols[0]}; + table[2] = {1, symbols[0]}; + table[1] = {2, symbols[1]}; + table[3] = {2, symbols[2]}; + table_size = 4; + break; + case 4: { + for (size_t i = 0; i < 3; ++i) { + for (size_t j = i + 1; j < 4; ++j) { + if (symbols[i] > symbols[j]) swap_symbols(i, j); + } + } + table[0] = {2, symbols[0]}; + table[2] = {2, symbols[1]}; + table[1] = {2, symbols[2]}; + table[3] = {2, symbols[3]}; + table_size = 4; + break; + } + case 5: { + if (symbols[2] > symbols[3]) swap_symbols(2, 3); + table[0] = {1, symbols[0]}; + table[1] = {2, symbols[1]}; + table[2] = {1, symbols[0]}; + table[3] = {3, symbols[2]}; + table[4] = {1, symbols[0]}; + table[5] = {2, symbols[1]}; + table[6] = {1, symbols[0]}; + table[7] = {3, symbols[3]}; + table_size = 8; + break; + } + default: { + // Unreachable. + return false; + } + } + + const uint32_t goal_size = 1u << kHuffmanTableBits; + while (table_size != goal_size) { + memcpy(&table[table_size], &table[0], + (size_t)table_size * sizeof(table[0])); + table_size <<= 1; + } + + return true; +} + +bool HuffmanDecodingData::ReadFromBitStream(size_t alphabet_size, + BitReader* br) { + if (alphabet_size > (1 << PREFIX_MAX_BITS)) return false; + + /* simple_code_or_skip is used as follows: + 1 for simple code; + 0 for no skipping, 2 skips 2 code lengths, 3 skips 3 code lengths */ + uint32_t simple_code_or_skip = br->ReadFixedBits<2>(); + if (simple_code_or_skip == 1u) { + table_.resize(1u << kHuffmanTableBits); + return ReadSimpleCode(alphabet_size, br, table_.data()); + } + + std::vector code_lengths(alphabet_size, 0); + uint8_t code_length_code_lengths[kCodeLengthCodes] = {0}; + int space = 32; + int num_codes = 0; + /* Static Huffman code for the code length code lengths */ + static const HuffmanCode huff[16] = { + {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 1}, + {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 5}, + }; + for (size_t i = simple_code_or_skip; i < kCodeLengthCodes && space > 0; ++i) { + const int code_len_idx = kCodeLengthCodeOrder[i]; + const HuffmanCode* p = huff; + uint8_t v; + br->Refill(); + p += br->PeekFixedBits<4>(); + br->Consume(p->bits); + v = (uint8_t)p->value; + code_length_code_lengths[code_len_idx] = v; + if (v != 0) { + space -= (32u >> v); + ++num_codes; + } + } + bool ok = (num_codes == 1 || space == 0) && + ReadHuffmanCodeLengths(code_length_code_lengths, alphabet_size, + &code_lengths[0], br); + + if (!ok) return false; + uint16_t counts[16] = {0}; + for (size_t i = 0; i < alphabet_size; ++i) { + ++counts[code_lengths[i]]; + } + table_.resize(alphabet_size + 376); + uint32_t table_size = + BuildHuffmanTable(table_.data(), kHuffmanTableBits, &code_lengths[0], + alphabet_size, &counts[0]); + table_.resize(table_size); + return (table_size > 0); +} + +// Decodes the next Huffman coded symbol from the bit-stream. +uint16_t HuffmanDecodingData::ReadSymbol(BitReader* br) const { + size_t n_bits; + const HuffmanCode* table = table_.data(); + table += br->PeekBits(kHuffmanTableBits); + n_bits = table->bits; + if (n_bits > kHuffmanTableBits) { + br->Consume(kHuffmanTableBits); + n_bits -= kHuffmanTableBits; + table += table->value; + table += br->PeekBits(n_bits); + } + br->Consume(table->bits); + return table->value; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dec_huffman.h b/third_party/jpeg-xl/lib/jxl/dec_huffman.h new file mode 100644 index 0000000000..162c3e309c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_huffman.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_HUFFMAN_H_ +#define LIB_JXL_DEC_HUFFMAN_H_ + +#include +#include + +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/huffman_table.h" + +namespace jxl { + +static constexpr size_t kHuffmanTableBits = 8u; + +struct HuffmanDecodingData { + // Decodes the Huffman code lengths from the bit-stream and fills in the + // pre-allocated table with the corresponding 2-level Huffman decoding table. + // Returns false if the Huffman code lengths can not de decoded. + bool ReadFromBitStream(size_t alphabet_size, BitReader* br); + + uint16_t ReadSymbol(BitReader* br) const; + + std::vector table_; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_HUFFMAN_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_modular.cc b/third_party/jpeg-xl/lib/jxl/dec_modular.cc new file mode 100644 index 0000000000..bf85eaa05c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_modular.cc @@ -0,0 +1,774 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_modular.h" + +#include + +#include +#include +#include + +#include "lib/jxl/frame_header.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_modular.cc" +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::Rebind; + +void MultiplySum(const size_t xsize, + const pixel_type* const JXL_RESTRICT row_in, + const pixel_type* const JXL_RESTRICT row_in_Y, + const float factor, float* const JXL_RESTRICT row_out) { + const HWY_FULL(float) df; + const Rebind di; // assumes pixel_type <= float + const auto factor_v = Set(df, factor); + for (size_t x = 0; x < xsize; x += Lanes(di)) { + const auto in = Add(Load(di, row_in + x), Load(di, row_in_Y + x)); + const auto out = Mul(ConvertTo(df, in), factor_v); + Store(out, df, row_out + x); + } +} + +void RgbFromSingle(const size_t xsize, + const pixel_type* const JXL_RESTRICT row_in, + const float factor, float* out_r, float* out_g, + float* out_b) { + const HWY_FULL(float) df; + const Rebind di; // assumes pixel_type <= float + + const auto factor_v = Set(df, factor); + for (size_t x = 0; x < xsize; x += Lanes(di)) { + const auto in = Load(di, row_in + x); + const auto out = Mul(ConvertTo(df, in), factor_v); + Store(out, df, out_r + x); + Store(out, df, out_g + x); + Store(out, df, out_b + x); + } +} + +void SingleFromSingle(const size_t xsize, + const pixel_type* const JXL_RESTRICT row_in, + const float factor, float* row_out) { + const HWY_FULL(float) df; + const Rebind di; // assumes pixel_type <= float + + const auto factor_v = Set(df, factor); + for (size_t x = 0; x < xsize; x += Lanes(di)) { + const auto in = Load(di, row_in + x); + const auto out = Mul(ConvertTo(df, in), factor_v); + Store(out, df, row_out + x); + } +} +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(MultiplySum); // Local function +HWY_EXPORT(RgbFromSingle); // Local function +HWY_EXPORT(SingleFromSingle); // Local function + +// Slow conversion using double precision multiplication, only +// needed when the bit depth is too high for single precision +void SingleFromSingleAccurate(const size_t xsize, + const pixel_type* const JXL_RESTRICT row_in, + const double factor, float* row_out) { + for (size_t x = 0; x < xsize; x++) { + row_out[x] = row_in[x] * factor; + } +} + +// convert custom [bits]-bit float (with [exp_bits] exponent bits) stored as int +// back to binary32 float +void int_to_float(const pixel_type* const JXL_RESTRICT row_in, + float* const JXL_RESTRICT row_out, const size_t xsize, + const int bits, const int exp_bits) { + if (bits == 32) { + JXL_ASSERT(sizeof(pixel_type) == sizeof(float)); + JXL_ASSERT(exp_bits == 8); + memcpy(row_out, row_in, xsize * sizeof(float)); + return; + } + int exp_bias = (1 << (exp_bits - 1)) - 1; + int sign_shift = bits - 1; + int mant_bits = bits - exp_bits - 1; + int mant_shift = 23 - mant_bits; + for (size_t x = 0; x < xsize; ++x) { + uint32_t f; + memcpy(&f, &row_in[x], 4); + int signbit = (f >> sign_shift); + f &= (1 << sign_shift) - 1; + if (f == 0) { + row_out[x] = (signbit ? -0.f : 0.f); + continue; + } + int exp = (f >> mant_bits); + int mantissa = (f & ((1 << mant_bits) - 1)); + mantissa <<= mant_shift; + // Try to normalize only if there is space for maneuver. + if (exp == 0 && exp_bits < 8) { + // subnormal number + while ((mantissa & 0x800000) == 0) { + mantissa <<= 1; + exp--; + } + exp++; + // remove leading 1 because it is implicit now + mantissa &= 0x7fffff; + } + exp -= exp_bias; + // broke up the arbitrary float into its parts, now reassemble into + // binary32 + exp += 127; + JXL_ASSERT(exp >= 0); + f = (signbit ? 0x80000000 : 0); + f |= (exp << 23); + f |= mantissa; + memcpy(&row_out[x], &f, 4); + } +} + +std::string ModularStreamId::DebugString() const { + std::ostringstream os; + os << (kind == kGlobalData ? "ModularGlobal" + : kind == kVarDCTDC ? "VarDCTDC" + : kind == kModularDC ? "ModularDC" + : kind == kACMetadata ? "ACMeta" + : kind == kQuantTable ? "QuantTable" + : kind == kModularAC ? "ModularAC" + : ""); + if (kind == kVarDCTDC || kind == kModularDC || kind == kACMetadata || + kind == kModularAC) { + os << " group " << group_id; + } + if (kind == kModularAC) { + os << " pass " << pass_id; + } + if (kind == kQuantTable) { + os << " " << quant_table_id; + } + return os.str(); +} + +Status ModularFrameDecoder::DecodeGlobalInfo(BitReader* reader, + const FrameHeader& frame_header, + bool allow_truncated_group) { + bool decode_color = frame_header.encoding == FrameEncoding::kModular; + const auto& metadata = frame_header.nonserialized_metadata->m; + bool is_gray = metadata.color_encoding.IsGray(); + size_t nb_chans = 3; + if (is_gray && frame_header.color_transform == ColorTransform::kNone) { + nb_chans = 1; + } + do_color = decode_color; + size_t nb_extra = metadata.extra_channel_info.size(); + bool has_tree = reader->ReadBits(1); + if (!allow_truncated_group || + reader->TotalBitsConsumed() < reader->TotalBytes() * kBitsPerByte) { + if (has_tree) { + size_t tree_size_limit = + std::min(static_cast(1 << 22), + 1024 + frame_dim.xsize * frame_dim.ysize * + (nb_chans + nb_extra) / 16); + JXL_RETURN_IF_ERROR(DecodeTree(reader, &tree, tree_size_limit)); + JXL_RETURN_IF_ERROR( + DecodeHistograms(reader, (tree.size() + 1) / 2, &code, &context_map)); + } + } + if (!do_color) nb_chans = 0; + + bool fp = metadata.bit_depth.floating_point_sample; + + // bits_per_sample is just metadata for XYB images. + if (metadata.bit_depth.bits_per_sample >= 32 && do_color && + frame_header.color_transform != ColorTransform::kXYB) { + if (metadata.bit_depth.bits_per_sample == 32 && fp == false) { + return JXL_FAILURE("uint32_t not supported in dec_modular"); + } else if (metadata.bit_depth.bits_per_sample > 32) { + return JXL_FAILURE("bits_per_sample > 32 not supported"); + } + } + + Image gi(frame_dim.xsize, frame_dim.ysize, metadata.bit_depth.bits_per_sample, + nb_chans + nb_extra); + + all_same_shift = true; + if (frame_header.color_transform == ColorTransform::kYCbCr) { + for (size_t c = 0; c < nb_chans; c++) { + gi.channel[c].hshift = frame_header.chroma_subsampling.HShift(c); + gi.channel[c].vshift = frame_header.chroma_subsampling.VShift(c); + size_t xsize_shifted = + DivCeil(frame_dim.xsize, 1 << gi.channel[c].hshift); + size_t ysize_shifted = + DivCeil(frame_dim.ysize, 1 << gi.channel[c].vshift); + gi.channel[c].shrink(xsize_shifted, ysize_shifted); + if (gi.channel[c].hshift != gi.channel[0].hshift || + gi.channel[c].vshift != gi.channel[0].vshift) + all_same_shift = false; + } + } + + for (size_t ec = 0, c = nb_chans; ec < nb_extra; ec++, c++) { + size_t ecups = frame_header.extra_channel_upsampling[ec]; + gi.channel[c].shrink(DivCeil(frame_dim.xsize_upsampled, ecups), + DivCeil(frame_dim.ysize_upsampled, ecups)); + gi.channel[c].hshift = gi.channel[c].vshift = + CeilLog2Nonzero(ecups) - CeilLog2Nonzero(frame_header.upsampling); + if (gi.channel[c].hshift != gi.channel[0].hshift || + gi.channel[c].vshift != gi.channel[0].vshift) + all_same_shift = false; + } + + JXL_DEBUG_V(6, "DecodeGlobalInfo: full_image (w/o transforms) %s", + gi.DebugString().c_str()); + ModularOptions options; + options.max_chan_size = frame_dim.group_dim; + options.group_dim = frame_dim.group_dim; + Status dec_status = ModularGenericDecompress( + reader, gi, &global_header, ModularStreamId::Global().ID(frame_dim), + &options, + /*undo_transforms=*/false, &tree, &code, &context_map, + allow_truncated_group); + if (!allow_truncated_group) JXL_RETURN_IF_ERROR(dec_status); + if (dec_status.IsFatalError()) { + return JXL_FAILURE("Failed to decode global modular info"); + } + + // TODO(eustas): are we sure this can be done after partial decode? + have_something = false; + for (size_t c = 0; c < gi.channel.size(); c++) { + Channel& gic = gi.channel[c]; + if (c >= gi.nb_meta_channels && gic.w <= frame_dim.group_dim && + gic.h <= frame_dim.group_dim) + have_something = true; + } + // move global transforms to groups if possible + if (!have_something && all_same_shift) { + if (gi.transform.size() == 1 && gi.transform[0].id == TransformId::kRCT) { + global_transform = gi.transform; + gi.transform.clear(); + // TODO(jon): also move no-delta-palette out (trickier though) + } + } + full_image = std::move(gi); + JXL_DEBUG_V(6, "DecodeGlobalInfo: full_image (with transforms) %s", + full_image.DebugString().c_str()); + return dec_status; +} + +void ModularFrameDecoder::MaybeDropFullImage() { + if (full_image.transform.empty() && !have_something && all_same_shift) { + use_full_image = false; + JXL_DEBUG_V(6, "Dropping full image"); + for (auto& ch : full_image.channel) { + // keep metadata on channels around, but dealloc their planes + ch.plane = Plane(); + } + } +} + +Status ModularFrameDecoder::DecodeGroup( + const Rect& rect, BitReader* reader, int minShift, int maxShift, + const ModularStreamId& stream, bool zerofill, PassesDecoderState* dec_state, + RenderPipelineInput* render_pipeline_input, bool allow_truncated, + bool* should_run_pipeline) { + JXL_DEBUG_V(6, "Decoding %s with rect %s and shift bracket %d..%d %s", + stream.DebugString().c_str(), Description(rect).c_str(), minShift, + maxShift, zerofill ? "using zerofill" : ""); + JXL_DASSERT(stream.kind == ModularStreamId::kModularDC || + stream.kind == ModularStreamId::kModularAC); + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + Image gi(xsize, ysize, full_image.bitdepth, 0); + // start at the first bigger-than-groupsize non-metachannel + size_t c = full_image.nb_meta_channels; + for (; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + if (fc.w > frame_dim.group_dim || fc.h > frame_dim.group_dim) break; + } + size_t beginc = c; + for (; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + int shift = std::min(fc.hshift, fc.vshift); + if (shift > maxShift) continue; + if (shift < minShift) continue; + Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift, + rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h); + if (r.xsize() == 0 || r.ysize() == 0) continue; + if (zerofill && use_full_image) { + for (size_t y = 0; y < r.ysize(); ++y) { + pixel_type* const JXL_RESTRICT row_out = r.Row(&fc.plane, y); + memset(row_out, 0, r.xsize() * sizeof(*row_out)); + } + } else { + Channel gc(r.xsize(), r.ysize()); + if (zerofill) ZeroFillImage(&gc.plane); + gc.hshift = fc.hshift; + gc.vshift = fc.vshift; + gi.channel.emplace_back(std::move(gc)); + } + } + if (zerofill && use_full_image) return true; + // Return early if there's nothing to decode. Otherwise there might be + // problems later (in ModularImageToDecodedRect). + if (gi.channel.empty()) { + if (dec_state && should_run_pipeline) { + const auto& frame_header = dec_state->shared->frame_header; + const auto* metadata = frame_header.nonserialized_metadata; + if (do_color || metadata->m.num_extra_channels > 0) { + // Signal to FrameDecoder that we do not have some of the required input + // for the render pipeline. + *should_run_pipeline = false; + } + } + JXL_DEBUG_V(6, "Nothing to decode, returning early."); + return true; + } + ModularOptions options; + if (!zerofill) { + auto status = ModularGenericDecompress( + reader, gi, /*header=*/nullptr, stream.ID(frame_dim), &options, + /*undo_transforms=*/true, &tree, &code, &context_map, allow_truncated); + if (!allow_truncated) JXL_RETURN_IF_ERROR(status); + if (status.IsFatalError()) return status; + } + // Undo global transforms that have been pushed to the group level + if (!use_full_image) { + JXL_ASSERT(render_pipeline_input); + for (auto t : global_transform) { + JXL_RETURN_IF_ERROR(t.Inverse(gi, global_header.wp_header)); + } + JXL_RETURN_IF_ERROR(ModularImageToDecodedRect(gi, dec_state, nullptr, + *render_pipeline_input, + Rect(0, 0, gi.w, gi.h))); + return true; + } + int gic = 0; + for (c = beginc; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + int shift = std::min(fc.hshift, fc.vshift); + if (shift > maxShift) continue; + if (shift < minShift) continue; + Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift, + rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h); + if (r.xsize() == 0 || r.ysize() == 0) continue; + JXL_ASSERT(use_full_image); + CopyImageTo(/*rect_from=*/Rect(0, 0, r.xsize(), r.ysize()), + /*from=*/gi.channel[gic].plane, + /*rect_to=*/r, /*to=*/&fc.plane); + gic++; + } + return true; +} + +Status ModularFrameDecoder::DecodeVarDCTDC(size_t group_id, BitReader* reader, + PassesDecoderState* dec_state) { + const Rect r = dec_state->shared->DCGroupRect(group_id); + // TODO(eustas): investigate if we could reduce the impact of + // EvalRationalPolynomial; generally speaking, the limit is + // 2**(128/(3*magic)), where 128 comes from IEEE 754 exponent, + // 3 comes from XybToRgb that cubes the values, and "magic" is + // the sum of all other contributions. 2**18 is known to lead + // to NaN on input found by fuzzing (see commit message). + Image image(r.xsize(), r.ysize(), full_image.bitdepth, 3); + size_t stream_id = ModularStreamId::VarDCTDC(group_id).ID(frame_dim); + reader->Refill(); + size_t extra_precision = reader->ReadFixedBits<2>(); + float mul = 1.0f / (1 << extra_precision); + ModularOptions options; + for (size_t c = 0; c < 3; c++) { + Channel& ch = image.channel[c < 2 ? c ^ 1 : c]; + ch.w >>= dec_state->shared->frame_header.chroma_subsampling.HShift(c); + ch.h >>= dec_state->shared->frame_header.chroma_subsampling.VShift(c); + ch.shrink(); + } + if (!ModularGenericDecompress( + reader, image, /*header=*/nullptr, stream_id, &options, + /*undo_transforms=*/true, &tree, &code, &context_map)) { + return JXL_FAILURE("Failed to decode modular DC group"); + } + DequantDC(r, &dec_state->shared_storage.dc_storage, + &dec_state->shared_storage.quant_dc, image, + dec_state->shared->quantizer.MulDC(), mul, + dec_state->shared->cmap.DCFactors(), + dec_state->shared->frame_header.chroma_subsampling, + dec_state->shared->block_ctx_map); + return true; +} + +Status ModularFrameDecoder::DecodeAcMetadata(size_t group_id, BitReader* reader, + PassesDecoderState* dec_state) { + const Rect r = dec_state->shared->DCGroupRect(group_id); + size_t upper_bound = r.xsize() * r.ysize(); + reader->Refill(); + size_t count = reader->ReadBits(CeilLog2Nonzero(upper_bound)) + 1; + size_t stream_id = ModularStreamId::ACMetadata(group_id).ID(frame_dim); + // YToX, YToB, ACS + QF, EPF + Image image(r.xsize(), r.ysize(), full_image.bitdepth, 4); + static_assert(kColorTileDimInBlocks == 8, "Color tile size changed"); + Rect cr(r.x0() >> 3, r.y0() >> 3, (r.xsize() + 7) >> 3, (r.ysize() + 7) >> 3); + image.channel[0] = Channel(cr.xsize(), cr.ysize(), 3, 3); + image.channel[1] = Channel(cr.xsize(), cr.ysize(), 3, 3); + image.channel[2] = Channel(count, 2, 0, 0); + ModularOptions options; + if (!ModularGenericDecompress( + reader, image, /*header=*/nullptr, stream_id, &options, + /*undo_transforms=*/true, &tree, &code, &context_map)) { + return JXL_FAILURE("Failed to decode AC metadata"); + } + ConvertPlaneAndClamp(Rect(image.channel[0].plane), image.channel[0].plane, cr, + &dec_state->shared_storage.cmap.ytox_map); + ConvertPlaneAndClamp(Rect(image.channel[1].plane), image.channel[1].plane, cr, + &dec_state->shared_storage.cmap.ytob_map); + size_t num = 0; + bool is444 = dec_state->shared->frame_header.chroma_subsampling.Is444(); + auto& ac_strategy = dec_state->shared_storage.ac_strategy; + size_t xlim = std::min(ac_strategy.xsize(), r.x0() + r.xsize()); + size_t ylim = std::min(ac_strategy.ysize(), r.y0() + r.ysize()); + uint32_t local_used_acs = 0; + for (size_t iy = 0; iy < r.ysize(); iy++) { + size_t y = r.y0() + iy; + int32_t* row_qf = r.Row(&dec_state->shared_storage.raw_quant_field, iy); + uint8_t* row_epf = r.Row(&dec_state->shared_storage.epf_sharpness, iy); + int32_t* row_in_1 = image.channel[2].plane.Row(0); + int32_t* row_in_2 = image.channel[2].plane.Row(1); + int32_t* row_in_3 = image.channel[3].plane.Row(iy); + for (size_t ix = 0; ix < r.xsize(); ix++) { + size_t x = r.x0() + ix; + int sharpness = row_in_3[ix]; + if (sharpness < 0 || sharpness >= LoopFilter::kEpfSharpEntries) { + return JXL_FAILURE("Corrupted sharpness field"); + } + row_epf[ix] = sharpness; + if (ac_strategy.IsValid(x, y)) { + continue; + } + + if (num >= count) return JXL_FAILURE("Corrupted stream"); + + if (!AcStrategy::IsRawStrategyValid(row_in_1[num])) { + return JXL_FAILURE("Invalid AC strategy"); + } + local_used_acs |= 1u << row_in_1[num]; + AcStrategy acs = AcStrategy::FromRawStrategy(row_in_1[num]); + if ((acs.covered_blocks_x() > 1 || acs.covered_blocks_y() > 1) && + !is444) { + return JXL_FAILURE( + "AC strategy not compatible with chroma subsampling"); + } + // Ensure that blocks do not overflow *AC* groups. + size_t next_x_ac_block = (x / kGroupDimInBlocks + 1) * kGroupDimInBlocks; + size_t next_y_ac_block = (y / kGroupDimInBlocks + 1) * kGroupDimInBlocks; + size_t next_x_dct_block = x + acs.covered_blocks_x(); + size_t next_y_dct_block = y + acs.covered_blocks_y(); + if (next_x_dct_block > next_x_ac_block || next_x_dct_block > xlim) { + return JXL_FAILURE("Invalid AC strategy, x overflow"); + } + if (next_y_dct_block > next_y_ac_block || next_y_dct_block > ylim) { + return JXL_FAILURE("Invalid AC strategy, y overflow"); + } + JXL_RETURN_IF_ERROR( + ac_strategy.SetNoBoundsCheck(x, y, AcStrategy::Type(row_in_1[num]))); + row_qf[ix] = 1 + std::max(0, std::min(Quantizer::kQuantMax - 1, + row_in_2[num])); + num++; + } + } + dec_state->used_acs |= local_used_acs; + if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) { + ComputeSigma(r, dec_state); + } + return true; +} + +Status ModularFrameDecoder::ModularImageToDecodedRect( + Image& gi, PassesDecoderState* dec_state, jxl::ThreadPool* pool, + RenderPipelineInput& render_pipeline_input, Rect modular_rect) { + const auto& frame_header = dec_state->shared->frame_header; + const auto* metadata = frame_header.nonserialized_metadata; + JXL_CHECK(gi.transform.empty()); + + auto get_row = [&](size_t c, size_t y) { + const auto& buffer = render_pipeline_input.GetBuffer(c); + return buffer.second.Row(buffer.first, y); + }; + + size_t c = 0; + if (do_color) { + const bool rgb_from_gray = + metadata->m.color_encoding.IsGray() && + frame_header.color_transform == ColorTransform::kNone; + const bool fp = metadata->m.bit_depth.floating_point_sample && + frame_header.color_transform != ColorTransform::kXYB; + for (; c < 3; c++) { + double factor = full_image.bitdepth < 32 + ? 1.0 / ((1u << full_image.bitdepth) - 1) + : 0; + size_t c_in = c; + if (frame_header.color_transform == ColorTransform::kXYB) { + factor = dec_state->shared->matrices.DCQuants()[c]; + // XYB is encoded as YX(B-Y) + if (c < 2) c_in = 1 - c; + } else if (rgb_from_gray) { + c_in = 0; + } + JXL_ASSERT(c_in < gi.channel.size()); + Channel& ch_in = gi.channel[c_in]; + // TODO(eustas): could we detect it on earlier stage? + if (ch_in.w == 0 || ch_in.h == 0) { + return JXL_FAILURE("Empty image"); + } + JXL_CHECK(ch_in.hshift <= 3 && ch_in.vshift <= 3); + Rect r = render_pipeline_input.GetBuffer(c).second; + Rect mr(modular_rect.x0() >> ch_in.hshift, + modular_rect.y0() >> ch_in.vshift, + DivCeil(modular_rect.xsize(), 1 << ch_in.hshift), + DivCeil(modular_rect.ysize(), 1 << ch_in.vshift)); + mr = mr.Crop(ch_in.plane); + size_t xsize_shifted = r.xsize(); + size_t ysize_shifted = r.ysize(); + if (r.ysize() != mr.ysize() || r.xsize() != mr.xsize()) { + return JXL_FAILURE("Dimension mismatch: trying to fit a %" PRIuS + "x%" PRIuS + " modular channel into " + "a %" PRIuS "x%" PRIuS " rect", + mr.xsize(), mr.ysize(), r.xsize(), r.ysize()); + } + if (frame_header.color_transform == ColorTransform::kXYB && c == 2) { + JXL_ASSERT(!fp); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, ysize_shifted, ThreadPool::NoInit, + [&](const uint32_t task, size_t /* thread */) { + const size_t y = task; + const pixel_type* const JXL_RESTRICT row_in = + mr.Row(&ch_in.plane, y); + const pixel_type* const JXL_RESTRICT row_in_Y = + mr.Row(&gi.channel[0].plane, y); + float* const JXL_RESTRICT row_out = get_row(c, y); + HWY_DYNAMIC_DISPATCH(MultiplySum) + (xsize_shifted, row_in, row_in_Y, factor, row_out); + }, + "ModularIntToFloat")); + } else if (fp) { + int bits = metadata->m.bit_depth.bits_per_sample; + int exp_bits = metadata->m.bit_depth.exponent_bits_per_sample; + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, ysize_shifted, ThreadPool::NoInit, + [&](const uint32_t task, size_t /* thread */) { + const size_t y = task; + const pixel_type* const JXL_RESTRICT row_in = + mr.Row(&ch_in.plane, y); + if (rgb_from_gray) { + for (size_t cc = 0; cc < 3; cc++) { + float* const JXL_RESTRICT row_out = get_row(cc, y); + int_to_float(row_in, row_out, xsize_shifted, bits, exp_bits); + } + } else { + float* const JXL_RESTRICT row_out = get_row(c, y); + int_to_float(row_in, row_out, xsize_shifted, bits, exp_bits); + } + }, + "ModularIntToFloat_losslessfloat")); + } else { + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, ysize_shifted, ThreadPool::NoInit, + [&](const uint32_t task, size_t /* thread */) { + const size_t y = task; + const pixel_type* const JXL_RESTRICT row_in = + mr.Row(&ch_in.plane, y); + if (rgb_from_gray) { + if (full_image.bitdepth < 23) { + HWY_DYNAMIC_DISPATCH(RgbFromSingle) + (xsize_shifted, row_in, factor, get_row(0, y), get_row(1, y), + get_row(2, y)); + } else { + SingleFromSingleAccurate(xsize_shifted, row_in, factor, + get_row(0, y)); + SingleFromSingleAccurate(xsize_shifted, row_in, factor, + get_row(1, y)); + SingleFromSingleAccurate(xsize_shifted, row_in, factor, + get_row(2, y)); + } + } else { + float* const JXL_RESTRICT row_out = get_row(c, y); + if (full_image.bitdepth < 23) { + HWY_DYNAMIC_DISPATCH(SingleFromSingle) + (xsize_shifted, row_in, factor, row_out); + } else { + SingleFromSingleAccurate(xsize_shifted, row_in, factor, + row_out); + } + } + }, + "ModularIntToFloat")); + } + if (rgb_from_gray) { + break; + } + } + if (rgb_from_gray) { + c = 1; + } + } + size_t num_extra_channels = metadata->m.num_extra_channels; + for (size_t ec = 0; ec < num_extra_channels; ec++, c++) { + const ExtraChannelInfo& eci = metadata->m.extra_channel_info[ec]; + int bits = eci.bit_depth.bits_per_sample; + int exp_bits = eci.bit_depth.exponent_bits_per_sample; + bool fp = eci.bit_depth.floating_point_sample; + JXL_ASSERT(fp || bits < 32); + const double factor = fp ? 0 : (1.0 / ((1u << bits) - 1)); + JXL_ASSERT(c < gi.channel.size()); + Channel& ch_in = gi.channel[c]; + Rect r = render_pipeline_input.GetBuffer(3 + ec).second; + Rect mr(modular_rect.x0() >> ch_in.hshift, + modular_rect.y0() >> ch_in.vshift, + DivCeil(modular_rect.xsize(), 1 << ch_in.hshift), + DivCeil(modular_rect.ysize(), 1 << ch_in.vshift)); + mr = mr.Crop(ch_in.plane); + if (r.ysize() != mr.ysize() || r.xsize() != mr.xsize()) { + return JXL_FAILURE("Dimension mismatch: trying to fit a %" PRIuS + "x%" PRIuS + " modular channel into " + "a %" PRIuS "x%" PRIuS " rect", + mr.xsize(), mr.ysize(), r.xsize(), r.ysize()); + } + for (size_t y = 0; y < r.ysize(); ++y) { + float* const JXL_RESTRICT row_out = + r.Row(render_pipeline_input.GetBuffer(3 + ec).first, y); + const pixel_type* const JXL_RESTRICT row_in = mr.Row(&ch_in.plane, y); + if (fp) { + int_to_float(row_in, row_out, r.xsize(), bits, exp_bits); + } else { + if (full_image.bitdepth < 23) { + HWY_DYNAMIC_DISPATCH(SingleFromSingle) + (r.xsize(), row_in, factor, row_out); + } else { + SingleFromSingleAccurate(r.xsize(), row_in, factor, row_out); + } + } + } + } + return true; +} + +Status ModularFrameDecoder::FinalizeDecoding(PassesDecoderState* dec_state, + jxl::ThreadPool* pool, + bool inplace) { + if (!use_full_image) return true; + Image gi = (inplace ? std::move(full_image) : full_image.clone()); + size_t xsize = gi.w; + size_t ysize = gi.h; + + JXL_DEBUG_V(3, "Finalizing decoding for modular image: %s", + gi.DebugString().c_str()); + + // Don't use threads if total image size is smaller than a group + if (xsize * ysize < frame_dim.group_dim * frame_dim.group_dim) pool = nullptr; + + // Undo the global transforms + gi.undo_transforms(global_header.wp_header, pool); + JXL_DASSERT(global_transform.empty()); + if (gi.error) return JXL_FAILURE("Undoing transforms failed"); + + for (size_t i = 0; i < dec_state->shared->frame_dim.num_groups; i++) { + dec_state->render_pipeline->ClearDone(i); + } + std::atomic has_error{false}; + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, dec_state->shared->frame_dim.num_groups, + [&](size_t num_threads) { + const auto& frame_header = dec_state->shared->frame_header; + bool use_group_ids = (frame_header.encoding == FrameEncoding::kVarDCT || + (frame_header.flags & FrameHeader::kNoise)); + return dec_state->render_pipeline->PrepareForThreads(num_threads, + use_group_ids); + }, + [&](const uint32_t group, size_t thread_id) { + RenderPipelineInput input = + dec_state->render_pipeline->GetInputBuffers(group, thread_id); + if (!ModularImageToDecodedRect(gi, dec_state, nullptr, input, + dec_state->shared->GroupRect(group))) { + has_error = true; + return; + } + input.Done(); + }, + "ModularToRect")); + if (has_error) { + return JXL_FAILURE("Error producing input to render pipeline"); + } + return true; +} + +static constexpr const float kAlmostZero = 1e-8f; + +Status ModularFrameDecoder::DecodeQuantTable( + size_t required_size_x, size_t required_size_y, BitReader* br, + QuantEncoding* encoding, size_t idx, + ModularFrameDecoder* modular_frame_decoder) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->qraw.qtable_den)); + if (encoding->qraw.qtable_den < kAlmostZero) { + // qtable[] values are already checked for <= 0 so the denominator may not + // be negative. + return JXL_FAILURE("Invalid qtable_den: value too small"); + } + Image image(required_size_x, required_size_y, 8, 3); + ModularOptions options; + if (modular_frame_decoder) { + JXL_RETURN_IF_ERROR(ModularGenericDecompress( + br, image, /*header=*/nullptr, + ModularStreamId::QuantTable(idx).ID(modular_frame_decoder->frame_dim), + &options, /*undo_transforms=*/true, &modular_frame_decoder->tree, + &modular_frame_decoder->code, &modular_frame_decoder->context_map)); + } else { + JXL_RETURN_IF_ERROR(ModularGenericDecompress(br, image, /*header=*/nullptr, + 0, &options, + /*undo_transforms=*/true)); + } + if (!encoding->qraw.qtable) { + encoding->qraw.qtable = new std::vector(); + } + encoding->qraw.qtable->resize(required_size_x * required_size_y * 3); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < required_size_y; y++) { + int32_t* JXL_RESTRICT row = image.channel[c].Row(y); + for (size_t x = 0; x < required_size_x; x++) { + (*encoding->qraw.qtable)[c * required_size_x * required_size_y + + y * required_size_x + x] = row[x]; + if (row[x] <= 0) { + return JXL_FAILURE("Invalid raw quantization table"); + } + } + } + } + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/dec_modular.h b/third_party/jpeg-xl/lib/jxl/dec_modular.h new file mode 100644 index 0000000000..aae643cf1f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_modular.h @@ -0,0 +1,140 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_MODULAR_H_ +#define LIB_JXL_DEC_MODULAR_H_ + +#include + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +struct ModularStreamId { + enum Kind { + kGlobalData, + kVarDCTDC, + kModularDC, + kACMetadata, + kQuantTable, + kModularAC + }; + Kind kind; + size_t quant_table_id; + size_t group_id; // DC or AC group id. + size_t pass_id; // Only for kModularAC. + size_t ID(const FrameDimensions& frame_dim) const { + size_t id = 0; + switch (kind) { + case kGlobalData: + id = 0; + break; + case kVarDCTDC: + id = 1 + group_id; + break; + case kModularDC: + id = 1 + frame_dim.num_dc_groups + group_id; + break; + case kACMetadata: + id = 1 + 2 * frame_dim.num_dc_groups + group_id; + break; + case kQuantTable: + id = 1 + 3 * frame_dim.num_dc_groups + quant_table_id; + break; + case kModularAC: + id = 1 + 3 * frame_dim.num_dc_groups + DequantMatrices::kNum + + frame_dim.num_groups * pass_id + group_id; + break; + }; + return id; + } + static ModularStreamId Global() { + return ModularStreamId{kGlobalData, 0, 0, 0}; + } + static ModularStreamId VarDCTDC(size_t group_id) { + return ModularStreamId{kVarDCTDC, 0, group_id, 0}; + } + static ModularStreamId ModularDC(size_t group_id) { + return ModularStreamId{kModularDC, 0, group_id, 0}; + } + static ModularStreamId ACMetadata(size_t group_id) { + return ModularStreamId{kACMetadata, 0, group_id, 0}; + } + static ModularStreamId QuantTable(size_t quant_table_id) { + JXL_ASSERT(quant_table_id < DequantMatrices::kNum); + return ModularStreamId{kQuantTable, quant_table_id, 0, 0}; + } + static ModularStreamId ModularAC(size_t group_id, size_t pass_id) { + return ModularStreamId{kModularAC, 0, group_id, pass_id}; + } + static size_t Num(const FrameDimensions& frame_dim, size_t passes) { + return ModularAC(0, passes).ID(frame_dim); + } + std::string DebugString() const; +}; + +class ModularFrameDecoder { + public: + void Init(const FrameDimensions& frame_dim) { this->frame_dim = frame_dim; } + Status DecodeGlobalInfo(BitReader* reader, const FrameHeader& frame_header, + bool allow_truncated_group); + Status DecodeGroup(const Rect& rect, BitReader* reader, int minShift, + int maxShift, const ModularStreamId& stream, bool zerofill, + PassesDecoderState* dec_state, + RenderPipelineInput* render_pipeline_input, + bool allow_truncated, bool* should_run_pipeline = nullptr); + // Decodes a VarDCT DC group (`group_id`) from the given `reader`. + Status DecodeVarDCTDC(size_t group_id, BitReader* reader, + PassesDecoderState* dec_state); + // Decodes a VarDCT AC Metadata group (`group_id`) from the given `reader`. + Status DecodeAcMetadata(size_t group_id, BitReader* reader, + PassesDecoderState* dec_state); + // Decodes a RAW quant table from `br` into the given `encoding`, of size + // `required_size_x x required_size_y`. If `modular_frame_decoder` is passed, + // its global tree is used, otherwise no global tree is used. + static Status DecodeQuantTable(size_t required_size_x, size_t required_size_y, + BitReader* br, QuantEncoding* encoding, + size_t idx, + ModularFrameDecoder* modular_frame_decoder); + // if inplace is true, this can only be called once + // if it is false, it can be called multiple times (e.g. for progressive + // steps) + Status FinalizeDecoding(PassesDecoderState* dec_state, jxl::ThreadPool* pool, + bool inplace); + bool have_dc() const { return have_something; } + void MaybeDropFullImage(); + bool UsesFullImage() const { return use_full_image; } + + private: + Status ModularImageToDecodedRect(Image& gi, PassesDecoderState* dec_state, + jxl::ThreadPool* pool, + RenderPipelineInput& render_pipeline_input, + Rect modular_rect); + + Image full_image; + std::vector global_transform; + FrameDimensions frame_dim; + bool do_color; + bool have_something; + bool use_full_image = true; + bool all_same_shift; + Tree tree; + ANSCode code; + std::vector context_map; + GroupHeader global_header; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_MODULAR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_noise.cc b/third_party/jpeg-xl/lib/jxl/dec_noise.cc new file mode 100644 index 0000000000..275a6d0b21 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_noise.cc @@ -0,0 +1,131 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_noise.h" + +#include +#include +#include + +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_noise.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/xorshift128plus-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Or; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::Vec; + +using D = HWY_CAPPED(float, kBlockDim); +using DI = hwy::HWY_NAMESPACE::Rebind; +using DI8 = hwy::HWY_NAMESPACE::Repartition; + +// Converts one vector's worth of random bits to floats in [1, 2). +// NOTE: as the convolution kernel sums to 0, it doesn't matter if inputs are in +// [0, 1) or in [1, 2). +void BitsToFloat(const uint32_t* JXL_RESTRICT random_bits, + float* JXL_RESTRICT floats) { + const HWY_FULL(float) df; + const HWY_FULL(uint32_t) du; + + const auto bits = Load(du, random_bits); + // 1.0 + 23 random mantissa bits = [1, 2) + const auto rand12 = BitCast(df, Or(ShiftRight<9>(bits), Set(du, 0x3F800000))); + Store(rand12, df, floats); +} + +void RandomImage(Xorshift128Plus* rng, const Rect& rect, + ImageF* JXL_RESTRICT noise) { + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + + // May exceed the vector size, hence we have two loops over x below. + constexpr size_t kFloatsPerBatch = + Xorshift128Plus::N * sizeof(uint64_t) / sizeof(float); + HWY_ALIGN uint64_t batch[Xorshift128Plus::N] = {}; + + const HWY_FULL(float) df; + const size_t N = Lanes(df); + + for (size_t y = 0; y < ysize; ++y) { + float* JXL_RESTRICT row = rect.Row(noise, y); + + size_t x = 0; + // Only entire batches (avoids exceeding the image padding). + for (; x + kFloatsPerBatch < xsize; x += kFloatsPerBatch) { + rng->Fill(batch); + for (size_t i = 0; i < kFloatsPerBatch; i += Lanes(df)) { + BitsToFloat(reinterpret_cast(batch) + i, row + x + i); + } + } + + // Any remaining pixels, rounded up to vectors (safe due to padding). + rng->Fill(batch); + size_t batch_pos = 0; // < kFloatsPerBatch + for (; x < xsize; x += N) { + BitsToFloat(reinterpret_cast(batch) + batch_pos, + row + x); + batch_pos += N; + } + } +} +void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index, + size_t x0, size_t y0, const std::pair& plane0, + const std::pair& plane1, + const std::pair& plane2) { + HWY_ALIGN Xorshift128Plus rng(visible_frame_index, nonvisible_frame_index, x0, + y0); + RandomImage(&rng, plane0.second, plane0.first); + RandomImage(&rng, plane1.second, plane1.first); + RandomImage(&rng, plane2.second, plane2.first); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(Random3Planes); +void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index, + size_t x0, size_t y0, const std::pair& plane0, + const std::pair& plane1, + const std::pair& plane2) { + return HWY_DYNAMIC_DISPATCH(Random3Planes)(visible_frame_index, + nonvisible_frame_index, x0, y0, + plane0, plane1, plane2); +} + +void DecodeFloatParam(float precision, float* val, BitReader* br) { + const int absval_quant = br->ReadFixedBits<10>(); + *val = absval_quant / precision; +} + +Status DecodeNoise(BitReader* br, NoiseParams* noise_params) { + for (float& i : noise_params->lut) { + DecodeFloatParam(kNoisePrecision, &i, br); + } + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/dec_noise.h b/third_party/jpeg-xl/lib/jxl/dec_noise.h new file mode 100644 index 0000000000..ac05866470 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_noise.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_NOISE_H_ +#define LIB_JXL_DEC_NOISE_H_ + +// Noise synthesis. Currently disabled. + +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/image.h" +#include "lib/jxl/noise.h" + +namespace jxl { + +void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index, + size_t x0, size_t y0, const std::pair& plane0, + const std::pair& plane1, + const std::pair& plane2); + +// Must only call if FrameHeader.flags.kNoise. +Status DecodeNoise(BitReader* br, NoiseParams* noise_params); + +} // namespace jxl + +#endif // LIB_JXL_DEC_NOISE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc new file mode 100644 index 0000000000..85e5de3c8d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc @@ -0,0 +1,347 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_patch_dictionary.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/blending.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/patch_dictionary_internal.h" + +namespace jxl { + +Status PatchDictionary::Decode(BitReader* br, size_t xsize, size_t ysize, + bool* uses_extra_channels) { + positions_.clear(); + std::vector context_map; + ANSCode code; + JXL_RETURN_IF_ERROR( + DecodeHistograms(br, kNumPatchDictionaryContexts, &code, &context_map)); + ANSSymbolReader decoder(&code, br); + + auto read_num = [&](size_t context) { + size_t r = decoder.ReadHybridUint(context, br, context_map); + return r; + }; + + size_t num_ref_patch = read_num(kNumRefPatchContext); + // Limit max memory usage of patches to about 66 bytes per pixel (assuming 8 + // bytes per size_t) + const size_t num_pixels = xsize * ysize; + const size_t max_ref_patches = 1024 + num_pixels / 4; + const size_t max_patches = max_ref_patches * 4; + const size_t max_blending_infos = max_patches * 4; + if (num_ref_patch > max_ref_patches) { + return JXL_FAILURE("Too many patches in dictionary"); + } + size_t num_ec = shared_->metadata->m.num_extra_channels; + + size_t total_patches = 0; + size_t next_size = 1; + + for (size_t id = 0; id < num_ref_patch; id++) { + PatchReferencePosition ref_pos; + ref_pos.ref = read_num(kReferenceFrameContext); + if (ref_pos.ref >= kMaxNumReferenceFrames || + shared_->reference_frames[ref_pos.ref].frame.xsize() == 0) { + return JXL_FAILURE("Invalid reference frame ID"); + } + if (!shared_->reference_frames[ref_pos.ref].ib_is_in_xyb) { + return JXL_FAILURE( + "Patches cannot use frames saved post color transforms"); + } + const ImageBundle& ib = shared_->reference_frames[ref_pos.ref].frame; + ref_pos.x0 = read_num(kPatchReferencePositionContext); + ref_pos.y0 = read_num(kPatchReferencePositionContext); + ref_pos.xsize = read_num(kPatchSizeContext) + 1; + ref_pos.ysize = read_num(kPatchSizeContext) + 1; + if (ref_pos.x0 + ref_pos.xsize > ib.xsize()) { + return JXL_FAILURE("Invalid position specified in reference frame"); + } + if (ref_pos.y0 + ref_pos.ysize > ib.ysize()) { + return JXL_FAILURE("Invalid position specified in reference frame"); + } + size_t id_count = read_num(kPatchCountContext) + 1; + total_patches += id_count; + if (total_patches > max_patches) { + return JXL_FAILURE("Too many patches in dictionary"); + } + if (next_size < total_patches) { + next_size *= 2; + next_size = std::min(next_size, max_patches); + } + if (next_size * (num_ec + 1) > max_blending_infos) { + return JXL_FAILURE("Too many patches in dictionary"); + } + positions_.reserve(next_size); + blendings_.reserve(next_size * (num_ec + 1)); + for (size_t i = 0; i < id_count; i++) { + PatchPosition pos; + pos.ref_pos_idx = ref_positions_.size(); + if (i == 0) { + pos.x = read_num(kPatchPositionContext); + pos.y = read_num(kPatchPositionContext); + } else { + pos.x = + positions_.back().x + UnpackSigned(read_num(kPatchOffsetContext)); + pos.y = + positions_.back().y + UnpackSigned(read_num(kPatchOffsetContext)); + } + if (pos.x + ref_pos.xsize > xsize) { + return JXL_FAILURE("Invalid patch x: at %" PRIuS " + %" PRIuS + " > %" PRIuS, + pos.x, ref_pos.xsize, xsize); + } + if (pos.y + ref_pos.ysize > ysize) { + return JXL_FAILURE("Invalid patch y: at %" PRIuS " + %" PRIuS + " > %" PRIuS, + pos.y, ref_pos.ysize, ysize); + } + for (size_t j = 0; j < num_ec + 1; j++) { + uint32_t blend_mode = read_num(kPatchBlendModeContext); + if (blend_mode >= uint32_t(PatchBlendMode::kNumBlendModes)) { + return JXL_FAILURE("Invalid patch blend mode: %u", blend_mode); + } + PatchBlending info; + info.mode = static_cast(blend_mode); + if (UsesAlpha(info.mode)) { + *uses_extra_channels = true; + } + if (info.mode != PatchBlendMode::kNone && j > 0) { + *uses_extra_channels = true; + } + if (UsesAlpha(info.mode) && + shared_->metadata->m.extra_channel_info.size() > 1) { + info.alpha_channel = read_num(kPatchAlphaChannelContext); + if (info.alpha_channel >= + shared_->metadata->m.extra_channel_info.size()) { + return JXL_FAILURE( + "Invalid alpha channel for blending: %u out of %u\n", + info.alpha_channel, + (uint32_t)shared_->metadata->m.extra_channel_info.size()); + } + } else { + info.alpha_channel = 0; + } + if (UsesClamp(info.mode)) { + info.clamp = read_num(kPatchClampContext); + } else { + info.clamp = false; + } + blendings_.push_back(info); + } + positions_.push_back(std::move(pos)); + } + ref_positions_.emplace_back(std::move(ref_pos)); + } + positions_.shrink_to_fit(); + + if (!decoder.CheckANSFinalState()) { + return JXL_FAILURE("ANS checksum failure."); + } + + ComputePatchTree(); + return true; +} + +int PatchDictionary::GetReferences() const { + int result = 0; + for (size_t i = 0; i < ref_positions_.size(); ++i) { + result |= (1 << static_cast(ref_positions_[i].ref)); + } + return result; +} + +namespace { +struct PatchInterval { + size_t idx; + size_t y0, y1; +}; +} // namespace + +void PatchDictionary::ComputePatchTree() { + patch_tree_.clear(); + num_patches_.clear(); + sorted_patches_y0_.clear(); + sorted_patches_y1_.clear(); + if (positions_.empty()) { + return; + } + // Create a y-interval for each patch. + std::vector intervals(positions_.size()); + for (size_t i = 0; i < positions_.size(); ++i) { + const auto& pos = positions_[i]; + intervals[i].idx = i; + intervals[i].y0 = pos.y; + intervals[i].y1 = pos.y + ref_positions_[pos.ref_pos_idx].ysize; + } + auto sort_by_y0 = [&intervals](size_t start, size_t end) { + std::sort(intervals.data() + start, intervals.data() + end, + [](const PatchInterval& i0, const PatchInterval& i1) { + return i0.y0 < i1.y0; + }); + }; + auto sort_by_y1 = [&intervals](size_t start, size_t end) { + std::sort(intervals.data() + start, intervals.data() + end, + [](const PatchInterval& i0, const PatchInterval& i1) { + return i0.y1 < i1.y1; + }); + }; + // Count the number of patches for each row. + sort_by_y1(0, intervals.size()); + num_patches_.resize(intervals.back().y1); + for (auto iv : intervals) { + for (size_t y = iv.y0; y < iv.y1; ++y) num_patches_[y]++; + } + PatchTreeNode root; + root.start = 0; + root.num = intervals.size(); + patch_tree_.push_back(root); + size_t next = 0; + while (next < patch_tree_.size()) { + auto& node = patch_tree_[next]; + size_t start = node.start; + size_t end = node.start + node.num; + // Choose the y_center for this node to be the median of interval starts. + sort_by_y0(start, end); + size_t middle_idx = start + node.num / 2; + node.y_center = intervals[middle_idx].y0; + // Divide the intervals in [start, end) into three groups: + // * those completely to the right of y_center: [right_start, end) + // * those overlapping y_center: [left_end, right_start) + // * those completely to the left of y_center: [start, left_end) + size_t right_start = middle_idx; + while (right_start < end && intervals[right_start].y0 == node.y_center) { + ++right_start; + } + sort_by_y1(start, right_start); + size_t left_end = right_start; + while (left_end > start && intervals[left_end - 1].y1 > node.y_center) { + --left_end; + } + // Fill in sorted_patches_y0_ and sorted_patches_y1_ for the current node. + node.num = right_start - left_end; + node.start = sorted_patches_y0_.size(); + for (ssize_t i = static_cast(right_start) - 1; + i >= static_cast(left_end); --i) { + sorted_patches_y1_.push_back({intervals[i].y1, intervals[i].idx}); + } + sort_by_y0(left_end, right_start); + for (size_t i = left_end; i < right_start; ++i) { + sorted_patches_y0_.push_back({intervals[i].y0, intervals[i].idx}); + } + // Create the left and right nodes (if not empty). + node.left_child = node.right_child = -1; + if (left_end > start) { + PatchTreeNode left; + left.start = start; + left.num = left_end - left.start; + patch_tree_[next].left_child = patch_tree_.size(); + patch_tree_.push_back(left); + } + if (right_start < end) { + PatchTreeNode right; + right.start = right_start; + right.num = end - right.start; + patch_tree_[next].right_child = patch_tree_.size(); + patch_tree_.push_back(right); + } + ++next; + } +} + +std::vector PatchDictionary::GetPatchesForRow(size_t y) const { + std::vector result; + if (y < num_patches_.size() && num_patches_[y] > 0) { + result.reserve(num_patches_[y]); + for (ssize_t tree_idx = 0; tree_idx != -1;) { + JXL_DASSERT(tree_idx < (ssize_t)patch_tree_.size()); + const auto& node = patch_tree_[tree_idx]; + if (y <= node.y_center) { + for (size_t i = 0; i < node.num; ++i) { + const auto& p = sorted_patches_y0_[node.start + i]; + if (y < p.first) break; + result.push_back(p.second); + } + tree_idx = y < node.y_center ? node.left_child : -1; + } else { + for (size_t i = 0; i < node.num; ++i) { + const auto& p = sorted_patches_y1_[node.start + i]; + if (y >= p.first) break; + result.push_back(p.second); + } + tree_idx = node.right_child; + } + } + // Ensure that he relative order of patches that affect the same pixels is + // preserved. This is important for patches that have a blend mode + // different from kAdd. + std::sort(result.begin(), result.end()); + } + return result; +} + +// Adds patches to a segment of `xsize` pixels, starting at `inout`, assumed +// to be located at position (x0, y) in the frame. +void PatchDictionary::AddOneRow(float* const* inout, size_t y, size_t x0, + size_t xsize) const { + size_t num_ec = shared_->metadata->m.num_extra_channels; + std::vector fg_ptrs(3 + num_ec); + for (size_t pos_idx : GetPatchesForRow(y)) { + const size_t blending_idx = pos_idx * (num_ec + 1); + const PatchPosition& pos = positions_[pos_idx]; + const PatchReferencePosition& ref_pos = ref_positions_[pos.ref_pos_idx]; + size_t by = pos.y; + size_t bx = pos.x; + size_t patch_xsize = ref_pos.xsize; + JXL_DASSERT(y >= by); + JXL_DASSERT(y < by + ref_pos.ysize); + size_t iy = y - by; + size_t ref = ref_pos.ref; + if (bx >= x0 + xsize) continue; + if (bx + patch_xsize < x0) continue; + size_t patch_x0 = std::max(bx, x0); + size_t patch_x1 = std::min(bx + patch_xsize, x0 + xsize); + for (size_t c = 0; c < 3; c++) { + fg_ptrs[c] = shared_->reference_frames[ref].frame.color().ConstPlaneRow( + c, ref_pos.y0 + iy) + + ref_pos.x0 + x0 - bx; + } + for (size_t i = 0; i < num_ec; i++) { + fg_ptrs[3 + i] = + shared_->reference_frames[ref].frame.extra_channels()[i].ConstRow( + ref_pos.y0 + iy) + + ref_pos.x0 + x0 - bx; + } + PerformBlending(inout, fg_ptrs.data(), inout, patch_x0 - x0, + patch_x1 - patch_x0, blendings_[blending_idx], + blendings_.data() + blending_idx + 1, + shared_->metadata->m.extra_channel_info); + } +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h new file mode 100644 index 0000000000..a950e83e85 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h @@ -0,0 +1,151 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_PATCH_DICTIONARY_H_ +#define LIB_JXL_DEC_PATCH_DICTIONARY_H_ + +// Chooses reference patches, and avoids encoding them once per occurrence. + +#include +#include +#include + +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/image.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { + +enum class PatchBlendMode : uint8_t { + // The new values are the old ones. Useful to skip some channels. + kNone = 0, + // The new values (in the crop) replace the old ones: sample = new + kReplace = 1, + // The new values (in the crop) get added to the old ones: sample = old + new + kAdd = 2, + // The new values (in the crop) get multiplied by the old ones: + // sample = old * new + // This blend mode is only supported if BlendColorSpace is kEncoded. The + // range of the new value matters for multiplication purposes, and its + // nominal range of 0..1 is computed the same way as this is done for the + // alpha values in kBlend and kAlphaWeightedAdd. + kMul = 3, + // The new values (in the crop) replace the old ones if alpha>0: + // For first alpha channel: + // alpha = old + new * (1 - old) + // For other channels if !alpha_associated: + // sample = ((1 - new_alpha) * old * old_alpha + new_alpha * new) / alpha + // For other channels if alpha_associated: + // sample = (1 - new_alpha) * old + new + // The alpha formula applies to the alpha used for the division in the other + // channels formula, and applies to the alpha channel itself if its + // blend_channel value matches itself. + // If using kBlendAbove, new is the patch and old is the original image; if + // using kBlendBelow, the meaning is inverted. + kBlendAbove = 4, + kBlendBelow = 5, + // The new values (in the crop) are added to the old ones if alpha>0: + // For first alpha channel: sample = sample = old + new * (1 - old) + // For other channels: sample = old + alpha * new + kAlphaWeightedAddAbove = 6, + kAlphaWeightedAddBelow = 7, + kNumBlendModes, +}; + +inline bool UsesAlpha(PatchBlendMode mode) { + return mode == PatchBlendMode::kBlendAbove || + mode == PatchBlendMode::kBlendBelow || + mode == PatchBlendMode::kAlphaWeightedAddAbove || + mode == PatchBlendMode::kAlphaWeightedAddBelow; +} +inline bool UsesClamp(PatchBlendMode mode) { + return UsesAlpha(mode) || mode == PatchBlendMode::kMul; +} + +struct PatchBlending { + PatchBlendMode mode; + uint32_t alpha_channel; + bool clamp; +}; + +// Position and size of the patch in the reference frame. +struct PatchReferencePosition { + size_t ref, x0, y0, xsize, ysize; +}; + +struct PatchPosition { + // Position of top-left corner of the patch in the image. + size_t x, y; + size_t ref_pos_idx; +}; + +struct PassesSharedState; + +// Encoder-side helper class to encode the PatchesDictionary. +class PatchDictionaryEncoder; + +class PatchDictionary { + public: + PatchDictionary() = default; + + void SetPassesSharedState(const PassesSharedState* shared) { + shared_ = shared; + } + + bool HasAny() const { return !positions_.empty(); } + + Status Decode(BitReader* br, size_t xsize, size_t ysize, + bool* uses_extra_channels); + + void Clear() { + positions_.clear(); + ComputePatchTree(); + } + + // Adds patches to a segment of `xsize` pixels, starting at `inout`, assumed + // to be located at position (x0, y) in the frame. + void AddOneRow(float* const* inout, size_t y, size_t x0, size_t xsize) const; + + // Returns dependencies of this patch dictionary on reference frame ids as a + // bit mask: bits 0-3 indicate reference frame 0-3. + int GetReferences() const; + + std::vector GetPatchesForRow(size_t y) const; + + private: + friend class PatchDictionaryEncoder; + + const PassesSharedState* shared_; + std::vector positions_; + std::vector ref_positions_; + std::vector blendings_; + + // Interval tree on the y coordinates of the patches. + struct PatchTreeNode { + ssize_t left_child; + ssize_t right_child; + size_t y_center; + // Range of patches in sorted_patches_y0_ and sorted_patches_y1_ that + // contain the row y_center. + size_t start; + size_t num; + }; + std::vector patch_tree_; + // Number of patches for each row. + std::vector num_patches_; + std::vector> sorted_patches_y0_; + std::vector> sorted_patches_y1_; + + void ComputePatchTree(); +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_PATCH_DICTIONARY_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h b/third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h new file mode 100644 index 0000000000..26bf643152 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h @@ -0,0 +1,234 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JXL_DEC_TONE_MAPPING_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_DEC_TONE_MAPPING_INL_H_ +#undef LIB_JXL_DEC_TONE_MAPPING_INL_H_ +#else +#define LIB_JXL_DEC_TONE_MAPPING_INL_H_ +#endif + +#include + +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Clamp; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::ZeroIfNegative; + +template +class Rec2408ToneMapper { + private: + using V = hwy::HWY_NAMESPACE::Vec; + + public: + explicit Rec2408ToneMapper(std::pair source_range, + std::pair target_range, + const float primaries_luminances[3]) + : source_range_(source_range), + target_range_(target_range), + red_Y_(primaries_luminances[0]), + green_Y_(primaries_luminances[1]), + blue_Y_(primaries_luminances[2]) {} + + void ToneMap(V* red, V* green, V* blue) const { + const V luminance = Mul(Set(df_, source_range_.second), + (MulAdd(Set(df_, red_Y_), *red, + MulAdd(Set(df_, green_Y_), *green, + Mul(Set(df_, blue_Y_), *blue))))); + const V pq_mastering_min = Set(df_, pq_mastering_min_); + const V inv_pq_mastering_range = Set(df_, inv_pq_mastering_range_); + const V normalized_pq = Min( + Set(df_, 1.f), + Mul(Sub(InvEOTF(luminance), pq_mastering_min), inv_pq_mastering_range)); + const V ks = Set(df_, ks_); + const V e2 = + IfThenElse(Lt(normalized_pq, ks), normalized_pq, P(normalized_pq)); + const V one_minus_e2 = Sub(Set(df_, 1), e2); + const V one_minus_e2_2 = Mul(one_minus_e2, one_minus_e2); + const V one_minus_e2_4 = Mul(one_minus_e2_2, one_minus_e2_2); + const V b = Set(df_, min_lum_); + const V e3 = MulAdd(b, one_minus_e2_4, e2); + const V pq_mastering_range = Set(df_, pq_mastering_range_); + const V e4 = MulAdd(e3, pq_mastering_range, pq_mastering_min); + const V new_luminance = + Min(Set(df_, target_range_.second), + ZeroIfNegative( + Mul(Set(df_, 10000), TF_PQ().DisplayFromEncoded(df_, e4)))); + + const V ratio = Div(new_luminance, luminance); + const V inv_target_peak = Set(df_, inv_target_peak_); + const V normalizer = Set(df_, normalizer_); + const V multiplier = Mul(ratio, normalizer); + for (V* const val : {red, green, blue}) { + *val = IfThenElse(Le(luminance, Set(df_, 1e-6f)), + Mul(new_luminance, inv_target_peak), + Mul(*val, multiplier)); + } + } + + private: + V InvEOTF(const V luminance) const { + return TF_PQ().EncodedFromDisplay(df_, + Mul(luminance, Set(df_, 1. / 10000))); + } + float InvEOTF(const float luminance) const { + return TF_PQ().EncodedFromDisplay(luminance / 10000.0f); + } + V T(const V a) const { + const V ks = Set(df_, ks_); + const V inv_one_minus_ks = Set(df_, inv_one_minus_ks_); + return Mul(Sub(a, ks), inv_one_minus_ks); + } + V P(const V b) const { + const V t_b = T(b); + const V t_b_2 = Mul(t_b, t_b); + const V t_b_3 = Mul(t_b_2, t_b); + const V ks = Set(df_, ks_); + const V max_lum = Set(df_, max_lum_); + return MulAdd( + MulAdd(Set(df_, 2), t_b_3, MulAdd(Set(df_, -3), t_b_2, Set(df_, 1))), + ks, + MulAdd(Add(t_b_3, MulAdd(Set(df_, -2), t_b_2, t_b)), + Sub(Set(df_, 1), ks), + Mul(MulAdd(Set(df_, -2), t_b_3, Mul(Set(df_, 3), t_b_2)), + max_lum))); + } + + D df_; + const std::pair source_range_; + const std::pair target_range_; + const float red_Y_; + const float green_Y_; + const float blue_Y_; + + const float pq_mastering_min_ = InvEOTF(source_range_.first); + const float pq_mastering_max_ = InvEOTF(source_range_.second); + const float pq_mastering_range_ = pq_mastering_max_ - pq_mastering_min_; + const float inv_pq_mastering_range_ = 1.0f / pq_mastering_range_; + // TODO(eustas): divide instead of inverse-multiply? + const float min_lum_ = (InvEOTF(target_range_.first) - pq_mastering_min_) * + inv_pq_mastering_range_; + // TODO(eustas): divide instead of inverse-multiply? + const float max_lum_ = (InvEOTF(target_range_.second) - pq_mastering_min_) * + inv_pq_mastering_range_; + const float ks_ = 1.5f * max_lum_ - 0.5f; + const float b_ = min_lum_; + + const float inv_one_minus_ks_ = 1.0f / std::max(1e-6f, 1.0f - ks_); + + const float normalizer_ = source_range_.second / target_range_.second; + const float inv_target_peak_ = 1.f / target_range_.second; +}; + +class HlgOOTF { + public: + explicit HlgOOTF(float source_luminance, float target_luminance, + const float primaries_luminances[3]) + : HlgOOTF(/*gamma=*/std::pow( + 1.111f, std::log2(target_luminance / source_luminance)), + primaries_luminances) {} + + static HlgOOTF FromSceneLight(float display_luminance, + const float primaries_luminances[3]) { + return HlgOOTF(/*gamma=*/1.2f * + std::pow(1.111f, std::log2(display_luminance / 1000.f)), + primaries_luminances); + } + + static HlgOOTF ToSceneLight(float display_luminance, + const float primaries_luminances[3]) { + return HlgOOTF( + /*gamma=*/(1 / 1.2f) * + std::pow(1.111f, -std::log2(display_luminance / 1000.f)), + primaries_luminances); + } + + template + void Apply(V* red, V* green, V* blue) const { + hwy::HWY_NAMESPACE::DFromV df; + if (!apply_ootf_) return; + const V luminance = + MulAdd(Set(df, red_Y_), *red, + MulAdd(Set(df, green_Y_), *green, Mul(Set(df, blue_Y_), *blue))); + const V ratio = + Min(FastPowf(df, luminance, Set(df, exponent_)), Set(df, 1e9)); + *red = Mul(*red, ratio); + *green = Mul(*green, ratio); + *blue = Mul(*blue, ratio); + } + + bool WarrantsGamutMapping() const { return apply_ootf_ && exponent_ < 0; } + + private: + explicit HlgOOTF(float gamma, const float luminances[3]) + : exponent_(gamma - 1), + red_Y_(luminances[0]), + green_Y_(luminances[1]), + blue_Y_(luminances[2]) {} + const float exponent_; + const bool apply_ootf_ = exponent_ < -0.01f || 0.01f < exponent_; + const float red_Y_; + const float green_Y_; + const float blue_Y_; +}; + +template +void GamutMap(V* red, V* green, V* blue, const float primaries_luminances[3], + float preserve_saturation = 0.1f) { + hwy::HWY_NAMESPACE::DFromV df; + const V luminance = + MulAdd(Set(df, primaries_luminances[0]), *red, + MulAdd(Set(df, primaries_luminances[1]), *green, + Mul(Set(df, primaries_luminances[2]), *blue))); + + // Desaturate out-of-gamut pixels. This is done by mixing each pixel + // with just enough gray of the target luminance to make all + // components non-negative. + // - For saturation preservation, if a component is still larger than + // 1 then the pixel is normalized to have a maximum component of 1. + // That will reduce its luminance. + // - For luminance preservation, getting all components below 1 is + // done by mixing in yet more gray. That will desaturate it further. + V gray_mix_saturation = Zero(df); + V gray_mix_luminance = Zero(df); + for (const V* ch : {red, green, blue}) { + const V& val = *ch; + const V inv_val_minus_gray = Div(Set(df, 1), (Sub(val, luminance))); + gray_mix_saturation = + IfThenElse(Ge(val, luminance), gray_mix_saturation, + Max(gray_mix_saturation, Mul(val, inv_val_minus_gray))); + gray_mix_luminance = + Max(gray_mix_luminance, + IfThenElse(Le(val, luminance), gray_mix_saturation, + Mul(Sub(val, Set(df, 1)), inv_val_minus_gray))); + } + const V gray_mix = Clamp( + MulAdd(Set(df, preserve_saturation), + Sub(gray_mix_saturation, gray_mix_luminance), gray_mix_luminance), + Zero(df), Set(df, 1)); + for (V* const val : {red, green, blue}) { + *val = MulAdd(gray_mix, Sub(luminance, *val), *val); + } + const V normalizer = + Div(Set(df, 1), Max(Set(df, 1), Max(*red, Max(*green, *blue)))); + for (V* const val : {red, green, blue}) { + *val = Mul(*val, normalizer); + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_DEC_TONE_MAPPING_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h b/third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h new file mode 100644 index 0000000000..075619b3b9 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h @@ -0,0 +1,853 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JXL_DEC_TRANSFORMS_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_DEC_TRANSFORMS_INL_H_ +#undef LIB_JXL_DEC_TRANSFORMS_INL_H_ +#else +#define LIB_JXL_DEC_TRANSFORMS_INL_H_ +#endif + +#include + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dct-inl.h" +#include "lib/jxl/dct_scales.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::MulAdd; + +// Computes the lowest-frequency LF_ROWSxLF_COLS-sized square in output, which +// is a DCT_ROWS*DCT_COLS-sized DCT block, by doing a ROWS*COLS DCT on the +// input block. +template +JXL_INLINE void ReinterpretingDCT(const float* input, const size_t input_stride, + float* output, const size_t output_stride) { + static_assert(LF_ROWS == ROWS, + "ReinterpretingDCT should only be called with LF == N"); + static_assert(LF_COLS == COLS, + "ReinterpretingDCT should only be called with LF == N"); + HWY_ALIGN float block[ROWS * COLS]; + + // ROWS, COLS <= 8, so we can put scratch space on the stack. + HWY_ALIGN float scratch_space[ROWS * COLS]; + ComputeScaledDCT()(DCTFrom(input, input_stride), block, + scratch_space); + if (ROWS < COLS) { + for (size_t y = 0; y < LF_ROWS; y++) { + for (size_t x = 0; x < LF_COLS; x++) { + output[y * output_stride + x] = + block[y * COLS + x] * DCTTotalResampleScale(y) * + DCTTotalResampleScale(x); + } + } + } else { + for (size_t y = 0; y < LF_COLS; y++) { + for (size_t x = 0; x < LF_ROWS; x++) { + output[y * output_stride + x] = + block[y * ROWS + x] * DCTTotalResampleScale(y) * + DCTTotalResampleScale(x); + } + } + } +} + +template +void IDCT2TopBlock(const float* block, size_t stride_out, float* out) { + static_assert(kBlockDim % S == 0, "S should be a divisor of kBlockDim"); + static_assert(S % 2 == 0, "S should be even"); + float temp[kDCTBlockSize]; + constexpr size_t num_2x2 = S / 2; + for (size_t y = 0; y < num_2x2; y++) { + for (size_t x = 0; x < num_2x2; x++) { + float c00 = block[y * kBlockDim + x]; + float c01 = block[y * kBlockDim + num_2x2 + x]; + float c10 = block[(y + num_2x2) * kBlockDim + x]; + float c11 = block[(y + num_2x2) * kBlockDim + num_2x2 + x]; + float r00 = c00 + c01 + c10 + c11; + float r01 = c00 + c01 - c10 - c11; + float r10 = c00 - c01 + c10 - c11; + float r11 = c00 - c01 - c10 + c11; + temp[y * 2 * kBlockDim + x * 2] = r00; + temp[y * 2 * kBlockDim + x * 2 + 1] = r01; + temp[(y * 2 + 1) * kBlockDim + x * 2] = r10; + temp[(y * 2 + 1) * kBlockDim + x * 2 + 1] = r11; + } + } + for (size_t y = 0; y < S; y++) { + for (size_t x = 0; x < S; x++) { + out[y * stride_out + x] = temp[y * kBlockDim + x]; + } + } +} + +void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels) { + HWY_ALIGN static constexpr float k4x4AFVBasis[16][16] = { + { + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + }, + { + 0.876902929799142f, + 0.2206518106944235f, + -0.10140050393753763f, + -0.1014005039375375f, + 0.2206518106944236f, + -0.10140050393753777f, + -0.10140050393753772f, + -0.10140050393753763f, + -0.10140050393753758f, + -0.10140050393753769f, + -0.1014005039375375f, + -0.10140050393753768f, + -0.10140050393753768f, + -0.10140050393753759f, + -0.10140050393753763f, + -0.10140050393753741f, + }, + { + 0.0, + 0.0, + 0.40670075830260755f, + 0.44444816619734445f, + 0.0, + 0.0, + 0.19574399372042936f, + 0.2929100136981264f, + -0.40670075830260716f, + -0.19574399372042872f, + 0.0, + 0.11379074460448091f, + -0.44444816619734384f, + -0.29291001369812636f, + -0.1137907446044814f, + 0.0, + }, + { + 0.0, + 0.0, + -0.21255748058288748f, + 0.3085497062849767f, + 0.0, + 0.4706702258572536f, + -0.1621205195722993f, + 0.0, + -0.21255748058287047f, + -0.16212051957228327f, + -0.47067022585725277f, + -0.1464291867126764f, + 0.3085497062849487f, + 0.0, + -0.14642918671266536f, + 0.4251149611657548f, + }, + { + 0.0, + -0.7071067811865474f, + 0.0, + 0.0, + 0.7071067811865476f, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + }, + { + -0.4105377591765233f, + 0.6235485373547691f, + -0.06435071657946274f, + -0.06435071657946266f, + 0.6235485373547694f, + -0.06435071657946284f, + -0.0643507165794628f, + -0.06435071657946274f, + -0.06435071657946272f, + -0.06435071657946279f, + -0.06435071657946266f, + -0.06435071657946277f, + -0.06435071657946277f, + -0.06435071657946273f, + -0.06435071657946274f, + -0.0643507165794626f, + }, + { + 0.0, + 0.0, + -0.4517556589999482f, + 0.15854503551840063f, + 0.0, + -0.04038515160822202f, + 0.0074182263792423875f, + 0.39351034269210167f, + -0.45175565899994635f, + 0.007418226379244351f, + 0.1107416575309343f, + 0.08298163094882051f, + 0.15854503551839705f, + 0.3935103426921022f, + 0.0829816309488214f, + -0.45175565899994796f, + }, + { + 0.0, + 0.0, + -0.304684750724869f, + 0.5112616136591823f, + 0.0, + 0.0, + -0.290480129728998f, + -0.06578701549142804f, + 0.304684750724884f, + 0.2904801297290076f, + 0.0, + -0.23889773523344604f, + -0.5112616136592012f, + 0.06578701549142545f, + 0.23889773523345467f, + 0.0, + }, + { + 0.0, + 0.0, + 0.3017929516615495f, + 0.25792362796341184f, + 0.0, + 0.16272340142866204f, + 0.09520022653475037f, + 0.0, + 0.3017929516615503f, + 0.09520022653475055f, + -0.16272340142866173f, + -0.35312385449816297f, + 0.25792362796341295f, + 0.0, + -0.3531238544981624f, + -0.6035859033230976f, + }, + { + 0.0, + 0.0, + 0.40824829046386274f, + 0.0, + 0.0, + 0.0, + 0.0, + -0.4082482904638628f, + -0.4082482904638635f, + 0.0, + 0.0, + -0.40824829046386296f, + 0.0, + 0.4082482904638634f, + 0.408248290463863f, + 0.0, + }, + { + 0.0, + 0.0, + 0.1747866975480809f, + 0.0812611176717539f, + 0.0, + 0.0, + -0.3675398009862027f, + -0.307882213957909f, + -0.17478669754808135f, + 0.3675398009862011f, + 0.0, + 0.4826689115059883f, + -0.08126111767175039f, + 0.30788221395790305f, + -0.48266891150598584f, + 0.0, + }, + { + 0.0, + 0.0, + -0.21105601049335784f, + 0.18567180916109802f, + 0.0, + 0.0, + 0.49215859013738733f, + -0.38525013709251915f, + 0.21105601049335806f, + -0.49215859013738905f, + 0.0, + 0.17419412659916217f, + -0.18567180916109904f, + 0.3852501370925211f, + -0.1741941265991621f, + 0.0, + }, + { + 0.0, + 0.0, + -0.14266084808807264f, + -0.3416446842253372f, + 0.0, + 0.7367497537172237f, + 0.24627107722075148f, + -0.08574019035519306f, + -0.14266084808807344f, + 0.24627107722075137f, + 0.14883399227113567f, + -0.04768680350229251f, + -0.3416446842253373f, + -0.08574019035519267f, + -0.047686803502292804f, + -0.14266084808807242f, + }, + { + 0.0, + 0.0, + -0.13813540350758585f, + 0.3302282550303788f, + 0.0, + 0.08755115000587084f, + -0.07946706605909573f, + -0.4613374887461511f, + -0.13813540350758294f, + -0.07946706605910261f, + 0.49724647109535086f, + 0.12538059448563663f, + 0.3302282550303805f, + -0.4613374887461554f, + 0.12538059448564315f, + -0.13813540350758452f, + }, + { + 0.0, + 0.0, + -0.17437602599651067f, + 0.0702790691196284f, + 0.0, + -0.2921026642334881f, + 0.3623817333531167f, + 0.0, + -0.1743760259965108f, + 0.36238173335311646f, + 0.29210266423348785f, + -0.4326608024727445f, + 0.07027906911962818f, + 0.0, + -0.4326608024727457f, + 0.34875205199302267f, + }, + { + 0.0, + 0.0, + 0.11354987314994337f, + -0.07417504595810355f, + 0.0, + 0.19402893032594343f, + -0.435190496523228f, + 0.21918684838857466f, + 0.11354987314994257f, + -0.4351904965232251f, + 0.5550443808910661f, + -0.25468277124066463f, + -0.07417504595810233f, + 0.2191868483885728f, + -0.25468277124066413f, + 0.1135498731499429f, + }, + }; + + const HWY_CAPPED(float, 16) d; + for (size_t i = 0; i < 16; i += Lanes(d)) { + auto pixel = Zero(d); + for (size_t j = 0; j < 16; j++) { + auto cf = Set(d, coeffs[j]); + auto basis = Load(d, k4x4AFVBasis[j] + i); + pixel = MulAdd(cf, basis, pixel); + } + Store(pixel, d, pixels + i); + } +} + +template +void AFVTransformToPixels(const float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT pixels, size_t pixels_stride) { + HWY_ALIGN float scratch_space[4 * 8]; + size_t afv_x = afv_kind & 1; + size_t afv_y = afv_kind / 2; + float dcs[3] = {}; + float block00 = coefficients[0]; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + dcs[0] = (block00 + block10 + block01) * 4.0f; + dcs[1] = (block00 + block10 - block01); + dcs[2] = block00 - block10; + // IAFV: (even, even) positions. + HWY_ALIGN float coeff[4 * 4]; + coeff[0] = dcs[0]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 0 && iy == 0) continue; + coeff[iy * 4 + ix] = coefficients[iy * 2 * 8 + ix * 2]; + } + } + HWY_ALIGN float block[4 * 8]; + AFVIDCT4x4(coeff, block); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + pixels[(iy + afv_y * 4) * pixels_stride + afv_x * 4 + ix] = + block[(afv_y == 1 ? 3 - iy : iy) * 4 + (afv_x == 1 ? 3 - ix : ix)]; + } + } + // IDCT4x4 in (odd, even) positions. + block[0] = dcs[1]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 0 && iy == 0) continue; + block[iy * 4 + ix] = coefficients[iy * 2 * 8 + ix * 2 + 1]; + } + } + ComputeScaledIDCT<4, 4>()( + block, + DCTTo(pixels + afv_y * 4 * pixels_stride + (afv_x == 1 ? 0 : 4), + pixels_stride), + scratch_space); + // IDCT4x8. + block[0] = dcs[2]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + if (ix == 0 && iy == 0) continue; + block[iy * 8 + ix] = coefficients[(1 + iy * 2) * 8 + ix]; + } + } + ComputeScaledIDCT<4, 8>()( + block, + DCTTo(pixels + (afv_y == 1 ? 0 : 4) * pixels_stride, pixels_stride), + scratch_space); +} + +HWY_MAYBE_UNUSED void TransformToPixels(const AcStrategy::Type strategy, + float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT pixels, + size_t pixels_stride, + float* scratch_space) { + using Type = AcStrategy::Type; + switch (strategy) { + case Type::IDENTITY: { + PROFILER_ZONE("IDCT Identity"); + float dcs[4] = {}; + float block00 = coefficients[0]; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + float block11 = coefficients[9]; + dcs[0] = block00 + block01 + block10 + block11; + dcs[1] = block00 + block01 - block10 - block11; + dcs[2] = block00 - block01 + block10 - block11; + dcs[3] = block00 - block01 - block10 + block11; + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + float block_dc = dcs[y * 2 + x]; + float residual_sum = 0; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 0 && iy == 0) continue; + residual_sum += coefficients[(y + iy * 2) * 8 + x + ix * 2]; + } + } + pixels[(4 * y + 1) * pixels_stride + 4 * x + 1] = + block_dc - residual_sum * (1.0f / 16); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 1 && iy == 1) continue; + pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix] = + coefficients[(y + iy * 2) * 8 + x + ix * 2] + + pixels[(4 * y + 1) * pixels_stride + 4 * x + 1]; + } + } + pixels[y * 4 * pixels_stride + x * 4] = + coefficients[(y + 2) * 8 + x + 2] + + pixels[(4 * y + 1) * pixels_stride + 4 * x + 1]; + } + } + break; + } + case Type::DCT8X4: { + PROFILER_ZONE("IDCT 8x4"); + float dcs[2] = {}; + float block0 = coefficients[0]; + float block1 = coefficients[8]; + dcs[0] = block0 + block1; + dcs[1] = block0 - block1; + for (size_t x = 0; x < 2; x++) { + HWY_ALIGN float block[4 * 8]; + block[0] = dcs[x]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + if (ix == 0 && iy == 0) continue; + block[iy * 8 + ix] = coefficients[(x + iy * 2) * 8 + ix]; + } + } + ComputeScaledIDCT<8, 4>()(block, DCTTo(pixels + x * 4, pixels_stride), + scratch_space); + } + break; + } + case Type::DCT4X8: { + PROFILER_ZONE("IDCT 4x8"); + float dcs[2] = {}; + float block0 = coefficients[0]; + float block1 = coefficients[8]; + dcs[0] = block0 + block1; + dcs[1] = block0 - block1; + for (size_t y = 0; y < 2; y++) { + HWY_ALIGN float block[4 * 8]; + block[0] = dcs[y]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + if (ix == 0 && iy == 0) continue; + block[iy * 8 + ix] = coefficients[(y + iy * 2) * 8 + ix]; + } + } + ComputeScaledIDCT<4, 8>()( + block, DCTTo(pixels + y * 4 * pixels_stride, pixels_stride), + scratch_space); + } + break; + } + case Type::DCT4X4: { + PROFILER_ZONE("IDCT 4"); + float dcs[4] = {}; + float block00 = coefficients[0]; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + float block11 = coefficients[9]; + dcs[0] = block00 + block01 + block10 + block11; + dcs[1] = block00 + block01 - block10 - block11; + dcs[2] = block00 - block01 + block10 - block11; + dcs[3] = block00 - block01 - block10 + block11; + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + HWY_ALIGN float block[4 * 4]; + block[0] = dcs[y * 2 + x]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 0 && iy == 0) continue; + block[iy * 4 + ix] = coefficients[(y + iy * 2) * 8 + x + ix * 2]; + } + } + ComputeScaledIDCT<4, 4>()( + block, + DCTTo(pixels + y * 4 * pixels_stride + x * 4, pixels_stride), + scratch_space); + } + } + break; + } + case Type::DCT2X2: { + PROFILER_ZONE("IDCT 2"); + HWY_ALIGN float coeffs[kDCTBlockSize]; + memcpy(coeffs, coefficients, sizeof(float) * kDCTBlockSize); + IDCT2TopBlock<2>(coeffs, kBlockDim, coeffs); + IDCT2TopBlock<4>(coeffs, kBlockDim, coeffs); + IDCT2TopBlock<8>(coeffs, kBlockDim, coeffs); + for (size_t y = 0; y < kBlockDim; y++) { + for (size_t x = 0; x < kBlockDim; x++) { + pixels[y * pixels_stride + x] = coeffs[y * kBlockDim + x]; + } + } + break; + } + case Type::DCT16X16: { + PROFILER_ZONE("IDCT 16"); + ComputeScaledIDCT<16, 16>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT16X8: { + PROFILER_ZONE("IDCT 16x8"); + ComputeScaledIDCT<16, 8>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT8X16: { + PROFILER_ZONE("IDCT 8x16"); + ComputeScaledIDCT<8, 16>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT32X8: { + PROFILER_ZONE("IDCT 32x8"); + ComputeScaledIDCT<32, 8>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT8X32: { + PROFILER_ZONE("IDCT 8x32"); + ComputeScaledIDCT<8, 32>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT32X16: { + PROFILER_ZONE("IDCT 32x16"); + ComputeScaledIDCT<32, 16>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT16X32: { + PROFILER_ZONE("IDCT 16x32"); + ComputeScaledIDCT<16, 32>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT32X32: { + PROFILER_ZONE("IDCT 32"); + ComputeScaledIDCT<32, 32>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT: { + PROFILER_ZONE("IDCT 8"); + ComputeScaledIDCT<8, 8>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::AFV0: { + PROFILER_ZONE("IAFV0"); + AFVTransformToPixels<0>(coefficients, pixels, pixels_stride); + break; + } + case Type::AFV1: { + PROFILER_ZONE("IAFV1"); + AFVTransformToPixels<1>(coefficients, pixels, pixels_stride); + break; + } + case Type::AFV2: { + PROFILER_ZONE("IAFV2"); + AFVTransformToPixels<2>(coefficients, pixels, pixels_stride); + break; + } + case Type::AFV3: { + PROFILER_ZONE("IAFV3"); + AFVTransformToPixels<3>(coefficients, pixels, pixels_stride); + break; + } + case Type::DCT64X32: { + PROFILER_ZONE("IDCT 64x32"); + ComputeScaledIDCT<64, 32>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT32X64: { + PROFILER_ZONE("IDCT 32x64"); + ComputeScaledIDCT<32, 64>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT64X64: { + PROFILER_ZONE("IDCT 64"); + ComputeScaledIDCT<64, 64>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT128X64: { + PROFILER_ZONE("IDCT 128x64"); + ComputeScaledIDCT<128, 64>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT64X128: { + PROFILER_ZONE("IDCT 64x128"); + ComputeScaledIDCT<64, 128>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT128X128: { + PROFILER_ZONE("IDCT 128"); + ComputeScaledIDCT<128, 128>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT256X128: { + PROFILER_ZONE("IDCT 256x128"); + ComputeScaledIDCT<256, 128>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT128X256: { + PROFILER_ZONE("IDCT 128x256"); + ComputeScaledIDCT<128, 256>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT256X256: { + PROFILER_ZONE("IDCT 256"); + ComputeScaledIDCT<256, 256>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::kNumValidStrategies: + JXL_ABORT("Invalid strategy"); + } +} + +HWY_MAYBE_UNUSED void LowestFrequenciesFromDC(const AcStrategy::Type strategy, + const float* dc, size_t dc_stride, + float* llf) { + using Type = AcStrategy::Type; + switch (strategy) { + case Type::DCT16X8: { + ReinterpretingDCT( + dc, dc_stride, llf, 2 * kBlockDim); + break; + } + case Type::DCT8X16: { + ReinterpretingDCT( + dc, dc_stride, llf, 2 * kBlockDim); + break; + } + case Type::DCT16X16: { + ReinterpretingDCT( + dc, dc_stride, llf, 2 * kBlockDim); + break; + } + case Type::DCT32X8: { + ReinterpretingDCT( + dc, dc_stride, llf, 4 * kBlockDim); + break; + } + case Type::DCT8X32: { + ReinterpretingDCT( + dc, dc_stride, llf, 4 * kBlockDim); + break; + } + case Type::DCT32X16: { + ReinterpretingDCT( + dc, dc_stride, llf, 4 * kBlockDim); + break; + } + case Type::DCT16X32: { + ReinterpretingDCT( + dc, dc_stride, llf, 4 * kBlockDim); + break; + } + case Type::DCT32X32: { + ReinterpretingDCT( + dc, dc_stride, llf, 4 * kBlockDim); + break; + } + case Type::DCT64X32: { + ReinterpretingDCT( + dc, dc_stride, llf, 8 * kBlockDim); + break; + } + case Type::DCT32X64: { + ReinterpretingDCT( + dc, dc_stride, llf, 8 * kBlockDim); + break; + } + case Type::DCT64X64: { + ReinterpretingDCT( + dc, dc_stride, llf, 8 * kBlockDim); + break; + } + case Type::DCT128X64: { + ReinterpretingDCT( + dc, dc_stride, llf, 16 * kBlockDim); + break; + } + case Type::DCT64X128: { + ReinterpretingDCT( + dc, dc_stride, llf, 16 * kBlockDim); + break; + } + case Type::DCT128X128: { + ReinterpretingDCT< + /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim, + /*LF_ROWS=*/16, /*LF_COLS=*/16, /*ROWS=*/16, /*COLS=*/16>( + dc, dc_stride, llf, 16 * kBlockDim); + break; + } + case Type::DCT256X128: { + ReinterpretingDCT< + /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim, + /*LF_ROWS=*/32, /*LF_COLS=*/16, /*ROWS=*/32, /*COLS=*/16>( + dc, dc_stride, llf, 32 * kBlockDim); + break; + } + case Type::DCT128X256: { + ReinterpretingDCT< + /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim, + /*LF_ROWS=*/16, /*LF_COLS=*/32, /*ROWS=*/16, /*COLS=*/32>( + dc, dc_stride, llf, 32 * kBlockDim); + break; + } + case Type::DCT256X256: { + ReinterpretingDCT< + /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim, + /*LF_ROWS=*/32, /*LF_COLS=*/32, /*ROWS=*/32, /*COLS=*/32>( + dc, dc_stride, llf, 32 * kBlockDim); + break; + } + case Type::DCT: + case Type::DCT2X2: + case Type::DCT4X4: + case Type::DCT4X8: + case Type::DCT8X4: + case Type::AFV0: + case Type::AFV1: + case Type::AFV2: + case Type::AFV3: + case Type::IDENTITY: + llf[0] = dc[0]; + break; + case Type::kNumValidStrategies: + JXL_ABORT("Invalid strategy"); + }; +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_DEC_TRANSFORMS_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc new file mode 100644 index 0000000000..9ee80c59dc --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc @@ -0,0 +1,41 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_transforms_testonly.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_transforms_testonly.cc" +#include +#include + +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dec_transforms-inl.h" + +namespace jxl { + +#if HWY_ONCE +HWY_EXPORT(TransformToPixels); +void TransformToPixels(AcStrategy::Type strategy, + float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT pixels, size_t pixels_stride, + float* scratch_space) { + return HWY_DYNAMIC_DISPATCH(TransformToPixels)(strategy, coefficients, pixels, + pixels_stride, scratch_space); +} + +HWY_EXPORT(LowestFrequenciesFromDC); +void LowestFrequenciesFromDC(const jxl::AcStrategy::Type strategy, + const float* dc, size_t dc_stride, float* llf) { + return HWY_DYNAMIC_DISPATCH(LowestFrequenciesFromDC)(strategy, dc, dc_stride, + llf); +} + +HWY_EXPORT(AFVIDCT4x4); +void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels) { + return HWY_DYNAMIC_DISPATCH(AFVIDCT4x4)(coeffs, pixels); +} +#endif // HWY_ONCE + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h new file mode 100644 index 0000000000..97c4ca543d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_ +#define LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_ + +// Facade for (non-inlined) inverse integral transforms. + +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +void TransformToPixels(AcStrategy::Type strategy, + float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT pixels, size_t pixels_stride, + float* JXL_RESTRICT scratch_space); + +// Equivalent of the above for DC image. +void LowestFrequenciesFromDC(const jxl::AcStrategy::Type strategy, + const float* dc, size_t dc_stride, float* llf); + +void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels); + +} // namespace jxl + +#endif // LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h b/third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h new file mode 100644 index 0000000000..a4f24cd123 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h @@ -0,0 +1,346 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// XYB -> linear sRGB helper function. + +#if defined(LIB_JXL_DEC_XYB_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_DEC_XYB_INL_H_ +#undef LIB_JXL_DEC_XYB_INL_H_ +#else +#define LIB_JXL_DEC_XYB_INL_H_ +#endif + +#include + +#include "lib/jxl/dec_xyb.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Broadcast; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Sub; + +// Inverts the pixel-wise RGB->XYB conversion in OpsinDynamicsImage() (including +// the gamma mixing and simple gamma). Avoids clamping to [0, 1] - out of (sRGB) +// gamut values may be in-gamut after transforming to a wider space. +// "inverse_matrix" points to 9 broadcasted vectors, which are the 3x3 entries +// of the (row-major) opsin absorbance matrix inverse. Pre-multiplying its +// entries by c is equivalent to multiplying linear_* by c afterwards. +template +HWY_INLINE HWY_MAYBE_UNUSED void XybToRgb(D d, const V opsin_x, const V opsin_y, + const V opsin_b, + const OpsinParams& opsin_params, + V* const HWY_RESTRICT linear_r, + V* const HWY_RESTRICT linear_g, + V* const HWY_RESTRICT linear_b) { +#if HWY_TARGET == HWY_SCALAR + const auto neg_bias_r = Set(d, opsin_params.opsin_biases[0]); + const auto neg_bias_g = Set(d, opsin_params.opsin_biases[1]); + const auto neg_bias_b = Set(d, opsin_params.opsin_biases[2]); +#else + const auto neg_bias_rgb = LoadDup128(d, opsin_params.opsin_biases); + const auto neg_bias_r = Broadcast<0>(neg_bias_rgb); + const auto neg_bias_g = Broadcast<1>(neg_bias_rgb); + const auto neg_bias_b = Broadcast<2>(neg_bias_rgb); +#endif + + // Color space: XYB -> RGB + auto gamma_r = Add(opsin_y, opsin_x); + auto gamma_g = Sub(opsin_y, opsin_x); + auto gamma_b = opsin_b; + + gamma_r = Sub(gamma_r, Set(d, opsin_params.opsin_biases_cbrt[0])); + gamma_g = Sub(gamma_g, Set(d, opsin_params.opsin_biases_cbrt[1])); + gamma_b = Sub(gamma_b, Set(d, opsin_params.opsin_biases_cbrt[2])); + + // Undo gamma compression: linear = gamma^3 for efficiency. + const auto gamma_r2 = Mul(gamma_r, gamma_r); + const auto gamma_g2 = Mul(gamma_g, gamma_g); + const auto gamma_b2 = Mul(gamma_b, gamma_b); + const auto mixed_r = MulAdd(gamma_r2, gamma_r, neg_bias_r); + const auto mixed_g = MulAdd(gamma_g2, gamma_g, neg_bias_g); + const auto mixed_b = MulAdd(gamma_b2, gamma_b, neg_bias_b); + + const float* HWY_RESTRICT inverse_matrix = opsin_params.inverse_opsin_matrix; + + // Unmix (multiply by 3x3 inverse_matrix) + // TODO(eustas): ref would be more readable than pointer + *linear_r = Mul(LoadDup128(d, &inverse_matrix[0 * 4]), mixed_r); + *linear_g = Mul(LoadDup128(d, &inverse_matrix[3 * 4]), mixed_r); + *linear_b = Mul(LoadDup128(d, &inverse_matrix[6 * 4]), mixed_r); + *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[1 * 4]), mixed_g, *linear_r); + *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[4 * 4]), mixed_g, *linear_g); + *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[7 * 4]), mixed_g, *linear_b); + *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[2 * 4]), mixed_b, *linear_r); + *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[5 * 4]), mixed_b, *linear_g); + *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[8 * 4]), mixed_b, *linear_b); +} + +static inline HWY_MAYBE_UNUSED bool HasFastXYBTosRGB8() { +#if HWY_TARGET == HWY_NEON + return true; +#else + return false; +#endif +} + +static inline HWY_MAYBE_UNUSED void FastXYBTosRGB8(const float* input[4], + uint8_t* output, + bool is_rgba, size_t xsize) { + // This function is very NEON-specific. As such, it uses intrinsics directly. +#if HWY_TARGET == HWY_NEON + // WARNING: doing fixed point arithmetic correctly is very complicated. + // Changes to this function should be thoroughly tested. + + // Note that the input is assumed to have 13 bits of mantissa, and the output + // will have 14 bits. + auto srgb_tf = [&](int16x8_t v16) { + int16x8_t clz = vclzq_s16(v16); + // Convert to [0.25, 0.5) range. + int16x8_t v025_05_16 = vqshlq_s16(v16, vqsubq_s16(clz, vdupq_n_s16(2))); + + // third degree polynomial approximation between 0.25 and 0.5 + // of 1.055/2^(7/2.4) * x^(1/2.4) / 32. + // poly ~ ((0.95x-1.75)*x+1.72)*x+0.29 + // We actually compute ~ ((0.47x-0.87)*x+0.86)*(2x)+0.29 as 1.75 and 1.72 + // overflow our fixed point representation. + + int16x8_t twov = vqaddq_s16(v025_05_16, v025_05_16); + + // 0.47 * x + int16x8_t step1 = vqrdmulhq_n_s16(v025_05_16, 15706); + // - 0.87 + int16x8_t step2 = vsubq_s16(step1, vdupq_n_s16(28546)); + // * x + int16x8_t step3 = vqrdmulhq_s16(step2, v025_05_16); + // + 0.86 + int16x8_t step4 = vaddq_s16(step3, vdupq_n_s16(28302)); + // * 2x + int16x8_t step5 = vqrdmulhq_s16(step4, twov); + // + 0.29 + int16x8_t mul16 = vaddq_s16(step5, vdupq_n_s16(9485)); + + int16x8_t exp16 = vsubq_s16(vdupq_n_s16(11), clz); + // Compute 2**(1/2.4*exp16)/32. Values of exp16 that would overflow are + // capped to 1. + // Generated with the following Python script: + // a = [] + // b = [] + // + // for i in range(0, 16): + // v = 2**(5/12.*i) + // v /= 16 + // v *= 256 * 128 + // v = int(v) + // a.append(v // 256) + // b.append(v % 256) + // + // print(", ".join("0x%02x" % x for x in a)) + // + // print(", ".join("0x%02x" % x for x in b)) + + HWY_ALIGN constexpr uint8_t k2to512powersm1div32_high[16] = { + 0x08, 0x0a, 0x0e, 0x13, 0x19, 0x21, 0x2d, 0x3c, + 0x50, 0x6b, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, + }; + HWY_ALIGN constexpr uint8_t k2to512powersm1div32_low[16] = { + 0x00, 0xad, 0x41, 0x06, 0x65, 0xe7, 0x41, 0x68, + 0xa2, 0xa2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }; + // Using the highway implementation here since vqtbl1q is aarch64-only. + using hwy::HWY_NAMESPACE::Vec128; + uint8x16_t pow_low = + TableLookupBytes( + Vec128(vld1q_u8(k2to512powersm1div32_low)), + Vec128(vreinterpretq_u8_s16(exp16))) + .raw; + uint8x16_t pow_high = + TableLookupBytes( + Vec128(vld1q_u8(k2to512powersm1div32_high)), + Vec128(vreinterpretq_u8_s16(exp16))) + .raw; + int16x8_t pow16 = vreinterpretq_s16_u16(vsliq_n_u16( + vreinterpretq_u16_u8(pow_low), vreinterpretq_u16_u8(pow_high), 8)); + + // approximation of v * 12.92, divided by 2 + // Note that our input is using 13 mantissa bits instead of 15. + int16x8_t v16_linear = vrshrq_n_s16(vmulq_n_s16(v16, 826), 5); + // 1.055*pow(v, 1/2.4) - 0.055, divided by 2 + auto v16_pow = vsubq_s16(vqrdmulhq_s16(mul16, pow16), vdupq_n_s16(901)); + // > 0.0031308f (note that v16 has 13 mantissa bits) + return vbslq_s16(vcgeq_s16(v16, vdupq_n_s16(26)), v16_pow, v16_linear); + }; + + const float* JXL_RESTRICT row_in_x = input[0]; + const float* JXL_RESTRICT row_in_y = input[1]; + const float* JXL_RESTRICT row_in_b = input[2]; + const float* JXL_RESTRICT row_in_a = input[3]; + for (size_t x = 0; x < xsize; x += 8) { + // Normal ranges for xyb for in-gamut sRGB colors: + // x: -0.015386 0.028100 + // y: 0.000000 0.845308 + // b: 0.000000 0.845308 + + // We actually want x * 8 to have some extra precision. + // TODO(veluca): consider different approaches here, like vld1q_f32_x2. + float32x4_t opsin_x_left = vld1q_f32(row_in_x + x); + int16x4_t opsin_x16_times8_left = + vqmovn_s32(vcvtq_n_s32_f32(opsin_x_left, 18)); + float32x4_t opsin_x_right = + vld1q_f32(row_in_x + x + (x + 4 < xsize ? 4 : 0)); + int16x4_t opsin_x16_times8_right = + vqmovn_s32(vcvtq_n_s32_f32(opsin_x_right, 18)); + int16x8_t opsin_x16_times8 = + vcombine_s16(opsin_x16_times8_left, opsin_x16_times8_right); + + float32x4_t opsin_y_left = vld1q_f32(row_in_y + x); + int16x4_t opsin_y16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_left, 15)); + float32x4_t opsin_y_right = + vld1q_f32(row_in_y + x + (x + 4 < xsize ? 4 : 0)); + int16x4_t opsin_y16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_right, 15)); + int16x8_t opsin_y16 = vcombine_s16(opsin_y16_left, opsin_y16_right); + + float32x4_t opsin_b_left = vld1q_f32(row_in_b + x); + int16x4_t opsin_b16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_left, 15)); + float32x4_t opsin_b_right = + vld1q_f32(row_in_b + x + (x + 4 < xsize ? 4 : 0)); + int16x4_t opsin_b16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_right, 15)); + int16x8_t opsin_b16 = vcombine_s16(opsin_b16_left, opsin_b16_right); + + int16x8_t neg_bias16 = vdupq_n_s16(-124); // -0.0037930732552754493 + int16x8_t neg_bias_cbrt16 = vdupq_n_s16(-5110); // -0.155954201 + int16x8_t neg_bias_half16 = vdupq_n_s16(-62); + + // Color space: XYB -> RGB + // Compute ((y+x-bias_cbrt)^3-(y-x-bias_cbrt)^3)/2, + // ((y+x-bias_cbrt)^3+(y-x-bias_cbrt)^3)/2+bias, (b-bias_cbrt)^3+bias. + // Note that ignoring x2 in the formulas below (as x << y) results in + // errors of at least 3 in the final sRGB values. + int16x8_t opsin_yp16 = vqsubq_s16(opsin_y16, neg_bias_cbrt16); + int16x8_t ysq16 = vqrdmulhq_s16(opsin_yp16, opsin_yp16); + int16x8_t twentyfourx16 = vmulq_n_s16(opsin_x16_times8, 3); + int16x8_t twentyfourxy16 = vqrdmulhq_s16(opsin_yp16, twentyfourx16); + int16x8_t threexsq16 = + vrshrq_n_s16(vqrdmulhq_s16(opsin_x16_times8, twentyfourx16), 6); + + // We can ignore x^3 here. Note that this is multiplied by 8. + int16x8_t mixed_rmg16 = vqrdmulhq_s16(twentyfourxy16, opsin_yp16); + + int16x8_t mixed_rpg_sos_half = vhaddq_s16(ysq16, threexsq16); + int16x8_t mixed_rpg16 = vhaddq_s16( + vqrdmulhq_s16(opsin_yp16, mixed_rpg_sos_half), neg_bias_half16); + + int16x8_t gamma_b16 = vqsubq_s16(opsin_b16, neg_bias_cbrt16); + int16x8_t gamma_bsq16 = vqrdmulhq_s16(gamma_b16, gamma_b16); + int16x8_t gamma_bcb16 = vqrdmulhq_s16(gamma_bsq16, gamma_b16); + int16x8_t mixed_b16 = vqaddq_s16(gamma_bcb16, neg_bias16); + // mixed_rpg and mixed_b are in 0-1 range. + // mixed_rmg has a smaller range (-0.035 to 0.035 for valid sRGB). Note + // that at this point it is already multiplied by 8. + + // We multiply all the mixed values by 1/4 (i.e. shift them to 13-bit + // fixed point) to ensure intermediate quantities are in range. Note that + // r-g is not shifted, and was x8 before here; this corresponds to a x32 + // overall multiplicative factor and ensures that all the matrix constants + // are in 0-1 range. + // Similarly, mixed_rpg16 is already multiplied by 1/4 because of the two + // vhadd + using neg_bias_half. + mixed_b16 = vshrq_n_s16(mixed_b16, 2); + + // Unmix (multiply by 3x3 inverse_matrix) + // For increased precision, we use a matrix for converting from + // ((mixed_r - mixed_g)/2, (mixed_r + mixed_g)/2, mixed_b) to rgb. This + // avoids cancellation effects when computing (y+x)^3-(y-x)^3. + // We compute mixed_rpg - mixed_b because the (1+c)*mixed_rpg - c * + // mixed_b pattern is repeated frequently in the code below. This allows + // us to save a multiply per channel, and removes the presence of + // some constants above 1. Moreover, mixed_rmg - mixed_b is in (-1, 1) + // range, so the subtraction is safe. + // All the magic-looking constants here are derived by computing the + // inverse opsin matrix for the transformation modified as described + // above. + + // Precomputation common to multiple color values. + int16x8_t mixed_rpgmb16 = vqsubq_s16(mixed_rpg16, mixed_b16); + int16x8_t mixed_rpgmb_times_016 = vqrdmulhq_n_s16(mixed_rpgmb16, 5394); + int16x8_t mixed_rg16 = vqaddq_s16(mixed_rpgmb_times_016, mixed_rpg16); + + // R + int16x8_t linear_r16 = + vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, 21400)); + + // G + int16x8_t linear_g16 = + vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, -7857)); + + // B + int16x8_t linear_b16 = vqrdmulhq_n_s16(mixed_rpgmb16, -30996); + linear_b16 = vqaddq_s16(linear_b16, mixed_b16); + linear_b16 = vqaddq_s16(linear_b16, vqrdmulhq_n_s16(mixed_rmg16, -6525)); + + // Apply SRGB transfer function. + int16x8_t r = srgb_tf(linear_r16); + int16x8_t g = srgb_tf(linear_g16); + int16x8_t b = srgb_tf(linear_b16); + + uint8x8_t r8 = + vqmovun_s16(vrshrq_n_s16(vsubq_s16(r, vshrq_n_s16(r, 8)), 6)); + uint8x8_t g8 = + vqmovun_s16(vrshrq_n_s16(vsubq_s16(g, vshrq_n_s16(g, 8)), 6)); + uint8x8_t b8 = + vqmovun_s16(vrshrq_n_s16(vsubq_s16(b, vshrq_n_s16(b, 8)), 6)); + + size_t n = xsize - x; + if (is_rgba) { + float32x4_t a_f32_left = + row_in_a ? vld1q_f32(row_in_a + x) : vdupq_n_f32(1.0f); + float32x4_t a_f32_right = + row_in_a ? vld1q_f32(row_in_a + x + (x + 4 < xsize ? 4 : 0)) + : vdupq_n_f32(1.0f); + int16x4_t a16_left = vqmovn_s32(vcvtq_n_s32_f32(a_f32_left, 8)); + int16x4_t a16_right = vqmovn_s32(vcvtq_n_s32_f32(a_f32_right, 8)); + uint8x8_t a8 = vqmovun_s16(vcombine_s16(a16_left, a16_right)); + uint8_t* buf = output + 4 * x; + uint8x8x4_t data = {r8, g8, b8, a8}; + if (n >= 8) { + vst4_u8(buf, data); + } else { + uint8_t tmp[8 * 4]; + vst4_u8(tmp, data); + memcpy(buf, tmp, n * 4); + } + } else { + uint8_t* buf = output + 3 * x; + uint8x8x3_t data = {r8, g8, b8}; + if (n >= 8) { + vst3_u8(buf, data); + } else { + uint8_t tmp[8 * 3]; + vst3_u8(tmp, data); + memcpy(buf, tmp, n * 3); + } + } + } +#else + (void)input; + (void)output; + (void)is_rgba; + (void)xsize; + JXL_ABORT("Unreachable"); +#endif +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_DEC_XYB_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/dec_xyb.cc b/third_party/jpeg-xl/lib/jxl/dec_xyb.cc new file mode 100644 index 0000000000..46fc63c49e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_xyb.cc @@ -0,0 +1,329 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_xyb.h" + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_xyb.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_group_border.h" +#include "lib/jxl/dec_xyb-inl.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/matrix_ops.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/sanitizers.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::MulAdd; + +void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool, + const OpsinParams& opsin_params) { + PROFILER_FUNC; + JXL_CHECK_IMAGE_INITIALIZED(*inout, Rect(*inout)); + + const size_t xsize = inout->xsize(); // not padded + JXL_CHECK(RunOnPool( + pool, 0, inout->ysize(), ThreadPool::NoInit, + [&](const uint32_t task, size_t /* thread */) { + const size_t y = task; + + // Faster than adding via ByteOffset at end of loop. + float* JXL_RESTRICT row0 = inout->PlaneRow(0, y); + float* JXL_RESTRICT row1 = inout->PlaneRow(1, y); + float* JXL_RESTRICT row2 = inout->PlaneRow(2, y); + + const HWY_FULL(float) d; + + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto in_opsin_x = Load(d, row0 + x); + const auto in_opsin_y = Load(d, row1 + x); + const auto in_opsin_b = Load(d, row2 + x); + auto linear_r = Undefined(d); + auto linear_g = Undefined(d); + auto linear_b = Undefined(d); + XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params, + &linear_r, &linear_g, &linear_b); + + Store(linear_r, d, row0 + x); + Store(linear_g, d, row1 + x); + Store(linear_b, d, row2 + x); + } + }, + "OpsinToLinear")); +} + +// Same, but not in-place. +void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool, + Image3F* JXL_RESTRICT linear, + const OpsinParams& opsin_params) { + PROFILER_FUNC; + + JXL_ASSERT(SameSize(rect, *linear)); + JXL_CHECK_IMAGE_INITIALIZED(opsin, rect); + + JXL_CHECK(RunOnPool( + pool, 0, static_cast(rect.ysize()), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const size_t y = static_cast(task); + + // Faster than adding via ByteOffset at end of loop. + const float* JXL_RESTRICT row_opsin_0 = rect.ConstPlaneRow(opsin, 0, y); + const float* JXL_RESTRICT row_opsin_1 = rect.ConstPlaneRow(opsin, 1, y); + const float* JXL_RESTRICT row_opsin_2 = rect.ConstPlaneRow(opsin, 2, y); + float* JXL_RESTRICT row_linear_0 = linear->PlaneRow(0, y); + float* JXL_RESTRICT row_linear_1 = linear->PlaneRow(1, y); + float* JXL_RESTRICT row_linear_2 = linear->PlaneRow(2, y); + + const HWY_FULL(float) d; + + for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) { + const auto in_opsin_x = Load(d, row_opsin_0 + x); + const auto in_opsin_y = Load(d, row_opsin_1 + x); + const auto in_opsin_b = Load(d, row_opsin_2 + x); + auto linear_r = Undefined(d); + auto linear_g = Undefined(d); + auto linear_b = Undefined(d); + XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params, + &linear_r, &linear_g, &linear_b); + + Store(linear_r, d, row_linear_0 + x); + Store(linear_g, d, row_linear_1 + x); + Store(linear_b, d, row_linear_2 + x); + } + }, + "OpsinToLinear(Rect)")); + JXL_CHECK_IMAGE_INITIALIZED(*linear, rect); +} + +// Transform YCbCr to RGB. +// Could be performed in-place (i.e. Y, Cb and Cr could alias R, B and B). +void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect) { + JXL_CHECK_IMAGE_INITIALIZED(ycbcr, rect); + const HWY_CAPPED(float, kBlockDim) df; + const size_t S = Lanes(df); // Step. + + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + if ((xsize == 0) || (ysize == 0)) return; + + // Full-range BT.601 as defined by JFIF Clause 7: + // https://www.itu.int/rec/T-REC-T.871-201105-I/en + const auto c128 = Set(df, 128.0f / 255); + const auto crcr = Set(df, 1.402f); + const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f); + const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f); + const auto cbcb = Set(df, 1.772f); + + for (size_t y = 0; y < ysize; y++) { + const float* y_row = rect.ConstPlaneRow(ycbcr, 1, y); + const float* cb_row = rect.ConstPlaneRow(ycbcr, 0, y); + const float* cr_row = rect.ConstPlaneRow(ycbcr, 2, y); + float* r_row = rect.PlaneRow(rgb, 0, y); + float* g_row = rect.PlaneRow(rgb, 1, y); + float* b_row = rect.PlaneRow(rgb, 2, y); + for (size_t x = 0; x < xsize; x += S) { + const auto y_vec = Add(Load(df, y_row + x), c128); + const auto cb_vec = Load(df, cb_row + x); + const auto cr_vec = Load(df, cr_row + x); + const auto r_vec = MulAdd(crcr, cr_vec, y_vec); + const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec)); + const auto b_vec = MulAdd(cbcb, cb_vec, y_vec); + Store(r_vec, df, r_row + x); + Store(g_vec, df, g_row + x); + Store(b_vec, df, b_row + x); + } + } + JXL_CHECK_IMAGE_INITIALIZED(*rgb, rect); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(OpsinToLinearInplace); +void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool, + const OpsinParams& opsin_params) { + return HWY_DYNAMIC_DISPATCH(OpsinToLinearInplace)(inout, pool, opsin_params); +} + +HWY_EXPORT(OpsinToLinear); +void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool, + Image3F* JXL_RESTRICT linear, + const OpsinParams& opsin_params) { + return HWY_DYNAMIC_DISPATCH(OpsinToLinear)(opsin, rect, pool, linear, + opsin_params); +} + +HWY_EXPORT(YcbcrToRgb); +void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect) { + return HWY_DYNAMIC_DISPATCH(YcbcrToRgb)(ycbcr, rgb, rect); +} + +HWY_EXPORT(HasFastXYBTosRGB8); +bool HasFastXYBTosRGB8() { return HWY_DYNAMIC_DISPATCH(HasFastXYBTosRGB8)(); } + +HWY_EXPORT(FastXYBTosRGB8); +void FastXYBTosRGB8(const float* input[4], uint8_t* output, bool is_rgba, + size_t xsize) { + return HWY_DYNAMIC_DISPATCH(FastXYBTosRGB8)(input, output, is_rgba, xsize); +} + +void OpsinParams::Init(float intensity_target) { + InitSIMDInverseMatrix(GetOpsinAbsorbanceInverseMatrix(), inverse_opsin_matrix, + intensity_target); + memcpy(opsin_biases, kNegOpsinAbsorbanceBiasRGB, + sizeof(kNegOpsinAbsorbanceBiasRGB)); + memcpy(quant_biases, kDefaultQuantBias, sizeof(kDefaultQuantBias)); + for (size_t c = 0; c < 4; c++) { + opsin_biases_cbrt[c] = cbrtf(opsin_biases[c]); + } +} + +bool CanOutputToColorEncoding(const ColorEncoding& c_desired) { + if (!c_desired.HaveFields()) { + return false; + } + // TODO(veluca): keep in sync with dec_reconstruct.cc + if (!c_desired.tf.IsPQ() && !c_desired.tf.IsSRGB() && + !c_desired.tf.IsGamma() && !c_desired.tf.IsLinear() && + !c_desired.tf.IsHLG() && !c_desired.tf.IsDCI() && !c_desired.tf.Is709()) { + return false; + } + if (c_desired.IsGray() && c_desired.white_point != WhitePoint::kD65) { + // TODO(veluca): figure out what should happen here. + return false; + } + return true; +} + +Status OutputEncodingInfo::SetFromMetadata(const CodecMetadata& metadata) { + orig_color_encoding = metadata.m.color_encoding; + orig_intensity_target = metadata.m.IntensityTarget(); + desired_intensity_target = orig_intensity_target; + const auto& im = metadata.transform_data.opsin_inverse_matrix; + memcpy(orig_inverse_matrix, im.inverse_matrix, sizeof(orig_inverse_matrix)); + default_transform = im.all_default; + xyb_encoded = metadata.m.xyb_encoded; + std::copy(std::begin(im.opsin_biases), std::end(im.opsin_biases), + opsin_params.opsin_biases); + for (int i = 0; i < 3; ++i) { + opsin_params.opsin_biases_cbrt[i] = cbrtf(opsin_params.opsin_biases[i]); + } + opsin_params.opsin_biases_cbrt[3] = opsin_params.opsin_biases[3] = 1; + std::copy(std::begin(im.quant_biases), std::end(im.quant_biases), + opsin_params.quant_biases); + bool orig_ok = CanOutputToColorEncoding(orig_color_encoding); + bool orig_grey = orig_color_encoding.IsGray(); + return SetColorEncoding(!xyb_encoded || orig_ok + ? orig_color_encoding + : ColorEncoding::LinearSRGB(orig_grey)); +} + +Status OutputEncodingInfo::MaybeSetColorEncoding( + const ColorEncoding& c_desired) { + if (c_desired.GetColorSpace() == ColorSpace::kXYB && + ((color_encoding.GetColorSpace() == ColorSpace::kRGB && + color_encoding.primaries != Primaries::kSRGB) || + color_encoding.tf.IsPQ())) { + return false; + } + if (!xyb_encoded && !CanOutputToColorEncoding(c_desired)) { + return false; + } + return SetColorEncoding(c_desired); +} + +Status OutputEncodingInfo::SetColorEncoding(const ColorEncoding& c_desired) { + color_encoding = c_desired; + color_encoding_is_original = orig_color_encoding.SameColorEncoding(c_desired); + + // Compute the opsin inverse matrix and luminances based on primaries and + // white point. + float inverse_matrix[9]; + bool inverse_matrix_is_default = default_transform; + memcpy(inverse_matrix, orig_inverse_matrix, sizeof(inverse_matrix)); + constexpr float kSRGBLuminances[3] = {0.2126, 0.7152, 0.0722}; + memcpy(luminances, kSRGBLuminances, sizeof(luminances)); + if ((c_desired.primaries != Primaries::kSRGB || + c_desired.white_point != WhitePoint::kD65) && + !c_desired.IsGray()) { + float srgb_to_xyzd50[9]; + const auto& srgb = ColorEncoding::SRGB(/*is_gray=*/false); + JXL_CHECK(PrimariesToXYZD50( + srgb.GetPrimaries().r.x, srgb.GetPrimaries().r.y, + srgb.GetPrimaries().g.x, srgb.GetPrimaries().g.y, + srgb.GetPrimaries().b.x, srgb.GetPrimaries().b.y, + srgb.GetWhitePoint().x, srgb.GetWhitePoint().y, srgb_to_xyzd50)); + float original_to_xyz[3][3]; + JXL_RETURN_IF_ERROR(PrimariesToXYZ( + c_desired.GetPrimaries().r.x, c_desired.GetPrimaries().r.y, + c_desired.GetPrimaries().g.x, c_desired.GetPrimaries().g.y, + c_desired.GetPrimaries().b.x, c_desired.GetPrimaries().b.y, + c_desired.GetWhitePoint().x, c_desired.GetWhitePoint().y, + &original_to_xyz[0][0])); + memcpy(luminances, original_to_xyz[1], sizeof luminances); + if (xyb_encoded) { + float adapt_to_d50[9]; + JXL_RETURN_IF_ERROR(AdaptToXYZD50(c_desired.GetWhitePoint().x, + c_desired.GetWhitePoint().y, + adapt_to_d50)); + float xyzd50_to_original[9]; + Mul3x3Matrix(adapt_to_d50, &original_to_xyz[0][0], xyzd50_to_original); + JXL_RETURN_IF_ERROR(Inv3x3Matrix(xyzd50_to_original)); + float srgb_to_original[9]; + Mul3x3Matrix(xyzd50_to_original, srgb_to_xyzd50, srgb_to_original); + Mul3x3Matrix(srgb_to_original, orig_inverse_matrix, inverse_matrix); + inverse_matrix_is_default = false; + } + } + + if (c_desired.IsGray()) { + float tmp_inv_matrix[9]; + memcpy(tmp_inv_matrix, inverse_matrix, sizeof(inverse_matrix)); + float srgb_to_luma[9]; + memcpy(&srgb_to_luma[0], luminances, sizeof(luminances)); + memcpy(&srgb_to_luma[3], luminances, sizeof(luminances)); + memcpy(&srgb_to_luma[6], luminances, sizeof(luminances)); + Mul3x3Matrix(srgb_to_luma, tmp_inv_matrix, inverse_matrix); + } + + // The internal XYB color space uses absolute luminance, so we scale back the + // opsin inverse matrix to relative luminance where 1.0 corresponds to the + // original intensity target, or to absolute luminance for PQ, where 1.0 + // corresponds to 10000 nits. + if (xyb_encoded) { + float intensity_target = + (c_desired.tf.IsPQ() ? 10000 : orig_intensity_target); + InitSIMDInverseMatrix(inverse_matrix, opsin_params.inverse_opsin_matrix, + intensity_target); + all_default_opsin = (std::abs(intensity_target - 255.0) <= 0.1f && + inverse_matrix_is_default); + } + + // Set the inverse gamma based on color space transfer function. + inverse_gamma = (c_desired.tf.IsGamma() ? c_desired.tf.GetGamma() + : c_desired.tf.IsDCI() ? 1.0f / 2.6f + : 1.0); + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/dec_xyb.h b/third_party/jpeg-xl/lib/jxl/dec_xyb.h new file mode 100644 index 0000000000..ebaae9a176 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/dec_xyb.h @@ -0,0 +1,89 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_XYB_H_ +#define LIB_JXL_DEC_XYB_H_ + +// XYB -> linear sRGB. + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { + +// Parameters for XYB->sRGB conversion. +struct OpsinParams { + float inverse_opsin_matrix[9 * 4]; + float opsin_biases[4]; + float opsin_biases_cbrt[4]; + float quant_biases[4]; + void Init(float intensity_target); +}; + +struct OutputEncodingInfo { + // + // Fields depending only on image metadata + // + ColorEncoding orig_color_encoding; + // Used for the HLG OOTF and PQ tone mapping. + float orig_intensity_target; + // Opsin inverse matrix taken from the metadata. + float orig_inverse_matrix[9]; + bool default_transform; + bool xyb_encoded; + // + // Fields depending on output color encoding + // + ColorEncoding color_encoding; + bool color_encoding_is_original; + // Contains an opsin matrix that converts to the primaries of the output + // encoding. + OpsinParams opsin_params; + bool all_default_opsin; + // Used for Gamma and DCI transfer functions. + float inverse_gamma; + // Luminances of color_encoding's primaries, used for the HLG inverse OOTF and + // for PQ tone mapping. + // Default to sRGB's. + float luminances[3]; + // Used for the HLG inverse OOTF and PQ tone mapping. + float desired_intensity_target; + + Status SetFromMetadata(const CodecMetadata& metadata); + Status MaybeSetColorEncoding(const ColorEncoding& c_desired); + + private: + Status SetColorEncoding(const ColorEncoding& c_desired); +}; + +// Converts `inout` (not padded) from opsin to linear sRGB in-place. Called from +// per-pass postprocessing, hence parallelized. +void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool, + const OpsinParams& opsin_params); + +// Converts `opsin:rect` (opsin may be padded, rect.x0 must be vector-aligned) +// to linear sRGB. Called from whole-frame encoder, hence parallelized. +void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool, + Image3F* JXL_RESTRICT linear, + const OpsinParams& opsin_params); + +// Bt.601 to match JPEG/JFIF. Inputs are _signed_ YCbCr values suitable for DCT, +// see F.1.1.3 of T.81 (because our data type is float, there is no need to add +// a bias to make the values unsigned). +void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect); + +bool HasFastXYBTosRGB8(); +void FastXYBTosRGB8(const float* input[4], uint8_t* output, bool is_rgba, + size_t xsize); + +} // namespace jxl + +#endif // LIB_JXL_DEC_XYB_H_ diff --git a/third_party/jpeg-xl/lib/jxl/decode.cc b/third_party/jpeg-xl/lib/jxl/decode.cc new file mode 100644 index 0000000000..5476f686f6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/decode.cc @@ -0,0 +1,2809 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#if JPEGXL_ENABLE_BOXES || JPEGXL_ENABLE_TRANSCODE_JPEG +#include "lib/jxl/box_content_decoder.h" +#endif +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/dec_modular.h" +#if JPEGXL_ENABLE_TRANSCODE_JPEG +#include "lib/jxl/decode_to_jpeg.h" +#endif +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/icc_codec.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/memory_manager_internal.h" +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/toc.h" + +namespace { + +// Checks if a + b > size, taking possible integer overflow into account. +bool OutOfBounds(size_t a, size_t b, size_t size) { + size_t pos = a + b; + if (pos > size) return true; + if (pos < a) return true; // overflow happened + return false; +} + +bool SumOverflows(size_t a, size_t b, size_t c) { + size_t sum = a + b; + if (sum < b) return true; + sum += c; + if (sum < c) return true; + return false; +} + +JXL_INLINE size_t InitialBasicInfoSizeHint() { + // Amount of bytes before the start of the codestream in the container format, + // assuming that the codestream is the first box after the signature and + // filetype boxes. 12 bytes signature box + 20 bytes filetype box + 16 bytes + // codestream box length + name + optional XLBox length. + const size_t container_header_size = 48; + + // Worst-case amount of bytes for basic info of the JPEG XL codestream header, + // that is all information up to and including extra_channel_bits. Up to + // around 2 bytes signature + 8 bytes SizeHeader + 31 bytes ColorEncoding + 4 + // bytes rest of ImageMetadata + 5 bytes part of ImageMetadata2. + // TODO(lode): recompute and update this value when alpha_bits is moved to + // extra channels info. + const size_t max_codestream_basic_info_size = 50; + + return container_header_size + max_codestream_basic_info_size; +} + +// Debug-printing failure macro similar to JXL_FAILURE, but for the status code +// JXL_DEC_ERROR +#ifdef JXL_CRASH_ON_ERROR +#define JXL_API_ERROR(format, ...) \ + (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \ + ::jxl::Abort(), JXL_DEC_ERROR) +#else // JXL_CRASH_ON_ERROR +#define JXL_API_ERROR(format, ...) \ + (((JXL_DEBUG_ON_ERROR) && \ + ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \ + JXL_DEC_ERROR) +#endif // JXL_CRASH_ON_ERROR + +JxlDecoderStatus ConvertStatus(JxlDecoderStatus status) { return status; } + +JxlDecoderStatus ConvertStatus(jxl::Status status) { + return status ? JXL_DEC_SUCCESS : JXL_DEC_ERROR; +} + +JxlSignature ReadSignature(const uint8_t* buf, size_t len, size_t* pos) { + if (*pos >= len) return JXL_SIG_NOT_ENOUGH_BYTES; + + buf += *pos; + len -= *pos; + + // JPEG XL codestream: 0xff 0x0a + if (len >= 1 && buf[0] == 0xff) { + if (len < 2) { + return JXL_SIG_NOT_ENOUGH_BYTES; + } else if (buf[1] == jxl::kCodestreamMarker) { + *pos += 2; + return JXL_SIG_CODESTREAM; + } else { + return JXL_SIG_INVALID; + } + } + + // JPEG XL container + if (len >= 1 && buf[0] == 0) { + if (len < 12) { + return JXL_SIG_NOT_ENOUGH_BYTES; + } else if (buf[1] == 0 && buf[2] == 0 && buf[3] == 0xC && buf[4] == 'J' && + buf[5] == 'X' && buf[6] == 'L' && buf[7] == ' ' && + buf[8] == 0xD && buf[9] == 0xA && buf[10] == 0x87 && + buf[11] == 0xA) { + *pos += 12; + return JXL_SIG_CONTAINER; + } else { + return JXL_SIG_INVALID; + } + } + + return JXL_SIG_INVALID; +} + +} // namespace + +uint32_t JxlDecoderVersion(void) { + return JPEGXL_MAJOR_VERSION * 1000000 + JPEGXL_MINOR_VERSION * 1000 + + JPEGXL_PATCH_VERSION; +} + +JxlSignature JxlSignatureCheck(const uint8_t* buf, size_t len) { + size_t pos = 0; + return ReadSignature(buf, len, &pos); +} + +namespace { + +size_t BitsPerChannel(JxlDataType data_type) { + switch (data_type) { + case JXL_TYPE_UINT8: + return 8; + case JXL_TYPE_UINT16: + return 16; + case JXL_TYPE_FLOAT: + return 32; + case JXL_TYPE_FLOAT16: + return 16; + default: + return 0; // signals unhandled JxlDataType + } +} + +template +uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata, + JxlPixelFormat format) { + if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) { + return BitsPerChannel(format.data_type); + } else if (bit_depth.type == JXL_BIT_DEPTH_FROM_CODESTREAM) { + return metadata.bit_depth.bits_per_sample; + } else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) { + return bit_depth.bits_per_sample; + } else { + return 0; + } +} + +enum class DecoderStage : uint32_t { + kInited, // Decoder created, no JxlDecoderProcessInput called yet + kStarted, // Running JxlDecoderProcessInput calls + kCodestreamFinished, // Codestream done, but other boxes could still occur. + // This stage can also occur before having seen the + // entire codestream if the user didn't subscribe to any + // codestream events at all, e.g. only to box events, + // or, the user only subscribed to basic info, and only + // the header of the codestream was parsed. + kError, // Error occurred, decoder object no longer usable +}; + +enum class FrameStage : uint32_t { + kHeader, // Must parse frame header. + kTOC, // Must parse TOC + kFull, // Must parse full pixels +}; + +enum class BoxStage : uint32_t { + kHeader, // Parsing box header of the next box, or start of non-container + // stream + kFtyp, // The ftyp box + kSkip, // Box whose contents are skipped + kCodestream, // Handling codestream box contents, or non-container stream + kPartialCodestream, // Handling the extra header of partial codestream box + kJpegRecon, // Handling jpeg reconstruction box +}; + +enum class JpegReconStage : uint32_t { + kNone, // Not outputting + kSettingMetadata, // Ready to output, must set metadata to the jpeg_data + kOutputting, // Currently outputting the JPEG bytes + kFinished, // JPEG reconstruction fully handled +}; + +/* +Given list of frame references to storage slots, and storage slots in which this +frame is saved, computes which frames are required to decode the frame at the +given index and any frames after it. The frames on which this depends are +returned as a vector of their indices, in no particular order. The given index +must be smaller than saved_as.size(), and references.size() must equal +saved_as.size(). Any frames beyond saved_as and references are considered +unknown future frames and must be treated as if something depends on them. +*/ +std::vector GetFrameDependencies(size_t index, + const std::vector& saved_as, + const std::vector& references) { + JXL_ASSERT(references.size() == saved_as.size()); + JXL_ASSERT(index < references.size()); + + std::vector result; + + constexpr size_t kNumStorage = 8; + + // value which indicates nothing is stored in this storage slot + const size_t invalid = references.size(); + // for each of the 8 storage slots, a vector that translates frame index to + // frame stored in this storage slot at this point, that is, the last + // frame that was stored in this slot before or at this index. + std::array, kNumStorage> storage; + for (size_t s = 0; s < kNumStorage; ++s) { + storage[s].resize(saved_as.size()); + int mask = 1 << s; + size_t id = invalid; + for (size_t i = 0; i < saved_as.size(); ++i) { + if (saved_as[i] & mask) { + id = i; + } + storage[s][i] = id; + } + } + + std::vector seen(index + 1, 0); + std::vector stack; + stack.push_back(index); + seen[index] = 1; + + // For frames after index, assume they can depend on any of the 8 storage + // slots, so push the frame for each stored reference to the stack and result. + // All frames after index are treated as having unknown references and with + // the possibility that there are more frames after the last known. + // TODO(lode): take values of saved_as and references after index, and a + // input flag indicating if they are all frames of the image, to further + // optimize this. + for (size_t s = 0; s < kNumStorage; ++s) { + size_t frame_ref = storage[s][index]; + if (frame_ref == invalid) continue; + if (seen[frame_ref]) continue; + stack.push_back(frame_ref); + seen[frame_ref] = 1; + result.push_back(frame_ref); + } + + while (!stack.empty()) { + size_t frame_index = stack.back(); + stack.pop_back(); + if (frame_index == 0) continue; // first frame cannot have references + for (size_t s = 0; s < kNumStorage; ++s) { + int mask = 1 << s; + if (!(references[frame_index] & mask)) continue; + size_t frame_ref = storage[s][frame_index - 1]; + if (frame_ref == invalid) continue; + if (seen[frame_ref]) continue; + stack.push_back(frame_ref); + seen[frame_ref] = 1; + result.push_back(frame_ref); + } + } + + return result; +} + +// Parameters for user-requested extra channel output. +struct ExtraChannelOutput { + JxlPixelFormat format; + void* buffer; + size_t buffer_size; +}; + +} // namespace + +namespace jxl { + +typedef struct JxlDecoderFrameIndexBoxEntryStruct { + // OFFi: offset of start byte of this frame compared to start + // byte of previous frame from this index in the JPEG XL codestream. For the + // first frame, this is the offset from the first byte of the JPEG XL + // codestream. + uint64_t OFFi; + // Ti: duration in ticks between the start of this frame and + // the start of the next frame in the index. If this is the last frame in the + // index, this is the duration in ticks between the start of this frame and + // the end of the stream. A tick lasts TNUM / TDEN seconds. + uint32_t Ti; + // Fi: amount of frames the next frame in the index occurs + // after this frame. If this is the last frame in the index, this is the + // amount of frames after this frame in the remainder of the stream. Only + // frames that are presented by the decoder are counted for this purpose, this + // excludes frames that are not intended for display but for compositing with + // other frames, such as frames that aren't the last frame with a duration of + // 0 ticks. + uint32_t Fi; +} JxlDecoderFrameIndexBoxEntry; + +typedef struct JxlDecoderFrameIndexBoxStruct { + int64_t NF() const { return entries.size(); } + int32_t TNUM = 1; + int32_t TDEN = 1000; + + std::vector entries; + + // That way we can ensure that every index box will have the first frame. + // If the API user decides to mark it as an indexed frame, we call + // the AddFrame again, this time with requested. + void AddFrame(uint64_t OFFi, uint32_t Ti, uint32_t Fi) { + JxlDecoderFrameIndexBoxEntry e; + e.OFFi = OFFi; + e.Ti = Ti; + e.Fi = Fi; + entries.push_back(e); + } +} JxlDecoderFrameIndexBox; + +} // namespace jxl + +// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) +struct JxlDecoderStruct { + JxlDecoderStruct() = default; + + JxlMemoryManager memory_manager; + std::unique_ptr thread_pool; + + DecoderStage stage; + + // Status of progression, internal. + bool got_signature; + // Indicates we know that we've seen the last codestream box: either this + // was a jxlc box, or a jxlp box that has its index indicated as last by + // having its most significant bit set, or no boxes are used at all. This + // does not indicate the full codestream has already been seen, only the + // last box of it has been initiated. + bool last_codestream_seen; + bool got_codestream_signature; + bool got_basic_info; + bool got_transform_data; // To skip everything before ICC. + bool got_all_headers; // Codestream metadata headers. + bool post_headers; // Already decoding pixels. + jxl::ICCReader icc_reader; + jxl::JxlDecoderFrameIndexBox frame_index_box; + // This means either we actually got the preview image, or determined we + // cannot get it or there is none. + bool got_preview_image; + bool preview_frame; + + // Position of next_in in the original file including box format if present + // (as opposed to position in the codestream) + size_t file_pos; + + size_t box_contents_begin; + size_t box_contents_end; + size_t box_contents_size; + size_t box_size; + size_t header_size; + // Either a final box that runs until EOF, or the case of no container format + // at all. + bool box_contents_unbounded; + + JxlBoxType box_type; + JxlBoxType box_decoded_type; // Underlying type for brob boxes + // Set to true right after a JXL_DEC_BOX event only. + bool box_event; + bool decompress_boxes; + + bool box_out_buffer_set; + // Whether the out buffer is set for the current box, if the user did not yet + // release the buffer while the next box is encountered, this will be set to + // false. If this is false, no JXL_DEC_NEED_MORE_INPUT is emitted + // (irrespective of the value of box_out_buffer_set), because not setting + // output indicates the user does not wish the data of this box. + bool box_out_buffer_set_current_box; + uint8_t* box_out_buffer; + size_t box_out_buffer_size; + // which byte of the full box content the start of the out buffer points to + size_t box_out_buffer_begin; + // which byte of box_out_buffer to write to next + size_t box_out_buffer_pos; + + // Settings + bool keep_orientation; + bool unpremul_alpha; + bool render_spotcolors; + bool coalescing; + float desired_intensity_target; + + // Bitfield, for which informative events (JXL_DEC_BASIC_INFO, etc...) the + // decoder returns a status. By default, do not return for any of the events, + // only return when the decoder cannot continue because it needs more input or + // output data. + int events_wanted; + int orig_events_wanted; + + // Fields for reading the basic info from the header. + size_t basic_info_size_hint; + bool have_container; + size_t box_count; + + // The level of progressive detail in frame decoding. + JxlProgressiveDetail prog_detail = kDC; + // The progressive detail of the current frame. + JxlProgressiveDetail frame_prog_detail; + // The intended downsampling ratio for the current progression step. + size_t downsampling_target; + + // Set to true if either an image out buffer or an image out callback was set. + bool image_out_buffer_set; + + // Owned by the caller, buffer for preview or full resolution image. + void* image_out_buffer; + JxlImageOutInitCallback image_out_init_callback; + JxlImageOutRunCallback image_out_run_callback; + JxlImageOutDestroyCallback image_out_destroy_callback; + void* image_out_init_opaque; + struct SimpleImageOutCallback { + JxlImageOutCallback callback; + void* opaque; + }; + SimpleImageOutCallback simple_image_out_callback; + + size_t image_out_size; + + JxlPixelFormat image_out_format; + JxlBitDepth image_out_bit_depth; + + // For extra channels. Empty if no extra channels are requested, and they are + // reset each frame + std::vector extra_channel_output; + + jxl::CodecMetadata metadata; + // Same as metadata.m, except for the color_encoding, which is set to the + // output encoding. + jxl::ImageMetadata image_metadata; + std::unique_ptr ib; + + std::unique_ptr passes_state; + std::unique_ptr frame_dec; + size_t next_section; + std::vector section_processed; + + // headers and TOC for the current frame. When got_toc is true, this is + // always the frame header of the last frame of the current still series, + // that is, the displayed frame. + std::unique_ptr frame_header; + + size_t remaining_frame_size; + FrameStage frame_stage; + bool dc_frame_progression_done; + // The currently processed frame is the last of the current composite still, + // and so must be returned as pixels + bool is_last_of_still; + // The currently processed frame is the last of the codestream + bool is_last_total; + // How many frames to skip. + size_t skip_frames; + // Skipping the current frame. May be false if skip_frames was just set to + // a positive value while already processing a current frame, then + // skipping_frame will be enabled only for the next frame. + bool skipping_frame; + + // Amount of internal frames and external frames started. External frames are + // user-visible frames, internal frames includes all external frames and + // also invisible frames such as patches, blending-only and dc_level frames. + size_t internal_frames; + size_t external_frames; + + // For each internal frame, which storage locations it references, and which + // storage locations it is stored in, using the bit mask as defined in + // FrameDecoder::References and FrameDecoder::SaveAs. + std::vector frame_references; + std::vector frame_saved_as; + + // Translates external frame index to internal frame index. The external + // index is the index of user-visible frames. The internal index can be larger + // since non-visible frames (such as frames with patches, ...) are included. + std::vector frame_external_to_internal; + + // Whether the frame with internal index is required to decode the frame + // being skipped to or any frames after that. If no skipping is active, + // this vector is ignored. If the current internal frame index is beyond this + // vector, it must be treated as a required frame. + std::vector frame_required; + + // Codestream input data is copied here temporarily when the decoder needs + // more input bytes to process the next part of the stream. We copy the input + // data in order to be able to release it all through the API it when + // returning JXL_DEC_NEED_MORE_INPUT. + std::vector codestream_copy; + // Number of bytes at the end of codestream_copy that were not yet consumed + // by calling AdvanceInput(). + size_t codestream_unconsumed; + // Position in the codestream_copy vector that the decoder already finished + // processing. It can be greater than the current size of codestream_copy in + // case where the decoder skips some parts of the frame that were not yet + // provided. + size_t codestream_pos; + // Number of bits after codestream_pos that were already processed. + size_t codestream_bits_ahead; + + BoxStage box_stage; + +#if JPEGXL_ENABLE_BOXES + jxl::JxlBoxContentDecoder box_content_decoder; +#endif +#if JPEGXL_ENABLE_TRANSCODE_JPEG + jxl::JxlToJpegDecoder jpeg_decoder; + // Decodes Exif or XMP metadata for JPEG reconstruction + jxl::JxlBoxContentDecoder metadata_decoder; + std::vector exif_metadata; + std::vector xmp_metadata; + // must store JPEG reconstruction metadata from the current box + // 0 = not stored, 1 = currently storing, 2 = finished + int store_exif; + int store_xmp; + size_t recon_out_buffer_pos; + size_t recon_exif_size; // Expected exif size as read from the jbrd box + size_t recon_xmp_size; // Expected exif size as read from the jbrd box + JpegReconStage recon_output_jpeg; + + bool JbrdNeedMoreBoxes() const { + // jbrd box wants exif but exif box not yet seen + if (store_exif < 2 && recon_exif_size > 0) return true; + // jbrd box wants xmp but xmp box not yet seen + if (store_xmp < 2 && recon_xmp_size > 0) return true; + return false; + } +#endif + + const uint8_t* next_in; + size_t avail_in; + bool input_closed; + + void AdvanceInput(size_t size) { + JXL_DASSERT(avail_in >= size); + next_in += size; + avail_in -= size; + file_pos += size; + } + + size_t AvailableCodestream() const { + size_t avail_codestream = avail_in; + if (!box_contents_unbounded) { + avail_codestream = + std::min(avail_codestream, box_contents_end - file_pos); + } + return avail_codestream; + } + + void AdvanceCodestream(size_t size) { + size_t avail_codestream = AvailableCodestream(); + if (codestream_copy.empty()) { + if (size <= avail_codestream) { + AdvanceInput(size); + } else { + codestream_pos = size - avail_codestream; + AdvanceInput(avail_codestream); + } + } else { + codestream_pos += size; + if (codestream_pos + codestream_unconsumed >= codestream_copy.size()) { + size_t advance = std::min( + codestream_unconsumed, + codestream_unconsumed + codestream_pos - codestream_copy.size()); + AdvanceInput(advance); + codestream_pos -= std::min(codestream_pos, codestream_copy.size()); + codestream_unconsumed = 0; + codestream_copy.clear(); + } + } + } + + JxlDecoderStatus RequestMoreInput() { + if (codestream_copy.empty()) { + size_t avail_codestream = AvailableCodestream(); + codestream_copy.insert(codestream_copy.end(), next_in, + next_in + avail_codestream); + AdvanceInput(avail_codestream); + } else { + AdvanceInput(codestream_unconsumed); + codestream_unconsumed = 0; + } + return JXL_DEC_NEED_MORE_INPUT; + } + + JxlDecoderStatus GetCodestreamInput(jxl::Span* span) { + if (codestream_copy.empty() && codestream_pos > 0) { + size_t avail_codestream = AvailableCodestream(); + size_t skip = std::min(codestream_pos, avail_codestream); + AdvanceInput(skip); + codestream_pos -= skip; + if (codestream_pos > 0) { + return RequestMoreInput(); + } + } + JXL_ASSERT(codestream_pos <= codestream_copy.size()); + JXL_ASSERT(codestream_unconsumed <= codestream_copy.size()); + size_t avail_codestream = AvailableCodestream(); + if (codestream_copy.empty()) { + if (avail_codestream == 0) { + return RequestMoreInput(); + } + *span = jxl::Span(next_in, avail_codestream); + return JXL_DEC_SUCCESS; + } else { + codestream_copy.insert(codestream_copy.end(), + next_in + codestream_unconsumed, + next_in + avail_codestream); + codestream_unconsumed = avail_codestream; + *span = jxl::Span(codestream_copy.data() + codestream_pos, + codestream_copy.size() - codestream_pos); + return JXL_DEC_SUCCESS; + } + } + + // Whether the decoder can use more codestream input for a purpose it needs. + // This returns false if the user didn't subscribe to any events that + // require the codestream (e.g. only subscribed to metadata boxes), or all + // parts of the codestream that are subscribed to (e.g. only basic info) have + // already occurred. + bool CanUseMoreCodestreamInput() const { + // The decoder can set this to finished early if all relevant events were + // processed, so this check works. + return stage != DecoderStage::kCodestreamFinished; + } + + // If set then some operations will fail, if those would require + // allocating large objects. Actual memory usage might be two orders of + // magnitude bigger. + // TODO(eustas): remove once there is working API for memory / CPU limit. + size_t memory_limit_base = 0; + size_t cpu_limit_base = 0; + size_t used_cpu_base = 0; +}; + +namespace { + +bool CheckSizeLimit(JxlDecoder* dec, size_t xsize, size_t ysize) { + if (!dec->memory_limit_base) return true; + if (xsize == 0 || ysize == 0) return true; + if (xsize >= dec->memory_limit_base || ysize >= dec->memory_limit_base) { + return false; + } + // Rough estimate of real row length. + xsize = jxl::DivCeil(xsize, 32) * 32; + size_t num_pixels = xsize * ysize; + if (num_pixels / xsize != ysize) return false; // overflow + if (num_pixels > dec->memory_limit_base) return false; + return true; +} + +} // namespace + +// TODO(zond): Make this depend on the data loaded into the decoder. +JxlDecoderStatus JxlDecoderDefaultPixelFormat(const JxlDecoder* dec, + JxlPixelFormat* format) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + *format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + return JXL_DEC_SUCCESS; +} + +// Resets the state that must be reset for both Rewind and Reset +void JxlDecoderRewindDecodingState(JxlDecoder* dec) { + dec->stage = DecoderStage::kInited; + dec->got_signature = false; + dec->last_codestream_seen = false; + dec->got_codestream_signature = false; + dec->got_basic_info = false; + dec->got_transform_data = false; + dec->got_all_headers = false; + dec->post_headers = false; + dec->icc_reader.Reset(); + dec->got_preview_image = false; + dec->preview_frame = false; + dec->file_pos = 0; + dec->box_contents_begin = 0; + dec->box_contents_end = 0; + dec->box_contents_size = 0; + dec->box_size = 0; + dec->header_size = 0; + dec->box_contents_unbounded = false; + memset(dec->box_type, 0, sizeof(dec->box_type)); + memset(dec->box_decoded_type, 0, sizeof(dec->box_decoded_type)); + dec->box_event = false; + dec->box_stage = BoxStage::kHeader; + dec->box_out_buffer_set = false; + dec->box_out_buffer_set_current_box = false; + dec->box_out_buffer = nullptr; + dec->box_out_buffer_size = 0; + dec->box_out_buffer_begin = 0; + dec->box_out_buffer_pos = 0; + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + dec->exif_metadata.clear(); + dec->xmp_metadata.clear(); + dec->store_exif = 0; + dec->store_xmp = 0; + dec->recon_out_buffer_pos = 0; + dec->recon_exif_size = 0; + dec->recon_xmp_size = 0; + dec->recon_output_jpeg = JpegReconStage::kNone; +#endif + + dec->events_wanted = dec->orig_events_wanted; + dec->basic_info_size_hint = InitialBasicInfoSizeHint(); + dec->have_container = 0; + dec->box_count = 0; + dec->downsampling_target = 8; + dec->image_out_buffer_set = false; + dec->image_out_buffer = nullptr; + dec->image_out_init_callback = nullptr; + dec->image_out_run_callback = nullptr; + dec->image_out_destroy_callback = nullptr; + dec->image_out_init_opaque = nullptr; + dec->image_out_size = 0; + dec->image_out_bit_depth.type = JXL_BIT_DEPTH_FROM_PIXEL_FORMAT; + dec->extra_channel_output.clear(); + dec->next_in = 0; + dec->avail_in = 0; + dec->input_closed = false; + + dec->passes_state.reset(nullptr); + dec->frame_dec.reset(nullptr); + dec->next_section = 0; + dec->section_processed.clear(); + + dec->ib.reset(); + dec->metadata = jxl::CodecMetadata(); + dec->image_metadata = dec->metadata.m; + dec->frame_header.reset(new jxl::FrameHeader(&dec->metadata)); + + dec->codestream_copy.clear(); + dec->codestream_unconsumed = 0; + dec->codestream_pos = 0; + dec->codestream_bits_ahead = 0; + + dec->frame_stage = FrameStage::kHeader; + dec->remaining_frame_size = 0; + dec->is_last_of_still = false; + dec->is_last_total = false; + dec->skip_frames = 0; + dec->skipping_frame = false; + dec->internal_frames = 0; + dec->external_frames = 0; +} + +void JxlDecoderReset(JxlDecoder* dec) { + JxlDecoderRewindDecodingState(dec); + + dec->thread_pool.reset(); + dec->keep_orientation = false; + dec->unpremul_alpha = false; + dec->render_spotcolors = true; + dec->coalescing = true; + dec->desired_intensity_target = 0; + dec->orig_events_wanted = 0; + dec->events_wanted = 0; + dec->frame_references.clear(); + dec->frame_saved_as.clear(); + dec->frame_external_to_internal.clear(); + dec->frame_required.clear(); + dec->decompress_boxes = false; +} + +JxlDecoder* JxlDecoderCreate(const JxlMemoryManager* memory_manager) { + JxlMemoryManager local_memory_manager; + if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager)) + return nullptr; + + void* alloc = + jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlDecoder)); + if (!alloc) return nullptr; + // Placement new constructor on allocated memory + JxlDecoder* dec = new (alloc) JxlDecoder(); + dec->memory_manager = local_memory_manager; + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (!memory_manager) { + dec->memory_limit_base = 53 << 16; + // Allow 5 x max_image_size processing units; every frame is accounted + // as W x H CPU processing units, so there could be numerous small frames + // or few larger ones. + dec->cpu_limit_base = 5 * dec->memory_limit_base; + } +#endif + + JxlDecoderReset(dec); + + return dec; +} + +void JxlDecoderDestroy(JxlDecoder* dec) { + if (dec) { + JxlMemoryManager local_memory_manager = dec->memory_manager; + // Call destructor directly since custom free function is used. + dec->~JxlDecoder(); + jxl::MemoryManagerFree(&local_memory_manager, dec); + } +} + +void JxlDecoderRewind(JxlDecoder* dec) { JxlDecoderRewindDecodingState(dec); } + +void JxlDecoderSkipFrames(JxlDecoder* dec, size_t amount) { + // Increment amount, rather than set it: making the amount smaller is + // impossible because the decoder may already have skipped frames required to + // decode earlier frames, and making the amount larger compared to an existing + // amount is impossible because if JxlDecoderSkipFrames is called in the + // middle of already skipping frames, the user cannot know how many frames + // have already been skipped internally so far so an absolute value cannot + // be defined. + dec->skip_frames += amount; + + dec->frame_required.clear(); + size_t next_frame = dec->external_frames + dec->skip_frames; + + // A frame that has been seen before a rewind + if (next_frame < dec->frame_external_to_internal.size()) { + size_t internal_index = dec->frame_external_to_internal[next_frame]; + if (internal_index < dec->frame_saved_as.size()) { + std::vector deps = GetFrameDependencies( + internal_index, dec->frame_saved_as, dec->frame_references); + + dec->frame_required.resize(internal_index + 1, 0); + for (size_t i = 0; i < deps.size(); i++) { + JXL_ASSERT(deps[i] < dec->frame_required.size()); + dec->frame_required[deps[i]] = 1; + } + } + } +} + +JxlDecoderStatus JxlDecoderSkipCurrentFrame(JxlDecoder* dec) { + if (dec->frame_stage != FrameStage::kFull) { + return JXL_DEC_ERROR; + } + JXL_DASSERT(dec->frame_dec); + dec->frame_stage = FrameStage::kHeader; + dec->AdvanceCodestream(dec->remaining_frame_size); + if (dec->is_last_of_still) { + dec->image_out_buffer_set = false; + } + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus +JxlDecoderSetParallelRunner(JxlDecoder* dec, JxlParallelRunner parallel_runner, + void* parallel_runner_opaque) { + if (dec->stage != DecoderStage::kInited) { + return JXL_API_ERROR("parallel_runner must be set before starting"); + } + dec->thread_pool.reset( + new jxl::ThreadPool(parallel_runner, parallel_runner_opaque)); + return JXL_DEC_SUCCESS; +} + +size_t JxlDecoderSizeHintBasicInfo(const JxlDecoder* dec) { + if (dec->got_basic_info) return 0; + return dec->basic_info_size_hint; +} + +JxlDecoderStatus JxlDecoderSubscribeEvents(JxlDecoder* dec, int events_wanted) { + if (dec->stage != DecoderStage::kInited) { + return JXL_DEC_ERROR; // Cannot subscribe to events after having started. + } + if (events_wanted & 63) { + return JXL_DEC_ERROR; // Can only subscribe to informative events. + } + dec->events_wanted = events_wanted; + dec->orig_events_wanted = events_wanted; + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetKeepOrientation(JxlDecoder* dec, + JXL_BOOL skip_reorientation) { + if (dec->stage != DecoderStage::kInited) { + return JXL_API_ERROR("Must set keep_orientation option before starting"); + } + dec->keep_orientation = !!skip_reorientation; + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetUnpremultiplyAlpha(JxlDecoder* dec, + JXL_BOOL unpremul_alpha) { + if (dec->stage != DecoderStage::kInited) { + return JXL_API_ERROR("Must set unpremul_alpha option before starting"); + } + dec->unpremul_alpha = !!unpremul_alpha; + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetRenderSpotcolors(JxlDecoder* dec, + JXL_BOOL render_spotcolors) { + if (dec->stage != DecoderStage::kInited) { + return JXL_API_ERROR("Must set render_spotcolors option before starting"); + } + dec->render_spotcolors = !!render_spotcolors; + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec, JXL_BOOL coalescing) { + if (dec->stage != DecoderStage::kInited) { + return JXL_API_ERROR("Must set coalescing option before starting"); + } + dec->coalescing = !!coalescing; + return JXL_DEC_SUCCESS; +} + +namespace { +// helper function to get the dimensions of the current image buffer +void GetCurrentDimensions(const JxlDecoder* dec, size_t& xsize, size_t& ysize) { + if (dec->frame_header->nonserialized_is_preview) { + xsize = dec->metadata.oriented_preview_xsize(dec->keep_orientation); + ysize = dec->metadata.oriented_preview_ysize(dec->keep_orientation); + return; + } + xsize = dec->metadata.oriented_xsize(dec->keep_orientation); + ysize = dec->metadata.oriented_ysize(dec->keep_orientation); + if (!dec->coalescing) { + const auto frame_dim = dec->frame_header->ToFrameDimensions(); + xsize = frame_dim.xsize_upsampled; + ysize = frame_dim.ysize_upsampled; + if (!dec->keep_orientation && + static_cast(dec->metadata.m.GetOrientation()) > 4) { + std::swap(xsize, ysize); + } + } +} +} // namespace + +namespace jxl { +namespace { + +template +bool CanRead(Span data, BitReader* reader, T* JXL_RESTRICT t) { + // Use a copy of the bit reader because CanRead advances bits. + BitReader reader2(data); + reader2.SkipBits(reader->TotalBitsConsumed()); + bool result = Bundle::CanRead(&reader2, t); + JXL_ASSERT(reader2.Close()); + return result; +} + +// Returns JXL_DEC_SUCCESS if the full bundle was successfully read, status +// indicating either error or need more input otherwise. +template +JxlDecoderStatus ReadBundle(JxlDecoder* dec, Span data, + BitReader* reader, T* JXL_RESTRICT t) { + if (!CanRead(data, reader, t)) { + return dec->RequestMoreInput(); + } + if (!Bundle::Read(reader, t)) { + return JXL_DEC_ERROR; + } + return JXL_DEC_SUCCESS; +} + +#define JXL_API_RETURN_IF_ERROR(expr) \ + { \ + JxlDecoderStatus status_ = ConvertStatus(expr); \ + if (status_ != JXL_DEC_SUCCESS) return status_; \ + } + +std::unique_ptr> GetBitReader( + Span span) { + BitReader* reader = new BitReader(span); + return std::unique_ptr>( + reader, [](BitReader* reader) { + // We can't allow Close to abort the program if the reader is out of + // bounds, or all return paths in the code, even those that already + // return failure, would have to manually call AllReadsWithinBounds(). + // Invalid JXL codestream should not cause program to quit. + (void)reader->AllReadsWithinBounds(); + (void)reader->Close(); + delete reader; + }); +} + +JxlDecoderStatus JxlDecoderReadBasicInfo(JxlDecoder* dec) { + if (!dec->got_codestream_signature) { + // Check and skip the codestream signature + Span span; + JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span)); + if (span.size() < 2) { + return dec->RequestMoreInput(); + } + if (span.data()[0] != 0xff || span.data()[1] != jxl::kCodestreamMarker) { + return JXL_API_ERROR("invalid signature"); + } + dec->got_codestream_signature = true; + dec->AdvanceCodestream(2); + } + + Span span; + JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span)); + auto reader = GetBitReader(span); + JXL_API_RETURN_IF_ERROR( + ReadBundle(dec, span, reader.get(), &dec->metadata.size)); + JXL_API_RETURN_IF_ERROR( + ReadBundle(dec, span, reader.get(), &dec->metadata.m)); + size_t total_bits = reader->TotalBitsConsumed(); + dec->AdvanceCodestream(total_bits / jxl::kBitsPerByte); + dec->codestream_bits_ahead = total_bits % jxl::kBitsPerByte; + dec->got_basic_info = true; + dec->basic_info_size_hint = 0; + dec->image_metadata = dec->metadata.m; + JXL_DEBUG_V(2, "Decoded BasicInfo: %s", dec->metadata.DebugString().c_str()); + + if (!CheckSizeLimit(dec, dec->metadata.size.xsize(), + dec->metadata.size.ysize())) { + return JXL_API_ERROR("image is too large"); + } + + return JXL_DEC_SUCCESS; +} + +// Reads all codestream headers (but not frame headers) +JxlDecoderStatus JxlDecoderReadAllHeaders(JxlDecoder* dec) { + if (!dec->got_transform_data) { + Span span; + JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span)); + auto reader = GetBitReader(span); + reader->SkipBits(dec->codestream_bits_ahead); + dec->metadata.transform_data.nonserialized_xyb_encoded = + dec->metadata.m.xyb_encoded; + JXL_API_RETURN_IF_ERROR( + ReadBundle(dec, span, reader.get(), &dec->metadata.transform_data)); + size_t total_bits = reader->TotalBitsConsumed(); + dec->AdvanceCodestream(total_bits / jxl::kBitsPerByte); + dec->codestream_bits_ahead = total_bits % jxl::kBitsPerByte; + dec->got_transform_data = true; + } + + Span span; + JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span)); + auto reader = GetBitReader(span); + reader->SkipBits(dec->codestream_bits_ahead); + + if (dec->metadata.m.color_encoding.WantICC()) { + jxl::Status status = + dec->icc_reader.Init(reader.get(), dec->memory_limit_base); + // Always check AllReadsWithinBounds, not all the C++ decoder implementation + // handles reader out of bounds correctly yet (e.g. context map). Not + // checking AllReadsWithinBounds can cause reader->Close() to trigger an + // assert, but we don't want library to quit program for invalid codestream. + if (!reader->AllReadsWithinBounds() || + status.code() == StatusCode::kNotEnoughBytes) { + return dec->RequestMoreInput(); + } + if (!status) { + // Other non-successful status is an error + return JXL_DEC_ERROR; + } + PaddedBytes icc; + status = dec->icc_reader.Process(reader.get(), &icc); + if (status.code() == StatusCode::kNotEnoughBytes) { + return dec->RequestMoreInput(); + } + if (!status) { + // Other non-successful status is an error + return JXL_DEC_ERROR; + } + if (!dec->metadata.m.color_encoding.SetICCRaw(std::move(icc))) { + return JXL_DEC_ERROR; + } + } + + dec->got_all_headers = true; + JXL_API_RETURN_IF_ERROR(reader->JumpToByteBoundary()); + + dec->AdvanceCodestream(reader->TotalBitsConsumed() / jxl::kBitsPerByte); + dec->codestream_bits_ahead = 0; + + if (!dec->passes_state) { + dec->passes_state.reset(new jxl::PassesDecoderState()); + } + + JXL_API_RETURN_IF_ERROR( + dec->passes_state->output_encoding_info.SetFromMetadata(dec->metadata)); + if (dec->desired_intensity_target > 0) { + dec->passes_state->output_encoding_info.desired_intensity_target = + dec->desired_intensity_target; + } + dec->image_metadata = dec->metadata.m; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderProcessSections(JxlDecoder* dec) { + Span span; + JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span)); + const auto& toc = dec->frame_dec->Toc(); + size_t pos = 0; + std::vector section_info; + std::vector section_status; + for (size_t i = dec->next_section; i < toc.size(); ++i) { + if (dec->section_processed[i]) { + pos += toc[i].size; + continue; + } + size_t id = toc[i].id; + size_t size = toc[i].size; + if (OutOfBounds(pos, size, span.size())) { + break; + } + auto br = + new jxl::BitReader(jxl::Span(span.data() + pos, size)); + section_info.emplace_back(jxl::FrameDecoder::SectionInfo{br, id, i}); + section_status.emplace_back(); + pos += size; + } + jxl::Status status = dec->frame_dec->ProcessSections( + section_info.data(), section_info.size(), section_status.data()); + bool out_of_bounds = false; + for (const auto& info : section_info) { + if (!info.br->AllReadsWithinBounds()) { + // Mark out of bounds section, but keep closing and deleting the next + // ones as well. + out_of_bounds = true; + } + JXL_ASSERT(info.br->Close()); + delete info.br; + } + if (out_of_bounds) { + // If any bit reader indicates out of bounds, it's an error, not just + // needing more input, since we ensure only bit readers containing + // a complete section are provided to the FrameDecoder. + return JXL_API_ERROR("frame out of bounds"); + } + if (!status) { + return JXL_API_ERROR("frame processing failed"); + } + for (size_t i = 0; i < section_status.size(); ++i) { + auto status = section_status[i]; + if (status == jxl::FrameDecoder::kDone) { + dec->section_processed[section_info[i].index] = 1; + } else if (status != jxl::FrameDecoder::kSkipped) { + return JXL_API_ERROR("unexpected section status"); + } + } + size_t completed_prefix_bytes = 0; + while (dec->next_section < dec->section_processed.size() && + dec->section_processed[dec->next_section] == 1) { + completed_prefix_bytes += toc[dec->next_section].size; + ++dec->next_section; + } + dec->remaining_frame_size -= completed_prefix_bytes; + dec->AdvanceCodestream(completed_prefix_bytes); + return JXL_DEC_SUCCESS; +} + +// TODO(eustas): no CodecInOut -> no image size reinforcement -> possible OOM. +JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) { + // If no parallel runner is set, use the default + // TODO(lode): move this initialization to an appropriate location once the + // runner is used to decode pixels. + if (!dec->thread_pool) { + dec->thread_pool.reset(new jxl::ThreadPool(nullptr, nullptr)); + } + + // No matter what events are wanted, the basic info is always required. + if (!dec->got_basic_info) { + JxlDecoderStatus status = JxlDecoderReadBasicInfo(dec); + if (status != JXL_DEC_SUCCESS) return status; + } + + if (dec->events_wanted & JXL_DEC_BASIC_INFO) { + dec->events_wanted &= ~JXL_DEC_BASIC_INFO; + return JXL_DEC_BASIC_INFO; + } + + if (!dec->events_wanted) { + dec->stage = DecoderStage::kCodestreamFinished; + return JXL_DEC_SUCCESS; + } + + if (!dec->got_all_headers) { + JxlDecoderStatus status = JxlDecoderReadAllHeaders(dec); + if (status != JXL_DEC_SUCCESS) return status; + } + + if (dec->events_wanted & JXL_DEC_COLOR_ENCODING) { + dec->events_wanted &= ~JXL_DEC_COLOR_ENCODING; + return JXL_DEC_COLOR_ENCODING; + } + + if (!dec->events_wanted) { + dec->stage = DecoderStage::kCodestreamFinished; + return JXL_DEC_SUCCESS; + } + + dec->post_headers = true; + + if (!dec->got_preview_image && dec->metadata.m.have_preview) { + dec->preview_frame = true; + } + + // Handle frames + for (;;) { + bool parse_frames = + (dec->events_wanted & + (JXL_DEC_PREVIEW_IMAGE | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + if (!parse_frames) { + break; + } + if (dec->frame_stage == FrameStage::kHeader && dec->is_last_total) { + break; + } + if (dec->frame_stage == FrameStage::kHeader) { +#if JPEGXL_ENABLE_TRANSCODE_JPEG + if (dec->recon_output_jpeg == JpegReconStage::kSettingMetadata || + dec->recon_output_jpeg == JpegReconStage::kOutputting) { + // The image bundle contains the JPEG reconstruction frame, but the + // decoder is still waiting to decode an EXIF or XMP box. It's not + // implemented to decode additional frames during this, and a JPEG + // reconstruction image should have only one frame. + return JXL_API_ERROR( + "cannot decode a next frame after JPEG reconstruction frame"); + } +#endif + if (!dec->ib) { + dec->ib.reset(new jxl::ImageBundle(&dec->image_metadata)); + } +#if JPEGXL_ENABLE_TRANSCODE_JPEG + // If JPEG reconstruction is wanted and possible, set the jpeg_data of + // the ImageBundle. + if (!dec->jpeg_decoder.SetImageBundleJpegData(dec->ib.get())) + return JXL_DEC_ERROR; +#endif + dec->frame_dec.reset(new FrameDecoder( + dec->passes_state.get(), dec->metadata, dec->thread_pool.get(), + /*use_slow_rendering_pipeline=*/false)); + dec->frame_header.reset(new FrameHeader(&dec->metadata)); + Span span; + JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span)); + auto reader = GetBitReader(span); + jxl::Status status = dec->frame_dec->InitFrame( + reader.get(), dec->ib.get(), dec->preview_frame); + if (!reader->AllReadsWithinBounds() || + status.code() == StatusCode::kNotEnoughBytes) { + return dec->RequestMoreInput(); + } else if (!status) { + return JXL_API_ERROR("invalid frame header"); + } + dec->AdvanceCodestream(reader->TotalBitsConsumed() / kBitsPerByte); + *dec->frame_header = dec->frame_dec->GetFrameHeader(); + jxl::FrameDimensions frame_dim = dec->frame_header->ToFrameDimensions(); + if (!CheckSizeLimit(dec, frame_dim.xsize_upsampled_padded, + frame_dim.ysize_upsampled_padded)) { + return JXL_API_ERROR("frame is too large"); + } + bool output_needed = + (dec->preview_frame ? (dec->events_wanted & JXL_DEC_PREVIEW_IMAGE) + : (dec->events_wanted & JXL_DEC_FULL_IMAGE)); + if (output_needed) { + JXL_API_RETURN_IF_ERROR(dec->frame_dec->InitFrameOutput()); + } + if (dec->cpu_limit_base != 0) { + // No overflow, checked in CheckSizeLimit. + size_t num_pixels = frame_dim.xsize * frame_dim.ysize; + if (dec->used_cpu_base + num_pixels < dec->used_cpu_base) { + return JXL_API_ERROR("used too much CPU"); + } + dec->used_cpu_base += num_pixels; + if (dec->used_cpu_base > dec->cpu_limit_base) { + return JXL_API_ERROR("used too much CPU"); + } + } + dec->remaining_frame_size = dec->frame_dec->SumSectionSizes(); + + dec->frame_stage = FrameStage::kTOC; + if (dec->preview_frame) { + if (!(dec->events_wanted & JXL_DEC_PREVIEW_IMAGE)) { + dec->frame_stage = FrameStage::kHeader; + dec->AdvanceCodestream(dec->remaining_frame_size); + dec->got_preview_image = true; + dec->preview_frame = false; + } + continue; + } + + int saved_as = FrameDecoder::SavedAs(*dec->frame_header); + // is last in entire codestream + dec->is_last_total = dec->frame_header->is_last; + // is last of current still + dec->is_last_of_still = + dec->is_last_total || dec->frame_header->animation_frame.duration > 0; + // is kRegularFrame and coalescing is disabled + dec->is_last_of_still |= + (!dec->coalescing && + dec->frame_header->frame_type == FrameType::kRegularFrame); + const size_t internal_frame_index = dec->internal_frames; + const size_t external_frame_index = dec->external_frames; + if (dec->is_last_of_still) dec->external_frames++; + dec->internal_frames++; + + if (dec->skip_frames > 0) { + dec->skipping_frame = true; + if (dec->is_last_of_still) { + dec->skip_frames--; + } + } else { + dec->skipping_frame = false; + } + + if (external_frame_index >= dec->frame_external_to_internal.size()) { + dec->frame_external_to_internal.push_back(internal_frame_index); + JXL_ASSERT(dec->frame_external_to_internal.size() == + external_frame_index + 1); + } + + if (internal_frame_index >= dec->frame_saved_as.size()) { + dec->frame_saved_as.push_back(saved_as); + JXL_ASSERT(dec->frame_saved_as.size() == internal_frame_index + 1); + + // add the value 0xff (which means all references) to new slots: we only + // know the references of the frame at FinalizeFrame, and fill in the + // correct values there. As long as this information is not known, the + // worst case where the frame depends on all storage slots is assumed. + dec->frame_references.push_back(0xff); + JXL_ASSERT(dec->frame_references.size() == internal_frame_index + 1); + } + + if (dec->skipping_frame) { + // Whether this frame could be referenced by any future frame: either + // because it's a frame saved for blending or patches, or because it's + // a DC frame. + bool referenceable = + dec->frame_header->CanBeReferenced() || + dec->frame_header->frame_type == FrameType::kDCFrame; + if (internal_frame_index < dec->frame_required.size() && + !dec->frame_required[internal_frame_index]) { + referenceable = false; + } + if (!referenceable) { + // Skip all decoding for this frame, since the user is skipping this + // frame and no future frames can reference it. + dec->frame_stage = FrameStage::kHeader; + dec->AdvanceCodestream(dec->remaining_frame_size); + continue; + } + } + + if ((dec->events_wanted & JXL_DEC_FRAME) && dec->is_last_of_still) { + // Only return this for the last of a series of stills: patches frames + // etc... before this one do not contain the correct information such + // as animation timing, ... + if (!dec->skipping_frame) { + return JXL_DEC_FRAME; + } + } + } + + if (dec->frame_stage == FrameStage::kTOC) { + dec->frame_dec->SetRenderSpotcolors(dec->render_spotcolors); + dec->frame_dec->SetCoalescing(dec->coalescing); + + if (!dec->preview_frame && + (dec->events_wanted & JXL_DEC_FRAME_PROGRESSION)) { + dec->frame_prog_detail = + dec->frame_dec->SetPauseAtProgressive(dec->prog_detail); + } else { + dec->frame_prog_detail = JxlProgressiveDetail::kFrames; + } + dec->dc_frame_progression_done = 0; + + dec->next_section = 0; + dec->section_processed.clear(); + dec->section_processed.resize(dec->frame_dec->Toc().size(), 0); + + // If we don't need pixels, we can skip actually decoding the frames. + if (dec->preview_frame || (dec->events_wanted & JXL_DEC_FULL_IMAGE)) { + dec->frame_stage = FrameStage::kFull; + } else if (!dec->is_last_total) { + dec->frame_stage = FrameStage::kHeader; + dec->AdvanceCodestream(dec->remaining_frame_size); + continue; + } else { + break; + } + } + + if (dec->frame_stage == FrameStage::kFull) { + if (!dec->image_out_buffer_set) { + if (dec->preview_frame) { + return JXL_DEC_NEED_PREVIEW_OUT_BUFFER; + } + if ( +#if JPEGXL_ENABLE_TRANSCODE_JPEG + (!dec->jpeg_decoder.IsOutputSet() || + dec->ib->jpeg_data == nullptr) && +#endif + dec->is_last_of_still && !dec->skipping_frame) { + // TODO(lode): remove the dec->is_last_of_still condition if the + // frame decoder needs the image buffer as working space for decoding + // non-visible or blending frames too + return JXL_DEC_NEED_IMAGE_OUT_BUFFER; + } + } + + if (dec->image_out_buffer_set) { + size_t xsize, ysize; + GetCurrentDimensions(dec, xsize, ysize); + size_t bits_per_sample = GetBitDepth( + dec->image_out_bit_depth, dec->metadata.m, dec->image_out_format); + dec->frame_dec->SetImageOutput( + PixelCallback{ + dec->image_out_init_callback, dec->image_out_run_callback, + dec->image_out_destroy_callback, dec->image_out_init_opaque}, + reinterpret_cast(dec->image_out_buffer), + dec->image_out_size, xsize, ysize, dec->image_out_format, + bits_per_sample, dec->unpremul_alpha, !dec->keep_orientation); + for (size_t i = 0; i < dec->extra_channel_output.size(); ++i) { + const auto& extra = dec->extra_channel_output[i]; + size_t ec_bits_per_sample = + GetBitDepth(dec->image_out_bit_depth, + dec->metadata.m.extra_channel_info[i], extra.format); + dec->frame_dec->AddExtraChannelOutput(extra.buffer, extra.buffer_size, + xsize, extra.format, + ec_bits_per_sample); + } + } + + size_t next_num_passes_to_pause = dec->frame_dec->NextNumPassesToPause(); + + JXL_API_RETURN_IF_ERROR(JxlDecoderProcessSections(dec)); + + bool all_sections_done = dec->frame_dec->HasDecodedAll(); + bool got_dc_only = !all_sections_done && dec->frame_dec->HasDecodedDC(); + + if (dec->frame_prog_detail >= JxlProgressiveDetail::kDC && + !dec->dc_frame_progression_done && got_dc_only) { + dec->dc_frame_progression_done = true; + dec->downsampling_target = 8; + return JXL_DEC_FRAME_PROGRESSION; + } + + bool new_progression_step_done = + dec->frame_dec->NumCompletePasses() >= next_num_passes_to_pause; + + if (!all_sections_done && + dec->frame_prog_detail >= JxlProgressiveDetail::kLastPasses && + new_progression_step_done) { + dec->downsampling_target = + dec->frame_header->passes.GetDownsamplingTargetForCompletedPasses( + dec->frame_dec->NumCompletePasses()); + return JXL_DEC_FRAME_PROGRESSION; + } + + if (!all_sections_done) { + // Not all sections have been processed yet + return dec->RequestMoreInput(); + } + + if (!dec->preview_frame) { + size_t internal_index = dec->internal_frames - 1; + JXL_ASSERT(dec->frame_references.size() > internal_index); + // Always fill this in, even if it was already written, it could be that + // this frame was skipped before and set to 255, while only now we know + // the true value. + dec->frame_references[internal_index] = dec->frame_dec->References(); + } + + if (!dec->frame_dec->FinalizeFrame()) { + return JXL_API_ERROR("decoding frame failed"); + } +#if JPEGXL_ENABLE_TRANSCODE_JPEG + // If jpeg output was requested, we merely return the JXL_DEC_FULL_IMAGE + // status without outputting pixels. + if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) { + dec->frame_stage = FrameStage::kHeader; + dec->recon_output_jpeg = JpegReconStage::kSettingMetadata; + return JXL_DEC_FULL_IMAGE; + } +#endif + if (dec->preview_frame || dec->is_last_of_still) { + dec->image_out_buffer_set = false; + dec->extra_channel_output.clear(); + } + } + + dec->frame_stage = FrameStage::kHeader; + + // The pixels have been output or are not needed, do not keep them in + // memory here. + dec->ib.reset(); + if (dec->preview_frame) { + dec->got_preview_image = true; + dec->preview_frame = false; + dec->events_wanted &= ~JXL_DEC_PREVIEW_IMAGE; + return JXL_DEC_PREVIEW_IMAGE; + } else if (dec->is_last_of_still && + (dec->events_wanted & JXL_DEC_FULL_IMAGE) && + !dec->skipping_frame) { + return JXL_DEC_FULL_IMAGE; + } + } + + dec->stage = DecoderStage::kCodestreamFinished; + // Return success, this means there is nothing more to do. + return JXL_DEC_SUCCESS; +} + +} // namespace +} // namespace jxl + +JxlDecoderStatus JxlDecoderSetInput(JxlDecoder* dec, const uint8_t* data, + size_t size) { + if (dec->next_in) { + return JXL_API_ERROR("already set input, use JxlDecoderReleaseInput first"); + } + if (dec->input_closed) { + return JXL_API_ERROR("input already closed"); + } + + dec->next_in = data; + dec->avail_in = size; + return JXL_DEC_SUCCESS; +} + +size_t JxlDecoderReleaseInput(JxlDecoder* dec) { + size_t result = dec->avail_in; + dec->next_in = nullptr; + dec->avail_in = 0; + return result; +} + +void JxlDecoderCloseInput(JxlDecoder* dec) { dec->input_closed = true; } + +JxlDecoderStatus JxlDecoderSetJPEGBuffer(JxlDecoder* dec, uint8_t* data, + size_t size) { +#if JPEGXL_ENABLE_TRANSCODE_JPEG + // JPEG reconstruction buffer can only set and updated before or during the + // first frame, the reconstruction box refers to the first frame and in + // theory multi-frame images should not be used with a jbrd box. + if (dec->internal_frames > 1) { + return JXL_API_ERROR("JPEG reconstruction only works for the first frame"); + } + if (dec->jpeg_decoder.IsOutputSet()) { + return JXL_API_ERROR("Already set JPEG buffer"); + } + return dec->jpeg_decoder.SetOutputBuffer(data, size); +#else + return JXL_API_ERROR("JPEG reconstruction is not supported."); +#endif +} + +size_t JxlDecoderReleaseJPEGBuffer(JxlDecoder* dec) { +#if JPEGXL_ENABLE_TRANSCODE_JPEG + return dec->jpeg_decoder.ReleaseOutputBuffer(); +#else + return JXL_API_ERROR("JPEG reconstruction is not supported."); +#endif +} + +// Parses the header of the box, outputting the 4-character type and the box +// size, including header size, as stored in the box header. +// @param in current input bytes. +// @param size available input size. +// @param pos position in the input, must begin at the header of the box. +// @param file_pos position of pos since the start of the JXL file, rather than +// the current input, used for integer overflow checking. +// @param type the output box type. +// @param box_size output the total box size, including header, in bytes, or 0 +// if it's a final unbounded box. +// @param header_size output size of the box header. +// @return JXL_DEC_SUCCESS if the box header was fully parsed. In that case the +// parsing position must be incremented by header_size bytes. +// JXL_DEC_NEED_MORE_INPUT if not enough input bytes available, in that case +// header_size indicates a lower bound for the known size the header has to be +// at least. JXL_DEC_ERROR if the box header is invalid. +static JxlDecoderStatus ParseBoxHeader(const uint8_t* in, size_t size, + size_t pos, size_t file_pos, + JxlBoxType type, uint64_t* box_size, + uint64_t* header_size) { + if (OutOfBounds(pos, 8, size)) { + *header_size = 8; + return JXL_DEC_NEED_MORE_INPUT; + } + size_t box_start = pos; + // Box size, including this header itself. + *box_size = LoadBE32(in + pos); + pos += 4; + if (*box_size == 1) { + *header_size = 16; + if (OutOfBounds(pos, 12, size)) return JXL_DEC_NEED_MORE_INPUT; + *box_size = LoadBE64(in + pos); + pos += 8; + } + memcpy(type, in + pos, 4); + pos += 4; + *header_size = pos - box_start; + if (*box_size > 0 && *box_size < *header_size) { + return JXL_API_ERROR("invalid box size"); + } + if (SumOverflows(file_pos, pos, *box_size)) { + return JXL_API_ERROR("Box size overflow"); + } + return JXL_DEC_SUCCESS; +} + +// This includes handling the codestream if it is not a box-based jxl file. +static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) { + // Box handling loop + for (;;) { + if (dec->box_stage != BoxStage::kHeader) { + dec->AdvanceInput(dec->header_size); + dec->header_size = 0; +#if JPEGXL_ENABLE_BOXES + if ((dec->events_wanted & JXL_DEC_BOX) && + dec->box_out_buffer_set_current_box) { + uint8_t* next_out = dec->box_out_buffer + dec->box_out_buffer_pos; + size_t avail_out = dec->box_out_buffer_size - dec->box_out_buffer_pos; + + JxlDecoderStatus box_result = dec->box_content_decoder.Process( + dec->next_in, dec->avail_in, + dec->file_pos - dec->box_contents_begin, &next_out, &avail_out); + size_t produced = + next_out - (dec->box_out_buffer + dec->box_out_buffer_pos); + dec->box_out_buffer_pos += produced; + + // Don't return JXL_DEC_NEED_MORE_INPUT: the box stages below, instead, + // handle the input progression, and the above only outputs the part of + // the box seen so far. + if (box_result != JXL_DEC_SUCCESS && + box_result != JXL_DEC_NEED_MORE_INPUT) { + return box_result; + } + } +#endif +#if JPEGXL_ENABLE_TRANSCODE_JPEG + if (dec->store_exif == 1 || dec->store_xmp == 1) { + std::vector& metadata = + (dec->store_exif == 1) ? dec->exif_metadata : dec->xmp_metadata; + for (;;) { + if (metadata.empty()) metadata.resize(64); + uint8_t* orig_next_out = metadata.data() + dec->recon_out_buffer_pos; + uint8_t* next_out = orig_next_out; + size_t avail_out = metadata.size() - dec->recon_out_buffer_pos; + JxlDecoderStatus box_result = dec->metadata_decoder.Process( + dec->next_in, dec->avail_in, + dec->file_pos - dec->box_contents_begin, &next_out, &avail_out); + size_t produced = next_out - orig_next_out; + dec->recon_out_buffer_pos += produced; + if (box_result == JXL_DEC_BOX_NEED_MORE_OUTPUT) { + metadata.resize(metadata.size() * 2); + } else if (box_result == JXL_DEC_NEED_MORE_INPUT) { + break; // box stage handling below will handle this instead + } else if (box_result == JXL_DEC_SUCCESS) { + size_t needed_size = (dec->store_exif == 1) ? dec->recon_exif_size + : dec->recon_xmp_size; + if (dec->box_contents_unbounded && + dec->recon_out_buffer_pos < needed_size) { + // Unbounded box, but we know the expected size due to the jbrd + // box's data. Treat this as the JXL_DEC_NEED_MORE_INPUT case. + break; + } else { + metadata.resize(dec->recon_out_buffer_pos); + if (dec->store_exif == 1) dec->store_exif = 2; + if (dec->store_xmp == 1) dec->store_xmp = 2; + break; + } + } else { + // error + return box_result; + } + } + } +#endif + } +#if JPEGXL_ENABLE_TRANSCODE_JPEG + if (dec->recon_output_jpeg == JpegReconStage::kSettingMetadata && + !dec->JbrdNeedMoreBoxes()) { + jxl::jpeg::JPEGData* jpeg_data = dec->ib->jpeg_data.get(); + if (dec->recon_exif_size) { + JxlDecoderStatus status = jxl::JxlToJpegDecoder::SetExif( + dec->exif_metadata.data(), dec->exif_metadata.size(), jpeg_data); + if (status != JXL_DEC_SUCCESS) return status; + } + if (dec->recon_xmp_size) { + JxlDecoderStatus status = jxl::JxlToJpegDecoder::SetXmp( + dec->xmp_metadata.data(), dec->xmp_metadata.size(), jpeg_data); + if (status != JXL_DEC_SUCCESS) return status; + } + dec->recon_output_jpeg = JpegReconStage::kOutputting; + } + + if (dec->recon_output_jpeg == JpegReconStage::kOutputting && + !dec->JbrdNeedMoreBoxes()) { + JxlDecoderStatus status = + dec->jpeg_decoder.WriteOutput(*dec->ib->jpeg_data); + if (status != JXL_DEC_SUCCESS) return status; + dec->recon_output_jpeg = JpegReconStage::kFinished; + dec->ib.reset(); + if (dec->events_wanted & JXL_DEC_FULL_IMAGE) { + // Return the full image event here now, this may be delayed if this + // could only be done after decoding an exif or xmp box after the + // codestream. + return JXL_DEC_FULL_IMAGE; + } + } +#endif + + if (dec->box_stage == BoxStage::kHeader) { + if (!dec->have_container) { + if (dec->stage == DecoderStage::kCodestreamFinished) + return JXL_DEC_SUCCESS; + dec->box_stage = BoxStage::kCodestream; + dec->box_contents_unbounded = true; + continue; + } + if (dec->avail_in == 0) { + if (dec->stage != DecoderStage::kCodestreamFinished) { + // Not yet seen (all) codestream boxes. + return JXL_DEC_NEED_MORE_INPUT; + } +#if JPEGXL_ENABLE_TRANSCODE_JPEG + if (dec->JbrdNeedMoreBoxes()) { + return JXL_DEC_NEED_MORE_INPUT; + } +#endif + if (dec->input_closed) { + return JXL_DEC_SUCCESS; + } + if (!(dec->events_wanted & JXL_DEC_BOX)) { + // All codestream and jbrd metadata boxes finished, and no individual + // boxes requested by user, so no need to request any more input. + // This returns success for backwards compatibility, when + // JxlDecoderCloseInput and JXL_DEC_BOX did not exist, as well + // as for efficiency. + return JXL_DEC_SUCCESS; + } + // Even though we are exactly at a box end, there still may be more + // boxes. The user may call JxlDecoderCloseInput to indicate the input + // is finished and get success instead. + return JXL_DEC_NEED_MORE_INPUT; + } + + bool boxed_codestream_done = + ((dec->events_wanted & JXL_DEC_BOX) && + dec->stage == DecoderStage::kCodestreamFinished && +#if JPEGXL_ENABLE_TRANSCODE_JPEG + !dec->JbrdNeedMoreBoxes() && +#endif + dec->last_codestream_seen); + if (boxed_codestream_done && dec->avail_in >= 2 && + dec->next_in[0] == 0xff && + dec->next_in[1] == jxl::kCodestreamMarker) { + // We detected the start of the next naked codestream, so we can return + // success here. + return JXL_DEC_SUCCESS; + } + + uint64_t box_size, header_size; + JxlDecoderStatus status = + ParseBoxHeader(dec->next_in, dec->avail_in, 0, dec->file_pos, + dec->box_type, &box_size, &header_size); + if (status != JXL_DEC_SUCCESS) { + if (status == JXL_DEC_NEED_MORE_INPUT) { + dec->basic_info_size_hint = + InitialBasicInfoSizeHint() + header_size - dec->file_pos; + } + return status; + } + if (memcmp(dec->box_type, "brob", 4) == 0) { + if (dec->avail_in < header_size + 4) { + return JXL_DEC_NEED_MORE_INPUT; + } + memcpy(dec->box_decoded_type, dec->next_in + header_size, + sizeof(dec->box_decoded_type)); + } else { + memcpy(dec->box_decoded_type, dec->box_type, + sizeof(dec->box_decoded_type)); + } + + // Box order validity checks + // The signature box at box_count == 1 is not checked here since that's + // already done at the beginning. + dec->box_count++; + if (boxed_codestream_done && memcmp(dec->box_type, "JXL ", 4) == 0) { + // We detected the start of the next boxed stream, so we can return + // success here. + return JXL_DEC_SUCCESS; + } + if (dec->box_count == 2 && memcmp(dec->box_type, "ftyp", 4) != 0) { + return JXL_API_ERROR("the second box must be the ftyp box"); + } + if (memcmp(dec->box_type, "ftyp", 4) == 0 && dec->box_count != 2) { + return JXL_API_ERROR("the ftyp box must come second"); + } + + dec->box_contents_unbounded = (box_size == 0); + dec->box_contents_begin = dec->file_pos + header_size; + dec->box_contents_end = + dec->box_contents_unbounded ? 0 : (dec->file_pos + box_size); + dec->box_contents_size = + dec->box_contents_unbounded ? 0 : (box_size - header_size); + dec->box_size = box_size; + dec->header_size = header_size; +#if JPEGXL_ENABLE_TRANSCODE_JPEG + if (dec->orig_events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) { + // Initiate storing of Exif or XMP data for JPEG reconstruction + if (dec->store_exif == 0 && + memcmp(dec->box_decoded_type, "Exif", 4) == 0) { + dec->store_exif = 1; + dec->recon_out_buffer_pos = 0; + } + if (dec->store_xmp == 0 && + memcmp(dec->box_decoded_type, "xml ", 4) == 0) { + dec->store_xmp = 1; + dec->recon_out_buffer_pos = 0; + } + } +#endif +#if JPEGXL_ENABLE_BOXES + if (dec->events_wanted & JXL_DEC_BOX) { + bool decompress = + dec->decompress_boxes && memcmp(dec->box_type, "brob", 4) == 0; + dec->box_content_decoder.StartBox( + decompress, dec->box_contents_unbounded, dec->box_contents_size); + } +#endif +#if JPEGXL_ENABLE_TRANSCODE_JPEG + if (dec->store_exif == 1 || dec->store_xmp == 1) { + bool brob = memcmp(dec->box_type, "brob", 4) == 0; + dec->metadata_decoder.StartBox(brob, dec->box_contents_unbounded, + dec->box_contents_size); + } +#endif + if (memcmp(dec->box_type, "ftyp", 4) == 0) { + dec->box_stage = BoxStage::kFtyp; + } else if (memcmp(dec->box_type, "jxlc", 4) == 0) { + if (dec->last_codestream_seen) { + return JXL_API_ERROR("there can only be one jxlc box"); + } + dec->last_codestream_seen = true; + dec->box_stage = BoxStage::kCodestream; + } else if (memcmp(dec->box_type, "jxlp", 4) == 0) { + dec->box_stage = BoxStage::kPartialCodestream; +#if JPEGXL_ENABLE_TRANSCODE_JPEG + } else if ((dec->orig_events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) && + memcmp(dec->box_type, "jbrd", 4) == 0) { + if (!(dec->events_wanted & JXL_DEC_JPEG_RECONSTRUCTION)) { + return JXL_API_ERROR( + "multiple JPEG reconstruction boxes not supported"); + } + dec->box_stage = BoxStage::kJpegRecon; +#endif + } else { + dec->box_stage = BoxStage::kSkip; + } + + if (dec->events_wanted & JXL_DEC_BOX) { + dec->box_event = true; + dec->box_out_buffer_set_current_box = false; + return JXL_DEC_BOX; + } + } else if (dec->box_stage == BoxStage::kFtyp) { + if (dec->box_contents_size < 12) { + return JXL_API_ERROR("file type box too small"); + } + if (dec->avail_in < 4) return JXL_DEC_NEED_MORE_INPUT; + if (memcmp(dec->next_in, "jxl ", 4) != 0) { + return JXL_API_ERROR("file type box major brand must be \"jxl \""); + } + dec->AdvanceInput(4); + dec->box_stage = BoxStage::kSkip; + } else if (dec->box_stage == BoxStage::kPartialCodestream) { + if (dec->last_codestream_seen) { + return JXL_API_ERROR("cannot have jxlp box after last jxlp box"); + } + // TODO(lode): error if box is unbounded but last bit not set + if (dec->avail_in < 4) return JXL_DEC_NEED_MORE_INPUT; + if (!dec->box_contents_unbounded && dec->box_contents_size < 4) { + return JXL_API_ERROR("jxlp box too small to contain index"); + } + size_t jxlp_index = LoadBE32(dec->next_in); + // The high bit of jxlp_index indicates whether this is the last + // jxlp box. + if (jxlp_index & 0x80000000) { + dec->last_codestream_seen = true; + } + dec->AdvanceInput(4); + dec->box_stage = BoxStage::kCodestream; + } else if (dec->box_stage == BoxStage::kCodestream) { + JxlDecoderStatus status = jxl::JxlDecoderProcessCodestream(dec); +#if JPEGXL_ENABLE_TRANSCODE_JPEG + if (status == JXL_DEC_FULL_IMAGE) { + if (dec->recon_output_jpeg != JpegReconStage::kNone) { + continue; + } + } +#endif + if (status == JXL_DEC_NEED_MORE_INPUT) { + if (dec->file_pos == dec->box_contents_end && + !dec->box_contents_unbounded) { + dec->box_stage = BoxStage::kHeader; + continue; + } + } + + if (status == JXL_DEC_SUCCESS) { +#if JPEGXL_ENABLE_TRANSCODE_JPEG + if (dec->JbrdNeedMoreBoxes()) { + dec->box_stage = BoxStage::kSkip; + continue; + } +#endif + if (dec->box_contents_unbounded) { + // Last box reached and codestream done, nothing more to do. + break; + } + if (dec->events_wanted & JXL_DEC_BOX) { + // Codestream done, but there may be more other boxes. + dec->box_stage = BoxStage::kSkip; + continue; + } + } + return status; +#if JPEGXL_ENABLE_TRANSCODE_JPEG + } else if (dec->box_stage == BoxStage::kJpegRecon) { + if (!dec->jpeg_decoder.IsParsingBox()) { + // This is a new JPEG reconstruction metadata box. + dec->jpeg_decoder.StartBox(dec->box_contents_unbounded, + dec->box_contents_size); + } + const uint8_t* next_in = dec->next_in; + size_t avail_in = dec->avail_in; + JxlDecoderStatus recon_result = + dec->jpeg_decoder.Process(&next_in, &avail_in); + size_t consumed = next_in - dec->next_in; + dec->AdvanceInput(consumed); + if (recon_result == JXL_DEC_JPEG_RECONSTRUCTION) { + jxl::jpeg::JPEGData* jpeg_data = dec->jpeg_decoder.GetJpegData(); + size_t num_exif = jxl::JxlToJpegDecoder::NumExifMarkers(*jpeg_data); + size_t num_xmp = jxl::JxlToJpegDecoder::NumXmpMarkers(*jpeg_data); + if (num_exif) { + if (num_exif > 1) { + return JXL_API_ERROR( + "multiple exif markers for JPEG reconstruction not supported"); + } + if (JXL_DEC_SUCCESS != jxl::JxlToJpegDecoder::ExifBoxContentSize( + *jpeg_data, &dec->recon_exif_size)) { + return JXL_API_ERROR("invalid jbrd exif size"); + } + } + if (num_xmp) { + if (num_xmp > 1) { + return JXL_API_ERROR( + "multiple XMP markers for JPEG reconstruction not supported"); + } + if (JXL_DEC_SUCCESS != jxl::JxlToJpegDecoder::XmlBoxContentSize( + *jpeg_data, &dec->recon_xmp_size)) { + return JXL_API_ERROR("invalid jbrd XMP size"); + } + } + + dec->box_stage = BoxStage::kHeader; + // If successful JPEG reconstruction, return the success if the user + // cares about it, otherwise continue. + if (dec->events_wanted & recon_result) { + dec->events_wanted &= ~recon_result; + return recon_result; + } + } else { + // If anything else, return the result. + return recon_result; + } +#endif + } else if (dec->box_stage == BoxStage::kSkip) { + if (dec->box_contents_unbounded) { + if (dec->input_closed) { + return JXL_DEC_SUCCESS; + } + if (!(dec->box_out_buffer_set)) { + // An unbounded box is always the last box. Not requesting box data, + // so return success even if JxlDecoderCloseInput was not called for + // backwards compatibility as well as efficiency since this box is + // being skipped. + return JXL_DEC_SUCCESS; + } + // Arbitrarily more bytes may follow, only JxlDecoderCloseInput can + // mark the end. + dec->AdvanceInput(dec->avail_in); + return JXL_DEC_NEED_MORE_INPUT; + } + // Amount of remaining bytes in the box that is being skipped. + size_t remaining = dec->box_contents_end - dec->file_pos; + if (dec->avail_in < remaining) { + // Indicate how many more bytes needed starting from next_in. + dec->basic_info_size_hint = + InitialBasicInfoSizeHint() + dec->box_contents_end - dec->file_pos; + // Don't have the full box yet, skip all we have so far + dec->AdvanceInput(dec->avail_in); + return JXL_DEC_NEED_MORE_INPUT; + } else { + // Full box available, skip all its remaining bytes + dec->AdvanceInput(remaining); + dec->box_stage = BoxStage::kHeader; + } + } else { + JXL_DASSERT(false); // unknown box stage + } + } + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) { + if (dec->stage == DecoderStage::kInited) { + dec->stage = DecoderStage::kStarted; + } + if (dec->stage == DecoderStage::kError) { + return JXL_API_ERROR( + "Cannot keep using decoder after it encountered an error, use " + "JxlDecoderReset to reset it"); + } + + if (!dec->got_signature) { + JxlSignature sig = JxlSignatureCheck(dec->next_in, dec->avail_in); + if (sig == JXL_SIG_INVALID) return JXL_API_ERROR("invalid signature"); + if (sig == JXL_SIG_NOT_ENOUGH_BYTES) { + if (dec->input_closed) { + return JXL_API_ERROR("file too small for signature"); + } + return JXL_DEC_NEED_MORE_INPUT; + } + + dec->got_signature = true; + + if (sig == JXL_SIG_CONTAINER) { + dec->have_container = 1; + } else { + dec->last_codestream_seen = true; + } + } + + JxlDecoderStatus status = HandleBoxes(dec); + + if (status == JXL_DEC_NEED_MORE_INPUT && dec->input_closed) { + return JXL_API_ERROR("missing input"); + } + + // Even if the box handling returns success, certain types of + // data may be missing. + if (status == JXL_DEC_SUCCESS) { + if (dec->CanUseMoreCodestreamInput()) { + return JXL_API_ERROR("codestream never finished"); + } +#if JPEGXL_ENABLE_TRANSCODE_JPEG + if (dec->JbrdNeedMoreBoxes()) { + return JXL_API_ERROR("missing metadata boxes for jpeg reconstruction"); + } +#endif + } + + return status; +} + +// To ensure ABI forward-compatibility, this struct has a constant size. +static_assert(sizeof(JxlBasicInfo) == 204, + "JxlBasicInfo struct size should remain constant"); + +JxlDecoderStatus JxlDecoderGetBasicInfo(const JxlDecoder* dec, + JxlBasicInfo* info) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + + if (info) { + memset(info, 0, sizeof(*info)); + + const jxl::ImageMetadata& meta = dec->metadata.m; + + info->have_container = dec->have_container; + info->xsize = dec->metadata.size.xsize(); + info->ysize = dec->metadata.size.ysize(); + info->uses_original_profile = !meta.xyb_encoded; + + info->bits_per_sample = meta.bit_depth.bits_per_sample; + info->exponent_bits_per_sample = meta.bit_depth.exponent_bits_per_sample; + + info->have_preview = meta.have_preview; + info->have_animation = meta.have_animation; + info->orientation = static_cast(meta.orientation); + + if (!dec->keep_orientation) { + if (info->orientation >= JXL_ORIENT_TRANSPOSE) { + std::swap(info->xsize, info->ysize); + } + info->orientation = JXL_ORIENT_IDENTITY; + } + + info->intensity_target = meta.IntensityTarget(); + if (dec->desired_intensity_target > 0) { + info->intensity_target = dec->desired_intensity_target; + } + info->min_nits = meta.tone_mapping.min_nits; + info->relative_to_max_display = meta.tone_mapping.relative_to_max_display; + info->linear_below = meta.tone_mapping.linear_below; + + const jxl::ExtraChannelInfo* alpha = meta.Find(jxl::ExtraChannel::kAlpha); + if (alpha != nullptr) { + info->alpha_bits = alpha->bit_depth.bits_per_sample; + info->alpha_exponent_bits = alpha->bit_depth.exponent_bits_per_sample; + info->alpha_premultiplied = alpha->alpha_associated; + } else { + info->alpha_bits = 0; + info->alpha_exponent_bits = 0; + info->alpha_premultiplied = 0; + } + + info->num_color_channels = + meta.color_encoding.GetColorSpace() == jxl::ColorSpace::kGray ? 1 : 3; + + info->num_extra_channels = meta.num_extra_channels; + + if (info->have_preview) { + info->preview.xsize = dec->metadata.m.preview_size.xsize(); + info->preview.ysize = dec->metadata.m.preview_size.ysize(); + } + + if (info->have_animation) { + info->animation.tps_numerator = dec->metadata.m.animation.tps_numerator; + info->animation.tps_denominator = + dec->metadata.m.animation.tps_denominator; + info->animation.num_loops = dec->metadata.m.animation.num_loops; + info->animation.have_timecodes = dec->metadata.m.animation.have_timecodes; + } + + if (meta.have_intrinsic_size) { + info->intrinsic_xsize = dec->metadata.m.intrinsic_size.xsize(); + info->intrinsic_ysize = dec->metadata.m.intrinsic_size.ysize(); + } else { + info->intrinsic_xsize = info->xsize; + info->intrinsic_ysize = info->ysize; + } + } + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetExtraChannelInfo(const JxlDecoder* dec, + size_t index, + JxlExtraChannelInfo* info) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + + const std::vector& channels = + dec->metadata.m.extra_channel_info; + + if (index >= channels.size()) return JXL_DEC_ERROR; // out of bounds + const jxl::ExtraChannelInfo& channel = channels[index]; + + info->type = static_cast(channel.type); + info->bits_per_sample = channel.bit_depth.bits_per_sample; + info->exponent_bits_per_sample = + channel.bit_depth.floating_point_sample + ? channel.bit_depth.exponent_bits_per_sample + : 0; + info->dim_shift = channel.dim_shift; + info->name_length = channel.name.size(); + info->alpha_premultiplied = channel.alpha_associated; + info->spot_color[0] = channel.spot_color[0]; + info->spot_color[1] = channel.spot_color[1]; + info->spot_color[2] = channel.spot_color[2]; + info->spot_color[3] = channel.spot_color[3]; + info->cfa_channel = channel.cfa_channel; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetExtraChannelName(const JxlDecoder* dec, + size_t index, char* name, + size_t size) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + + const std::vector& channels = + dec->metadata.m.extra_channel_info; + + if (index >= channels.size()) return JXL_DEC_ERROR; // out of bounds + const jxl::ExtraChannelInfo& channel = channels[index]; + + // Also need null-termination character + if (channel.name.size() + 1 > size) return JXL_DEC_ERROR; + + memcpy(name, channel.name.c_str(), channel.name.size() + 1); + + return JXL_DEC_SUCCESS; +} + +namespace { + +// Gets the jxl::ColorEncoding for the desired target, and checks errors. +// Returns the object regardless of whether the actual color space is in ICC, +// but ensures that if the color encoding is not the encoding from the +// codestream header metadata, it cannot require ICC profile. +JxlDecoderStatus GetColorEncodingForTarget( + const JxlDecoder* dec, JxlColorProfileTarget target, + const jxl::ColorEncoding** encoding) { + if (!dec->got_all_headers) return JXL_DEC_NEED_MORE_INPUT; + *encoding = nullptr; + if (target == JXL_COLOR_PROFILE_TARGET_DATA && dec->metadata.m.xyb_encoded) { + *encoding = &dec->passes_state->output_encoding_info.color_encoding; + } else { + *encoding = &dec->metadata.m.color_encoding; + } + return JXL_DEC_SUCCESS; +} +} // namespace + +JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile( + const JxlDecoder* dec, const JxlPixelFormat* unused_format, + JxlColorProfileTarget target, JxlColorEncoding* color_encoding) { + const jxl::ColorEncoding* jxl_color_encoding = nullptr; + JxlDecoderStatus status = + GetColorEncodingForTarget(dec, target, &jxl_color_encoding); + if (status) return status; + + if (jxl_color_encoding->WantICC()) + return JXL_DEC_ERROR; // Indicate no encoded profile available. + + if (color_encoding) { + ConvertInternalToExternalColorEncoding(*jxl_color_encoding, color_encoding); + } + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetICCProfileSize( + const JxlDecoder* dec, const JxlPixelFormat* unused_format, + JxlColorProfileTarget target, size_t* size) { + const jxl::ColorEncoding* jxl_color_encoding = nullptr; + JxlDecoderStatus status = + GetColorEncodingForTarget(dec, target, &jxl_color_encoding); + if (status != JXL_DEC_SUCCESS) return status; + + if (jxl_color_encoding->WantICC()) { + jxl::ColorSpace color_space = + dec->metadata.m.color_encoding.GetColorSpace(); + if (color_space == jxl::ColorSpace::kUnknown || + color_space == jxl::ColorSpace::kXYB) { + // This indicates there's no ICC profile available + // TODO(lode): for the XYB case, do we want to craft an ICC profile that + // represents XYB as an RGB profile? It may be possible, but not with + // only 1D transfer functions. + return JXL_DEC_ERROR; + } + } + + if (size) { + *size = jxl_color_encoding->ICC().size(); + } + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetColorAsICCProfile( + const JxlDecoder* dec, const JxlPixelFormat* unused_format, + JxlColorProfileTarget target, uint8_t* icc_profile, size_t size) { + size_t wanted_size; + // This also checks the NEED_MORE_INPUT and the unknown/xyb cases + JxlDecoderStatus status = + JxlDecoderGetICCProfileSize(dec, nullptr, target, &wanted_size); + if (status != JXL_DEC_SUCCESS) return status; + if (size < wanted_size) return JXL_API_ERROR("ICC profile output too small"); + + const jxl::ColorEncoding* jxl_color_encoding = nullptr; + status = GetColorEncodingForTarget(dec, target, &jxl_color_encoding); + if (status != JXL_DEC_SUCCESS) return status; + + memcpy(icc_profile, jxl_color_encoding->ICC().data(), + jxl_color_encoding->ICC().size()); + + return JXL_DEC_SUCCESS; +} + +namespace { + +// Returns the amount of bits needed for getting memory buffer size, and does +// all error checking required for size checking and format validity. +JxlDecoderStatus PrepareSizeCheck(const JxlDecoder* dec, + const JxlPixelFormat* format, size_t* bits) { + if (!dec->got_basic_info) { + // Don't know image dimensions yet, cannot check for valid size. + return JXL_DEC_NEED_MORE_INPUT; + } + if (!dec->coalescing && + (!dec->frame_header || dec->frame_stage == FrameStage::kHeader)) { + return JXL_API_ERROR("Don't know frame dimensions yet"); + } + if (format->num_channels > 4) { + return JXL_API_ERROR("More than 4 channels not supported"); + } + + *bits = BitsPerChannel(format->data_type); + + if (*bits == 0) { + return JXL_API_ERROR("Invalid/unsupported data type"); + } + + return JXL_DEC_SUCCESS; +} + +} // namespace + +size_t JxlDecoderGetIntendedDownsamplingRatio(JxlDecoder* dec) { + return dec->downsampling_target; +} + +JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) { + if (!dec->image_out_buffer_set) return JXL_DEC_ERROR; + if (dec->frame_stage != FrameStage::kFull) { + return JXL_DEC_ERROR; + } + JXL_DASSERT(dec->frame_dec); + if (!dec->frame_dec->HasDecodedDC()) { + // FrameDecoder::Flush currently requires DC to have been decoded already + // to work correctly. + return JXL_DEC_ERROR; + } + + if (!dec->frame_dec->Flush()) { + return JXL_DEC_ERROR; + } + + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderPreviewOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) { + size_t bits; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits); + if (status != JXL_DEC_SUCCESS) return status; + if (format->num_channels < 3 && + !dec->image_metadata.color_encoding.IsGray()) { + return JXL_API_ERROR("Number of channels is too low for color output"); + } + + size_t xsize = dec->metadata.oriented_preview_xsize(dec->keep_orientation); + size_t ysize = dec->metadata.oriented_preview_ysize(dec->keep_orientation); + + size_t row_size = + jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte); + size_t last_row_size = row_size; + if (format->align > 1) { + row_size = jxl::DivCeil(row_size, format->align) * format->align; + } + *size = row_size * (ysize - 1) + last_row_size; + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderSetPreviewOutBuffer( + JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size) { + if (!dec->got_basic_info || !dec->metadata.m.have_preview || + !(dec->orig_events_wanted & JXL_DEC_PREVIEW_IMAGE)) { + return JXL_API_ERROR("No preview out buffer needed at this time"); + } + if (format->num_channels < 3 && + !dec->image_metadata.color_encoding.IsGray()) { + return JXL_API_ERROR("Number of channels is too low for color output"); + } + + size_t min_size; + // This also checks whether the format is valid and supported and basic info + // is available. + JxlDecoderStatus status = + JxlDecoderPreviewOutBufferSize(dec, format, &min_size); + if (status != JXL_DEC_SUCCESS) return status; + + if (size < min_size) return JXL_DEC_ERROR; + + dec->image_out_buffer_set = true; + dec->image_out_buffer = buffer; + dec->image_out_size = size; + dec->image_out_format = *format; + + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderImageOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) { + size_t bits; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits); + if (status != JXL_DEC_SUCCESS) return status; + if (format->num_channels < 3 && + !dec->image_metadata.color_encoding.IsGray()) { + return JXL_API_ERROR("Number of channels is too low for color output"); + } + size_t xsize, ysize; + GetCurrentDimensions(dec, xsize, ysize); + size_t row_size = + jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte); + if (format->align > 1) { + row_size = jxl::DivCeil(row_size, format->align) * format->align; + } + *size = row_size * ysize; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetImageOutBuffer(JxlDecoder* dec, + const JxlPixelFormat* format, + void* buffer, size_t size) { + if (!dec->got_basic_info || !(dec->orig_events_wanted & JXL_DEC_FULL_IMAGE)) { + return JXL_API_ERROR("No image out buffer needed at this time"); + } + if (dec->image_out_buffer_set && !!dec->image_out_run_callback) { + return JXL_API_ERROR( + "Cannot change from image out callback to image out buffer"); + } + if (format->num_channels < 3 && + !dec->image_metadata.color_encoding.IsGray()) { + return JXL_API_ERROR("Number of channels is too low for color output"); + } + size_t min_size; + // This also checks whether the format is valid and supported and basic info + // is available. + JxlDecoderStatus status = + JxlDecoderImageOutBufferSize(dec, format, &min_size); + if (status != JXL_DEC_SUCCESS) return status; + + if (size < min_size) return JXL_DEC_ERROR; + + dec->image_out_buffer_set = true; + dec->image_out_buffer = buffer; + dec->image_out_size = size; + dec->image_out_format = *format; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderExtraChannelBufferSize(const JxlDecoder* dec, + const JxlPixelFormat* format, + size_t* size, + uint32_t index) { + if (!dec->got_basic_info || !(dec->orig_events_wanted & JXL_DEC_FULL_IMAGE)) { + return JXL_API_ERROR("No extra channel buffer needed at this time"); + } + + if (index >= dec->metadata.m.num_extra_channels) { + return JXL_API_ERROR("Invalid extra channel index"); + } + + size_t num_channels = 1; // Do not use format's num_channels + + size_t bits; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits); + if (status != JXL_DEC_SUCCESS) return status; + + size_t xsize, ysize; + GetCurrentDimensions(dec, xsize, ysize); + size_t row_size = + jxl::DivCeil(xsize * num_channels * bits, jxl::kBitsPerByte); + if (format->align > 1) { + row_size = jxl::DivCeil(row_size, format->align) * format->align; + } + *size = row_size * ysize; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetExtraChannelBuffer(JxlDecoder* dec, + const JxlPixelFormat* format, + void* buffer, size_t size, + uint32_t index) { + size_t min_size; + // This also checks whether the format and index are valid and supported and + // basic info is available. + JxlDecoderStatus status = + JxlDecoderExtraChannelBufferSize(dec, format, &min_size, index); + if (status != JXL_DEC_SUCCESS) return status; + + if (size < min_size) return JXL_DEC_ERROR; + + if (dec->extra_channel_output.size() <= index) { + dec->extra_channel_output.resize(dec->metadata.m.num_extra_channels, + {{}, nullptr, 0}); + } + // Guaranteed correct thanks to check in JxlDecoderExtraChannelBufferSize. + JXL_ASSERT(index < dec->extra_channel_output.size()); + + dec->extra_channel_output[index].format = *format; + dec->extra_channel_output[index].format.num_channels = 1; + dec->extra_channel_output[index].buffer = buffer; + dec->extra_channel_output[index].buffer_size = size; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetImageOutCallback(JxlDecoder* dec, + const JxlPixelFormat* format, + JxlImageOutCallback callback, + void* opaque) { + dec->simple_image_out_callback.callback = callback; + dec->simple_image_out_callback.opaque = opaque; + const auto init_callback = + +[](void* init_opaque, size_t num_threads, size_t num_pixels_per_thread) { + // No initialization to do, just reuse init_opaque as run_opaque. + return init_opaque; + }; + const auto run_callback = + +[](void* run_opaque, size_t thread_id, size_t x, size_t y, + size_t num_pixels, const void* pixels) { + const auto* const simple_callback = + static_cast(run_opaque); + simple_callback->callback(simple_callback->opaque, x, y, num_pixels, + pixels); + }; + const auto destroy_callback = +[](void* run_opaque) {}; + return JxlDecoderSetMultithreadedImageOutCallback( + dec, format, init_callback, run_callback, + /*destroy_callback=*/destroy_callback, &dec->simple_image_out_callback); +} + +JxlDecoderStatus JxlDecoderSetMultithreadedImageOutCallback( + JxlDecoder* dec, const JxlPixelFormat* format, + JxlImageOutInitCallback init_callback, JxlImageOutRunCallback run_callback, + JxlImageOutDestroyCallback destroy_callback, void* init_opaque) { + if (dec->image_out_buffer_set && !!dec->image_out_buffer) { + return JXL_API_ERROR( + "Cannot change from image out buffer to image out callback"); + } + + if (init_callback == nullptr || run_callback == nullptr || + destroy_callback == nullptr) { + return JXL_API_ERROR("All callbacks are required"); + } + + // Perform error checking for invalid format. + size_t bits_dummy; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits_dummy); + if (status != JXL_DEC_SUCCESS) return status; + + dec->image_out_buffer_set = true; + dec->image_out_init_callback = init_callback; + dec->image_out_run_callback = run_callback; + dec->image_out_destroy_callback = destroy_callback; + dec->image_out_init_opaque = init_opaque; + dec->image_out_format = *format; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec, + JxlFrameHeader* header) { + if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) { + return JXL_API_ERROR("no frame header available"); + } + const auto& metadata = dec->metadata.m; + memset(header, 0, sizeof(*header)); + if (metadata.have_animation) { + header->duration = dec->frame_header->animation_frame.duration; + if (metadata.animation.have_timecodes) { + header->timecode = dec->frame_header->animation_frame.timecode; + } + } + header->name_length = dec->frame_header->name.size(); + header->is_last = dec->frame_header->is_last; + size_t xsize, ysize; + GetCurrentDimensions(dec, xsize, ysize); + header->layer_info.xsize = xsize; + header->layer_info.ysize = ysize; + if (!dec->coalescing && dec->frame_header->custom_size_or_origin) { + header->layer_info.crop_x0 = dec->frame_header->frame_origin.x0; + header->layer_info.crop_y0 = dec->frame_header->frame_origin.y0; + header->layer_info.have_crop = JXL_TRUE; + } else { + header->layer_info.crop_x0 = 0; + header->layer_info.crop_y0 = 0; + header->layer_info.have_crop = JXL_FALSE; + } + if (!dec->keep_orientation && !dec->coalescing) { + // orient the crop offset + size_t W = dec->metadata.oriented_xsize(false); + size_t H = dec->metadata.oriented_ysize(false); + if (metadata.orientation > 4) { + std::swap(header->layer_info.crop_x0, header->layer_info.crop_y0); + } + size_t o = (metadata.orientation - 1) & 3; + if (o > 0 && o < 3) { + header->layer_info.crop_x0 = W - xsize - header->layer_info.crop_x0; + } + if (o > 1) { + header->layer_info.crop_y0 = H - ysize - header->layer_info.crop_y0; + } + } + if (dec->coalescing) { + header->layer_info.blend_info.blendmode = JXL_BLEND_REPLACE; + header->layer_info.blend_info.source = 0; + header->layer_info.blend_info.alpha = 0; + header->layer_info.blend_info.clamp = JXL_FALSE; + header->layer_info.save_as_reference = 0; + } else { + header->layer_info.blend_info.blendmode = + static_cast(dec->frame_header->blending_info.mode); + header->layer_info.blend_info.source = + dec->frame_header->blending_info.source; + header->layer_info.blend_info.alpha = + dec->frame_header->blending_info.alpha_channel; + header->layer_info.blend_info.clamp = + dec->frame_header->blending_info.clamp; + header->layer_info.save_as_reference = dec->frame_header->save_as_reference; + } + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(const JxlDecoder* dec, + size_t index, + JxlBlendInfo* blend_info) { + if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) { + return JXL_API_ERROR("no frame header available"); + } + const auto& metadata = dec->metadata.m; + if (index >= metadata.num_extra_channels) { + return JXL_API_ERROR("Invalid extra channel index"); + } + blend_info->blendmode = static_cast( + dec->frame_header->extra_channel_blending_info[index].mode); + blend_info->source = + dec->frame_header->extra_channel_blending_info[index].source; + blend_info->alpha = + dec->frame_header->extra_channel_blending_info[index].alpha_channel; + blend_info->clamp = + dec->frame_header->extra_channel_blending_info[index].clamp; + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec, char* name, + size_t size) { + if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) { + return JXL_API_ERROR("no frame header available"); + } + if (size < dec->frame_header->name.size() + 1) { + return JXL_API_ERROR("too small frame name output buffer"); + } + memcpy(name, dec->frame_header->name.c_str(), + dec->frame_header->name.size() + 1); + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetPreferredColorProfile( + JxlDecoder* dec, const JxlColorEncoding* color_encoding) { + if (!dec->got_all_headers) { + return JXL_API_ERROR("color info not yet available"); + } + if (dec->post_headers) { + return JXL_API_ERROR("too late to set the color encoding"); + } + if (dec->image_metadata.color_encoding.IsGray() && + color_encoding->color_space != JXL_COLOR_SPACE_GRAY && + dec->image_out_buffer_set && dec->image_out_format.num_channels < 3) { + return JXL_API_ERROR("Number of channels is too low for color output"); + } + if (color_encoding->color_space == JXL_COLOR_SPACE_UNKNOWN) { + return JXL_API_ERROR("Unknown output colorspace"); + } + jxl::ColorEncoding c_out; + JXL_API_RETURN_IF_ERROR( + ConvertExternalToInternalColorEncoding(*color_encoding, &c_out)); + JXL_API_RETURN_IF_ERROR(!c_out.ICC().empty()); + auto& output_encoding = dec->passes_state->output_encoding_info; + if (!c_out.SameColorEncoding(output_encoding.color_encoding)) { + JXL_API_RETURN_IF_ERROR(output_encoding.MaybeSetColorEncoding(c_out)); + dec->image_metadata.color_encoding = output_encoding.color_encoding; + } + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetDesiredIntensityTarget( + JxlDecoder* dec, float desired_intensity_target) { + if (desired_intensity_target < 0) { + return JXL_API_ERROR("negative intensity target requested"); + } + dec->desired_intensity_target = desired_intensity_target; + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetBoxBuffer(JxlDecoder* dec, uint8_t* data, + size_t size) { + if (dec->box_out_buffer_set) { + return JXL_API_ERROR("must release box buffer before setting it again"); + } + if (!dec->box_event) { + return JXL_API_ERROR("can only set box buffer after box event"); + } + + dec->box_out_buffer_set = true; + dec->box_out_buffer_set_current_box = true; + dec->box_out_buffer = data; + dec->box_out_buffer_size = size; + dec->box_out_buffer_pos = 0; + return JXL_DEC_SUCCESS; +} + +size_t JxlDecoderReleaseBoxBuffer(JxlDecoder* dec) { + if (!dec->box_out_buffer_set) { + return 0; + } + size_t result = dec->box_out_buffer_size - dec->box_out_buffer_pos; + dec->box_out_buffer_set = false; + dec->box_out_buffer = nullptr; + dec->box_out_buffer_size = 0; + if (!dec->box_out_buffer_set_current_box) { + dec->box_out_buffer_begin = 0; + } else { + dec->box_out_buffer_begin += dec->box_out_buffer_pos; + } + dec->box_out_buffer_set_current_box = false; + return result; +} + +JxlDecoderStatus JxlDecoderSetDecompressBoxes(JxlDecoder* dec, + JXL_BOOL decompress) { + // TODO(lode): return error if libbrotli is not compiled in the jxl decoding + // library + dec->decompress_boxes = decompress; + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetBoxType(JxlDecoder* dec, JxlBoxType type, + JXL_BOOL decompressed) { + if (!dec->box_event) { + return JXL_API_ERROR("can only get box info after JXL_DEC_BOX event"); + } + if (decompressed) { + memcpy(type, dec->box_decoded_type, sizeof(dec->box_decoded_type)); + } else { + memcpy(type, dec->box_type, sizeof(dec->box_type)); + } + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetBoxSizeRaw(const JxlDecoder* dec, + uint64_t* size) { + if (!dec->box_event) { + return JXL_API_ERROR("can only get box info after JXL_DEC_BOX event"); + } + if (size) { + *size = dec->box_size; + } + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetProgressiveDetail(JxlDecoder* dec, + JxlProgressiveDetail detail) { + if (detail != kDC && detail != kLastPasses && detail != kPasses) { + return JXL_API_ERROR( + "Values other than kDC (%d), kLastPasses (%d) and kPasses (%d), " + "like %d are not implemented.", + kDC, kLastPasses, kPasses, detail); + } + dec->prog_detail = detail; + return JXL_DEC_SUCCESS; +} + +namespace { + +template +JxlDecoderStatus VerifyOutputBitDepth(JxlBitDepth bit_depth, const T& metadata, + JxlPixelFormat format) { + if ((format.data_type == JXL_TYPE_FLOAT || + format.data_type == JXL_TYPE_FLOAT16) && + bit_depth.type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) { + return JXL_API_ERROR( + "Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT is implemented " + "for float types."); + } + uint32_t bits_per_sample = GetBitDepth(bit_depth, metadata, format); + if (format.data_type == JXL_TYPE_UINT8 && + (bits_per_sample == 0 || bits_per_sample > 8)) { + return JXL_API_ERROR("Inavlid bit depth %u for uint8 output", + bits_per_sample); + } else if (format.data_type == JXL_TYPE_UINT16 && + (bits_per_sample == 0 || bits_per_sample > 16)) { + return JXL_API_ERROR("Inavlid bit depth %u for uint16 output", + bits_per_sample); + } + return JXL_DEC_SUCCESS; +} + +} // namespace + +JxlDecoderStatus JxlDecoderSetImageOutBitDepth(JxlDecoder* dec, + const JxlBitDepth* bit_depth) { + if (!dec->image_out_buffer_set) { + return JXL_API_ERROR("No image out buffer was set."); + } + JXL_API_RETURN_IF_ERROR( + VerifyOutputBitDepth(*bit_depth, dec->metadata.m, dec->image_out_format)); + dec->image_out_bit_depth = *bit_depth; + return JXL_DEC_SUCCESS; +} diff --git a/third_party/jpeg-xl/lib/jxl/decode_test.cc b/third_party/jpeg-xl/lib/jxl/decode_test.cc new file mode 100644 index 0000000000..30f6b61183 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/decode_test.cc @@ -0,0 +1,5507 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "lib/extras/codec.h" +#include "lib/extras/dec/color_description.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/file_io.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_icc_codec.h" +#include "lib/jxl/enc_progressive_split.h" +#include "lib/jxl/encode_internal.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/icc_codec.h" +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/jpeg/enc_jpeg_data.h" +#include "lib/jxl/test_image.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" +#include "lib/jxl/toc.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace { +void AppendU32BE(uint32_t u32, jxl::PaddedBytes* bytes) { + bytes->push_back(u32 >> 24); + bytes->push_back(u32 >> 16); + bytes->push_back(u32 >> 8); + bytes->push_back(u32 >> 0); +} + +// What type of codestream format in the boxes to use for testing +enum CodeStreamBoxFormat { + // Do not use box format at all, only pure codestream + kCSBF_None, + // Have a single codestream box, with its actual size given in the box + kCSBF_Single, + // Have a single codestream box, with box size 0 (final box running to end) + kCSBF_Single_Zero_Terminated, + // Single codestream box, with another unknown box behind it + kCSBF_Single_Other, + // Have multiple partial codestream boxes + kCSBF_Multi, + // Have multiple partial codestream boxes, with final box size 0 (running + // to end) + kCSBF_Multi_Zero_Terminated, + // Have multiple partial codestream boxes, terminated by non-codestream box + kCSBF_Multi_Other_Terminated, + // Have multiple partial codestream boxes, terminated by non-codestream box + // that has its size set to 0 (running to end) + kCSBF_Multi_Other_Zero_Terminated, + // Have multiple partial codestream boxes, and the first one has a content + // of zero length + kCSBF_Multi_First_Empty, + // Have multiple partial codestream boxes, and the last one has a content + // of zero length and there is an unknown empty box at the end + kCSBF_Multi_Last_Empty_Other, + // Have a compressed exif box before a regular codestream box + kCSBF_Brob_Exif, + // Not a value but used for counting amount of enum entries + kCSBF_NUM_ENTRIES, +}; + +// Unknown boxes for testing +static const char* unk1_box_type = "unk1"; +static const char* unk1_box_contents = "abcdefghijklmnopqrstuvwxyz"; +static const size_t unk1_box_size = strlen(unk1_box_contents); +static const char* unk2_box_type = "unk2"; +static const char* unk2_box_contents = "0123456789"; +static const size_t unk2_box_size = strlen(unk2_box_contents); +static const char* unk3_box_type = "unk3"; +static const char* unk3_box_contents = "ABCDEF123456"; +static const size_t unk3_box_size = strlen(unk3_box_contents); +// Box with brob-compressed exif, including header +static const uint8_t* box_brob_exif = reinterpret_cast( + "\0\0\0@brobExif\241\350\2\300\177\244v\2525\304\360\27=?\267{" + "\33\37\314\332\214QX17PT\"\256\0\0\202s\214\313t\333\310\320k\20\276\30" + "\204\277l$\326c#\1\b"); +size_t box_brob_exif_size = 64; +// The uncompressed Exif data from the brob box +static const uint8_t* exif_uncompressed = reinterpret_cast( + "\0\0\0\0MM\0*" + "\0\0\0\b\0\5\1\22\0\3\0\0\0\1\0\5\0\0\1\32\0\5\0\0\0\1\0\0\0J\1\33\0\5\0\0" + "\0\1\0\0\0R\1(" + "\0\3\0\0\0\1\0\1\0\0\2\23\0\3\0\0\0\1\0\1\0\0\0\0\0\0\0\0\0\1\0\0\0\1\0\0" + "\0\1\0\0\0\1"); +size_t exif_uncompressed_size = 94; + +// Returns an ICC profile output by the JPEG XL decoder for RGB_D65_SRG_Rel_Lin, +// but with, on purpose, rXYZ, bXYZ and gXYZ (the RGB primaries) switched to a +// different order to ensure the profile does not match any known profile, so +// the encoder cannot encode it in a compact struct instead. +jxl::PaddedBytes GetIccTestProfile() { + const uint8_t* profile = reinterpret_cast( + "\0\0\3\200lcms\0040\0\0mntrRGB XYZ " + "\a\344\0\a\0\27\0\21\0$" + "\0\37acspAPPL\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\366" + "\326\0\1\0\0\0\0\323-lcms\372c\207\36\227\200{" + "\2\232s\255\327\340\0\n\26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + "\0\0\0\0\0\0\0\0\rdesc\0\0\1 " + "\0\0\0Bcprt\0\0\1d\0\0\1\0wtpt\0\0\2d\0\0\0\24chad\0\0\2x\0\0\0," + "bXYZ\0\0\2\244\0\0\0\24gXYZ\0\0\2\270\0\0\0\24rXYZ\0\0\2\314\0\0\0\24rTR" + "C\0\0\2\340\0\0\0 gTRC\0\0\2\340\0\0\0 bTRC\0\0\2\340\0\0\0 " + "chrm\0\0\3\0\0\0\0$dmnd\0\0\3$\0\0\0(" + "dmdd\0\0\3L\0\0\0002mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0&" + "\0\0\0\34\0R\0G\0B\0_\0D\0006\0005\0_\0S\0R\0G\0_\0R\0e\0l\0_" + "\0L\0i\0n\0\0mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\344\0\0\0\34\0C\0o\0" + "p\0y\0r\0i\0g\0h\0t\0 \0002\0000\0001\08\0 \0G\0o\0o\0g\0l\0e\0 " + "\0L\0L\0C\0,\0 \0C\0C\0-\0B\0Y\0-\0S\0A\0 \0003\0.\0000\0 " + "\0U\0n\0p\0o\0r\0t\0e\0d\0 " + "\0l\0i\0c\0e\0n\0s\0e\0(\0h\0t\0t\0p\0s\0:\0/\0/" + "\0c\0r\0e\0a\0t\0i\0v\0e\0c\0o\0m\0m\0o\0n\0s\0.\0o\0r\0g\0/" + "\0l\0i\0c\0e\0n\0s\0e\0s\0/\0b\0y\0-\0s\0a\0/\0003\0.\0000\0/" + "\0l\0e\0g\0a\0l\0c\0o\0d\0e\0)XYZ " + "\0\0\0\0\0\0\366\326\0\1\0\0\0\0\323-" + "sf32\0\0\0\0\0\1\fB\0\0\5\336\377\377\363%" + "\0\0\a\223\0\0\375\220\377\377\373\241\377\377\375\242\0\0\3\334\0\0\300" + "nXYZ \0\0\0\0\0\0o\240\0\08\365\0\0\3\220XYZ " + "\0\0\0\0\0\0$\237\0\0\17\204\0\0\266\304XYZ " + "\0\0\0\0\0\0b\227\0\0\267\207\0\0\30\331para\0\0\0\0\0\3\0\0\0\1\0\0\0\1" + "\0\0\0\0\0\0\0\1\0\0\0\0\0\0chrm\0\0\0\0\0\3\0\0\0\0\243\327\0\0T|" + "\0\0L\315\0\0\231\232\0\0&" + "g\0\0\17\\mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\f\0\0\0\34\0G\0o\0o\0g" + "\0l\0emluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\26\0\0\0\34\0I\0m\0a\0g\0e" + "\0 \0c\0o\0d\0e\0c\0\0"); + size_t profile_size = 896; + jxl::PaddedBytes icc_profile; + icc_profile.assign(profile, profile + profile_size); + return icc_profile; +} + +} // namespace + +namespace jxl { +namespace { + +void AppendTestBox(const char* type, const char* contents, size_t contents_size, + bool unbounded, PaddedBytes* bytes) { + AppendU32BE(contents_size + 8, bytes); + bytes->push_back(type[0]); + bytes->push_back(type[1]); + bytes->push_back(type[2]); + bytes->push_back(type[3]); + const uint8_t* contents_u = reinterpret_cast(contents); + bytes->append(contents_u, contents_u + contents_size); +} + +enum PreviewMode { + kNoPreview, + kSmallPreview, + kBigPreview, + kNumPreviewModes, +}; + +void GeneratePreview(PreviewMode preview_mode, ImageBundle* ib) { + if (preview_mode == kSmallPreview) { + ib->ShrinkTo(ib->xsize() / 7, ib->ysize() / 7); + } else if (preview_mode == kBigPreview) { + auto upsample7 = [&](const ImageF& in, ImageF* out) { + for (size_t y = 0; y < out->ysize(); ++y) { + for (size_t x = 0; x < out->xsize(); ++x) { + out->Row(y)[x] = in.ConstRow(y / 7)[x / 7]; + } + } + }; + Image3F preview(ib->xsize() * 7, ib->ysize() * 7); + for (size_t c = 0; c < 3; ++c) { + upsample7(ib->color()->Plane(c), &preview.Plane(c)); + } + std::vector extra_channels; + for (size_t i = 0; i < ib->extra_channels().size(); ++i) { + ImageF ec(ib->xsize() * 7, ib->ysize() * 7); + upsample7(ib->extra_channels()[i], &ec); + extra_channels.emplace_back(std::move(ec)); + } + ib->RemoveColor(); + ib->ClearExtraChannels(); + ib->SetFromImage(std::move(preview), ib->c_current()); + ib->SetExtraChannels(std::move(extra_channels)); + } +} + +struct TestCodestreamParams { + CompressParams cparams; + CodeStreamBoxFormat box_format = kCSBF_None; + JxlOrientation orientation = JXL_ORIENT_IDENTITY; + PreviewMode preview_mode = kNoPreview; + bool add_intrinsic_size = false; + bool add_icc_profile = false; + float intensity_target = 0.0; + std::string color_space; + PaddedBytes* jpeg_codestream = nullptr; + const ProgressiveMode* progressive_mode = nullptr; +}; + +// Input pixels always given as 16-bit RGBA, 8 bytes per pixel. +// include_alpha determines if the encoded image should contain the alpha +// channel. +// add_icc_profile: if false, encodes the image as sRGB using the JXL fields, +// for grayscale or RGB images. If true, encodes the image using the ICC profile +// returned by GetIccTestProfile, without the JXL fields, this requires the +// image is RGB, not grayscale. +// Providing jpeg_codestream will populate the jpeg_codestream with compressed +// JPEG bytes, and make it possible to reconstruct those exact JPEG bytes using +// the return value _if_ add_container indicates a box format. +PaddedBytes CreateTestJXLCodestream(Span pixels, size_t xsize, + size_t ysize, size_t num_channels, + const TestCodestreamParams& params) { + // Compress the pixels with JPEG XL. + bool grayscale = (num_channels <= 2); + bool include_alpha = !(num_channels & 1) && params.jpeg_codestream == nullptr; + size_t bitdepth = params.jpeg_codestream == nullptr ? 16 : 8; + CodecInOut io; + io.SetSize(xsize, ysize); + ColorEncoding color_encoding; + if (params.add_icc_profile) { + // the hardcoded ICC profile we attach requires RGB. + EXPECT_EQ(false, grayscale); + EXPECT_TRUE(params.color_space.empty()); + EXPECT_TRUE(color_encoding.SetICC(GetIccTestProfile())); + } else if (!params.color_space.empty()) { + JxlColorEncoding c; + EXPECT_TRUE(jxl::ParseDescription(params.color_space, &c)); + EXPECT_TRUE(ConvertExternalToInternalColorEncoding(c, &color_encoding)); + EXPECT_EQ(color_encoding.IsGray(), grayscale); + } else { + color_encoding = jxl::ColorEncoding::SRGB(/*is_gray=*/grayscale); + } + ThreadPool pool(nullptr, nullptr); + io.metadata.m.SetUintSamples(bitdepth); + if (include_alpha) { + io.metadata.m.SetAlphaBits(bitdepth); + } + if (params.intensity_target != 0) { + io.metadata.m.SetIntensityTarget(params.intensity_target); + } + JxlPixelFormat format = {static_cast(num_channels), JXL_TYPE_UINT16, + JXL_BIG_ENDIAN, 0}; + // Make the grayscale-ness of the io metadata color_encoding and the packed + // image match. + io.metadata.m.color_encoding = color_encoding; + EXPECT_TRUE(ConvertFromExternal(pixels, xsize, ysize, color_encoding, + /*bits_per_sample=*/16, format, &pool, + &io.Main())); + jxl::PaddedBytes jpeg_data; + if (params.jpeg_codestream != nullptr) { +#if JPEGXL_ENABLE_JPEG + std::vector jpeg_bytes; + io.jpeg_quality = 70; + EXPECT_TRUE(Encode(io, extras::Codec::kJPG, io.metadata.m.color_encoding, + /*bits_per_sample=*/8, &jpeg_bytes, &pool)); + params.jpeg_codestream->append(jpeg_bytes.data(), + jpeg_bytes.data() + jpeg_bytes.size()); + EXPECT_TRUE(jxl::jpeg::DecodeImageJPG( + jxl::Span(jpeg_bytes.data(), jpeg_bytes.size()), &io)); + EXPECT_TRUE( + EncodeJPEGData(*io.Main().jpeg_data, &jpeg_data, params.cparams)); + io.metadata.m.xyb_encoded = false; +#else // JPEGXL_ENABLE_JPEG + JXL_ABORT( + "unable to create reconstructible JPEG without JPEG support enabled"); +#endif // JPEGXL_ENABLE_JPEG + } + if (params.preview_mode) { + io.preview_frame = io.Main().Copy(); + GeneratePreview(params.preview_mode, &io.preview_frame); + io.metadata.m.have_preview = true; + EXPECT_TRUE(io.metadata.m.preview_size.Set(io.preview_frame.xsize(), + io.preview_frame.ysize())); + } + if (params.add_intrinsic_size) { + EXPECT_TRUE(io.metadata.m.intrinsic_size.Set(xsize / 3, ysize / 3)); + } + io.metadata.m.orientation = params.orientation; + AuxOut aux_out; + PaddedBytes compressed; + PassesEncoderState enc_state; + if (params.progressive_mode) { + enc_state.progressive_splitter.SetProgressiveMode(*params.progressive_mode); + } + EXPECT_TRUE(EncodeFile(params.cparams, &io, &enc_state, &compressed, + GetJxlCms(), &aux_out, &pool)); + CodeStreamBoxFormat add_container = params.box_format; + if (add_container != kCSBF_None) { + // Header with signature box and ftyp box. + const uint8_t header[] = {0, 0, 0, 0xc, 0x4a, 0x58, 0x4c, 0x20, + 0xd, 0xa, 0x87, 0xa, 0, 0, 0, 0x14, + 0x66, 0x74, 0x79, 0x70, 0x6a, 0x78, 0x6c, 0x20, + 0, 0, 0, 0, 0x6a, 0x78, 0x6c, 0x20}; + + bool is_multi = add_container == kCSBF_Multi || + add_container == kCSBF_Multi_Zero_Terminated || + add_container == kCSBF_Multi_Other_Terminated || + add_container == kCSBF_Multi_Other_Zero_Terminated || + add_container == kCSBF_Multi_First_Empty || + add_container == kCSBF_Multi_Last_Empty_Other; + + if (is_multi) { + size_t third = compressed.size() / 3; + std::vector compressed0(compressed.data(), + compressed.data() + third); + std::vector compressed1(compressed.data() + third, + compressed.data() + 2 * third); + std::vector compressed2(compressed.data() + 2 * third, + compressed.data() + compressed.size()); + + PaddedBytes c; + c.append(header, header + sizeof(header)); + if (params.jpeg_codestream != nullptr) { + jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false, + &c); + c.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size()); + } + uint32_t jxlp_index = 0; + if (add_container == kCSBF_Multi_First_Empty) { + // Dummy (empty) codestream part + AppendU32BE(12, &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + AppendU32BE(jxlp_index++, &c); + } + // First codestream part + AppendU32BE(compressed0.size() + 12, &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + AppendU32BE(jxlp_index++, &c); + c.append(compressed0.data(), compressed0.data() + compressed0.size()); + // A few non-codestream boxes in between + AppendTestBox(unk1_box_type, unk1_box_contents, unk1_box_size, false, &c); + AppendTestBox(unk2_box_type, unk2_box_contents, unk2_box_size, false, &c); + // Dummy (empty) codestream part + AppendU32BE(12, &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + AppendU32BE(jxlp_index++, &c); + // Second codestream part + AppendU32BE(compressed1.size() + 12, &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + AppendU32BE(jxlp_index++, &c); + c.append(compressed1.data(), compressed1.data() + compressed1.size()); + // Third (last) codestream part + AppendU32BE(add_container == kCSBF_Multi_Zero_Terminated + ? 0 + : (compressed2.size() + 12), + &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + if (add_container != kCSBF_Multi_Last_Empty_Other) { + AppendU32BE(jxlp_index++ | 0x80000000, &c); + } else { + AppendU32BE(jxlp_index++, &c); + } + c.append(compressed2.data(), compressed2.data() + compressed2.size()); + if (add_container == kCSBF_Multi_Last_Empty_Other) { + // Dummy (empty) codestream part + AppendU32BE(12, &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + AppendU32BE(jxlp_index++ | 0x80000000, &c); + AppendTestBox(unk3_box_type, unk3_box_contents, unk3_box_size, false, + &c); + } + if (add_container == kCSBF_Multi_Other_Terminated) { + AppendTestBox(unk3_box_type, unk3_box_contents, unk3_box_size, false, + &c); + } + if (add_container == kCSBF_Multi_Other_Zero_Terminated) { + AppendTestBox(unk3_box_type, unk3_box_contents, unk3_box_size, true, + &c); + } + compressed.swap(c); + } else { + PaddedBytes c; + c.append(header, header + sizeof(header)); + if (params.jpeg_codestream != nullptr) { + jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false, + &c); + c.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size()); + } + if (add_container == kCSBF_Brob_Exif) { + c.append(box_brob_exif, box_brob_exif + box_brob_exif_size); + } + AppendU32BE(add_container == kCSBF_Single_Zero_Terminated + ? 0 + : (compressed.size() + 8), + &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('c'); + c.append(compressed.data(), compressed.data() + compressed.size()); + if (add_container == kCSBF_Single_Other) { + AppendTestBox(unk1_box_type, unk1_box_contents, unk1_box_size, false, + &c); + } + compressed.swap(c); + } + } + + return compressed; +} + +JxlDecoderStatus ProcessInputIgnoreBoxes(JxlDecoder* dec) { + JxlDecoderStatus status = JXL_DEC_BOX; + while (status == JXL_DEC_BOX) { + status = JxlDecoderProcessInput(dec); + } + return status; +} + +// Decodes one-shot with the API for non-streaming decoding tests. +std::vector DecodeWithAPI(JxlDecoder* dec, + Span compressed, + const JxlPixelFormat& format, + bool use_callback, bool set_buffer_early, + bool use_resizable_runner, + bool require_boxes, bool expect_success, + PaddedBytes* icc = nullptr) { + JxlThreadParallelRunnerPtr runner_fixed; + JxlResizableParallelRunnerPtr runner_resizable; + JxlParallelRunner runner_fn; + void* runner; + + if (use_resizable_runner) { + runner_resizable = JxlResizableParallelRunnerMake(nullptr); + runner = runner_resizable.get(); + runner_fn = JxlResizableParallelRunner; + } else { + size_t hw_threads = JxlThreadParallelRunnerDefaultNumWorkerThreads(); + runner_fixed = + JxlThreadParallelRunnerMake(nullptr, std::min(hw_threads, 16)); + runner = runner_fixed.get(); + runner_fn = JxlThreadParallelRunner; + } + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, runner_fn, runner)); + + auto process_input = + require_boxes ? ProcessInputIgnoreBoxes : JxlDecoderProcessInput; + + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | (set_buffer_early ? JXL_DEC_FRAME : 0) | + JXL_DEC_PREVIEW_IMAGE | JXL_DEC_FULL_IMAGE | + (require_boxes ? JXL_DEC_BOX : 0) | + (icc != nullptr ? JXL_DEC_COLOR_ENCODING : 0))); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, compressed.data(), compressed.size())); + EXPECT_EQ(JXL_DEC_BASIC_INFO, process_input(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + if (use_resizable_runner) { + JxlResizableParallelRunnerSetThreads( + runner, + JxlResizableParallelRunnerSuggestThreads(info.xsize, info.ysize)); + } + + std::vector pixels(buffer_size); + size_t bytes_per_pixel = format.num_channels * + test::GetDataBits(format.data_type) / + jxl::kBitsPerByte; + size_t stride = bytes_per_pixel * info.xsize; + if (format.align > 1) { + stride = jxl::DivCeil(stride, format.align) * format.align; + } + auto callback = [&](size_t x, size_t y, size_t num_pixels, + const void* pixels_row) { + memcpy(pixels.data() + stride * y + bytes_per_pixel * x, pixels_row, + num_pixels * bytes_per_pixel); + }; + + JxlDecoderStatus status = process_input(dec); + + if (status == JXL_DEC_COLOR_ENCODING) { + size_t icc_size = 0; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &icc_size)); + icc->resize(icc_size); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, + icc->data(), icc_size)); + + status = process_input(dec); + } + + std::vector preview; + if (status == JXL_DEC_NEED_PREVIEW_OUT_BUFFER) { + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size)); + preview.resize(buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetPreviewOutBuffer(dec, &format, preview.data(), + preview.size())); + EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, process_input(dec)); + + status = process_input(dec); + } + + if (set_buffer_early) { + EXPECT_EQ(JXL_DEC_FRAME, status); + } else { + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, status); + } + + if (use_callback) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutCallback( + dec, &format, + [](void* opaque, size_t x, size_t y, size_t xsize, + const void* pixels_row) { + auto cb = static_cast(opaque); + (*cb)(x, y, xsize, pixels_row); + }, + /*opaque=*/&callback)); + } else { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + } + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, process_input(dec)); + + // After the full image was output, JxlDecoderProcessInput should return + // success to indicate all is done, unless we requested boxes and the last + // box was not a terminal unbounded box, in which case it should ask for + // more input. + JxlDecoderStatus expected_status = + expect_success ? JXL_DEC_SUCCESS : JXL_DEC_NEED_MORE_INPUT; + EXPECT_EQ(expected_status, process_input(dec)); + + return pixels; +} + +// Decodes one-shot with the API for non-streaming decoding tests. +std::vector DecodeWithAPI(Span compressed, + const JxlPixelFormat& format, + bool use_callback, bool set_buffer_early, + bool use_resizable_runner, + bool require_boxes, bool expect_success) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + std::vector pixels = + DecodeWithAPI(dec, compressed, format, use_callback, set_buffer_early, + use_resizable_runner, require_boxes, expect_success); + JxlDecoderDestroy(dec); + return pixels; +} + +} // namespace +} // namespace jxl + +//////////////////////////////////////////////////////////////////////////////// + +TEST(DecodeTest, JxlSignatureCheckTest) { + std::vector>> tests = { + // No JPEGXL header starts with 'a'. + {JXL_SIG_INVALID, {'a'}}, + {JXL_SIG_INVALID, {'a', 'b', 'c', 'd', 'e', 'f'}}, + + // Empty file is not enough bytes. + {JXL_SIG_NOT_ENOUGH_BYTES, {}}, + + // JPEGXL headers. + {JXL_SIG_NOT_ENOUGH_BYTES, {0xff}}, // Part of a signature. + {JXL_SIG_INVALID, {0xff, 0xD8}}, // JPEG-1 + {JXL_SIG_CODESTREAM, {0xff, 0x0a}}, + + // JPEGXL container file. + {JXL_SIG_CONTAINER, + {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87, 0xA}}, + // Ending with invalid byte. + {JXL_SIG_INVALID, {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87, 0}}, + // Part of signature. + {JXL_SIG_NOT_ENOUGH_BYTES, + {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87}}, + {JXL_SIG_NOT_ENOUGH_BYTES, {0}}, + }; + for (const auto& test : tests) { + EXPECT_EQ(test.first, + JxlSignatureCheck(test.second.data(), test.second.size())) + << "Where test data is " << ::testing::PrintToString(test.second); + } +} + +TEST(DecodeTest, DefaultAllocTest) { + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_NE(nullptr, dec); + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, CustomAllocTest) { + struct CalledCounters { + int allocs = 0; + int frees = 0; + } counters; + + JxlMemoryManager mm; + mm.opaque = &counters; + mm.alloc = [](void* opaque, size_t size) { + reinterpret_cast(opaque)->allocs++; + return malloc(size); + }; + mm.free = [](void* opaque, void* address) { + reinterpret_cast(opaque)->frees++; + free(address); + }; + + JxlDecoder* dec = JxlDecoderCreate(&mm); + EXPECT_NE(nullptr, dec); + EXPECT_LE(1, counters.allocs); + EXPECT_EQ(0, counters.frees); + JxlDecoderDestroy(dec); + EXPECT_LE(1, counters.frees); +} + +// TODO(lode): add multi-threaded test when multithreaded pixel decoding from +// API is implemented. +TEST(DecodeTest, DefaultParallelRunnerTest) { + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_NE(nullptr, dec); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, nullptr, nullptr)); + JxlDecoderDestroy(dec); +} + +// Creates the header of a JPEG XL file with various custom parameters for +// testing. +// xsize, ysize: image dimensions to store in the SizeHeader, max 512. +// bits_per_sample, orientation: a selection of header parameters to test with. +// orientation: image orientation to set in the metadata +// alpha_bits: if non-0, alpha extra channel bits to set in the metadata. Also +// gives the alpha channel the name "alpha_test" +// have_container: add box container format around the codestream. +// metadata_default: if true, ImageMetadata is set to default and +// bits_per_sample, orientation and alpha_bits are ignored. +// insert_box: insert an extra box before the codestream box, making the header +// farther away from the front than is ideal. Only used if have_container. +std::vector GetTestHeader(size_t xsize, size_t ysize, + size_t bits_per_sample, size_t orientation, + size_t alpha_bits, bool xyb_encoded, + bool have_container, bool metadata_default, + bool insert_extra_box, + const jxl::PaddedBytes& icc_profile) { + jxl::BitWriter writer; + jxl::BitWriter::Allotment allotment(&writer, 65536); // Large enough + + if (have_container) { + const std::vector signature_box = {0, 0, 0, 0xc, 'J', 'X', + 'L', ' ', 0xd, 0xa, 0x87, 0xa}; + const std::vector filetype_box = { + 0, 0, 0, 0x14, 'f', 't', 'y', 'p', 'j', 'x', + 'l', ' ', 0, 0, 0, 0, 'j', 'x', 'l', ' '}; + const std::vector extra_box_header = {0, 0, 0, 0xff, + 't', 'e', 's', 't'}; + // Beginning of codestream box, with an arbitrary size certainly large + // enough to contain the header + const std::vector codestream_box_header = {0, 0, 0, 0xff, + 'j', 'x', 'l', 'c'}; + + for (size_t i = 0; i < signature_box.size(); i++) { + writer.Write(8, signature_box[i]); + } + for (size_t i = 0; i < filetype_box.size(); i++) { + writer.Write(8, filetype_box[i]); + } + if (insert_extra_box) { + for (size_t i = 0; i < extra_box_header.size(); i++) { + writer.Write(8, extra_box_header[i]); + } + for (size_t i = 0; i < 255 - 8; i++) { + writer.Write(8, 0); + } + } + for (size_t i = 0; i < codestream_box_header.size(); i++) { + writer.Write(8, codestream_box_header[i]); + } + } + + // JXL signature + writer.Write(8, 0xff); + writer.Write(8, 0x0a); + + // SizeHeader + jxl::CodecMetadata metadata; + EXPECT_TRUE(metadata.size.Set(xsize, ysize)); + EXPECT_TRUE(WriteSizeHeader(metadata.size, &writer, 0, nullptr)); + + if (!metadata_default) { + metadata.m.SetUintSamples(bits_per_sample); + metadata.m.orientation = orientation; + metadata.m.SetAlphaBits(alpha_bits); + metadata.m.xyb_encoded = xyb_encoded; + if (alpha_bits != 0) { + metadata.m.extra_channel_info[0].name = "alpha_test"; + } + } + + if (!icc_profile.empty()) { + jxl::PaddedBytes copy = icc_profile; + EXPECT_TRUE(metadata.m.color_encoding.SetICC(std::move(copy))); + } + + EXPECT_TRUE(jxl::Bundle::Write(metadata.m, &writer, 0, nullptr)); + metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded; + EXPECT_TRUE(jxl::Bundle::Write(metadata.transform_data, &writer, 0, nullptr)); + + if (!icc_profile.empty()) { + EXPECT_TRUE(metadata.m.color_encoding.WantICC()); + EXPECT_TRUE(jxl::WriteICC(icc_profile, &writer, 0, nullptr)); + } + + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, 0, nullptr); + return std::vector( + writer.GetSpan().data(), + writer.GetSpan().data() + writer.GetSpan().size()); +} + +TEST(DecodeTest, BasicInfoTest) { + size_t xsize[2] = {50, 33}; + size_t ysize[2] = {50, 77}; + size_t bits_per_sample[2] = {8, 23}; + size_t orientation[2] = {3, 5}; + size_t alpha_bits[2] = {0, 8}; + JXL_BOOL have_container[2] = {0, 1}; + bool xyb_encoded = false; + + std::vector> test_samples; + // Test with direct codestream + test_samples.push_back(GetTestHeader( + xsize[0], ysize[0], bits_per_sample[0], orientation[0], alpha_bits[0], + xyb_encoded, have_container[0], /*metadata_default=*/false, + /*insert_extra_box=*/false, {})); + // Test with container and different parameters + test_samples.push_back(GetTestHeader( + xsize[1], ysize[1], bits_per_sample[1], orientation[1], alpha_bits[1], + xyb_encoded, have_container[1], /*metadata_default=*/false, + /*insert_extra_box=*/false, {})); + + for (size_t i = 0; i < test_samples.size(); ++i) { + const std::vector& data = test_samples[i]; + // Test decoding too small header first, until we reach the final byte. + for (size_t size = 0; size <= data.size(); ++size) { + // Test with a new decoder for each tested byte size. + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + const uint8_t* next_in = data.data(); + size_t avail_in = size; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + + JxlBasicInfo info; + bool have_basic_info = !JxlDecoderGetBasicInfo(dec, &info); + + if (size == data.size()) { + EXPECT_EQ(JXL_DEC_BASIC_INFO, status); + + // All header bytes given so the decoder must have the basic info. + EXPECT_EQ(true, have_basic_info); + EXPECT_EQ(have_container[i], info.have_container); + EXPECT_EQ(alpha_bits[i], info.alpha_bits); + // Orientations 5..8 swap the dimensions + if (orientation[i] >= 5) { + EXPECT_EQ(xsize[i], info.ysize); + EXPECT_EQ(ysize[i], info.xsize); + } else { + EXPECT_EQ(xsize[i], info.xsize); + EXPECT_EQ(ysize[i], info.ysize); + } + // The API should set the orientation to identity by default since it + // already applies the transformation internally by default. + EXPECT_EQ(1u, info.orientation); + + EXPECT_EQ(3u, info.num_color_channels); + + if (alpha_bits[i] != 0) { + // Expect an extra channel + EXPECT_EQ(1u, info.num_extra_channels); + JxlExtraChannelInfo extra; + EXPECT_EQ(0, JxlDecoderGetExtraChannelInfo(dec, 0, &extra)); + EXPECT_EQ(alpha_bits[i], extra.bits_per_sample); + EXPECT_EQ(JXL_CHANNEL_ALPHA, extra.type); + EXPECT_EQ(0, extra.alpha_premultiplied); + // Verify the name "alpha_test" given to the alpha channel + EXPECT_EQ(10u, extra.name_length); + char name[11]; + EXPECT_EQ(0, + JxlDecoderGetExtraChannelName(dec, 0, name, sizeof(name))); + EXPECT_EQ(std::string("alpha_test"), std::string(name)); + } else { + EXPECT_EQ(0u, info.num_extra_channels); + } + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + } else { + // If we did not give the full header, the basic info should not be + // available. Allow a few bytes of slack due to some bits for default + // opsinmatrix/extension bits. + if (size + 2 < data.size()) { + EXPECT_EQ(false, have_basic_info); + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, status); + } + } + + // Test that decoder doesn't allow setting a setting required at beginning + // unless it's reset + EXPECT_EQ(JXL_DEC_ERROR, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + JxlDecoderReset(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + + JxlDecoderDestroy(dec); + } + } +} + +TEST(DecodeTest, BufferSizeTest) { + size_t xsize = 33; + size_t ysize = 77; + size_t bits_per_sample = 8; + size_t orientation = 1; + size_t alpha_bits = 8; + bool have_container = false; + bool xyb_encoded = false; + + std::vector header = + GetTestHeader(xsize, ysize, bits_per_sample, orientation, alpha_bits, + xyb_encoded, have_container, /*metadata_default=*/false, + /*insert_extra_box=*/false, {}); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + const uint8_t* next_in = header.data(); + size_t avail_in = header.size(); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + EXPECT_EQ(JXL_DEC_BASIC_INFO, status); + + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + + JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + size_t image_out_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &image_out_size)); + EXPECT_EQ(xsize * ysize * 4, image_out_size); + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, BasicInfoSizeHintTest) { + // Test on a file where the size hint is too small initially due to inserting + // a box before the codestream (something that is normally not recommended) + size_t xsize = 50; + size_t ysize = 50; + size_t bits_per_sample = 16; + size_t orientation = 1; + size_t alpha_bits = 0; + bool xyb_encoded = false; + std::vector data = GetTestHeader( + xsize, ysize, bits_per_sample, orientation, alpha_bits, xyb_encoded, + /*have_container=*/true, /*metadata_default=*/false, + /*insert_extra_box=*/true, {}); + + JxlDecoderStatus status; + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + + size_t hint0 = JxlDecoderSizeHintBasicInfo(dec); + // Test that the test works as intended: we construct a file on purpose to + // be larger than the first hint by having that extra box. + EXPECT_LT(hint0, data.size()); + const uint8_t* next_in = data.data(); + // Do as if we have only as many bytes as indicated by the hint available + size_t avail_in = std::min(hint0, data.size()); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + status = JxlDecoderProcessInput(dec); + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, status); + // Basic info cannot be available yet due to the extra inserted box. + EXPECT_EQ(false, !JxlDecoderGetBasicInfo(dec, nullptr)); + + size_t num_read = avail_in - JxlDecoderReleaseInput(dec); + EXPECT_LT(num_read, data.size()); + + size_t hint1 = JxlDecoderSizeHintBasicInfo(dec); + // The hint must be larger than the previous hint (taking already processed + // bytes into account, the hint is a hint for the next avail_in) since the + // decoder now knows there is a box in between. + EXPECT_GT(hint1 + num_read, hint0); + avail_in = std::min(hint1, data.size() - num_read); + next_in += num_read; + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + status = JxlDecoderProcessInput(dec); + EXPECT_EQ(JXL_DEC_BASIC_INFO, status); + JxlBasicInfo info; + // We should have the basic info now, since we only added one box in-between, + // and the decoder should have known its size, its implementation can return + // a correct hint. + EXPECT_EQ(true, !JxlDecoderGetBasicInfo(dec, &info)); + + // Also test if the basic info is correct. + EXPECT_EQ(1, info.have_container); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + EXPECT_EQ(orientation, info.orientation); + EXPECT_EQ(bits_per_sample, info.bits_per_sample); + + JxlDecoderDestroy(dec); +} + +std::vector GetIccTestHeader(const jxl::PaddedBytes& icc_profile, + bool xyb_encoded) { + size_t xsize = 50; + size_t ysize = 50; + size_t bits_per_sample = 16; + size_t orientation = 1; + size_t alpha_bits = 0; + return GetTestHeader(xsize, ysize, bits_per_sample, orientation, alpha_bits, + xyb_encoded, + /*have_container=*/false, /*metadata_default=*/false, + /*insert_extra_box=*/false, icc_profile); +} + +// Tests the case where pixels and metadata ICC profile are the same +TEST(DecodeTest, IccProfileTestOriginal) { + jxl::PaddedBytes icc_profile = GetIccTestProfile(); + bool xyb_encoded = false; + std::vector data = GetIccTestHeader(icc_profile, xyb_encoded); + JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size())); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + + // Expect the opposite of xyb_encoded for uses_original_profile + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(JXL_TRUE, info.uses_original_profile); + + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + + // the encoded color profile expected to be not available, since the image + // has an ICC profile instead + EXPECT_EQ(JXL_DEC_ERROR, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr)); + + size_t dec_profile_size; + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size)); + + // Check that can get return status with NULL size + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr)); + + // The profiles must be equal. This requires they have equal size, and if + // they do, we can get the profile and compare the contents. + EXPECT_EQ(icc_profile.size(), dec_profile_size); + if (icc_profile.size() == dec_profile_size) { + jxl::PaddedBytes icc_profile2(icc_profile.size()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsICCProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, + icc_profile2.data(), icc_profile2.size())); + EXPECT_EQ(icc_profile, icc_profile2); + } + + // the data is not xyb_encoded, so same result expected for the pixel data + // color profile + EXPECT_EQ(JXL_DEC_ERROR, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, nullptr)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, + &dec_profile_size)); + EXPECT_EQ(icc_profile.size(), dec_profile_size); + + JxlDecoderDestroy(dec); +} + +// Tests the case where pixels and metadata ICC profile are different +TEST(DecodeTest, IccProfileTestXybEncoded) { + jxl::PaddedBytes icc_profile = GetIccTestProfile(); + bool xyb_encoded = true; + std::vector data = GetIccTestHeader(icc_profile, xyb_encoded); + JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + JxlPixelFormat format_int = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size())); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + + // Expect the opposite of xyb_encoded for uses_original_profile + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(JXL_FALSE, info.uses_original_profile); + + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + + // the encoded color profile expected to be not available, since the image + // has an ICC profile instead + EXPECT_EQ(JXL_DEC_ERROR, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr)); + + // Check that can get return status with NULL size + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr)); + + size_t dec_profile_size; + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size)); + + // The profiles must be equal. This requires they have equal size, and if + // they do, we can get the profile and compare the contents. + EXPECT_EQ(icc_profile.size(), dec_profile_size); + if (icc_profile.size() == dec_profile_size) { + jxl::PaddedBytes icc_profile2(icc_profile.size()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsICCProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, + icc_profile2.data(), icc_profile2.size())); + EXPECT_EQ(icc_profile, icc_profile2); + } + + // Data is xyb_encoded, so the data profile is a different profile, encoded + // as structured profile. + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, nullptr)); + JxlColorEncoding pixel_encoding; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding)); + EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries); + // The API returns LINEAR by default when the colorspace cannot be represented + // by enum values. + EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function); + + // Test the same but with integer format. + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile( + dec, &format_int, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding)); + EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries); + EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function); + + // Test after setting the preferred color profile to non-linear sRGB: + // for XYB images with ICC profile, this setting is expected to take effect. + jxl::ColorEncoding temp_jxl_srgb = jxl::ColorEncoding::SRGB(false); + JxlColorEncoding pixel_encoding_srgb; + ConvertInternalToExternalColorEncoding(temp_jxl_srgb, &pixel_encoding_srgb); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetPreferredColorProfile(dec, &pixel_encoding_srgb)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding)); + EXPECT_EQ(JXL_TRANSFER_FUNCTION_SRGB, pixel_encoding.transfer_function); + + // The decoder can also output this as a generated ICC profile anyway, and + // we're certain that it will differ from the above defined profile since + // the sRGB data should not have swapped R/G/B primaries. + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, + &dec_profile_size)); + // We don't need to dictate exactly what size the generated ICC profile + // must be (since there are many ways to represent the same color space), + // but it should not be zero. + EXPECT_NE(0u, dec_profile_size); + jxl::PaddedBytes icc_profile2(dec_profile_size); + if (0 != dec_profile_size) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, + icc_profile2.data(), icc_profile2.size())); + // expected not equal + EXPECT_NE(icc_profile, icc_profile2); + } + + // Test setting another different preferred profile, to verify that the + // returned JXL_COLOR_PROFILE_TARGET_DATA ICC profile is correctly + // updated. + + jxl::ColorEncoding temp_jxl_linear = jxl::ColorEncoding::LinearSRGB(false); + JxlColorEncoding pixel_encoding_linear; + ConvertInternalToExternalColorEncoding(temp_jxl_linear, + &pixel_encoding_linear); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetPreferredColorProfile(dec, &pixel_encoding_linear)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding)); + EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, + &dec_profile_size)); + EXPECT_NE(0u, dec_profile_size); + jxl::PaddedBytes icc_profile3(dec_profile_size); + if (0 != dec_profile_size) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, + icc_profile3.data(), icc_profile3.size())); + // expected not equal to the previously set preferred profile. + EXPECT_NE(icc_profile2, icc_profile3); + } + + JxlDecoderDestroy(dec); +} + +// Test decoding ICC from partial files byte for byte. +// This test must pass also if JXL_CRASH_ON_ERROR is enabled, that is, the +// decoding of the ANS histogram and stream of the encoded ICC profile must also +// handle the case of not enough input bytes with StatusCode::kNotEnoughBytes +// rather than fatal error status codes. +TEST(DecodeTest, ICCPartialTest) { + jxl::PaddedBytes icc_profile = GetIccTestProfile(); + std::vector data = GetIccTestHeader(icc_profile, false); + JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + const uint8_t* next_in = data.data(); + size_t avail_in = 0; + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING)); + + bool seen_basic_info = false; + bool seen_color_encoding = false; + size_t total_size = 0; + + for (;;) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + size_t remaining = JxlDecoderReleaseInput(dec); + EXPECT_LE(remaining, avail_in); + next_in += avail_in - remaining; + avail_in = remaining; + if (status == JXL_DEC_NEED_MORE_INPUT) { + if (total_size >= data.size()) { + // End of partial codestream with codestrema headers and ICC profile + // reached, it should not require more input since full image is not + // requested + FAIL(); + break; + } + size_t increment = 1; + if (total_size + increment > data.size()) { + increment = data.size() - total_size; + } + total_size += increment; + avail_in += increment; + } else if (status == JXL_DEC_BASIC_INFO) { + EXPECT_FALSE(seen_basic_info); + seen_basic_info = true; + } else if (status == JXL_DEC_COLOR_ENCODING) { + EXPECT_TRUE(seen_basic_info); + EXPECT_FALSE(seen_color_encoding); + seen_color_encoding = true; + + // Sanity check that the ICC profile was decoded correctly + size_t dec_profile_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize(dec, &format, + JXL_COLOR_PROFILE_TARGET_ORIGINAL, + &dec_profile_size)); + EXPECT_EQ(icc_profile.size(), dec_profile_size); + + } else if (status == JXL_DEC_SUCCESS) { + EXPECT_TRUE(seen_color_encoding); + break; + } else { + // We do not expect any other events or errors + FAIL(); + break; + } + } + + EXPECT_TRUE(seen_basic_info); + EXPECT_TRUE(seen_color_encoding); + + JxlDecoderDestroy(dec); +} + +struct PixelTestConfig { + // Input image definition. + bool grayscale; + bool include_alpha; + size_t xsize; + size_t ysize; + jxl::PreviewMode preview_mode; + bool add_intrinsic_size; + // Output format. + JxlEndianness endianness; + JxlDataType data_type; + uint32_t output_channels; + // Container options. + CodeStreamBoxFormat add_container; + // Decoding mode. + bool use_callback; + bool set_buffer_early; + bool use_resizable_runner; + // Exif orientation, 1-8 + JxlOrientation orientation; + bool keep_orientation; + size_t upsampling; +}; + +class DecodeTestParam : public ::testing::TestWithParam {}; + +TEST_P(DecodeTestParam, PixelTest) { + PixelTestConfig config = GetParam(); + JxlDecoder* dec = JxlDecoderCreate(NULL); + + if (config.keep_orientation) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetKeepOrientation(dec, JXL_TRUE)); + } + + size_t num_pixels = config.xsize * config.ysize; + uint32_t orig_channels = + (config.grayscale ? 1 : 3) + (config.include_alpha ? 1 : 0); + std::vector pixels = + jxl::test::GetSomeTestImage(config.xsize, config.ysize, orig_channels, 0); + JxlPixelFormat format_orig = {orig_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, + 0}; + jxl::TestCodestreamParams params; + // Lossless to verify pixels exactly after roundtrip. + params.cparams.SetLossless(); + params.cparams.speed_tier = jxl::SpeedTier::kThunder; + params.cparams.resampling = config.upsampling; + params.cparams.ec_resampling = config.upsampling; + params.box_format = config.add_container; + params.orientation = config.orientation; + params.preview_mode = config.preview_mode; + params.add_intrinsic_size = config.add_intrinsic_size; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), config.xsize, + config.ysize, orig_channels, params); + + JxlPixelFormat format = {config.output_channels, config.data_type, + config.endianness, 0}; + + bool swap_xy = !config.keep_orientation && (config.orientation > 4); + size_t xsize = swap_xy ? config.ysize : config.xsize; + size_t ysize = swap_xy ? config.xsize : config.ysize; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, config.use_callback, config.set_buffer_early, + config.use_resizable_runner, /*require_boxes=*/false, + /*expect_success=*/true); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * config.output_channels * + jxl::test::GetDataBits(config.data_type) / jxl::kBitsPerByte, + pixels2.size()); + + // If an orientation transformation is expected, to compare the pixels, also + // apply this transformation to the original pixels. ConvertToExternal is + // used to achieve this, with a temporary conversion to CodecInOut and back. + if (config.orientation > 1 && !config.keep_orientation) { + jxl::Span bytes(pixels.data(), pixels.size()); + jxl::ColorEncoding color_encoding = + jxl::ColorEncoding::SRGB(config.grayscale); + + jxl::CodecInOut io; + if (config.include_alpha) io.metadata.m.SetAlphaBits(16); + io.metadata.m.color_encoding = color_encoding; + io.SetSize(config.xsize, config.ysize); + + EXPECT_TRUE(ConvertFromExternal(bytes, config.xsize, config.ysize, + color_encoding, 16, format_orig, nullptr, + &io.Main())); + + for (size_t i = 0; i < pixels.size(); i++) pixels[i] = 0; + EXPECT_TRUE(ConvertToExternal( + io.Main(), 16, + /*float_out=*/false, orig_channels, JXL_BIG_ENDIAN, + xsize * 2 * orig_channels, nullptr, pixels.data(), pixels.size(), + /*out_callback=*/{}, + static_cast(config.orientation))); + } + if (config.upsampling == 1) { + EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize, + ysize, format_orig, format)); + } else { + // resampling is of course not lossless, so as a rough check: + // count pixels that are more than off-by-25 in the 8-bit value of one of + // the channels + EXPECT_LE( + jxl::test::ComparePixels( + pixels.data(), pixels2.data(), xsize, ysize, format_orig, format, + 50.0 * (config.data_type == JXL_TYPE_UINT8 ? 1.0 : 256.0)), + 300u); + } + + JxlDecoderDestroy(dec); +} + +std::vector GeneratePixelTests() { + std::vector all_tests; + struct ChannelInfo { + bool grayscale; + bool include_alpha; + size_t output_channels; + }; + ChannelInfo ch_info[] = { + {false, true, 4}, // RGBA -> RGBA + {true, false, 1}, // G -> G + {true, true, 1}, // GA -> G + {true, true, 2}, // GA -> GA + {false, false, 3}, // RGB -> RGB + {false, true, 3}, // RGBA -> RGB + {false, false, 4}, // RGB -> RGBA + }; + + struct OutputFormat { + JxlEndianness endianness; + JxlDataType data_type; + }; + OutputFormat out_formats[] = { + {JXL_NATIVE_ENDIAN, JXL_TYPE_UINT8}, + {JXL_LITTLE_ENDIAN, JXL_TYPE_UINT16}, + {JXL_BIG_ENDIAN, JXL_TYPE_UINT16}, + {JXL_NATIVE_ENDIAN, JXL_TYPE_FLOAT16}, + {JXL_LITTLE_ENDIAN, JXL_TYPE_FLOAT}, + {JXL_BIG_ENDIAN, JXL_TYPE_FLOAT}, + }; + + auto make_test = [&](ChannelInfo ch, size_t xsize, size_t ysize, + jxl::PreviewMode preview_mode, bool intrinsic_size, + CodeStreamBoxFormat box, JxlOrientation orientation, + bool keep_orientation, OutputFormat format, + bool use_callback, bool set_buffer_early, + bool resizable_runner, size_t upsampling) { + PixelTestConfig c; + c.grayscale = ch.grayscale; + c.include_alpha = ch.include_alpha; + c.preview_mode = preview_mode; + c.add_intrinsic_size = intrinsic_size; + c.xsize = xsize; + c.ysize = ysize; + c.add_container = (CodeStreamBoxFormat)box; + c.output_channels = ch.output_channels; + c.data_type = format.data_type; + c.endianness = format.endianness; + c.use_callback = use_callback; + c.set_buffer_early = set_buffer_early; + c.use_resizable_runner = resizable_runner; + c.orientation = orientation; + c.keep_orientation = keep_orientation; + c.upsampling = upsampling; + all_tests.push_back(c); + }; + + // Test output formats and methods. + for (ChannelInfo ch : ch_info) { + for (int use_callback = 0; use_callback <= 1; use_callback++) { + for (size_t upsampling : {1, 2, 4, 8}) { + for (OutputFormat fmt : out_formats) { + make_test(ch, 301, 33, jxl::kNoPreview, + /*add_intrinsic_size=*/false, + CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY, + /*keep_orientation=*/false, fmt, use_callback, + /*set_buffer_early=*/false, /*resizable_runner=*/false, + upsampling); + } + } + } + } + // Test codestream formats. + for (size_t box = 1; box < kCSBF_NUM_ENTRIES; ++box) { + make_test(ch_info[0], 77, 33, jxl::kNoPreview, + /*add_intrinsic_size=*/false, (CodeStreamBoxFormat)box, + JXL_ORIENT_IDENTITY, + /*keep_orientation=*/false, out_formats[0], + /*use_callback=*/false, + /*set_buffer_early=*/false, /*resizable_runner=*/false, 1); + } + // Test previews. + for (int preview_mode = 0; preview_mode < jxl::kNumPreviewModes; + preview_mode++) { + make_test(ch_info[0], 77, 33, (jxl::PreviewMode)preview_mode, + /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None, + JXL_ORIENT_IDENTITY, + /*keep_orientation=*/false, out_formats[0], + /*use_callback=*/false, /*set_buffer_early=*/false, + /*resizable_runner=*/false, 1); + } + // Test intrinsic sizes. + for (int add_intrinsic_size = 0; add_intrinsic_size <= 1; + add_intrinsic_size++) { + make_test(ch_info[0], 55, 34, jxl::kNoPreview, add_intrinsic_size, + CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY, + /*keep_orientation=*/false, out_formats[0], + /*use_callback=*/false, /*set_buffer_early=*/false, + /*resizable_runner=*/false, 1); + } + // Test setting buffers early. + make_test(ch_info[0], 300, 33, jxl::kNoPreview, + /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None, + JXL_ORIENT_IDENTITY, + /*keep_orientation=*/false, out_formats[0], + /*use_callback=*/false, /*set_buffer_early=*/true, + /*resizable_runner=*/false, 1); + + // Test using the resizable runner + for (size_t i = 0; i < 4; i++) { + make_test(ch_info[0], 300 << i, 33 << i, jxl::kNoPreview, + /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None, + JXL_ORIENT_IDENTITY, + /*keep_orientation=*/false, out_formats[0], + /*use_callback=*/false, /*set_buffer_early=*/false, + /*resizable_runner=*/true, 1); + } + + // Test orientations. + for (int orientation = 2; orientation <= 8; ++orientation) { + for (int keep_orientation = 0; keep_orientation <= 1; keep_orientation++) { + for (int use_callback = 0; use_callback <= 1; use_callback++) { + for (ChannelInfo ch : ch_info) { + for (OutputFormat fmt : out_formats) { + make_test(ch, 280, 12, jxl::kNoPreview, + /*add_intrinsic_size=*/false, + CodeStreamBoxFormat::kCSBF_None, + static_cast(orientation), + /*keep_orientation=*/keep_orientation, fmt, + /*use_callback=*/use_callback, /*set_buffer_early=*/true, + /*resizable_runner=*/false, 1); + } + } + } + } + } + + return all_tests; +} + +std::ostream& operator<<(std::ostream& os, const PixelTestConfig& c) { + os << c.xsize << "x" << c.ysize; + const char* colors[] = {"", "G", "GA", "RGB", "RGBA"}; + os << colors[(c.grayscale ? 1 : 3) + (c.include_alpha ? 1 : 0)]; + os << "to"; + os << colors[c.output_channels]; + switch (c.data_type) { + case JXL_TYPE_UINT8: + os << "u8"; + break; + case JXL_TYPE_UINT16: + os << "u16"; + break; + case JXL_TYPE_FLOAT: + os << "f32"; + break; + case JXL_TYPE_FLOAT16: + os << "f16"; + break; + default: + JXL_ASSERT(false); + }; + if (jxl::test::GetDataBits(c.data_type) > jxl::kBitsPerByte) { + if (c.endianness == JXL_NATIVE_ENDIAN) { + // add nothing + } else if (c.endianness == JXL_BIG_ENDIAN) { + os << "BE"; + } else if (c.endianness == JXL_LITTLE_ENDIAN) { + os << "LE"; + } + } + if (c.add_container != CodeStreamBoxFormat::kCSBF_None) { + os << "Box"; + os << (size_t)c.add_container; + } + if (c.preview_mode == jxl::kSmallPreview) os << "Preview"; + if (c.preview_mode == jxl::kBigPreview) os << "BigPreview"; + if (c.add_intrinsic_size) os << "IntrinicSize"; + if (c.use_callback) os << "Callback"; + if (c.set_buffer_early) os << "EarlyBuffer"; + if (c.use_resizable_runner) os << "ResizableRunner"; + if (c.orientation != 1) os << "O" << c.orientation; + if (c.keep_orientation) os << "Keep"; + if (c.upsampling > 1) os << "x" << c.upsampling; + return os; +} + +std::string PixelTestDescription( + const testing::TestParamInfo& info) { + std::stringstream name; + name << info.param; + return name.str(); +} + +JXL_GTEST_INSTANTIATE_TEST_SUITE_P(DecodeTest, DecodeTestParam, + testing::ValuesIn(GeneratePixelTests()), + PixelTestDescription); + +TEST(DecodeTest, PixelTestWithICCProfileLossless) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + + size_t xsize = 123, ysize = 77; + size_t num_pixels = xsize * ysize; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::TestCodestreamParams params; + // Lossless to verify pixels exactly after roundtrip. + params.cparams.SetLossless(); + params.cparams.speed_tier = jxl::SpeedTier::kThunder; + params.add_icc_profile = true; + // For variation: some have container and no preview, others have preview + // and no container. + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 4, + params); + + for (uint32_t channels = 3; channels <= 4; ++channels) { + { + JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/false, /*set_buffer_early=*/false, + /*use_resizable_runner=*/false, /*require_boxes=*/false, + /*expect_success=*/true); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels, pixels2.size()); + EXPECT_EQ(0u, + jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize, + ysize, format_orig, format)); + } + { + JxlPixelFormat format = {channels, JXL_TYPE_UINT16, JXL_LITTLE_ENDIAN, 0}; + + // Test with the container for one of the pixel formats. + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/true, /*set_buffer_early=*/true, + /*use_resizable_runner=*/false, /*require_boxes=*/false, + /*expect_success=*/true); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels * 2, pixels2.size()); + EXPECT_EQ(0u, + jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize, + ysize, format_orig, format)); + } + + { + JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/false, /*set_buffer_early=*/false, + /*use_resizable_runner=*/false, /*require_boxes=*/false, + /*expect_success=*/true); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels * 4, pixels2.size()); + EXPECT_EQ(0u, + jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize, + ysize, format_orig, format)); + } + } + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, PixelTestWithICCProfileLossy) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + + size_t xsize = 123, ysize = 77; + size_t num_pixels = xsize * ysize; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::TestCodestreamParams params; + params.add_icc_profile = true; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + params); + uint32_t channels = 3; + + JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + + jxl::PaddedBytes icc; + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/false, /*set_buffer_early=*/true, + /*use_resizable_runner=*/false, /*require_boxes=*/false, + /*expect_success=*/true, /*icc=*/&icc); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels * 4, pixels2.size()); + + // The input pixels use the profile matching GetIccTestProfile, since we set + // add_icc_profile for CreateTestJXLCodestream to true. + jxl::ColorEncoding color_encoding0; + EXPECT_TRUE(color_encoding0.SetICC(GetIccTestProfile())); + jxl::Span span0(pixels.data(), pixels.size()); + jxl::CodecInOut io0; + io0.SetSize(xsize, ysize); + EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0, + /*bits_per_sample=*/16, format_orig, + /*pool=*/nullptr, &io0.Main())); + + jxl::ColorEncoding color_encoding1; + EXPECT_TRUE(color_encoding1.SetICC(std::move(icc))); + jxl::Span span1(pixels2.data(), pixels2.size()); + jxl::CodecInOut io1; + io1.SetSize(xsize, ysize); + EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1, + /*bits_per_sample=*/32, format, + /*pool=*/nullptr, &io1.Main())); + + jxl::ButteraugliParams ba; + EXPECT_THAT(ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(), + /*distmap=*/nullptr, nullptr), + IsSlightlyBelow(0.79f)); + + JxlDecoderDestroy(dec); +} + +std::string ColorDescription(JxlColorEncoding c) { + jxl::ColorEncoding color_encoding; + EXPECT_TRUE(ConvertExternalToInternalColorEncoding(c, &color_encoding)); + return Description(color_encoding); +} + +std::string GetOrigProfile(JxlDecoder* dec) { + JxlColorEncoding c; + JxlColorProfileTarget target = JXL_COLOR_PROFILE_TARGET_ORIGINAL; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile(dec, nullptr, target, &c)); + return ColorDescription(c); +} + +std::string GetDataProfile(JxlDecoder* dec) { + JxlColorEncoding c; + JxlColorProfileTarget target = JXL_COLOR_PROFILE_TARGET_DATA; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile(dec, nullptr, target, &c)); + return ColorDescription(c); +} + +double ButteraugliDistance(size_t xsize, size_t ysize, + const std::vector& pixels_in, + const jxl::ColorEncoding& color_in, + float intensity_in, + const std::vector& pixels_out, + const jxl::ColorEncoding& color_out, + float intensity_out) { + jxl::CodecInOut in; + in.metadata.m.color_encoding = color_in; + in.metadata.m.SetIntensityTarget(intensity_in); + JxlPixelFormat format_in = {static_cast(color_in.Channels()), + JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + EXPECT_TRUE(jxl::ConvertFromExternal( + jxl::Span(pixels_in.data(), pixels_in.size()), xsize, + ysize, color_in, + /*bits_per_sample=*/16, format_in, + /*pool=*/nullptr, &in.Main())); + jxl::CodecInOut out; + out.metadata.m.color_encoding = color_out; + out.metadata.m.SetIntensityTarget(intensity_out); + JxlPixelFormat format_out = {static_cast(color_out.Channels()), + JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + EXPECT_TRUE(jxl::ConvertFromExternal( + jxl::Span(pixels_out.data(), pixels_out.size()), xsize, + ysize, color_out, + /*bits_per_sample=*/16, format_out, + /*pool=*/nullptr, &out.Main())); + return ButteraugliDistance(in.frames, out.frames, jxl::ButteraugliParams(), + jxl::GetJxlCms(), nullptr, nullptr); +} + +class DecodeAllEncodingsTest + : public ::testing::TestWithParam {}; +JXL_GTEST_INSTANTIATE_TEST_SUITE_P( + DecodeAllEncodingsTestInstantiation, DecodeAllEncodingsTest, + ::testing::ValuesIn(jxl::test::AllEncodings())); +TEST_P(DecodeAllEncodingsTest, PreserveOriginalProfileTest) { + size_t xsize = 123, ysize = 77; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + int events = JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_FULL_IMAGE; + const auto& cdesc = GetParam(); + jxl::ColorEncoding c_in = jxl::test::ColorEncodingFromDescriptor(cdesc); + if (c_in.rendering_intent != jxl::RenderingIntent::kRelative) return; + std::string color_space_in = Description(c_in); + float intensity_in = c_in.tf.IsPQ() ? 10000 : 255; + printf("Testing input color space %s\n", color_space_in.c_str()); + jxl::TestCodestreamParams params; + params.color_space = color_space_in; + params.intensity_target = intensity_in; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + params); + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size())); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + EXPECT_FALSE(info.uses_original_profile); + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + EXPECT_EQ(GetOrigProfile(dec), color_space_in); + EXPECT_EQ(GetDataProfile(dec), color_space_in); + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + std::vector out(pixels.size()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, out.data(), out.size())); + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + double dist = ButteraugliDistance(xsize, ysize, pixels, c_in, intensity_in, + out, c_in, intensity_in); + EXPECT_LT(dist, 1.29); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + JxlDecoderDestroy(dec); +} + +namespace { +void SetPreferredColorProfileTest( + const jxl::test::ColorEncodingDescriptor& from) { + size_t xsize = 123, ysize = 77; + int events = JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_FULL_IMAGE; + jxl::ColorEncoding c_in = jxl::test::ColorEncodingFromDescriptor(from); + if (c_in.rendering_intent != jxl::RenderingIntent::kRelative) return; + if (c_in.white_point != jxl::WhitePoint::kD65) return; + uint32_t num_channels = c_in.Channels(); + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::string color_space_in = Description(c_in); + float intensity_in = c_in.tf.IsPQ() ? 10000 : 255; + jxl::TestCodestreamParams params; + params.color_space = color_space_in; + params.intensity_target = intensity_in; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + auto all_encodings = jxl::test::AllEncodings(); + all_encodings.push_back( + {jxl::ColorSpace::kXYB, jxl::WhitePoint::kD65, jxl::Primaries::kCustom, + jxl::TransferFunction::kUnknown, jxl::RenderingIntent::kPerceptual}); + for (const auto& c1 : all_encodings) { + jxl::ColorEncoding c_out = jxl::test::ColorEncodingFromDescriptor(c1); + float intensity_out = intensity_in; + if (c_out.GetColorSpace() != jxl::ColorSpace::kXYB) { + if (c_out.rendering_intent != jxl::RenderingIntent::kRelative) { + continue; + } + if ((c_in.primaries == jxl::Primaries::k2100 && + c_out.primaries != jxl::Primaries::k2100) || + (c_in.primaries == jxl::Primaries::kP3 && + c_out.primaries == jxl::Primaries::kSRGB)) { + // Converting to a narrower gamut does not work without gammut mapping. + continue; + } + } + if (c_out.tf.IsHLG() && intensity_out > 300) { + // The Linear->HLG OOTF function at this intensity level can push + // saturated colors out of gamut, so we would need gamut mapping in + // this case too. + continue; + } + std::string color_space_out = Description(c_out); + if (color_space_in == color_space_out) continue; + printf("Testing input color space %s with output color space %s\n", + color_space_in.c_str(), color_space_out.c_str()); + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, data.data(), data.size())); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + EXPECT_FALSE(info.uses_original_profile); + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + EXPECT_EQ(GetOrigProfile(dec), color_space_in); + EXPECT_EQ(GetDataProfile(dec), color_space_in); + JxlColorEncoding encoding_out; + EXPECT_TRUE(jxl::ParseDescription(color_space_out, &encoding_out)); + if (c_out.GetColorSpace() == jxl::ColorSpace::kXYB && + (c_in.primaries != jxl::Primaries::kSRGB || c_in.tf.IsPQ())) { + EXPECT_EQ(JXL_DEC_ERROR, + JxlDecoderSetPreferredColorProfile(dec, &encoding_out)); + JxlDecoderDestroy(dec); + continue; + } + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetPreferredColorProfile(dec, &encoding_out)); + EXPECT_EQ(GetOrigProfile(dec), color_space_in); + EXPECT_EQ(GetDataProfile(dec), color_space_out); + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + size_t buffer_size; + JxlPixelFormat out_format = format; + out_format.num_channels = c_out.Channels(); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &out_format, &buffer_size)); + std::vector out(buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &out_format, out.data(), out.size())); + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + double dist = ButteraugliDistance(xsize, ysize, pixels, c_in, intensity_in, + out, c_out, intensity_out); + if (c_in.white_point == c_out.white_point) { + EXPECT_LT(dist, 1.29); + } else { + EXPECT_LT(dist, 4.0); + } + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + JxlDecoderDestroy(dec); + } +} +} // namespace + +TEST(DecodeTest, SetPreferredColorProfileTestFromGray) { + jxl::test::ColorEncodingDescriptor gray = { + jxl::ColorSpace::kGray, jxl::WhitePoint::kD65, jxl::Primaries::kSRGB, + jxl::TransferFunction::kSRGB, jxl::RenderingIntent::kRelative}; + SetPreferredColorProfileTest(gray); +} + +TEST_P(DecodeAllEncodingsTest, SetPreferredColorProfileTest) { + const auto& from = GetParam(); + SetPreferredColorProfileTest(from); +} + +// Tests the case of lossy sRGB image without alpha channel, decoded to RGB8 +// and to RGBA8 +TEST(DecodeTest, PixelTestOpaqueSrgbLossy) { + for (unsigned channels = 3; channels <= 4; channels++) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + + size_t xsize = 123, ysize = 77; + size_t num_pixels = xsize * ysize; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + jxl::TestCodestreamParams()); + + JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/true, /*set_buffer_early=*/false, + /*use_resizable_runner=*/false, /*require_boxes=*/false, + /*expect_success*/ true); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels, pixels2.size()); + + jxl::ColorEncoding color_encoding0 = jxl::ColorEncoding::SRGB(false); + jxl::Span span0(pixels.data(), pixels.size()); + jxl::CodecInOut io0; + io0.SetSize(xsize, ysize); + EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0, + /*bits_per_sample=*/16, format_orig, + /*pool=*/nullptr, &io0.Main())); + + jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false); + jxl::Span span1(pixels2.data(), pixels2.size()); + jxl::CodecInOut io1; + EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1, + /*bits_per_sample=*/8, format, + /*pool=*/nullptr, &io1.Main())); + + jxl::ButteraugliParams ba; + EXPECT_THAT( + ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(), + /*distmap=*/nullptr, nullptr), + IsSlightlyBelow(0.7f)); + + JxlDecoderDestroy(dec); + } +} + +// Opaque image with noise enabled, decoded to RGB8 and RGBA8. +TEST(DecodeTest, PixelTestOpaqueSrgbLossyNoise) { + for (unsigned channels = 3; channels <= 4; channels++) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + + size_t xsize = 512, ysize = 300; + size_t num_pixels = xsize * ysize; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::TestCodestreamParams params; + params.cparams.noise = jxl::Override::kOn; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + params); + + JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/false, /*set_buffer_early=*/true, + /*use_resizable_runner=*/false, /*require_boxes=*/false, + /*expect_success=*/true); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels, pixels2.size()); + + jxl::ColorEncoding color_encoding0 = jxl::ColorEncoding::SRGB(false); + jxl::Span span0(pixels.data(), pixels.size()); + jxl::CodecInOut io0; + io0.SetSize(xsize, ysize); + EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0, + /*bits_per_sample=*/16, format_orig, + /*pool=*/nullptr, &io0.Main())); + + jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false); + jxl::Span span1(pixels2.data(), pixels2.size()); + jxl::CodecInOut io1; + EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1, + /*bits_per_sample=*/8, format, + /*pool=*/nullptr, &io1.Main())); + + jxl::ButteraugliParams ba; + EXPECT_THAT( + ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(), + /*distmap=*/nullptr, nullptr), + IsSlightlyBelow(1.7f)); + + JxlDecoderDestroy(dec); + } +} + +TEST(DecodeTest, ProcessEmptyInputWithBoxes) { + size_t xsize = 123, ysize = 77; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + jxl::CompressParams cparams; + uint32_t channels = 3; + JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + jxl::TestCodestreamParams params; + params.box_format = (CodeStreamBoxFormat)i; + printf("Testing empty input with box format %d\n", (int)params.box_format); + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + params); + const int events = + JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_COLOR_ENCODING; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events)); + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, compressed.data(), compressed.size())); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + const size_t remaining = JxlDecoderReleaseInput(dec); + EXPECT_LE(remaining, compressed.size()); + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + JxlDecoderDestroy(dec); + } +} + +TEST(DecodeTest, ExtraBytesAfterCompressedStream) { + size_t xsize = 123, ysize = 77; + size_t num_pixels = xsize * ysize; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + jxl::CompressParams cparams; + for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) { + CodeStreamBoxFormat box_format = (CodeStreamBoxFormat)i; + if (box_format == kCSBF_Multi_Other_Zero_Terminated) continue; + printf("Testing with box format %d\n", (int)box_format); + size_t last_unknown_box_size = 0; + if (box_format == kCSBF_Single_Other) { + last_unknown_box_size = unk1_box_size + 8; + } else if (box_format == kCSBF_Multi_Other_Terminated) { + last_unknown_box_size = unk3_box_size + 8; + } else if (box_format == kCSBF_Multi_Last_Empty_Other) { + // If boxes are not required, the decoder won't consume the last empty + // jxlp box. + last_unknown_box_size = 12 + unk3_box_size + 8; + } + jxl::TestCodestreamParams params; + params.box_format = box_format; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + params); + // Add some more bytes after compressed data. + compressed.push_back(0); + compressed.push_back(1); + compressed.push_back(2); + JxlDecoder* dec = JxlDecoderCreate(NULL); + uint32_t channels = 3; + JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/false, /*set_buffer_early=*/true, + /*use_resizable_runner=*/false, /*require_boxes=*/false, + /*expect_success=*/true); + size_t unconsumed_bytes = JxlDecoderReleaseInput(dec); + EXPECT_EQ(last_unknown_box_size + 3, unconsumed_bytes); + EXPECT_EQ(num_pixels * channels * 4, pixels2.size()); + JxlDecoderDestroy(dec); + } +} + +TEST(DecodeTest, ExtraBytesAfterCompressedStreamRequireBoxes) { + size_t xsize = 123, ysize = 77; + size_t num_pixels = xsize * ysize; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + jxl::CompressParams cparams; + for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) { + CodeStreamBoxFormat box_format = (CodeStreamBoxFormat)i; + if (box_format == kCSBF_Multi_Other_Zero_Terminated) continue; + printf("Testing with box format %d\n", (int)box_format); + bool expect_success = (box_format == kCSBF_None || + box_format == kCSBF_Single_Zero_Terminated || + box_format == kCSBF_Multi_Zero_Terminated); + jxl::TestCodestreamParams params; + params.box_format = box_format; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + params); + // Add some more bytes after compressed data. + compressed.push_back(0); + compressed.push_back(1); + compressed.push_back(2); + JxlDecoder* dec = JxlDecoderCreate(NULL); + uint32_t channels = 3; + JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/false, /*set_buffer_early=*/true, + /*use_resizable_runner=*/false, /*require_boxes=*/true, expect_success); + size_t unconsumed_bytes = JxlDecoderReleaseInput(dec); + EXPECT_EQ(3, unconsumed_bytes); + EXPECT_EQ(num_pixels * channels * 4, pixels2.size()); + JxlDecoderDestroy(dec); + } +} + +TEST(DecodeTest, ConcatenatedCompressedStreams) { + size_t xsize = 123, ysize = 77; + size_t num_pixels = xsize * ysize; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + jxl::CompressParams cparams; + for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) { + CodeStreamBoxFormat first_box_format = (CodeStreamBoxFormat)i; + if (first_box_format == kCSBF_Multi_Other_Zero_Terminated) continue; + jxl::TestCodestreamParams params1; + params1.box_format = first_box_format; + jxl::PaddedBytes compressed1 = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + params1); + for (int j = 0; j < kCSBF_NUM_ENTRIES; ++j) { + CodeStreamBoxFormat second_box_format = (CodeStreamBoxFormat)j; + if (second_box_format == kCSBF_Multi_Other_Zero_Terminated) continue; + printf("Testing with box format pair %d, %d\n", (int)first_box_format, + (int)second_box_format); + jxl::TestCodestreamParams params2; + params2.box_format = second_box_format; + jxl::PaddedBytes compressed2 = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + 3, params2); + jxl::PaddedBytes concat; + concat.append(compressed1); + concat.append(compressed2); + uint32_t channels = 3; + JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + size_t remaining = concat.size(); + for (int part = 0; part < 2; ++part) { + printf(" Decoding part %d\n", part + 1); + JxlDecoder* dec = JxlDecoderCreate(NULL); + size_t pos = concat.size() - remaining; + bool expect_success = + (part == 0 || second_box_format == kCSBF_None || + second_box_format == kCSBF_Single_Zero_Terminated || + second_box_format == kCSBF_Multi_Zero_Terminated); + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(concat.data() + pos, remaining), + format, /*use_callback=*/false, /*set_buffer_early=*/true, + /*use_resizable_runner=*/false, /*require_boxes=*/true, + expect_success); + EXPECT_EQ(num_pixels * channels * 4, pixels2.size()); + remaining = JxlDecoderReleaseInput(dec); + JxlDecoderDestroy(dec); + } + EXPECT_EQ(0, remaining); + } + } +} + +void TestPartialStream(bool reconstructible_jpeg) { + size_t xsize = 123, ysize = 77; + uint32_t channels = 4; + if (reconstructible_jpeg) { + channels = 3; + } + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, channels, 0); + JxlPixelFormat format_orig = {channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::TestCodestreamParams params; + if (reconstructible_jpeg) { + params.cparams.color_transform = jxl::ColorTransform::kNone; + } else { + // Lossless to verify pixels exactly after roundtrip. + params.cparams.SetLossless(); + } + + std::vector pixels2; + pixels2.resize(pixels.size()); + + jxl::PaddedBytes jpeg_output(64); + size_t used_jpeg_output = 0; + + std::vector codestreams(kCSBF_NUM_ENTRIES); + std::vector jpeg_codestreams(kCSBF_NUM_ENTRIES); + for (size_t i = 0; i < kCSBF_NUM_ENTRIES; ++i) { + params.box_format = (CodeStreamBoxFormat)i; + if (reconstructible_jpeg) { + params.jpeg_codestream = &jpeg_codestreams[i]; + } + codestreams[i] = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + channels, params); + } + + // Test multiple step sizes, to test different combinations of the streaming + // box parsing. + std::vector increments = {1, 3, 17, 23, 120, 700, 1050}; + + for (size_t index = 0; index < increments.size(); index++) { + for (size_t i = 0; i < kCSBF_NUM_ENTRIES; ++i) { + if (reconstructible_jpeg && + (CodeStreamBoxFormat)i == CodeStreamBoxFormat::kCSBF_None) { + continue; + } + const jxl::PaddedBytes& data = codestreams[i]; + const uint8_t* next_in = data.data(); + size_t avail_in = 0; + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | + JXL_DEC_JPEG_RECONSTRUCTION)); + + bool seen_basic_info = false; + bool seen_full_image = false; + bool seen_jpeg_recon = false; + + size_t total_size = 0; + + for (;;) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + size_t remaining = JxlDecoderReleaseInput(dec); + EXPECT_LE(remaining, avail_in); + next_in += avail_in - remaining; + avail_in = remaining; + if (status == JXL_DEC_NEED_MORE_INPUT) { + if (total_size >= data.size()) { + // End of test data reached, it should have successfully decoded the + // image now. + FAIL(); + break; + } + + size_t increment = increments[index]; + // End of the file reached, should be the final test. + if (total_size + increment > data.size()) { + increment = data.size() - total_size; + } + total_size += increment; + avail_in += increment; + } else if (status == JXL_DEC_BASIC_INFO) { + // This event should happen exactly once + EXPECT_FALSE(seen_basic_info); + if (seen_basic_info) break; + seen_basic_info = true; + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + } else if (status == JXL_DEC_JPEG_RECONSTRUCTION) { + EXPECT_FALSE(seen_basic_info); + EXPECT_FALSE(seen_full_image); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetJPEGBuffer(dec, jpeg_output.data(), + jpeg_output.size())); + seen_jpeg_recon = true; + } else if (status == JXL_DEC_JPEG_NEED_MORE_OUTPUT) { + EXPECT_TRUE(seen_jpeg_recon); + used_jpeg_output = + jpeg_output.size() - JxlDecoderReleaseJPEGBuffer(dec); + jpeg_output.resize(jpeg_output.size() * 2); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetJPEGBuffer( + dec, jpeg_output.data() + used_jpeg_output, + jpeg_output.size() - used_jpeg_output)); + } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer( + dec, &format_orig, pixels2.data(), pixels2.size())); + } else if (status == JXL_DEC_FULL_IMAGE) { + // This event should happen exactly once + EXPECT_FALSE(seen_full_image); + if (seen_full_image) break; + // This event should happen after basic info + EXPECT_TRUE(seen_basic_info); + seen_full_image = true; + if (reconstructible_jpeg) { + used_jpeg_output = + jpeg_output.size() - JxlDecoderReleaseJPEGBuffer(dec); + EXPECT_EQ(used_jpeg_output, jpeg_codestreams[i].size()); + EXPECT_EQ(0, memcmp(jpeg_output.data(), jpeg_codestreams[i].data(), + used_jpeg_output)); + } else { + EXPECT_EQ(pixels, pixels2); + } + } else if (status == JXL_DEC_SUCCESS) { + EXPECT_TRUE(seen_full_image); + break; + } else { + // We do not expect any other events or errors + FAIL(); + break; + } + } + + // Ensure the decoder emitted the basic info and full image events + EXPECT_TRUE(seen_basic_info); + EXPECT_TRUE(seen_full_image); + + JxlDecoderDestroy(dec); + } + } +} + +// Tests the return status when trying to decode pixels on incomplete file: it +// should return JXL_DEC_NEED_MORE_INPUT, not error. +TEST(DecodeTest, PixelPartialTest) { TestPartialStream(false); } + +#if JPEGXL_ENABLE_JPEG +// Tests the return status when trying to decode JPEG bytes on incomplete file. +TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGPartialTest)) { + TestPartialStream(true); +} +#endif // JPEGXL_ENABLE_JPEG + +// The DC event still exists, but is no longer implemented, it is deprecated. +TEST(DecodeTest, DCNotGettableTest) { + // 1x1 pixel JXL image + std::string compressed( + "\377\n\0\20\260\23\0H\200(" + "\0\334\0U\17\0\0\250P\31e\334\340\345\\\317\227\37:," + "\246m\\gh\253m\vK\22E\306\261I\252C&pH\22\353 " + "\363\6\22\bp\0\200\237\34\231W2d\255$\1", + 68); + + JxlDecoder* dec = JxlDecoderCreate(NULL); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput( + dec, reinterpret_cast(compressed.data()), + compressed.size())); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + + // Since the image is only 1x1 pixel, there is only 1 group, the decoder is + // unable to get DC size from this, and will not return the DC at all. Since + // no full image is requested either, it is expected to return success. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, PreviewTest) { + size_t xsize = 77, ysize = 120; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + for (jxl::PreviewMode mode : {jxl::kSmallPreview, jxl::kBigPreview}) { + jxl::TestCodestreamParams params; + params.preview_mode = mode; + + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + params); + + JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_PREVIEW_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size)); + + jxl::ColorEncoding c_srgb = jxl::ColorEncoding::SRGB(false); + jxl::CodecInOut io0; + EXPECT_TRUE(jxl::ConvertFromExternal( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + c_srgb, /*bits_per_sample=*/16, format_orig, /*pool=*/nullptr, + &io0.Main())); + GeneratePreview(params.preview_mode, &io0.Main()); + + size_t xsize_preview = io0.Main().xsize(); + size_t ysize_preview = io0.Main().ysize(); + EXPECT_EQ(xsize_preview, info.preview.xsize); + EXPECT_EQ(ysize_preview, info.preview.ysize); + EXPECT_EQ(xsize_preview * ysize_preview * 3, buffer_size); + + EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + std::vector preview(buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetPreviewOutBuffer(dec, &format, preview.data(), + preview.size())); + + EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec)); + + jxl::CodecInOut io1; + EXPECT_TRUE(jxl::ConvertFromExternal( + jxl::Span(preview.data(), preview.size()), xsize_preview, + ysize_preview, c_srgb, + /*bits_per_sample=*/8, format, + /*pool=*/nullptr, &io1.Main())); + + jxl::ButteraugliParams ba; + // TODO(lode): this ButteraugliDistance silently returns 0 (dangerous for + // tests) if xsize or ysize is < 8, no matter how different the images, a + // tiny size that could happen for a preview. ButteraugliDiffmap does + // support smaller than 8x8, but jxl's ButteraugliDistance does not. Perhaps + // move butteraugli's <8x8 handling from ButteraugliDiffmap to + // ButteraugliComparator::Diffmap in butteraugli.cc. + EXPECT_LE(ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(), + /*distmap=*/nullptr, nullptr), + mode == jxl::kSmallPreview ? 0.7f : 1.2f); + + JxlDecoderDestroy(dec); + } +} + +TEST(DecodeTest, AlignTest) { + size_t xsize = 123, ysize = 77; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::TestCodestreamParams params; + // Lossless to verify pixels exactly after roundtrip. + params.cparams.SetLossless(); + params.cparams.speed_tier = jxl::SpeedTier::kThunder; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 4, + params); + + size_t align = 17; + JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, align}; + // On purpose not using jxl::RoundUpTo to test it independently. + size_t expected_line_bytes = (1 * 3 * xsize + align - 1) / align * align; + + for (int use_callback = 0; use_callback <= 1; ++use_callback) { + std::vector pixels2 = jxl::DecodeWithAPI( + jxl::Span(compressed.data(), compressed.size()), format, + use_callback, /*set_buffer_early=*/false, + /*use_resizable_runner=*/false, /*require_boxes=*/false, + /*expect_success=*/true); + EXPECT_EQ(expected_line_bytes * ysize, pixels2.size()); + EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize, + ysize, format_orig, format)); + } +} + +TEST(DecodeTest, AnimationTest) { + size_t xsize = 123, ysize = 77; + static const size_t num_frames = 2; + std::vector frames[2]; + frames[0] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + frames[1] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 1); + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.have_animation = true; + io.frames.clear(); + io.frames.reserve(num_frames); + io.SetSize(xsize, ysize); + + std::vector frame_durations(num_frames); + for (size_t i = 0; i < num_frames; ++i) { + frame_durations[i] = 5 + i; + } + + for (size_t i = 0; i < num_frames; ++i) { + jxl::ImageBundle bundle(&io.metadata.m); + + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frames[i].data(), frames[i].size()), xsize, + ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &bundle)); + bundle.duration = frame_durations[i]; + io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip. + cparams.speed_tier = jxl::SpeedTier::kThunder; + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, + jxl::GetJxlCms(), &aux_out, nullptr)); + + // Decode and test the animation frames + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + + for (size_t i = 0; i < num_frames; ++i) { + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + EXPECT_EQ(0u, frame_header.name_length); + // For now, test with empty name, there's currently no easy way to encode + // a jxl file with a frame name because ImageBundle doesn't have a + // jxl::FrameHeader to set the name in. We can test the null termination + // character though. + char name; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameName(dec, &name, 1)); + EXPECT_EQ(0, name); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(), + xsize, ysize, format, format)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, AnimationTestStreaming) { + size_t xsize = 123, ysize = 77; + static const size_t num_frames = 2; + std::vector frames[2]; + frames[0] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + frames[1] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 1); + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.have_animation = true; + io.frames.clear(); + io.frames.reserve(num_frames); + io.SetSize(xsize, ysize); + + std::vector frame_durations(num_frames); + for (size_t i = 0; i < num_frames; ++i) { + frame_durations[i] = 5 + i; + } + + for (size_t i = 0; i < num_frames; ++i) { + jxl::ImageBundle bundle(&io.metadata.m); + + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frames[i].data(), frames[i].size()), xsize, + ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &bundle)); + bundle.duration = frame_durations[i]; + io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip. + cparams.speed_tier = jxl::SpeedTier::kThunder; + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, + jxl::GetJxlCms(), &aux_out, nullptr)); + + // Decode and test the animation frames + + const size_t step_size = 16; + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = 0; + size_t frame_headers_seen = 0; + size_t frames_seen = 0; + bool seen_basic_info = false; + + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + + std::vector frames2[2]; + for (size_t i = 0; i < num_frames; ++i) { + frames2[i].resize(frames[i].size()); + } + + size_t total_in = 0; + size_t loop_count = 0; + + for (;;) { + if (loop_count++ > compressed.size()) { + fprintf(stderr, "Too many loops\n"); + FAIL(); + break; + } + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + auto status = JxlDecoderProcessInput(dec); + size_t remaining = JxlDecoderReleaseInput(dec); + EXPECT_LE(remaining, avail_in); + next_in += avail_in - remaining; + avail_in = remaining; + + if (status == JXL_DEC_SUCCESS) { + break; + } else if (status == JXL_DEC_ERROR) { + FAIL(); + } else if (status == JXL_DEC_NEED_MORE_INPUT) { + if (total_in >= compressed.size()) { + fprintf(stderr, "Already gave all input data\n"); + FAIL(); + break; + } + size_t amount = step_size; + if (total_in + amount > compressed.size()) { + amount = compressed.size() - total_in; + } + avail_in += amount; + total_in += amount; + } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, frames2[frames_seen].data(), + frames2[frames_seen].size())); + } else if (status == JXL_DEC_BASIC_INFO) { + EXPECT_EQ(false, seen_basic_info); + seen_basic_info = true; + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + } else if (status == JXL_DEC_FRAME) { + EXPECT_EQ(true, seen_basic_info); + frame_headers_seen++; + } else if (status == JXL_DEC_FULL_IMAGE) { + frames_seen++; + EXPECT_EQ(frame_headers_seen, frames_seen); + } else { + fprintf(stderr, "Unexpected status: %d\n", (int)status); + FAIL(); + } + } + + EXPECT_EQ(true, seen_basic_info); + EXPECT_EQ(num_frames, frames_seen); + EXPECT_EQ(num_frames, frame_headers_seen); + for (size_t i = 0; i < num_frames; ++i) { + EXPECT_EQ(frames[i], frames2[i]); + } + + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, ExtraChannelTest) { + size_t xsize = 55, ysize = 257; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::TestCodestreamParams params; + // Lossless to verify pixels exactly after roundtrip. + params.cparams.SetLossless(); + params.cparams.speed_tier = jxl::SpeedTier::kThunder; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 4, + params); + + size_t align = 17; + JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, align}; + + JxlDecoder* dec = JxlDecoderCreate(NULL); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, compressed.data(), compressed.size())); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(1u, info.num_extra_channels); + EXPECT_EQ(JXL_FALSE, info.alpha_premultiplied); + + JxlExtraChannelInfo extra_info; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetExtraChannelInfo(dec, 0, &extra_info)); + EXPECT_EQ(0, extra_info.type); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + size_t extra_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderExtraChannelBufferSize(dec, &format, &extra_size, 0)); + + std::vector image(buffer_size); + std::vector extra(extra_size); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, image.data(), image.size())); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetExtraChannelBuffer( + dec, &format, extra.data(), extra.size(), 0)); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + + // After the full image was output, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + JxlDecoderDestroy(dec); + + EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), image.data(), xsize, + ysize, format_orig, format)); + + // Compare the extracted extra channel with the original alpha channel + + std::vector alpha(pixels.size() / 4); + for (size_t i = 0; i < pixels.size(); i += 8) { + size_t index_alpha = i / 4; + alpha[index_alpha + 0] = pixels[i + 6]; + alpha[index_alpha + 1] = pixels[i + 7]; + } + JxlPixelFormat format_alpha = format; + format_alpha.num_channels = 1; + JxlPixelFormat format_orig_alpha = format_orig; + format_orig_alpha.num_channels = 1; + + EXPECT_EQ(0u, + jxl::test::ComparePixels(alpha.data(), extra.data(), xsize, ysize, + format_orig_alpha, format_alpha)); +} + +TEST(DecodeTest, SkipCurrentFrameTest) { + size_t xsize = 90, ysize = 120; + constexpr size_t num_frames = 7; + std::vector frames[num_frames]; + for (size_t i = 0; i < num_frames; i++) { + frames[i] = jxl::test::GetSomeTestImage(xsize, ysize, 3, i); + } + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.have_animation = true; + io.frames.clear(); + io.frames.reserve(num_frames); + io.SetSize(xsize, ysize); + + std::vector frame_durations(num_frames); + for (size_t i = 0; i < num_frames; ++i) { + frame_durations[i] = 5 + i; + } + + for (size_t i = 0; i < num_frames; ++i) { + jxl::ImageBundle bundle(&io.metadata.m); + if (i & 1) { + // Mark some frames as referenceable, others not. + bundle.use_for_next_frame = true; + } + + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frames[i].data(), frames[i].size()), xsize, + ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &bundle)); + bundle.duration = frame_durations[i]; + io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + cparams.speed_tier = jxl::SpeedTier::kThunder; + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + jxl::PassDefinition passes[] = {{2, 0, 4}, {4, 0, 4}, {8, 2, 2}, {8, 0, 1}}; + jxl::ProgressiveMode progressive_mode{passes}; + enc_state.progressive_splitter.SetProgressiveMode(progressive_mode); + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, + jxl::GetJxlCms(), &aux_out, nullptr)); + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | + JXL_DEC_FRAME_PROGRESSION | + JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetProgressiveDetail(dec, kLastPasses)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + + for (size_t i = 0; i < num_frames; ++i) { + printf("Decoding frame %d\n", (int)i); + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSkipCurrentFrame(dec)); + std::vector pixels(buffer_size); + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSkipCurrentFrame(dec)); + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + if (i == 2) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec)); + continue; + } + EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec)); + EXPECT_EQ(8, JxlDecoderGetIntendedDownsamplingRatio(dec)); + if (i == 3) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec)); + continue; + } + EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec)); + EXPECT_EQ(4, JxlDecoderGetIntendedDownsamplingRatio(dec)); + if (i == 4) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec)); + continue; + } + EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec)); + EXPECT_EQ(2, JxlDecoderGetIntendedDownsamplingRatio(dec)); + if (i == 5) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec)); + continue; + } + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSkipCurrentFrame(dec)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, SkipFrameTest) { + size_t xsize = 90, ysize = 120; + constexpr size_t num_frames = 16; + std::vector frames[num_frames]; + for (size_t i = 0; i < num_frames; i++) { + frames[i] = jxl::test::GetSomeTestImage(xsize, ysize, 3, i); + } + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.have_animation = true; + io.frames.clear(); + io.frames.reserve(num_frames); + io.SetSize(xsize, ysize); + + std::vector frame_durations(num_frames); + for (size_t i = 0; i < num_frames; ++i) { + frame_durations[i] = 5 + i; + } + + for (size_t i = 0; i < num_frames; ++i) { + jxl::ImageBundle bundle(&io.metadata.m); + if (i & 1) { + // Mark some frames as referenceable, others not. + bundle.use_for_next_frame = true; + } + + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frames[i].data(), frames[i].size()), xsize, + ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &bundle)); + bundle.duration = frame_durations[i]; + io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip. + cparams.speed_tier = jxl::SpeedTier::kThunder; + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, + jxl::GetJxlCms(), &aux_out, nullptr)); + + // Decode and test the animation frames + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + + for (size_t i = 0; i < num_frames; ++i) { + if (i == 3) { + JxlDecoderSkipFrames(dec, 5); + i += 5; + } + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(), + xsize, ysize, format, format)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + // Test rewinding the decoder and skipping different frames + + JxlDecoderRewind(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + for (size_t i = 0; i < num_frames; ++i) { + int test_skipping = (i == 9) ? 3 : 0; + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this + // should only skip the next frame, not the currently processed one. + if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(), + xsize, ysize, format, format)); + + if (test_skipping) i += test_skipping; + } + + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, SkipFrameWithBlendingTest) { + size_t xsize = 90, ysize = 120; + constexpr size_t num_frames = 16; + std::vector frames[num_frames]; + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.have_animation = true; + io.frames.clear(); + io.frames.reserve(num_frames); + io.SetSize(xsize, ysize); + + std::vector frame_durations(num_frames); + + for (size_t i = 0; i < num_frames; ++i) { + if (i < 5) { + std::vector frame_internal = + jxl::test::GetSomeTestImage(xsize, ysize, 3, i * 2 + 1); + // An internal frame with 0 duration, and use_for_next_frame, this is a + // frame that is not rendered and not output by the API, but on which the + // rendered frames depend + jxl::ImageBundle bundle_internal(&io.metadata.m); + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frame_internal.data(), + frame_internal.size()), + xsize, ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &bundle_internal)); + bundle_internal.duration = 0; + bundle_internal.use_for_next_frame = true; + io.frames.push_back(std::move(bundle_internal)); + } + + std::vector frame = + jxl::test::GetSomeTestImage(xsize, ysize, 3, i * 2); + // Actual rendered frame + frame_durations[i] = 5 + i; + jxl::ImageBundle bundle(&io.metadata.m); + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frame.data(), frame.size()), xsize, ysize, + jxl::ColorEncoding::SRGB(/*is_gray=*/false), + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &bundle)); + bundle.duration = frame_durations[i]; + // Create some variation in which frames depend on which. + if (i != 3 && i != 9 && i != 10) { + bundle.use_for_next_frame = true; + } + if (i != 12) { + bundle.blend = true; + // Choose a blend mode that depends on the pixels of the saved frame and + // doesn't use alpha + bundle.blendmode = jxl::BlendMode::kMul; + } + io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip. + cparams.speed_tier = jxl::SpeedTier::kThunder; + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, + jxl::GetJxlCms(), &aux_out, nullptr)); + + // Independently decode all frames without any skipping, to create the + // expected blended frames, for the actual tests below to compare with. + { + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner( + dec, JxlThreadParallelRunner, runner)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + for (size_t i = 0; i < num_frames; ++i) { + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + frames[i].resize(xsize * ysize * 6); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(), + frames[i].size())); + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); + } + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + + for (size_t i = 0; i < num_frames; ++i) { + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(), + xsize, ysize, format, format)); + + // Test rewinding mid-way, not decoding all frames. + if (i == 8) { + break; + } + } + + JxlDecoderRewind(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + for (size_t i = 0; i < num_frames; ++i) { + if (i == 3) { + JxlDecoderSkipFrames(dec, 5); + i += 5; + } + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(), + xsize, ysize, format, format)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + // Test rewinding the decoder and skipping different frames + + JxlDecoderRewind(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + for (size_t i = 0; i < num_frames; ++i) { + int test_skipping = (i == 9) ? 3 : 0; + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this + // should only skip the next frame, not the currently processed one. + if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(), + xsize, ysize, format, format)); + + if (test_skipping) i += test_skipping; + } + + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) { + size_t xsize = 90, ysize = 120; + constexpr size_t num_frames = 16; + std::vector frames[num_frames + 5]; + JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.have_animation = true; + io.frames.clear(); + io.frames.reserve(num_frames + 5); + io.SetSize(xsize, ysize); + + std::vector frame_durations_c; + std::vector frame_durations_nc; + std::vector frame_xsize, frame_ysize, frame_x0, frame_y0; + + for (size_t i = 0; i < num_frames; ++i) { + size_t cropxsize = 1 + xsize * 2 / (i + 1); + size_t cropysize = 1 + ysize * 3 / (i + 2); + int cropx0 = i * 3 - 8; + int cropy0 = i * 4 - 7; + if (i < 5) { + std::vector frame_internal = + jxl::test::GetSomeTestImage(xsize / 2, ysize / 2, 4, i * 2 + 1); + // An internal frame with 0 duration, and use_for_next_frame, this is a + // frame that is not rendered and not output by default by the API, but on + // which the rendered frames depend + jxl::ImageBundle bundle_internal(&io.metadata.m); + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frame_internal.data(), + frame_internal.size()), + xsize / 2, ysize / 2, jxl::ColorEncoding::SRGB(/*is_gray=*/false), + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &bundle_internal)); + bundle_internal.duration = 0; + bundle_internal.use_for_next_frame = true; + bundle_internal.origin = {13, 17}; + io.frames.push_back(std::move(bundle_internal)); + frame_durations_nc.push_back(0); + frame_xsize.push_back(xsize / 2); + frame_ysize.push_back(ysize / 2); + frame_x0.push_back(13); + frame_y0.push_back(17); + } + + std::vector frame = + jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2); + // Actual rendered frame + jxl::ImageBundle bundle(&io.metadata.m); + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frame.data(), frame.size()), cropxsize, + cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &bundle)); + bundle.duration = 5 + i; + frame_durations_nc.push_back(5 + i); + frame_durations_c.push_back(5 + i); + frame_xsize.push_back(cropxsize); + frame_ysize.push_back(cropysize); + frame_x0.push_back(cropx0); + frame_y0.push_back(cropy0); + bundle.origin = {cropx0, cropy0}; + // Create some variation in which frames depend on which. + if (i != 3 && i != 9 && i != 10) { + bundle.use_for_next_frame = true; + } + if (i != 12) { + bundle.blend = true; + bundle.blendmode = jxl::BlendMode::kBlend; + } + io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip. + cparams.speed_tier = jxl::SpeedTier::kThunder; + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, + jxl::GetJxlCms(), &aux_out, nullptr)); + // try both with and without coalescing + for (auto coalescing : {JXL_TRUE, JXL_FALSE}) { + // Independently decode all frames without any skipping, to create the + // expected blended frames, for the actual tests below to compare with. + { + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing)); + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner( + dec, JxlThreadParallelRunner, runner)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) { + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + if (coalescing) { + EXPECT_EQ(xsize * ysize * 8, buffer_size); + } else { + EXPECT_EQ(frame_xsize[i] * frame_ysize[i] * 8, buffer_size); + } + frames[i].resize(buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(), + frames[i].size())); + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); + } + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing)); + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner( + dec, JxlThreadParallelRunner, runner)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | + JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + + for (size_t i = 0; i < num_frames; ++i) { + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + std::vector pixels(buffer_size); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]), + frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(), + pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + if (coalescing) { + EXPECT_EQ(frame_header.layer_info.xsize, xsize); + } else { + EXPECT_EQ(frame_header.layer_info.xsize, frame_xsize[i]); + } + if (coalescing) { + EXPECT_EQ(frame_header.layer_info.ysize, ysize); + } else { + EXPECT_EQ(frame_header.layer_info.ysize, frame_ysize[i]); + } + EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(), + frame_header.layer_info.xsize, + frame_header.layer_info.ysize, + format, format)); + + // Test rewinding mid-way, not decoding all frames. + if (i == 8) { + break; + } + } + + JxlDecoderRewind(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents( + dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) { + if (i == 3) { + JxlDecoderSkipFrames(dec, 5); + i += 5; + } + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + std::vector pixels(buffer_size); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]), + frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames + (coalescing ? 0 : 5), + frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(), + pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + if (coalescing) { + EXPECT_EQ(frame_header.layer_info.xsize, xsize); + EXPECT_EQ(frame_header.layer_info.ysize, ysize); + EXPECT_EQ(frame_header.layer_info.crop_x0, 0); + EXPECT_EQ(frame_header.layer_info.crop_y0, 0); + } else { + EXPECT_EQ(frame_header.layer_info.xsize, frame_xsize[i]); + EXPECT_EQ(frame_header.layer_info.ysize, frame_ysize[i]); + EXPECT_EQ(frame_header.layer_info.crop_x0, frame_x0[i]); + EXPECT_EQ(frame_header.layer_info.crop_y0, frame_y0[i]); + EXPECT_EQ(frame_header.layer_info.blend_info.blendmode, + i != 12 + 5 && frame_header.duration != 0 + ? 2 + : 0); // kBlend or the default kReplace + } + EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(), + frame_header.layer_info.xsize, + frame_header.layer_info.ysize, + format, format)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + // Test rewinding the decoder and skipping different frames + + JxlDecoderRewind(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents( + dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) { + int test_skipping = (i == 9) ? 3 : 0; + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + std::vector pixels(buffer_size); + + // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this + // should only skip the next frame, not the currently processed one. + if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]), + frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames + (coalescing ? 0 : 5), + frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(), + pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(), + frame_header.layer_info.xsize, + frame_header.layer_info.ysize, + format, format)); + + if (test_skipping) i += test_skipping; + } + + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); + } +} + +TEST(DecodeTest, OrientedCroppedFrameTest) { + const auto test = [](bool keep_orientation, uint32_t orientation, + uint32_t resampling) { + size_t xsize = 90, ysize = 120; + JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + size_t oxsize = (!keep_orientation && orientation > 4 ? ysize : xsize); + size_t oysize = (!keep_orientation && orientation > 4 ? xsize : ysize); + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.orientation = orientation; + io.frames.clear(); + io.SetSize(xsize, ysize); + + for (size_t i = 0; i < 3; ++i) { + size_t cropxsize = 1 + xsize * 2 / (i + 1); + size_t cropysize = 1 + ysize * 3 / (i + 2); + int cropx0 = i * 3 - 8; + int cropy0 = i * 4 - 7; + + std::vector frame = + jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2); + jxl::ImageBundle bundle(&io.metadata.m); + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frame.data(), frame.size()), cropxsize, + cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &bundle)); + bundle.origin = {cropx0, cropy0}; + bundle.use_for_next_frame = true; + io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + cparams + .SetLossless(); // Lossless to verify pixels exactly after roundtrip. + cparams.speed_tier = jxl::SpeedTier::kThunder; + cparams.resampling = resampling; + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, + jxl::GetJxlCms(), &aux_out, nullptr)); + + // 0 is merged frame as decoded with coalescing enabled (default) + // 1-3 are non-coalesced frames as decoded with coalescing disabled + // 4 is the manually merged frame + std::vector frames[5]; + frames[4].resize(xsize * ysize * 8, 0); + + // try both with and without coalescing + for (auto coalescing : {JXL_TRUE, JXL_FALSE}) { + // Independently decode all frames without any skipping, to create the + // expected blended frames, for the actual tests below to compare with. + { + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetKeepOrientation(dec, keep_orientation)); + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner( + dec, JxlThreadParallelRunner, runner)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + for (size_t i = (coalescing ? 0 : 1); i < (coalescing ? 1 : 4); ++i) { + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetFrameHeader(dec, &frame_header)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + if (coalescing) { + EXPECT_EQ(xsize * ysize * 8, buffer_size); + } else { + EXPECT_EQ(frame_header.layer_info.xsize * + frame_header.layer_info.ysize * 8, + buffer_size); + } + frames[i].resize(buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(), + frames[i].size())); + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(frame_header.layer_info.blend_info.blendmode, + JXL_BLEND_REPLACE); + if (coalescing) { + EXPECT_EQ(frame_header.layer_info.xsize, oxsize); + EXPECT_EQ(frame_header.layer_info.ysize, oysize); + EXPECT_EQ(frame_header.layer_info.crop_x0, 0); + EXPECT_EQ(frame_header.layer_info.crop_y0, 0); + } else { + // manually merge this layer + int x0 = frame_header.layer_info.crop_x0; + int y0 = frame_header.layer_info.crop_y0; + int w = frame_header.layer_info.xsize; + int h = frame_header.layer_info.ysize; + for (int y = 0; y < static_cast(oysize); y++) { + if (y < y0 || y >= y0 + h) continue; + // pointers do whole 16-bit RGBA pixels at a time + uint64_t* row_merged = static_cast( + (void*)(frames[4].data() + y * oxsize * 8)); + uint64_t* row_layer = static_cast( + (void*)(frames[i].data() + (y - y0) * w * 8)); + for (int x = 0; x < static_cast(oxsize); x++) { + if (x < x0 || x >= x0 + w) continue; + row_merged[x] = row_layer[x - x0]; + } + } + } + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); + } + } + + EXPECT_EQ(0u, jxl::test::ComparePixels(frames[0].data(), frames[4].data(), + oxsize, oysize, format, format)); + }; + + for (bool keep_orientation : {true, false}) { + for (uint32_t orientation = 1; orientation <= 8; orientation++) { + for (uint32_t resampling : {1, 2, 4, 8}) { + SCOPED_TRACE(testing::Message() + << "keep_orientation: " << keep_orientation << ", " + << "orientation: " << orientation << ", " + << "resampling: " << resampling); + test(keep_orientation, orientation, resampling); + } + } + } +} + +struct FramePositions { + size_t frame_start; + size_t header_end; + size_t toc_end; + std::vector section_end; +}; + +struct StreamPositions { + size_t codestream_start; + size_t codestream_end; + size_t basic_info; + size_t jbrd_end = 0; + std::vector box_start; + std::vector frames; +}; + +void AnalyzeCodestream(const jxl::PaddedBytes& data, + StreamPositions* streampos) { + // Unbox data to codestream and mark where it is broken up by boxes. + std::vector codestream; + std::vector> breakpoints; + bool codestream_end = false; + ASSERT_LE(2, data.size()); + if (data[0] == 0xff && data[1] == 0x0a) { + codestream = std::vector(data.begin(), data.end()); + streampos->codestream_start = 0; + } else { + const uint8_t* in = data.data(); + size_t pos = 0; + while (pos < data.size()) { + ASSERT_LE(pos + 8, data.size()); + streampos->box_start.push_back(pos); + size_t box_size = LoadBE32(in + pos); + if (box_size == 0) box_size = data.size() - pos; + ASSERT_LE(pos + box_size, data.size()); + if (memcmp(in + pos + 4, "jxlc", 4) == 0) { + EXPECT_TRUE(codestream.empty()); + streampos->codestream_start = pos + 8; + codestream.insert(codestream.end(), in + pos + 8, in + pos + box_size); + codestream_end = true; + } else if (memcmp(in + pos + 4, "jxlp", 4) == 0) { + codestream_end = (LoadBE32(in + pos + 8) & 0x80000000); + if (codestream.empty()) { + streampos->codestream_start = pos + 12; + } else if (box_size > 12 || !codestream_end) { + breakpoints.push_back({codestream.size(), 12}); + } + codestream.insert(codestream.end(), in + pos + 12, in + pos + box_size); + } else if (memcmp(in + pos + 4, "jbrd", 4) == 0) { + EXPECT_TRUE(codestream.empty()); + streampos->jbrd_end = pos + box_size; + } else if (!codestream.empty() && !codestream_end) { + breakpoints.push_back({codestream.size(), box_size}); + } + pos += box_size; + } + ASSERT_EQ(pos, data.size()); + } + // Translate codestream positions to boxed stream positions. + size_t offset = streampos->codestream_start; + size_t bp = 0; + auto add_offset = [&](size_t pos) { + while (bp < breakpoints.size() && pos >= breakpoints[bp].first) { + offset += breakpoints[bp++].second; + } + return pos + offset; + }; + // Analyze the unboxed codestream. + jxl::BitReader br( + jxl::Span(codestream.data(), codestream.size())); + ASSERT_EQ(br.ReadFixedBits<16>(), 0x0AFF); + jxl::CodecMetadata metadata; + EXPECT_TRUE(ReadSizeHeader(&br, &metadata.size)); + EXPECT_TRUE(ReadImageMetadata(&br, &metadata.m)); + streampos->basic_info = + add_offset(br.TotalBitsConsumed() / jxl::kBitsPerByte); + metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded; + EXPECT_TRUE(jxl::Bundle::Read(&br, &metadata.transform_data)); + EXPECT_TRUE(br.JumpToByteBoundary()); + bool has_preview = metadata.m.have_preview; + while (br.TotalBitsConsumed() < br.TotalBytes() * jxl::kBitsPerByte) { + FramePositions p; + p.frame_start = add_offset(br.TotalBitsConsumed() / jxl::kBitsPerByte); + jxl::FrameHeader frame_header(&metadata); + if (has_preview) { + frame_header.nonserialized_is_preview = true; + has_preview = false; + } + EXPECT_TRUE(ReadFrameHeader(&br, &frame_header)); + p.header_end = + add_offset(jxl::DivCeil(br.TotalBitsConsumed(), jxl::kBitsPerByte)); + jxl::FrameDimensions frame_dim = frame_header.ToFrameDimensions(); + uint64_t groups_total_size; + const size_t toc_entries = jxl::NumTocEntries( + frame_dim.num_groups, frame_dim.num_dc_groups, + frame_header.passes.num_passes, /*has_ac_global=*/true); + std::vector section_offsets; + std::vector section_sizes; + EXPECT_TRUE(ReadGroupOffsets(toc_entries, &br, §ion_offsets, + §ion_sizes, &groups_total_size)); + EXPECT_EQ(br.TotalBitsConsumed() % jxl::kBitsPerByte, 0); + size_t sections_start = br.TotalBitsConsumed() / jxl::kBitsPerByte; + p.toc_end = add_offset(sections_start); + for (size_t i = 0; i < toc_entries; ++i) { + size_t end = sections_start + section_offsets[i] + section_sizes[i]; + p.section_end.push_back(add_offset(end)); + } + br.SkipBits(groups_total_size * jxl::kBitsPerByte); + streampos->frames.push_back(p); + } + streampos->codestream_end = add_offset(codestream.size()); + EXPECT_EQ(br.TotalBitsConsumed(), br.TotalBytes() * jxl::kBitsPerByte); + EXPECT_TRUE(br.Close()); +} + +enum ExpectedFlushState { NO_FLUSH, SAME_FLUSH, NEW_FLUSH }; +struct Breakpoint { + size_t file_pos; + ExpectedFlushState expect_flush; +}; + +void VerifyProgression(size_t xsize, size_t ysize, uint32_t num_channels, + const std::vector& pixels, + const jxl::PaddedBytes& data, + std::vector breakpoints) { + // Size large enough for multiple groups, required to have progressive stages. + ASSERT_LT(256, xsize); + ASSERT_LT(256, ysize); + std::vector pixels2; + pixels2.resize(pixels.size()); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + int bp = 0; + const uint8_t* next_in = data.data(); + size_t avail_in = breakpoints[bp].file_pos; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + double prev_dist = 1.0; + for (;;) { + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + printf("bp: %d status: 0x%x\n", bp, (int)status); + if (status == JXL_DEC_BASIC_INFO) { + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + // Output buffer/callback not yet set + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(), + pixels2.size())); + } else if (status == JXL_DEC_FRAME) { + // Nothing to do. + } else if (status == JXL_DEC_SUCCESS) { + EXPECT_EQ(bp + 1, breakpoints.size()); + break; + } else if (status == JXL_DEC_NEED_MORE_INPUT || + status == JXL_DEC_FULL_IMAGE) { + if (breakpoints[bp].expect_flush == NO_FLUSH) { + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec)); + } else { + if (status != JXL_DEC_FULL_IMAGE) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec)); + } + double dist = jxl::test::DistanceRMS(pixels2.data(), pixels.data(), + xsize, ysize, format); + if (breakpoints[bp].expect_flush == NEW_FLUSH) { + EXPECT_LT(dist, prev_dist); + prev_dist = dist; + } else { + EXPECT_EQ(dist, prev_dist); + } + } + if (status == JXL_DEC_FULL_IMAGE) { + EXPECT_EQ(bp + 1, breakpoints.size()); + continue; + } + ASSERT_LT(++bp, breakpoints.size()); + next_in += avail_in - JxlDecoderReleaseInput(dec); + avail_in = breakpoints[bp].file_pos - (next_in - data.data()); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + } else { + printf("Unexpected status: 0x%x\n", (int)status); + FAIL(); // unexpected returned status + } + } + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, ProgressionTest) { + size_t xsize = 508, ysize = 470; + uint32_t num_channels = 3; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + jxl::TestCodestreamParams params; + params.cparams.progressive_dc = 1; + params.preview_mode = jxl::kSmallPreview; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + StreamPositions streampos; + AnalyzeCodestream(data, &streampos); + const std::vector& fp = streampos.frames; + // We have preview, dc frame and regular frame. + EXPECT_EQ(3, fp.size()); + EXPECT_EQ(7, fp[2].section_end.size()); + EXPECT_EQ(data.size(), fp[2].section_end[6]); + std::vector breakpoints{ + {fp[0].frame_start, NO_FLUSH}, // headers + {fp[1].frame_start, NO_FLUSH}, // preview + {fp[2].frame_start, NO_FLUSH}, // dc frame + {fp[2].section_end[0], NO_FLUSH}, // DC global + {fp[2].section_end[1] - 1, NO_FLUSH}, // partial DC group + {fp[2].section_end[1], NEW_FLUSH}, // DC group + {fp[2].section_end[2], SAME_FLUSH}, // AC global + {fp[2].section_end[3], NEW_FLUSH}, // AC group 0 + {fp[2].section_end[4] - 1, SAME_FLUSH}, // partial AC group 1 + {fp[2].section_end[4], NEW_FLUSH}, // AC group 1 + {fp[2].section_end[5], NEW_FLUSH}, // AC group 2 + {data.size() - 1, SAME_FLUSH}, // partial AC group 3 + {data.size(), NEW_FLUSH}}; // full image + VerifyProgression(xsize, ysize, num_channels, pixels, data, breakpoints); +} + +TEST(DecodeTest, ProgressionTestLosslessAlpha) { + size_t xsize = 508, ysize = 470; + uint32_t num_channels = 4; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + jxl::TestCodestreamParams params; + params.cparams.SetLossless(); + params.cparams.speed_tier = jxl::SpeedTier::kThunder; + params.cparams.responsive = 1; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + StreamPositions streampos; + AnalyzeCodestream(data, &streampos); + const std::vector& fp = streampos.frames; + // We have preview, dc frame and regular frame. + EXPECT_EQ(1, fp.size()); + EXPECT_EQ(7, fp[0].section_end.size()); + EXPECT_EQ(data.size(), fp[0].section_end[6]); + std::vector breakpoints{ + {fp[0].frame_start, NO_FLUSH}, // headers + {fp[0].section_end[0] - 1, NO_FLUSH}, // partial DC global + {fp[0].section_end[0], NEW_FLUSH}, // DC global + {fp[0].section_end[1], SAME_FLUSH}, // DC group + {fp[0].section_end[2], SAME_FLUSH}, // AC global + {fp[0].section_end[3], NEW_FLUSH}, // AC group 0 + {fp[0].section_end[4] - 1, SAME_FLUSH}, // partial AC group 1 + {fp[0].section_end[4], NEW_FLUSH}, // AC group 1 + {fp[0].section_end[5], NEW_FLUSH}, // AC group 2 + {data.size() - 1, SAME_FLUSH}, // partial AC group 3 + {data.size(), NEW_FLUSH}}; // full image + VerifyProgression(xsize, ysize, num_channels, pixels, data, breakpoints); +} + +void VerifyFilePosition(size_t expected_pos, const jxl::PaddedBytes& data, + JxlDecoder* dec) { + size_t remaining = JxlDecoderReleaseInput(dec); + size_t pos = data.size() - remaining; + EXPECT_EQ(expected_pos, pos); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, data.data() + pos, remaining)); +} + +TEST(DecodeTest, InputHandlingTestOneShot) { + size_t xsize = 508, ysize = 470; + uint32_t num_channels = 3; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) { + printf("Testing with box format %d\n", i); + jxl::TestCodestreamParams params; + params.cparams.progressive_dc = 1; + params.preview_mode = jxl::kSmallPreview; + params.box_format = (CodeStreamBoxFormat)i; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + StreamPositions streampos; + AnalyzeCodestream(data, &streampos); + const std::vector& fp = streampos.frames; + // We have preview, dc frame and regular frame. + EXPECT_EQ(3, fp.size()); + + std::vector pixels2; + pixels2.resize(pixels.size()); + + int kNumEvents = 6; + int events[] = { + JXL_DEC_BASIC_INFO, JXL_DEC_COLOR_ENCODING, JXL_DEC_PREVIEW_IMAGE, + JXL_DEC_FRAME, JXL_DEC_FULL_IMAGE, JXL_DEC_FRAME_PROGRESSION, + }; + size_t end_positions[] = { + streampos.basic_info, fp[0].frame_start, + fp[1].frame_start, fp[2].toc_end, + streampos.codestream_end, streampos.codestream_end}; + int events_wanted = 0; + for (int j = 0; j < kNumEvents; ++j) { + events_wanted |= events[j]; + size_t end_pos = end_positions[j]; + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events_wanted)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, data.data(), data.size())); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + VerifyFilePosition(streampos.basic_info, data, dec); + if (j >= 1) { + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[0].frame_start, data, dec); + } + if (j >= 2) { + EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[0].toc_end, data, dec); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size)); + EXPECT_GE(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetPreviewOutBuffer(dec, &format, pixels2.data(), + buffer_size)); + EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[1].frame_start, data, dec); + } + if (j >= 3) { + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[2].toc_end, data, dec); + if (j >= 5) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetProgressiveDetail(dec, kDC)); + } + } + if (j >= 4) { + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[2].toc_end, data, dec); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(), + pixels2.size())); + if (j >= 5) { + EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[2].section_end[1], data, dec); + } + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + VerifyFilePosition(streampos.codestream_end, data, dec); + } + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + VerifyFilePosition(end_pos, data, dec); + JxlDecoderDestroy(dec); + } + } +} + +#if JPEGXL_ENABLE_JPEG +TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(InputHandlingTestJPEGOneshot)) { + size_t xsize = 123; + size_t ysize = 77; + size_t channels = 3; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, channels, /*seed=*/0); + for (int i = 1; i < kCSBF_NUM_ENTRIES; ++i) { + printf("Testing with box format %d\n", i); + jxl::PaddedBytes jpeg_codestream; + jxl::TestCodestreamParams params; + params.cparams.color_transform = jxl::ColorTransform::kNone; + params.jpeg_codestream = &jpeg_codestream; + params.preview_mode = jxl::kSmallPreview; + params.box_format = (CodeStreamBoxFormat)i; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + channels, params); + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + StreamPositions streampos; + AnalyzeCodestream(data, &streampos); + const std::vector& fp = streampos.frames; + // We have preview and regular frame. + EXPECT_EQ(2, fp.size()); + EXPECT_LT(0, streampos.jbrd_end); + + std::vector pixels2; + pixels2.resize(pixels.size()); + + int kNumEvents = 6; + int events[] = {JXL_DEC_BASIC_INFO, JXL_DEC_JPEG_RECONSTRUCTION, + JXL_DEC_COLOR_ENCODING, JXL_DEC_PREVIEW_IMAGE, + JXL_DEC_FRAME, JXL_DEC_FULL_IMAGE}; + size_t end_positions[] = {streampos.basic_info, streampos.basic_info, + fp[0].frame_start, fp[1].frame_start, + fp[1].toc_end, streampos.codestream_end}; + int events_wanted = 0; + for (int j = 0; j < kNumEvents; ++j) { + printf("j = %d\n", j); + events_wanted |= events[j]; + size_t end_pos = end_positions[j]; + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events_wanted)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, data.data(), data.size())); + if (j >= 1) { + EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec)); + VerifyFilePosition(streampos.jbrd_end, data, dec); + } + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + VerifyFilePosition(streampos.basic_info, data, dec); + if (j >= 2) { + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[0].frame_start, data, dec); + } + if (j >= 3) { + EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[0].toc_end, data, dec); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size)); + EXPECT_GE(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetPreviewOutBuffer(dec, &format, pixels2.data(), + buffer_size)); + EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[1].frame_start, data, dec); + } + if (j >= 4) { + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[1].toc_end, data, dec); + } + if (j >= 5) { + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + VerifyFilePosition(fp[1].toc_end, data, dec); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(), + pixels2.size())); + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + VerifyFilePosition(streampos.codestream_end, data, dec); + } + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + VerifyFilePosition(end_pos, data, dec); + JxlDecoderDestroy(dec); + } + } +} +#endif // JPEGXL_ENABLE_JPEG + +TEST(DecodeTest, InputHandlingTestStreaming) { + size_t xsize = 508, ysize = 470; + uint32_t num_channels = 3; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) { + printf("Testing with box format %d\n", i); + fflush(stdout); + jxl::TestCodestreamParams params; + params.cparams.progressive_dc = 1; + params.box_format = (CodeStreamBoxFormat)i; + params.preview_mode = jxl::kSmallPreview; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + StreamPositions streampos; + AnalyzeCodestream(data, &streampos); + const std::vector& fp = streampos.frames; + // We have preview, dc frame and regular frame. + EXPECT_EQ(3, fp.size()); + std::vector pixels2; + pixels2.resize(pixels.size()); + int events_wanted = + (JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_PREVIEW_IMAGE | + JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME_PROGRESSION | + JXL_DEC_BOX); + for (size_t increment : {1, 7, 27, 1024}) { + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events_wanted)); + size_t file_pos = 0; + size_t box_index = 0; + size_t avail_in = 0; + for (;;) { + const uint8_t* next_in = data.data() + file_pos; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + size_t remaining = JxlDecoderReleaseInput(dec); + size_t consumed = avail_in - remaining; + file_pos += consumed; + avail_in += increment; + avail_in = std::min(avail_in, data.size() - file_pos); + if (status == JXL_DEC_BASIC_INFO) { + EXPECT_EQ(file_pos, streampos.basic_info); + } else if (status == JXL_DEC_COLOR_ENCODING) { + EXPECT_EQ(file_pos, streampos.frames[0].frame_start); + } else if (status == JXL_DEC_NEED_PREVIEW_OUT_BUFFER) { + EXPECT_EQ(file_pos, streampos.frames[0].toc_end); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size)); + EXPECT_GE(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetPreviewOutBuffer(dec, &format, pixels2.data(), + buffer_size)); + } else if (status == JXL_DEC_PREVIEW_IMAGE) { + EXPECT_EQ(file_pos, streampos.frames[1].frame_start); + } else if (status == JXL_DEC_FRAME) { + EXPECT_EQ(file_pos, streampos.frames[2].toc_end); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetProgressiveDetail(dec, kDC)); + } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) { + EXPECT_EQ(file_pos, streampos.frames[2].toc_end); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(), + pixels2.size())); + } else if (status == JXL_DEC_FRAME_PROGRESSION) { + EXPECT_EQ(file_pos, streampos.frames[2].section_end[1]); + } else if (status == JXL_DEC_FULL_IMAGE) { + EXPECT_EQ(file_pos, streampos.codestream_end); + } else if (status == JXL_DEC_SUCCESS) { + EXPECT_EQ(file_pos, streampos.codestream_end); + break; + } else if (status == JXL_DEC_NEED_MORE_INPUT) { + EXPECT_LT(remaining, 12); + if ((i == kCSBF_None && file_pos >= 2) || + (box_index > 0 && box_index < streampos.box_start.size() && + file_pos >= streampos.box_start[box_index - 1] + 12 && + file_pos < streampos.box_start[box_index])) { + EXPECT_EQ(remaining, 0); + } + if (file_pos == data.size()) break; + } else if (status == JXL_DEC_BOX) { + ASSERT_LT(box_index, streampos.box_start.size()); + EXPECT_EQ(file_pos, streampos.box_start[box_index++]); + } else { + printf("Unexpected status: 0x%x\n", (int)status); + FAIL(); + } + } + JxlDecoderDestroy(dec); + } + } +} + +TEST(DecodeTest, FlushTest) { + // Size large enough for multiple groups, required to have progressive + // stages + size_t xsize = 333, ysize = 300; + uint32_t num_channels = 3; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + jxl::TestCodestreamParams params; + params.preview_mode = jxl::kSmallPreview; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + std::vector pixels2; + pixels2.resize(pixels.size()); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + + // Ensure that the first part contains at least the full DC of the image, + // otherwise flush does not work. + size_t first_part = data.size() - 1; + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + // Output buffer not yet set + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec)); + + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels2.data(), pixels2.size())); + + // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if + // data was already input before, since the processing of the frame only + // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME. + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec)); + + // Crude test of actual pixel data: pixel threshold of about 4% (2560/65535). + // 29000 pixels can be above the threshold + EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize, + ysize, format, format, 2560.0), + 29000u); + + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + size_t consumed = first_part - JxlDecoderReleaseInput(dec); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed, + data.size() - consumed)); + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + // Lower threshold for the final (still lossy) image + EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize, + ysize, format, format, 2560.0), + 11000u); + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, FlushTestImageOutCallback) { + // Size large enough for multiple groups, required to have progressive + // stages + size_t xsize = 333, ysize = 300; + uint32_t num_channels = 3; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + jxl::TestCodestreamParams params; + params.preview_mode = jxl::kSmallPreview; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + std::vector pixels2; + pixels2.resize(pixels.size()); + + size_t bytes_per_pixel = format.num_channels * 2; + size_t stride = bytes_per_pixel * xsize; + auto callback = [&](size_t x, size_t y, size_t num_pixels, + const void* pixels_row) { + memcpy(pixels2.data() + stride * y + bytes_per_pixel * x, pixels_row, + num_pixels * bytes_per_pixel); + }; + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + + // Ensure that the first part contains at least the full DC of the image, + // otherwise flush does not work. + size_t first_part = data.size() - 1; + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + // Output callback not yet set + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutCallback( + dec, &format, + [](void* opaque, size_t x, size_t y, + size_t xsize, const void* pixels_row) { + auto cb = + static_cast(opaque); + (*cb)(x, y, xsize, pixels_row); + }, + /*opaque=*/&callback)); + + // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if + // data was already input before, since the processing of the frame only + // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME. + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec)); + + // Crude test of actual pixel data: pixel threshold of about 4% (2560/65535). + // 29000 pixels can be above the threshold + EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize, + ysize, format, format, 2560.0), + 29000u); + + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + size_t consumed = first_part - JxlDecoderReleaseInput(dec); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed, + data.size() - consumed)); + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + // Lower threshold for the final (still lossy) image + EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize, + ysize, format, format, 2560.0), + 11000u); + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, FlushTestLossyProgressiveAlpha) { + // Size large enough for multiple groups, required to have progressive + // stages + size_t xsize = 333, ysize = 300; + uint32_t num_channels = 4; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + jxl::TestCodestreamParams params; + params.preview_mode = jxl::kSmallPreview; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + std::vector pixels2; + pixels2.resize(pixels.size()); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + + // Ensure that the first part contains at least the full DC of the image, + // otherwise flush does not work. + size_t first_part = data.size() - 1; + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + // Output buffer not yet set + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec)); + + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels2.data(), pixels2.size())); + + // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if + // data was already input before, since the processing of the frame only + // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME. + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec)); + + EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize, + ysize, format, format, 2560.0), + 30000u); + + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + size_t consumed = first_part - JxlDecoderReleaseInput(dec); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed, + data.size() - consumed)); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize, + ysize, format, format, 2560.0), + 11000u); + + JxlDecoderDestroy(dec); +} +TEST(DecodeTest, FlushTestLossyProgressiveAlphaUpsampling) { + size_t xsize = 533, ysize = 401; + uint32_t num_channels = 4; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + jxl::TestCodestreamParams params; + params.cparams.resampling = 2; + params.cparams.ec_resampling = 4; + params.preview_mode = jxl::kSmallPreview; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + std::vector pixels2; + pixels2.resize(pixels.size()); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + + // Ensure that the first part contains at least the full DC of the image, + // otherwise flush does not work. + size_t first_part = data.size() * 2 / 3; + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + // Output buffer not yet set + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec)); + + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels2.data(), pixels2.size())); + + // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if + // data was already input before, since the processing of the frame only + // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME. + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec)); + + EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize, + ysize, format, format, 2560.0), + 125000u); + + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + size_t consumed = first_part - JxlDecoderReleaseInput(dec); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed, + data.size() - consumed)); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize, + ysize, format, format, 2560.0), + 70000u); + + JxlDecoderDestroy(dec); +} +TEST(DecodeTest, FlushTestLosslessProgressiveAlpha) { + // Size large enough for multiple groups, required to have progressive + // stages + size_t xsize = 333, ysize = 300; + uint32_t num_channels = 4; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + jxl::TestCodestreamParams params; + params.cparams.SetLossless(); + params.cparams.speed_tier = jxl::SpeedTier::kThunder; + params.cparams.responsive = 1; + params.preview_mode = jxl::kSmallPreview; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + std::vector pixels2; + pixels2.resize(pixels.size()); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + + // Ensure that the first part contains at least the full DC of the image, + // otherwise flush does not work. + size_t first_part = data.size() / 2; + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + // Output buffer not yet set + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec)); + + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels2.data(), pixels2.size())); + + // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if + // data was already input before, since the processing of the frame only + // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME. + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec)); + + EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize, + ysize, format, format, 2560.0), + 2700u); + + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + size_t consumed = first_part - JxlDecoderReleaseInput(dec); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed, + data.size() - consumed)); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize, + ysize, format, format), + 0u); + + JxlDecoderDestroy(dec); +} + +class DecodeProgressiveTest : public ::testing::TestWithParam {}; +JXL_GTEST_INSTANTIATE_TEST_SUITE_P(DecodeProgressiveTestInstantiation, + DecodeProgressiveTest, + ::testing::Range(0, 8)); +TEST_P(DecodeProgressiveTest, ProgressiveEventTest) { + const int params = GetParam(); + int single_group = params & 1; + int lossless = (params >> 1) & 1; + uint32_t num_channels = 3 + ((params >> 2) & 1); + std::set progressive_details = {kDC, kLastPasses, + kPasses}; + for (auto prog_detail : progressive_details) { + // Only few combinations are expected to support outputting + // intermediate flushes for complete DC and complete passes. + // The test can be updated if more cases are expected to support it. + bool expect_flush = (num_channels & 1) && !lossless; + size_t xsize, ysize; + if (single_group) { + // An image smaller than 256x256 ensures it contains only 1 group. + xsize = 99; + ysize = 100; + } else { + xsize = 277; + ysize = 280; + } + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::ColorEncoding color_encoding = jxl::ColorEncoding::SRGB(false); + jxl::CodecInOut io; + EXPECT_TRUE(jxl::ConvertFromExternal( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + color_encoding, + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &io.Main())); + jxl::TestCodestreamParams params; + if (lossless) { + params.cparams.SetLossless(); + } else { + params.cparams.butteraugli_distance = 0.5f; + } + jxl::PassDefinition passes[] = { + {2, 0, 4}, {4, 0, 4}, {8, 2, 2}, {8, 1, 2}, {8, 0, 1}}; + const int kNumPasses = 5; + jxl::ProgressiveMode progressive_mode{passes}; + params.progressive_mode = &progressive_mode; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, params); + + for (size_t increment : {(size_t)1, data.size()}) { + printf( + "Testing with single_group=%d, lossless=%d, " + "num_channels=%d, prog_detail=%d, increment=%d\n", + single_group, lossless, (int)num_channels, (int)prog_detail, + (int)increment); + std::vector> passes(kNumPasses + 1); + for (int i = 0; i <= kNumPasses; ++i) { + passes[i].resize(pixels.size()); + } + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | + JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME_PROGRESSION)); + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSetProgressiveDetail(dec, kFrames)); + EXPECT_EQ(JXL_DEC_ERROR, + JxlDecoderSetProgressiveDetail(dec, kDCProgressive)); + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSetProgressiveDetail(dec, kDCGroups)); + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSetProgressiveDetail(dec, kGroups)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetProgressiveDetail(dec, prog_detail)); + + uint8_t* next_in = data.data(); + size_t avail_in = 0; + size_t pos = 0; + + auto process_input = [&]() { + for (;;) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, next_in, avail_in)); + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + size_t remaining = JxlDecoderReleaseInput(dec); + EXPECT_LE(remaining, avail_in); + next_in += avail_in - remaining; + avail_in = remaining; + if (status == JXL_DEC_NEED_MORE_INPUT && pos < data.size()) { + size_t chunk = std::min(increment, data.size() - pos); + pos += chunk; + avail_in += chunk; + continue; + } + return status; + } + }; + + EXPECT_EQ(JXL_DEC_BASIC_INFO, process_input()); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + + EXPECT_EQ(JXL_DEC_FRAME, process_input()); + + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(pixels.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, passes[kNumPasses].data(), + passes[kNumPasses].size())); + + auto next_pass = [&](int pass) { + if (prog_detail <= kDC) return kNumPasses; + if (prog_detail <= kLastPasses) { + return std::min(pass + 2, kNumPasses); + } + return pass + 1; + }; + + if (expect_flush) { + // Return a particular downsampling ratio only after the last + // pass for that downsampling was processed. + int expected_downsampling_ratios[] = {8, 8, 4, 4, 2}; + for (int p = 0; p < kNumPasses; p = next_pass(p)) { + EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, process_input()); + EXPECT_EQ(expected_downsampling_ratios[p], + JxlDecoderGetIntendedDownsamplingRatio(dec)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec)); + passes[p] = passes[kNumPasses]; + } + } + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, process_input()); + EXPECT_EQ(JXL_DEC_SUCCESS, process_input()); + + JxlDecoderDestroy(dec); + + if (!expect_flush) { + continue; + } + jxl::ButteraugliParams ba; + std::vector distances(kNumPasses + 1); + for (int p = 0;; p = next_pass(p)) { + jxl::CodecInOut io1; + EXPECT_TRUE(jxl::ConvertFromExternal( + jxl::Span(passes[p].data(), passes[p].size()), xsize, + ysize, color_encoding, + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, &io1.Main())); + distances[p] = ButteraugliDistance(io.frames, io1.frames, ba, + jxl::GetJxlCms(), nullptr, nullptr); + if (p == kNumPasses) break; + } + const float kMaxDistance[kNumPasses + 1] = {30.0f, 20.0f, 10.0f, + 5.0f, 3.0f, 2.0f}; + EXPECT_LT(distances[kNumPasses], kMaxDistance[kNumPasses]); + for (int p = 0; p < kNumPasses;) { + int next_p = next_pass(p); + EXPECT_LT(distances[p], kMaxDistance[p]); + // Verify that the returned pass image is actually not the + // same as the next pass image, by checking that it has a bit + // worse butteraugli score. + EXPECT_LT(distances[next_p] * 1.1f, distances[p]); + p = next_p; + } + } + } +} + +void VerifyJPEGReconstruction(const jxl::PaddedBytes& container, + const jxl::PaddedBytes& jpeg_bytes) { + JxlDecoderPtr dec = JxlDecoderMake(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec.get(), JXL_DEC_JPEG_RECONSTRUCTION | JXL_DEC_FULL_IMAGE)); + JxlDecoderSetInput(dec.get(), container.data(), container.size()); + EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec.get())); + std::vector reconstructed_buffer(128); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data(), + reconstructed_buffer.size())); + size_t used = 0; + JxlDecoderStatus process_result = JXL_DEC_JPEG_NEED_MORE_OUTPUT; + while (process_result == JXL_DEC_JPEG_NEED_MORE_OUTPUT) { + used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get()); + reconstructed_buffer.resize(reconstructed_buffer.size() * 2); + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data() + used, + reconstructed_buffer.size() - used)); + process_result = JxlDecoderProcessInput(dec.get()); + } + ASSERT_EQ(JXL_DEC_FULL_IMAGE, process_result); + used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get()); + ASSERT_EQ(used, jpeg_bytes.size()); + EXPECT_EQ(0, memcmp(reconstructed_buffer.data(), jpeg_bytes.data(), used)); +} + +#if JPEGXL_ENABLE_JPEG +TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructTestCodestream)) { + size_t xsize = 123; + size_t ysize = 77; + size_t channels = 3; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, channels, /*seed=*/0); + jxl::PaddedBytes jpeg_codestream; + jxl::TestCodestreamParams params; + params.cparams.color_transform = jxl::ColorTransform::kNone; + params.box_format = kCSBF_Single; + params.jpeg_codestream = &jpeg_codestream; + params.preview_mode = jxl::kSmallPreview; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + channels, params); + VerifyJPEGReconstruction(compressed, jpeg_codestream); +} +#endif // JPEGXL_ENABLE_JPEG + +TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) { + const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg"; + const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path); + jxl::CodecInOut orig_io; + ASSERT_TRUE( + jxl::jpeg::DecodeImageJPG(jxl::Span(orig), &orig_io)); + orig_io.metadata.m.xyb_encoded = false; + jxl::BitWriter writer; + ASSERT_TRUE(WriteCodestreamHeaders(&orig_io.metadata, &writer, nullptr)); + writer.ZeroPadToByte(); + jxl::PassesEncoderState enc_state; + jxl::CompressParams cparams; + cparams.color_transform = jxl::ColorTransform::kNone; + ASSERT_TRUE(jxl::EncodeFrame(cparams, jxl::FrameInfo{}, &orig_io.metadata, + orig_io.Main(), &enc_state, jxl::GetJxlCms(), + /*pool=*/nullptr, &writer, + /*aux_out=*/nullptr)); + + jxl::PaddedBytes jpeg_data; + ASSERT_TRUE( + EncodeJPEGData(*orig_io.Main().jpeg_data.get(), &jpeg_data, cparams)); + jxl::PaddedBytes container; + container.append(jxl::kContainerHeader, + jxl::kContainerHeader + sizeof(jxl::kContainerHeader)); + jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false, + &container); + container.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size()); + jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), 0, true, &container); + jxl::PaddedBytes codestream = std::move(writer).TakeBytes(); + container.append(codestream.data(), codestream.data() + codestream.size()); + VerifyJPEGReconstruction(container, orig); +} + +TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionMetadataTest)) { + const std::string jpeg_path = "jxl/jpeg_reconstruction/1x1_exif_xmp.jpg"; + const std::string jxl_path = "jxl/jpeg_reconstruction/1x1_exif_xmp.jxl"; + const jxl::PaddedBytes jpeg = jxl::test::ReadTestData(jpeg_path); + const jxl::PaddedBytes jxl = jxl::test::ReadTestData(jxl_path); + VerifyJPEGReconstruction(jxl, jpeg); +} + +TEST(DecodeTest, ContinueFinalNonEssentialBoxTest) { + size_t xsize = 80, ysize = 90; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + jxl::TestCodestreamParams params; + params.box_format = kCSBF_Multi_Other_Terminated; + params.add_icc_profile = true; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 4, + params); + StreamPositions streampos; + AnalyzeCodestream(compressed, &streampos); + + // The non-essential final box size including 8-byte header + size_t final_box_size = unk3_box_size + 8; + size_t last_box_begin = compressed.size() - final_box_size; + // Verify that the test is indeed setup correctly to be at the beginning of + // the 'unkn' box header. + ASSERT_EQ(compressed[last_box_begin + 3], final_box_size); + ASSERT_EQ(compressed[last_box_begin + 4], 'u'); + ASSERT_EQ(compressed[last_box_begin + 5], 'n'); + ASSERT_EQ(compressed[last_box_begin + 6], 'k'); + ASSERT_EQ(compressed[last_box_begin + 7], '3'); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, compressed.data(), last_box_begin)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + // The decoder returns success despite not having seen the final unknown box + // yet. This is because calling JxlDecoderCloseInput is not mandatory for + // backwards compatibility, so it doesn't know more bytes follow, the current + // bytes ended at a perfectly valid place. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + size_t remaining = JxlDecoderReleaseInput(dec); + // Since the test was set up to end exactly at the boundary of the final + // codestream box, and the decoder returned success, all bytes are expected to + // be consumed until the end of the frame header. + EXPECT_EQ(remaining, last_box_begin - streampos.frames[0].toc_end); + + // Now set the remaining non-codestream box as input. + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, compressed.data() + last_box_begin, + compressed.size() - last_box_begin)); + // Even though JxlDecoderProcessInput already returned JXL_DEC_SUCCESS before, + // when calling it again now after setting more input, success is expected, no + // event occurs but the box has been successfully skipped. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + JxlDecoderDestroy(dec); +} + +namespace { +bool BoxTypeEquals(const std::string& type_string, JxlBoxType type) { + return type_string.size() == 4 && type_string[0] == type[0] && + type_string[1] == type[1] && type_string[2] == type[2] && + type_string[3] == type[3]; +} +} // namespace + +TEST(DecodeTest, ExtentedBoxSizeTest) { + const std::string jxl_path = "jxl/boxes/square-extended-size-container.jxl"; + const jxl::PaddedBytes orig = jxl::test::ReadTestData(jxl_path); + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX)); + + JxlBoxType type; + uint64_t box_size; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, orig.data(), orig.size())); + EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE)); + EXPECT_TRUE(BoxTypeEquals("JXL ", type)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size)); + EXPECT_EQ(12, box_size); + EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE)); + EXPECT_TRUE(BoxTypeEquals("ftyp", type)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size)); + EXPECT_EQ(20, box_size); + EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE)); + EXPECT_TRUE(BoxTypeEquals("jxlc", type)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size)); + EXPECT_EQ(72, box_size); + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, JXL_BOXES_TEST(BoxTest)) { + size_t xsize = 1, ysize = 1; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + jxl::TestCodestreamParams params; + params.box_format = kCSBF_Multi_Other_Terminated; + params.add_icc_profile = true; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 4, + params); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX)); + + std::vector expected_box_types = { + "JXL ", "ftyp", "jxlp", "unk1", "unk2", "jxlp", "jxlp", "jxlp", "unk3"}; + + // Value 0 means to not test the size: codestream is not required to be a + // particular exact size. + std::vector expected_box_sizes = {12, 20, 0, 34, 18, 0, 0, 0, 20}; + + JxlBoxType type; + uint64_t box_size; + std::vector contents(50); + size_t expected_release_size = 0; + + // Cannot get these when decoding didn't start yet + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderGetBoxType(dec, type, JXL_FALSE)); + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderGetBoxSizeRaw(dec, &box_size)); + + uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + for (size_t i = 0; i < expected_box_types.size(); i++) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size)); + EXPECT_TRUE(BoxTypeEquals(expected_box_types[i], type)); + if (expected_box_sizes[i]) { + EXPECT_EQ(expected_box_sizes[i], box_size); + } + + if (expected_release_size > 0) { + EXPECT_EQ(expected_release_size, JxlDecoderReleaseBoxBuffer(dec)); + expected_release_size = 0; + } + + if (type[0] == 'u' && type[1] == 'n' && type[2] == 'k') { + JxlDecoderSetBoxBuffer(dec, contents.data(), contents.size()); + size_t expected_box_contents_size = + type[3] == '1' ? unk1_box_size + : (type[3] == '2' ? unk2_box_size : unk3_box_size); + expected_release_size = contents.size() - expected_box_contents_size; + } + size_t consumed = avail_in - JxlDecoderReleaseInput(dec); + next_in += consumed; + avail_in -= consumed; + } + + // After the last DEC_BOX event, check that the input position is exactly at + // the stat of the box header. + EXPECT_EQ(avail_in, expected_box_sizes.back()); + + // Even though all input is given, the decoder cannot assume there aren't + // more boxes if the input was not closed. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + JxlDecoderCloseInput(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, JXL_BOXES_TEST(ExifBrobBoxTest)) { + size_t xsize = 1, ysize = 1; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + jxl::TestCodestreamParams params; + // Lossless to verify pixels exactly after roundtrip. + params.cparams.SetLossless(); + params.box_format = kCSBF_Brob_Exif; + params.add_icc_profile = true; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 4, + params); + + // Test raw brob box, not brotli-decompressing + for (int streaming = 0; streaming < 2; ++streaming) { + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX)); + if (!streaming) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, compressed.data(), compressed.size())); + JxlDecoderCloseInput(dec); + } + // for streaming input case + const uint8_t* next_in = compressed.data(); + size_t avail_in = 0; + size_t total_in = 0; + size_t step_size = 64; + + std::vector box_buffer; + size_t box_num_output; + bool seen_brob_begin = false; + bool seen_brob_end = false; + + for (;;) { + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + if (status == JXL_DEC_NEED_MORE_INPUT) { + if (streaming) { + size_t remaining = JxlDecoderReleaseInput(dec); + EXPECT_LE(remaining, avail_in); + next_in += avail_in - remaining; + avail_in = remaining; + size_t amount = step_size; + if (total_in + amount > compressed.size()) { + amount = compressed.size() - total_in; + } + avail_in += amount; + total_in += amount; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, next_in, avail_in)); + if (total_in == compressed.size()) JxlDecoderCloseInput(dec); + } else { + FAIL(); + break; + } + } else if (status == JXL_DEC_BOX || status == JXL_DEC_SUCCESS) { + if (!box_buffer.empty()) { + EXPECT_EQ(false, seen_brob_end); + seen_brob_end = true; + size_t remaining = JxlDecoderReleaseBoxBuffer(dec); + box_num_output = box_buffer.size() - remaining; + EXPECT_EQ(box_num_output, box_brob_exif_size - 8); + EXPECT_EQ( + 0, memcmp(box_buffer.data(), box_brob_exif + 8, box_num_output)); + box_buffer.clear(); + } + if (status == JXL_DEC_SUCCESS) break; + JxlBoxType type; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE)); + if (BoxTypeEquals("brob", type)) { + EXPECT_EQ(false, seen_brob_begin); + seen_brob_begin = true; + box_buffer.resize(8); + JxlDecoderSetBoxBuffer(dec, box_buffer.data(), box_buffer.size()); + } + } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) { + size_t remaining = JxlDecoderReleaseBoxBuffer(dec); + box_num_output = box_buffer.size() - remaining; + box_buffer.resize(box_buffer.size() * 2); + JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output, + box_buffer.size() - box_num_output); + } else { + // We do not expect any other events or errors + FAIL(); + break; + } + } + + EXPECT_EQ(true, seen_brob_begin); + EXPECT_EQ(true, seen_brob_end); + + JxlDecoderDestroy(dec); + } + + // Test decompressed brob box + for (int streaming = 0; streaming < 2; ++streaming) { + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX)); + if (!streaming) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, compressed.data(), compressed.size())); + JxlDecoderCloseInput(dec); + } + // for streaming input case + const uint8_t* next_in = compressed.data(); + size_t avail_in = 0; + size_t total_in = 0; + size_t step_size = 64; + + std::vector box_buffer; + size_t box_num_output; + bool seen_exif_begin = false; + bool seen_exif_end = false; + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetDecompressBoxes(dec, JXL_TRUE)); + + for (;;) { + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + if (status == JXL_DEC_NEED_MORE_INPUT) { + if (streaming) { + size_t remaining = JxlDecoderReleaseInput(dec); + EXPECT_LE(remaining, avail_in); + next_in += avail_in - remaining; + avail_in = remaining; + size_t amount = step_size; + if (total_in + amount > compressed.size()) { + amount = compressed.size() - total_in; + } + avail_in += amount; + total_in += amount; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, next_in, avail_in)); + if (total_in == compressed.size()) JxlDecoderCloseInput(dec); + } else { + FAIL(); + break; + } + } else if (status == JXL_DEC_BOX || status == JXL_DEC_SUCCESS) { + if (!box_buffer.empty()) { + EXPECT_EQ(false, seen_exif_end); + seen_exif_end = true; + size_t remaining = JxlDecoderReleaseBoxBuffer(dec); + box_num_output = box_buffer.size() - remaining; + // Expect that the output has the same size and contents as the + // uncompressed exif data. Only check contents if the sizes match to + // avoid comparing uninitialized memory in the test. + EXPECT_EQ(box_num_output, exif_uncompressed_size); + if (box_num_output == exif_uncompressed_size) { + EXPECT_EQ(0, memcmp(box_buffer.data(), exif_uncompressed, + exif_uncompressed_size)); + } + box_buffer.clear(); + } + if (status == JXL_DEC_SUCCESS) break; + JxlBoxType type; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_TRUE)); + if (BoxTypeEquals("Exif", type)) { + EXPECT_EQ(false, seen_exif_begin); + seen_exif_begin = true; + box_buffer.resize(8); + JxlDecoderSetBoxBuffer(dec, box_buffer.data(), box_buffer.size()); + } + } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) { + size_t remaining = JxlDecoderReleaseBoxBuffer(dec); + box_num_output = box_buffer.size() - remaining; + box_buffer.resize(box_buffer.size() * 2); + JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output, + box_buffer.size() - box_num_output); + } else { + // We do not expect any other events or errors + FAIL(); + break; + } + } + + EXPECT_EQ(true, seen_exif_begin); + EXPECT_EQ(true, seen_exif_end); + + JxlDecoderDestroy(dec); + } +} + +TEST(DecodeTest, JXL_BOXES_TEST(PartialCodestreamBoxTest)) { + size_t xsize = 23, ysize = 81; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + // Lossless to verify pixels exactly after roundtrip. + jxl::TestCodestreamParams params; + params.cparams.SetLossless(); + params.cparams.speed_tier = jxl::SpeedTier::kThunder; + params.box_format = kCSBF_Multi; + params.add_icc_profile = true; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 4, + params); + + std::vector extracted_codestream; + + { + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_BOX)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, compressed.data(), compressed.size())); + JxlDecoderCloseInput(dec); + + size_t num_jxlp = 0; + + std::vector pixels2; + pixels2.resize(pixels.size()); + + std::vector box_buffer; + size_t box_num_output; + + for (;;) { + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + if (status == JXL_DEC_NEED_MORE_INPUT) { + FAIL(); + break; + } else if (status == JXL_DEC_BASIC_INFO) { + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format_orig, pixels2.data(), + pixels2.size())); + } else if (status == JXL_DEC_FULL_IMAGE) { + continue; + } else if (status == JXL_DEC_BOX || status == JXL_DEC_SUCCESS) { + if (!box_buffer.empty()) { + size_t remaining = JxlDecoderReleaseBoxBuffer(dec); + box_num_output = box_buffer.size() - remaining; + EXPECT_GE(box_num_output, 4); + // Do not insert the first 4 bytes, which are not part of the + // codestream, but the partial codestream box index + extracted_codestream.insert(extracted_codestream.end(), + box_buffer.begin() + 4, + box_buffer.begin() + box_num_output); + box_buffer.clear(); + } + if (status == JXL_DEC_SUCCESS) break; + JxlBoxType type; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE)); + if (BoxTypeEquals("jxlp", type)) { + num_jxlp++; + box_buffer.resize(8); + JxlDecoderSetBoxBuffer(dec, box_buffer.data(), box_buffer.size()); + } + } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) { + size_t remaining = JxlDecoderReleaseBoxBuffer(dec); + box_num_output = box_buffer.size() - remaining; + box_buffer.resize(box_buffer.size() * 2); + JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output, + box_buffer.size() - box_num_output); + } else { + // We do not expect any other events or errors + FAIL(); + break; + } + } + + // The test file created with kCSBF_Multi is expected to have 4 jxlp boxes. + EXPECT_EQ(4, num_jxlp); + + EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize, + ysize, format_orig, format_orig)); + + JxlDecoderDestroy(dec); + } + + // Now test whether the codestream extracted from the jxlp boxes can itself + // also be decoded and gives the same pixels + { + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_BOX)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, extracted_codestream.data(), + extracted_codestream.size())); + JxlDecoderCloseInput(dec); + + size_t num_boxes = 0; + + std::vector pixels2; + pixels2.resize(pixels.size()); + + std::vector box_buffer; + size_t box_num_output; + + for (;;) { + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + if (status == JXL_DEC_NEED_MORE_INPUT) { + FAIL(); + break; + } else if (status == JXL_DEC_BASIC_INFO) { + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format_orig, pixels2.data(), + pixels2.size())); + } else if (status == JXL_DEC_FULL_IMAGE) { + continue; + } else if (status == JXL_DEC_BOX) { + num_boxes++; + } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) { + size_t remaining = JxlDecoderReleaseBoxBuffer(dec); + box_num_output = box_buffer.size() - remaining; + box_buffer.resize(box_buffer.size() * 2); + JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output, + box_buffer.size() - box_num_output); + } else if (status == JXL_DEC_SUCCESS) { + break; + } else { + // We do not expect any other events or errors + FAIL(); + break; + } + } + + EXPECT_EQ(0, num_boxes); // The data does not use the container format. + EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize, + ysize, format_orig, format_orig)); + + JxlDecoderDestroy(dec); + } +} + +TEST(DecodeTest, SpotColorTest) { + jxl::ThreadPool* pool = nullptr; + jxl::CodecInOut io; + size_t xsize = 55, ysize = 257; + io.metadata.m.color_encoding = jxl::ColorEncoding::LinearSRGB(); + jxl::Image3F main(xsize, ysize); + jxl::ImageF spot(xsize, ysize); + jxl::ZeroFillImage(&main); + jxl::ZeroFillImage(&spot); + + for (size_t y = 0; y < ysize; y++) { + float* JXL_RESTRICT rowm = main.PlaneRow(1, y); + float* JXL_RESTRICT rows = spot.Row(y); + for (size_t x = 0; x < xsize; x++) { + rowm[x] = (x + y) * (1.f / 255.f); + rows[x] = ((x ^ y) & 255) * (1.f / 255.f); + } + } + io.SetFromImage(std::move(main), jxl::ColorEncoding::LinearSRGB()); + jxl::ExtraChannelInfo info; + info.bit_depth.bits_per_sample = 8; + info.dim_shift = 0; + info.type = jxl::ExtraChannel::kSpotColor; + info.spot_color[0] = 0.5f; + info.spot_color[1] = 0.2f; + info.spot_color[2] = 1.f; + info.spot_color[3] = 0.5f; + + io.metadata.m.extra_channel_info.push_back(info); + std::vector ec; + ec.push_back(std::move(spot)); + io.frames[0].SetExtraChannels(std::move(ec)); + + jxl::CompressParams cparams; + cparams.speed_tier = jxl::SpeedTier::kLightning; + cparams.modular_mode = true; + cparams.color_transform = jxl::ColorTransform::kNone; + cparams.butteraugli_distance = 0.f; + + jxl::PaddedBytes compressed; + std::unique_ptr enc_state = + jxl::make_unique(); + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, enc_state.get(), &compressed, + jxl::GetJxlCms(), nullptr, pool)); + + for (size_t render_spot = 0; render_spot < 2; render_spot++) { + JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + JxlDecoder* dec = JxlDecoderCreate(NULL); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE)); + if (!render_spot) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetRenderSpotcolors(dec, JXL_FALSE)); + } + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, compressed.data(), compressed.size())); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo binfo; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &binfo)); + EXPECT_EQ(1u, binfo.num_extra_channels); + EXPECT_EQ(xsize, binfo.xsize); + EXPECT_EQ(ysize, binfo.ysize); + + JxlExtraChannelInfo extra_info; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetExtraChannelInfo(dec, 0, &extra_info)); + EXPECT_EQ((unsigned int)jxl::ExtraChannel::kSpotColor, extra_info.type); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + size_t extra_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderExtraChannelBufferSize(dec, &format, &extra_size, 0)); + + std::vector image(buffer_size); + std::vector extra(extra_size); + size_t bytes_per_pixel = format.num_channels * + jxl::test::GetDataBits(format.data_type) / + jxl::kBitsPerByte; + size_t stride = bytes_per_pixel * binfo.xsize; + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, image.data(), image.size())); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetExtraChannelBuffer(dec, &format, extra.data(), + extra.size(), 0)); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + + // After the full image was output, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + JxlDecoderDestroy(dec); + + for (size_t y = 0; y < ysize; y++) { + uint8_t* JXL_RESTRICT rowm = image.data() + stride * y; + uint8_t* JXL_RESTRICT rows = extra.data() + xsize * y; + for (size_t x = 0; x < xsize; x++) { + if (!render_spot) { + // if spot color isn't rendered, main image should be as we made it + // (red and blue are all zeroes) + + EXPECT_EQ(rowm[x * 3 + 0], 0); + EXPECT_EQ(rowm[x * 3 + 1], (x + y > 255 ? 255 : x + y)); + EXPECT_EQ(rowm[x * 3 + 2], 0); + } + if (render_spot) { + // if spot color is rendered, expect red and blue to look like the + // spot color channel + EXPECT_LT(abs(rowm[x * 3 + 0] - (rows[x] * 0.25f)), 1); + EXPECT_LT(abs(rowm[x * 3 + 2] - (rows[x] * 0.5f)), 1); + } + EXPECT_EQ(rows[x], ((x ^ y) & 255)); + } + } + } +} + +TEST(DecodeTest, CloseInput) { + std::vector partial_file = {0xff}; + + JxlDecoderPtr dec = JxlDecoderMake(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec.get(), + JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec.get(), partial_file.data(), + partial_file.size())); + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec.get())); + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec.get())); + JxlDecoderCloseInput(dec.get()); + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderProcessInput(dec.get())); +} diff --git a/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc new file mode 100644 index 0000000000..aa57b2723f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc @@ -0,0 +1,169 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/decode_to_jpeg.h" + +namespace jxl { + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + +JxlDecoderStatus JxlToJpegDecoder::Process(const uint8_t** next_in, + size_t* avail_in) { + if (!inside_box_) { + JXL_ABORT( + "processing of JPEG reconstruction data outside JPEG reconstruction " + "box"); + } + Span to_decode; + if (box_until_eof_) { + // Until EOF means consume all data. + to_decode = Span(*next_in, *avail_in); + *next_in += *avail_in; + *avail_in = 0; + } else { + // Defined size means consume min(available, needed). + size_t avail_recon_in = + std::min(*avail_in, box_size_ - buffer_.size()); + to_decode = Span(*next_in, avail_recon_in); + *next_in += avail_recon_in; + *avail_in -= avail_recon_in; + } + bool old_data_exists = !buffer_.empty(); + if (old_data_exists) { + // Append incoming data to buffer if we already had data in the buffer. + buffer_.insert(buffer_.end(), to_decode.data(), + to_decode.data() + to_decode.size()); + to_decode = Span(buffer_.data(), buffer_.size()); + } + if (!box_until_eof_ && to_decode.size() > box_size_) { + JXL_ABORT("JPEG reconstruction data to decode larger than expected"); + } + if (box_until_eof_ || to_decode.size() == box_size_) { + // If undefined size, or the right size, try to decode. + jpeg_data_ = make_unique(); + const auto status = jpeg::DecodeJPEGData(to_decode, jpeg_data_.get()); + if (status.IsFatalError()) return JXL_DEC_ERROR; + if (status) { + // Successful decoding, emit event after updating state to track that we + // are no longer parsing JPEG reconstruction data. + inside_box_ = false; + return JXL_DEC_JPEG_RECONSTRUCTION; + } + if (box_until_eof_) { + // Unsuccessful decoding and undefined size, assume incomplete data. Copy + // the data if we haven't already. + if (!old_data_exists) { + buffer_.insert(buffer_.end(), to_decode.data(), + to_decode.data() + to_decode.size()); + } + } else { + // Unsuccessful decoding of correct amount of data, assume error. + return JXL_DEC_ERROR; + } + } else { + // Not enough data, copy the data if we haven't already. + if (!old_data_exists) { + buffer_.insert(buffer_.end(), to_decode.data(), + to_decode.data() + to_decode.size()); + } + } + return JXL_DEC_NEED_MORE_INPUT; +} + +size_t JxlToJpegDecoder::NumExifMarkers(const jpeg::JPEGData& jpeg_data) { + size_t num = 0; + for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) { + if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kExif) { + num++; + } + } + return num; +} + +size_t JxlToJpegDecoder::NumXmpMarkers(const jpeg::JPEGData& jpeg_data) { + size_t num = 0; + for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) { + if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kXMP) { + num++; + } + } + return num; +} + +JxlDecoderStatus JxlToJpegDecoder::ExifBoxContentSize( + const jpeg::JPEGData& jpeg_data, size_t* size) { + for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) { + if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kExif) { + if (jpeg_data.app_data[i].size() < 3 + sizeof(jpeg::kExifTag)) { + // too small for app marker header + return JXL_DEC_ERROR; + } + // The first 4 bytes are the TIFF header from the box contents, and are + // not included in the JPEG + *size = jpeg_data.app_data[i].size() + 4 - 3 - sizeof(jpeg::kExifTag); + return JXL_DEC_SUCCESS; + } + } + return JXL_DEC_ERROR; +} + +JxlDecoderStatus JxlToJpegDecoder::XmlBoxContentSize( + const jpeg::JPEGData& jpeg_data, size_t* size) { + for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) { + if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kXMP) { + if (jpeg_data.app_data[i].size() < 3 + sizeof(jpeg::kXMPTag)) { + // too small for app marker header + return JXL_DEC_ERROR; + } + *size = jpeg_data.app_data[i].size() - 3 - sizeof(jpeg::kXMPTag); + return JXL_DEC_SUCCESS; + } + } + return JXL_DEC_ERROR; +} + +JxlDecoderStatus JxlToJpegDecoder::SetExif(const uint8_t* data, size_t size, + jpeg::JPEGData* jpeg_data) { + for (size_t i = 0; i < jpeg_data->app_data.size(); ++i) { + if (jpeg_data->app_marker_type[i] == jxl::jpeg::AppMarkerType::kExif) { + if (jpeg_data->app_data[i].size() != + size + 3 + sizeof(jpeg::kExifTag) - 4) + return JXL_DEC_ERROR; + // The first 9 bytes are used for JPEG marker header. + jpeg_data->app_data[i][0] = 0xE1; + // The second and third byte are already filled in correctly + memcpy(jpeg_data->app_data[i].data() + 3, jpeg::kExifTag, + sizeof(jpeg::kExifTag)); + // The first 4 bytes are the TIFF header from the box contents, and are + // not included in the JPEG + memcpy(jpeg_data->app_data[i].data() + 3 + sizeof(jpeg::kExifTag), + data + 4, size - 4); + return JXL_DEC_SUCCESS; + } + } + return JXL_DEC_ERROR; +} +JxlDecoderStatus JxlToJpegDecoder::SetXmp(const uint8_t* data, size_t size, + jpeg::JPEGData* jpeg_data) { + for (size_t i = 0; i < jpeg_data->app_data.size(); ++i) { + if (jpeg_data->app_marker_type[i] == jxl::jpeg::AppMarkerType::kXMP) { + if (jpeg_data->app_data[i].size() != size + 3 + sizeof(jpeg::kXMPTag)) + return JXL_DEC_ERROR; + // The first 9 bytes are used for JPEG marker header. + jpeg_data->app_data[i][0] = 0xE1; + // The second and third byte are already filled in correctly + memcpy(jpeg_data->app_data[i].data() + 3, jpeg::kXMPTag, + sizeof(jpeg::kXMPTag)); + memcpy(jpeg_data->app_data[i].data() + 3 + sizeof(jpeg::kXMPTag), data, + size); + return JXL_DEC_SUCCESS; + } + } + return JXL_DEC_ERROR; +} + +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h new file mode 100644 index 0000000000..a64ace27a2 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h @@ -0,0 +1,217 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DECODE_TO_JPEG_H_ +#define LIB_JXL_DECODE_TO_JPEG_H_ + +// JPEG XL to JPEG bytes decoder logic. The JxlToJpegDecoder class keeps track +// of the decoder state needed to parse the JPEG reconstruction box and provide +// the reconstructed JPEG to the output buffer. + +#include +#include +#include + +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" // JPEGXL_ENABLE_TRANSCODE_JPEG +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/jpeg/dec_jpeg_data.h" +#if JPEGXL_ENABLE_TRANSCODE_JPEG +#include "lib/jxl/jpeg/dec_jpeg_data_writer.h" +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +namespace jxl { + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + +class JxlToJpegDecoder { + public: + // Returns whether an output buffer is set. + bool IsOutputSet() const { return next_out_ != nullptr; } + + // Returns whether the decoder is parsing a boxa JPEG box was parsed. + bool IsParsingBox() const { return inside_box_; } + + // Sets the output buffer used when producing JPEG output. + JxlDecoderStatus SetOutputBuffer(uint8_t* data, size_t size) { + if (next_out_) return JXL_DEC_ERROR; + next_out_ = data; + avail_size_ = size; + return JXL_DEC_SUCCESS; + } + + // Releases the buffer set with SetOutputBuffer(). + size_t ReleaseOutputBuffer() { + size_t result = avail_size_; + next_out_ = nullptr; + avail_size_ = 0; + return result; + } + + void StartBox(bool box_until_eof, size_t contents_size) { + // A new box implies that we clear the buffer. + buffer_.clear(); + inside_box_ = true; + if (box_until_eof) { + box_until_eof_ = true; + } else { + box_size_ = contents_size; + } + } + + // Consumes data from next_in/avail_in to reconstruct JPEG data. + // Uses box_size_, inside_box_ and box_until_eof_ to calculate how much to + // consume. Potentially stores unparsed data in buffer_. + // Potentially populates jpeg_data_. Potentially updates inside_box_. + // Returns JXL_DEC_JPEG_RECONSTRUCTION when finished, JXL_DEC_NEED_MORE_INPUT + // if more input is needed, JXL_DEC_ERROR on parsing error. + JxlDecoderStatus Process(const uint8_t** next_in, size_t* avail_in); + + // Returns non-owned copy of the JPEGData, only after Process finished and + // the JPEGData was not yet moved to an image bundle with + // SetImageBundleJpegData. + jpeg::JPEGData* GetJpegData() { return jpeg_data_.get(); } + + // Returns how many exif or xmp app markers are present in the JPEG data. A + // return value higher than 1 would require multiple exif boxes or multiple + // xmp boxes in the container format, and this is not supported by the API and + // considered an error. May only be called after Process returned success. + static size_t NumExifMarkers(const jpeg::JPEGData& jpeg_data); + static size_t NumXmpMarkers(const jpeg::JPEGData& jpeg_data); + + // Returns box content size for metadata, using the known data from the app + // markers. + static JxlDecoderStatus ExifBoxContentSize(const jpeg::JPEGData& jpeg_data, + size_t* size); + static JxlDecoderStatus XmlBoxContentSize(const jpeg::JPEGData& jpeg_data, + size_t* size); + + // Returns JXL_DEC_ERROR if there is no exif/XMP marker or the data size + // does not match, or this function is called before Process returned + // success, JXL_DEC_SUCCESS otherwise. As input, provide the full box contents + // but not the box header. In case of exif, this includes the 4-byte TIFF + // header, even though it won't be copied into the JPEG. + static JxlDecoderStatus SetExif(const uint8_t* data, size_t size, + jpeg::JPEGData* jpeg_data); + static JxlDecoderStatus SetXmp(const uint8_t* data, size_t size, + jpeg::JPEGData* jpeg_data); + + // Sets the JpegData of the ImageBundle passed if there is anything to set. + // Releases the JpegData from this decoder if set. + Status SetImageBundleJpegData(ImageBundle* ib) { + if (IsOutputSet() && jpeg_data_ != nullptr) { + if (!jpeg::SetJPEGDataFromICC(ib->metadata()->color_encoding.ICC(), + jpeg_data_.get())) { + return false; + } + ib->jpeg_data.reset(jpeg_data_.release()); + } + return true; + } + + JxlDecoderStatus WriteOutput(const jpeg::JPEGData& jpeg_data) { + // Copy JPEG bytestream if desired. + uint8_t* tmp_next_out = next_out_; + size_t tmp_avail_size = avail_size_; + auto write = [&tmp_next_out, &tmp_avail_size](const uint8_t* buf, + size_t len) { + size_t to_write = std::min(tmp_avail_size, len); + if (to_write != 0) memcpy(tmp_next_out, buf, to_write); + tmp_next_out += to_write; + tmp_avail_size -= to_write; + return to_write; + }; + Status write_result = jpeg::WriteJpeg(jpeg_data, write); + if (!write_result) { + if (tmp_avail_size == 0) { + return JXL_DEC_JPEG_NEED_MORE_OUTPUT; + } + return JXL_DEC_ERROR; + } + next_out_ = tmp_next_out; + avail_size_ = tmp_avail_size; + return JXL_DEC_SUCCESS; + } + + private: + // Content of the most recently parsed JPEG reconstruction box if any. + std::vector buffer_; + + // Decoded content of the most recently parsed JPEG reconstruction box is + // stored here. + std::unique_ptr jpeg_data_; + + // True if the decoder is currently reading bytes inside a JPEG reconstruction + // box. + bool inside_box_ = false; + + // True if the JPEG reconstruction box had undefined size (all remaining + // bytes). + bool box_until_eof_ = false; + // Size of most recently parsed JPEG reconstruction box contents. + size_t box_size_ = 0; + + // Next bytes to write JPEG reconstruction to. + uint8_t* next_out_ = nullptr; + // Available bytes to write JPEG reconstruction to. + size_t avail_size_ = 0; +}; + +#else + +// Fake class that disables support for decoding JPEG XL to JPEG. +class JxlToJpegDecoder { + public: + bool IsOutputSet() const { return false; } + bool IsParsingBox() const { return false; } + + JxlDecoderStatus SetOutputBuffer(uint8_t* /* data */, size_t /* size */) { + return JXL_DEC_ERROR; + } + size_t ReleaseOutputBuffer() { return 0; } + + void StartBox(bool /* box_until_eof */, size_t /* contents_size */) {} + + JxlDecoderStatus Process(const uint8_t** next_in, size_t* avail_in) { + return JXL_DEC_ERROR; + } + jpeg::JPEGData* GetJpegData() { return nullptr; } + + Status SetImageBundleJpegData(ImageBundle* /* ib */) { return true; } + + static size_t NumExifMarkers(const jpeg::JPEGData& /*jpeg_data*/) { + return 0; + } + static size_t NumXmpMarkers(const jpeg::JPEGData& /*jpeg_data*/) { return 0; } + static size_t ExifBoxContentSize(const jpeg::JPEGData& /*jpeg_data*/, + size_t* /*size*/) { + return JXL_DEC_ERROR; + } + static size_t XmlBoxContentSize(const jpeg::JPEGData& /*jpeg_data*/, + size_t* /*size*/) { + return JXL_DEC_ERROR; + } + static JxlDecoderStatus SetExif(const uint8_t* /*data*/, size_t /*size*/, + jpeg::JPEGData* /*jpeg_data*/) { + return JXL_DEC_ERROR; + } + static JxlDecoderStatus SetXmp(const uint8_t* /*data*/, size_t /*size*/, + jpeg::JPEGData* /*jpeg_data*/) { + return JXL_DEC_ERROR; + } + + JxlDecoderStatus WriteOutput(const jpeg::JPEGData& /* jpeg_data */) { + return JXL_DEC_SUCCESS; + } +}; + +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +} // namespace jxl + +#endif // LIB_JXL_DECODE_TO_JPEG_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc new file mode 100644 index 0000000000..2b4d84196f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc @@ -0,0 +1,1168 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_ac_strategy.h" + +#include +#include + +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/fast_math-inl.h" + +// Some of the floating point constants in this file and in other +// files in the libjxl project have been obtained using the +// tools/optimizer/simplex_fork.py tool. It is a variation of +// Nelder-Mead optimization, and we generally try to minimize +// BPP * pnorm aggregate as reported by the benchmark_xl tool, +// but occasionally the values are optimized by using additional +// constraints such as maintaining a certain density, or ratio of +// popularity of integral transforms. Jyrki visually reviews all +// such changes and often makes manual changes to maintain good +// visual quality to changes where butteraugli was not sufficiently +// sensitive to some kind of degradation. Unfortunately image quality +// is still more of an art than science. + +// This must come before the begin/end_target, but HWY_ONCE is only true +// after that, so use an "include guard". +#ifndef LIB_JXL_ENC_AC_STRATEGY_ +#define LIB_JXL_ENC_AC_STRATEGY_ +// Parameters of the heuristic are marked with a OPTIMIZE comment. +namespace jxl { + +// Debugging utilities. + +// Returns a linear sRGB color (as bytes) for each AC strategy. +const uint8_t* TypeColor(const uint8_t& raw_strategy) { + JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); + static_assert(AcStrategy::kNumValidStrategies == 27, "Change colors"); + static constexpr uint8_t kColors[][3] = { + {0xFF, 0xFF, 0x00}, // DCT8 + {0xFF, 0x80, 0x80}, // HORNUSS + {0xFF, 0x80, 0x80}, // DCT2x2 + {0xFF, 0x80, 0x80}, // DCT4x4 + {0x80, 0xFF, 0x00}, // DCT16x16 + {0x00, 0xC0, 0x00}, // DCT32x32 + {0xC0, 0xFF, 0x00}, // DCT16x8 + {0xC0, 0xFF, 0x00}, // DCT8x16 + {0x00, 0xFF, 0x00}, // DCT32x8 + {0x00, 0xFF, 0x00}, // DCT8x32 + {0x00, 0xFF, 0x00}, // DCT32x16 + {0x00, 0xFF, 0x00}, // DCT16x32 + {0xFF, 0x80, 0x00}, // DCT4x8 + {0xFF, 0x80, 0x00}, // DCT8x4 + {0xFF, 0xFF, 0x80}, // AFV0 + {0xFF, 0xFF, 0x80}, // AFV1 + {0xFF, 0xFF, 0x80}, // AFV2 + {0xFF, 0xFF, 0x80}, // AFV3 + {0x00, 0xC0, 0xFF}, // DCT64x64 + {0x00, 0xFF, 0xFF}, // DCT64x32 + {0x00, 0xFF, 0xFF}, // DCT32x64 + {0x00, 0x40, 0xFF}, // DCT128x128 + {0x00, 0x80, 0xFF}, // DCT128x64 + {0x00, 0x80, 0xFF}, // DCT64x128 + {0x00, 0x00, 0xC0}, // DCT256x256 + {0x00, 0x00, 0xFF}, // DCT256x128 + {0x00, 0x00, 0xFF}, // DCT128x256 + }; + return kColors[raw_strategy]; +} + +const uint8_t* TypeMask(const uint8_t& raw_strategy) { + JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); + static_assert(AcStrategy::kNumValidStrategies == 27, "Add masks"); + // implicitly, first row and column is made dark + static constexpr uint8_t kMask[][64] = { + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // DCT8 + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 1, 1, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // HORNUSS + { + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + }, // 2x2 + { + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + }, // 4x4 + {}, // DCT16x16 (unused) + {}, // DCT32x32 (unused) + {}, // DCT16x8 (unused) + {}, // DCT8x16 (unused) + {}, // DCT32x8 (unused) + {}, // DCT8x32 (unused) + {}, // DCT32x16 (unused) + {}, // DCT16x32 (unused) + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // DCT4x8 + { + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + }, // DCT8x4 + { + 1, 1, 1, 1, 1, 0, 0, 0, // + 1, 1, 1, 1, 0, 0, 0, 0, // + 1, 1, 1, 0, 0, 0, 0, 0, // + 1, 1, 0, 0, 0, 0, 0, 0, // + 1, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // AFV0 + { + 0, 0, 0, 0, 1, 1, 1, 1, // + 0, 0, 0, 0, 0, 1, 1, 1, // + 0, 0, 0, 0, 0, 0, 1, 1, // + 0, 0, 0, 0, 0, 0, 0, 1, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // AFV1 + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 1, 0, 0, 0, 0, 0, 0, 0, // + 1, 1, 0, 0, 0, 0, 0, 0, // + 1, 1, 1, 0, 0, 0, 0, 0, // + 1, 1, 1, 1, 0, 0, 0, 0, // + }, // AFV2 + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 1, // + 0, 0, 0, 0, 0, 0, 1, 1, // + 0, 0, 0, 0, 0, 1, 1, 1, // + }, // AFV3 + }; + return kMask[raw_strategy]; +} + +void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize, + size_t ysize, const char* tag, AuxOut* aux_out) { + Image3F color_acs(xsize, ysize); + for (size_t y = 0; y < ysize; y++) { + float* JXL_RESTRICT rows[3] = { + color_acs.PlaneRow(0, y), + color_acs.PlaneRow(1, y), + color_acs.PlaneRow(2, y), + }; + const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim); + for (size_t x = 0; x < xsize; x++) { + AcStrategy acs = acs_row[x / kBlockDim]; + const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); + for (size_t c = 0; c < 3; c++) { + rows[c][x] = color[c] / 255.f; + } + } + } + size_t stride = color_acs.PixelsPerRow(); + for (size_t c = 0; c < 3; c++) { + for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) { + float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim); + const AcStrategyRow acs_row = ac_strategy.ConstRow(by); + for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) { + AcStrategy acs = acs_row[bx]; + if (!acs.IsFirstBlock()) continue; + const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); + const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy()); + if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) { + for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize; + iy++) { + for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize; + ix++) { + if (mask[iy * kBlockDim + ix]) { + row[iy * stride + bx * kBlockDim + ix] = color[c] / 800.f; + } + } + } + } + // draw block edges + for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() && + bx * kBlockDim + ix < xsize; + ix++) { + row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f; + } + for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() && + by * kBlockDim + iy < ysize; + iy++) { + row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f; + } + } + } + } + aux_out->DumpImage(tag, color_acs); +} + +} // namespace jxl +#endif // LIB_JXL_ENC_AC_STRATEGY_ + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::AbsDiff; +using hwy::HWY_NAMESPACE::Eq; +using hwy::HWY_NAMESPACE::IfThenElseZero; +using hwy::HWY_NAMESPACE::IfThenZeroElse; +using hwy::HWY_NAMESPACE::Round; +using hwy::HWY_NAMESPACE::Sqrt; + +bool MultiBlockTransformCrossesHorizontalBoundary( + const AcStrategyImage& ac_strategy, size_t start_x, size_t y, + size_t end_x) { + if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) { + return false; + } + if (y % 8 == 0) { + // Nothing crosses 64x64 boundaries, and the memory on the other side + // of the 64x64 block may still uninitialized. + return false; + } + end_x = std::min(end_x, ac_strategy.xsize()); + // The first multiblock might be before the start_x, let's adjust it + // to point to the first IsFirstBlock() == true block we find by backward + // tracing. + AcStrategyRow row = ac_strategy.ConstRow(y); + const size_t start_x_limit = start_x & ~7; + while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) { + --start_x; + } + for (size_t x = start_x; x < end_x;) { + if (row[x].IsFirstBlock()) { + x += row[x].covered_blocks_x(); + } else { + return true; + } + } + return false; +} + +bool MultiBlockTransformCrossesVerticalBoundary( + const AcStrategyImage& ac_strategy, size_t x, size_t start_y, + size_t end_y) { + if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) { + return false; + } + if (x % 8 == 0) { + // Nothing crosses 64x64 boundaries, and the memory on the other side + // of the 64x64 block may still uninitialized. + return false; + } + end_y = std::min(end_y, ac_strategy.ysize()); + // The first multiblock might be before the start_y, let's adjust it + // to point to the first IsFirstBlock() == true block we find by backward + // tracing. + const size_t start_y_limit = start_y & ~7; + while (start_y != start_y_limit && + !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) { + --start_y; + } + + for (size_t y = start_y; y < end_y;) { + AcStrategyRow row = ac_strategy.ConstRow(y); + if (row[x].IsFirstBlock()) { + y += row[x].covered_blocks_y(); + } else { + return true; + } + } + return false; +} + +static const float kChromaErrorWeight[AcStrategy::kNumValidStrategies] = { + 0.95f, // DCT = 0, + 1.0f, // IDENTITY = 1, + 0.5f, // DCT2X2 = 2, + 1.0f, // DCT4X4 = 3, + 2.0f, // DCT16X16 = 4, + 2.0f, // DCT32X32 = 5, + 1.4f, // DCT16X8 = 6, + 1.4f, // DCT8X16 = 7, + 2.0f, // DCT32X8 = 8, + 2.0f, // DCT8X32 = 9, + 2.0f, // DCT32X16 = 10, + 2.0f, // DCT16X32 = 11, + 2.0f, // DCT4X8 = 12, + 2.0f, // DCT8X4 = 13, + 1.7f, // AFV0 = 14, + 1.7f, // AFV1 = 15, + 1.7f, // AFV2 = 16, + 1.7f, // AFV3 = 17, + 2.0f, // DCT64X64 = 18, + 2.0f, // DCT64X32 = 19, + 2.0f, // DCT32X64 = 20, + 2.0f, // DCT128X128 = 21, + 2.0f, // DCT128X64 = 22, + 2.0f, // DCT64X128 = 23, + 2.0f, // DCT256X256 = 24, + 2.0f, // DCT256X128 = 25, + 2.0f, // DCT128X256 = 26, +}; + +float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y, + const ACSConfig& config, + const float* JXL_RESTRICT cmap_factors, float* block, + float* scratch_space, uint32_t* quantized) { + const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize; + + // Apply transform. + for (size_t c = 0; c < 3; c++) { + float* JXL_RESTRICT block_c = block + size * c; + TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y), + config.src_stride, block_c, scratch_space); + } + + HWY_FULL(float) df; + + const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y(); + // avoid large blocks when there is a lot going on in red-green. + float cmul[3] = {kChromaErrorWeight[acs.RawStrategy()], 1.0f, 1.0f}; + float quant_norm8 = 0; + float masking = 0; + if (num_blocks == 1) { + // When it is only one 8x8, we don't need aggregation of values. + quant_norm8 = config.Quant(x / 8, y / 8); + masking = 2.0f * config.Masking(x / 8, y / 8); + } else if (num_blocks == 2) { + // Taking max instead of 8th norm seems to work + // better for smallest blocks up to 16x8. Jyrki couldn't get + // improvements in trying the same for 16x16 blocks. + if (acs.covered_blocks_y() == 2) { + quant_norm8 = + std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1)); + masking = 2.0f * std::max(config.Masking(x / 8, y / 8), + config.Masking(x / 8, y / 8 + 1)); + } else { + quant_norm8 = + std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8)); + masking = 2.0f * std::max(config.Masking(x / 8, y / 8), + config.Masking(x / 8 + 1, y / 8)); + } + } else { + float masking_norm2 = 0; + float masking_max = 0; + // Load QF value, calculate empirical heuristic on masking field + // for weighting the information loss. Information loss manifests + // itself as ringing, and masking could hide it. + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + float qval = config.Quant(x / 8 + ix, y / 8 + iy); + qval *= qval; + qval *= qval; + quant_norm8 += qval * qval; + float maskval = config.Masking(x / 8 + ix, y / 8 + iy); + masking_max = std::max(masking_max, maskval); + masking_norm2 += maskval * maskval; + } + } + quant_norm8 /= num_blocks; + quant_norm8 = FastPowf(quant_norm8, 1.0f / 8.0f); + masking_norm2 = sqrt(masking_norm2 / num_blocks); + // This is a highly empirical formula. + masking = (masking_norm2 + masking_max); + } + const auto q = Set(df, quant_norm8); + + // Compute entropy. + float entropy = config.base_entropy; + auto info_loss = Zero(df); + auto info_loss2 = Zero(df); + + for (size_t c = 0; c < 3; c++) { + const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c); + const auto cmap_factor = Set(df, cmap_factors[c]); + + auto entropy_v = Zero(df); + auto nzeros_v = Zero(df); + auto cost1 = Set(df, config.cost1); + auto cost2 = Set(df, config.cost2); + auto cost_delta = Set(df, config.cost_delta); + for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) { + const auto in = Load(df, block + c * size + i); + const auto in_y = Mul(Load(df, block + size + i), cmap_factor); + const auto im = Load(df, inv_matrix + i); + const auto val = Mul(Sub(in, in_y), Mul(im, q)); + const auto rval = Round(val); + const auto diff = AbsDiff(val, rval); + info_loss = Add(info_loss, diff); + info_loss2 = MulAdd(diff, diff, info_loss2); + const auto q = Abs(rval); + const auto q_is_zero = Eq(q, Zero(df)); + entropy_v = Add(entropy_v, IfThenElseZero(Ge(q, Set(df, 1.5f)), cost2)); + // We used to have q * C here, but that cost model seems to + // be punishing large values more than necessary. Sqrt tries + // to avoid large values less aggressively. Having high accuracy + // around zero is most important at low qualities, and there + // we have directly specified costs for 0, 1, and 2. + entropy_v = MulAdd(Sqrt(q), cost_delta, entropy_v); + nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f))); + } + entropy_v = MulAdd(nzeros_v, cost1, entropy_v); + + entropy += cmul[c] * GetLane(SumOfLanes(df, entropy_v)); + size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v)); + // Add #bit of num_nonzeros, as an estimate of the cost for encoding the + // number of non-zeros of the block. + size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1; + // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a + // bias. + entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits); + } + float ret = + entropy + + masking * + ((config.info_loss_multiplier * GetLane(SumOfLanes(df, info_loss))) + + (config.info_loss_multiplier2 * + sqrt(num_blocks * GetLane(SumOfLanes(df, info_loss2))))); + return ret; +} + +uint8_t FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier, + const ACSConfig& config, + const float* JXL_RESTRICT cmap_factors, + AcStrategyImage* JXL_RESTRICT ac_strategy, + float* block, float* scratch_space, + uint32_t* quantized, float* entropy_out) { + struct TransformTry8x8 { + AcStrategy::Type type; + int encoding_speed_tier_max_limit; + float entropy_add; + float entropy_mul; + }; + static const TransformTry8x8 kTransforms8x8[] = { + { + AcStrategy::Type::DCT, + 9, + 3.0f, + 0.745f, + }, + { + AcStrategy::Type::DCT4X4, + 5, + 4.0f, + 0.7f, + }, + { + AcStrategy::Type::DCT2X2, + 5, + 0.0f, + 0.66f, + }, + { + AcStrategy::Type::DCT4X8, + 4, + 0.0f, + 0.700754622182473063f, + }, + { + AcStrategy::Type::DCT8X4, + 4, + 0.0f, + 0.700754622182473063f, + }, + { + AcStrategy::Type::IDENTITY, + 5, + 8.0f, + 0.81217614513585534f, + }, + { + AcStrategy::Type::AFV0, + 4, + 3.0f, + 0.70086131125719425f, + }, + { + AcStrategy::Type::AFV1, + 4, + 3.0f, + 0.70086131125719425f, + }, + { + AcStrategy::Type::AFV2, + 4, + 3.0f, + 0.70086131125719425f, + }, + { + AcStrategy::Type::AFV3, + 4, + 3.0f, + 0.70086131125719425f, + }, + }; + double best = 1e30; + uint8_t best_tx = kTransforms8x8[0].type; + for (auto tx : kTransforms8x8) { + if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) { + continue; + } + AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); + float entropy = EstimateEntropy(acs, x, y, config, cmap_factors, block, + scratch_space, quantized); + entropy = tx.entropy_add + tx.entropy_mul * entropy; + if (entropy < best) { + best_tx = tx.type; + best = entropy; + } + } + *entropy_out = best; + return best_tx; +} + +// bx, by addresses the 64x64 block at 8x8 subresolution +// cx, cy addresses the left, upper 8x8 block position of the candidate +// transform. +void TryMergeAcs(AcStrategy::Type acs_raw, size_t bx, size_t by, size_t cx, + size_t cy, const ACSConfig& config, + const float* JXL_RESTRICT cmap_factors, + AcStrategyImage* JXL_RESTRICT ac_strategy, + const float entropy_mul, const uint8_t candidate_priority, + uint8_t* priority, float* JXL_RESTRICT entropy_estimate, + float* block, float* scratch_space, uint32_t* quantized) { + AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); + float entropy_current = 0; + for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) { + if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) { + // Transform would reuse already allocated blocks and + // lead to invalid overlaps, for example DCT64X32 vs. + // DCT32X64. + return; + } + entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)]; + } + } + float entropy_candidate = + entropy_mul * EstimateEntropy(acs, (bx + cx) * 8, (by + cy) * 8, config, + cmap_factors, block, scratch_space, + quantized); + if (entropy_candidate >= entropy_current) return; + // Accept the candidate. + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + entropy_estimate[(cy + iy) * 8 + cx + ix] = 0; + priority[(cy + iy) * 8 + cx + ix] = candidate_priority; + } + } + ac_strategy->Set(bx + cx, by + cy, acs_raw); + entropy_estimate[cy * 8 + cx] = entropy_candidate; +} + +static void SetEntropyForTransform(size_t cx, size_t cy, + const AcStrategy::Type acs_raw, + float entropy, + float* JXL_RESTRICT entropy_estimate) { + const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); + for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) { + for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) { + entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0; + } + } + entropy_estimate[cy * 8 + cx] = entropy; +} + +AcStrategy::Type AcsSquare(size_t blocks) { + if (blocks == 2) { + return AcStrategy::Type::DCT16X16; + } else if (blocks == 4) { + return AcStrategy::Type::DCT32X32; + } else { + return AcStrategy::Type::DCT64X64; + } +} + +AcStrategy::Type AcsVerticalSplit(size_t blocks) { + if (blocks == 2) { + return AcStrategy::Type::DCT16X8; + } else if (blocks == 4) { + return AcStrategy::Type::DCT32X16; + } else { + return AcStrategy::Type::DCT64X32; + } +} + +AcStrategy::Type AcsHorizontalSplit(size_t blocks) { + if (blocks == 2) { + return AcStrategy::Type::DCT8X16; + } else if (blocks == 4) { + return AcStrategy::Type::DCT16X32; + } else { + return AcStrategy::Type::DCT32X64; + } +} + +// The following function tries to merge smaller transforms into +// squares and the rectangles originating from a single middle division +// (horizontal or vertical) fairly. +// +// This is now generalized to concern about squares +// of blocks X blocks size, where a block is 8x8 pixels. +void FindBestFirstLevelDivisionForSquare( + size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx, + size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors, + AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK, + const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate, + float* block, float* scratch_space, uint32_t* quantized) { + // We denote J for the larger dimension here, and K for the smaller. + // For example, for 32x32 block splitting, J would be 32, K 16. + const size_t blocks_half = blocks / 2; + const AcStrategy::Type acs_rawJXK = AcsVerticalSplit(blocks); + const AcStrategy::Type acs_rawKXJ = AcsHorizontalSplit(blocks); + const AcStrategy::Type acs_rawJXJ = AcsSquare(blocks); + const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK); + const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ); + const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ); + AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0); + AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half); + // Let's check if we can consider a JXJ block here at all. + // This is not necessary in the basic use of hierarchically merging + // blocks in the simplest possible way, but is needed when we try other + // 'floating' options of merging, possibly after a simple hierarchical + // merge has been explored. + if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx, + by + cy, bx + cx + blocks) || + MultiBlockTransformCrossesHorizontalBoundary( + *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) || + MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy, + by + cy + blocks) || + MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks, + by + cy, by + cy + blocks)) { + return; // not suitable for JxJ analysis, some transforms leak out. + } + // For floating transforms there may be + // already blocks selected that make either or both JXK and + // KXJ not feasible for this location. + const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary( + *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks); + const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary( + *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks); + // Current entropies aggregated on NxN resolution. + float entropy[2][2] = {}; + for (size_t dy = 0; dy < blocks; ++dy) { + for (size_t dx = 0; dx < blocks; ++dx) { + entropy[dy / blocks_half][dx / blocks_half] += + entropy_estimate[(cy + dy) * 8 + (cx + dx)]; + } + } + float entropy_JXK_left = std::numeric_limits::max(); + float entropy_JXK_right = std::numeric_limits::max(); + float entropy_KXJ_top = std::numeric_limits::max(); + float entropy_KXJ_bottom = std::numeric_limits::max(); + float entropy_JXJ = std::numeric_limits::max(); + if (allow_JXK) { + if (row0[bx + cx + 0].RawStrategy() != acs_rawJXK) { + entropy_JXK_left = + entropy_mul_JXK * + EstimateEntropy(acsJXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, + cmap_factors, block, scratch_space, quantized); + } + if (row0[bx + cx + blocks_half].RawStrategy() != acs_rawJXK) { + entropy_JXK_right = + entropy_mul_JXK * EstimateEntropy(acsJXK, (bx + cx + blocks_half) * 8, + (by + cy + 0) * 8, config, + cmap_factors, block, scratch_space, + quantized); + } + } + if (allow_KXJ) { + if (row0[bx + cx].RawStrategy() != acs_rawKXJ) { + entropy_KXJ_top = + entropy_mul_JXK * + EstimateEntropy(acsKXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, + cmap_factors, block, scratch_space, quantized); + } + if (row1[bx + cx].RawStrategy() != acs_rawKXJ) { + entropy_KXJ_bottom = + entropy_mul_JXK * EstimateEntropy(acsKXJ, (bx + cx + 0) * 8, + (by + cy + blocks_half) * 8, config, + cmap_factors, block, scratch_space, + quantized); + } + } + if (allow_square_transform) { + // We control the exploration of the square transform separately so that + // we can turn it off at high decoding speeds for 32x32, but still allow + // exploring 16x32 and 32x16. + entropy_JXJ = entropy_mul_JXJ * EstimateEntropy(acsJXJ, (bx + cx + 0) * 8, + (by + cy + 0) * 8, config, + cmap_factors, block, + scratch_space, quantized); + } + + // Test if this block should have JXK or KXJ transforms, + // because it can have only one or the other. + float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) + + std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]); + float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) + + std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]); + if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) { + ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ); + SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate); + } else if (costJxN < costNxJ) { + if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) { + ac_strategy->Set(bx + cx, by + cy, acs_rawJXK); + SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left, + entropy_estimate); + } + if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) { + ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK); + SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK, + entropy_JXK_right, entropy_estimate); + } + } else { + if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) { + ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ); + SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top, + entropy_estimate); + } + if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) { + ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ); + SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ, + entropy_KXJ_bottom, entropy_estimate); + } + } +} + +void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state, + const ACSConfig& config, const Rect& rect) { + // Main philosophy here: + // 1. First find best 8x8 transform for each area. + // 2. Merging them into larger transforms where possibly, but + // starting from the smallest transforms (16x8 and 8x16). + // Additional complication: 16x8 and 8x16 are considered + // simultanouesly and fairly against each other. + // We are looking at 64x64 squares since the YtoX and YtoB + // maps happen to be at that resolution, and having + // integral transforms cross these boundaries leads to + // additional complications. + const CompressParams& cparams = enc_state->cparams; + const float butteraugli_target = cparams.butteraugli_distance; + AcStrategyImage* ac_strategy = &enc_state->shared.ac_strategy; + // TODO(veluca): reuse allocations + auto mem = hwy::AllocateAligned(5 * AcStrategy::kMaxCoeffArea); + auto qmem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); + uint32_t* JXL_RESTRICT quantized = qmem.get(); + float* JXL_RESTRICT block = mem.get(); + float* JXL_RESTRICT scratch_space = mem.get() + 3 * AcStrategy::kMaxCoeffArea; + size_t bx = rect.x0(); + size_t by = rect.y0(); + JXL_ASSERT(rect.xsize() <= 8); + JXL_ASSERT(rect.ysize() <= 8); + size_t tx = bx / kColorTileDimInBlocks; + size_t ty = by / kColorTileDimInBlocks; + const float cmap_factors[3] = { + enc_state->shared.cmap.YtoXRatio( + enc_state->shared.cmap.ytox_map.ConstRow(ty)[tx]), + 0.0f, + enc_state->shared.cmap.YtoBRatio( + enc_state->shared.cmap.ytob_map.ConstRow(ty)[tx]), + }; + if (cparams.speed_tier > SpeedTier::kHare) return; + // First compute the best 8x8 transform for each square. Later, we do not + // experiment with different combinations, but only use the best of the 8x8s + // when DCT8X8 is specified in the tree search. + // 8x8 transforms have 10 variants, but every larger transform is just a DCT. + float entropy_estimate[64] = {}; + // Favor all 8x8 transforms (against 16x8 and larger transforms)) at + // low butteraugli_target distances. + static const float k8x8mul1 = -0.55; + static const float k8x8mul2 = 1.0; + static const float k8x8base = 1.4; + const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); + for (size_t iy = 0; iy < rect.ysize(); iy++) { + for (size_t ix = 0; ix < rect.xsize(); ix++) { + float entropy = 0.0; + const uint8_t best_of_8x8s = FindBest8x8Transform( + 8 * (bx + ix), 8 * (by + iy), static_cast(cparams.speed_tier), + config, cmap_factors, ac_strategy, block, scratch_space, quantized, + &entropy); + ac_strategy->Set(bx + ix, by + iy, + static_cast(best_of_8x8s)); + entropy_estimate[iy * 8 + ix] = entropy * mul8x8; + } + } + // Merge when a larger transform is better than the previously + // searched best combination of 8x8 transforms. + struct MergeTry { + AcStrategy::Type type; + uint8_t priority; + uint8_t decoding_speed_tier_max_limit; + uint8_t encoding_speed_tier_max_limit; + float entropy_mul; + }; + static const float k8X16mul1 = -0.55; + static const float k8X16mul2 = 0.865; + static const float k8X16base = 1.6; + const float entropy_mul16X8 = + k8X16mul2 + k8X16mul1 / (butteraugli_target + k8X16base); + // const float entropy_mul16X8 = mul8X16 * 0.91195782912371126f; + + static const float k16X16mul1 = -0.35; + static const float k16X16mul2 = 0.798; + static const float k16X16base = 2.0; + const float entropy_mul16X16 = + k16X16mul2 + k16X16mul1 / (butteraugli_target + k16X16base); + // const float entropy_mul16X16 = mul16X16 * 0.83183417727960129f; + + static const float k32X16mul1 = -0.1; + static const float k32X16mul2 = 0.854; + static const float k32X16base = 2.5; + const float entropy_mul16X32 = + k32X16mul2 + k32X16mul1 / (butteraugli_target + k32X16base); + + const float entropy_mul32X32 = 0.93; + const float entropy_mul64X64 = 1.52f; + // TODO(jyrki): Consider this feedback in further changes: + // Also effectively when the multipliers for smaller blocks are + // below 1, this raises the bar for the bigger blocks even higher + // in that sense these constants are not independent (e.g. changing + // the constant for DCT16x32 by -5% (making it more likely) also + // means that DCT32x32 becomes harder to do when starting from + // two DCT16x32s). It might be better to make them more independent, + // e.g. by not applying the multiplier when storing the new entropy + // estimates in TryMergeToACSCandidate(). + const MergeTry kTransformsForMerge[9] = { + {AcStrategy::Type::DCT16X8, 2, 4, 5, entropy_mul16X8}, + {AcStrategy::Type::DCT8X16, 2, 4, 5, entropy_mul16X8}, + // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its + // subdivisions. {AcStrategy::Type::DCT16X16, 3, entropy_mul16X16}, + {AcStrategy::Type::DCT16X32, 4, 4, 4, entropy_mul16X32}, + {AcStrategy::Type::DCT32X16, 4, 4, 4, entropy_mul16X32}, + // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its + // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5, + // 0.9822994906548809f}, + {AcStrategy::Type::DCT64X32, 6, 1, 3, 1.29f}, + {AcStrategy::Type::DCT32X64, 6, 1, 3, 1.29f}, + // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f}, + }; + /* + These sizes not yet included in merge heuristic: + set(AcStrategy::Type::DCT32X8, 0.0f, 2.261390410971102f); + set(AcStrategy::Type::DCT8X32, 0.0f, 2.261390410971102f); + set(AcStrategy::Type::DCT128X128, 0.0f, 1.0f); + set(AcStrategy::Type::DCT128X64, 0.0f, 0.73f); + set(AcStrategy::Type::DCT64X128, 0.0f, 0.73f); + set(AcStrategy::Type::DCT256X256, 0.0f, 1.0f); + set(AcStrategy::Type::DCT256X128, 0.0f, 0.73f); + set(AcStrategy::Type::DCT128X256, 0.0f, 0.73f); + */ + + // Priority is a tricky kludge to avoid collisions so that transforms + // don't overlap. + uint8_t priority[64] = {}; + bool enable_32x32 = cparams.decoding_speed_tier < 4; + for (auto tx : kTransformsForMerge) { + if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) { + continue; + } + AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); + + for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize(); + cy += acs.covered_blocks_y()) { + for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize(); + cx += acs.covered_blocks_x()) { + if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) { + if (cparams.decoding_speed_tier < 4 && + tx.type == AcStrategy::Type::DCT32X64) { + // We handle both DCT8X16 and DCT16X8 at the same time. + if ((cy | cx) % 8 == 0) { + FindBestFirstLevelDivisionForSquare( + 8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, + tx.entropy_mul, entropy_mul64X64, entropy_estimate, block, + scratch_space, quantized); + } + continue; + } else if (tx.type == AcStrategy::Type::DCT32X16) { + // We handled both DCT8X16 and DCT16X8 at the same time, + // and that is above. The last column and last row, + // when the last column or last row is odd numbered, + // are still handled by TryMergeAcs. + continue; + } + } + if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) || + (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) { + // already covered by FindBest32X32 + continue; + } + + if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) { + if (tx.type == AcStrategy::Type::DCT16X32) { + // We handle both DCT8X16 and DCT16X8 at the same time. + if ((cy | cx) % 4 == 0) { + FindBestFirstLevelDivisionForSquare( + 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, + ac_strategy, tx.entropy_mul, entropy_mul32X32, + entropy_estimate, block, scratch_space, quantized); + } + continue; + } else if (tx.type == AcStrategy::Type::DCT32X16) { + // We handled both DCT8X16 and DCT16X8 at the same time, + // and that is above. The last column and last row, + // when the last column or last row is odd numbered, + // are still handled by TryMergeAcs. + continue; + } + } + if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) || + (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) { + // already covered by FindBest32X32 + continue; + } + if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) { + if (tx.type == AcStrategy::Type::DCT8X16) { + // We handle both DCT8X16 and DCT16X8 at the same time. + if ((cy | cx) % 2 == 0) { + FindBestFirstLevelDivisionForSquare( + 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, + tx.entropy_mul, entropy_mul16X16, entropy_estimate, block, + scratch_space, quantized); + } + continue; + } else if (tx.type == AcStrategy::Type::DCT16X8) { + // We handled both DCT8X16 and DCT16X8 at the same time, + // and that is above. The last column and last row, + // when the last column or last row is odd numbered, + // are still handled by TryMergeAcs. + continue; + } + } + if ((tx.type == AcStrategy::Type::DCT8X16 && cy % 2 == 1) || + (tx.type == AcStrategy::Type::DCT16X8 && cx % 2 == 1)) { + // already covered by FindBestFirstLevelDivisionForSquare + continue; + } + // All other merge sizes are handled here. + // Some of the DCT16X8s and DCT8X16s will still leak through here + // when there is an odd number of 8x8 blocks, then the last row + // and column will get their DCT16X8s and DCT8X16s through the + // normal integral transform merging process. + TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors, ac_strategy, + tx.entropy_mul, tx.priority, &priority[0], entropy_estimate, + block, scratch_space, quantized); + } + } + } + if (cparams.speed_tier >= SpeedTier::kHare) { + return; + } + // Here we still try to do some non-aligned matching, find a few more + // 16X8, 8X16 and 16X16s between the non-2-aligned blocks. + for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) { + for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) { + if ((cy | cx) % 2 != 0) { + FindBestFirstLevelDivisionForSquare( + 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, + entropy_mul16X8, entropy_mul16X16, entropy_estimate, block, + scratch_space, quantized); + } + } + } + // Non-aligned matching for 32X32, 16X32 and 32X16. + size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1; + for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) { + for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) { + if ((cy | cx) % 4 == 0) { + continue; // Already tried with loop above (DCT16X32 case). + } + FindBestFirstLevelDivisionForSquare( + 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy, + entropy_mul16X32, entropy_mul32X32, entropy_estimate, block, + scratch_space, quantized); + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ProcessRectACS); + +void AcStrategyHeuristics::Init(const Image3F& src, + PassesEncoderState* enc_state) { + this->enc_state = enc_state; + config.dequant = &enc_state->shared.matrices; + const CompressParams& cparams = enc_state->cparams; + const float butteraugli_target = cparams.butteraugli_distance; + + if (cparams.speed_tier >= SpeedTier::kCheetah) { + JXL_CHECK(enc_state->shared.matrices.EnsureComputed(1)); // DCT8 only + } else { + uint32_t acs_mask = 0; + // All transforms up to 64x64. + for (size_t i = 0; i < AcStrategy::DCT128X128; i++) { + acs_mask |= (1 << i); + } + JXL_CHECK(enc_state->shared.matrices.EnsureComputed(acs_mask)); + } + + // Image row pointers and strides. + config.quant_field_row = enc_state->initial_quant_field.Row(0); + config.quant_field_stride = enc_state->initial_quant_field.PixelsPerRow(); + auto& mask = enc_state->initial_quant_masking; + if (mask.xsize() > 0 && mask.ysize() > 0) { + config.masking_field_row = mask.Row(0); + config.masking_field_stride = mask.PixelsPerRow(); + } + + config.src_rows[0] = src.ConstPlaneRow(0, 0); + config.src_rows[1] = src.ConstPlaneRow(1, 0); + config.src_rows[2] = src.ConstPlaneRow(2, 0); + config.src_stride = src.PixelsPerRow(); + + // Entropy estimate is composed of two factors: + // - estimate of the number of bits that will be used by the block + // - information loss due to quantization + // The following constant controls the relative weights of these components. + config.info_loss_multiplier = 138.0f; + config.info_loss_multiplier2 = 50.46839691767866; + // TODO(jyrki): explore base_entropy setting more. + // A small value (0?) works better at high distance, while a larger value + // may be more effective at low distance/high bpp. + config.base_entropy = 0.0; + config.zeros_mul = 7.565053364251793f; + // Lots of +1 and -1 coefficients at high quality, it is + // beneficial to favor them. At low qualities zeros matter more + // and +1 / -1 coefficients are already quite harmful. + float slope = std::min(1.0f, butteraugli_target * (1.0f / 3)); + config.cost1 = 1 + slope * 8.8703248061477744f; + config.cost2 = 4.4628149885273363f; + config.cost_delta = 5.3359184934516337f; + JXL_ASSERT(enc_state->shared.ac_strategy.xsize() == + enc_state->shared.frame_dim.xsize_blocks); + JXL_ASSERT(enc_state->shared.ac_strategy.ysize() == + enc_state->shared.frame_dim.ysize_blocks); +} + +void AcStrategyHeuristics::ProcessRect(const Rect& rect) { + PROFILER_FUNC; + const CompressParams& cparams = enc_state->cparams; + // In Falcon mode, use DCT8 everywhere and uniform quantization. + if (cparams.speed_tier >= SpeedTier::kCheetah) { + enc_state->shared.ac_strategy.FillDCT8(rect); + return; + } + HWY_DYNAMIC_DISPATCH(ProcessRectACS) + (enc_state, config, rect); +} + +void AcStrategyHeuristics::Finalize(AuxOut* aux_out) { + const auto& ac_strategy = enc_state->shared.ac_strategy; + // Accounting and debug output. + if (aux_out != nullptr) { + aux_out->num_small_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::IDENTITY) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT2X2) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT4X4); + aux_out->num_dct4x8_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT4X8) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X4); + aux_out->num_afv_blocks = ac_strategy.CountBlocks(AcStrategy::Type::AFV0) + + ac_strategy.CountBlocks(AcStrategy::Type::AFV1) + + ac_strategy.CountBlocks(AcStrategy::Type::AFV2) + + ac_strategy.CountBlocks(AcStrategy::Type::AFV3); + aux_out->num_dct8_blocks = ac_strategy.CountBlocks(AcStrategy::Type::DCT); + aux_out->num_dct8x16_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X16) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X8); + aux_out->num_dct8x32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X32) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X8); + aux_out->num_dct16_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X16); + aux_out->num_dct16x32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X32) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X16); + aux_out->num_dct32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X32); + aux_out->num_dct32x64_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X64) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT64X32); + aux_out->num_dct64_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT64X64); + } + + if (WantDebugOutput(aux_out)) { + DumpAcStrategy(ac_strategy, enc_state->shared.frame_dim.xsize, + enc_state->shared.frame_dim.ysize, "ac_strategy", aux_out); + } +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h new file mode 100644 index 0000000000..1ce3442ccf --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h @@ -0,0 +1,74 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_AC_STRATEGY_H_ +#define LIB_JXL_ENC_AC_STRATEGY_H_ + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/quant_weights.h" + +// `FindBestAcStrategy` uses heuristics to choose which AC strategy should be +// used in each block, as well as the initial quantization field. + +namespace jxl { + +struct AuxOut; + +// AC strategy selection: utility struct. + +struct ACSConfig { + const DequantMatrices* JXL_RESTRICT dequant; + float info_loss_multiplier; + float info_loss_multiplier2; + float* JXL_RESTRICT quant_field_row; + size_t quant_field_stride; + float* JXL_RESTRICT masking_field_row; + size_t masking_field_stride; + const float* JXL_RESTRICT src_rows[3]; + size_t src_stride; + // Cost for 1 (-1), 2 (-2) explicitly, cost for others computed with cost1 + + // cost2 + sqrt(q) * cost_delta. + float cost1; + float cost2; + float cost_delta; + float base_entropy; + float zeros_mul; + const float& Pixel(size_t c, size_t x, size_t y) const { + return src_rows[c][y * src_stride + x]; + } + float Masking(size_t bx, size_t by) const { + JXL_DASSERT(masking_field_row[by * masking_field_stride + bx] > 0); + return masking_field_row[by * masking_field_stride + bx]; + } + float Quant(size_t bx, size_t by) const { + JXL_DASSERT(quant_field_row[by * quant_field_stride + bx] > 0); + return quant_field_row[by * quant_field_stride + bx]; + } +}; + +struct AcStrategyHeuristics { + void Init(const Image3F& src, PassesEncoderState* enc_state); + void ProcessRect(const Rect& rect); + void Finalize(AuxOut* aux_out); + ACSConfig config; + PassesEncoderState* enc_state; +}; + +// Debug. +void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize, + size_t ysize, const char* tag, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_AC_STRATEGY_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc new file mode 100644 index 0000000000..f54204b059 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc @@ -0,0 +1,1145 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_adaptive_quantization.h" + +#include +#include +#include + +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_adaptive_quantization.cc" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_group.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_group.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/gauss_blur.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quant_weights.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::AbsDiff; +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::And; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::Sqrt; +using hwy::HWY_NAMESPACE::ZeroIfNegative; + +// The following functions modulate an exponent (out_val) and return the updated +// value. Their descriptor is limited to 8 lanes for 8x8 blocks. + +// Hack for mask estimation. Eventually replace this code with butteraugli's +// masking. +float ComputeMaskForAcStrategyUse(const float out_val) { + const float kMul = 1.0f; + const float kOffset = 0.001f; + return kMul / (out_val + kOffset); +} + +template +V ComputeMask(const D d, const V out_val) { + const auto kBase = Set(d, -0.76471879237038032f); + const auto kMul4 = Set(d, 4.4585596705216615f); + const auto kMul2 = Set(d, 17.282053892620215f); + const auto kOffset2 = Set(d, 302.36961315317848f); + const auto kMul3 = Set(d, 7.0561261998705858f); + const auto kOffset3 = Set(d, 2.3179635626140773f); + const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3); + const auto kMul0 = Set(d, 0.80061762862741759f); + const auto k1 = Set(d, 1.0f); + + // Avoid division by zero. + const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f)); + const auto v2 = Div(k1, Add(v1, kOffset2)); + const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3)); + const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4)); + // TODO(jyrki): + // A log or two here could make sense. In butteraugli we have effectively + // log(log(x + C)) for this kind of use, as a single log is used in + // saturating visual masking and here the modulation values are exponential, + // another log would counter that. + return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3)))); +} + +// mul and mul2 represent a scaling difference between jxl and butteraugli. +static const float kSGmul = 226.77216153508914f; +static const float kSGmul2 = 1.0f / 73.377132366608819f; +static const float kLog2 = 0.693147181f; +// Includes correction factor for std::log -> log2. +static const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2; +static const float kSGVOffset = 7.7825991679894591f; + +template +V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) { + // The opsin space in jxl is the cubic root of photons, i.e., v * v * v + // is related to the number of photons. + // + // SimpleGamma(v * v * v) is the psychovisual space in butteraugli. + // This ratio allows quantization to move from jxl's opsin space to + // butteraugli's log-gamma space. + float kEpsilon = 1e-2; + v = ZeroIfNegative(v); + const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul); + const auto kVOffset = Set(d, kSGVOffset * kLog2 + kEpsilon); + const auto kDenMul = Set(d, kLog2 * kSGmul); + + const auto v2 = Mul(v, v); + + const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon)); + const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset); + return invert ? Div(num, den) : Div(den, num); +} + +template +static float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) { + using DScalar = HWY_CAPPED(float, 1); + auto vscalar = Load(DScalar(), &v); + return GetLane( + RatioOfDerivativesOfCubicRootToSimpleGamma(DScalar(), vscalar)); +} + +// TODO(veluca): this function computes an approximation of the derivative of +// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or +// exact derivatives. For reference, SimpleGamma was: +/* +template +V SimpleGamma(const D d, V v) { + // A simple HDR compatible gamma function. + const auto mul = Set(d, kSGmul); + const auto kRetMul = Set(d, kSGRetMul); + const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f); + const auto kVOffset = Set(d, kSGVOffset); + + v *= mul; + + // This should happen rarely, but may lead to a NaN, which is rather + // undesirable. Since negative photons don't exist we solve the NaNs by + // clamping here. + // TODO(veluca): with FastLog2f, this no longer leads to NaNs. + v = ZeroIfNegative(v); + return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd; +} +*/ + +template +V GammaModulation(const D d, const size_t x, const size_t y, + const ImageF& xyb_x, const ImageF& xyb_y, const V out_val) { + const float kBias = 0.16f; + JXL_DASSERT(kBias > kOpsinAbsorbanceBias[0]); + JXL_DASSERT(kBias > kOpsinAbsorbanceBias[1]); + JXL_DASSERT(kBias > kOpsinAbsorbanceBias[2]); + auto overall_ratio = Zero(d); + auto bias = Set(d, kBias); + auto half = Set(d, 0.5f); + for (size_t dy = 0; dy < 8; ++dy) { + const float* const JXL_RESTRICT row_in_x = xyb_x.Row(y + dy); + const float* const JXL_RESTRICT row_in_y = xyb_y.Row(y + dy); + for (size_t dx = 0; dx < 8; dx += Lanes(d)) { + const auto iny = Add(Load(d, row_in_y + x + dx), bias); + const auto inx = Load(d, row_in_x + x + dx); + const auto r = Sub(iny, inx); + const auto g = Add(iny, inx); + const auto ratio_r = + RatioOfDerivativesOfCubicRootToSimpleGamma(d, r); + const auto ratio_g = + RatioOfDerivativesOfCubicRootToSimpleGamma(d, g); + const auto avg_ratio = Mul(half, Add(ratio_r, ratio_g)); + + overall_ratio = Add(overall_ratio, avg_ratio); + } + } + overall_ratio = Mul(SumOfLanes(d, overall_ratio), Set(d, 1.0f / 64)); + // ideally -1.0, but likely optimal correction adds some entropy, so slightly + // less than that. + // ln(2) constant folded in because we want std::log but have FastLog2f. + const auto kGam = Set(d, -0.15526878023684174f * 0.693147180559945f); + return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val); +} + +template +V ColorModulation(const D d, const size_t x, const size_t y, + const ImageF& xyb_x, const ImageF& xyb_y, const ImageF& xyb_b, + const double butteraugli_target, V out_val) { + static const float kStrengthMul = 4.2456542701250122f; + static const float kRedRampStart = 0.18748564245760829f; + static const float kRedRampLength = 0.16701783842516479f; + static const float kBlueRampLength = 0.16117602661852037f; + static const float kBlueRampStart = 0.47897504338287333f; + const float strength = kStrengthMul * (1.0f - 0.15f * butteraugli_target); + if (strength < 0) { + return out_val; + } + // x values are smaller than y and b values, need to take the difference into + // account. + const float red_strength = strength * 6.0f; + const float blue_strength = strength; + { + // Reduce some bits from areas not blue or red. + const float offset = strength * -0.007; // 9174542291185913f; + out_val = Add(out_val, Set(d, offset)); + } + // Calculate how much of the 8x8 block is covered with blue or red. + auto blue_coverage = Zero(d); + auto red_coverage = Zero(d); + auto bias_y = Set(d, 0.2f); + auto bias_y_add = Set(d, 0.1f); + for (size_t dy = 0; dy < 8; ++dy) { + const float* const JXL_RESTRICT row_in_x = xyb_x.Row(y + dy); + const float* const JXL_RESTRICT row_in_y = xyb_y.Row(y + dy); + const float* const JXL_RESTRICT row_in_b = xyb_b.Row(y + dy); + for (size_t dx = 0; dx < 8; dx += Lanes(d)) { + const auto pixel_y = Load(d, row_in_y + x + dx); + // Estimate redness-greeness relative to the intensity. + const auto pixel_xpy = Div(Abs(Load(d, row_in_x + x + dx)), + Max(Add(bias_y_add, pixel_y), bias_y)); + const auto pixel_x = + Max(Set(d, 0.0f), Sub(pixel_xpy, Set(d, kRedRampStart))); + const auto pixel_b = + Max(Set(d, 0.0f), Sub(Load(d, row_in_b + x + dx), + Add(pixel_y, Set(d, kBlueRampStart)))); + const auto blue_slope = Min(pixel_b, Set(d, kBlueRampLength)); + const auto red_slope = Min(pixel_x, Set(d, kRedRampLength)); + red_coverage = Add(red_coverage, red_slope); + blue_coverage = Add(blue_coverage, blue_slope); + } + } + + // Saturate when the high red or high blue coverage is above a level. + // The idea here is that if a certain fraction of the block is red or + // blue we consider as if it was fully red or blue. + static const float ratio = 28.0f; // out of 64 pixels. + + auto overall_red_coverage = SumOfLanes(d, red_coverage); + overall_red_coverage = + Min(overall_red_coverage, Set(d, ratio * kRedRampLength)); + overall_red_coverage = + Mul(overall_red_coverage, Set(d, red_strength / ratio)); + + auto overall_blue_coverage = SumOfLanes(d, blue_coverage); + overall_blue_coverage = + Min(overall_blue_coverage, Set(d, ratio * kBlueRampLength)); + overall_blue_coverage = + Mul(overall_blue_coverage, Set(d, blue_strength / ratio)); + + return Add(overall_red_coverage, Add(overall_blue_coverage, out_val)); +} + +// Change precision in 8x8 blocks that have high frequency content. +template +V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb, + const V out_val) { + // Zero out the invalid differences for the rightmost value per row. + const Rebind du; + HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u, + ~0u, ~0u, ~0u, 0}; + + auto sum = Zero(d); // sum of absolute differences with right and below + + static const float valmin = 0.52489909479039587f; + auto valminv = Set(d, valmin); + for (size_t dy = 0; dy < 8; ++dy) { + const float* JXL_RESTRICT row_in = xyb.Row(y + dy) + x; + const float* JXL_RESTRICT row_in_next = + dy == 7 ? row_in : xyb.Row(y + dy + 1) + x; + + // In SCALAR, there is no guarantee of having extra row padding. + // Hence, we need to ensure we don't access pixels outside the row itself. + // In SIMD modes, however, rows are padded, so it's safe to access one + // garbage value after the row. The vector then gets masked with kMaskRight + // to remove the influence of that value. +#if HWY_TARGET != HWY_SCALAR + for (size_t dx = 0; dx < 8; dx += Lanes(d)) { +#else + for (size_t dx = 0; dx < 7; dx += Lanes(d)) { +#endif + const auto p = Load(d, row_in + dx); + const auto pr = LoadU(d, row_in + dx + 1); + const auto mask = BitCast(d, Load(du, kMaskRight + dx)); + sum = Add(sum, And(mask, Min(valminv, AbsDiff(p, pr)))); + + const auto pd = Load(d, row_in_next + dx); + sum = Add(sum, Min(valminv, AbsDiff(p, pd))); + } +#if HWY_TARGET == HWY_SCALAR + const auto p = Load(d, row_in + 7); + const auto pd = Load(d, row_in_next + 7); + sum = Add(sum, Min(valminv, AbsDiff(p, pd))); +#endif + } + // more negative value gives more bpp + static const float kOffset = -2.6545897672771526; + static const float kMul = -0.049868161744916512; + + sum = SumOfLanes(d, sum); + float scalar_sum = GetLane(sum); + static const float maxsum = 7.9076877647025947f; + static const float minsum = 0.53640540945659809f; + scalar_sum = std::min(maxsum, scalar_sum); + scalar_sum = std::max(minsum, scalar_sum); + scalar_sum += kOffset; + scalar_sum *= kMul; + return Add(Set(d, scalar_sum), out_val); +} + +void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x, + const ImageF& xyb_y, const ImageF& xyb_b, + const float scale, const Rect& rect, ImageF* out) { + JXL_ASSERT(SameSize(xyb_x, xyb_y)); + JXL_ASSERT(DivCeil(xyb_x.xsize(), kBlockDim) == out->xsize()); + JXL_ASSERT(DivCeil(xyb_x.ysize(), kBlockDim) == out->ysize()); + + float base_level = 0.48f * scale; + float kDampenRampStart = 2.0f; + float kDampenRampEnd = 14.0f; + float dampen = 1.0f; + if (butteraugli_target >= kDampenRampStart) { + dampen = 1.0f - ((butteraugli_target - kDampenRampStart) / + (kDampenRampEnd - kDampenRampStart)); + if (dampen < 0) { + dampen = 0; + } + } + const float mul = scale * dampen; + const float add = (1.0f - dampen) * base_level; + for (size_t iy = rect.y0(); iy < rect.y0() + rect.ysize(); iy++) { + const size_t y = iy * 8; + float* const JXL_RESTRICT row_out = out->Row(iy); + const HWY_CAPPED(float, kBlockDim) df; + for (size_t ix = rect.x0(); ix < rect.x0() + rect.xsize(); ix++) { + size_t x = ix * 8; + auto out_val = Set(df, row_out[ix]); + out_val = ComputeMask(df, out_val); + out_val = HfModulation(df, x, y, xyb_y, out_val); + out_val = ColorModulation(df, x, y, xyb_x, xyb_y, xyb_b, + butteraugli_target, out_val); + out_val = GammaModulation(df, x, y, xyb_x, xyb_y, out_val); + // We want multiplicative quantization field, so everything + // until this point has been modulating the exponent. + row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add; + } + } +} + +template +V MaskingSqrt(const D d, V v) { + static const float kLogOffset = 27.97044946785558f; + static const float kMul = 211.53333281566171f; + const auto mul_v = Set(d, kMul * 1e8); + const auto offset_v = Set(d, kLogOffset); + return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v))); +} + +float MaskingSqrt(const float v) { + using DScalar = HWY_CAPPED(float, 1); + auto vscalar = Load(DScalar(), &v); + return GetLane(MaskingSqrt(DScalar(), vscalar)); +} + +void StoreMin4(const float v, float& min0, float& min1, float& min2, + float& min3) { + if (v < min3) { + if (v < min0) { + min3 = min2; + min2 = min1; + min1 = min0; + min0 = v; + } else if (v < min1) { + min3 = min2; + min2 = min1; + min1 = v; + } else if (v < min2) { + min3 = min2; + min2 = v; + } else { + min3 = v; + } + } +} + +// Look for smooth areas near the area of degradation. +// If the areas are generally smooth, don't do masking. +// Output is downsampled 2x. +void FuzzyErosion(const Rect& from_rect, const ImageF& from, + const Rect& to_rect, ImageF* to) { + const size_t xsize = from.xsize(); + const size_t ysize = from.ysize(); + constexpr int kStep = 1; + static_assert(kStep == 1, "Step must be 1"); + JXL_ASSERT(to_rect.xsize() * 2 == from_rect.xsize()); + JXL_ASSERT(to_rect.ysize() * 2 == from_rect.ysize()); + for (size_t fy = 0; fy < from_rect.ysize(); ++fy) { + size_t y = fy + from_rect.y0(); + size_t ym1 = y >= kStep ? y - kStep : y; + size_t yp1 = y + kStep < ysize ? y + kStep : y; + const float* rowt = from.Row(ym1); + const float* row = from.Row(y); + const float* rowb = from.Row(yp1); + float* row_out = to_rect.Row(to, fy / 2); + for (size_t fx = 0; fx < from_rect.xsize(); ++fx) { + size_t x = fx + from_rect.x0(); + size_t xm1 = x >= kStep ? x - kStep : x; + size_t xp1 = x + kStep < xsize ? x + kStep : x; + float min0 = row[x]; + float min1 = row[xm1]; + float min2 = row[xp1]; + float min3 = rowt[xm1]; + // Sort the first four values. + if (min0 > min1) std::swap(min0, min1); + if (min0 > min2) std::swap(min0, min2); + if (min0 > min3) std::swap(min0, min3); + if (min1 > min2) std::swap(min1, min2); + if (min1 > min3) std::swap(min1, min3); + if (min2 > min3) std::swap(min2, min3); + // The remaining five values of a 3x3 neighbourhood. + StoreMin4(rowt[x], min0, min1, min2, min3); + StoreMin4(rowt[xp1], min0, min1, min2, min3); + StoreMin4(rowb[xm1], min0, min1, min2, min3); + StoreMin4(rowb[x], min0, min1, min2, min3); + StoreMin4(rowb[xp1], min0, min1, min2, min3); + static const float kMul0 = 0.125f; + static const float kMul1 = 0.075f; + static const float kMul2 = 0.06f; + static const float kMul3 = 0.05f; + float v = kMul0 * min0 + kMul1 * min1 + kMul2 * min2 + kMul3 * min3; + if (fx % 2 == 0 && fy % 2 == 0) { + row_out[fx / 2] = v; + } else { + row_out[fx / 2] += v; + } + } + } +} + +struct AdaptiveQuantizationImpl { + void Init(const Image3F& xyb) { + JXL_DASSERT(xyb.xsize() % kBlockDim == 0); + JXL_DASSERT(xyb.ysize() % kBlockDim == 0); + const size_t xsize = xyb.xsize(); + const size_t ysize = xyb.ysize(); + aq_map = ImageF(xsize / kBlockDim, ysize / kBlockDim); + } + void PrepareBuffers(size_t num_threads) { + diff_buffer = ImageF(kEncTileDim + 8, num_threads); + for (size_t i = pre_erosion.size(); i < num_threads; i++) { + pre_erosion.emplace_back(kEncTileDimInBlocks * 2 + 2, + kEncTileDimInBlocks * 2 + 2); + } + } + + void ComputeTile(float butteraugli_target, float scale, const Image3F& xyb, + const Rect& rect, const int thread, ImageF* mask) { + PROFILER_ZONE("aq DiffPrecompute"); + const size_t xsize = xyb.xsize(); + const size_t ysize = xyb.ysize(); + + // The XYB gamma is 3.0 to be able to decode faster with two muls. + // Butteraugli's gamma is matching the gamma of human eye, around 2.6. + // We approximate the gamma difference by adding one cubic root into + // the adaptive quantization. This gives us a total gamma of 2.6666 + // for quantization uses. + const float match_gamma_offset = 0.019; + + const HWY_FULL(float) df; + + size_t y_start = rect.y0() * 8; + size_t y_end = y_start + rect.ysize() * 8; + + size_t x0 = rect.x0() * 8; + size_t x1 = x0 + rect.xsize() * 8; + if (x0 != 0) x0 -= 4; + if (x1 != xyb.xsize()) x1 += 4; + if (y_start != 0) y_start -= 4; + if (y_end != xyb.ysize()) y_end += 4; + pre_erosion[thread].ShrinkTo((x1 - x0) / 4, (y_end - y_start) / 4); + + static const float limit = 0.2f; + // Computes image (padded to multiple of 8x8) of local pixel differences. + // Subsample both directions by 4. + for (size_t y = y_start; y < y_end; ++y) { + size_t y2 = y + 1 < ysize ? y + 1 : y; + size_t y1 = y > 0 ? y - 1 : y; + + const float* row_in = xyb.PlaneRow(1, y); + const float* row_in1 = xyb.PlaneRow(1, y1); + const float* row_in2 = xyb.PlaneRow(1, y2); + float* JXL_RESTRICT row_out = diff_buffer.Row(thread); + + auto scalar_pixel = [&](size_t x) { + const size_t x2 = x + 1 < xsize ? x + 1 : x; + const size_t x1 = x > 0 ? x - 1 : x; + const float base = + 0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]); + const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma( + row_in[x] + match_gamma_offset); + float diff = gammac * (row_in[x] - base); + diff *= diff; + if (diff >= limit) { + diff = limit; + } + diff = MaskingSqrt(diff); + if ((y % 4) != 0) { + row_out[x - x0] += diff; + } else { + row_out[x - x0] = diff; + } + }; + + size_t x = x0; + // First pixel of the row. + if (x0 == 0) { + scalar_pixel(x0); + ++x; + } + // SIMD + const auto match_gamma_offset_v = Set(df, match_gamma_offset); + const auto quarter = Set(df, 0.25f); + for (; x + 1 + Lanes(df) < x1; x += Lanes(df)) { + const auto in = LoadU(df, row_in + x); + const auto in_r = LoadU(df, row_in + x + 1); + const auto in_l = LoadU(df, row_in + x - 1); + const auto in_t = LoadU(df, row_in2 + x); + const auto in_b = LoadU(df, row_in1 + x); + auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b))); + auto gammacv = + RatioOfDerivativesOfCubicRootToSimpleGamma( + df, Add(in, match_gamma_offset_v)); + auto diff = Mul(gammacv, Sub(in, base)); + diff = Mul(diff, diff); + diff = Min(diff, Set(df, limit)); + diff = MaskingSqrt(df, diff); + if ((y & 3) != 0) { + diff = Add(diff, LoadU(df, row_out + x - x0)); + } + StoreU(diff, df, row_out + x - x0); + } + // Scalar + for (; x < x1; ++x) { + scalar_pixel(x); + } + if (y % 4 == 3) { + float* row_dout = pre_erosion[thread].Row((y - y_start) / 4); + for (size_t x = 0; x < (x1 - x0) / 4; x++) { + row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] + + row_out[x * 4 + 2] + row_out[x * 4 + 3]) * + 0.25f; + } + } + } + Rect from_rect(x0 % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1, + rect.xsize() * 2, rect.ysize() * 2); + FuzzyErosion(from_rect, pre_erosion[thread], rect, &aq_map); + for (size_t y = 0; y < rect.ysize(); ++y) { + const float* aq_map_row = rect.ConstRow(aq_map, y); + float* mask_row = rect.Row(mask, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]); + } + } + PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1), + xyb.Plane(2), scale, rect, &aq_map); + } + std::vector pre_erosion; + ImageF aq_map; + ImageF diff_buffer; +}; + +ImageF AdaptiveQuantizationMap(const float butteraugli_target, + const Image3F& xyb, + const FrameDimensions& frame_dim, float scale, + ThreadPool* pool, ImageF* mask) { + PROFILER_ZONE("aq AdaptiveQuantMap"); + + AdaptiveQuantizationImpl impl; + impl.Init(xyb); + *mask = ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + JXL_CHECK(RunOnPool( + pool, 0, + DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks) * + DivCeil(frame_dim.ysize_blocks, kEncTileDimInBlocks), + [&](const size_t num_threads) { + impl.PrepareBuffers(num_threads); + return true; + }, + [&](const uint32_t tid, const size_t thread) { + size_t n_enc_tiles = + DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = + std::min((ty + 1) * kEncTileDimInBlocks, frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = + std::min((tx + 1) * kEncTileDimInBlocks, frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + impl.ComputeTile(butteraugli_target, scale, xyb, r, thread, mask); + }, + "AQ DiffPrecompute")); + + return std::move(impl).aq_map; +} + +} // namespace + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(AdaptiveQuantizationMap); + +namespace { +// If true, prints the quantization maps at each iteration. +bool FLAGS_dump_quant_state = false; + +void DumpHeatmap(const AuxOut* aux_out, const std::string& label, + const ImageF& image, float good_threshold, + float bad_threshold) { + Image3F heatmap = CreateHeatMapImage(image, good_threshold, bad_threshold); + char filename[200]; + snprintf(filename, sizeof(filename), "%s%05d", label.c_str(), + aux_out->num_butteraugli_iters); + aux_out->DumpImage(filename, heatmap); +} + +void DumpHeatmaps(const AuxOut* aux_out, float ba_target, + const ImageF& quant_field, const ImageF& tile_heatmap, + const ImageF& bt_diffmap) { + if (!WantDebugOutput(aux_out)) return; + ImageF inv_qmap(quant_field.xsize(), quant_field.ysize()); + for (size_t y = 0; y < quant_field.ysize(); ++y) { + const float* JXL_RESTRICT row_q = quant_field.ConstRow(y); + float* JXL_RESTRICT row_inv_q = inv_qmap.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + row_inv_q[x] = 1.0f / row_q[x]; // never zero + } + } + DumpHeatmap(aux_out, "quant_heatmap", inv_qmap, 4.0f * ba_target, + 6.0f * ba_target); + DumpHeatmap(aux_out, "tile_heatmap", tile_heatmap, ba_target, + 1.5f * ba_target); + // matches heat maps produced by the command line tool. + DumpHeatmap(aux_out, "bt_diffmap", bt_diffmap, ButteraugliFuzzyInverse(1.5), + ButteraugliFuzzyInverse(0.5)); +} + +ImageF TileDistMap(const ImageF& distmap, int tile_size, int margin, + const AcStrategyImage& ac_strategy) { + PROFILER_FUNC; + const int tile_xsize = (distmap.xsize() + tile_size - 1) / tile_size; + const int tile_ysize = (distmap.ysize() + tile_size - 1) / tile_size; + ImageF tile_distmap(tile_xsize, tile_ysize); + size_t distmap_stride = tile_distmap.PixelsPerRow(); + for (int tile_y = 0; tile_y < tile_ysize; ++tile_y) { + AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(tile_y); + float* JXL_RESTRICT dist_row = tile_distmap.Row(tile_y); + for (int tile_x = 0; tile_x < tile_xsize; ++tile_x) { + AcStrategy acs = ac_strategy_row[tile_x]; + if (!acs.IsFirstBlock()) continue; + int this_tile_xsize = acs.covered_blocks_x() * tile_size; + int this_tile_ysize = acs.covered_blocks_y() * tile_size; + int y_begin = std::max(0, tile_size * tile_y - margin); + int y_end = std::min(distmap.ysize(), + tile_size * tile_y + this_tile_ysize + margin); + int x_begin = std::max(0, tile_size * tile_x - margin); + int x_end = std::min(distmap.xsize(), + tile_size * tile_x + this_tile_xsize + margin); + float dist_norm = 0.0; + double pixels = 0; + for (int y = y_begin; y < y_end; ++y) { + float ymul = 1.0; + constexpr float kBorderMul = 0.98f; + constexpr float kCornerMul = 0.7f; + if (margin != 0 && (y == y_begin || y == y_end - 1)) { + ymul = kBorderMul; + } + const float* const JXL_RESTRICT row = distmap.Row(y); + for (int x = x_begin; x < x_end; ++x) { + float xmul = ymul; + if (margin != 0 && (x == x_begin || x == x_end - 1)) { + if (xmul == 1.0) { + xmul = kBorderMul; + } else { + xmul = kCornerMul; + } + } + float v = row[x]; + v *= v; + v *= v; + v *= v; + v *= v; + dist_norm += xmul * v; + pixels += xmul; + } + } + if (pixels == 0) pixels = 1; + // 16th norm is less than the max norm, we reduce the difference + // with this normalization factor. + constexpr float kTileNorm = 1.2f; + const float tile_dist = + kTileNorm * std::pow(dist_norm / pixels, 1.0f / 16.0f); + dist_row[tile_x] = tile_dist; + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + dist_row[tile_x + distmap_stride * iy + ix] = tile_dist; + } + } + } + } + return tile_distmap; +} + +static const float kDcQuantPow = 0.83; +static const float kDcQuant = 1.095924047623553f; +static const float kAcQuant = 0.80751132443618624f; + +void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin, + PassesEncoderState* enc_state, + const JxlCmsInterface& cms, ThreadPool* pool, + AuxOut* aux_out) { + const CompressParams& cparams = enc_state->cparams; + if (cparams.resampling > 1 && + cparams.original_butteraugli_distance <= 4.0 * cparams.resampling) { + // For downsampled opsin image, the butteraugli based adaptive quantization + // loop would only make the size bigger without improving the distance much, + // so in this case we enable it only for very high butteraugli targets. + return; + } + Quantizer& quantizer = enc_state->shared.quantizer; + ImageI& raw_quant_field = enc_state->shared.raw_quant_field; + ImageF& quant_field = enc_state->initial_quant_field; + + // TODO(veluca): this should really be rather handled on the + // ButteraugliComparator side. + struct TemporaryShrink { + TemporaryShrink(ImageBundle& bundle, size_t xsize, size_t ysize) + : bundle(bundle), + orig_xsize(bundle.xsize()), + orig_ysize(bundle.ysize()) { + bundle.ShrinkTo(xsize, ysize); + } + TemporaryShrink(const TemporaryShrink&) = delete; + TemporaryShrink(TemporaryShrink&&) = delete; + + ~TemporaryShrink() { bundle.ShrinkTo(orig_xsize, orig_ysize); } + + ImageBundle& bundle; + size_t orig_xsize; + size_t orig_ysize; + } t(const_cast(linear), + enc_state->shared.frame_header.nonserialized_metadata->xsize(), + enc_state->shared.frame_header.nonserialized_metadata->ysize()); + + const float butteraugli_target = cparams.butteraugli_distance; + const float original_butteraugli = cparams.original_butteraugli_distance; + ButteraugliParams params = cparams.ba_params; + params.intensity_target = linear.metadata()->IntensityTarget(); + // Hack the default intensity target value to be 80.0, the intensity + // target of sRGB images and a more reasonable viewing default than + // JPEG XL file format's default. + if (fabs(params.intensity_target - 255.0f) < 1e-3) { + params.intensity_target = 80.0f; + } + JxlButteraugliComparator comparator(params, cms); + JXL_CHECK(comparator.SetReferenceImage(linear)); + bool lower_is_better = + (comparator.GoodQualityScore() < comparator.BadQualityScore()); + const float initial_quant_dc = InitialQuantDC(butteraugli_target); + AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field), + &quant_field); + ImageF tile_distmap; + ImageF initial_quant_field = CopyImage(quant_field); + + float initial_qf_min, initial_qf_max; + ImageMinMax(initial_quant_field, &initial_qf_min, &initial_qf_max); + float initial_qf_ratio = initial_qf_max / initial_qf_min; + float qf_max_deviation_low = std::sqrt(250 / initial_qf_ratio); + float asymmetry = 2; + if (qf_max_deviation_low < asymmetry) asymmetry = qf_max_deviation_low; + float qf_lower = initial_qf_min / (asymmetry * qf_max_deviation_low); + float qf_higher = initial_qf_max * (qf_max_deviation_low / asymmetry); + + JXL_ASSERT(qf_higher / qf_lower < 253); + + constexpr int kOriginalComparisonRound = 1; + int iters = cparams.max_butteraugli_iters; + if (iters > 7) { + iters = 7; + } + if (cparams.speed_tier != SpeedTier::kTortoise) { + iters = 2; + } + for (int i = 0; i < iters + 1; ++i) { + if (FLAGS_dump_quant_state) { + printf("\nQuantization field:\n"); + for (size_t y = 0; y < quant_field.ysize(); ++y) { + for (size_t x = 0; x < quant_field.xsize(); ++x) { + printf(" %.5f", quant_field.Row(y)[x]); + } + printf("\n"); + } + } + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); + ImageBundle dec_linear = RoundtripImage(opsin, enc_state, cms, pool); + PROFILER_ZONE("enc Butteraugli"); + float score; + ImageF diffmap; + JXL_CHECK(comparator.CompareWith(dec_linear, &diffmap, &score)); + if (!lower_is_better) { + score = -score; + diffmap = ScaleImage(-1.0f, diffmap); + } + tile_distmap = TileDistMap(diffmap, 8 * cparams.resampling, 0, + enc_state->shared.ac_strategy); + if (WantDebugOutput(aux_out)) { + aux_out->DumpImage(("dec" + ToString(i)).c_str(), *dec_linear.color()); + DumpHeatmaps(aux_out, butteraugli_target, quant_field, tile_distmap, + diffmap); + } + if (aux_out != nullptr) ++aux_out->num_butteraugli_iters; + if (cparams.log_search_state) { + float minval, maxval; + ImageMinMax(quant_field, &minval, &maxval); + printf("\nButteraugli iter: %d/%d\n", i, cparams.max_butteraugli_iters); + printf("Butteraugli distance: %f (target = %f)\n", score, + original_butteraugli); + printf("quant range: %f ... %f DC quant: %f\n", minval, maxval, + initial_quant_dc); + if (FLAGS_dump_quant_state) { + quantizer.DumpQuantizationMap(raw_quant_field); + } + } + + if (i == iters) break; + + double kPow[8] = { + 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + }; + double kPowMod[8] = { + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + }; + if (i == kOriginalComparisonRound) { + // Don't allow optimization to make the quant field a lot worse than + // what the initial guess was. This allows the AC field to have enough + // precision to reduce the oscillations due to the dc reconstruction. + double kInitMul = 0.6; + const double kOneMinusInitMul = 1.0 - kInitMul; + for (size_t y = 0; y < quant_field.ysize(); ++y) { + float* const JXL_RESTRICT row_q = quant_field.Row(y); + const float* const JXL_RESTRICT row_init = initial_quant_field.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + double clamp = kOneMinusInitMul * row_q[x] + kInitMul * row_init[x]; + if (row_q[x] < clamp) { + row_q[x] = clamp; + if (row_q[x] > qf_higher) row_q[x] = qf_higher; + if (row_q[x] < qf_lower) row_q[x] = qf_lower; + } + } + } + } + + double cur_pow = 0.0; + if (i < 7) { + cur_pow = kPow[i] + (original_butteraugli - 1.0) * kPowMod[i]; + if (cur_pow < 0) { + cur_pow = 0; + } + } + if (cur_pow == 0.0) { + for (size_t y = 0; y < quant_field.ysize(); ++y) { + const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y); + float* const JXL_RESTRICT row_q = quant_field.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + const float diff = row_dist[x] / original_butteraugli; + if (diff > 1.0f) { + float old = row_q[x]; + row_q[x] *= diff; + int qf_old = old * quantizer.InvGlobalScale() + 0.5; + int qf_new = row_q[x] * quantizer.InvGlobalScale() + 0.5; + if (qf_old == qf_new) { + row_q[x] = old + quantizer.Scale(); + } + } + if (row_q[x] > qf_higher) row_q[x] = qf_higher; + if (row_q[x] < qf_lower) row_q[x] = qf_lower; + } + } + } else { + for (size_t y = 0; y < quant_field.ysize(); ++y) { + const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y); + float* const JXL_RESTRICT row_q = quant_field.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + const float diff = row_dist[x] / original_butteraugli; + if (diff <= 1.0f) { + row_q[x] *= std::pow(diff, cur_pow); + } else { + float old = row_q[x]; + row_q[x] *= diff; + int qf_old = old * quantizer.InvGlobalScale() + 0.5; + int qf_new = row_q[x] * quantizer.InvGlobalScale() + 0.5; + if (qf_old == qf_new) { + row_q[x] = old + quantizer.Scale(); + } + } + if (row_q[x] > qf_higher) row_q[x] = qf_higher; + if (row_q[x] < qf_lower) row_q[x] = qf_lower; + } + } + } + } + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); +} + +void FindBestQuantizationMaxError(const Image3F& opsin, + PassesEncoderState* enc_state, + const JxlCmsInterface& cms, ThreadPool* pool, + AuxOut* aux_out) { + // TODO(szabadka): Make this work for non-opsin color spaces. + const CompressParams& cparams = enc_state->cparams; + Quantizer& quantizer = enc_state->shared.quantizer; + ImageI& raw_quant_field = enc_state->shared.raw_quant_field; + ImageF& quant_field = enc_state->initial_quant_field; + + // TODO(veluca): better choice of this value. + const float initial_quant_dc = + 16 * std::sqrt(0.1f / cparams.butteraugli_distance); + AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field), + &quant_field); + + const float inv_max_err[3] = {1.0f / enc_state->cparams.max_error[0], + 1.0f / enc_state->cparams.max_error[1], + 1.0f / enc_state->cparams.max_error[2]}; + + for (int i = 0; i < cparams.max_butteraugli_iters + 1; ++i) { + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); + if (aux_out) { + aux_out->DumpXybImage(("ops" + ToString(i)).c_str(), opsin); + } + ImageBundle decoded = RoundtripImage(opsin, enc_state, cms, pool); + if (aux_out) { + aux_out->DumpXybImage(("dec" + ToString(i)).c_str(), *decoded.color()); + } + + for (size_t by = 0; by < enc_state->shared.frame_dim.ysize_blocks; by++) { + AcStrategyRow ac_strategy_row = + enc_state->shared.ac_strategy.ConstRow(by); + for (size_t bx = 0; bx < enc_state->shared.frame_dim.xsize_blocks; bx++) { + AcStrategy acs = ac_strategy_row[bx]; + if (!acs.IsFirstBlock()) continue; + float max_error = 0; + for (size_t c = 0; c < 3; c++) { + for (size_t y = by * kBlockDim; + y < (by + acs.covered_blocks_y()) * kBlockDim; y++) { + if (y >= decoded.ysize()) continue; + const float* JXL_RESTRICT in_row = opsin.ConstPlaneRow(c, y); + const float* JXL_RESTRICT dec_row = + decoded.color()->ConstPlaneRow(c, y); + for (size_t x = bx * kBlockDim; + x < (bx + acs.covered_blocks_x()) * kBlockDim; x++) { + if (x >= decoded.xsize()) continue; + max_error = std::max( + std::abs(in_row[x] - dec_row[x]) * inv_max_err[c], max_error); + } + } + } + // Target an error between max_error/2 and max_error. + // If the error in the varblock is above the target, increase the qf to + // compensate. If the error is below the target, decrease the qf. + // However, to avoid an excessive increase of the qf, only do so if the + // error is less than half the maximum allowed error. + const float qf_mul = (max_error < 0.5f) ? max_error * 2.0f + : (max_error > 1.0f) ? max_error + : 1.0f; + for (size_t qy = by; qy < by + acs.covered_blocks_y(); qy++) { + float* JXL_RESTRICT quant_field_row = quant_field.Row(qy); + for (size_t qx = bx; qx < bx + acs.covered_blocks_x(); qx++) { + quant_field_row[qx] *= qf_mul; + } + } + } + } + } + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); +} + +} // namespace + +void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect, + ImageF* quant_field) { + // Replace the whole quant_field in non-8x8 blocks with the maximum of each + // 8x8 block. + size_t stride = quant_field->PixelsPerRow(); + for (size_t y = 0; y < rect.ysize(); ++y) { + AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(rect, y); + float* JXL_RESTRICT quant_row = rect.Row(quant_field, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + AcStrategy acs = ac_strategy_row[x]; + if (!acs.IsFirstBlock()) continue; + JXL_ASSERT(x + acs.covered_blocks_x() <= quant_field->xsize()); + JXL_ASSERT(y + acs.covered_blocks_y() <= quant_field->ysize()); + float max = quant_row[x]; + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + max = std::max(quant_row[x + ix + iy * stride], max); + } + } + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + quant_row[x + ix + iy * stride] = max; + } + } + } + } +} + +float InitialQuantDC(float butteraugli_target) { + const float kDcMul = 0.3; // Butteraugli target where non-linearity kicks in. + const float butteraugli_target_dc = std::max( + 0.5f * butteraugli_target, + std::min(butteraugli_target, + kDcMul * std::pow((1.0f / kDcMul) * butteraugli_target, + kDcQuantPow))); + // We want the maximum DC value to be at most 2**15 * kInvDCQuant / quant_dc. + // The maximum DC value might not be in the kXybRange because of inverse + // gaborish, so we add some slack to the maximum theoretical quant obtained + // this way (64). + return std::min(kDcQuant / butteraugli_target_dc, 50.f); +} + +ImageF InitialQuantField(const float butteraugli_target, const Image3F& opsin, + const FrameDimensions& frame_dim, ThreadPool* pool, + float rescale, ImageF* mask) { + PROFILER_FUNC; + const float quant_ac = kAcQuant / butteraugli_target; + return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)( + butteraugli_target, opsin, frame_dim, quant_ac * rescale, pool, mask); +} + +void FindBestQuantizer(const ImageBundle* linear, const Image3F& opsin, + PassesEncoderState* enc_state, + const JxlCmsInterface& cms, ThreadPool* pool, + AuxOut* aux_out, double rescale) { + const CompressParams& cparams = enc_state->cparams; + if (cparams.max_error_mode) { + PROFILER_ZONE("enc find best maxerr"); + FindBestQuantizationMaxError(opsin, enc_state, cms, pool, aux_out); + } else if (cparams.speed_tier <= SpeedTier::kKitten) { + // Normal encoding to a butteraugli score. + PROFILER_ZONE("enc find best2"); + FindBestQuantization(*linear, opsin, enc_state, cms, pool, aux_out); + } +} + +ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state, + const JxlCmsInterface& cms, ThreadPool* pool) { + PROFILER_ZONE("enc roundtrip"); + std::unique_ptr dec_state = + jxl::make_unique(); + JXL_CHECK(dec_state->output_encoding_info.SetFromMetadata( + *enc_state->shared.metadata)); + dec_state->shared = &enc_state->shared; + JXL_ASSERT(opsin.ysize() % kBlockDim == 0); + + const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim); + const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim); + const size_t num_groups = xsize_groups * ysize_groups; + + size_t num_special_frames = enc_state->special_frames.size(); + + std::unique_ptr modular_frame_encoder = + jxl::make_unique(enc_state->shared.frame_header, + enc_state->cparams); + JXL_CHECK(InitializePassesEncoder(opsin, cms, pool, enc_state, + modular_frame_encoder.get(), nullptr)); + JXL_CHECK(dec_state->Init()); + JXL_CHECK(dec_state->InitForAC(pool)); + + ImageBundle decoded(&enc_state->shared.metadata->m); + decoded.origin = enc_state->shared.frame_header.frame_origin; + decoded.SetFromImage(Image3F(opsin.xsize(), opsin.ysize()), + dec_state->output_encoding_info.color_encoding); + + PassesDecoderState::PipelineOptions options; + options.use_slow_render_pipeline = false; + options.coalescing = true; + options.render_spotcolors = false; + + // Same as dec_state->shared->frame_header.nonserialized_metadata->m + const ImageMetadata& metadata = *decoded.metadata(); + + JXL_CHECK(dec_state->PreparePipeline(&decoded, options)); + + hwy::AlignedUniquePtr group_dec_caches; + const auto allocate_storage = [&](const size_t num_threads) -> Status { + JXL_RETURN_IF_ERROR( + dec_state->render_pipeline->PrepareForThreads(num_threads, + /*use_group_ids=*/false)); + group_dec_caches = hwy::MakeUniqueAlignedArray(num_threads); + return true; + }; + const auto process_group = [&](const uint32_t group_index, + const size_t thread) { + if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) { + ComputeSigma(dec_state->shared->BlockGroupRect(group_index), + dec_state.get()); + } + RenderPipelineInput input = + dec_state->render_pipeline->GetInputBuffers(group_index, thread); + JXL_CHECK(DecodeGroupForRoundtrip( + enc_state->coeffs, group_index, dec_state.get(), + &group_dec_caches[thread], thread, input, &decoded, nullptr)); + for (size_t c = 0; c < metadata.num_extra_channels; c++) { + std::pair ri = input.GetBuffer(3 + c); + FillPlane(0.0f, ri.first, ri.second); + } + input.Done(); + }; + JXL_CHECK(RunOnPool(pool, 0, num_groups, allocate_storage, process_group, + "AQ loop")); + + // Ensure we don't create any new special frames. + enc_state->special_frames.resize(num_special_frames); + + return decoded; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h new file mode 100644 index 0000000000..a63c574492 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h @@ -0,0 +1,66 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_ +#define LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_ + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" + +// Heuristics to find a good quantizer for a given image. InitialQuantField +// produces a quantization field (i.e. relative quantization amounts for each +// block) out of an opsin-space image. `InitialQuantField` uses heuristics, +// `FindBestQuantizer` (in non-fast mode) will run multiple encoding-decoding +// steps and try to improve the given quant field. + +namespace jxl { + +struct AuxOut; + +// Computes the decoded image for a given set of compression parameters. Mainly +// used in the FindBestQuantization loops and in some tests. +// TODO(veluca): this doesn't seem the best possible file for this function. +ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state, + const JxlCmsInterface& cms, ThreadPool* pool); + +// Returns an image subsampled by kBlockDim in each direction. If the value +// at pixel (x,y) in the returned image is greater than 1.0, it means that +// more fine-grained quantization should be used in the corresponding block +// of the input image, while a value less than 1.0 indicates that less +// fine-grained quantization should be enough. Returns a mask, too, which +// can later be used to make better decisions about ac strategy. +ImageF InitialQuantField(float butteraugli_target, const Image3F& opsin, + const FrameDimensions& frame_dim, ThreadPool* pool, + float rescale, ImageF* initial_quant_mask); + +float InitialQuantDC(float butteraugli_target); + +void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect, + ImageF* quant_field); + +// Returns a quantizer that uses an adjusted version of the provided +// quant_field. Also computes the dequant_map corresponding to the given +// dequant_float_map and chosen quantization levels. +// `linear` is only used in Kitten mode or slower. +void FindBestQuantizer(const ImageBundle* linear, const Image3F& opsin, + PassesEncoderState* enc_state, + const JxlCmsInterface& cms, ThreadPool* pool, + AuxOut* aux_out, double rescale = 1.0); + +} // namespace jxl + +#endif // LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_ans.cc b/third_party/jpeg-xl/lib/jxl/enc_ans.cc new file mode 100644 index 0000000000..4249426bc9 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_ans.cc @@ -0,0 +1,1688 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_ans.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ans_common.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_cluster.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/enc_huffman.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/fields.h" + +namespace jxl { + +namespace { + +bool ans_fuzzer_friendly_ = false; + +static const int kMaxNumSymbolsForSmallCode = 4; + +void ANSBuildInfoTable(const ANSHistBin* counts, const AliasTable::Entry* table, + size_t alphabet_size, size_t log_alpha_size, + ANSEncSymbolInfo* info) { + size_t log_entry_size = ANS_LOG_TAB_SIZE - log_alpha_size; + size_t entry_size_minus_1 = (1 << log_entry_size) - 1; + // create valid alias table for empty streams. + for (size_t s = 0; s < std::max(1, alphabet_size); ++s) { + const ANSHistBin freq = s == alphabet_size ? ANS_TAB_SIZE : counts[s]; + info[s].freq_ = static_cast(freq); +#ifdef USE_MULT_BY_RECIPROCAL + if (freq != 0) { + info[s].ifreq_ = + ((1ull << RECIPROCAL_PRECISION) + info[s].freq_ - 1) / info[s].freq_; + } else { + info[s].ifreq_ = 1; // shouldn't matter (symbol shouldn't occur), but... + } +#endif + info[s].reverse_map_.resize(freq); + } + for (int i = 0; i < ANS_TAB_SIZE; i++) { + AliasTable::Symbol s = + AliasTable::Lookup(table, i, log_entry_size, entry_size_minus_1); + info[s.value].reverse_map_[s.offset] = i; + } +} + +float EstimateDataBits(const ANSHistBin* histogram, const ANSHistBin* counts, + size_t len) { + float sum = 0.0f; + int total_histogram = 0; + int total_counts = 0; + for (size_t i = 0; i < len; ++i) { + total_histogram += histogram[i]; + total_counts += counts[i]; + if (histogram[i] > 0) { + JXL_ASSERT(counts[i] > 0); + // += histogram[i] * -log(counts[i]/total_counts) + sum += histogram[i] * + std::max(0.0f, ANS_LOG_TAB_SIZE - FastLog2f(counts[i])); + } + } + if (total_histogram > 0) { + // Used only in assert. + (void)total_counts; + JXL_ASSERT(total_counts == ANS_TAB_SIZE); + } + return sum; +} + +float EstimateDataBitsFlat(const ANSHistBin* histogram, size_t len) { + const float flat_bits = std::max(FastLog2f(len), 0.0f); + float total_histogram = 0; + for (size_t i = 0; i < len; ++i) { + total_histogram += histogram[i]; + } + return total_histogram * flat_bits; +} + +// Static Huffman code for encoding logcounts. The last symbol is used as RLE +// sequence. +static const uint8_t kLogCountBitLengths[ANS_LOG_TAB_SIZE + 2] = { + 5, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 6, 7, 7, +}; +static const uint8_t kLogCountSymbols[ANS_LOG_TAB_SIZE + 2] = { + 17, 11, 15, 3, 9, 7, 4, 2, 5, 6, 0, 33, 1, 65, +}; + +// Returns the difference between largest count that can be represented and is +// smaller than "count" and smallest representable count larger than "count". +static int SmallestIncrement(uint32_t count, uint32_t shift) { + int bits = count == 0 ? -1 : FloorLog2Nonzero(count); + int drop_bits = bits - GetPopulationCountPrecision(bits, shift); + return drop_bits < 0 ? 1 : (1 << drop_bits); +} + +template +bool RebalanceHistogram(const float* targets, int max_symbol, int table_size, + uint32_t shift, int* omit_pos, ANSHistBin* counts) { + int sum = 0; + float sum_nonrounded = 0.0; + int remainder_pos = 0; // if all of them are handled in first loop + int remainder_log = -1; + for (int n = 0; n < max_symbol; ++n) { + if (targets[n] > 0 && targets[n] < 1.0f) { + counts[n] = 1; + sum_nonrounded += targets[n]; + sum += counts[n]; + } + } + const float discount_ratio = + (table_size - sum) / (table_size - sum_nonrounded); + JXL_ASSERT(discount_ratio > 0); + JXL_ASSERT(discount_ratio <= 1.0f); + // Invariant for minimize_error_of_sum == true: + // abs(sum - sum_nonrounded) + // <= SmallestIncrement(max(targets[])) + max_symbol + for (int n = 0; n < max_symbol; ++n) { + if (targets[n] >= 1.0f) { + sum_nonrounded += targets[n]; + counts[n] = + static_cast(targets[n] * discount_ratio); // truncate + if (counts[n] == 0) counts[n] = 1; + if (counts[n] == table_size) counts[n] = table_size - 1; + // Round the count to the closest nonzero multiple of SmallestIncrement + // (when minimize_error_of_sum is false) or one of two closest so as to + // keep the sum as close as possible to sum_nonrounded. + int inc = SmallestIncrement(counts[n], shift); + counts[n] -= counts[n] & (inc - 1); + // TODO(robryk): Should we rescale targets[n]? + const float target = + minimize_error_of_sum ? (sum_nonrounded - sum) : targets[n]; + if (counts[n] == 0 || + (target > counts[n] + inc / 2 && counts[n] + inc < table_size)) { + counts[n] += inc; + } + sum += counts[n]; + const int count_log = FloorLog2Nonzero(static_cast(counts[n])); + if (count_log > remainder_log) { + remainder_pos = n; + remainder_log = count_log; + } + } + } + JXL_ASSERT(remainder_pos != -1); + // NOTE: This is the only place where counts could go negative. We could + // detect that, return false and make ANSHistBin uint32_t. + counts[remainder_pos] -= sum - table_size; + *omit_pos = remainder_pos; + return counts[remainder_pos] > 0; +} + +Status NormalizeCounts(ANSHistBin* counts, int* omit_pos, const int length, + const int precision_bits, uint32_t shift, + int* num_symbols, int* symbols) { + const int32_t table_size = 1 << precision_bits; // target sum / table size + uint64_t total = 0; + int max_symbol = 0; + int symbol_count = 0; + for (int n = 0; n < length; ++n) { + total += counts[n]; + if (counts[n] > 0) { + if (symbol_count < kMaxNumSymbolsForSmallCode) { + symbols[symbol_count] = n; + } + ++symbol_count; + max_symbol = n + 1; + } + } + *num_symbols = symbol_count; + if (symbol_count == 0) { + return true; + } + if (symbol_count == 1) { + counts[symbols[0]] = table_size; + return true; + } + if (symbol_count > table_size) + return JXL_FAILURE("Too many entries in an ANS histogram"); + + const float norm = 1.f * table_size / total; + std::vector targets(max_symbol); + for (size_t n = 0; n < targets.size(); ++n) { + targets[n] = norm * counts[n]; + } + if (!RebalanceHistogram(&targets[0], max_symbol, table_size, shift, + omit_pos, counts)) { + // Use an alternative rebalancing mechanism if the one above failed + // to create a histogram that is positive wherever the original one was. + if (!RebalanceHistogram(&targets[0], max_symbol, table_size, shift, + omit_pos, counts)) { + return JXL_FAILURE("Logic error: couldn't rebalance a histogram"); + } + } + return true; +} + +struct SizeWriter { + size_t size = 0; + void Write(size_t num, size_t bits) { size += num; } +}; + +template +void StoreVarLenUint8(size_t n, Writer* writer) { + JXL_DASSERT(n <= 255); + if (n == 0) { + writer->Write(1, 0); + } else { + writer->Write(1, 1); + size_t nbits = FloorLog2Nonzero(n); + writer->Write(3, nbits); + writer->Write(nbits, n - (1ULL << nbits)); + } +} + +template +void StoreVarLenUint16(size_t n, Writer* writer) { + JXL_DASSERT(n <= 65535); + if (n == 0) { + writer->Write(1, 0); + } else { + writer->Write(1, 1); + size_t nbits = FloorLog2Nonzero(n); + writer->Write(4, nbits); + writer->Write(nbits, n - (1ULL << nbits)); + } +} + +template +bool EncodeCounts(const ANSHistBin* counts, const int alphabet_size, + const int omit_pos, const int num_symbols, uint32_t shift, + const int* symbols, Writer* writer) { + bool ok = true; + if (num_symbols <= 2) { + // Small tree marker to encode 1-2 symbols. + writer->Write(1, 1); + if (num_symbols == 0) { + writer->Write(1, 0); + StoreVarLenUint8(0, writer); + } else { + writer->Write(1, num_symbols - 1); + for (int i = 0; i < num_symbols; ++i) { + StoreVarLenUint8(symbols[i], writer); + } + } + if (num_symbols == 2) { + writer->Write(ANS_LOG_TAB_SIZE, counts[symbols[0]]); + } + } else { + // Mark non-small tree. + writer->Write(1, 0); + // Mark non-flat histogram. + writer->Write(1, 0); + + // Precompute sequences for RLE encoding. Contains the number of identical + // values starting at a given index. Only contains the value at the first + // element of the series. + std::vector same(alphabet_size, 0); + int last = 0; + for (int i = 1; i < alphabet_size; i++) { + // Store the sequence length once different symbol reached, or we're at + // the end, or the length is longer than we can encode, or we are at + // the omit_pos. We don't support including the omit_pos in an RLE + // sequence because this value may use a different amount of log2 bits + // than standard, it is too complex to handle in the decoder. + if (counts[i] != counts[last] || i + 1 == alphabet_size || + (i - last) >= 255 || i == omit_pos || i == omit_pos + 1) { + same[last] = (i - last); + last = i + 1; + } + } + + int length = 0; + std::vector logcounts(alphabet_size); + int omit_log = 0; + for (int i = 0; i < alphabet_size; ++i) { + JXL_ASSERT(counts[i] <= ANS_TAB_SIZE); + JXL_ASSERT(counts[i] >= 0); + if (i == omit_pos) { + length = i + 1; + } else if (counts[i] > 0) { + logcounts[i] = FloorLog2Nonzero(static_cast(counts[i])) + 1; + length = i + 1; + if (i < omit_pos) { + omit_log = std::max(omit_log, logcounts[i] + 1); + } else { + omit_log = std::max(omit_log, logcounts[i]); + } + } + } + logcounts[omit_pos] = omit_log; + + // Elias gamma-like code for shift. Only difference is that if the number + // of bits to be encoded is equal to FloorLog2(ANS_LOG_TAB_SIZE+1), we skip + // the terminating 0 in unary coding. + int upper_bound_log = FloorLog2Nonzero(ANS_LOG_TAB_SIZE + 1); + int log = FloorLog2Nonzero(shift + 1); + writer->Write(log, (1 << log) - 1); + if (log != upper_bound_log) writer->Write(1, 0); + writer->Write(log, ((1 << log) - 1) & (shift + 1)); + + // Since num_symbols >= 3, we know that length >= 3, therefore we encode + // length - 3. + if (length - 3 > 255) { + // Pretend that everything is OK, but complain about correctness later. + StoreVarLenUint8(255, writer); + ok = false; + } else { + StoreVarLenUint8(length - 3, writer); + } + + // The logcount values are encoded with a static Huffman code. + static const size_t kMinReps = 4; + size_t rep = ANS_LOG_TAB_SIZE + 1; + for (int i = 0; i < length; ++i) { + if (i > 0 && same[i - 1] > kMinReps) { + // Encode the RLE symbol and skip the repeated ones. + writer->Write(kLogCountBitLengths[rep], kLogCountSymbols[rep]); + StoreVarLenUint8(same[i - 1] - kMinReps - 1, writer); + i += same[i - 1] - 2; + continue; + } + writer->Write(kLogCountBitLengths[logcounts[i]], + kLogCountSymbols[logcounts[i]]); + } + for (int i = 0; i < length; ++i) { + if (i > 0 && same[i - 1] > kMinReps) { + // Skip symbols encoded by RLE. + i += same[i - 1] - 2; + continue; + } + if (logcounts[i] > 1 && i != omit_pos) { + int bitcount = GetPopulationCountPrecision(logcounts[i] - 1, shift); + int drop_bits = logcounts[i] - 1 - bitcount; + JXL_CHECK((counts[i] & ((1 << drop_bits) - 1)) == 0); + writer->Write(bitcount, (counts[i] >> drop_bits) - (1 << bitcount)); + } + } + } + return ok; +} + +void EncodeFlatHistogram(const int alphabet_size, BitWriter* writer) { + // Mark non-small tree. + writer->Write(1, 0); + // Mark uniform histogram. + writer->Write(1, 1); + JXL_ASSERT(alphabet_size > 0); + // Encode alphabet size. + StoreVarLenUint8(alphabet_size - 1, writer); +} + +float ComputeHistoAndDataCost(const ANSHistBin* histogram, size_t alphabet_size, + uint32_t method) { + if (method == 0) { // Flat code + return ANS_LOG_TAB_SIZE + 2 + + EstimateDataBitsFlat(histogram, alphabet_size); + } + // Non-flat: shift = method-1. + uint32_t shift = method - 1; + std::vector counts(histogram, histogram + alphabet_size); + int omit_pos = 0; + int num_symbols; + int symbols[kMaxNumSymbolsForSmallCode] = {}; + JXL_CHECK(NormalizeCounts(counts.data(), &omit_pos, alphabet_size, + ANS_LOG_TAB_SIZE, shift, &num_symbols, symbols)); + SizeWriter writer; + // Ignore the correctness, no real encoding happens at this stage. + (void)EncodeCounts(counts.data(), alphabet_size, omit_pos, num_symbols, shift, + symbols, &writer); + return writer.size + + EstimateDataBits(histogram, counts.data(), alphabet_size); +} + +uint32_t ComputeBestMethod( + const ANSHistBin* histogram, size_t alphabet_size, float* cost, + HistogramParams::ANSHistogramStrategy ans_histogram_strategy) { + size_t method = 0; + float fcost = ComputeHistoAndDataCost(histogram, alphabet_size, 0); + auto try_shift = [&](size_t shift) { + float c = ComputeHistoAndDataCost(histogram, alphabet_size, shift + 1); + if (c < fcost) { + method = shift + 1; + fcost = c; + } + }; + switch (ans_histogram_strategy) { + case HistogramParams::ANSHistogramStrategy::kPrecise: { + for (uint32_t shift = 0; shift <= ANS_LOG_TAB_SIZE; shift++) { + try_shift(shift); + } + break; + } + case HistogramParams::ANSHistogramStrategy::kApproximate: { + for (uint32_t shift = 0; shift <= ANS_LOG_TAB_SIZE; shift += 2) { + try_shift(shift); + } + break; + } + case HistogramParams::ANSHistogramStrategy::kFast: { + try_shift(0); + try_shift(ANS_LOG_TAB_SIZE / 2); + try_shift(ANS_LOG_TAB_SIZE); + break; + } + }; + *cost = fcost; + return method; +} + +} // namespace + +// Returns an estimate of the cost of encoding this histogram and the +// corresponding data. +size_t BuildAndStoreANSEncodingData( + HistogramParams::ANSHistogramStrategy ans_histogram_strategy, + const ANSHistBin* histogram, size_t alphabet_size, size_t log_alpha_size, + bool use_prefix_code, ANSEncSymbolInfo* info, BitWriter* writer) { + if (use_prefix_code) { + if (alphabet_size <= 1) return 0; + std::vector histo(alphabet_size); + for (size_t i = 0; i < alphabet_size; i++) { + histo[i] = histogram[i]; + JXL_CHECK(histogram[i] >= 0); + } + size_t cost = 0; + { + std::vector depths(alphabet_size); + std::vector bits(alphabet_size); + if (writer == nullptr) { + BitWriter tmp_writer; + BitWriter::Allotment allotment( + &tmp_writer, 8 * alphabet_size + 8); // safe upper bound + BuildAndStoreHuffmanTree(histo.data(), alphabet_size, depths.data(), + bits.data(), &tmp_writer); + allotment.ReclaimAndCharge(&tmp_writer, 0, /*aux_out=*/nullptr); + cost = tmp_writer.BitsWritten(); + } else { + size_t start = writer->BitsWritten(); + BuildAndStoreHuffmanTree(histo.data(), alphabet_size, depths.data(), + bits.data(), writer); + cost = writer->BitsWritten() - start; + } + for (size_t i = 0; i < alphabet_size; i++) { + info[i].bits = depths[i] == 0 ? 0 : bits[i]; + info[i].depth = depths[i]; + } + } + // Estimate data cost. + for (size_t i = 0; i < alphabet_size; i++) { + cost += histogram[i] * info[i].depth; + } + return cost; + } + JXL_ASSERT(alphabet_size <= ANS_TAB_SIZE); + // Ensure we ignore trailing zeros in the histogram. + if (alphabet_size != 0) { + size_t largest_symbol = 0; + for (size_t i = 0; i < alphabet_size; i++) { + if (histogram[i] != 0) largest_symbol = i; + } + alphabet_size = largest_symbol + 1; + } + float cost; + uint32_t method = ComputeBestMethod(histogram, alphabet_size, &cost, + ans_histogram_strategy); + JXL_ASSERT(cost >= 0); + int num_symbols; + int symbols[kMaxNumSymbolsForSmallCode] = {}; + std::vector counts(histogram, histogram + alphabet_size); + if (!counts.empty()) { + size_t sum = 0; + for (size_t i = 0; i < counts.size(); i++) { + sum += counts[i]; + } + if (sum == 0) { + counts[0] = ANS_TAB_SIZE; + } + } + if (method == 0) { + counts = CreateFlatHistogram(alphabet_size, ANS_TAB_SIZE); + AliasTable::Entry a[ANS_MAX_ALPHABET_SIZE]; + InitAliasTable(counts, ANS_TAB_SIZE, log_alpha_size, a); + ANSBuildInfoTable(counts.data(), a, alphabet_size, log_alpha_size, info); + if (writer != nullptr) { + EncodeFlatHistogram(alphabet_size, writer); + } + return cost; + } + int omit_pos = 0; + uint32_t shift = method - 1; + JXL_CHECK(NormalizeCounts(counts.data(), &omit_pos, alphabet_size, + ANS_LOG_TAB_SIZE, shift, &num_symbols, symbols)); + AliasTable::Entry a[ANS_MAX_ALPHABET_SIZE]; + InitAliasTable(counts, ANS_TAB_SIZE, log_alpha_size, a); + ANSBuildInfoTable(counts.data(), a, alphabet_size, log_alpha_size, info); + if (writer != nullptr) { + bool ok = EncodeCounts(counts.data(), alphabet_size, omit_pos, num_symbols, + shift, symbols, writer); + (void)ok; + JXL_DASSERT(ok); + } + return cost; +} + +float ANSPopulationCost(const ANSHistBin* data, size_t alphabet_size) { + float c; + ComputeBestMethod(data, alphabet_size, &c, + HistogramParams::ANSHistogramStrategy::kFast); + return c; +} + +template +void EncodeUintConfig(const HybridUintConfig uint_config, Writer* writer, + size_t log_alpha_size) { + writer->Write(CeilLog2Nonzero(log_alpha_size + 1), + uint_config.split_exponent); + if (uint_config.split_exponent == log_alpha_size) { + return; // msb/lsb don't matter. + } + size_t nbits = CeilLog2Nonzero(uint_config.split_exponent + 1); + writer->Write(nbits, uint_config.msb_in_token); + nbits = CeilLog2Nonzero(uint_config.split_exponent - + uint_config.msb_in_token + 1); + writer->Write(nbits, uint_config.lsb_in_token); +} +template +void EncodeUintConfigs(const std::vector& uint_config, + Writer* writer, size_t log_alpha_size) { + // TODO(veluca): RLE? + for (size_t i = 0; i < uint_config.size(); i++) { + EncodeUintConfig(uint_config[i], writer, log_alpha_size); + } +} +template void EncodeUintConfigs(const std::vector&, + BitWriter*, size_t); + +namespace { + +void ChooseUintConfigs(const HistogramParams& params, + const std::vector>& tokens, + const std::vector& context_map, + std::vector* clustered_histograms, + EntropyEncodingData* codes, size_t* log_alpha_size) { + codes->uint_config.resize(clustered_histograms->size()); + + if (params.uint_method == HistogramParams::HybridUintMethod::kNone) return; + if (params.uint_method == HistogramParams::HybridUintMethod::k000) { + codes->uint_config.clear(); + codes->uint_config.resize(clustered_histograms->size(), + HybridUintConfig(0, 0, 0)); + return; + } + if (params.uint_method == HistogramParams::HybridUintMethod::kContextMap) { + codes->uint_config.clear(); + codes->uint_config.resize(clustered_histograms->size(), + HybridUintConfig(2, 0, 1)); + return; + } + + // Brute-force method that tries a few options. + std::vector configs; + if (params.uint_method == HistogramParams::HybridUintMethod::kBest) { + configs = { + HybridUintConfig(4, 2, 0), // default + HybridUintConfig(4, 1, 0), // less precise + HybridUintConfig(4, 2, 1), // add sign + HybridUintConfig(4, 2, 2), // add sign+parity + HybridUintConfig(4, 1, 2), // add parity but less msb + // Same as above, but more direct coding. + HybridUintConfig(5, 2, 0), HybridUintConfig(5, 1, 0), + HybridUintConfig(5, 2, 1), HybridUintConfig(5, 2, 2), + HybridUintConfig(5, 1, 2), + // Same as above, but less direct coding. + HybridUintConfig(3, 2, 0), HybridUintConfig(3, 1, 0), + HybridUintConfig(3, 2, 1), HybridUintConfig(3, 1, 2), + // For near-lossless. + HybridUintConfig(4, 1, 3), HybridUintConfig(5, 1, 4), + HybridUintConfig(5, 2, 3), HybridUintConfig(6, 1, 5), + HybridUintConfig(6, 2, 4), HybridUintConfig(6, 0, 0), + // Other + HybridUintConfig(0, 0, 0), // varlenuint + HybridUintConfig(2, 0, 1), // works well for ctx map + HybridUintConfig(7, 0, 0), // direct coding + HybridUintConfig(8, 0, 0), // direct coding + HybridUintConfig(9, 0, 0), // direct coding + HybridUintConfig(10, 0, 0), // direct coding + HybridUintConfig(11, 0, 0), // direct coding + HybridUintConfig(12, 0, 0), // direct coding + }; + } else if (params.uint_method == HistogramParams::HybridUintMethod::kFast) { + configs = { + HybridUintConfig(4, 2, 0), // default + HybridUintConfig(4, 1, 2), // add parity but less msb + HybridUintConfig(0, 0, 0), // smallest histograms + HybridUintConfig(2, 0, 1), // works well for ctx map + }; + } + + std::vector costs(clustered_histograms->size(), + std::numeric_limits::max()); + std::vector extra_bits(clustered_histograms->size()); + std::vector is_valid(clustered_histograms->size()); + size_t max_alpha = + codes->use_prefix_code ? PREFIX_MAX_ALPHABET_SIZE : ANS_MAX_ALPHABET_SIZE; + for (HybridUintConfig cfg : configs) { + std::fill(is_valid.begin(), is_valid.end(), true); + std::fill(extra_bits.begin(), extra_bits.end(), 0); + + for (size_t i = 0; i < clustered_histograms->size(); i++) { + (*clustered_histograms)[i].Clear(); + } + for (size_t i = 0; i < tokens.size(); ++i) { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token token = tokens[i][j]; + // TODO(veluca): do not ignore lz77 commands. + if (token.is_lz77_length) continue; + size_t histo = context_map[token.context]; + uint32_t tok, nbits, bits; + cfg.Encode(token.value, &tok, &nbits, &bits); + if (tok >= max_alpha || + (codes->lz77.enabled && tok >= codes->lz77.min_symbol)) { + is_valid[histo] = false; + continue; + } + extra_bits[histo] += nbits; + (*clustered_histograms)[histo].Add(tok); + } + } + + for (size_t i = 0; i < clustered_histograms->size(); i++) { + if (!is_valid[i]) continue; + float cost = (*clustered_histograms)[i].PopulationCost() + extra_bits[i]; + // add signaling cost of the hybriduintconfig itself + cost += CeilLog2Nonzero(cfg.split_exponent + 1); + cost += CeilLog2Nonzero(cfg.split_exponent - cfg.msb_in_token + 1); + if (cost < costs[i]) { + codes->uint_config[i] = cfg; + costs[i] = cost; + } + } + } + + // Rebuild histograms. + for (size_t i = 0; i < clustered_histograms->size(); i++) { + (*clustered_histograms)[i].Clear(); + } + *log_alpha_size = 4; + for (size_t i = 0; i < tokens.size(); ++i) { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token token = tokens[i][j]; + uint32_t tok, nbits, bits; + size_t histo = context_map[token.context]; + (token.is_lz77_length ? codes->lz77.length_uint_config + : codes->uint_config[histo]) + .Encode(token.value, &tok, &nbits, &bits); + tok += token.is_lz77_length ? codes->lz77.min_symbol : 0; + (*clustered_histograms)[histo].Add(tok); + while (tok >= (1u << *log_alpha_size)) (*log_alpha_size)++; + } + } +#if JXL_ENABLE_ASSERT + size_t max_log_alpha_size = codes->use_prefix_code ? PREFIX_MAX_BITS : 8; + JXL_ASSERT(*log_alpha_size <= max_log_alpha_size); +#endif +} + +class HistogramBuilder { + public: + explicit HistogramBuilder(const size_t num_contexts) + : histograms_(num_contexts) {} + + void VisitSymbol(int symbol, size_t histo_idx) { + JXL_DASSERT(histo_idx < histograms_.size()); + histograms_[histo_idx].Add(symbol); + } + + // NOTE: `layer` is only for clustered_entropy; caller does ReclaimAndCharge. + size_t BuildAndStoreEntropyCodes( + const HistogramParams& params, + const std::vector>& tokens, EntropyEncodingData* codes, + std::vector* context_map, bool use_prefix_code, + BitWriter* writer, size_t layer, AuxOut* aux_out) const { + size_t cost = 0; + codes->encoding_info.clear(); + std::vector clustered_histograms(histograms_); + context_map->resize(histograms_.size()); + if (histograms_.size() > 1) { + if (!ans_fuzzer_friendly_) { + std::vector histogram_symbols; + ClusterHistograms(params, histograms_, kClustersLimit, + &clustered_histograms, &histogram_symbols); + for (size_t c = 0; c < histograms_.size(); ++c) { + (*context_map)[c] = static_cast(histogram_symbols[c]); + } + } else { + fill(context_map->begin(), context_map->end(), 0); + size_t max_symbol = 0; + for (const Histogram& h : histograms_) { + max_symbol = std::max(h.data_.size(), max_symbol); + } + size_t num_symbols = 1 << CeilLog2Nonzero(max_symbol + 1); + clustered_histograms.resize(1); + clustered_histograms[0].Clear(); + for (size_t i = 0; i < num_symbols; i++) { + clustered_histograms[0].Add(i); + } + } + if (writer != nullptr) { + EncodeContextMap(*context_map, clustered_histograms.size(), writer, + layer, aux_out); + } + } + if (aux_out != nullptr) { + for (size_t i = 0; i < clustered_histograms.size(); ++i) { + aux_out->layers[layer].clustered_entropy += + clustered_histograms[i].ShannonEntropy(); + } + } + codes->use_prefix_code = use_prefix_code; + size_t log_alpha_size = codes->lz77.enabled ? 8 : 7; // Sane default. + if (ans_fuzzer_friendly_) { + codes->uint_config.clear(); + codes->uint_config.resize(1, HybridUintConfig(7, 0, 0)); + } else { + ChooseUintConfigs(params, tokens, *context_map, &clustered_histograms, + codes, &log_alpha_size); + } + if (log_alpha_size < 5) log_alpha_size = 5; + SizeWriter size_writer; // Used if writer == nullptr to estimate costs. + cost += 1; + if (writer) writer->Write(1, use_prefix_code); + + if (use_prefix_code) { + log_alpha_size = PREFIX_MAX_BITS; + } else { + cost += 2; + } + if (writer == nullptr) { + EncodeUintConfigs(codes->uint_config, &size_writer, log_alpha_size); + } else { + if (!use_prefix_code) writer->Write(2, log_alpha_size - 5); + EncodeUintConfigs(codes->uint_config, writer, log_alpha_size); + } + if (use_prefix_code) { + for (size_t c = 0; c < clustered_histograms.size(); ++c) { + size_t num_symbol = 1; + for (size_t i = 0; i < clustered_histograms[c].data_.size(); i++) { + if (clustered_histograms[c].data_[i]) num_symbol = i + 1; + } + if (writer) { + StoreVarLenUint16(num_symbol - 1, writer); + } else { + StoreVarLenUint16(num_symbol - 1, &size_writer); + } + } + } + cost += size_writer.size; + for (size_t c = 0; c < clustered_histograms.size(); ++c) { + size_t num_symbol = 1; + for (size_t i = 0; i < clustered_histograms[c].data_.size(); i++) { + if (clustered_histograms[c].data_[i]) num_symbol = i + 1; + } + codes->encoding_info.emplace_back(); + codes->encoding_info.back().resize(std::max(1, num_symbol)); + + BitWriter::Allotment allotment(writer, 256 + num_symbol * 24); + cost += BuildAndStoreANSEncodingData( + params.ans_histogram_strategy, clustered_histograms[c].data_.data(), + num_symbol, log_alpha_size, use_prefix_code, + codes->encoding_info.back().data(), writer); + allotment.FinishedHistogram(writer); + allotment.ReclaimAndCharge(writer, layer, aux_out); + } + return cost; + } + + const Histogram& Histo(size_t i) const { return histograms_[i]; } + + private: + std::vector histograms_; +}; + +class SymbolCostEstimator { + public: + SymbolCostEstimator(size_t num_contexts, bool force_huffman, + const std::vector>& tokens, + const LZ77Params& lz77) { + HistogramBuilder builder(num_contexts); + // Build histograms for estimating lz77 savings. + HybridUintConfig uint_config; + for (size_t i = 0; i < tokens.size(); ++i) { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token token = tokens[i][j]; + uint32_t tok, nbits, bits; + (token.is_lz77_length ? lz77.length_uint_config : uint_config) + .Encode(token.value, &tok, &nbits, &bits); + tok += token.is_lz77_length ? lz77.min_symbol : 0; + builder.VisitSymbol(tok, token.context); + } + } + max_alphabet_size_ = 0; + for (size_t i = 0; i < num_contexts; i++) { + max_alphabet_size_ = + std::max(max_alphabet_size_, builder.Histo(i).data_.size()); + } + bits_.resize(num_contexts * max_alphabet_size_); + // TODO(veluca): SIMD? + add_symbol_cost_.resize(num_contexts); + for (size_t i = 0; i < num_contexts; i++) { + float inv_total = 1.0f / (builder.Histo(i).total_count_ + 1e-8f); + float total_cost = 0; + for (size_t j = 0; j < builder.Histo(i).data_.size(); j++) { + size_t cnt = builder.Histo(i).data_[j]; + float cost = 0; + if (cnt != 0 && cnt != builder.Histo(i).total_count_) { + cost = -FastLog2f(cnt * inv_total); + if (force_huffman) cost = std::ceil(cost); + } else if (cnt == 0) { + cost = ANS_LOG_TAB_SIZE; // Highest possible cost. + } + bits_[i * max_alphabet_size_ + j] = cost; + total_cost += cost * builder.Histo(i).data_[j]; + } + // Penalty for adding a lz77 symbol to this contest (only used for static + // cost model). Higher penalty for contexts that have a very low + // per-symbol entropy. + add_symbol_cost_[i] = std::max(0.0f, 6.0f - total_cost * inv_total); + } + } + float Bits(size_t ctx, size_t sym) const { + return bits_[ctx * max_alphabet_size_ + sym]; + } + float LenCost(size_t ctx, size_t len, const LZ77Params& lz77) const { + uint32_t nbits, bits, tok; + lz77.length_uint_config.Encode(len, &tok, &nbits, &bits); + tok += lz77.min_symbol; + return nbits + Bits(ctx, tok); + } + float DistCost(size_t len, const LZ77Params& lz77) const { + uint32_t nbits, bits, tok; + HybridUintConfig().Encode(len, &tok, &nbits, &bits); + return nbits + Bits(lz77.nonserialized_distance_context, tok); + } + float AddSymbolCost(size_t idx) const { return add_symbol_cost_[idx]; } + + private: + size_t max_alphabet_size_; + std::vector bits_; + std::vector add_symbol_cost_; +}; + +void ApplyLZ77_RLE(const HistogramParams& params, size_t num_contexts, + const std::vector>& tokens, + LZ77Params& lz77, + std::vector>& tokens_lz77) { + // TODO(veluca): tune heuristics here. + SymbolCostEstimator sce(num_contexts, params.force_huffman, tokens, lz77); + float bit_decrease = 0; + size_t total_symbols = 0; + tokens_lz77.resize(tokens.size()); + std::vector sym_cost; + HybridUintConfig uint_config; + for (size_t stream = 0; stream < tokens.size(); stream++) { + size_t distance_multiplier = + params.image_widths.size() > stream ? params.image_widths[stream] : 0; + const auto& in = tokens[stream]; + auto& out = tokens_lz77[stream]; + total_symbols += in.size(); + // Cumulative sum of bit costs. + sym_cost.resize(in.size() + 1); + for (size_t i = 0; i < in.size(); i++) { + uint32_t tok, nbits, unused_bits; + uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits); + sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i]; + } + out.reserve(in.size()); + for (size_t i = 0; i < in.size(); i++) { + size_t num_to_copy = 0; + size_t distance_symbol = 0; // 1 for RLE. + if (distance_multiplier != 0) { + distance_symbol = 1; // Special distance 1 if enabled. + JXL_DASSERT(kSpecialDistances[1][0] == 1); + JXL_DASSERT(kSpecialDistances[1][1] == 0); + } + if (i > 0) { + for (; i + num_to_copy < in.size(); num_to_copy++) { + if (in[i + num_to_copy].value != in[i - 1].value) { + break; + } + } + } + if (num_to_copy == 0) { + out.push_back(in[i]); + continue; + } + float cost = sym_cost[i + num_to_copy] - sym_cost[i]; + // This subtraction might overflow, but that's OK. + size_t lz77_len = num_to_copy - lz77.min_length; + float lz77_cost = num_to_copy >= lz77.min_length + ? CeilLog2Nonzero(lz77_len + 1) + 1 + : 0; + if (num_to_copy < lz77.min_length || cost <= lz77_cost) { + for (size_t j = 0; j < num_to_copy; j++) { + out.push_back(in[i + j]); + } + i += num_to_copy - 1; + continue; + } + // Output the LZ77 length + out.emplace_back(in[i].context, lz77_len); + out.back().is_lz77_length = true; + i += num_to_copy - 1; + bit_decrease += cost - lz77_cost; + // Output the LZ77 copy distance. + out.emplace_back(lz77.nonserialized_distance_context, distance_symbol); + } + } + + if (bit_decrease > total_symbols * 0.2 + 16) { + lz77.enabled = true; + } +} + +// Hash chain for LZ77 matching +struct HashChain { + size_t size_; + std::vector data_; + + unsigned hash_num_values_ = 32768; + unsigned hash_mask_ = hash_num_values_ - 1; + unsigned hash_shift_ = 5; + + std::vector head; + std::vector chain; + std::vector val; + + // Speed up repetitions of zero + std::vector headz; + std::vector chainz; + std::vector zeros; + uint32_t numzeros = 0; + + size_t window_size_; + size_t window_mask_; + size_t min_length_; + size_t max_length_; + + // Map of special distance codes. + std::unordered_map special_dist_table_; + size_t num_special_distances_ = 0; + + uint32_t maxchainlength = 256; // window_size_ to allow all + + HashChain(const Token* data, size_t size, size_t window_size, + size_t min_length, size_t max_length, size_t distance_multiplier) + : size_(size), + window_size_(window_size), + window_mask_(window_size - 1), + min_length_(min_length), + max_length_(max_length) { + data_.resize(size); + for (size_t i = 0; i < size; i++) { + data_[i] = data[i].value; + } + + head.resize(hash_num_values_, -1); + val.resize(window_size_, -1); + chain.resize(window_size_); + for (uint32_t i = 0; i < window_size_; ++i) { + chain[i] = i; // same value as index indicates uninitialized + } + + zeros.resize(window_size_); + headz.resize(window_size_ + 1, -1); + chainz.resize(window_size_); + for (uint32_t i = 0; i < window_size_; ++i) { + chainz[i] = i; + } + // Translate distance to special distance code. + if (distance_multiplier) { + // Count down, so if due to small distance multiplier multiple distances + // map to the same code, the smallest code will be used in the end. + for (int i = kNumSpecialDistances - 1; i >= 0; --i) { + int xi = kSpecialDistances[i][0]; + int yi = kSpecialDistances[i][1]; + int distance = yi * distance_multiplier + xi; + // Ensure that we map distance 1 to the lowest symbols. + if (distance < 1) distance = 1; + special_dist_table_[distance] = i; + } + num_special_distances_ = kNumSpecialDistances; + } + } + + uint32_t GetHash(size_t pos) const { + uint32_t result = 0; + if (pos + 2 < size_) { + // TODO(lode): take the MSB's of the uint32_t values into account as well, + // given that the hash code itself is less than 32 bits. + result ^= (uint32_t)(data_[pos + 0] << 0u); + result ^= (uint32_t)(data_[pos + 1] << hash_shift_); + result ^= (uint32_t)(data_[pos + 2] << (hash_shift_ * 2)); + } else { + // No need to compute hash of last 2 bytes, the length 2 is too short. + return 0; + } + return result & hash_mask_; + } + + uint32_t CountZeros(size_t pos, uint32_t prevzeros) const { + size_t end = pos + window_size_; + if (end > size_) end = size_; + if (prevzeros > 0) { + if (prevzeros >= window_mask_ && data_[end - 1] == 0 && + end == pos + window_size_) { + return prevzeros; + } else { + return prevzeros - 1; + } + } + uint32_t num = 0; + while (pos + num < end && data_[pos + num] == 0) num++; + return num; + } + + void Update(size_t pos) { + uint32_t hashval = GetHash(pos); + uint32_t wpos = pos & window_mask_; + + val[wpos] = (int)hashval; + if (head[hashval] != -1) chain[wpos] = head[hashval]; + head[hashval] = wpos; + + if (pos > 0 && data_[pos] != data_[pos - 1]) numzeros = 0; + numzeros = CountZeros(pos, numzeros); + + zeros[wpos] = numzeros; + if (headz[numzeros] != -1) chainz[wpos] = headz[numzeros]; + headz[numzeros] = wpos; + } + + void Update(size_t pos, size_t len) { + for (size_t i = 0; i < len; i++) { + Update(pos + i); + } + } + + template + void FindMatches(size_t pos, int max_dist, const CB& found_match) const { + uint32_t wpos = pos & window_mask_; + uint32_t hashval = GetHash(pos); + uint32_t hashpos = chain[wpos]; + + int prev_dist = 0; + int end = std::min(pos + max_length_, size_); + uint32_t chainlength = 0; + uint32_t best_len = 0; + for (;;) { + int dist = (hashpos <= wpos) ? (wpos - hashpos) + : (wpos - hashpos + window_mask_ + 1); + if (dist < prev_dist) break; + prev_dist = dist; + uint32_t len = 0; + if (dist > 0) { + int i = pos; + int j = pos - dist; + if (numzeros > 3) { + int r = std::min(numzeros - 1, zeros[hashpos]); + if (i + r >= end) r = end - i - 1; + i += r; + j += r; + } + while (i < end && data_[i] == data_[j]) { + i++; + j++; + } + len = i - pos; + // This can trigger even if the new length is slightly smaller than the + // best length, because it is possible for a slightly cheaper distance + // symbol to occur. + if (len >= min_length_ && len + 2 >= best_len) { + auto it = special_dist_table_.find(dist); + int dist_symbol = (it == special_dist_table_.end()) + ? (num_special_distances_ + dist - 1) + : it->second; + found_match(len, dist_symbol); + if (len > best_len) best_len = len; + } + } + + chainlength++; + if (chainlength >= maxchainlength) break; + + if (numzeros >= 3 && len > numzeros) { + if (hashpos == chainz[hashpos]) break; + hashpos = chainz[hashpos]; + if (zeros[hashpos] != numzeros) break; + } else { + if (hashpos == chain[hashpos]) break; + hashpos = chain[hashpos]; + if (val[hashpos] != (int)hashval) break; // outdated hash value + } + } + } + void FindMatch(size_t pos, int max_dist, size_t* result_dist_symbol, + size_t* result_len) const { + *result_dist_symbol = 0; + *result_len = 1; + FindMatches(pos, max_dist, [&](size_t len, size_t dist_symbol) { + if (len > *result_len || + (len == *result_len && *result_dist_symbol > dist_symbol)) { + *result_len = len; + *result_dist_symbol = dist_symbol; + } + }); + } +}; + +float LenCost(size_t len) { + uint32_t nbits, bits, tok; + HybridUintConfig(1, 0, 0).Encode(len, &tok, &nbits, &bits); + constexpr float kCostTable[] = { + 2.797667318563126, 3.213177690381199, 2.5706009246743737, + 2.408392498667534, 2.829649191872326, 3.3923087753324577, + 4.029267451554331, 4.415576699706408, 4.509357574741465, + 9.21481543803004, 10.020590190114898, 11.858671627804766, + 12.45853300490526, 11.713105831990857, 12.561996324849314, + 13.775477692278367, 13.174027068768641, + }; + size_t table_size = sizeof kCostTable / sizeof *kCostTable; + if (tok >= table_size) tok = table_size - 1; + return kCostTable[tok] + nbits; +} + +// TODO(veluca): this does not take into account usage or non-usage of distance +// multipliers. +float DistCost(size_t dist) { + uint32_t nbits, bits, tok; + HybridUintConfig(7, 0, 0).Encode(dist, &tok, &nbits, &bits); + constexpr float kCostTable[] = { + 6.368282626312716, 5.680793277090298, 8.347404197105247, + 7.641619201599141, 6.914328374119438, 7.959808291537444, + 8.70023120759855, 8.71378518934703, 9.379132523982769, + 9.110472749092708, 9.159029569270908, 9.430936766731973, + 7.278284055315169, 7.8278514904267755, 10.026641158289236, + 9.976049229827066, 9.64351607048908, 9.563403863480442, + 10.171474111762747, 10.45950155077234, 9.994813912104219, + 10.322524683741156, 8.465808729388186, 8.756254166066853, + 10.160930174662234, 10.247329273413435, 10.04090403724809, + 10.129398517544082, 9.342311691539546, 9.07608009102374, + 10.104799540677513, 10.378079384990906, 10.165828974075072, + 10.337595322341553, 7.940557464567944, 10.575665823319431, + 11.023344321751955, 10.736144698831827, 11.118277044595054, + 7.468468230648442, 10.738305230932939, 10.906980780216568, + 10.163468216353817, 10.17805759656433, 11.167283670483565, + 11.147050200274544, 10.517921919244333, 10.651764778156886, + 10.17074446448919, 11.217636876224745, 11.261630721139484, + 11.403140815247259, 10.892472096873417, 11.1859607804481, + 8.017346947551262, 7.895143720278828, 11.036577113822025, + 11.170562110315794, 10.326988722591086, 10.40872184751056, + 11.213498225466386, 11.30580635516863, 10.672272515665442, + 10.768069466228063, 11.145257364153565, 11.64668307145549, + 10.593156194627339, 11.207499484844943, 10.767517766396908, + 10.826629811407042, 10.737764794499988, 10.6200448518045, + 10.191315385198092, 8.468384171390085, 11.731295299170432, + 11.824619886654398, 10.41518844301179, 10.16310536548649, + 10.539423685097576, 10.495136599328031, 10.469112847728267, + 11.72057686174922, 10.910326337834674, 11.378921834673758, + 11.847759036098536, 11.92071647623854, 10.810628276345282, + 11.008601085273893, 11.910326337834674, 11.949212023423133, + 11.298614839104337, 11.611603659010392, 10.472930394619985, + 11.835564720850282, 11.523267392285337, 12.01055816679611, + 8.413029688994023, 11.895784139536406, 11.984679534970505, + 11.220654278717394, 11.716311684833672, 10.61036646226114, + 10.89849965960364, 10.203762898863669, 10.997560826267238, + 11.484217379438984, 11.792836176993665, 12.24310468755171, + 11.464858097919262, 12.212747017409377, 11.425595666074955, + 11.572048533398757, 12.742093965163013, 11.381874288645637, + 12.191870445817015, 11.683156920035426, 11.152442115262197, + 11.90303691580457, 11.653292787169159, 11.938615382266098, + 16.970641701570223, 16.853602280380002, 17.26240782594733, + 16.644655390108507, 17.14310889757499, 16.910935455445955, + 17.505678976959697, 17.213498225466388, 2.4162310293553024, + 3.494587244462329, 3.5258600986408344, 3.4959806589517095, + 3.098390886949687, 3.343454654302911, 3.588847442290287, + 4.14614790111827, 5.152948641990529, 7.433696808092598, + 9.716311684833672, + }; + size_t table_size = sizeof kCostTable / sizeof *kCostTable; + if (tok >= table_size) tok = table_size - 1; + return kCostTable[tok] + nbits; +} + +void ApplyLZ77_LZ77(const HistogramParams& params, size_t num_contexts, + const std::vector>& tokens, + LZ77Params& lz77, + std::vector>& tokens_lz77) { + // TODO(veluca): tune heuristics here. + SymbolCostEstimator sce(num_contexts, params.force_huffman, tokens, lz77); + float bit_decrease = 0; + size_t total_symbols = 0; + tokens_lz77.resize(tokens.size()); + HybridUintConfig uint_config; + std::vector sym_cost; + for (size_t stream = 0; stream < tokens.size(); stream++) { + size_t distance_multiplier = + params.image_widths.size() > stream ? params.image_widths[stream] : 0; + const auto& in = tokens[stream]; + auto& out = tokens_lz77[stream]; + total_symbols += in.size(); + // Cumulative sum of bit costs. + sym_cost.resize(in.size() + 1); + for (size_t i = 0; i < in.size(); i++) { + uint32_t tok, nbits, unused_bits; + uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits); + sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i]; + } + + out.reserve(in.size()); + size_t max_distance = in.size(); + size_t min_length = lz77.min_length; + JXL_ASSERT(min_length >= 3); + size_t max_length = in.size(); + + // Use next power of two as window size. + size_t window_size = 1; + while (window_size < max_distance && window_size < kWindowSize) { + window_size <<= 1; + } + + HashChain chain(in.data(), in.size(), window_size, min_length, max_length, + distance_multiplier); + size_t len, dist_symbol; + + const size_t max_lazy_match_len = 256; // 0 to disable lazy matching + + // Whether the next symbol was already updated (to test lazy matching) + bool already_updated = false; + for (size_t i = 0; i < in.size(); i++) { + out.push_back(in[i]); + if (!already_updated) chain.Update(i); + already_updated = false; + chain.FindMatch(i, max_distance, &dist_symbol, &len); + if (len >= min_length) { + if (len < max_lazy_match_len && i + 1 < in.size()) { + // Try length at next symbol lazy matching + chain.Update(i + 1); + already_updated = true; + size_t len2, dist_symbol2; + chain.FindMatch(i + 1, max_distance, &dist_symbol2, &len2); + if (len2 > len) { + // Use the lazy match. Add literal, and use the next length starting + // from the next byte. + ++i; + already_updated = false; + len = len2; + dist_symbol = dist_symbol2; + out.push_back(in[i]); + } + } + + float cost = sym_cost[i + len] - sym_cost[i]; + size_t lz77_len = len - lz77.min_length; + float lz77_cost = LenCost(lz77_len) + DistCost(dist_symbol) + + sce.AddSymbolCost(out.back().context); + + if (lz77_cost <= cost) { + out.back().value = len - min_length; + out.back().is_lz77_length = true; + out.emplace_back(lz77.nonserialized_distance_context, dist_symbol); + bit_decrease += cost - lz77_cost; + } else { + // LZ77 match ignored, and symbol already pushed. Push all other + // symbols and skip. + for (size_t j = 1; j < len; j++) { + out.push_back(in[i + j]); + } + } + + if (already_updated) { + chain.Update(i + 2, len - 2); + already_updated = false; + } else { + chain.Update(i + 1, len - 1); + } + i += len - 1; + } else { + // Literal, already pushed + } + } + } + + if (bit_decrease > total_symbols * 0.2 + 16) { + lz77.enabled = true; + } +} + +void ApplyLZ77_Optimal(const HistogramParams& params, size_t num_contexts, + const std::vector>& tokens, + LZ77Params& lz77, + std::vector>& tokens_lz77) { + std::vector> tokens_for_cost_estimate; + ApplyLZ77_LZ77(params, num_contexts, tokens, lz77, tokens_for_cost_estimate); + // If greedy-LZ77 does not give better compression than no-lz77, no reason to + // run the optimal matching. + if (!lz77.enabled) return; + SymbolCostEstimator sce(num_contexts + 1, params.force_huffman, + tokens_for_cost_estimate, lz77); + tokens_lz77.resize(tokens.size()); + HybridUintConfig uint_config; + std::vector sym_cost; + std::vector dist_symbols; + for (size_t stream = 0; stream < tokens.size(); stream++) { + size_t distance_multiplier = + params.image_widths.size() > stream ? params.image_widths[stream] : 0; + const auto& in = tokens[stream]; + auto& out = tokens_lz77[stream]; + // Cumulative sum of bit costs. + sym_cost.resize(in.size() + 1); + for (size_t i = 0; i < in.size(); i++) { + uint32_t tok, nbits, unused_bits; + uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits); + sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i]; + } + + out.reserve(in.size()); + size_t max_distance = in.size(); + size_t min_length = lz77.min_length; + JXL_ASSERT(min_length >= 3); + size_t max_length = in.size(); + + // Use next power of two as window size. + size_t window_size = 1; + while (window_size < max_distance && window_size < kWindowSize) { + window_size <<= 1; + } + + HashChain chain(in.data(), in.size(), window_size, min_length, max_length, + distance_multiplier); + + struct MatchInfo { + uint32_t len; + uint32_t dist_symbol; + uint32_t ctx; + float total_cost = std::numeric_limits::max(); + }; + // Total cost to encode the first N symbols. + std::vector prefix_costs(in.size() + 1); + prefix_costs[0].total_cost = 0; + + size_t rle_length = 0; + size_t skip_lz77 = 0; + for (size_t i = 0; i < in.size(); i++) { + chain.Update(i); + float lit_cost = + prefix_costs[i].total_cost + sym_cost[i + 1] - sym_cost[i]; + if (prefix_costs[i + 1].total_cost > lit_cost) { + prefix_costs[i + 1].dist_symbol = 0; + prefix_costs[i + 1].len = 1; + prefix_costs[i + 1].ctx = in[i].context; + prefix_costs[i + 1].total_cost = lit_cost; + } + if (skip_lz77 > 0) { + skip_lz77--; + continue; + } + dist_symbols.clear(); + chain.FindMatches(i, max_distance, + [&dist_symbols](size_t len, size_t dist_symbol) { + if (dist_symbols.size() <= len) { + dist_symbols.resize(len + 1, dist_symbol); + } + if (dist_symbol < dist_symbols[len]) { + dist_symbols[len] = dist_symbol; + } + }); + if (dist_symbols.size() <= min_length) continue; + { + size_t best_cost = dist_symbols.back(); + for (size_t j = dist_symbols.size() - 1; j >= min_length; j--) { + if (dist_symbols[j] < best_cost) { + best_cost = dist_symbols[j]; + } + dist_symbols[j] = best_cost; + } + } + for (size_t j = min_length; j < dist_symbols.size(); j++) { + // Cost model that uses results from lazy LZ77. + float lz77_cost = sce.LenCost(in[i].context, j - min_length, lz77) + + sce.DistCost(dist_symbols[j], lz77); + float cost = prefix_costs[i].total_cost + lz77_cost; + if (prefix_costs[i + j].total_cost > cost) { + prefix_costs[i + j].len = j; + prefix_costs[i + j].dist_symbol = dist_symbols[j] + 1; + prefix_costs[i + j].ctx = in[i].context; + prefix_costs[i + j].total_cost = cost; + } + } + // We are in a RLE sequence: skip all the symbols except the first 8 and + // the last 8. This avoid quadratic costs for sequences with long runs of + // the same symbol. + if ((dist_symbols.back() == 0 && distance_multiplier == 0) || + (dist_symbols.back() == 1 && distance_multiplier != 0)) { + rle_length++; + } else { + rle_length = 0; + } + if (rle_length >= 8 && dist_symbols.size() > 9) { + skip_lz77 = dist_symbols.size() - 10; + rle_length = 0; + } + } + size_t pos = in.size(); + while (pos > 0) { + bool is_lz77_length = prefix_costs[pos].dist_symbol != 0; + if (is_lz77_length) { + size_t dist_symbol = prefix_costs[pos].dist_symbol - 1; + out.emplace_back(lz77.nonserialized_distance_context, dist_symbol); + } + size_t val = is_lz77_length ? prefix_costs[pos].len - min_length + : in[pos - 1].value; + out.emplace_back(prefix_costs[pos].ctx, val); + out.back().is_lz77_length = is_lz77_length; + pos -= prefix_costs[pos].len; + } + std::reverse(out.begin(), out.end()); + } +} + +void ApplyLZ77(const HistogramParams& params, size_t num_contexts, + const std::vector>& tokens, LZ77Params& lz77, + std::vector>& tokens_lz77) { + lz77.enabled = false; + if (params.force_huffman) { + lz77.min_symbol = std::min(PREFIX_MAX_ALPHABET_SIZE - 32, 512); + } else { + lz77.min_symbol = 224; + } + if (params.lz77_method == HistogramParams::LZ77Method::kNone) { + return; + } else if (params.lz77_method == HistogramParams::LZ77Method::kRLE) { + ApplyLZ77_RLE(params, num_contexts, tokens, lz77, tokens_lz77); + } else if (params.lz77_method == HistogramParams::LZ77Method::kLZ77) { + ApplyLZ77_LZ77(params, num_contexts, tokens, lz77, tokens_lz77); + } else if (params.lz77_method == HistogramParams::LZ77Method::kOptimal) { + ApplyLZ77_Optimal(params, num_contexts, tokens, lz77, tokens_lz77); + } else { + JXL_ABORT("Not implemented"); + } +} +} // namespace + +size_t BuildAndEncodeHistograms(const HistogramParams& params, + size_t num_contexts, + std::vector>& tokens, + EntropyEncodingData* codes, + std::vector* context_map, + BitWriter* writer, size_t layer, + AuxOut* aux_out) { + size_t total_bits = 0; + codes->lz77.nonserialized_distance_context = num_contexts; + std::vector> tokens_lz77; + ApplyLZ77(params, num_contexts, tokens, codes->lz77, tokens_lz77); + if (ans_fuzzer_friendly_) { + codes->lz77.length_uint_config = HybridUintConfig(10, 0, 0); + codes->lz77.min_symbol = 2048; + } + + const size_t max_contexts = std::min(num_contexts, kClustersLimit); + BitWriter::Allotment allotment(writer, + 128 + num_contexts * 40 + max_contexts * 96); + if (writer) { + JXL_CHECK(Bundle::Write(codes->lz77, writer, layer, aux_out)); + } else { + size_t ebits, bits; + JXL_CHECK(Bundle::CanEncode(codes->lz77, &ebits, &bits)); + total_bits += bits; + } + if (codes->lz77.enabled) { + if (writer) { + size_t b = writer->BitsWritten(); + EncodeUintConfig(codes->lz77.length_uint_config, writer, + /*log_alpha_size=*/8); + total_bits += writer->BitsWritten() - b; + } else { + SizeWriter size_writer; + EncodeUintConfig(codes->lz77.length_uint_config, &size_writer, + /*log_alpha_size=*/8); + total_bits += size_writer.size; + } + num_contexts += 1; + tokens = std::move(tokens_lz77); + } + size_t total_tokens = 0; + // Build histograms. + HistogramBuilder builder(num_contexts); + HybridUintConfig uint_config; // Default config for clustering. + // Unless we are using the kContextMap histogram option. + if (params.uint_method == HistogramParams::HybridUintMethod::kContextMap) { + uint_config = HybridUintConfig(2, 0, 1); + } + if (params.uint_method == HistogramParams::HybridUintMethod::k000) { + uint_config = HybridUintConfig(0, 0, 0); + } + if (ans_fuzzer_friendly_) { + uint_config = HybridUintConfig(10, 0, 0); + } + for (size_t i = 0; i < tokens.size(); ++i) { + if (codes->lz77.enabled) { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token& token = tokens[i][j]; + total_tokens++; + uint32_t tok, nbits, bits; + (token.is_lz77_length ? codes->lz77.length_uint_config : uint_config) + .Encode(token.value, &tok, &nbits, &bits); + tok += token.is_lz77_length ? codes->lz77.min_symbol : 0; + builder.VisitSymbol(tok, token.context); + } + } else if (num_contexts == 1) { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token& token = tokens[i][j]; + total_tokens++; + uint32_t tok, nbits, bits; + uint_config.Encode(token.value, &tok, &nbits, &bits); + builder.VisitSymbol(tok, /*token.context=*/0); + } + } else { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token& token = tokens[i][j]; + total_tokens++; + uint32_t tok, nbits, bits; + uint_config.Encode(token.value, &tok, &nbits, &bits); + builder.VisitSymbol(tok, token.context); + } + } + } + + bool use_prefix_code = + params.force_huffman || total_tokens < 100 || + params.clustering == HistogramParams::ClusteringType::kFastest || + ans_fuzzer_friendly_; + if (!use_prefix_code) { + bool all_singleton = true; + for (size_t i = 0; i < num_contexts; i++) { + if (builder.Histo(i).ShannonEntropy() >= 1e-5) { + all_singleton = false; + } + } + if (all_singleton) { + use_prefix_code = true; + } + } + + // Encode histograms. + total_bits += builder.BuildAndStoreEntropyCodes(params, tokens, codes, + context_map, use_prefix_code, + writer, layer, aux_out); + allotment.FinishedHistogram(writer); + allotment.ReclaimAndCharge(writer, layer, aux_out); + + if (aux_out != nullptr) { + aux_out->layers[layer].num_clustered_histograms += + codes->encoding_info.size(); + } + return total_bits; +} + +size_t WriteTokens(const std::vector& tokens, + const EntropyEncodingData& codes, + const std::vector& context_map, BitWriter* writer) { + size_t num_extra_bits = 0; + if (codes.use_prefix_code) { + for (size_t i = 0; i < tokens.size(); i++) { + uint32_t tok, nbits, bits; + const Token& token = tokens[i]; + size_t histo = context_map[token.context]; + (token.is_lz77_length ? codes.lz77.length_uint_config + : codes.uint_config[histo]) + .Encode(token.value, &tok, &nbits, &bits); + tok += token.is_lz77_length ? codes.lz77.min_symbol : 0; + // Combine two calls to the BitWriter. Equivalent to: + // writer->Write(codes.encoding_info[histo][tok].depth, + // codes.encoding_info[histo][tok].bits); + // writer->Write(nbits, bits); + uint64_t data = codes.encoding_info[histo][tok].bits; + data |= bits << codes.encoding_info[histo][tok].depth; + writer->Write(codes.encoding_info[histo][tok].depth + nbits, data); + num_extra_bits += nbits; + } + return num_extra_bits; + } + std::vector out; + std::vector out_nbits; + out.reserve(tokens.size()); + out_nbits.reserve(tokens.size()); + uint64_t allbits = 0; + size_t numallbits = 0; + // Writes in *reversed* order. + auto addbits = [&](size_t bits, size_t nbits) { + if (JXL_UNLIKELY(nbits)) { + JXL_DASSERT(bits >> nbits == 0); + if (JXL_UNLIKELY(numallbits + nbits > BitWriter::kMaxBitsPerCall)) { + out.push_back(allbits); + out_nbits.push_back(numallbits); + numallbits = allbits = 0; + } + allbits <<= nbits; + allbits |= bits; + numallbits += nbits; + } + }; + const int end = tokens.size(); + ANSCoder ans; + if (codes.lz77.enabled || context_map.size() > 1) { + for (int i = end - 1; i >= 0; --i) { + const Token token = tokens[i]; + const uint8_t histo = context_map[token.context]; + uint32_t tok, nbits, bits; + (token.is_lz77_length ? codes.lz77.length_uint_config + : codes.uint_config[histo]) + .Encode(tokens[i].value, &tok, &nbits, &bits); + tok += token.is_lz77_length ? codes.lz77.min_symbol : 0; + const ANSEncSymbolInfo& info = codes.encoding_info[histo][tok]; + // Extra bits first as this is reversed. + addbits(bits, nbits); + num_extra_bits += nbits; + uint8_t ans_nbits = 0; + uint32_t ans_bits = ans.PutSymbol(info, &ans_nbits); + addbits(ans_bits, ans_nbits); + } + } else { + for (int i = end - 1; i >= 0; --i) { + uint32_t tok, nbits, bits; + codes.uint_config[0].Encode(tokens[i].value, &tok, &nbits, &bits); + const ANSEncSymbolInfo& info = codes.encoding_info[0][tok]; + // Extra bits first as this is reversed. + addbits(bits, nbits); + num_extra_bits += nbits; + uint8_t ans_nbits = 0; + uint32_t ans_bits = ans.PutSymbol(info, &ans_nbits); + addbits(ans_bits, ans_nbits); + } + } + const uint32_t state = ans.GetState(); + writer->Write(32, state); + writer->Write(numallbits, allbits); + for (int i = out.size(); i > 0; --i) { + writer->Write(out_nbits[i - 1], out[i - 1]); + } + return num_extra_bits; +} + +void WriteTokens(const std::vector& tokens, + const EntropyEncodingData& codes, + const std::vector& context_map, BitWriter* writer, + size_t layer, AuxOut* aux_out) { + BitWriter::Allotment allotment(writer, 32 * tokens.size() + 32 * 1024 * 4); + size_t num_extra_bits = WriteTokens(tokens, codes, context_map, writer); + allotment.ReclaimAndCharge(writer, layer, aux_out); + if (aux_out != nullptr) { + aux_out->layers[layer].extra_bits += num_extra_bits; + } +} + +void SetANSFuzzerFriendly(bool ans_fuzzer_friendly) { +#if JXL_IS_DEBUG_BUILD // Guard against accidental / malicious changes. + ans_fuzzer_friendly_ = ans_fuzzer_friendly; +#endif +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_ans.h b/third_party/jpeg-xl/lib/jxl/enc_ans.h new file mode 100644 index 0000000000..a4afb19b4e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_ans.h @@ -0,0 +1,143 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_ANS_H_ +#define LIB_JXL_ENC_ANS_H_ + +// Library to encode the ANS population counts to the bit-stream and encode +// symbols based on the respective distributions. + +#include +#include +#include +#include +#include + +#include +#include + +#include "lib/jxl/ans_common.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/enc_ans_params.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/huffman_table.h" + +namespace jxl { + +struct AuxOut; + +#define USE_MULT_BY_RECIPROCAL + +// precision must be equal to: #bits(state_) + #bits(freq) +#define RECIPROCAL_PRECISION (32 + ANS_LOG_TAB_SIZE) + +// Data structure representing one element of the encoding table built +// from a distribution. +// TODO(veluca): split this up, or use an union. +struct ANSEncSymbolInfo { + // ANS + uint16_t freq_; + std::vector reverse_map_; +#ifdef USE_MULT_BY_RECIPROCAL + uint64_t ifreq_; +#endif + // Prefix coding. + uint8_t depth; + uint16_t bits; +}; + +class ANSCoder { + public: + ANSCoder() : state_(ANS_SIGNATURE << 16) {} + + uint32_t PutSymbol(const ANSEncSymbolInfo& t, uint8_t* nbits) { + uint32_t bits = 0; + *nbits = 0; + if ((state_ >> (32 - ANS_LOG_TAB_SIZE)) >= t.freq_) { + bits = state_ & 0xffff; + state_ >>= 16; + *nbits = 16; + } +#ifdef USE_MULT_BY_RECIPROCAL + // We use mult-by-reciprocal trick, but that requires 64b calc. + const uint32_t v = (state_ * t.ifreq_) >> RECIPROCAL_PRECISION; + const uint32_t offset = t.reverse_map_[state_ - v * t.freq_]; + state_ = (v << ANS_LOG_TAB_SIZE) + offset; +#else + state_ = ((state_ / t.freq_) << ANS_LOG_TAB_SIZE) + + t.reverse_map_[state_ % t.freq_]; +#endif + return bits; + } + + uint32_t GetState() const { return state_; } + + private: + uint32_t state_; +}; + +// RebalanceHistogram requires a signed type. +using ANSHistBin = int32_t; + +struct EntropyEncodingData { + std::vector> encoding_info; + bool use_prefix_code; + std::vector uint_config; + LZ77Params lz77; +}; + +// Integer to be encoded by an entropy coder, either ANS or Huffman. +struct Token { + Token() {} + Token(uint32_t c, uint32_t value) + : is_lz77_length(false), context(c), value(value) {} + uint32_t is_lz77_length : 1; + uint32_t context : 31; + uint32_t value; +}; + +// Returns an estimate of the number of bits required to encode the given +// histogram (header bits plus data bits). +float ANSPopulationCost(const ANSHistBin* data, size_t alphabet_size); + +// Apply context clustering, compute histograms and encode them. Returns an +// estimate of the total bits used for encoding the stream. If `writer` == +// nullptr, the bit estimate will not take into account the context map (which +// does not get written if `num_contexts` == 1). +size_t BuildAndEncodeHistograms(const HistogramParams& params, + size_t num_contexts, + std::vector>& tokens, + EntropyEncodingData* codes, + std::vector* context_map, + BitWriter* writer, size_t layer, + AuxOut* aux_out); + +// Write the tokens to a string. +void WriteTokens(const std::vector& tokens, + const EntropyEncodingData& codes, + const std::vector& context_map, BitWriter* writer, + size_t layer, AuxOut* aux_out); + +// Same as above, but assumes allotment created by caller. +size_t WriteTokens(const std::vector& tokens, + const EntropyEncodingData& codes, + const std::vector& context_map, BitWriter* writer); + +// Exposed for tests; to be used with Writer=BitWriter only. +template +void EncodeUintConfigs(const std::vector& uint_config, + Writer* writer, size_t log_alpha_size); +extern template void EncodeUintConfigs(const std::vector&, + BitWriter*, size_t); + +// Globally set the option to create fuzzer-friendly ANS streams. Negatively +// impacts compression. Not thread-safe. +void SetANSFuzzerFriendly(bool ans_fuzzer_friendly); +} // namespace jxl + +#endif // LIB_JXL_ENC_ANS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_ans_params.h b/third_party/jpeg-xl/lib/jxl/enc_ans_params.h new file mode 100644 index 0000000000..50ca31dc03 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_ans_params.h @@ -0,0 +1,76 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_ANS_PARAMS_H_ +#define LIB_JXL_ENC_ANS_PARAMS_H_ + +// Encoder-only parameter needed for ANS entropy encoding methods. + +#include +#include + +#include "lib/jxl/enc_params.h" + +namespace jxl { + +struct HistogramParams { + enum class ClusteringType { + kFastest, // Only 4 clusters. + kFast, + kBest, + }; + + enum class HybridUintMethod { + kNone, // just use kHybridUint420Config. + k000, // force the fastest option. + kFast, // just try a couple of options. + kContextMap, // fast choice for ctx map. + kBest, + }; + + enum class LZ77Method { + kNone, // do not try lz77. + kRLE, // only try doing RLE. + kLZ77, // try lz77 with backward references. + kOptimal, // optimal-matching LZ77 parsing. + }; + + enum class ANSHistogramStrategy { + kFast, // Only try some methods, early exit. + kApproximate, // Only try some methods. + kPrecise, // Try all methods. + }; + + HistogramParams() = default; + + HistogramParams(SpeedTier tier, size_t num_ctx) { + if (tier > SpeedTier::kFalcon) { + clustering = ClusteringType::kFastest; + lz77_method = LZ77Method::kNone; + } else if (tier > SpeedTier::kTortoise) { + clustering = ClusteringType::kFast; + } else { + clustering = ClusteringType::kBest; + } + if (tier > SpeedTier::kTortoise) { + uint_method = HybridUintMethod::kNone; + } + if (tier >= SpeedTier::kSquirrel) { + ans_histogram_strategy = ANSHistogramStrategy::kApproximate; + } + } + + ClusteringType clustering = ClusteringType::kBest; + HybridUintMethod uint_method = HybridUintMethod::kBest; + LZ77Method lz77_method = LZ77Method::kRLE; + ANSHistogramStrategy ans_histogram_strategy = ANSHistogramStrategy::kPrecise; + std::vector image_widths; + size_t max_histograms = ~0; + bool force_huffman = false; +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_ANS_PARAMS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc new file mode 100644 index 0000000000..9030430e2b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc @@ -0,0 +1,325 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_ar_control_field.h" + +#include +#include + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_ar_control_field.cc" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::GetLane; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Sqrt; + +void ProcessTile(const Image3F& opsin, PassesEncoderState* enc_state, + const Rect& rect, + ArControlFieldHeuristics::TempImages* temp_image) { + constexpr size_t N = kBlockDim; + ImageB* JXL_RESTRICT epf_sharpness = &enc_state->shared.epf_sharpness; + ImageF* JXL_RESTRICT quant = &enc_state->initial_quant_field; + JXL_ASSERT( + epf_sharpness->xsize() == enc_state->shared.frame_dim.xsize_blocks && + epf_sharpness->ysize() == enc_state->shared.frame_dim.ysize_blocks); + + if (enc_state->cparams.butteraugli_distance < kMinButteraugliForDynamicAR || + enc_state->cparams.speed_tier > SpeedTier::kWombat || + enc_state->shared.frame_header.loop_filter.epf_iters == 0) { + FillPlane(static_cast(4), epf_sharpness, rect); + return; + } + + // Likely better to have a higher X weight, like: + // const float kChannelWeights[3] = {47.0f, 4.35f, 0.287f}; + const float kChannelWeights[3] = {4.35f, 4.35f, 0.287f}; + const float kChannelWeightsLapNeg[3] = {-0.125f * kChannelWeights[0], + -0.125f * kChannelWeights[1], + -0.125f * kChannelWeights[2]}; + const size_t sharpness_stride = + static_cast(epf_sharpness->PixelsPerRow()); + + size_t by0 = rect.y0(); + size_t by1 = rect.y0() + rect.ysize(); + size_t bx0 = rect.x0(); + size_t bx1 = rect.x0() + rect.xsize(); + temp_image->InitOnce(); + ImageF& laplacian_sqrsum = temp_image->laplacian_sqrsum; + // Calculate the L2 of the 3x3 Laplacian in an integral transform + // (for example 32x32 dct). This relates to transforms ability + // to propagate artefacts. + size_t y0 = by0 == 0 ? 2 : 0; + size_t y1 = by1 * N + 4 <= opsin.ysize() + 2 ? (by1 - by0) * N + 4 + : opsin.ysize() + 2 - by0 * N; + size_t x0 = bx0 == 0 ? 2 : 0; + size_t x1 = bx1 * N + 4 <= opsin.xsize() + 2 ? (bx1 - bx0) * N + 4 + : opsin.xsize() + 2 - bx0 * N; + HWY_FULL(float) df; + for (size_t y = y0; y < y1; y++) { + float* JXL_RESTRICT laplacian_sqrsum_row = laplacian_sqrsum.Row(y); + size_t cy = y + by0 * N - 2; + const float* JXL_RESTRICT in_row_t[3]; + const float* JXL_RESTRICT in_row[3]; + const float* JXL_RESTRICT in_row_b[3]; + for (size_t c = 0; c < 3; c++) { + in_row_t[c] = opsin.PlaneRow(c, cy > 0 ? cy - 1 : cy); + in_row[c] = opsin.PlaneRow(c, cy); + in_row_b[c] = opsin.PlaneRow(c, cy + 1 < opsin.ysize() ? cy + 1 : cy); + } + auto compute_laplacian_scalar = [&](size_t x) { + size_t cx = x + bx0 * N - 2; + const size_t prevX = cx >= 1 ? cx - 1 : cx; + const size_t nextX = cx + 1 < opsin.xsize() ? cx + 1 : cx; + float sumsqr = 0; + for (size_t c = 0; c < 3; c++) { + float laplacian = + kChannelWeights[c] * in_row[c][cx] + + kChannelWeightsLapNeg[c] * + (in_row[c][prevX] + in_row[c][nextX] + in_row_b[c][prevX] + + in_row_b[c][cx] + in_row_b[c][nextX] + in_row_t[c][prevX] + + in_row_t[c][cx] + in_row_t[c][nextX]); + sumsqr += laplacian * laplacian; + } + laplacian_sqrsum_row[x] = sumsqr; + }; + size_t x = x0; + for (; x + bx0 * N < 3; x++) { + compute_laplacian_scalar(x); + } + // Interior. One extra pixel of border as the last pixel is special. + for (; x + Lanes(df) <= x1 && x + Lanes(df) + bx0 * N - 1 <= opsin.xsize(); + x += Lanes(df)) { + size_t cx = x + bx0 * N - 2; + auto sumsqr = Zero(df); + for (size_t c = 0; c < 3; c++) { + auto laplacian = + Mul(LoadU(df, in_row[c] + cx), Set(df, kChannelWeights[c])); + auto sum_oth0 = LoadU(df, in_row[c] + cx - 1); + auto sum_oth1 = LoadU(df, in_row[c] + cx + 1); + auto sum_oth2 = LoadU(df, in_row_t[c] + cx - 1); + auto sum_oth3 = LoadU(df, in_row_t[c] + cx); + sum_oth0 = Add(sum_oth0, LoadU(df, in_row_t[c] + cx + 1)); + sum_oth1 = Add(sum_oth1, LoadU(df, in_row_b[c] + cx - 1)); + sum_oth2 = Add(sum_oth2, LoadU(df, in_row_b[c] + cx)); + sum_oth3 = Add(sum_oth3, LoadU(df, in_row_b[c] + cx + 1)); + sum_oth0 = Add(sum_oth0, sum_oth1); + sum_oth2 = Add(sum_oth2, sum_oth3); + sum_oth0 = Add(sum_oth0, sum_oth2); + laplacian = + MulAdd(Set(df, kChannelWeightsLapNeg[c]), sum_oth0, laplacian); + sumsqr = MulAdd(laplacian, laplacian, sumsqr); + } + StoreU(sumsqr, df, laplacian_sqrsum_row + x); + } + for (; x < x1; x++) { + compute_laplacian_scalar(x); + } + } + HWY_CAPPED(float, 4) df4; + // Calculate the L2 of the 3x3 Laplacian in 4x4 blocks within the area + // of the integral transform. Sample them within the integral transform + // with two offsets (0,0) and (-2, -2) pixels (sqrsum_00 and sqrsum_22, + // respectively). + ImageF& sqrsum_00 = temp_image->sqrsum_00; + size_t sqrsum_00_stride = sqrsum_00.PixelsPerRow(); + float* JXL_RESTRICT sqrsum_00_row = sqrsum_00.Row(0); + for (size_t y = 0; y < (by1 - by0) * 2; y++) { + const float* JXL_RESTRICT rows_in[4]; + for (size_t iy = 0; iy < 4; iy++) { + rows_in[iy] = laplacian_sqrsum.ConstRow(y * 4 + iy + 2); + } + float* JXL_RESTRICT row_out = sqrsum_00_row + y * sqrsum_00_stride; + for (size_t x = 0; x < (bx1 - bx0) * 2; x++) { + auto sum = Zero(df4); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix += Lanes(df4)) { + sum = Add(sum, LoadU(df4, rows_in[iy] + x * 4 + ix + 2)); + } + } + row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f); + } + } + // Indexing iy and ix is a bit tricky as we include a 2 pixel border + // around the block for evenness calculations. This is similar to what + // we did in guetzli for the observability of artefacts, except there + // the element is a sliding 5x5, not sparsely sampled 4x4 box like here. + ImageF& sqrsum_22 = temp_image->sqrsum_22; + size_t sqrsum_22_stride = sqrsum_22.PixelsPerRow(); + float* JXL_RESTRICT sqrsum_22_row = sqrsum_22.Row(0); + for (size_t y = 0; y < (by1 - by0) * 2 + 1; y++) { + const float* JXL_RESTRICT rows_in[4]; + for (size_t iy = 0; iy < 4; iy++) { + rows_in[iy] = laplacian_sqrsum.ConstRow(y * 4 + iy); + } + float* JXL_RESTRICT row_out = sqrsum_22_row + y * sqrsum_22_stride; + // ignore pixels outside the image. + // Y coordinates are relative to by0*8+y*4. + size_t sy = y * 4 + by0 * 8 > 0 ? 0 : 2; + size_t ey = y * 4 + by0 * 8 + 4 <= opsin.ysize() + 2 + ? 4 + : opsin.ysize() - y * 4 - by0 * 8 + 2; + for (size_t x = 0; x < (bx1 - bx0) * 2 + 1; x++) { + // ignore pixels outside the image. + // X coordinates are relative to bx0*8. + size_t sx = x * 4 + bx0 * 8 > 0 ? x * 4 : x * 4 + 2; + size_t ex = x * 4 + bx0 * 8 + 4 <= opsin.xsize() + 2 + ? x * 4 + 4 + : opsin.xsize() - bx0 * 8 + 2; + if (ex - sx == 4 && ey - sy == 4) { + auto sum = Zero(df4); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix += Lanes(df4)) { + sum = Add(sum, Load(df4, rows_in[iy] + sx + ix)); + } + } + row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f); + } else { + float sum = 0; + for (size_t iy = sy; iy < ey; iy++) { + for (size_t ix = sx; ix < ex; ix++) { + sum += rows_in[iy][ix]; + } + } + row_out[x] = std::sqrt(sum / ((ex - sx) * (ey - sy))); + } + } + } + for (size_t by = by0; by < by1; by++) { + AcStrategyRow acs_row = enc_state->shared.ac_strategy.ConstRow(by); + uint8_t* JXL_RESTRICT out_row = epf_sharpness->Row(by); + float* JXL_RESTRICT quant_row = quant->Row(by); + for (size_t bx = bx0; bx < bx1; bx++) { + AcStrategy acs = acs_row[bx]; + if (!acs.IsFirstBlock()) continue; + // The errors are going to be linear to the quantization value in this + // locality. We only have access to the initial quant field here. + float quant_val = 1.0f / quant_row[bx]; + + const auto sq00 = [&](size_t y, size_t x) { + return sqrsum_00_row[((by - by0) * 2 + y) * sqrsum_00_stride + + (bx - bx0) * 2 + x]; + }; + const auto sq22 = [&](size_t y, size_t x) { + return sqrsum_22_row[((by - by0) * 2 + y) * sqrsum_22_stride + + (bx - bx0) * 2 + x]; + }; + float sqrsum_integral_transform = 0; + for (size_t iy = 0; iy < acs.covered_blocks_y() * 2; iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x() * 2; ix++) { + sqrsum_integral_transform += sq00(iy, ix) * sq00(iy, ix); + } + } + sqrsum_integral_transform /= + 4 * acs.covered_blocks_x() * acs.covered_blocks_y(); + sqrsum_integral_transform = std::sqrt(sqrsum_integral_transform); + // If masking is high or amplitude of the artefacts is low, then no + // smoothing is needed. + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + // Five 4x4 blocks for masking estimation, all within the + // 8x8 area. + float minval_1 = std::min(sq00(2 * iy + 0, 2 * ix + 0), + sq00(2 * iy + 0, 2 * ix + 1)); + float minval_2 = std::min(sq00(2 * iy + 1, 2 * ix + 0), + sq00(2 * iy + 1, 2 * ix + 1)); + float minval = std::min(minval_1, minval_2); + minval = std::min(minval, sq22(2 * iy + 1, 2 * ix + 1)); + // Nine more 4x4 blocks for masking estimation, includes + // the 2 pixel area around the 8x8 block being controlled. + float minval2_1 = std::min(sq22(2 * iy + 0, 2 * ix + 0), + sq22(2 * iy + 0, 2 * ix + 1)); + float minval2_2 = std::min(sq22(2 * iy + 0, 2 * ix + 2), + sq22(2 * iy + 1, 2 * ix + 0)); + float minval2_3 = std::min(sq22(2 * iy + 1, 2 * ix + 1), + sq22(2 * iy + 1, 2 * ix + 2)); + float minval2_4 = std::min(sq22(2 * iy + 2, 2 * ix + 0), + sq22(2 * iy + 2, 2 * ix + 1)); + float minval2_5 = std::min(minval2_1, minval2_2); + float minval2_6 = std::min(minval2_3, minval2_4); + float minval2 = std::min(minval2_5, minval2_6); + minval2 = std::min(minval2, sq22(2 * iy + 2, 2 * ix + 2)); + float minval3 = std::min(minval, minval2); + minval *= 0.125f; + minval += 0.625f * minval3; + minval += + 0.125f * std::min(1.5f * minval3, sq22(2 * iy + 1, 2 * ix + 1)); + minval += 0.125f * minval2; + // Larger kBias, less smoothing for low intensity changes. + float kDeltaLimit = 3.2; + float bias = 0.0625f * quant_val; + float delta = + (sqrsum_integral_transform + (kDeltaLimit + 0.05) * bias) / + (minval + bias); + int out = 4; + if (delta > kDeltaLimit) { + out = 4; // smooth + } else { + out = 0; + } + // 'threshold' is separate from 'bias' for easier tuning of these + // heuristics. + float threshold = 0.0625f * quant_val; + const float kSmoothLimit = 0.085f; + float smooth = 0.20f * (sq00(2 * iy + 0, 2 * ix + 0) + + sq00(2 * iy + 0, 2 * ix + 1) + + sq00(2 * iy + 1, 2 * ix + 0) + + sq00(2 * iy + 1, 2 * ix + 1) + minval); + if (smooth < kSmoothLimit * threshold) { + out = 4; + } + out_row[bx + sharpness_stride * iy + ix] = out; + } + } + } + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ProcessTile); + +void ArControlFieldHeuristics::RunRect(const Rect& block_rect, + const Image3F& opsin, + PassesEncoderState* enc_state, + size_t thread) { + HWY_DYNAMIC_DISPATCH(ProcessTile) + (opsin, enc_state, block_rect, &temp_images[thread]); +} + +} // namespace jxl + +#endif diff --git a/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h new file mode 100644 index 0000000000..aabe71f46f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h @@ -0,0 +1,49 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_AR_CONTROL_FIELD_H_ +#define LIB_JXL_ENC_AR_CONTROL_FIELD_H_ + +#include + +#include + +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" + +namespace jxl { + +struct PassesEncoderState; + +struct ArControlFieldHeuristics { + struct TempImages { + void InitOnce() { + if (laplacian_sqrsum.xsize() != 0) return; + laplacian_sqrsum = ImageF(kEncTileDim + 4, kEncTileDim + 4); + sqrsum_00 = ImageF(kEncTileDim / 4, kEncTileDim / 4); + sqrsum_22 = ImageF(kEncTileDim / 4 + 1, kEncTileDim / 4 + 1); + } + + ImageF laplacian_sqrsum; + ImageF sqrsum_00; + ImageF sqrsum_22; + }; + + void PrepareForThreads(size_t num_threads) { + temp_images.resize(num_threads); + } + + void RunRect(const Rect& block_rect, const Image3F& opsin, + PassesEncoderState* enc_state, size_t thread); + + std::vector temp_images; + ImageB* epf_sharpness; + ImageF* quant; + bool all_default; +}; + +} // namespace jxl + +#endif // LIB_JXL_AR_ENC_CONTROL_FIELD_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc b/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc new file mode 100644 index 0000000000..1c141d1727 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc @@ -0,0 +1,205 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_aux_out.h" + +#include +#include +#include +#include + +#include +#include // accumulate +#include + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +const char* LayerName(size_t layer) { + switch (layer) { + case kLayerHeader: + return "Headers"; + case kLayerTOC: + return "TOC"; + case kLayerDictionary: + return "Patches"; + case kLayerSplines: + return "Splines"; + case kLayerNoise: + return "Noise"; + case kLayerQuant: + return "Quantizer"; + case kLayerModularTree: + return "ModularTree"; + case kLayerModularGlobal: + return "ModularGlobal"; + case kLayerDC: + return "DC"; + case kLayerModularDcGroup: + return "ModularDcGroup"; + case kLayerControlFields: + return "ControlFields"; + case kLayerOrder: + return "CoeffOrder"; + case kLayerAC: + return "ACHistograms"; + case kLayerACTokens: + return "ACTokens"; + case kLayerModularAcGroup: + return "ModularAcGroup"; + default: + JXL_ABORT("Invalid layer %d\n", static_cast(layer)); + } +} + +void AuxOut::LayerTotals::Print(size_t num_inputs) const { + printf("%10" PRId64, static_cast(total_bits)); + if (histogram_bits != 0) { + printf(" [c/i:%6.2f | hst:%8" PRId64 " | ex:%8" PRId64 " | h+c+e:%12.3f", + num_clustered_histograms * 1.0 / num_inputs, + static_cast(histogram_bits >> 3), + static_cast(extra_bits >> 3), + (histogram_bits + clustered_entropy + extra_bits) / 8.0); + printf("]"); + } + printf("\n"); +} + +void AuxOut::Assimilate(const AuxOut& victim) { + for (size_t i = 0; i < layers.size(); ++i) { + layers[i].Assimilate(victim.layers[i]); + } + num_blocks += victim.num_blocks; + num_small_blocks += victim.num_small_blocks; + num_dct4x8_blocks += victim.num_dct4x8_blocks; + num_afv_blocks += victim.num_afv_blocks; + num_dct8_blocks += victim.num_dct8_blocks; + num_dct8x16_blocks += victim.num_dct8x16_blocks; + num_dct8x32_blocks += victim.num_dct8x32_blocks; + num_dct16_blocks += victim.num_dct16_blocks; + num_dct16x32_blocks += victim.num_dct16x32_blocks; + num_dct32_blocks += victim.num_dct32_blocks; + num_dct32x64_blocks += victim.num_dct32x64_blocks; + num_dct64_blocks += victim.num_dct64_blocks; + num_butteraugli_iters += victim.num_butteraugli_iters; + for (size_t i = 0; i < dc_pred_usage.size(); ++i) { + dc_pred_usage[i] += victim.dc_pred_usage[i]; + dc_pred_usage_xb[i] += victim.dc_pred_usage_xb[i]; + } + max_quant_rescale = std::max(max_quant_rescale, victim.max_quant_rescale); + min_quant_rescale = std::min(min_quant_rescale, victim.min_quant_rescale); + max_bitrate_error = std::max(max_bitrate_error, victim.max_bitrate_error); + min_bitrate_error = std::min(min_bitrate_error, victim.min_bitrate_error); +} + +void AuxOut::Print(size_t num_inputs) const { + if (num_inputs == 0) return; + + LayerTotals all_layers; + for (size_t i = 0; i < layers.size(); ++i) { + all_layers.Assimilate(layers[i]); + } + + printf("Average butteraugli iters: %10.2f\n", + num_butteraugli_iters * 1.0 / num_inputs); + if (min_quant_rescale != 1.0 || max_quant_rescale != 1.0) { + printf("quant rescale range: %f .. %f\n", min_quant_rescale, + max_quant_rescale); + printf("bitrate error range: %.3f%% .. %.3f%%\n", + 100.0f * min_bitrate_error, 100.0f * max_bitrate_error); + } + + for (size_t i = 0; i < layers.size(); ++i) { + if (layers[i].total_bits != 0) { + printf("Total layer bits %-10s\t", LayerName(i)); + printf("%10f%%", 100.0 * layers[i].total_bits / all_layers.total_bits); + layers[i].Print(num_inputs); + } + } + printf("Total image size "); + all_layers.Print(num_inputs); + + const uint32_t dc_pred_total = + std::accumulate(dc_pred_usage.begin(), dc_pred_usage.end(), 0u); + const uint32_t dc_pred_total_xb = + std::accumulate(dc_pred_usage_xb.begin(), dc_pred_usage_xb.end(), 0u); + if (dc_pred_total + dc_pred_total_xb != 0) { + printf("\nDC pred Y XB:\n"); + for (size_t i = 0; i < dc_pred_usage.size(); ++i) { + printf(" %6u (%5.2f%%) %6u (%5.2f%%)\n", dc_pred_usage[i], + 100.0 * dc_pred_usage[i] / dc_pred_total, dc_pred_usage_xb[i], + 100.0 * dc_pred_usage_xb[i] / dc_pred_total_xb); + } + } + + size_t total_blocks = 0; + size_t total_positions = 0; + if (total_blocks != 0 && total_positions != 0) { + printf("\n\t\t Blocks\t\tPositions\t\t\tBlocks/Position\n"); + printf(" Total:\t\t %7" PRIuS "\t\t %7" PRIuS " \t\t\t%10f%%\n\n", + total_blocks, total_positions, + 100.0 * total_blocks / total_positions); + } +} + +template +void AuxOut::DumpImage(const char* label, const Image3& image) const { + if (!dump_image) return; + if (debug_prefix.empty()) return; + std::ostringstream pathname; + pathname << debug_prefix << label << ".png"; + (void)dump_image(ConvertToFloat(image), ColorEncoding::SRGB(), + pathname.str()); +} +template void AuxOut::DumpImage(const char* label, + const Image3& image) const; +template void AuxOut::DumpImage(const char* label, + const Image3& image) const; + +template +void AuxOut::DumpPlaneNormalized(const char* label, + const Plane& image) const { + T min; + T max; + ImageMinMax(image, &min, &max); + Image3B normalized(image.xsize(), image.ysize()); + for (size_t c = 0; c < 3; ++c) { + float mul = min == max ? 0 : (255.0f / (max - min)); + for (size_t y = 0; y < image.ysize(); ++y) { + const T* JXL_RESTRICT row_in = image.ConstRow(y); + uint8_t* JXL_RESTRICT row_out = normalized.PlaneRow(c, y); + for (size_t x = 0; x < image.xsize(); ++x) { + row_out[x] = static_cast((row_in[x] - min) * mul); + } + } + } + DumpImage(label, normalized); +} +template void AuxOut::DumpPlaneNormalized(const char* label, + const Plane& image) const; +template void AuxOut::DumpPlaneNormalized(const char* label, + const Plane& image) const; + +void AuxOut::DumpXybImage(const char* label, const Image3F& image) const { + if (!dump_image) return; + if (debug_prefix.empty()) return; + std::ostringstream pathname; + pathname << debug_prefix << label << ".png"; + + Image3F linear(image.xsize(), image.ysize()); + OpsinParams opsin_params; + opsin_params.Init(kDefaultIntensityTarget); + OpsinToLinear(image, Rect(linear), nullptr, &linear, opsin_params); + + (void)dump_image(std::move(linear), ColorEncoding::LinearSRGB(), + pathname.str()); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_aux_out.h b/third_party/jpeg-xl/lib/jxl/enc_aux_out.h new file mode 100644 index 0000000000..78222823ae --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_aux_out.h @@ -0,0 +1,163 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_AUX_OUT_H_ +#define LIB_JXL_AUX_OUT_H_ + +// Optional output information for debugging and analyzing size usage. + +#include + +#include +#include +#include + +#include "lib/jxl/image.h" +#include "lib/jxl/jxl_inspection.h" + +namespace jxl { + +struct ColorEncoding; + +// For LayerName and AuxOut::layers[] index. Order does not matter. +enum { + kLayerHeader = 0, + kLayerTOC, + kLayerDictionary, + kLayerSplines, + kLayerNoise, + kLayerQuant, + kLayerModularTree, + kLayerModularGlobal, + kLayerDC, + kLayerModularDcGroup, + kLayerControlFields, + kLayerOrder, + kLayerAC, + kLayerACTokens, + kLayerModularAcGroup, + kNumImageLayers +}; + +const char* LayerName(size_t layer); + +// Statistics gathered during compression or decompression. +struct AuxOut { + private: + struct LayerTotals { + void Assimilate(const LayerTotals& victim) { + num_clustered_histograms += victim.num_clustered_histograms; + histogram_bits += victim.histogram_bits; + extra_bits += victim.extra_bits; + total_bits += victim.total_bits; + clustered_entropy += victim.clustered_entropy; + } + void Print(size_t num_inputs) const; + + size_t num_clustered_histograms = 0; + size_t extra_bits = 0; + + // Set via BitsWritten below + size_t histogram_bits = 0; + size_t total_bits = 0; + + double clustered_entropy = 0.0; + }; + + public: + AuxOut() = default; + AuxOut(const AuxOut&) = default; + + void Assimilate(const AuxOut& victim); + + void Print(size_t num_inputs) const; + + size_t TotalBits() const { + size_t total = 0; + for (const auto& layer : layers) { + total += layer.total_bits; + } + return total; + } + + template + void DumpImage(const char* label, const Image3& image) const; + + void DumpXybImage(const char* label, const Image3F& image) const; + + template + void DumpPlaneNormalized(const char* label, const Plane& image) const; + + void SetInspectorImage3F(const jxl::InspectorImage3F& inspector) { + inspector_image3f_ = inspector; + } + + // Allows hooking intermediate data inspection into various places of the + // processing pipeline. Returns true iff processing should proceed. + bool InspectImage3F(const char* label, const Image3F& image) { + if (inspector_image3f_ != nullptr) { + return inspector_image3f_(label, image); + } + return true; + } + + std::array layers; + size_t num_blocks = 0; + + // Number of blocks that use larger DCT (set by ac_strategy). + size_t num_small_blocks = 0; + size_t num_dct4x8_blocks = 0; + size_t num_afv_blocks = 0; + size_t num_dct8_blocks = 0; + size_t num_dct8x16_blocks = 0; + size_t num_dct8x32_blocks = 0; + size_t num_dct16_blocks = 0; + size_t num_dct16x32_blocks = 0; + size_t num_dct32_blocks = 0; + size_t num_dct32x64_blocks = 0; + size_t num_dct64_blocks = 0; + + std::array dc_pred_usage = {{0}}; + std::array dc_pred_usage_xb = {{0}}; + + int num_butteraugli_iters = 0; + + float max_quant_rescale = 1.0f; + float min_quant_rescale = 1.0f; + float min_bitrate_error = 0.0f; + float max_bitrate_error = 0.0f; + + // If not empty, additional debugging information (e.g. debug images) is + // saved in files with this prefix. + std::string debug_prefix; + + // By how much the decoded image was downsampled relative to the encoded + // image. + size_t downsampling = 1; + + jxl::InspectorImage3F inspector_image3f_; + + std::function + dump_image = nullptr; +}; + +extern template void AuxOut::DumpImage(const char* label, + const Image3& image) const; +extern template void AuxOut::DumpImage(const char* label, + const Image3& image) const; +extern template void AuxOut::DumpPlaneNormalized( + const char* label, const Plane& image) const; +extern template void AuxOut::DumpPlaneNormalized( + const char* label, const Plane& image) const; + +// Used to skip image creation if they won't be written to debug directory. +static inline bool WantDebugOutput(const AuxOut* aux_out) { + // Need valid pointer and filename. + return aux_out != nullptr && !aux_out->debug_prefix.empty(); +} + +} // namespace jxl + +#endif // LIB_JXL_AUX_OUT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc new file mode 100644 index 0000000000..7964c28f76 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc @@ -0,0 +1,201 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_bit_writer.h" + +#include // memcpy + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_aux_out.h" + +namespace jxl { + +BitWriter::Allotment::Allotment(BitWriter* JXL_RESTRICT writer, size_t max_bits) + : max_bits_(max_bits) { + if (writer == nullptr) return; + prev_bits_written_ = writer->BitsWritten(); + const size_t prev_bytes = writer->storage_.size(); + const size_t next_bytes = DivCeil(max_bits, kBitsPerByte); + writer->storage_.resize(prev_bytes + next_bytes); + parent_ = writer->current_allotment_; + writer->current_allotment_ = this; +} + +BitWriter::Allotment::~Allotment() { + if (!called_) { + // Not calling is a bug - unused storage will not be reclaimed. + JXL_ABORT("Did not call Allotment::ReclaimUnused"); + } +} + +void BitWriter::Allotment::FinishedHistogram(BitWriter* JXL_RESTRICT writer) { + if (writer == nullptr) return; + JXL_ASSERT(!called_); // Call before ReclaimUnused + JXL_ASSERT(histogram_bits_ == 0); // Do not call twice + JXL_ASSERT(writer->BitsWritten() >= prev_bits_written_); + histogram_bits_ = writer->BitsWritten() - prev_bits_written_; +} + +void BitWriter::Allotment::ReclaimAndCharge(BitWriter* JXL_RESTRICT writer, + size_t layer, + AuxOut* JXL_RESTRICT aux_out) { + size_t used_bits, unused_bits; + PrivateReclaim(writer, &used_bits, &unused_bits); + +#if 0 + printf("Layer %s bits: max %" PRIuS " used %" PRIuS " unused %" PRIuS "\n", + LayerName(layer), MaxBits(), used_bits, unused_bits); +#endif + + // This may be a nested call with aux_out == null. Whenever we know that + // aux_out is null, we can call ReclaimUnused directly. + if (aux_out != nullptr) { + aux_out->layers[layer].total_bits += used_bits; + aux_out->layers[layer].histogram_bits += HistogramBits(); + } +} + +void BitWriter::Allotment::PrivateReclaim(BitWriter* JXL_RESTRICT writer, + size_t* JXL_RESTRICT used_bits, + size_t* JXL_RESTRICT unused_bits) { + JXL_ASSERT(!called_); // Do not call twice + called_ = true; + if (writer == nullptr) return; + + JXL_ASSERT(writer->BitsWritten() >= prev_bits_written_); + *used_bits = writer->BitsWritten() - prev_bits_written_; + JXL_ASSERT(*used_bits <= max_bits_); + *unused_bits = max_bits_ - *used_bits; + + // Reclaim unused bytes whole bytes from writer's allotment. + const size_t unused_bytes = *unused_bits / kBitsPerByte; // truncate + JXL_ASSERT(writer->storage_.size() >= unused_bytes); + writer->storage_.resize(writer->storage_.size() - unused_bytes); + writer->current_allotment_ = parent_; + // Ensure we don't also charge the parent for these bits. + auto parent = parent_; + while (parent != nullptr) { + parent->prev_bits_written_ += *used_bits; + parent = parent->parent_; + } +} + +void BitWriter::AppendByteAligned(const Span& span) { + if (span.empty()) return; + storage_.resize(storage_.size() + span.size() + 1); // extra zero padding + + // Concatenate by copying bytes because both source and destination are bytes. + JXL_ASSERT(BitsWritten() % kBitsPerByte == 0); + size_t pos = BitsWritten() / kBitsPerByte; + memcpy(storage_.data() + pos, span.data(), span.size()); + pos += span.size(); + storage_[pos++] = 0; // for next Write + JXL_ASSERT(pos <= storage_.size()); + bits_written_ += span.size() * kBitsPerByte; +} + +void BitWriter::AppendByteAligned(const BitWriter& other) { + JXL_ASSERT(other.BitsWritten() % kBitsPerByte == 0); + JXL_ASSERT(other.BitsWritten() / kBitsPerByte != 0); + + AppendByteAligned(other.GetSpan()); +} + +void BitWriter::AppendByteAligned(const std::vector& others) { + // Total size to add so we can preallocate + size_t other_bytes = 0; + for (const BitWriter& writer : others) { + JXL_ASSERT(writer.BitsWritten() % kBitsPerByte == 0); + other_bytes += writer.BitsWritten() / kBitsPerByte; + } + if (other_bytes == 0) { + // No bytes to append: this happens for example when creating per-group + // storage for groups, but not writing anything in them for e.g. lossless + // images with no alpha. Do nothing. + return; + } + storage_.resize(storage_.size() + other_bytes + 1); // extra zero padding + + // Concatenate by copying bytes because both source and destination are bytes. + JXL_ASSERT(BitsWritten() % kBitsPerByte == 0); + size_t pos = BitsWritten() / kBitsPerByte; + for (const BitWriter& writer : others) { + const Span span = writer.GetSpan(); + if (!span.empty()) { + memcpy(storage_.data() + pos, span.data(), span.size()); + pos += span.size(); + } + } + storage_[pos++] = 0; // for next Write + JXL_ASSERT(pos <= storage_.size()); + bits_written_ += other_bytes * kBitsPerByte; +} + +// TODO(lode): avoid code duplication +void BitWriter::AppendByteAligned( + const std::vector>& others) { + // Total size to add so we can preallocate + size_t other_bytes = 0; + for (const auto& writer : others) { + JXL_ASSERT(writer->BitsWritten() % kBitsPerByte == 0); + other_bytes += writer->BitsWritten() / kBitsPerByte; + } + if (other_bytes == 0) { + // No bytes to append: this happens for example when creating per-group + // storage for groups, but not writing anything in them for e.g. lossless + // images with no alpha. Do nothing. + return; + } + storage_.resize(storage_.size() + other_bytes + 1); // extra zero padding + + // Concatenate by copying bytes because both source and destination are bytes. + JXL_ASSERT(BitsWritten() % kBitsPerByte == 0); + size_t pos = BitsWritten() / kBitsPerByte; + for (const auto& writer : others) { + const Span span = writer->GetSpan(); + memcpy(storage_.data() + pos, span.data(), span.size()); + pos += span.size(); + } + storage_[pos++] = 0; // for next Write + JXL_ASSERT(pos <= storage_.size()); + bits_written_ += other_bytes * kBitsPerByte; +} + +// Example: let's assume that 3 bits (Rs below) have been written already: +// BYTE+0 BYTE+1 BYTE+2 +// 0000 0RRR ???? ???? ???? ???? +// +// Now, we could write up to 5 bits by just shifting them left by 3 bits and +// OR'ing to BYTE-0. +// +// For n > 5 bits, we write the lowest 5 bits as above, then write the next +// lowest bits into BYTE+1 starting from its lower bits and so on. +void BitWriter::Write(size_t n_bits, uint64_t bits) { + JXL_DASSERT((bits >> n_bits) == 0); + JXL_DASSERT(n_bits <= kMaxBitsPerCall); + uint8_t* p = &storage_[bits_written_ / kBitsPerByte]; + const size_t bits_in_first_byte = bits_written_ % kBitsPerByte; + bits <<= bits_in_first_byte; +#if JXL_BYTE_ORDER_LITTLE + uint64_t v = *p; + // Last (partial) or next byte to write must be zero-initialized! + // PaddedBytes initializes the first, and Write/Append maintain this. + JXL_DASSERT(v >> bits_in_first_byte == 0); + v |= bits; + memcpy(p, &v, sizeof(v)); // Write bytes: possibly more than n_bits/8 +#else + *p++ |= static_cast(bits & 0xFF); + for (size_t bits_left_to_write = n_bits + bits_in_first_byte; + bits_left_to_write >= 9; bits_left_to_write -= 8) { + bits >>= 8; + *p++ = static_cast(bits & 0xFF); + } + *p = 0; +#endif + bits_written_ += n_bits; +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_bit_writer.h b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.h new file mode 100644 index 0000000000..d3fac15a68 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.h @@ -0,0 +1,129 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_BIT_WRITER_H_ +#define LIB_JXL_ENC_BIT_WRITER_H_ + +// BitWriter class: unbuffered writes using unaligned 64-bit stores. + +#include +#include + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" + +namespace jxl { + +struct AuxOut; + +struct BitWriter { + // Upper bound on `n_bits` in each call to Write. We shift a 64-bit word by + // 7 bits (max already valid bits in the last byte) and at least 1 bit is + // needed to zero-initialize the bit-stream ahead (i.e. if 7 bits are valid + // and we write 57 bits, then the next write will access a byte that was not + // yet zero-initialized). + static constexpr size_t kMaxBitsPerCall = 56; + + BitWriter() : bits_written_(0) {} + + // Disallow copying - may lead to bugs. + BitWriter(const BitWriter&) = delete; + BitWriter& operator=(const BitWriter&) = delete; + BitWriter(BitWriter&&) = default; + BitWriter& operator=(BitWriter&&) = default; + + size_t BitsWritten() const { return bits_written_; } + + Span GetSpan() const { + // Callers must ensure byte alignment to avoid uninitialized bits. + JXL_ASSERT(bits_written_ % kBitsPerByte == 0); + return Span(storage_.data(), bits_written_ / kBitsPerByte); + } + + // Example usage: bytes = std::move(writer).TakeBytes(); Useful for the + // top-level encoder which returns PaddedBytes, not a BitWriter. + // *this must be an rvalue reference and is invalid afterwards. + PaddedBytes&& TakeBytes() && { + // Callers must ensure byte alignment to avoid uninitialized bits. + JXL_ASSERT(bits_written_ % kBitsPerByte == 0); + storage_.resize(bits_written_ / kBitsPerByte); + return std::move(storage_); + } + + private: + // Must be byte-aligned before calling. + void AppendByteAligned(const Span& span); + + public: + // NOTE: no allotment needed, the other BitWriters have already been charged. + void AppendByteAligned(const BitWriter& other); + void AppendByteAligned(const std::vector>& others); + void AppendByteAligned(const std::vector& others); + + class Allotment { + public: + // Expands a BitWriter's storage. Must happen before calling Write or + // ZeroPadToByte. Must call ReclaimUnused after writing to reclaim the + // unused storage so that BitWriter memory use remains tightly bounded. + Allotment(BitWriter* JXL_RESTRICT writer, size_t max_bits); + ~Allotment(); + + size_t MaxBits() const { return max_bits_; } + + // Call after writing a histogram, but before ReclaimUnused. + void FinishedHistogram(BitWriter* JXL_RESTRICT writer); + + size_t HistogramBits() const { + JXL_ASSERT(called_); + return histogram_bits_; + } + + void ReclaimAndCharge(BitWriter* JXL_RESTRICT writer, size_t layer, + AuxOut* JXL_RESTRICT aux_out); + + private: + void PrivateReclaim(BitWriter* JXL_RESTRICT writer, + size_t* JXL_RESTRICT used_bits, + size_t* JXL_RESTRICT unused_bits); + + size_t prev_bits_written_; + const size_t max_bits_; + size_t histogram_bits_ = 0; + bool called_ = false; + Allotment* parent_; + }; + + // Writes bits into bytes in increasing addresses, and within a byte + // least-significant-bit first. + // + // The function can write up to 56 bits in one go. + void Write(size_t n_bits, uint64_t bits); + + // This should only rarely be used - e.g. when the current location will be + // referenced via byte offset (TOCs point to groups), or byte-aligned reading + // is required for speed. + void ZeroPadToByte() { + const size_t remainder_bits = + RoundUpBitsToByteMultiple(bits_written_) - bits_written_; + if (remainder_bits == 0) return; + Write(remainder_bits, 0); + JXL_ASSERT(bits_written_ % kBitsPerByte == 0); + } + + private: + size_t bits_written_; + PaddedBytes storage_; + Allotment* current_allotment_ = nullptr; +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_BIT_WRITER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc new file mode 100644 index 0000000000..5711f45884 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc @@ -0,0 +1,99 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_butteraugli_comparator.h" + +#include +#include + +#include "lib/jxl/color_management.h" +#include "lib/jxl/enc_image_bundle.h" + +namespace jxl { + +JxlButteraugliComparator::JxlButteraugliComparator( + const ButteraugliParams& params, const JxlCmsInterface& cms) + : params_(params), cms_(cms) {} + +Status JxlButteraugliComparator::SetReferenceImage(const ImageBundle& ref) { + const ImageBundle* ref_linear_srgb; + ImageMetadata metadata = *ref.metadata(); + ImageBundle store(&metadata); + if (!TransformIfNeeded(ref, ColorEncoding::LinearSRGB(ref.IsGray()), cms_, + /*pool=*/nullptr, &store, &ref_linear_srgb)) { + return false; + } + + comparator_.reset( + new ButteraugliComparator(ref_linear_srgb->color(), params_)); + xsize_ = ref.xsize(); + ysize_ = ref.ysize(); + return true; +} + +Status JxlButteraugliComparator::CompareWith(const ImageBundle& actual, + ImageF* diffmap, float* score) { + if (!comparator_) { + return JXL_FAILURE("Must set reference image first"); + } + if (xsize_ != actual.xsize() || ysize_ != actual.ysize()) { + return JXL_FAILURE("Images must have same size"); + } + + const ImageBundle* actual_linear_srgb; + ImageMetadata metadata = *actual.metadata(); + ImageBundle store(&metadata); + if (!TransformIfNeeded(actual, ColorEncoding::LinearSRGB(actual.IsGray()), + cms_, + /*pool=*/nullptr, &store, &actual_linear_srgb)) { + return false; + } + + ImageF temp_diffmap(xsize_, ysize_); + comparator_->Diffmap(actual_linear_srgb->color(), temp_diffmap); + + if (score != nullptr) { + *score = ButteraugliScoreFromDiffmap(temp_diffmap, ¶ms_); + } + if (diffmap != nullptr) { + diffmap->Swap(temp_diffmap); + } + + return true; +} + +float JxlButteraugliComparator::GoodQualityScore() const { + return ButteraugliFuzzyInverse(1.5); +} + +float JxlButteraugliComparator::BadQualityScore() const { + return ButteraugliFuzzyInverse(0.5); +} + +float ButteraugliDistance(const ImageBundle& rgb0, const ImageBundle& rgb1, + const ButteraugliParams& params, + const JxlCmsInterface& cms, ImageF* distmap, + ThreadPool* pool) { + JxlButteraugliComparator comparator(params, cms); + return ComputeScore(rgb0, rgb1, &comparator, cms, distmap, pool); +} + +float ButteraugliDistance(const std::vector& frames0, + const std::vector& frames1, + const ButteraugliParams& params, + const JxlCmsInterface& cms, ImageF* distmap, + ThreadPool* pool) { + JxlButteraugliComparator comparator(params, cms); + JXL_ASSERT(frames0.size() == frames1.size()); + float max_dist = 0.0f; + for (size_t i = 0; i < frames0.size(); ++i) { + max_dist = std::max( + max_dist, + ComputeScore(frames0[i], frames1[i], &comparator, cms, distmap, pool)); + } + return max_dist; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h new file mode 100644 index 0000000000..6c37d1dc7d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h @@ -0,0 +1,59 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_ +#define LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_ + +#include +#include + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/enc_comparator.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +class JxlButteraugliComparator : public Comparator { + public: + explicit JxlButteraugliComparator(const ButteraugliParams& params, + const JxlCmsInterface& cms); + + Status SetReferenceImage(const ImageBundle& ref) override; + + Status CompareWith(const ImageBundle& actual, ImageF* diffmap, + float* score) override; + + float GoodQualityScore() const override; + float BadQualityScore() const override; + + private: + ButteraugliParams params_; + JxlCmsInterface cms_; + std::unique_ptr comparator_; + size_t xsize_ = 0; + size_t ysize_ = 0; +}; + +// Returns the butteraugli distance between rgb0 and rgb1. +// If distmap is not null, it must be the same size as rgb0 and rgb1. +float ButteraugliDistance(const ImageBundle& rgb0, const ImageBundle& rgb1, + const ButteraugliParams& params, + const JxlCmsInterface& cms, ImageF* distmap = nullptr, + ThreadPool* pool = nullptr); + +float ButteraugliDistance(const std::vector& frames0, + const std::vector& frames1, + const ButteraugliParams& params, + const JxlCmsInterface& cms, ImageF* distmap = nullptr, + ThreadPool* pool = nullptr); + +} // namespace jxl + +#endif // LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc new file mode 100644 index 0000000000..fe5629dcda --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc @@ -0,0 +1,211 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_butteraugli_pnorm.h" + +#include +#include + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_butteraugli_pnorm.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::GetLane; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::Rebind; + +double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params, + double p) { + PROFILER_FUNC; + + const double onePerPixels = 1.0 / (distmap.ysize() * distmap.xsize()); + if (std::abs(p - 3.0) < 1E-6) { + double sum1[3] = {0.0}; + +// Prefer double if possible, but otherwise use float rather than scalar. +#if HWY_CAP_FLOAT64 + using T = double; + const Rebind df; +#else + using T = float; +#endif + const HWY_FULL(T) d; + constexpr size_t N = MaxLanes(HWY_FULL(T)()); + // Manually aligned storage to avoid asan crash on clang-7 due to + // unaligned spill. + HWY_ALIGN T sum_totals0[N] = {0}; + HWY_ALIGN T sum_totals1[N] = {0}; + HWY_ALIGN T sum_totals2[N] = {0}; + + for (size_t y = 0; y < distmap.ysize(); ++y) { + const float* JXL_RESTRICT row = distmap.ConstRow(y); + + auto sums0 = Zero(d); + auto sums1 = Zero(d); + auto sums2 = Zero(d); + + size_t x = 0; + for (; x + Lanes(d) <= distmap.xsize(); x += Lanes(d)) { +#if HWY_CAP_FLOAT64 + const auto d1 = PromoteTo(d, Load(df, row + x)); +#else + const auto d1 = Load(d, row + x); +#endif + const auto d2 = Mul(d1, Mul(d1, d1)); + sums0 = Add(sums0, d2); + const auto d3 = Mul(d2, d2); + sums1 = Add(sums1, d3); + const auto d4 = Mul(d3, d3); + sums2 = Add(sums2, d4); + } + + Store(Add(sums0, Load(d, sum_totals0)), d, sum_totals0); + Store(Add(sums1, Load(d, sum_totals1)), d, sum_totals1); + Store(Add(sums2, Load(d, sum_totals2)), d, sum_totals2); + + for (; x < distmap.xsize(); ++x) { + const double d1 = row[x]; + double d2 = d1 * d1 * d1; + sum1[0] += d2; + d2 *= d2; + sum1[1] += d2; + d2 *= d2; + sum1[2] += d2; + } + } + double v = 0; + v += pow( + onePerPixels * (sum1[0] + GetLane(SumOfLanes(d, Load(d, sum_totals0)))), + 1.0 / (p * 1.0)); + v += pow( + onePerPixels * (sum1[1] + GetLane(SumOfLanes(d, Load(d, sum_totals1)))), + 1.0 / (p * 2.0)); + v += pow( + onePerPixels * (sum1[2] + GetLane(SumOfLanes(d, Load(d, sum_totals2)))), + 1.0 / (p * 4.0)); + v /= 3.0; + return v; + } else { + static std::atomic once{0}; + if (once.fetch_add(1, std::memory_order_relaxed) == 0) { + JXL_WARNING("WARNING: using slow ComputeDistanceP"); + } + double sum1[3] = {0.0}; + for (size_t y = 0; y < distmap.ysize(); ++y) { + const float* JXL_RESTRICT row = distmap.ConstRow(y); + for (size_t x = 0; x < distmap.xsize(); ++x) { + double d2 = std::pow(row[x], p); + sum1[0] += d2; + d2 *= d2; + sum1[1] += d2; + d2 *= d2; + sum1[2] += d2; + } + } + double v = 0; + for (int i = 0; i < 3; ++i) { + v += pow(onePerPixels * (sum1[i]), 1.0 / (p * (1 << i))); + } + v /= 3.0; + return v; + } +} + +// TODO(lode): take alpha into account when needed +double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2, + const JxlCmsInterface& cms) { + PROFILER_FUNC; + // Convert to sRGB - closer to perception than linear. + const Image3F* srgb1 = &ib1.color(); + Image3F copy1; + if (!ib1.IsSRGB()) { + JXL_CHECK( + ib1.CopyTo(Rect(ib1), ColorEncoding::SRGB(ib1.IsGray()), cms, ©1)); + srgb1 = ©1; + } + const Image3F* srgb2 = &ib2.color(); + Image3F copy2; + if (!ib2.IsSRGB()) { + JXL_CHECK( + ib2.CopyTo(Rect(ib2), ColorEncoding::SRGB(ib2.IsGray()), cms, ©2)); + srgb2 = ©2; + } + + JXL_CHECK(SameSize(*srgb1, *srgb2)); + + // TODO(veluca): SIMD. + float yuvmatrix[3][3] = {{0.299, 0.587, 0.114}, + {-0.14713, -0.28886, 0.436}, + {0.615, -0.51499, -0.10001}}; + double sum_of_squares[3] = {}; + for (size_t y = 0; y < srgb1->ysize(); ++y) { + const float* JXL_RESTRICT row1[3]; + const float* JXL_RESTRICT row2[3]; + for (size_t j = 0; j < 3; j++) { + row1[j] = srgb1->ConstPlaneRow(j, y); + row2[j] = srgb2->ConstPlaneRow(j, y); + } + for (size_t x = 0; x < srgb1->xsize(); ++x) { + float cdiff[3] = {}; + // YUV conversion is linear, so we can run it on the difference. + for (size_t j = 0; j < 3; j++) { + cdiff[j] = row1[j][x] - row2[j][x]; + } + float yuvdiff[3] = {}; + for (size_t j = 0; j < 3; j++) { + for (size_t k = 0; k < 3; k++) { + yuvdiff[j] += yuvmatrix[j][k] * cdiff[k]; + } + } + for (size_t j = 0; j < 3; j++) { + sum_of_squares[j] += yuvdiff[j] * yuvdiff[j]; + } + } + } + // Weighted PSNR as in JPEG-XL: chroma counts 1/8. + const float weights[3] = {6.0f / 8, 1.0f / 8, 1.0f / 8}; + // Avoid squaring the weight - 1/64 is too extreme. + double norm = 0; + for (size_t i = 0; i < 3; i++) { + norm += std::sqrt(sum_of_squares[i]) * weights[i]; + } + // This function returns distance *squared*. + return norm * norm; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ComputeDistanceP); +double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params, + double p) { + return HWY_DYNAMIC_DISPATCH(ComputeDistanceP)(distmap, params, p); +} + +HWY_EXPORT(ComputeDistance2); +double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2, + const JxlCmsInterface& cms) { + return HWY_DYNAMIC_DISPATCH(ComputeDistance2)(ib1, ib2, cms); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h new file mode 100644 index 0000000000..cf6872e5d0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h @@ -0,0 +1,25 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_ +#define LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_ + +#include + +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Computes p-norm given the butteraugli distmap. +double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params, + double p); + +double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2, + const JxlCmsInterface& cms); + +} // namespace jxl + +#endif // LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_cache.cc b/third_party/jpeg-xl/lib/jxl/enc_cache.cc new file mode 100644 index 0000000000..fc3e5c9f30 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_cache.cc @@ -0,0 +1,218 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_cache.h" + +#include +#include + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_frame.h" +#include "lib/jxl/enc_group.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms, + ThreadPool* pool, PassesEncoderState* enc_state, + ModularFrameEncoder* modular_frame_encoder, + AuxOut* aux_out) { + PROFILER_FUNC; + + PassesSharedState& JXL_RESTRICT shared = enc_state->shared; + + enc_state->histogram_idx.resize(shared.frame_dim.num_groups); + + enc_state->x_qm_multiplier = + std::pow(1.25f, shared.frame_header.x_qm_scale - 2.0f); + enc_state->b_qm_multiplier = + std::pow(1.25f, shared.frame_header.b_qm_scale - 2.0f); + + if (enc_state->coeffs.size() < shared.frame_header.passes.num_passes) { + enc_state->coeffs.reserve(shared.frame_header.passes.num_passes); + for (size_t i = enc_state->coeffs.size(); + i < shared.frame_header.passes.num_passes; i++) { + // Allocate enough coefficients for each group on every row. + enc_state->coeffs.emplace_back(make_unique>( + kGroupDim * kGroupDim, shared.frame_dim.num_groups)); + } + } + while (enc_state->coeffs.size() > shared.frame_header.passes.num_passes) { + enc_state->coeffs.pop_back(); + } + + float scale = + shared.quantizer.ScaleGlobalScale(enc_state->cparams.quant_ac_rescale); + DequantMatricesScaleDC(&shared.matrices, scale); + shared.quantizer.RecomputeFromGlobalScale(); + + Image3F dc(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, shared.frame_dim.num_groups, ThreadPool::NoInit, + [&](size_t group_idx, size_t _) { + ComputeCoefficients(group_idx, enc_state, opsin, &dc); + }, + "Compute coeffs")); + + if (shared.frame_header.flags & FrameHeader::kUseDcFrame) { + CompressParams cparams = enc_state->cparams; + cparams.dots = Override::kOff; + cparams.noise = Override::kOff; + cparams.patches = Override::kOff; + cparams.gaborish = Override::kOff; + cparams.epf = 0; + cparams.resampling = 1; + cparams.ec_resampling = 1; + // The DC frame will have alpha=0. Don't erase its contents. + cparams.keep_invisible = Override::kOn; + JXL_ASSERT(cparams.progressive_dc > 0); + cparams.progressive_dc--; + // Use kVarDCT in max_error_mode for intermediate progressive DC, + // and kModular for the smallest DC (first in the bitstream) + if (cparams.progressive_dc == 0) { + cparams.modular_mode = true; + cparams.speed_tier = + SpeedTier(std::max(static_cast(SpeedTier::kTortoise), + static_cast(cparams.speed_tier) - 1)); + cparams.butteraugli_distance = + std::max(kMinButteraugliDistance, + enc_state->cparams.butteraugli_distance * 0.02f); + } else { + cparams.max_error_mode = true; + for (size_t c = 0; c < 3; c++) { + cparams.max_error[c] = shared.quantizer.MulDC()[c]; + } + // Guess a distance that produces good initial results. + cparams.butteraugli_distance = + std::max(kMinButteraugliDistance, + enc_state->cparams.butteraugli_distance * 0.1f); + } + ImageBundle ib(&shared.metadata->m); + // This is a lie - dc is in XYB + // (but EncodeFrame will skip RGB->XYB conversion anyway) + ib.SetFromImage( + std::move(dc), + ColorEncoding::LinearSRGB(shared.metadata->m.color_encoding.IsGray())); + if (!ib.metadata()->extra_channel_info.empty()) { + // Add dummy extra channels to the patch image: dc_level frames do not yet + // support extra channels, but the codec expects that the amount of extra + // channels in frames matches that in the metadata of the codestream. + std::vector extra_channels; + extra_channels.reserve(ib.metadata()->extra_channel_info.size()); + for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) { + extra_channels.emplace_back(ib.xsize(), ib.ysize()); + // Must initialize the image with data to not affect blending with + // uninitialized memory. + // TODO(lode): dc_level must copy and use the real extra channels + // instead. + ZeroFillImage(&extra_channels.back()); + } + ib.SetExtraChannels(std::move(extra_channels)); + } + std::unique_ptr state = + jxl::make_unique(); + + auto special_frame = std::unique_ptr(new BitWriter()); + FrameInfo dc_frame_info; + dc_frame_info.frame_type = FrameType::kDCFrame; + dc_frame_info.dc_level = shared.frame_header.dc_level + 1; + dc_frame_info.ib_needs_color_transform = false; + dc_frame_info.save_before_color_transform = true; // Implicitly true + AuxOut dc_aux_out; + if (aux_out) { + dc_aux_out.debug_prefix = aux_out->debug_prefix; + } + JXL_CHECK(EncodeFrame(cparams, dc_frame_info, shared.metadata, ib, + state.get(), cms, pool, special_frame.get(), + aux_out ? &dc_aux_out : nullptr)); + if (aux_out) { + for (const auto& l : dc_aux_out.layers) { + aux_out->layers[kLayerDC].Assimilate(l); + } + } + const Span encoded = special_frame->GetSpan(); + enc_state->special_frames.emplace_back(std::move(special_frame)); + + ImageBundle decoded(&shared.metadata->m); + std::unique_ptr dec_state = + jxl::make_unique(); + JXL_CHECK( + dec_state->output_encoding_info.SetFromMetadata(*shared.metadata)); + const uint8_t* frame_start = encoded.data(); + size_t encoded_size = encoded.size(); + for (int i = 0; i <= cparams.progressive_dc; ++i) { + JXL_CHECK(DecodeFrame(dec_state.get(), pool, frame_start, encoded_size, + &decoded, *shared.metadata)); + frame_start += decoded.decoded_bytes(); + encoded_size -= decoded.decoded_bytes(); + } + // TODO(lode): shared.frame_header.dc_level should be equal to + // dec_state.shared->frame_header.dc_level - 1 here, since above we set + // dc_frame_info.dc_level = shared.frame_header.dc_level + 1, and + // dc_frame_info.dc_level is used by EncodeFrame. However, if EncodeFrame + // outputs multiple frames, this assumption could be wrong. + shared.dc_storage = + CopyImage(dec_state->shared->dc_frames[shared.frame_header.dc_level]); + ZeroFillImage(&shared.quant_dc); + shared.dc = &shared.dc_storage; + JXL_CHECK(encoded_size == 0); + } else { + auto compute_dc_coeffs = [&](int group_index, int /* thread */) { + modular_frame_encoder->AddVarDCTDC( + dc, group_index, enc_state->cparams.speed_tier < SpeedTier::kFalcon, + enc_state, /*jpeg_transcode=*/false); + }; + JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, shared.frame_dim.num_dc_groups, + ThreadPool::NoInit, compute_dc_coeffs, + "Compute DC coeffs")); + // TODO(veluca): this is only useful in tests and if inspection is enabled. + if (!(shared.frame_header.flags & FrameHeader::kSkipAdaptiveDCSmoothing)) { + AdaptiveDCSmoothing(shared.quantizer.MulDC(), &shared.dc_storage, pool); + } + } + auto compute_ac_meta = [&](int group_index, int /* thread */) { + modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/false, + enc_state); + }; + JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, shared.frame_dim.num_dc_groups, + ThreadPool::NoInit, compute_ac_meta, + "Compute AC Metadata")); + + if (aux_out != nullptr) { + aux_out->InspectImage3F("compressed_image:InitializeFrameEncCache:dc_dec", + shared.dc_storage); + } + return true; +} + +void EncCache::InitOnce() { + PROFILER_FUNC; + + if (num_nzeroes.xsize() == 0) { + num_nzeroes = Image3I(kGroupDimInBlocks, kGroupDimInBlocks); + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_cache.h b/third_party/jpeg-xl/lib/jxl/enc_cache.h new file mode 100644 index 0000000000..6c7870ba00 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_cache.h @@ -0,0 +1,93 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_CACHE_H_ +#define LIB_JXL_ENC_CACHE_H_ + +#include +#include + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_heuristics.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_progressive_split.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +struct AuxOut; + +// Contains encoder state. +struct PassesEncoderState { + PassesSharedState shared; + + ImageF initial_quant_field; // Invalid in Falcon mode. + ImageF initial_quant_masking; // Invalid in Falcon mode. + + // Per-pass DCT coefficients for the image. One row per group. + std::vector> coeffs; + + // Raw data for special (reference+DC) frames. + std::vector> special_frames; + + // For splitting into passes. + ProgressiveSplitter progressive_splitter; + + CompressParams cparams; + + struct PassData { + std::vector> ac_tokens; + std::vector context_map; + EntropyEncodingData codes; + }; + + std::vector passes; + std::vector histogram_idx; + + // Coefficient orders that are non-default. + std::vector used_orders; + + // Multiplier to be applied to the quant matrices of the x channel. + float x_qm_multiplier = 1.0f; + float b_qm_multiplier = 1.0f; + + // Heuristics to be used by the encoder. + std::unique_ptr heuristics = + make_unique(); +}; + +// Initialize per-frame information. +class ModularFrameEncoder; +Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms, + ThreadPool* pool, + PassesEncoderState* passes_enc_state, + ModularFrameEncoder* modular_frame_encoder, + AuxOut* aux_out); + +// Working area for ComputeCoefficients (per-group!) +struct EncCache { + // Allocates memory when first called, shrinks images to current group size. + void InitOnce(); + + // TokenizeCoefficients + Image3I num_nzeroes; +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_CACHE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc new file mode 100644 index 0000000000..0cdd2a7823 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc @@ -0,0 +1,409 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_chroma_from_luma.h" + +#include +#include + +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_chroma_from_luma.cc" +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_transforms-inl.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/quantizer.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Abs; +using hwy::HWY_NAMESPACE::Ge; +using hwy::HWY_NAMESPACE::GetLane; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::Lt; + +static HWY_FULL(float) df; + +struct CFLFunction { + static constexpr float kCoeff = 1.f / 3; + static constexpr float kThres = 100.0f; + static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor; + CFLFunction(const float* values_m, const float* values_s, size_t num, + float base, float distance_mul) + : values_m(values_m), + values_s(values_s), + num(num), + base(base), + distance_mul(distance_mul) {} + + // Returns f'(x), where f is 1/3 * sum ((|color residual| + 1)^2-1) + + // distance_mul * x^2 * num. + float Compute(float x, float eps, float* fpeps, float* fmeps) const { + float first_derivative = 2 * distance_mul * num * x; + float first_derivative_peps = 2 * distance_mul * num * (x + eps); + float first_derivative_meps = 2 * distance_mul * num * (x - eps); + + const auto inv_color_factor = Set(df, kInvColorFactor); + const auto thres = Set(df, kThres); + const auto coeffx2 = Set(df, kCoeff * 2.0f); + const auto one = Set(df, 1.0f); + const auto zero = Set(df, 0.0f); + const auto base_v = Set(df, base); + const auto x_v = Set(df, x); + const auto xpe_v = Set(df, x + eps); + const auto xme_v = Set(df, x - eps); + auto fd_v = Zero(df); + auto fdpe_v = Zero(df); + auto fdme_v = Zero(df); + JXL_ASSERT(num % Lanes(df) == 0); + + for (size_t i = 0; i < num; i += Lanes(df)) { + // color residual = ax + b + const auto a = Mul(inv_color_factor, Load(df, values_m + i)); + const auto b = + Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i)); + const auto v = MulAdd(a, x_v, b); + const auto vpe = MulAdd(a, xpe_v, b); + const auto vme = MulAdd(a, xme_v, b); + const auto av = Abs(v); + const auto avpe = Abs(vpe); + const auto avme = Abs(vme); + const auto acoeffx2 = Mul(coeffx2, a); + auto d = Mul(acoeffx2, Add(av, one)); + auto dpe = Mul(acoeffx2, Add(avpe, one)); + auto dme = Mul(acoeffx2, Add(avme, one)); + d = IfThenElse(Lt(v, zero), Sub(zero, d), d); + dpe = IfThenElse(Lt(vpe, zero), Sub(zero, dpe), dpe); + dme = IfThenElse(Lt(vme, zero), Sub(zero, dme), dme); + const auto above = Ge(av, thres); + // TODO(eustas): use IfThenElseZero + fd_v = Add(fd_v, IfThenElse(above, zero, d)); + fdpe_v = Add(fdpe_v, IfThenElse(above, zero, dpe)); + fdme_v = Add(fdme_v, IfThenElse(above, zero, dme)); + } + + *fpeps = first_derivative_peps + GetLane(SumOfLanes(df, fdpe_v)); + *fmeps = first_derivative_meps + GetLane(SumOfLanes(df, fdme_v)); + return first_derivative + GetLane(SumOfLanes(df, fd_v)); + } + + const float* JXL_RESTRICT values_m; + const float* JXL_RESTRICT values_s; + size_t num; + float base; + float distance_mul; +}; + +// Chroma-from-luma search, values_m will have luma -- and values_s chroma. +int32_t FindBestMultiplier(const float* values_m, const float* values_s, + size_t num, float base, float distance_mul, + bool fast) { + if (num == 0) { + return 0; + } + float x; + if (fast) { + static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor; + auto ca = Zero(df); + auto cb = Zero(df); + const auto inv_color_factor = Set(df, kInvColorFactor); + const auto base_v = Set(df, base); + for (size_t i = 0; i < num; i += Lanes(df)) { + // color residual = ax + b + const auto a = Mul(inv_color_factor, Load(df, values_m + i)); + const auto b = + Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i)); + ca = MulAdd(a, a, ca); + cb = MulAdd(a, b, cb); + } + // + distance_mul * x^2 * num + x = -GetLane(SumOfLanes(df, cb)) / + (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f); + } else { + constexpr float eps = 100; + constexpr float kClamp = 20.0f; + CFLFunction fn(values_m, values_s, num, base, distance_mul); + x = 0; + // Up to 20 Newton iterations, with approximate derivatives. + // Derivatives are approximate due to the high amount of noise in the exact + // derivatives. + for (size_t i = 0; i < 20; i++) { + float dfpeps, dfmeps; + float df = fn.Compute(x, eps, &dfpeps, &dfmeps); + float ddf = (dfpeps - dfmeps) / (2 * eps); + float kExperimentalInsignificantStabilizer = 0.85; + float step = df / (ddf + kExperimentalInsignificantStabilizer); + x -= std::min(kClamp, std::max(-kClamp, step)); + if (std::abs(step) < 3e-3) break; + } + } + // CFL seems to be tricky for larger transforms for HF components + // close to zero. This heuristic brings the solutions closer to zero + // and reduces red-green oscillations. + float towards_zero = 2.6; + if (x >= towards_zero) { + x -= towards_zero; + } else if (x <= -towards_zero) { + x += towards_zero; + } else { + x = 0; + } + return std::max(-128.0f, std::min(127.0f, roundf(x))); +} + +void InitDCStorage(size_t num_blocks, ImageF* dc_values) { + // First row: Y channel + // Second row: X channel + // Third row: Y channel + // Fourth row: B channel + *dc_values = ImageF(RoundUpTo(num_blocks, Lanes(df)), 4); + + JXL_ASSERT(dc_values->xsize() != 0); + // Zero-fill the last lanes + for (size_t y = 0; y < 4; y++) { + for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize(); + x++) { + dc_values->Row(y)[x] = 0; + } + } +} + +void ComputeDC(const ImageF& dc_values, bool fast, int32_t* dc_x, + int32_t* dc_b) { + constexpr float kDistanceMultiplierDC = 1e-5f; + const float* JXL_RESTRICT dc_values_yx = dc_values.Row(0); + const float* JXL_RESTRICT dc_values_x = dc_values.Row(1); + const float* JXL_RESTRICT dc_values_yb = dc_values.Row(2); + const float* JXL_RESTRICT dc_values_b = dc_values.Row(3); + *dc_x = FindBestMultiplier(dc_values_yx, dc_values_x, dc_values.xsize(), 0.0f, + kDistanceMultiplierDC, fast); + *dc_b = FindBestMultiplier(dc_values_yb, dc_values_b, dc_values.xsize(), + kYToBRatio, kDistanceMultiplierDC, fast); +} + +void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant, + const AcStrategyImage* ac_strategy, + const ImageI* raw_quant_field, const Quantizer* quantizer, + const Rect& r, bool fast, bool use_dct8, ImageSB* map_x, + ImageSB* map_b, ImageF* dc_values, float* mem) { + static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks, + "Invalid color tile dim"); + size_t xsize_blocks = opsin.xsize() / kBlockDim; + constexpr float kDistanceMultiplierAC = 1e-9f; + + const size_t y0 = r.y0(); + const size_t x0 = r.x0(); + const size_t x1 = r.x0() + r.xsize(); + const size_t y1 = r.y0() + r.ysize(); + + int ty = y0 / kColorTileDimInBlocks; + int tx = x0 / kColorTileDimInBlocks; + + int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty); + int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty); + + float* JXL_RESTRICT dc_values_yx = dc_values->Row(0); + float* JXL_RESTRICT dc_values_x = dc_values->Row(1); + float* JXL_RESTRICT dc_values_yb = dc_values->Row(2); + float* JXL_RESTRICT dc_values_b = dc_values->Row(3); + + // All are aligned. + float* HWY_RESTRICT block_y = mem; + float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea; + float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea; + float* HWY_RESTRICT coeffs_yx = block_b + AcStrategy::kMaxCoeffArea; + float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim; + float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim; + float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim; + float* HWY_RESTRICT scratch_space = coeffs_b + kColorTileDim * kColorTileDim; + JXL_DASSERT(scratch_space + 2 * AcStrategy::kMaxCoeffArea == + block_y + CfLHeuristics::kItemsPerThread); + + // Small (~256 bytes each) + HWY_ALIGN_MAX float + dc_y[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {}; + HWY_ALIGN_MAX float + dc_x[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {}; + HWY_ALIGN_MAX float + dc_b[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {}; + size_t num_ac = 0; + + for (size_t y = y0; y < y1; ++y) { + const float* JXL_RESTRICT row_y = opsin.ConstPlaneRow(1, y * kBlockDim); + const float* JXL_RESTRICT row_x = opsin.ConstPlaneRow(0, y * kBlockDim); + const float* JXL_RESTRICT row_b = opsin.ConstPlaneRow(2, y * kBlockDim); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = x0; x < x1; x++) { + AcStrategy acs = use_dct8 + ? AcStrategy::FromRawStrategy(AcStrategy::Type::DCT) + : ac_strategy->ConstRow(y)[x]; + if (!acs.IsFirstBlock()) continue; + size_t xs = acs.covered_blocks_x(); + TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride, + block_y, scratch_space); + DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs); + TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride, + block_x, scratch_space); + DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs); + TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride, + block_b, scratch_space); + DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs); + const float* const JXL_RESTRICT qm_x = + dequant.InvMatrix(acs.Strategy(), 0); + const float* const JXL_RESTRICT qm_b = + dequant.InvMatrix(acs.Strategy(), 2); + float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0); + float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2); + + // Copy DCs in dc_values. + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < xs; ix++) { + dc_values_yx[(iy + y) * xsize_blocks + ix + x] = + dc_y[iy * xs + ix] * q_dc_x; + dc_values_x[(iy + y) * xsize_blocks + ix + x] = + dc_x[iy * xs + ix] * q_dc_x; + dc_values_yb[(iy + y) * xsize_blocks + ix + x] = + dc_y[iy * xs + ix] * q_dc_b; + dc_values_b[(iy + y) * xsize_blocks + ix + x] = + dc_b[iy * xs + ix] * q_dc_b; + } + } + + // Do not use this block for computing AC CfL. + if (acs.covered_blocks_x() + x0 > x1 || + acs.covered_blocks_y() + y0 > y1) { + continue; + } + + // Copy AC coefficients in the local block. The order in which + // coefficients get stored does not matter. + size_t cx = acs.covered_blocks_x(); + size_t cy = acs.covered_blocks_y(); + CoefficientLayout(&cy, &cx); + // Zero out LFs. This introduces terms in the optimization loop that + // don't affect the result, as they are all 0, but allow for simpler + // SIMDfication. + for (size_t iy = 0; iy < cy; iy++) { + for (size_t ix = 0; ix < cx; ix++) { + block_y[cx * kBlockDim * iy + ix] = 0; + block_x[cx * kBlockDim * iy + ix] = 0; + block_b[cx * kBlockDim * iy + ix] = 0; + } + } + // Unclear why this is like it is. (This works slightly better + // than the previous approach which was also a hack.) + const float qq = + (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x]; + // Experimentally values 128-130 seem best -- I don't know why we + // need this multiplier. + const float kStrangeMultiplier = 128; + float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq; + const auto qv = Set(df, q); + for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) { + const auto b_y = Load(df, block_y + i); + const auto b_x = Load(df, block_x + i); + const auto b_b = Load(df, block_b + i); + const auto qqm_x = Mul(qv, Load(df, qm_x + i)); + const auto qqm_b = Mul(qv, Load(df, qm_b + i)); + Store(Mul(b_y, qqm_x), df, coeffs_yx + num_ac); + Store(Mul(b_x, qqm_x), df, coeffs_x + num_ac); + Store(Mul(b_y, qqm_b), df, coeffs_yb + num_ac); + Store(Mul(b_b, qqm_b), df, coeffs_b + num_ac); + num_ac += Lanes(df); + } + } + } + JXL_CHECK(num_ac % Lanes(df) == 0); + row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f, + kDistanceMultiplierAC, fast); + row_out_b[tx] = FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, kYToBRatio, + kDistanceMultiplierAC, fast); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(InitDCStorage); +HWY_EXPORT(ComputeDC); +HWY_EXPORT(ComputeTile); + +void CfLHeuristics::Init(const Image3F& opsin) { + size_t xsize_blocks = opsin.xsize() / kBlockDim; + size_t ysize_blocks = opsin.ysize() / kBlockDim; + HWY_DYNAMIC_DISPATCH(InitDCStorage) + (xsize_blocks * ysize_blocks, &dc_values); +} + +void CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin, + const DequantMatrices& dequant, + const AcStrategyImage* ac_strategy, + const ImageI* raw_quant_field, + const Quantizer* quantizer, bool fast, + size_t thread, ColorCorrelationMap* cmap) { + bool use_dct8 = ac_strategy == nullptr; + HWY_DYNAMIC_DISPATCH(ComputeTile) + (opsin, dequant, ac_strategy, raw_quant_field, quantizer, r, fast, use_dct8, + &cmap->ytox_map, &cmap->ytob_map, &dc_values, + mem.get() + thread * kItemsPerThread); +} + +void CfLHeuristics::ComputeDC(bool fast, ColorCorrelationMap* cmap) { + int32_t ytob_dc = 0; + int32_t ytox_dc = 0; + HWY_DYNAMIC_DISPATCH(ComputeDC)(dc_values, fast, &ytox_dc, &ytob_dc); + cmap->SetYToBDC(ytob_dc); + cmap->SetYToXDC(ytox_dc); +} + +void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer, + size_t layer, AuxOut* aux_out) { + float color_factor = map->GetColorFactor(); + float base_correlation_x = map->GetBaseCorrelationX(); + float base_correlation_b = map->GetBaseCorrelationB(); + int32_t ytox_dc = map->GetYToXDC(); + int32_t ytob_dc = map->GetYToBDC(); + + BitWriter::Allotment allotment(writer, 1 + 2 * kBitsPerByte + 12 + 32); + if (ytox_dc == 0 && ytob_dc == 0 && color_factor == kDefaultColorFactor && + base_correlation_x == 0.0f && base_correlation_b == kYToBRatio) { + writer->Write(1, 1); + allotment.ReclaimAndCharge(writer, layer, aux_out); + return; + } + writer->Write(1, 0); + JXL_CHECK(U32Coder::Write(kColorFactorDist, color_factor, writer)); + JXL_CHECK(F16Coder::Write(base_correlation_x, writer)); + JXL_CHECK(F16Coder::Write(base_correlation_b, writer)); + writer->Write(kBitsPerByte, ytox_dc - std::numeric_limits::min()); + writer->Write(kBitsPerByte, ytob_dc - std::numeric_limits::min()); + allotment.ReclaimAndCharge(writer, layer, aux_out); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h new file mode 100644 index 0000000000..899b91b041 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h @@ -0,0 +1,68 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_CHROMA_FROM_LUMA_H_ +#define LIB_JXL_ENC_CHROMA_FROM_LUMA_H_ + +// Chroma-from-luma, computed using heuristics to determine the best linear +// model for the X and B channels from the Y channel. + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quant_weights.h" + +namespace jxl { + +struct AuxOut; +class Quantizer; + +void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer, + size_t layer, AuxOut* aux_out); + +struct CfLHeuristics { + void Init(const Image3F& opsin); + + void PrepareForThreads(size_t num_threads) { + mem = hwy::AllocateAligned(num_threads * kItemsPerThread); + } + + void ComputeTile(const Rect& r, const Image3F& opsin, + const DequantMatrices& dequant, + const AcStrategyImage* ac_strategy, + const ImageI* raw_quant_field, const Quantizer* quantizer, + bool fast, size_t thread, ColorCorrelationMap* cmap); + + void ComputeDC(bool fast, ColorCorrelationMap* cmap); + + ImageF dc_values; + hwy::AlignedFreeUniquePtr mem; + + // Working set is too large for stack; allocate dynamically. + constexpr static size_t kItemsPerThread = + AcStrategy::kMaxCoeffArea * 3 // Blocks + + kColorTileDim * kColorTileDim * 4 // AC coeff storage + + AcStrategy::kMaxCoeffArea * 2; // Scratch space +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_CHROMA_FROM_LUMA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_cluster.cc b/third_party/jpeg-xl/lib/jxl/enc_cluster.cc new file mode 100644 index 0000000000..c79b3ac834 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_cluster.cc @@ -0,0 +1,295 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_cluster.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_cluster.cc" +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/fast_math-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Eq; +using hwy::HWY_NAMESPACE::IfThenZeroElse; + +template +V Entropy(V count, V inv_total, V total) { + const HWY_CAPPED(float, Histogram::kRounding) d; + const auto zero = Set(d, 0.0f); + // TODO(eustas): why (0 - x) instead of Neg(x)? + return IfThenZeroElse( + Eq(count, total), + Sub(zero, Mul(count, FastLog2f(d, Mul(inv_total, count))))); +} + +void HistogramEntropy(const Histogram& a) { + a.entropy_ = 0.0f; + if (a.total_count_ == 0) return; + + const HWY_CAPPED(float, Histogram::kRounding) df; + const HWY_CAPPED(int32_t, Histogram::kRounding) di; + + const auto inv_tot = Set(df, 1.0f / a.total_count_); + auto entropy_lanes = Zero(df); + auto total = Set(df, a.total_count_); + + for (size_t i = 0; i < a.data_.size(); i += Lanes(di)) { + const auto counts = LoadU(di, &a.data_[i]); + entropy_lanes = + Add(entropy_lanes, Entropy(ConvertTo(df, counts), inv_tot, total)); + } + a.entropy_ += GetLane(SumOfLanes(df, entropy_lanes)); +} + +float HistogramDistance(const Histogram& a, const Histogram& b) { + if (a.total_count_ == 0 || b.total_count_ == 0) return 0; + + const HWY_CAPPED(float, Histogram::kRounding) df; + const HWY_CAPPED(int32_t, Histogram::kRounding) di; + + const auto inv_tot = Set(df, 1.0f / (a.total_count_ + b.total_count_)); + auto distance_lanes = Zero(df); + auto total = Set(df, a.total_count_ + b.total_count_); + + for (size_t i = 0; i < std::max(a.data_.size(), b.data_.size()); + i += Lanes(di)) { + const auto a_counts = + a.data_.size() > i ? LoadU(di, &a.data_[i]) : Zero(di); + const auto b_counts = + b.data_.size() > i ? LoadU(di, &b.data_[i]) : Zero(di); + const auto counts = ConvertTo(df, Add(a_counts, b_counts)); + distance_lanes = Add(distance_lanes, Entropy(counts, inv_tot, total)); + } + const float total_distance = GetLane(SumOfLanes(df, distance_lanes)); + return total_distance - a.entropy_ - b.entropy_; +} + +// First step of a k-means clustering with a fancy distance metric. +void FastClusterHistograms(const std::vector& in, + size_t max_histograms, std::vector* out, + std::vector* histogram_symbols) { + PROFILER_FUNC; + out->clear(); + out->reserve(max_histograms); + histogram_symbols->clear(); + histogram_symbols->resize(in.size(), max_histograms); + + std::vector dists(in.size(), std::numeric_limits::max()); + size_t largest_idx = 0; + for (size_t i = 0; i < in.size(); i++) { + if (in[i].total_count_ == 0) { + (*histogram_symbols)[i] = 0; + dists[i] = 0.0f; + continue; + } + HistogramEntropy(in[i]); + if (in[i].total_count_ > in[largest_idx].total_count_) { + largest_idx = i; + } + } + + constexpr float kMinDistanceForDistinct = 48.0f; + while (out->size() < max_histograms) { + (*histogram_symbols)[largest_idx] = out->size(); + out->push_back(in[largest_idx]); + dists[largest_idx] = 0.0f; + largest_idx = 0; + for (size_t i = 0; i < in.size(); i++) { + if (dists[i] == 0.0f) continue; + dists[i] = std::min(HistogramDistance(in[i], out->back()), dists[i]); + if (dists[i] > dists[largest_idx]) largest_idx = i; + } + if (dists[largest_idx] < kMinDistanceForDistinct) break; + } + + for (size_t i = 0; i < in.size(); i++) { + if ((*histogram_symbols)[i] != max_histograms) continue; + size_t best = 0; + float best_dist = HistogramDistance(in[i], (*out)[best]); + for (size_t j = 1; j < out->size(); j++) { + float dist = HistogramDistance(in[i], (*out)[j]); + if (dist < best_dist) { + best = j; + best_dist = dist; + } + } + (*out)[best].AddHistogram(in[i]); + HistogramEntropy((*out)[best]); + (*histogram_symbols)[i] = best; + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(FastClusterHistograms); // Local function +HWY_EXPORT(HistogramEntropy); // Local function + +float Histogram::ShannonEntropy() const { + HWY_DYNAMIC_DISPATCH(HistogramEntropy)(*this); + return entropy_; +} + +namespace { +// ----------------------------------------------------------------------------- +// Histogram refinement + +// Reorder histograms in *out so that the new symbols in *symbols come in +// increasing order. +void HistogramReindex(std::vector* out, + std::vector* symbols) { + std::vector tmp(*out); + std::map new_index; + int next_index = 0; + for (uint32_t symbol : *symbols) { + if (new_index.find(symbol) == new_index.end()) { + new_index[symbol] = next_index; + (*out)[next_index] = tmp[symbol]; + ++next_index; + } + } + out->resize(next_index); + for (uint32_t& symbol : *symbols) { + symbol = new_index[symbol]; + } +} + +} // namespace + +// Clusters similar histograms in 'in' together, the selected histograms are +// placed in 'out', and for each index in 'in', *histogram_symbols will +// indicate which of the 'out' histograms is the best approximation. +void ClusterHistograms(const HistogramParams params, + const std::vector& in, size_t max_histograms, + std::vector* out, + std::vector* histogram_symbols) { + max_histograms = std::min(max_histograms, params.max_histograms); + max_histograms = std::min(max_histograms, in.size()); + if (params.clustering == HistogramParams::ClusteringType::kFastest) { + max_histograms = std::min(max_histograms, static_cast(4)); + } + + HWY_DYNAMIC_DISPATCH(FastClusterHistograms) + (in, max_histograms, out, histogram_symbols); + + if (params.clustering == HistogramParams::ClusteringType::kBest) { + for (size_t i = 0; i < out->size(); i++) { + (*out)[i].entropy_ = + ANSPopulationCost((*out)[i].data_.data(), (*out)[i].data_.size()); + } + uint32_t next_version = 2; + std::vector version(out->size(), 1); + std::vector renumbering(out->size()); + std::iota(renumbering.begin(), renumbering.end(), 0); + + // Try to pair up clusters if doing so reduces the total cost. + + struct HistogramPair { + // validity of a pair: p.version == max(version[i], version[j]) + float cost; + uint32_t first; + uint32_t second; + uint32_t version; + // We use > because priority queues sort in *decreasing* order, but we + // want lower cost elements to appear first. + bool operator<(const HistogramPair& other) const { + return std::make_tuple(cost, first, second, version) > + std::make_tuple(other.cost, other.first, other.second, + other.version); + } + }; + + // Create list of all pairs by increasing merging cost. + std::priority_queue pairs_to_merge; + for (uint32_t i = 0; i < out->size(); i++) { + for (uint32_t j = i + 1; j < out->size(); j++) { + Histogram histo; + histo.AddHistogram((*out)[i]); + histo.AddHistogram((*out)[j]); + float cost = ANSPopulationCost(histo.data_.data(), histo.data_.size()) - + (*out)[i].entropy_ - (*out)[j].entropy_; + // Avoid enqueueing pairs that are not advantageous to merge. + if (cost >= 0) continue; + pairs_to_merge.push( + HistogramPair{cost, i, j, std::max(version[i], version[j])}); + } + } + + // Merge the best pair to merge, add new pairs that get formed as a + // consequence. + while (!pairs_to_merge.empty()) { + uint32_t first = pairs_to_merge.top().first; + uint32_t second = pairs_to_merge.top().second; + uint32_t ver = pairs_to_merge.top().version; + pairs_to_merge.pop(); + if (ver != std::max(version[first], version[second]) || + version[first] == 0 || version[second] == 0) { + continue; + } + (*out)[first].AddHistogram((*out)[second]); + (*out)[first].entropy_ = ANSPopulationCost((*out)[first].data_.data(), + (*out)[first].data_.size()); + for (size_t i = 0; i < renumbering.size(); i++) { + if (renumbering[i] == second) { + renumbering[i] = first; + } + } + version[second] = 0; + version[first] = next_version++; + for (uint32_t j = 0; j < out->size(); j++) { + if (j == first) continue; + if (version[j] == 0) continue; + Histogram histo; + histo.AddHistogram((*out)[first]); + histo.AddHistogram((*out)[j]); + float cost = ANSPopulationCost(histo.data_.data(), histo.data_.size()) - + (*out)[first].entropy_ - (*out)[j].entropy_; + // Avoid enqueueing pairs that are not advantageous to merge. + if (cost >= 0) continue; + pairs_to_merge.push( + HistogramPair{cost, std::min(first, j), std::max(first, j), + std::max(version[first], version[j])}); + } + } + std::vector reverse_renumbering(out->size(), -1); + size_t num_alive = 0; + for (size_t i = 0; i < out->size(); i++) { + if (version[i] == 0) continue; + (*out)[num_alive++] = (*out)[i]; + reverse_renumbering[i] = num_alive - 1; + } + out->resize(num_alive); + for (size_t i = 0; i < histogram_symbols->size(); i++) { + (*histogram_symbols)[i] = + reverse_renumbering[renumbering[(*histogram_symbols)[i]]]; + } + } + + // Convert the context map to a canonical form. + HistogramReindex(out, histogram_symbols); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/enc_cluster.h b/third_party/jpeg-xl/lib/jxl/enc_cluster.h new file mode 100644 index 0000000000..4b062e820c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_cluster.h @@ -0,0 +1,63 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Functions for clustering similar histograms together. + +#ifndef LIB_JXL_ENC_CLUSTER_H_ +#define LIB_JXL_ENC_CLUSTER_H_ + +#include +#include +#include + +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/enc_ans.h" + +namespace jxl { + +struct Histogram { + Histogram() { + total_count_ = 0; + entropy_ = 0.0; + } + void Clear() { + data_.clear(); + total_count_ = 0; + } + void Add(size_t symbol) { + if (data_.size() <= symbol) { + data_.resize(DivCeil(symbol + 1, kRounding) * kRounding); + } + ++data_[symbol]; + ++total_count_; + } + void AddHistogram(const Histogram& other) { + if (other.data_.size() > data_.size()) { + data_.resize(other.data_.size()); + } + for (size_t i = 0; i < other.data_.size(); ++i) { + data_[i] += other.data_[i]; + } + total_count_ += other.total_count_; + } + float PopulationCost() const { + return ANSPopulationCost(data_.data(), data_.size()); + } + float ShannonEntropy() const; + + std::vector data_; + size_t total_count_; + mutable float entropy_; // WARNING: not kept up-to-date. + static constexpr size_t kRounding = 8; +}; + +void ClusterHistograms(HistogramParams params, const std::vector& in, + size_t max_histograms, std::vector* out, + std::vector* histogram_symbols); +} // namespace jxl + +#endif // LIB_JXL_ENC_CLUSTER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc new file mode 100644 index 0000000000..389b53598a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc @@ -0,0 +1,291 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/lehmer_code.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +struct AuxOut; + +std::pair ComputeUsedOrders( + const SpeedTier speed, const AcStrategyImage& ac_strategy, + const Rect& rect) { + // Only uses DCT8 = 0, so bitfield = 1. + if (speed >= SpeedTier::kFalcon) return {1, 1}; + + uint32_t ret = 0; + uint32_t ret_customize = 0; + size_t xsize_blocks = rect.xsize(); + size_t ysize_blocks = rect.ysize(); + // TODO(veluca): precompute when doing DCT. + for (size_t by = 0; by < ysize_blocks; ++by) { + AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by); + for (size_t bx = 0; bx < xsize_blocks; ++bx) { + int ord = kStrategyOrder[acs_row[bx].RawStrategy()]; + // Do not customize coefficient orders for blocks bigger than 32x32. + ret |= 1u << ord; + if (ord > 6) { + continue; + } + ret_customize |= 1u << ord; + } + } + // Use default orders for small images. + if (ac_strategy.xsize() < 5 && ac_strategy.ysize() < 5) return {ret, 0}; + return {ret, ret_customize}; +} + +void ComputeCoeffOrder(SpeedTier speed, const ACImage& acs, + const AcStrategyImage& ac_strategy, + const FrameDimensions& frame_dim, uint32_t& used_orders, + uint16_t used_acs, coeff_order_t* JXL_RESTRICT order) { + std::vector num_zeros(kCoeffOrderMaxSize); + // If compressing at high speed and only using 8x8 DCTs, only consider a + // subset of blocks. + double block_fraction = 1.0f; + // TODO(veluca): figure out why sampling blocks if non-8x8s are used makes + // encoding significantly less dense. + if (speed >= SpeedTier::kSquirrel && used_orders == 1) { + block_fraction = 0.5f; + } + // No need to compute number of zero coefficients if all orders are the + // default. + if (used_orders != 0) { + uint64_t threshold = + (std::numeric_limits::max() >> 32) * block_fraction; + uint64_t s[2] = {static_cast(0x94D049BB133111EBull), + static_cast(0xBF58476D1CE4E5B9ull)}; + // Xorshift128+ adapted from xorshift128+-inl.h + auto use_sample = [&]() { + auto s1 = s[0]; + const auto s0 = s[1]; + const auto bits = s1 + s0; // b, c + s[0] = s0; + s1 ^= s1 << 23; + s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5); + s[1] = s1; + return (bits >> 32) <= threshold; + }; + + // Count number of zero coefficients, separately for each DCT band. + // TODO(veluca): precompute when doing DCT. + for (size_t group_index = 0; group_index < frame_dim.num_groups; + group_index++) { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + const Rect rect(gx * kGroupDimInBlocks, gy * kGroupDimInBlocks, + kGroupDimInBlocks, kGroupDimInBlocks, + frame_dim.xsize_blocks, frame_dim.ysize_blocks); + ConstACPtr rows[3]; + ACType type = acs.Type(); + for (size_t c = 0; c < 3; c++) { + rows[c] = acs.PlaneRow(c, group_index, 0); + } + size_t ac_offset = 0; + + // TODO(veluca): SIMDfy. + for (size_t by = 0; by < rect.ysize(); ++by) { + AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by); + for (size_t bx = 0; bx < rect.xsize(); ++bx) { + AcStrategy acs = acs_row[bx]; + if (!acs.IsFirstBlock()) continue; + if (!use_sample()) continue; + size_t size = kDCTBlockSize << acs.log2_covered_blocks(); + for (size_t c = 0; c < 3; ++c) { + const size_t order_offset = + CoeffOrderOffset(kStrategyOrder[acs.RawStrategy()], c); + if (type == ACType::k16) { + for (size_t k = 0; k < size; k++) { + bool is_zero = rows[c].ptr16[ac_offset + k] == 0; + num_zeros[order_offset + k] += is_zero ? 1 : 0; + } + } else { + for (size_t k = 0; k < size; k++) { + bool is_zero = rows[c].ptr32[ac_offset + k] == 0; + num_zeros[order_offset + k] += is_zero ? 1 : 0; + } + } + // Ensure LLFs are first in the order. + size_t cx = acs.covered_blocks_x(); + size_t cy = acs.covered_blocks_y(); + CoefficientLayout(&cy, &cx); + for (size_t iy = 0; iy < cy; iy++) { + for (size_t ix = 0; ix < cx; ix++) { + num_zeros[order_offset + iy * kBlockDim * cx + ix] = -1; + } + } + } + ac_offset += size; + } + } + } + } + struct PosAndCount { + uint32_t pos; + uint32_t count; + }; + auto mem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); + + std::vector natural_order_buffer; + + uint16_t computed = 0; + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + uint8_t ord = kStrategyOrder[o]; + if (computed & (1 << ord)) continue; + computed |= 1 << ord; + AcStrategy acs = AcStrategy::FromRawStrategy(o); + size_t sz = kDCTBlockSize * acs.covered_blocks_x() * acs.covered_blocks_y(); + + // Do nothing for transforms that don't appear. + if ((1 << ord) & ~used_acs) continue; + + if (natural_order_buffer.size() < sz) natural_order_buffer.resize(sz); + acs.ComputeNaturalCoeffOrder(natural_order_buffer.data()); + + // Ensure natural coefficient order is not permuted if the order is + // not transmitted. + if ((1 << ord) & ~used_orders) { + for (size_t c = 0; c < 3; c++) { + size_t offset = CoeffOrderOffset(ord, c); + JXL_DASSERT(CoeffOrderOffset(ord, c + 1) - offset == sz); + memcpy(&order[offset], natural_order_buffer.data(), + sz * sizeof(*order)); + } + continue; + } + + bool is_nondefault = false; + for (uint8_t c = 0; c < 3; c++) { + // Apply zig-zag order. + PosAndCount* pos_and_val = mem.get(); + size_t offset = CoeffOrderOffset(ord, c); + JXL_DASSERT(CoeffOrderOffset(ord, c + 1) - offset == sz); + float inv_sqrt_sz = 1.0f / std::sqrt(sz); + for (size_t i = 0; i < sz; ++i) { + size_t pos = natural_order_buffer[i]; + pos_and_val[i].pos = pos; + // We don't care for the exact number -> quantize number of zeros, + // to get less permuted order. + pos_and_val[i].count = num_zeros[offset + pos] * inv_sqrt_sz + 0.1f; + } + + // Stable-sort -> elements with same number of zeros will preserve their + // order. + auto comparator = [](const PosAndCount& a, const PosAndCount& b) -> bool { + return a.count < b.count; + }; + std::stable_sort(pos_and_val, pos_and_val + sz, comparator); + + // Grab indices. + for (size_t i = 0; i < sz; ++i) { + order[offset + i] = pos_and_val[i].pos; + is_nondefault |= natural_order_buffer[i] != pos_and_val[i].pos; + } + } + if (!is_nondefault) { + used_orders &= ~(1 << ord); + } + } +} + +namespace { + +void TokenizePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip, + size_t size, std::vector* tokens) { + std::vector lehmer(size); + std::vector temp(size + 1); + ComputeLehmerCode(order, temp.data(), size, lehmer.data()); + size_t end = size; + while (end > skip && lehmer[end - 1] == 0) { + --end; + } + tokens->emplace_back(CoeffOrderContext(size), end - skip); + uint32_t last = 0; + for (size_t i = skip; i < end; ++i) { + tokens->emplace_back(CoeffOrderContext(last), lehmer[i]); + last = lehmer[i]; + } +} + +} // namespace + +void EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip, + size_t size, BitWriter* writer, int layer, + AuxOut* aux_out) { + std::vector> tokens(1); + TokenizePermutation(order, skip, size, &tokens[0]); + std::vector context_map; + EntropyEncodingData codes; + BuildAndEncodeHistograms(HistogramParams(), kPermutationContexts, tokens, + &codes, &context_map, writer, layer, aux_out); + WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out); +} + +namespace { +void EncodeCoeffOrder(const coeff_order_t* JXL_RESTRICT order, AcStrategy acs, + std::vector* tokens, coeff_order_t* order_zigzag, + std::vector& natural_order_lut) { + const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y(); + const size_t size = kDCTBlockSize * llf; + for (size_t i = 0; i < size; ++i) { + order_zigzag[i] = natural_order_lut[order[i]]; + } + TokenizePermutation(order_zigzag, llf, size, tokens); +} +} // namespace + +void EncodeCoeffOrders(uint16_t used_orders, + const coeff_order_t* JXL_RESTRICT order, + BitWriter* writer, size_t layer, + AuxOut* JXL_RESTRICT aux_out) { + auto mem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); + uint16_t computed = 0; + std::vector> tokens(1); + std::vector natural_order_lut; + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + uint8_t ord = kStrategyOrder[o]; + if (computed & (1 << ord)) continue; + computed |= 1 << ord; + if ((used_orders & (1 << ord)) == 0) continue; + AcStrategy acs = AcStrategy::FromRawStrategy(o); + const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y(); + const size_t size = kDCTBlockSize * llf; + if (natural_order_lut.size() < size) natural_order_lut.resize(size); + acs.ComputeNaturalCoeffOrderLut(natural_order_lut.data()); + for (size_t c = 0; c < 3; c++) { + EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, &tokens[0], + mem.get(), natural_order_lut); + } + } + // Do not write anything if no order is used. + if (used_orders != 0) { + std::vector context_map; + EntropyEncodingData codes; + BuildAndEncodeHistograms(HistogramParams(), kPermutationContexts, tokens, + &codes, &context_map, writer, layer, aux_out); + WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out); + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_coeff_order.h b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.h new file mode 100644 index 0000000000..3a43f4f986 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.h @@ -0,0 +1,54 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_COEFF_ORDER_H_ +#define LIB_JXL_ENC_COEFF_ORDER_H_ + +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_params.h" + +namespace jxl { + +struct AuxOut; + +// Orders that are actually used in part of image. `rect` is in block units. +// Returns {orders that are used, orders that might be made non-default}. +std::pair ComputeUsedOrders( + SpeedTier speed, const AcStrategyImage& ac_strategy, const Rect& rect); + +// Modify zig-zag order, so that DCT bands with more zeros go later. +// Order of DCT bands with same number of zeros is untouched, so +// permutation will be cheaper to encode. +void ComputeCoeffOrder(SpeedTier speed, const ACImage& acs, + const AcStrategyImage& ac_strategy, + const FrameDimensions& frame_dim, uint32_t& used_orders, + uint16_t used_acs, coeff_order_t* JXL_RESTRICT order); + +void EncodeCoeffOrders(uint16_t used_orders, + const coeff_order_t* JXL_RESTRICT order, + BitWriter* writer, size_t layer, + AuxOut* JXL_RESTRICT aux_out); + +// Encoding/decoding of a single permutation. `size`: number of elements in the +// permutation. `skip`: number of elements to skip from the *beginning* of the +// permutation. +void EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip, + size_t size, BitWriter* writer, int layer, + AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_COEFF_ORDER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_color_management.cc b/third_party/jpeg-xl/lib/jxl/enc_color_management.cc new file mode 100644 index 0000000000..8a23ead473 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_color_management.cc @@ -0,0 +1,1293 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_color_management.h" + +#ifndef JPEGXL_ENABLE_SKCMS +#define JPEGXL_ENABLE_SKCMS 0 +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_color_management.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/matrix_ops.h" +#include "lib/jxl/transfer_functions-inl.h" +#if JPEGXL_ENABLE_SKCMS +#include "lib/jxl/enc_jxl_skcms.h" +#else // JPEGXL_ENABLE_SKCMS +#include "lcms2.h" +#include "lcms2_plugin.h" +#endif // JPEGXL_ENABLE_SKCMS + +#define JXL_CMS_VERBOSE 0 + +// Define these only once. We can't use HWY_ONCE here because it is defined as +// 1 only on the last pass. +#ifndef LIB_JXL_ENC_COLOR_MANAGEMENT_CC_ +#define LIB_JXL_ENC_COLOR_MANAGEMENT_CC_ + +namespace jxl { +namespace { +struct JxlCms { +#if JPEGXL_ENABLE_SKCMS + PaddedBytes icc_src, icc_dst; + skcms_ICCProfile profile_src, profile_dst; +#else + void* lcms_transform; +#endif + + // These fields are used when the HLG OOTF or inverse OOTF must be applied. + bool apply_hlg_ootf; + size_t hlg_ootf_num_channels; + // Y component of the primaries. + std::array hlg_ootf_luminances; + + size_t channels_src; + size_t channels_dst; + ImageF buf_src; + ImageF buf_dst; + float intensity_target; + bool skip_lcms = false; + ExtraTF preprocess = ExtraTF::kNone; + ExtraTF postprocess = ExtraTF::kNone; +}; + +Status ApplyHlgOotf(JxlCms* t, float* JXL_RESTRICT buf, size_t xsize, + bool forward); +} // namespace +} // namespace jxl + +#endif // LIB_JXL_ENC_COLOR_MANAGEMENT_CC_ + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +#if JXL_CMS_VERBOSE >= 2 +const size_t kX = 0; // pixel index, multiplied by 3 for RGB +#endif + +// xform_src = UndoGammaCompression(buf_src). +Status BeforeTransform(JxlCms* t, const float* buf_src, float* xform_src, + size_t buf_size) { + switch (t->preprocess) { + case ExtraTF::kNone: + JXL_DASSERT(false); // unreachable + break; + + case ExtraTF::kPQ: { + // By default, PQ content has an intensity target of 10000, stored + // exactly. + HWY_FULL(float) df; + const auto multiplier = Set(df, t->intensity_target == 10000.f + ? 1.0f + : 10000.f / t->intensity_target); + for (size_t i = 0; i < buf_size; i += Lanes(df)) { + const auto val = Load(df, buf_src + i); + const auto result = + Mul(multiplier, TF_PQ().DisplayFromEncoded(df, val)); + Store(result, df, xform_src + i); + } +#if JXL_CMS_VERBOSE >= 2 + printf("pre in %.4f %.4f %.4f undoPQ %.4f %.4f %.4f\n", buf_src[3 * kX], + buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX], + xform_src[3 * kX + 1], xform_src[3 * kX + 2]); +#endif + break; + } + + case ExtraTF::kHLG: + for (size_t i = 0; i < buf_size; ++i) { + xform_src[i] = static_cast( + TF_HLG().DisplayFromEncoded(static_cast(buf_src[i]))); + } + if (t->apply_hlg_ootf) { + JXL_RETURN_IF_ERROR( + ApplyHlgOotf(t, xform_src, buf_size, /*forward=*/true)); + } +#if JXL_CMS_VERBOSE >= 2 + printf("pre in %.4f %.4f %.4f undoHLG %.4f %.4f %.4f\n", buf_src[3 * kX], + buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX], + xform_src[3 * kX + 1], xform_src[3 * kX + 2]); +#endif + break; + + case ExtraTF::kSRGB: + HWY_FULL(float) df; + for (size_t i = 0; i < buf_size; i += Lanes(df)) { + const auto val = Load(df, buf_src + i); + const auto result = TF_SRGB().DisplayFromEncoded(val); + Store(result, df, xform_src + i); + } +#if JXL_CMS_VERBOSE >= 2 + printf("pre in %.4f %.4f %.4f undoSRGB %.4f %.4f %.4f\n", buf_src[3 * kX], + buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX], + xform_src[3 * kX + 1], xform_src[3 * kX + 2]); +#endif + break; + } + return true; +} + +// Applies gamma compression in-place. +Status AfterTransform(JxlCms* t, float* JXL_RESTRICT buf_dst, size_t buf_size) { + switch (t->postprocess) { + case ExtraTF::kNone: + JXL_DASSERT(false); // unreachable + break; + case ExtraTF::kPQ: { + HWY_FULL(float) df; + const auto multiplier = + Set(df, t->intensity_target == 10000.f ? 1.0f + : t->intensity_target * 1e-4f); + for (size_t i = 0; i < buf_size; i += Lanes(df)) { + const auto val = Load(df, buf_dst + i); + const auto result = + TF_PQ().EncodedFromDisplay(df, Mul(multiplier, val)); + Store(result, df, buf_dst + i); + } +#if JXL_CMS_VERBOSE >= 2 + printf("after PQ enc %.4f %.4f %.4f\n", buf_dst[3 * kX], + buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]); +#endif + break; + } + case ExtraTF::kHLG: + if (t->apply_hlg_ootf) { + JXL_RETURN_IF_ERROR( + ApplyHlgOotf(t, buf_dst, buf_size, /*forward=*/false)); + } + for (size_t i = 0; i < buf_size; ++i) { + buf_dst[i] = static_cast( + TF_HLG().EncodedFromDisplay(static_cast(buf_dst[i]))); + } +#if JXL_CMS_VERBOSE >= 2 + printf("after HLG enc %.4f %.4f %.4f\n", buf_dst[3 * kX], + buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]); +#endif + break; + case ExtraTF::kSRGB: + HWY_FULL(float) df; + for (size_t i = 0; i < buf_size; i += Lanes(df)) { + const auto val = Load(df, buf_dst + i); + const auto result = + TF_SRGB().EncodedFromDisplay(HWY_FULL(float)(), val); + Store(result, df, buf_dst + i); + } +#if JXL_CMS_VERBOSE >= 2 + printf("after SRGB enc %.4f %.4f %.4f\n", buf_dst[3 * kX], + buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]); +#endif + break; + } + return true; +} + +Status DoColorSpaceTransform(void* cms_data, const size_t thread, + const float* buf_src, float* buf_dst, + size_t xsize) { + // No lock needed. + JxlCms* t = reinterpret_cast(cms_data); + + const float* xform_src = buf_src; // Read-only. + if (t->preprocess != ExtraTF::kNone) { + float* mutable_xform_src = t->buf_src.Row(thread); // Writable buffer. + JXL_RETURN_IF_ERROR(BeforeTransform(t, buf_src, mutable_xform_src, + xsize * t->channels_src)); + xform_src = mutable_xform_src; + } + +#if JPEGXL_ENABLE_SKCMS + if (t->channels_src == 1 && !t->skip_lcms) { + // Expand from 1 to 3 channels, starting from the end in case + // xform_src == t->buf_src.Row(thread). + float* mutable_xform_src = t->buf_src.Row(thread); + for (size_t i = 0; i < xsize; ++i) { + const size_t x = xsize - i - 1; + mutable_xform_src[x * 3] = mutable_xform_src[x * 3 + 1] = + mutable_xform_src[x * 3 + 2] = xform_src[x]; + } + xform_src = mutable_xform_src; + } +#else + if (t->channels_src == 4 && !t->skip_lcms) { + // LCMS does CMYK in a weird way: 0 = white, 100 = max ink + float* mutable_xform_src = t->buf_src.Row(thread); + for (size_t x = 0; x < xsize * 4; ++x) { + mutable_xform_src[x] = 100.f - 100.f * mutable_xform_src[x]; + } + xform_src = mutable_xform_src; + } +#endif + +#if JXL_CMS_VERBOSE >= 2 + // Save inputs for printing before in-place transforms overwrite them. + const float in0 = xform_src[3 * kX + 0]; + const float in1 = xform_src[3 * kX + 1]; + const float in2 = xform_src[3 * kX + 2]; +#endif + + if (t->skip_lcms) { + if (buf_dst != xform_src) { + memcpy(buf_dst, xform_src, xsize * t->channels_src * sizeof(*buf_dst)); + } // else: in-place, no need to copy + } else { +#if JPEGXL_ENABLE_SKCMS + JXL_CHECK( + skcms_Transform(xform_src, + (t->channels_src == 4 ? skcms_PixelFormat_RGBA_ffff + : skcms_PixelFormat_RGB_fff), + skcms_AlphaFormat_Opaque, &t->profile_src, buf_dst, + skcms_PixelFormat_RGB_fff, skcms_AlphaFormat_Opaque, + &t->profile_dst, xsize)); +#else // JPEGXL_ENABLE_SKCMS + cmsDoTransform(t->lcms_transform, xform_src, buf_dst, + static_cast(xsize)); +#endif // JPEGXL_ENABLE_SKCMS + } +#if JXL_CMS_VERBOSE >= 2 + printf("xform skip%d: %.4f %.4f %.4f (%p) -> (%p) %.4f %.4f %.4f\n", + t->skip_lcms, in0, in1, in2, xform_src, buf_dst, buf_dst[3 * kX], + buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]); +#endif + +#if JPEGXL_ENABLE_SKCMS + if (t->channels_dst == 1 && !t->skip_lcms) { + // Contract back from 3 to 1 channel, this time forward. + float* grayscale_buf_dst = t->buf_dst.Row(thread); + for (size_t x = 0; x < xsize; ++x) { + grayscale_buf_dst[x] = buf_dst[x * 3]; + } + buf_dst = grayscale_buf_dst; + } +#endif + + if (t->postprocess != ExtraTF::kNone) { + JXL_RETURN_IF_ERROR(AfterTransform(t, buf_dst, xsize * t->channels_dst)); + } + return true; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +namespace { + +HWY_EXPORT(DoColorSpaceTransform); +int DoColorSpaceTransform(void* t, size_t thread, const float* buf_src, + float* buf_dst, size_t xsize) { + return HWY_DYNAMIC_DISPATCH(DoColorSpaceTransform)(t, thread, buf_src, + buf_dst, xsize); +} + +// Define to 1 on OS X as a workaround for older LCMS lacking MD5. +#define JXL_CMS_OLD_VERSION 0 + +#if JPEGXL_ENABLE_SKCMS + +JXL_MUST_USE_RESULT CIExy CIExyFromXYZ(const float XYZ[3]) { + const float factor = 1.f / (XYZ[0] + XYZ[1] + XYZ[2]); + CIExy xy; + xy.x = XYZ[0] * factor; + xy.y = XYZ[1] * factor; + return xy; +} + +#else // JPEGXL_ENABLE_SKCMS +// (LCMS interface requires xyY but we omit the Y for white points/primaries.) + +JXL_MUST_USE_RESULT CIExy CIExyFromxyY(const cmsCIExyY& xyY) { + CIExy xy; + xy.x = xyY.x; + xy.y = xyY.y; + return xy; +} + +JXL_MUST_USE_RESULT CIExy CIExyFromXYZ(const cmsCIEXYZ& XYZ) { + cmsCIExyY xyY; + cmsXYZ2xyY(/*Dest=*/&xyY, /*Source=*/&XYZ); + return CIExyFromxyY(xyY); +} + +JXL_MUST_USE_RESULT cmsCIEXYZ D50_XYZ() { + // Quantized D50 as stored in ICC profiles. + return {0.96420288, 1.0, 0.82490540}; +} + +// RAII + +struct ProfileDeleter { + void operator()(void* p) { cmsCloseProfile(p); } +}; +using Profile = std::unique_ptr; + +struct TransformDeleter { + void operator()(void* p) { cmsDeleteTransform(p); } +}; +using Transform = std::unique_ptr; + +struct CurveDeleter { + void operator()(cmsToneCurve* p) { cmsFreeToneCurve(p); } +}; +using Curve = std::unique_ptr; + +Status CreateProfileXYZ(const cmsContext context, + Profile* JXL_RESTRICT profile) { + profile->reset(cmsCreateXYZProfileTHR(context)); + if (profile->get() == nullptr) return JXL_FAILURE("Failed to create XYZ"); + return true; +} + +#endif // !JPEGXL_ENABLE_SKCMS + +#if JPEGXL_ENABLE_SKCMS +// IMPORTANT: icc must outlive profile. +Status DecodeProfile(const uint8_t* icc, size_t size, + skcms_ICCProfile* const profile) { + if (!skcms_Parse(icc, size, profile)) { + return JXL_FAILURE("Failed to parse ICC profile with %" PRIuS " bytes", + size); + } + return true; +} +#else // JPEGXL_ENABLE_SKCMS +Status DecodeProfile(const cmsContext context, const PaddedBytes& icc, + Profile* profile) { + profile->reset(cmsOpenProfileFromMemTHR(context, icc.data(), icc.size())); + if (profile->get() == nullptr) { + return JXL_FAILURE("Failed to decode profile"); + } + + // WARNING: due to the LCMS MD5 issue mentioned above, many existing + // profiles have incorrect MD5, so do not even bother checking them nor + // generating warning clutter. + + return true; +} +#endif // JPEGXL_ENABLE_SKCMS + +#if JPEGXL_ENABLE_SKCMS + +ColorSpace ColorSpaceFromProfile(const skcms_ICCProfile& profile) { + switch (profile.data_color_space) { + case skcms_Signature_RGB: + case skcms_Signature_CMYK: + // spec says CMYK is encoded as RGB (the kBlack extra channel signals that + // it is actually CMYK) + return ColorSpace::kRGB; + case skcms_Signature_Gray: + return ColorSpace::kGray; + default: + return ColorSpace::kUnknown; + } +} + +// vector_out := matmul(matrix, vector_in) +void MatrixProduct(const skcms_Matrix3x3& matrix, const float vector_in[3], + float vector_out[3]) { + for (int i = 0; i < 3; ++i) { + vector_out[i] = 0; + for (int j = 0; j < 3; ++j) { + vector_out[i] += matrix.vals[i][j] * vector_in[j]; + } + } +} + +// Returns white point that was specified when creating the profile. +JXL_MUST_USE_RESULT Status UnadaptedWhitePoint(const skcms_ICCProfile& profile, + CIExy* out) { + float media_white_point_XYZ[3]; + if (!skcms_GetWTPT(&profile, media_white_point_XYZ)) { + return JXL_FAILURE("ICC profile does not contain WhitePoint tag"); + } + skcms_Matrix3x3 CHAD; + if (!skcms_GetCHAD(&profile, &CHAD)) { + // If there is no chromatic adaptation matrix, it means that the white point + // is already unadapted. + *out = CIExyFromXYZ(media_white_point_XYZ); + return true; + } + // Otherwise, it has been adapted to the PCS white point using said matrix, + // and the adaptation needs to be undone. + skcms_Matrix3x3 inverse_CHAD; + if (!skcms_Matrix3x3_invert(&CHAD, &inverse_CHAD)) { + return JXL_FAILURE("Non-invertible ChromaticAdaptation matrix"); + } + float unadapted_white_point_XYZ[3]; + MatrixProduct(inverse_CHAD, media_white_point_XYZ, unadapted_white_point_XYZ); + *out = CIExyFromXYZ(unadapted_white_point_XYZ); + return true; +} + +Status IdentifyPrimaries(const skcms_ICCProfile& profile, + const CIExy& wp_unadapted, ColorEncoding* c) { + if (!c->HasPrimaries()) return true; + + skcms_Matrix3x3 CHAD, inverse_CHAD; + if (skcms_GetCHAD(&profile, &CHAD)) { + JXL_RETURN_IF_ERROR(skcms_Matrix3x3_invert(&CHAD, &inverse_CHAD)); + } else { + static constexpr skcms_Matrix3x3 kLMSFromXYZ = { + {{0.8951, 0.2664, -0.1614}, + {-0.7502, 1.7135, 0.0367}, + {0.0389, -0.0685, 1.0296}}}; + static constexpr skcms_Matrix3x3 kXYZFromLMS = { + {{0.9869929, -0.1470543, 0.1599627}, + {0.4323053, 0.5183603, 0.0492912}, + {-0.0085287, 0.0400428, 0.9684867}}}; + static constexpr float kWpD50XYZ[3] = {0.96420288, 1.0, 0.82490540}; + float wp_unadapted_XYZ[3]; + JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(wp_unadapted, wp_unadapted_XYZ)); + float wp_D50_LMS[3], wp_unadapted_LMS[3]; + MatrixProduct(kLMSFromXYZ, kWpD50XYZ, wp_D50_LMS); + MatrixProduct(kLMSFromXYZ, wp_unadapted_XYZ, wp_unadapted_LMS); + inverse_CHAD = {{{wp_unadapted_LMS[0] / wp_D50_LMS[0], 0, 0}, + {0, wp_unadapted_LMS[1] / wp_D50_LMS[1], 0}, + {0, 0, wp_unadapted_LMS[2] / wp_D50_LMS[2]}}}; + inverse_CHAD = skcms_Matrix3x3_concat(&kXYZFromLMS, &inverse_CHAD); + inverse_CHAD = skcms_Matrix3x3_concat(&inverse_CHAD, &kLMSFromXYZ); + } + + float XYZ[3]; + PrimariesCIExy primaries; + CIExy* const chromaticities[] = {&primaries.r, &primaries.g, &primaries.b}; + for (int i = 0; i < 3; ++i) { + float RGB[3] = {}; + RGB[i] = 1; + skcms_Transform(RGB, skcms_PixelFormat_RGB_fff, skcms_AlphaFormat_Opaque, + &profile, XYZ, skcms_PixelFormat_RGB_fff, + skcms_AlphaFormat_Opaque, skcms_XYZD50_profile(), 1); + float unadapted_XYZ[3]; + MatrixProduct(inverse_CHAD, XYZ, unadapted_XYZ); + *chromaticities[i] = CIExyFromXYZ(unadapted_XYZ); + } + return c->SetPrimaries(primaries); +} + +void DetectTransferFunction(const skcms_ICCProfile& profile, + ColorEncoding* JXL_RESTRICT c) { + if (c->tf.SetImplicit()) return; + + float gamma[3] = {}; + if (profile.has_trc) { + const auto IsGamma = [](const skcms_TransferFunction& tf) { + return tf.a == 1 && tf.b == 0 && + /* if b and d are zero, it is fine for c not to be */ tf.d == 0 && + tf.e == 0 && tf.f == 0; + }; + for (int i = 0; i < 3; ++i) { + if (profile.trc[i].table_entries == 0 && + IsGamma(profile.trc->parametric)) { + gamma[i] = 1.f / profile.trc->parametric.g; + } else { + skcms_TransferFunction approximate_tf; + float max_error; + if (skcms_ApproximateCurve(&profile.trc[i], &approximate_tf, + &max_error)) { + if (IsGamma(approximate_tf)) { + gamma[i] = 1.f / approximate_tf.g; + } + } + } + } + } + if (gamma[0] != 0 && std::abs(gamma[0] - gamma[1]) < 1e-4f && + std::abs(gamma[1] - gamma[2]) < 1e-4f) { + if (c->tf.SetGamma(gamma[0])) { + skcms_ICCProfile profile_test; + PaddedBytes bytes; + if (MaybeCreateProfile(*c, &bytes) && + DecodeProfile(bytes.data(), bytes.size(), &profile_test) && + skcms_ApproximatelyEqualProfiles(&profile, &profile_test)) { + return; + } + } + } + + for (TransferFunction tf : Values()) { + // Can only create profile from known transfer function. + if (tf == TransferFunction::kUnknown) continue; + + c->tf.SetTransferFunction(tf); + + skcms_ICCProfile profile_test; + PaddedBytes bytes; + if (MaybeCreateProfile(*c, &bytes) && + DecodeProfile(bytes.data(), bytes.size(), &profile_test) && + skcms_ApproximatelyEqualProfiles(&profile, &profile_test)) { + return; + } + } + + c->tf.SetTransferFunction(TransferFunction::kUnknown); +} + +#else // JPEGXL_ENABLE_SKCMS + +uint32_t Type32(const ColorEncoding& c, bool cmyk) { + if (cmyk) return TYPE_CMYK_FLT; + if (c.IsGray()) return TYPE_GRAY_FLT; + return TYPE_RGB_FLT; +} + +uint32_t Type64(const ColorEncoding& c) { + if (c.IsGray()) return TYPE_GRAY_DBL; + return TYPE_RGB_DBL; +} + +ColorSpace ColorSpaceFromProfile(const Profile& profile) { + switch (cmsGetColorSpace(profile.get())) { + case cmsSigRgbData: + case cmsSigCmykData: + return ColorSpace::kRGB; + case cmsSigGrayData: + return ColorSpace::kGray; + default: + return ColorSpace::kUnknown; + } +} + +// "profile1" is pre-decoded to save time in DetectTransferFunction. +Status ProfileEquivalentToICC(const cmsContext context, const Profile& profile1, + const PaddedBytes& icc, const ColorEncoding& c) { + const uint32_t type_src = Type64(c); + + Profile profile2; + JXL_RETURN_IF_ERROR(DecodeProfile(context, icc, &profile2)); + + Profile profile_xyz; + JXL_RETURN_IF_ERROR(CreateProfileXYZ(context, &profile_xyz)); + + const uint32_t intent = INTENT_RELATIVE_COLORIMETRIC; + const uint32_t flags = cmsFLAGS_NOOPTIMIZE | cmsFLAGS_BLACKPOINTCOMPENSATION | + cmsFLAGS_HIGHRESPRECALC; + Transform xform1(cmsCreateTransformTHR(context, profile1.get(), type_src, + profile_xyz.get(), TYPE_XYZ_DBL, + intent, flags)); + Transform xform2(cmsCreateTransformTHR(context, profile2.get(), type_src, + profile_xyz.get(), TYPE_XYZ_DBL, + intent, flags)); + if (xform1 == nullptr || xform2 == nullptr) { + return JXL_FAILURE("Failed to create transform"); + } + + double in[3]; + double out1[3]; + double out2[3]; + + // Uniformly spaced samples from very dark to almost fully bright. + const double init = 1E-3; + const double step = 0.2; + + if (c.IsGray()) { + // Finer sampling and replicate each component. + for (in[0] = init; in[0] < 1.0; in[0] += step / 8) { + cmsDoTransform(xform1.get(), in, out1, 1); + cmsDoTransform(xform2.get(), in, out2, 1); + if (!ApproxEq(out1[0], out2[0], 2E-4)) { + return false; + } + } + } else { + for (in[0] = init; in[0] < 1.0; in[0] += step) { + for (in[1] = init; in[1] < 1.0; in[1] += step) { + for (in[2] = init; in[2] < 1.0; in[2] += step) { + cmsDoTransform(xform1.get(), in, out1, 1); + cmsDoTransform(xform2.get(), in, out2, 1); + for (size_t i = 0; i < 3; ++i) { + if (!ApproxEq(out1[i], out2[i], 2E-4)) { + return false; + } + } + } + } + } + } + + return true; +} + +// Returns white point that was specified when creating the profile. +// NOTE: we can't just use cmsSigMediaWhitePointTag because its interpretation +// differs between ICC versions. +JXL_MUST_USE_RESULT cmsCIEXYZ UnadaptedWhitePoint(const cmsContext context, + const Profile& profile, + const ColorEncoding& c) { + const cmsCIEXYZ* white_point = static_cast( + cmsReadTag(profile.get(), cmsSigMediaWhitePointTag)); + if (white_point != nullptr && + cmsReadTag(profile.get(), cmsSigChromaticAdaptationTag) == nullptr) { + // No chromatic adaptation matrix: the white point is already unadapted. + return *white_point; + } + + cmsCIEXYZ XYZ = {1.0, 1.0, 1.0}; + Profile profile_xyz; + if (!CreateProfileXYZ(context, &profile_xyz)) return XYZ; + // Array arguments are one per profile. + cmsHPROFILE profiles[2] = {profile.get(), profile_xyz.get()}; + // Leave white point unchanged - that is what we're trying to extract. + cmsUInt32Number intents[2] = {INTENT_ABSOLUTE_COLORIMETRIC, + INTENT_ABSOLUTE_COLORIMETRIC}; + cmsBool black_compensation[2] = {0, 0}; + cmsFloat64Number adaption[2] = {0.0, 0.0}; + // Only transforming a single pixel, so skip expensive optimizations. + cmsUInt32Number flags = cmsFLAGS_NOOPTIMIZE | cmsFLAGS_HIGHRESPRECALC; + Transform xform(cmsCreateExtendedTransform( + context, 2, profiles, black_compensation, intents, adaption, nullptr, 0, + Type64(c), TYPE_XYZ_DBL, flags)); + if (!xform) return XYZ; // TODO(lode): return error + + // xy are relative, so magnitude does not matter if we ignore output Y. + const cmsFloat64Number in[3] = {1.0, 1.0, 1.0}; + cmsDoTransform(xform.get(), in, &XYZ.X, 1); + return XYZ; +} + +Status IdentifyPrimaries(const cmsContext context, const Profile& profile, + const cmsCIEXYZ& wp_unadapted, ColorEncoding* c) { + if (!c->HasPrimaries()) return true; + if (ColorSpaceFromProfile(profile) == ColorSpace::kUnknown) return true; + + // These were adapted to the profile illuminant before storing in the profile. + const cmsCIEXYZ* adapted_r = static_cast( + cmsReadTag(profile.get(), cmsSigRedColorantTag)); + const cmsCIEXYZ* adapted_g = static_cast( + cmsReadTag(profile.get(), cmsSigGreenColorantTag)); + const cmsCIEXYZ* adapted_b = static_cast( + cmsReadTag(profile.get(), cmsSigBlueColorantTag)); + + cmsCIEXYZ converted_rgb[3]; + if (adapted_r == nullptr || adapted_g == nullptr || adapted_b == nullptr) { + // No colorant tag, determine the XYZ coordinates of the primaries by + // converting from the colorspace. + Profile profile_xyz; + if (!CreateProfileXYZ(context, &profile_xyz)) { + return JXL_FAILURE("Failed to retrieve colorants"); + } + // Array arguments are one per profile. + cmsHPROFILE profiles[2] = {profile.get(), profile_xyz.get()}; + cmsUInt32Number intents[2] = {INTENT_RELATIVE_COLORIMETRIC, + INTENT_RELATIVE_COLORIMETRIC}; + cmsBool black_compensation[2] = {0, 0}; + cmsFloat64Number adaption[2] = {0.0, 0.0}; + // Only transforming three pixels, so skip expensive optimizations. + cmsUInt32Number flags = cmsFLAGS_NOOPTIMIZE | cmsFLAGS_HIGHRESPRECALC; + Transform xform(cmsCreateExtendedTransform( + context, 2, profiles, black_compensation, intents, adaption, nullptr, 0, + Type64(*c), TYPE_XYZ_DBL, flags)); + if (!xform) return JXL_FAILURE("Failed to retrieve colorants"); + + const cmsFloat64Number in[9] = {1.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0}; + cmsDoTransform(xform.get(), in, &converted_rgb->X, 3); + adapted_r = &converted_rgb[0]; + adapted_g = &converted_rgb[1]; + adapted_b = &converted_rgb[2]; + } + + // TODO(janwas): no longer assume Bradford and D50. + // Undo the chromatic adaptation. + const cmsCIEXYZ d50 = D50_XYZ(); + + cmsCIEXYZ r, g, b; + cmsAdaptToIlluminant(&r, &d50, &wp_unadapted, adapted_r); + cmsAdaptToIlluminant(&g, &d50, &wp_unadapted, adapted_g); + cmsAdaptToIlluminant(&b, &d50, &wp_unadapted, adapted_b); + + const PrimariesCIExy rgb = {CIExyFromXYZ(r), CIExyFromXYZ(g), + CIExyFromXYZ(b)}; + return c->SetPrimaries(rgb); +} + +void DetectTransferFunction(const cmsContext context, const Profile& profile, + ColorEncoding* JXL_RESTRICT c) { + if (c->tf.SetImplicit()) return; + + float gamma = 0; + if (const auto* gray_trc = reinterpret_cast( + cmsReadTag(profile.get(), cmsSigGrayTRCTag))) { + const double estimated_gamma = + cmsEstimateGamma(gray_trc, /*precision=*/1e-4); + if (estimated_gamma > 0) { + gamma = 1. / estimated_gamma; + } + } else { + float rgb_gamma[3] = {}; + int i = 0; + for (const auto tag : + {cmsSigRedTRCTag, cmsSigGreenTRCTag, cmsSigBlueTRCTag}) { + if (const auto* trc = reinterpret_cast( + cmsReadTag(profile.get(), tag))) { + const double estimated_gamma = + cmsEstimateGamma(trc, /*precision=*/1e-4); + if (estimated_gamma > 0) { + rgb_gamma[i] = 1. / estimated_gamma; + } + } + ++i; + } + if (rgb_gamma[0] != 0 && std::abs(rgb_gamma[0] - rgb_gamma[1]) < 1e-4f && + std::abs(rgb_gamma[1] - rgb_gamma[2]) < 1e-4f) { + gamma = rgb_gamma[0]; + } + } + + if (gamma != 0 && c->tf.SetGamma(gamma)) { + PaddedBytes icc_test; + if (MaybeCreateProfile(*c, &icc_test) && + ProfileEquivalentToICC(context, profile, icc_test, *c)) { + return; + } + } + + for (TransferFunction tf : Values()) { + // Can only create profile from known transfer function. + if (tf == TransferFunction::kUnknown) continue; + + c->tf.SetTransferFunction(tf); + + PaddedBytes icc_test; + if (MaybeCreateProfile(*c, &icc_test) && + ProfileEquivalentToICC(context, profile, icc_test, *c)) { + return; + } + } + + c->tf.SetTransferFunction(TransferFunction::kUnknown); +} + +void ErrorHandler(cmsContext context, cmsUInt32Number code, const char* text) { + JXL_WARNING("LCMS error %u: %s", code, text); +} + +// Returns a context for the current thread, creating it if necessary. +cmsContext GetContext() { + static thread_local void* context_; + if (context_ == nullptr) { + context_ = cmsCreateContext(nullptr, nullptr); + JXL_ASSERT(context_ != nullptr); + + cmsSetLogErrorHandlerTHR(static_cast(context_), &ErrorHandler); + } + return static_cast(context_); +} + +#endif // JPEGXL_ENABLE_SKCMS + +Status GetPrimariesLuminances(const ColorEncoding& encoding, + float luminances[3]) { + // Explanation: + // We know that the three primaries must sum to white: + // + // [Xr, Xg, Xb; [1; [Xw; + // Yr, Yg, Yb; × 1; = Yw; + // Zr, Zg, Zb] 1] Zw] + // + // By noting that X = x·(X+Y+Z), Y = y·(X+Y+Z) and Z = z·(X+Y+Z) (note the + // lower case indicating chromaticity), and factoring the totals (X+Y+Z) out + // of the left matrix and into the all-ones vector, we get: + // + // [xr, xg, xb; [Xr + Yr + Zr; [Xw; + // yr, yg, yb; × Xg + Yg + Zg; = Yw; + // zr, zg, zb] Xb + Yb + Zb] Zw] + // + // Which makes it apparent that we can compute those totals as: + // + // [Xr + Yr + Zr; inv([xr, xg, xb; [Xw; + // Xg + Yg + Zg; = yr, yg, yb; × Yw; + // Xb + Yb + Zb] zr, zg, zb]) Zw] + // + // From there, by multiplying each total by its corresponding y, we get Y for + // that primary. + + float white_XYZ[3]; + JXL_RETURN_IF_ERROR( + CIEXYZFromWhiteCIExy(encoding.GetWhitePoint(), white_XYZ)); + + const PrimariesCIExy primaries = encoding.GetPrimaries(); + double chromaticities[3][3] = { + {primaries.r.x, primaries.g.x, primaries.b.x}, + {primaries.r.y, primaries.g.y, primaries.b.y}, + {1 - primaries.r.x - primaries.r.y, 1 - primaries.g.x - primaries.g.y, + 1 - primaries.b.x - primaries.b.y}}; + JXL_RETURN_IF_ERROR(Inv3x3Matrix(&chromaticities[0][0])); + const double ys[3] = {primaries.r.y, primaries.g.y, primaries.b.y}; + for (size_t i = 0; i < 3; ++i) { + luminances[i] = ys[i] * (chromaticities[i][0] * white_XYZ[0] + + chromaticities[i][1] * white_XYZ[1] + + chromaticities[i][2] * white_XYZ[2]); + } + return true; +} + +Status ApplyHlgOotf(JxlCms* t, float* JXL_RESTRICT buf, size_t xsize, + bool forward) { + if (295 <= t->intensity_target && t->intensity_target <= 305) { + // The gamma is approximately 1 so this can essentially be skipped. + return true; + } + float gamma = 1.2f * std::pow(1.111f, std::log2(t->intensity_target * 1e-3f)); + if (!forward) gamma = 1.f / gamma; + + switch (t->hlg_ootf_num_channels) { + case 1: + for (size_t x = 0; x < xsize; ++x) { + buf[x] = std::pow(buf[x], gamma); + } + break; + + case 3: + for (size_t x = 0; x < xsize; x += 3) { + const float luminance = buf[x] * t->hlg_ootf_luminances[0] + + buf[x + 1] * t->hlg_ootf_luminances[1] + + buf[x + 2] * t->hlg_ootf_luminances[2]; + const float ratio = std::pow(luminance, gamma - 1); + if (std::isfinite(ratio)) { + buf[x] *= ratio; + buf[x + 1] *= ratio; + buf[x + 2] *= ratio; + if (forward && gamma < 1) { + // If gamma < 1, the ratio above will be > 1 which can push bright + // saturated highlights out of gamut. There are several possible + // ways to bring them back in-gamut; this one preserves hue and + // saturation at the slight expense of luminance. If !forward, the + // previously-applied forward OOTF with gamma > 1 already pushed + // those highlights down and we are simply putting them back where + // they were so this is not necessary. + const float maximum = + std::max(buf[x], std::max(buf[x + 1], buf[x + 2])); + if (maximum > 1) { + const float normalizer = 1.f / maximum; + buf[x] *= normalizer; + buf[x + 1] *= normalizer; + buf[x + 2] *= normalizer; + } + } + } + } + break; + + default: + return JXL_FAILURE("HLG OOTF not implemented for %" PRIuS " channels", + t->hlg_ootf_num_channels); + } + return true; +} + +bool ApplyCICP(const uint8_t color_primaries, + const uint8_t transfer_characteristics, + const uint8_t matrix_coefficients, const uint8_t full_range, + ColorEncoding* JXL_RESTRICT c) { + if (matrix_coefficients != 0) return false; + if (full_range != 1) return false; + + const auto primaries = static_cast(color_primaries); + const auto tf = static_cast(transfer_characteristics); + if (tf == TransferFunction::kUnknown || !EnumValid(tf)) return false; + if (primaries == Primaries::kCustom || + !(color_primaries == 12 || EnumValid(primaries))) { + return false; + } + c->SetColorSpace(ColorSpace::kRGB); + c->tf.SetTransferFunction(tf); + if (primaries == Primaries::kP3) { + c->white_point = WhitePoint::kDCI; + c->primaries = Primaries::kP3; + } else if (color_primaries == 12) { + c->white_point = WhitePoint::kD65; + c->primaries = Primaries::kP3; + } else { + c->white_point = WhitePoint::kD65; + c->primaries = primaries; + } + return true; +} + +} // namespace + +Status ColorEncoding::SetFieldsFromICC() { + // In case parsing fails, mark the ColorEncoding as invalid. + SetColorSpace(ColorSpace::kUnknown); + tf.SetTransferFunction(TransferFunction::kUnknown); + + if (icc_.empty()) return JXL_FAILURE("Empty ICC profile"); + +#if JPEGXL_ENABLE_SKCMS + if (icc_.size() < 128) { + return JXL_FAILURE("ICC file too small"); + } + + skcms_ICCProfile profile; + JXL_RETURN_IF_ERROR(skcms_Parse(icc_.data(), icc_.size(), &profile)); + + // skcms does not return the rendering intent, so get it from the file. It + // is encoded as big-endian 32-bit integer in bytes 60..63. + uint32_t rendering_intent32 = icc_[67]; + if (rendering_intent32 > 3 || icc_[64] != 0 || icc_[65] != 0 || + icc_[66] != 0) { + return JXL_FAILURE("Invalid rendering intent %u\n", rendering_intent32); + } + // ICC and RenderingIntent have the same values (0..3). + rendering_intent = static_cast(rendering_intent32); + + if (profile.has_CICP && ApplyCICP(profile.CICP.color_primaries, + profile.CICP.transfer_characteristics, + profile.CICP.matrix_coefficients, + profile.CICP.video_full_range_flag, this)) { + return true; + } + + SetColorSpace(ColorSpaceFromProfile(profile)); + cmyk_ = (profile.data_color_space == skcms_Signature_CMYK); + + CIExy wp_unadapted; + JXL_RETURN_IF_ERROR(UnadaptedWhitePoint(profile, &wp_unadapted)); + JXL_RETURN_IF_ERROR(SetWhitePoint(wp_unadapted)); + + // Relies on color_space. + JXL_RETURN_IF_ERROR(IdentifyPrimaries(profile, wp_unadapted, this)); + + // Relies on color_space/white point/primaries being set already. + DetectTransferFunction(profile, this); +#else // JPEGXL_ENABLE_SKCMS + + const cmsContext context = GetContext(); + + Profile profile; + JXL_RETURN_IF_ERROR(DecodeProfile(context, icc_, &profile)); + + static constexpr size_t kCICPSize = 12; + static constexpr auto kCICPSignature = + static_cast(0x63696370); + uint8_t cicp_buffer[kCICPSize]; + if (cmsReadRawTag(profile.get(), kCICPSignature, cicp_buffer, kCICPSize) == + kCICPSize && + ApplyCICP(cicp_buffer[8], cicp_buffer[9], cicp_buffer[10], + cicp_buffer[11], this)) { + return true; + } + + const cmsUInt32Number rendering_intent32 = + cmsGetHeaderRenderingIntent(profile.get()); + if (rendering_intent32 > 3) { + return JXL_FAILURE("Invalid rendering intent %u\n", rendering_intent32); + } + // ICC and RenderingIntent have the same values (0..3). + rendering_intent = static_cast(rendering_intent32); + + SetColorSpace(ColorSpaceFromProfile(profile)); + if (cmsGetColorSpace(profile.get()) == cmsSigCmykData) { + cmyk_ = true; + return true; + } + + const cmsCIEXYZ wp_unadapted = UnadaptedWhitePoint(context, profile, *this); + JXL_RETURN_IF_ERROR(SetWhitePoint(CIExyFromXYZ(wp_unadapted))); + + // Relies on color_space. + JXL_RETURN_IF_ERROR(IdentifyPrimaries(context, profile, wp_unadapted, this)); + + // Relies on color_space/white point/primaries being set already. + DetectTransferFunction(context, profile, this); + +#endif // JPEGXL_ENABLE_SKCMS + + return true; +} + +void ColorEncoding::DecideIfWantICC() { + PaddedBytes icc_new; +#if JPEGXL_ENABLE_SKCMS + skcms_ICCProfile profile; + if (!DecodeProfile(ICC().data(), ICC().size(), &profile)) return; + if (!MaybeCreateProfile(*this, &icc_new)) return; +#else // JPEGXL_ENABLE_SKCMS + const cmsContext context = GetContext(); + Profile profile; + if (!DecodeProfile(context, ICC(), &profile)) return; + if (cmsGetColorSpace(profile.get()) == cmsSigCmykData) return; + if (!MaybeCreateProfile(*this, &icc_new)) return; +#endif // JPEGXL_ENABLE_SKCMS + + want_icc_ = false; +} + +namespace { + +void JxlCmsDestroy(void* cms_data) { + if (cms_data == nullptr) return; + JxlCms* t = reinterpret_cast(cms_data); +#if !JPEGXL_ENABLE_SKCMS + TransformDeleter()(t->lcms_transform); +#endif + delete t; +} + +void* JxlCmsInit(void* init_data, size_t num_threads, size_t xsize, + const JxlColorProfile* input, const JxlColorProfile* output, + float intensity_target) { + auto t = jxl::make_unique(); + PaddedBytes icc_src, icc_dst; + icc_src.assign(input->icc.data, input->icc.data + input->icc.size); + ColorEncoding c_src; + if (!c_src.SetICC(std::move(icc_src))) { + JXL_NOTIFY_ERROR("JxlCmsInit: failed to parse input ICC"); + return nullptr; + } + icc_dst.assign(output->icc.data, output->icc.data + output->icc.size); + ColorEncoding c_dst; + if (!c_dst.SetICC(std::move(icc_dst))) { + JXL_NOTIFY_ERROR("JxlCmsInit: failed to parse output ICC"); + return nullptr; + } +#if JXL_CMS_VERBOSE + printf("%s -> %s\n", Description(c_src).c_str(), Description(c_dst).c_str()); +#endif + +#if JPEGXL_ENABLE_SKCMS + if (!DecodeProfile(input->icc.data, input->icc.size, &t->profile_src)) { + JXL_NOTIFY_ERROR("JxlCmsInit: skcms failed to parse input ICC"); + return nullptr; + } + if (!DecodeProfile(output->icc.data, output->icc.size, &t->profile_dst)) { + JXL_NOTIFY_ERROR("JxlCmsInit: skcms failed to parse output ICC"); + return nullptr; + } +#else // JPEGXL_ENABLE_SKCMS + const cmsContext context = GetContext(); + Profile profile_src, profile_dst; + if (!DecodeProfile(context, c_src.ICC(), &profile_src)) { + JXL_NOTIFY_ERROR("JxlCmsInit: lcms failed to parse input ICC"); + return nullptr; + } + if (!DecodeProfile(context, c_dst.ICC(), &profile_dst)) { + JXL_NOTIFY_ERROR("JxlCmsInit: lcms failed to parse output ICC"); + return nullptr; + } +#endif // JPEGXL_ENABLE_SKCMS + + t->skip_lcms = false; + if (c_src.SameColorEncoding(c_dst)) { + t->skip_lcms = true; +#if JXL_CMS_VERBOSE + printf("Skip CMS\n"); +#endif + } + + t->apply_hlg_ootf = c_src.tf.IsHLG() != c_dst.tf.IsHLG(); + if (t->apply_hlg_ootf) { + const ColorEncoding* c_hlg = c_src.tf.IsHLG() ? &c_src : &c_dst; + t->hlg_ootf_num_channels = c_hlg->Channels(); + if (t->hlg_ootf_num_channels == 3 && + !GetPrimariesLuminances(*c_hlg, t->hlg_ootf_luminances.data())) { + JXL_NOTIFY_ERROR( + "JxlCmsInit: failed to compute the luminances of primaries"); + return nullptr; + } + } + + // Special-case SRGB <=> linear if the primaries / white point are the same, + // or any conversion where PQ or HLG is involved: + bool src_linear = c_src.tf.IsLinear(); + const bool dst_linear = c_dst.tf.IsLinear(); + + if (c_src.tf.IsPQ() || c_src.tf.IsHLG() || + (c_src.tf.IsSRGB() && dst_linear && c_src.SameColorSpace(c_dst))) { + // Construct new profile as if the data were already/still linear. + ColorEncoding c_linear_src = c_src; + c_linear_src.tf.SetTransferFunction(TransferFunction::kLinear); +#if JPEGXL_ENABLE_SKCMS + skcms_ICCProfile new_src; +#else // JPEGXL_ENABLE_SKCMS + Profile new_src; +#endif // JPEGXL_ENABLE_SKCMS + // Only enable ExtraTF if profile creation succeeded. + if (MaybeCreateProfile(c_linear_src, &icc_src) && +#if JPEGXL_ENABLE_SKCMS + DecodeProfile(icc_src.data(), icc_src.size(), &new_src)) { +#else // JPEGXL_ENABLE_SKCMS + DecodeProfile(context, icc_src, &new_src)) { +#endif // JPEGXL_ENABLE_SKCMS +#if JXL_CMS_VERBOSE + printf("Special HLG/PQ/sRGB -> linear\n"); +#endif +#if JPEGXL_ENABLE_SKCMS + t->icc_src = std::move(icc_src); + t->profile_src = new_src; +#else // JPEGXL_ENABLE_SKCMS + profile_src.swap(new_src); +#endif // JPEGXL_ENABLE_SKCMS + t->preprocess = c_src.tf.IsSRGB() + ? ExtraTF::kSRGB + : (c_src.tf.IsPQ() ? ExtraTF::kPQ : ExtraTF::kHLG); + c_src = c_linear_src; + src_linear = true; + } else { + if (t->apply_hlg_ootf) { + JXL_NOTIFY_ERROR( + "Failed to create extra linear source profile, and HLG OOTF " + "required"); + return nullptr; + } + JXL_WARNING("Failed to create extra linear destination profile"); + } + } + + if (c_dst.tf.IsPQ() || c_dst.tf.IsHLG() || + (c_dst.tf.IsSRGB() && src_linear && c_src.SameColorSpace(c_dst))) { + ColorEncoding c_linear_dst = c_dst; + c_linear_dst.tf.SetTransferFunction(TransferFunction::kLinear); +#if JPEGXL_ENABLE_SKCMS + skcms_ICCProfile new_dst; +#else // JPEGXL_ENABLE_SKCMS + Profile new_dst; +#endif // JPEGXL_ENABLE_SKCMS + // Only enable ExtraTF if profile creation succeeded. + if (MaybeCreateProfile(c_linear_dst, &icc_dst) && +#if JPEGXL_ENABLE_SKCMS + DecodeProfile(icc_dst.data(), icc_dst.size(), &new_dst)) { +#else // JPEGXL_ENABLE_SKCMS + DecodeProfile(context, icc_dst, &new_dst)) { +#endif // JPEGXL_ENABLE_SKCMS +#if JXL_CMS_VERBOSE + printf("Special linear -> HLG/PQ/sRGB\n"); +#endif +#if JPEGXL_ENABLE_SKCMS + t->icc_dst = std::move(icc_dst); + t->profile_dst = new_dst; +#else // JPEGXL_ENABLE_SKCMS + profile_dst.swap(new_dst); +#endif // JPEGXL_ENABLE_SKCMS + t->postprocess = c_dst.tf.IsSRGB() + ? ExtraTF::kSRGB + : (c_dst.tf.IsPQ() ? ExtraTF::kPQ : ExtraTF::kHLG); + c_dst = c_linear_dst; + } else { + if (t->apply_hlg_ootf) { + JXL_NOTIFY_ERROR( + "Failed to create extra linear destination profile, and inverse " + "HLG OOTF required"); + return nullptr; + } + JXL_WARNING("Failed to create extra linear destination profile"); + } + } + + if (c_src.SameColorEncoding(c_dst)) { +#if JXL_CMS_VERBOSE + printf("Same intermediary linear profiles, skipping CMS\n"); +#endif + t->skip_lcms = true; + } + +#if JPEGXL_ENABLE_SKCMS + if (!skcms_MakeUsableAsDestination(&t->profile_dst)) { + JXL_NOTIFY_ERROR( + "Failed to make %s usable as a color transform destination", + Description(c_dst).c_str()); + return nullptr; + } +#endif // JPEGXL_ENABLE_SKCMS + + // Not including alpha channel (copied separately). + const size_t channels_src = (c_src.IsCMYK() ? 4 : c_src.Channels()); + const size_t channels_dst = c_dst.Channels(); + JXL_CHECK(channels_src == channels_dst || + (channels_src == 4 && channels_dst == 3)); +#if JXL_CMS_VERBOSE + printf("Channels: %" PRIuS "; Threads: %" PRIuS "\n", channels_src, + num_threads); +#endif + +#if !JPEGXL_ENABLE_SKCMS + // Type includes color space (XYZ vs RGB), so can be different. + const uint32_t type_src = Type32(c_src, channels_src == 4); + const uint32_t type_dst = Type32(c_dst, false); + const uint32_t intent = static_cast(c_dst.rendering_intent); + // Use cmsFLAGS_NOCACHE to disable the 1-pixel cache and make calling + // cmsDoTransform() thread-safe. + const uint32_t flags = cmsFLAGS_NOCACHE | cmsFLAGS_BLACKPOINTCOMPENSATION | + cmsFLAGS_HIGHRESPRECALC; + t->lcms_transform = + cmsCreateTransformTHR(context, profile_src.get(), type_src, + profile_dst.get(), type_dst, intent, flags); + if (t->lcms_transform == nullptr) { + JXL_NOTIFY_ERROR("Failed to create transform"); + return nullptr; + } +#endif // !JPEGXL_ENABLE_SKCMS + + // Ideally LCMS would convert directly from External to Image3. However, + // cmsDoTransformLineStride only accepts 32-bit BytesPerPlaneIn, whereas our + // planes can be more than 4 GiB apart. Hence, transform inputs/outputs must + // be interleaved. Calling cmsDoTransform for each pixel is expensive + // (indirect call). We therefore transform rows, which requires per-thread + // buffers. To avoid separate allocations, we use the rows of an image. + // Because LCMS apparently also cannot handle <= 16 bit inputs and 32-bit + // outputs (or vice versa), we use floating point input/output. + t->channels_src = channels_src; + t->channels_dst = channels_dst; +#if JPEGXL_ENABLE_SKCMS + // SkiaCMS doesn't support grayscale float buffers, so we create space for RGB + // float buffers anyway. + t->buf_src = ImageF(xsize * (channels_src == 4 ? 4 : 3), num_threads); + t->buf_dst = ImageF(xsize * 3, num_threads); +#else + t->buf_src = ImageF(xsize * channels_src, num_threads); + t->buf_dst = ImageF(xsize * channels_dst, num_threads); +#endif + t->intensity_target = intensity_target; + return t.release(); +} + +float* JxlCmsGetSrcBuf(void* cms_data, size_t thread) { + JxlCms* t = reinterpret_cast(cms_data); + return t->buf_src.Row(thread); +} + +float* JxlCmsGetDstBuf(void* cms_data, size_t thread) { + JxlCms* t = reinterpret_cast(cms_data); + return t->buf_dst.Row(thread); +} + +} // namespace + +const JxlCmsInterface& GetJxlCms() { + static constexpr JxlCmsInterface kInterface = { + /*init_data=*/nullptr, + /*init=*/&JxlCmsInit, + /*get_src_buf=*/&JxlCmsGetSrcBuf, + /*get_dst_buf=*/&JxlCmsGetDstBuf, + /*run=*/&DoColorSpaceTransform, + /*destroy=*/&JxlCmsDestroy}; + return kInterface; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/enc_color_management.h b/third_party/jpeg-xl/lib/jxl/enc_color_management.h new file mode 100644 index 0000000000..6f6e9023a6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_color_management.h @@ -0,0 +1,90 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_COLOR_MANAGEMENT_H_ +#define LIB_JXL_ENC_COLOR_MANAGEMENT_H_ + +// ICC profiles and color space conversions. + +#include +#include +#include + +#include + +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" + +namespace jxl { + +// Internal C++ wrapper for a JxlCmsInterface. +class ColorSpaceTransform { + public: + explicit ColorSpaceTransform(const JxlCmsInterface& cms) : cms_(cms) {} + ~ColorSpaceTransform() { + if (cms_data_ != nullptr) { + cms_.destroy(cms_data_); + } + } + + // Cannot copy. + ColorSpaceTransform(const ColorSpaceTransform&) = delete; + ColorSpaceTransform& operator=(const ColorSpaceTransform&) = delete; + + Status Init(const ColorEncoding& c_src, const ColorEncoding& c_dst, + float intensity_target, size_t xsize, size_t num_threads) { + xsize_ = xsize; + JxlColorProfile input_profile; + icc_src_ = c_src.ICC(); + input_profile.icc.data = icc_src_.data(); + input_profile.icc.size = icc_src_.size(); + ConvertInternalToExternalColorEncoding(c_src, + &input_profile.color_encoding); + input_profile.num_channels = c_src.IsCMYK() ? 4 : c_src.Channels(); + JxlColorProfile output_profile; + icc_dst_ = c_dst.ICC(); + output_profile.icc.data = icc_dst_.data(); + output_profile.icc.size = icc_dst_.size(); + ConvertInternalToExternalColorEncoding(c_dst, + &output_profile.color_encoding); + if (c_dst.IsCMYK()) + return JXL_FAILURE("Conversion to CMYK is not supported"); + output_profile.num_channels = c_dst.Channels(); + cms_data_ = cms_.init(cms_.init_data, num_threads, xsize, &input_profile, + &output_profile, intensity_target); + JXL_RETURN_IF_ERROR(cms_data_ != nullptr); + return true; + } + + float* BufSrc(const size_t thread) const { + return cms_.get_src_buf(cms_data_, thread); + } + + float* BufDst(const size_t thread) const { + return cms_.get_dst_buf(cms_data_, thread); + } + + Status Run(const size_t thread, const float* buf_src, float* buf_dst) { + return cms_.run(cms_data_, thread, buf_src, buf_dst, xsize_); + } + + private: + JxlCmsInterface cms_; + void* cms_data_ = nullptr; + // The interface may retain pointers into these. + PaddedBytes icc_src_; + PaddedBytes icc_dst_; + size_t xsize_; +}; + +const JxlCmsInterface& GetJxlCms(); + +} // namespace jxl + +#endif // LIB_JXL_ENC_COLOR_MANAGEMENT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_comparator.cc b/third_party/jpeg-xl/lib/jxl/enc_comparator.cc new file mode 100644 index 0000000000..cbdd0f78d9 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_comparator.cc @@ -0,0 +1,130 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_comparator.h" + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/enc_gamma_correct.h" +#include "lib/jxl/enc_image_bundle.h" + +namespace jxl { +namespace { + +// color is linear, but blending happens in gamma-compressed space using +// (gamma-compressed) grayscale background color, alpha image represents +// weights of the sRGB colors in the [0 .. (1 << bit_depth) - 1] interval, +// output image is in linear space. +void AlphaBlend(const Image3F& in, const size_t c, float background_linear, + const ImageF& alpha, Image3F* out) { + const float background = LinearToSrgb8Direct(background_linear); + + for (size_t y = 0; y < out->ysize(); ++y) { + const float* JXL_RESTRICT row_a = alpha.ConstRow(y); + const float* JXL_RESTRICT row_i = in.ConstPlaneRow(c, y); + float* JXL_RESTRICT row_o = out->PlaneRow(c, y); + for (size_t x = 0; x < out->xsize(); ++x) { + const float a = row_a[x]; + if (a <= 0.f) { + row_o[x] = background_linear; + } else if (a >= 1.f) { + row_o[x] = row_i[x]; + } else { + const float w_fg = a; + const float w_bg = 1.0f - w_fg; + const float fg = w_fg * LinearToSrgb8Direct(row_i[x]); + const float bg = w_bg * background; + row_o[x] = Srgb8ToLinearDirect(fg + bg); + } + } + } +} + +void AlphaBlend(float background_linear, ImageBundle* io_linear_srgb) { + // No alpha => all opaque. + if (!io_linear_srgb->HasAlpha()) return; + + for (size_t c = 0; c < 3; ++c) { + AlphaBlend(*io_linear_srgb->color(), c, background_linear, + *io_linear_srgb->alpha(), io_linear_srgb->color()); + } +} + +float ComputeScoreImpl(const ImageBundle& rgb0, const ImageBundle& rgb1, + Comparator* comparator, ImageF* distmap) { + JXL_CHECK(comparator->SetReferenceImage(rgb0)); + float score; + JXL_CHECK(comparator->CompareWith(rgb1, distmap, &score)); + return score; +} + +} // namespace + +float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1, + Comparator* comparator, const JxlCmsInterface& cms, + ImageF* diffmap, ThreadPool* pool) { + PROFILER_FUNC; + // Convert to linear sRGB (unless already in that space) + ImageMetadata metadata0 = *rgb0.metadata(); + ImageBundle store0(&metadata0); + const ImageBundle* linear_srgb0; + JXL_CHECK(TransformIfNeeded(rgb0, ColorEncoding::LinearSRGB(rgb0.IsGray()), + cms, pool, &store0, &linear_srgb0)); + ImageMetadata metadata1 = *rgb1.metadata(); + ImageBundle store1(&metadata1); + const ImageBundle* linear_srgb1; + JXL_CHECK(TransformIfNeeded(rgb1, ColorEncoding::LinearSRGB(rgb1.IsGray()), + cms, pool, &store1, &linear_srgb1)); + + // No alpha: skip blending, only need a single call to Butteraugli. + if (!rgb0.HasAlpha() && !rgb1.HasAlpha()) { + return ComputeScoreImpl(*linear_srgb0, *linear_srgb1, comparator, diffmap); + } + + // Blend on black and white backgrounds + + const float black = 0.0f; + ImageBundle blended_black0 = linear_srgb0->Copy(); + ImageBundle blended_black1 = linear_srgb1->Copy(); + AlphaBlend(black, &blended_black0); + AlphaBlend(black, &blended_black1); + + const float white = 1.0f; + ImageBundle blended_white0 = linear_srgb0->Copy(); + ImageBundle blended_white1 = linear_srgb1->Copy(); + + AlphaBlend(white, &blended_white0); + AlphaBlend(white, &blended_white1); + + ImageF diffmap_black, diffmap_white; + const float dist_black = ComputeScoreImpl(blended_black0, blended_black1, + comparator, &diffmap_black); + const float dist_white = ComputeScoreImpl(blended_white0, blended_white1, + comparator, &diffmap_white); + + // diffmap and return values are the max of diffmap_black/white. + if (diffmap != nullptr) { + const size_t xsize = rgb0.xsize(); + const size_t ysize = rgb0.ysize(); + *diffmap = ImageF(xsize, ysize); + for (size_t y = 0; y < ysize; ++y) { + const float* JXL_RESTRICT row_black = diffmap_black.ConstRow(y); + const float* JXL_RESTRICT row_white = diffmap_white.ConstRow(y); + float* JXL_RESTRICT row_out = diffmap->Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = std::max(row_black[x], row_white[x]); + } + } + } + return std::max(dist_black, dist_white); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_comparator.h b/third_party/jpeg-xl/lib/jxl/enc_comparator.h new file mode 100644 index 0000000000..0ac4df8296 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_comparator.h @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_COMPARATOR_H_ +#define LIB_JXL_ENC_COMPARATOR_H_ + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +class Comparator { + public: + virtual ~Comparator() = default; + + // Sets the reference image, the first to compare + // Image must be in linear sRGB (gamma expanded) in range 0.0f-1.0f as + // the range from standard black point to standard white point, but values + // outside permitted. + virtual Status SetReferenceImage(const ImageBundle& ref) = 0; + + // Sets the actual image (with loss), the second to compare + // Image must be in linear sRGB (gamma expanded) in range 0.0f-1.0f as + // the range from standard black point to standard white point, but values + // outside permitted. + // In diffmap it outputs the local score per pixel, while in score it outputs + // a single score. Any one may be set to nullptr to not compute it. + virtual Status CompareWith(const ImageBundle& actual, ImageF* diffmap, + float* score) = 0; + + // Quality thresholds for diffmap and score values. + // The good score must represent a value where the images are considered to + // be perceptually indistinguishable (but not identical) + // The bad value must be larger than good to indicate "lower means better" + // and smaller than good to indicate "higher means better" + virtual float GoodQualityScore() const = 0; + virtual float BadQualityScore() const = 0; +}; + +// Computes the score given images in any RGB color model, optionally with +// alpha channel. +float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1, + Comparator* comparator, const JxlCmsInterface& cms, + ImageF* diffmap = nullptr, ThreadPool* pool = nullptr); + +} // namespace jxl + +#endif // LIB_JXL_ENC_COMPARATOR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_context_map.cc b/third_party/jpeg-xl/lib/jxl/enc_context_map.cc new file mode 100644 index 0000000000..842dd12423 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_context_map.cc @@ -0,0 +1,141 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Library to encode the context map. + +#include "lib/jxl/enc_context_map.h" + +#include + +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/entropy_coder.h" + +namespace jxl { + +namespace { + +size_t IndexOf(const std::vector& v, uint8_t value) { + size_t i = 0; + for (; i < v.size(); ++i) { + if (v[i] == value) return i; + } + return i; +} + +void MoveToFront(std::vector* v, size_t index) { + uint8_t value = (*v)[index]; + for (size_t i = index; i != 0; --i) { + (*v)[i] = (*v)[i - 1]; + } + (*v)[0] = value; +} + +std::vector MoveToFrontTransform(const std::vector& v) { + if (v.empty()) return v; + uint8_t max_value = *std::max_element(v.begin(), v.end()); + std::vector mtf(max_value + 1); + for (size_t i = 0; i <= max_value; ++i) mtf[i] = i; + std::vector result(v.size()); + for (size_t i = 0; i < v.size(); ++i) { + size_t index = IndexOf(mtf, v[i]); + JXL_ASSERT(index < mtf.size()); + result[i] = static_cast(index); + MoveToFront(&mtf, index); + } + return result; +} + +} // namespace + +void EncodeContextMap(const std::vector& context_map, + size_t num_histograms, BitWriter* writer, size_t layer, + AuxOut* aux_out) { + if (num_histograms == 1) { + // Simple code + writer->Write(1, 1); + // 0 bits per entry. + writer->Write(2, 0); + return; + } + + std::vector transformed_symbols = MoveToFrontTransform(context_map); + std::vector> tokens(1), mtf_tokens(1); + EntropyEncodingData codes; + std::vector dummy_context_map; + for (size_t i = 0; i < context_map.size(); i++) { + tokens[0].emplace_back(0, context_map[i]); + } + for (size_t i = 0; i < transformed_symbols.size(); i++) { + mtf_tokens[0].emplace_back(0, transformed_symbols[i]); + } + HistogramParams params; + params.uint_method = HistogramParams::HybridUintMethod::kContextMap; + size_t ans_cost = BuildAndEncodeHistograms( + params, 1, tokens, &codes, &dummy_context_map, nullptr, 0, nullptr); + size_t mtf_cost = BuildAndEncodeHistograms( + params, 1, mtf_tokens, &codes, &dummy_context_map, nullptr, 0, nullptr); + bool use_mtf = mtf_cost < ans_cost; + // Rebuild token list. + tokens[0].clear(); + for (size_t i = 0; i < transformed_symbols.size(); i++) { + tokens[0].emplace_back(0, + use_mtf ? transformed_symbols[i] : context_map[i]); + } + size_t entry_bits = CeilLog2Nonzero(num_histograms); + size_t simple_cost = entry_bits * context_map.size(); + if (entry_bits < 4 && simple_cost < ans_cost && simple_cost < mtf_cost) { + writer->Write(1, 1); + writer->Write(2, entry_bits); + for (size_t i = 0; i < context_map.size(); i++) { + writer->Write(entry_bits, context_map[i]); + } + } else { + writer->Write(1, 0); + writer->Write(1, use_mtf); // Use/don't use MTF. + BuildAndEncodeHistograms(params, 1, tokens, &codes, &dummy_context_map, + writer, layer, aux_out); + WriteTokens(tokens[0], codes, dummy_context_map, writer); + } +} + +void EncodeBlockCtxMap(const BlockCtxMap& block_ctx_map, BitWriter* writer, + AuxOut* aux_out) { + auto& dct = block_ctx_map.dc_thresholds; + auto& qft = block_ctx_map.qf_thresholds; + auto& ctx_map = block_ctx_map.ctx_map; + BitWriter::Allotment allotment( + writer, + (dct[0].size() + dct[1].size() + dct[2].size() + qft.size()) * 34 + 1 + + 4 + 4 + ctx_map.size() * 10 + 1024); + if (dct[0].empty() && dct[1].empty() && dct[2].empty() && qft.empty() && + ctx_map.size() == 21 && + std::equal(ctx_map.begin(), ctx_map.end(), BlockCtxMap::kDefaultCtxMap)) { + writer->Write(1, 1); // default + allotment.ReclaimAndCharge(writer, kLayerAC, aux_out); + return; + } + writer->Write(1, 0); + for (int j : {0, 1, 2}) { + writer->Write(4, dct[j].size()); + for (int i : dct[j]) { + JXL_CHECK(U32Coder::Write(kDCThresholdDist, PackSigned(i), writer)); + } + } + writer->Write(4, qft.size()); + for (uint32_t i : qft) { + JXL_CHECK(U32Coder::Write(kQFThresholdDist, i - 1, writer)); + } + EncodeContextMap(ctx_map, block_ctx_map.num_ctxs, writer, kLayerAC, aux_out); + allotment.ReclaimAndCharge(writer, kLayerAC, aux_out); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_context_map.h b/third_party/jpeg-xl/lib/jxl/enc_context_map.h new file mode 100644 index 0000000000..041e71de7a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_context_map.h @@ -0,0 +1,35 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_CONTEXT_MAP_H_ +#define LIB_JXL_ENC_CONTEXT_MAP_H_ + +#include +#include + +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +struct AuxOut; + +// Max limit is 255 because encoding assumes numbers < 255 +// More clusters can help compression, but makes encode/decode somewhat slower +static const size_t kClustersLimit = 128; + +// Encodes the given context map to the bit stream. The number of different +// histogram ids is given by num_histograms. +void EncodeContextMap(const std::vector& context_map, + size_t num_histograms, BitWriter* writer, size_t layer, + AuxOut* aux_out); + +void EncodeBlockCtxMap(const BlockCtxMap& block_ctx_map, BitWriter* writer, + AuxOut* aux_out); +} // namespace jxl + +#endif // LIB_JXL_ENC_CONTEXT_MAP_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc new file mode 100644 index 0000000000..5819036987 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc @@ -0,0 +1,626 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_detect_dots.h" + +#include + +#include +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_detect_dots.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/enc_linalg.h" +#include "lib/jxl/enc_optimize.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +// Set JXL_DEBUG_DOT_DETECT to 1 to enable debugging. +#ifndef JXL_DEBUG_DOT_DETECT +#define JXL_DEBUG_DOT_DETECT 0 +#endif + +#if JXL_DEBUG_DOT_DETECT +#include "lib/jxl/enc_aux_out.h" +#endif + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::Sub; + +ImageF SumOfSquareDifferences(const Image3F& forig, const Image3F& smooth, + ThreadPool* pool) { + const HWY_FULL(float) d; + const auto color_coef0 = Set(d, 0.0f); + const auto color_coef1 = Set(d, 10.0f); + const auto color_coef2 = Set(d, 0.0f); + + ImageF sum_of_squares(forig.xsize(), forig.ysize()); + JXL_CHECK(RunOnPool( + pool, 0, forig.ysize(), ThreadPool::NoInit, + [&](const uint32_t task, size_t thread) { + const size_t y = static_cast(task); + const float* JXL_RESTRICT orig_row0 = forig.Plane(0).ConstRow(y); + const float* JXL_RESTRICT orig_row1 = forig.Plane(1).ConstRow(y); + const float* JXL_RESTRICT orig_row2 = forig.Plane(2).ConstRow(y); + const float* JXL_RESTRICT smooth_row0 = smooth.Plane(0).ConstRow(y); + const float* JXL_RESTRICT smooth_row1 = smooth.Plane(1).ConstRow(y); + const float* JXL_RESTRICT smooth_row2 = smooth.Plane(2).ConstRow(y); + float* JXL_RESTRICT sos_row = sum_of_squares.Row(y); + + for (size_t x = 0; x < forig.xsize(); x += Lanes(d)) { + auto v0 = Sub(Load(d, orig_row0 + x), Load(d, smooth_row0 + x)); + auto v1 = Sub(Load(d, orig_row1 + x), Load(d, smooth_row1 + x)); + auto v2 = Sub(Load(d, orig_row2 + x), Load(d, smooth_row2 + x)); + v0 = Mul(Mul(v0, v0), color_coef0); + v1 = Mul(Mul(v1, v1), color_coef1); + v2 = Mul(Mul(v2, v2), color_coef2); + const auto sos = + Add(v0, Add(v1, v2)); // weighted sum of square diffs + Store(sos, d, sos_row + x); + } + }, + "ComputeEnergyImage")); + return sum_of_squares; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(SumOfSquareDifferences); // Local function + +const int kEllipseWindowSize = 5; + +namespace { +struct GaussianEllipse { + double x; // position in x + double y; // position in y + double sigma_x; // scale in x + double sigma_y; // scale in y + double angle; // ellipse rotation in radians + std::array intensity; // intensity in each channel + + // The following variables do not need to be encoded + double l2_loss; // error after the Gaussian was fit + double l1_loss; + double ridge_loss; // the l2_loss plus regularization term + double custom_loss; // experimental custom loss + std::array bgColor; // best background color + size_t neg_pixels; // number of negative pixels when subtracting dot + std::array neg_value; // debt due to channel truncation +}; +double DotGaussianModel(double dx, double dy, double ct, double st, + double sigma_x, double sigma_y, double intensity) { + double rx = ct * dx + st * dy; + double ry = -st * dx + ct * dy; + double md = (rx * rx / sigma_x) + (ry * ry / sigma_y); + double value = intensity * exp(-0.5 * md); + return value; +} + +constexpr bool kOptimizeBackground = true; + +// Gaussian that smooths noise but preserves dots +const WeightsSeparable5& WeightsSeparable5Gaussian0_65() { + constexpr float w0 = 0.558311f; + constexpr float w1 = 0.210395f; + constexpr float w2 = 0.010449f; + static constexpr WeightsSeparable5 weights = { + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}}; + return weights; +} + +// (Iterated) Gaussian that removes dots. +const WeightsSeparable5& WeightsSeparable5Gaussian3() { + constexpr float w0 = 0.222338f; + constexpr float w1 = 0.210431f; + constexpr float w2 = 0.1784f; + static constexpr WeightsSeparable5 weights = { + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}}; + return weights; +} + +ImageF ComputeEnergyImage(const Image3F& orig, Image3F* smooth, + ThreadPool* pool) { + PROFILER_FUNC; + + // Prepare guidance images for dot selection. + Image3F forig(orig.xsize(), orig.ysize()); + *smooth = Image3F(orig.xsize(), orig.ysize()); + Rect rect(orig); + + const auto& weights1 = WeightsSeparable5Gaussian0_65(); + const auto& weights3 = WeightsSeparable5Gaussian3(); + + for (size_t c = 0; c < 3; ++c) { + // Use forig as temporary storage to reduce memory and keep it warmer. + Separable5(orig.Plane(c), rect, weights3, pool, &forig.Plane(c)); + Separable5(forig.Plane(c), rect, weights3, pool, &smooth->Plane(c)); + Separable5(orig.Plane(c), rect, weights1, pool, &forig.Plane(c)); + } + +#if JXL_DEBUG_DOT_DETECT + AuxOut aux; + aux.debug_prefix = "/tmp/sebastian/"; + aux.DumpImage("filtered", forig); + aux.DumpImage("sm", *smooth); +#endif + + return HWY_DYNAMIC_DISPATCH(SumOfSquareDifferences)(forig, *smooth, pool); +} + +struct Pixel { + int x; + int y; +}; + +Pixel operator+(const Pixel& a, const Pixel& b) { + return Pixel{a.x + b.x, a.y + b.y}; +} + +// Maximum area in pixels of a ellipse +const size_t kMaxCCSize = 1000; + +// Extracts a connected component from a Binary image where seed is part +// of the component +bool ExtractComponent(ImageF* img, std::vector* pixels, + const Pixel& seed, double threshold) { + PROFILER_FUNC; + static const std::vector neighbors{{1, -1}, {1, 0}, {1, 1}, {0, -1}, + {0, 1}, {-1, -1}, {-1, 1}, {1, 0}}; + std::vector q{seed}; + while (!q.empty()) { + Pixel current = q.back(); + q.pop_back(); + pixels->push_back(current); + if (pixels->size() > kMaxCCSize) return false; + for (const Pixel& delta : neighbors) { + Pixel child = current + delta; + if (child.x >= 0 && static_cast(child.x) < img->xsize() && + child.y >= 0 && static_cast(child.y) < img->ysize()) { + float* value = &img->Row(child.y)[child.x]; + if (*value > threshold) { + *value = 0.0; + q.push_back(child); + } + } + } + } + return true; +} + +inline bool PointInRect(const Rect& r, const Pixel& p) { + return (static_cast(p.x) >= r.x0() && + static_cast(p.x) < (r.x0() + r.xsize()) && + static_cast(p.y) >= r.y0() && + static_cast(p.y) < (r.y0() + r.ysize())); +} + +struct ConnectedComponent { + ConnectedComponent(const Rect& bounds, const std::vector&& pixels) + : bounds(bounds), pixels(pixels) {} + Rect bounds; + std::vector pixels; + float maxEnergy; + float meanEnergy; + float varEnergy; + float meanBg; + float varBg; + float score; + Pixel mode; + + void CompStats(const ImageF& energy, int extra) { + PROFILER_FUNC; + maxEnergy = 0.0; + meanEnergy = 0.0; + varEnergy = 0.0; + meanBg = 0.0; + varBg = 0.0; + int nIn = 0; + int nOut = 0; + mode.x = 0; + mode.y = 0; + for (int sy = -extra; sy < (static_cast(bounds.ysize()) + extra); + sy++) { + int y = sy + static_cast(bounds.y0()); + if (y < 0 || static_cast(y) >= energy.ysize()) continue; + const float* JXL_RESTRICT erow = energy.ConstRow(y); + for (int sx = -extra; sx < (static_cast(bounds.xsize()) + extra); + sx++) { + int x = sx + static_cast(bounds.x0()); + if (x < 0 || static_cast(x) >= energy.xsize()) continue; + if (erow[x] > maxEnergy) { + maxEnergy = erow[x]; + mode.x = x; + mode.y = y; + } + if (PointInRect(bounds, Pixel{x, y})) { + meanEnergy += erow[x]; + varEnergy += erow[x] * erow[x]; + nIn++; + } else { + meanBg += erow[x]; + varBg += erow[x] * erow[x]; + nOut++; + } + } + } + meanEnergy = meanEnergy / nIn; + meanBg = meanBg / nOut; + varEnergy = (varEnergy / nIn) - meanEnergy * meanEnergy; + varBg = (varBg / nOut) - meanBg * meanBg; + score = (meanEnergy - meanBg) / std::sqrt(varBg); + } +}; + +Rect BoundingRectangle(const std::vector& pixels) { + PROFILER_FUNC; + JXL_ASSERT(!pixels.empty()); + int low_x, high_x, low_y, high_y; + low_x = high_x = pixels[0].x; + low_y = high_y = pixels[0].y; + for (const Pixel& p : pixels) { + low_x = std::min(low_x, p.x); + high_x = std::max(high_x, p.x); + low_y = std::min(low_y, p.y); + high_y = std::max(high_y, p.y); + } + return Rect(low_x, low_y, high_x - low_x + 1, high_y - low_y + 1); +} + +std::vector FindCC(const ImageF& energy, double t_low, + double t_high, uint32_t maxWindow, + double minScore) { + PROFILER_FUNC; + const int kExtraRect = 4; + ImageF img = CopyImage(energy); + std::vector ans; + for (size_t y = 0; y < img.ysize(); y++) { + float* JXL_RESTRICT row = img.Row(y); + for (size_t x = 0; x < img.xsize(); x++) { + if (row[x] > t_high) { + std::vector pixels; + row[x] = 0.0; + bool success = ExtractComponent( + &img, &pixels, Pixel{static_cast(x), static_cast(y)}, + t_low); + if (!success) continue; +#if JXL_DEBUG_DOT_DETECT + for (size_t i = 0; i < pixels.size(); i++) { + fprintf(stderr, "(%d,%d) ", pixels[i].x, pixels[i].y); + } + fprintf(stderr, "\n"); +#endif // JXL_DEBUG_DOT_DETECT + Rect bounds = BoundingRectangle(pixels); + if (bounds.xsize() < maxWindow && bounds.ysize() < maxWindow) { + ConnectedComponent cc{bounds, std::move(pixels)}; + cc.CompStats(energy, kExtraRect); + if (cc.score < minScore) continue; + JXL_DEBUG(JXL_DEBUG_DOT_DETECT, + "cc mode: (%d,%d), max: %f, bgMean: %f bgVar: " + "%f bound:(%" PRIuS ",%" PRIuS ",%" PRIuS ",%" PRIuS ")\n", + cc.mode.x, cc.mode.y, cc.maxEnergy, cc.meanEnergy, + cc.varEnergy, cc.bounds.x0(), cc.bounds.y0(), + cc.bounds.xsize(), cc.bounds.ysize()); + ans.push_back(cc); + } + } + } + } + return ans; +} + +// TODO (sggonzalez): Adapt this function for the different color spaces or +// remove it if the color space with the best performance does not need it +void ComputeDotLosses(GaussianEllipse* ellipse, const ConnectedComponent& cc, + const Image3F& img, const Image3F& background) { + PROFILER_FUNC; + const int rectBounds = 2; + const double kIntensityR = 0.0; // 0.015; + const double kSigmaR = 0.0; // 0.01; + const double kZeroEpsilon = 0.1; // Tolerance to consider a value negative + double ct = cos(ellipse->angle), st = sin(ellipse->angle); + const std::array channelGains{{1.0, 1.0, 1.0}}; + int N = 0; + ellipse->l1_loss = 0.0; + ellipse->l2_loss = 0.0; + ellipse->neg_pixels = 0; + ellipse->neg_value.fill(0.0); + double distMeanModeSq = (cc.mode.x - ellipse->x) * (cc.mode.x - ellipse->x) + + (cc.mode.y - ellipse->y) * (cc.mode.y - ellipse->y); + ellipse->custom_loss = 0.0; + for (int c = 0; c < 3; c++) { + for (int sy = -rectBounds; + sy < (static_cast(cc.bounds.ysize()) + rectBounds); sy++) { + int y = sy + cc.bounds.y0(); + if (y < 0 || static_cast(y) >= img.ysize()) continue; + const float* JXL_RESTRICT row = img.ConstPlaneRow(c, y); + // bgrow is only used if kOptimizeBackground is false. + // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) + const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(c, y); + for (int sx = -rectBounds; + sx < (static_cast(cc.bounds.xsize()) + rectBounds); sx++) { + int x = sx + cc.bounds.x0(); + if (x < 0 || static_cast(x) >= img.xsize()) continue; + double target = row[x]; + double dotDelta = DotGaussianModel( + x - ellipse->x, y - ellipse->y, ct, st, ellipse->sigma_x, + ellipse->sigma_y, ellipse->intensity[c]); + if (dotDelta > target + kZeroEpsilon) { + ellipse->neg_pixels++; + ellipse->neg_value[c] += dotDelta - target; + } + double bkg = kOptimizeBackground ? ellipse->bgColor[c] : bgrow[x]; + double pred = bkg + dotDelta; + double diff = target - pred; + double l2 = channelGains[c] * diff * diff; + double l1 = channelGains[c] * std::fabs(diff); + ellipse->l2_loss += l2; + ellipse->l1_loss += l1; + double w = DotGaussianModel(x - cc.mode.x, y - cc.mode.y, 1.0, 0.0, + 1.0 + ellipse->sigma_x, + 1.0 + ellipse->sigma_y, 1.0); + ellipse->custom_loss += w * l2; + N++; + } + } + } + ellipse->l2_loss /= N; + ellipse->custom_loss /= N; + ellipse->custom_loss += 20.0 * distMeanModeSq + ellipse->neg_value[1]; + ellipse->l1_loss /= N; + double ridgeTerm = kSigmaR * ellipse->sigma_x + kSigmaR * ellipse->sigma_y; + for (int c = 0; c < 3; c++) { + ridgeTerm += kIntensityR * ellipse->intensity[c] * ellipse->intensity[c]; + } + ellipse->ridge_loss = ellipse->l2_loss + ridgeTerm; +} + +GaussianEllipse FitGaussianFast(const ConnectedComponent& cc, + const ImageF& energy, const Image3F& img, + const Image3F& background) { + PROFILER_FUNC; + constexpr bool leastSqIntensity = true; + constexpr double kEpsilon = 1e-6; + GaussianEllipse ans; + constexpr int kRectBounds = (kEllipseWindowSize >> 1); + + // Compute the 1st and 2nd moments of the CC + double sum = 0.0; + int N = 0; + std::array m1{{0.0, 0.0, 0.0}}; + std::array m2{{0.0, 0.0, 0.0}}; + std::array color{{0.0, 0.0, 0.0}}; + std::array bgColor{{0.0, 0.0, 0.0}}; + + JXL_DEBUG(JXL_DEBUG_DOT_DETECT, + "%" PRIuS " %" PRIuS " %" PRIuS " %" PRIuS "\n", cc.bounds.x0(), + cc.bounds.y0(), cc.bounds.xsize(), cc.bounds.ysize()); + for (int c = 0; c < 3; c++) { + color[c] = img.ConstPlaneRow(c, cc.mode.y)[cc.mode.x] - + background.ConstPlaneRow(c, cc.mode.y)[cc.mode.x]; + } + double sign = (color[1] > 0) ? 1 : -1; + for (int sy = -kRectBounds; sy <= kRectBounds; sy++) { + int y = sy + cc.mode.y; + if (y < 0 || static_cast(y) >= energy.ysize()) continue; + const float* JXL_RESTRICT row = img.ConstPlaneRow(1, y); + const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(1, y); + for (int sx = -kRectBounds; sx <= kRectBounds; sx++) { + int x = sx + cc.mode.x; + if (x < 0 || static_cast(x) >= energy.xsize()) continue; + double w = std::max(kEpsilon, sign * (row[x] - bgrow[x])); + sum += w; + + m1[0] += w * x; + m1[1] += w * y; + m2[0] += w * x * x; + m2[1] += w * x * y; + m2[2] += w * y * y; + for (int c = 0; c < 3; c++) { + bgColor[c] += background.ConstPlaneRow(c, y)[x]; + } + N++; + } + } + JXL_CHECK(N > 0); + + for (int i = 0; i < 3; i++) { + m1[i] /= sum; + m2[i] /= sum; + bgColor[i] /= N; + } + + // Some magic constants + constexpr double kSigmaMult = 1.0; + constexpr std::array kScaleMult{{1.1, 1.1, 1.1}}; + + // Now set the parameters of the Gaussian + ans.x = m1[0]; + ans.y = m1[1]; + for (int j = 0; j < 3; j++) { + ans.intensity[j] = kScaleMult[j] * color[j]; + } + + ImageD Sigma(2, 2), D(1, 2), U(2, 2); + Sigma.Row(0)[0] = m2[0] - m1[0] * m1[0]; + Sigma.Row(1)[1] = m2[2] - m1[1] * m1[1]; + Sigma.Row(0)[1] = Sigma.Row(1)[0] = m2[1] - m1[0] * m1[1]; + ConvertToDiagonal(Sigma, &D, &U); + const double* JXL_RESTRICT d = D.ConstRow(0); + const double* JXL_RESTRICT u = U.ConstRow(1); + int p1 = 0, p2 = 1; + if (d[0] < d[1]) std::swap(p1, p2); + ans.sigma_x = kSigmaMult * d[p1]; + ans.sigma_y = kSigmaMult * d[p2]; + ans.angle = std::atan2(u[p1], u[p2]); + ans.l2_loss = 0.0; + ans.bgColor = bgColor; + if (leastSqIntensity) { + GaussianEllipse* ellipse = &ans; + double ct = cos(ans.angle), st = sin(ans.angle); + // Estimate intensity with least squares (fixed background) + for (int c = 0; c < 3; c++) { + double gg = 0.0; + double gd = 0.0; + int yc = static_cast(cc.mode.y); + int xc = static_cast(cc.mode.x); + for (int y = yc - kRectBounds; y <= yc + kRectBounds; y++) { + if (y < 0 || static_cast(y) >= img.ysize()) continue; + const float* JXL_RESTRICT row = img.ConstPlaneRow(c, y); + const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(c, y); + for (int x = xc - kRectBounds; x <= xc + kRectBounds; x++) { + if (x < 0 || static_cast(x) >= img.xsize()) continue; + double target = row[x] - bgrow[x]; + double gaussian = + DotGaussianModel(x - ellipse->x, y - ellipse->y, ct, st, + ellipse->sigma_x, ellipse->sigma_y, 1.0); + gg += gaussian * gaussian; + gd += gaussian * target; + } + } + ans.intensity[c] = gd / (gg + 1e-6); // Regularized least squares + } + } + ComputeDotLosses(&ans, cc, img, background); + return ans; +} + +GaussianEllipse FitGaussian(const ConnectedComponent& cc, const ImageF& energy, + const Image3F& img, const Image3F& background) { + auto ellipse = FitGaussianFast(cc, energy, img, background); + if (ellipse.sigma_x < ellipse.sigma_y) { + std::swap(ellipse.sigma_x, ellipse.sigma_y); + ellipse.angle += kPi / 2.0; + } + ellipse.angle -= kPi * std::floor(ellipse.angle / kPi); + if (fabs(ellipse.angle - kPi) < 1e-6 || fabs(ellipse.angle) < 1e-6) { + ellipse.angle = 0.0; + } + JXL_CHECK(ellipse.angle >= 0 && ellipse.angle <= kPi && + ellipse.sigma_x >= ellipse.sigma_y); + JXL_DEBUG(JXL_DEBUG_DOT_DETECT, + "Ellipse mu=(%lf,%lf) sigma=(%lf,%lf) angle=%lf " + "intensity=(%lf,%lf,%lf) bg=(%lf,%lf,%lf) l2_loss=%lf " + "custom_loss=%lf, neg_pix=%" PRIuS ", neg_v=(%lf,%lf,%lf)\n", + ellipse.x, ellipse.y, ellipse.sigma_x, ellipse.sigma_y, + ellipse.angle, ellipse.intensity[0], ellipse.intensity[1], + ellipse.intensity[2], ellipse.bgColor[0], ellipse.bgColor[1], + ellipse.bgColor[2], ellipse.l2_loss, ellipse.custom_loss, + ellipse.neg_pixels, ellipse.neg_value[0], ellipse.neg_value[1], + ellipse.neg_value[2]); + return ellipse; +} + +} // namespace + +std::vector DetectGaussianEllipses( + const Image3F& opsin, const GaussianDetectParams& params, + const EllipseQuantParams& qParams, ThreadPool* pool) { + PROFILER_FUNC; + std::vector dots; + Image3F smooth(opsin.xsize(), opsin.ysize()); + ImageF energy = ComputeEnergyImage(opsin, &smooth, pool); +#if JXL_DEBUG_DOT_DETECT + AuxOut aux; + aux.debug_prefix = "/tmp/sebastian/"; + aux.DumpXybImage("smooth", smooth); + aux.DumpPlaneNormalized("energy", energy); +#endif // JXL_DEBUG_DOT_DETECT + std::vector components = FindCC( + energy, params.t_low, params.t_high, params.maxWinSize, params.minScore); + size_t numCC = + std::min(params.maxCC, (components.size() * params.percCC) / 100); + if (components.size() > numCC) { + std::sort( + components.begin(), components.end(), + [](const ConnectedComponent& a, const ConnectedComponent& b) -> bool { + return a.score > b.score; + }); + components.erase(components.begin() + numCC, components.end()); + } + for (const auto& cc : components) { + GaussianEllipse ellipse = FitGaussian(cc, energy, opsin, smooth); + if (ellipse.x < 0.0 || + std::ceil(ellipse.x) >= static_cast(opsin.xsize()) || + ellipse.y < 0.0 || + std::ceil(ellipse.y) >= static_cast(opsin.ysize())) { + continue; + } + if (ellipse.neg_pixels > params.maxNegPixels) continue; + double intensity = 0.21 * ellipse.intensity[0] + + 0.72 * ellipse.intensity[1] + + 0.07 * ellipse.intensity[2]; + double intensitySq = intensity * intensity; + // for (int c = 0; c < 3; c++) { + // intensitySq += ellipse.intensity[c] * ellipse.intensity[c]; + //} + double sqDistMeanMode = (ellipse.x - cc.mode.x) * (ellipse.x - cc.mode.x) + + (ellipse.y - cc.mode.y) * (ellipse.y - cc.mode.y); + if (ellipse.l2_loss < params.maxL2Loss && + ellipse.custom_loss < params.maxCustomLoss && + intensitySq > (params.minIntensity * params.minIntensity) && + sqDistMeanMode < params.maxDistMeanMode * params.maxDistMeanMode) { + size_t x0 = cc.bounds.x0(); + size_t y0 = cc.bounds.y0(); + dots.emplace_back(); + dots.back().second.emplace_back(x0, y0); + QuantizedPatch& patch = dots.back().first; + patch.xsize = cc.bounds.xsize(); + patch.ysize = cc.bounds.ysize(); + for (size_t y = 0; y < patch.ysize; y++) { + for (size_t x = 0; x < patch.xsize; x++) { + for (size_t c = 0; c < 3; c++) { + patch.fpixels[c][y * patch.xsize + x] = + opsin.ConstPlaneRow(c, y0 + y)[x0 + x] - + smooth.ConstPlaneRow(c, y0 + y)[x0 + x]; + } + } + } + } + } +#if JXL_DEBUG_DOT_DETECT + JXL_DEBUG(JXL_DEBUG_DOT_DETECT, "Candidates: %" PRIuS ", Dots: %" PRIuS "\n", + components.size(), dots.size()); + ApplyGaussianEllipses(&smooth, dots, 1.0); + aux.DumpXybImage("draw", smooth); + ApplyGaussianEllipses(&smooth, dots, -1.0); + + auto qdots = QuantizeGaussianEllipses(dots, qParams); + auto deq = DequantizeGaussianEllipses(qdots, qParams); + ApplyGaussianEllipses(&smooth, deq, 1.0); + aux.DumpXybImage("qdraw", smooth); + ApplyGaussianEllipses(&smooth, deq, -1.0); +#endif // JXL_DEBUG_DOT_DETECT + return dots; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/enc_detect_dots.h b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.h new file mode 100644 index 0000000000..c3071d9a2f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.h @@ -0,0 +1,67 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// We attempt to remove dots, or speckle from images using Gaussian blur. +#ifndef LIB_JXL_ENC_DETECT_DOTS_H_ +#define LIB_JXL_ENC_DETECT_DOTS_H_ + +#include +#include + +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/image.h" + +namespace jxl { + +struct GaussianDetectParams { + double t_high = 0; // at least one pixel must have larger energy than t_high + double t_low = 0; // all pixels must have a larger energy than tLow + uint32_t maxWinSize = 0; // discard dots larger than this containing window + double maxL2Loss = 0; + double maxCustomLoss = 0; + double minIntensity = 0; // If the intensity is too low, discard it + double maxDistMeanMode = 0; // The mean and the mode must be close + size_t maxNegPixels = 0; // Maximum number of negative pixel + size_t minScore = 0; + size_t maxCC = 50; // Maximum number of CC to keep + size_t percCC = 15; // Percentage in [0,100] of CC to keep +}; + +// Ellipse Quantization Params +struct EllipseQuantParams { + size_t xsize; // Image size in x + size_t ysize; // Image size in y + size_t qPosition; // Position quantization delta + // Quantization for the Gaussian sigma parameters + double minSigma; + double maxSigma; + size_t qSigma; // number of quantization levels + // Quantization for the rotation angle (between -pi and pi) + size_t qAngle; + // Quantization for the intensity + std::array minIntensity; + std::array maxIntensity; + std::array qIntensity; // number of quantization levels + // Extra parameters for the encoding + bool subtractQuantized; // Should we subtract quantized or detected dots? + float ytox; + float ytob; + + void QuantPositionSize(size_t* xsize, size_t* ysize) const; +}; + +// Detects dots in XYB image. +std::vector DetectGaussianEllipses( + const Image3F& opsin, const GaussianDetectParams& params, + const EllipseQuantParams& qParams, ThreadPool* pool); + +} // namespace jxl + +#endif // LIB_JXL_ENC_DETECT_DOTS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc new file mode 100644 index 0000000000..2d22c1edb8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc @@ -0,0 +1,71 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_dot_dictionary.h" + +#include +#include + +#include +#include + +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_detect_dots.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/image.h" + +namespace jxl { + +// Private implementation of Dictionary Encode/Decode +namespace { + +/* Quantization constants for Ellipse dots */ +const size_t kEllipsePosQ = 2; // Quantization level for the position +const double kEllipseMinSigma = 0.1; // Minimum sigma value +const double kEllipseMaxSigma = 3.1; // Maximum Sigma value +const size_t kEllipseSigmaQ = 16; // Number of quantization levels for sigma +const size_t kEllipseAngleQ = 8; // Quantization level for the angle +// TODO: fix these values. +const std::array kEllipseMinIntensity{{-0.05, 0.0, -0.5}}; +const std::array kEllipseMaxIntensity{{0.05, 1.0, 0.4}}; +const std::array kEllipseIntensityQ{{10, 36, 10}}; +} // namespace + +std::vector FindDotDictionary(const CompressParams& cparams, + const Image3F& opsin, + const ColorCorrelationMap& cmap, + ThreadPool* pool) { + if (ApplyOverride(cparams.dots, + cparams.butteraugli_distance >= kMinButteraugliForDots)) { + GaussianDetectParams ellipse_params; + ellipse_params.t_high = 0.04; + ellipse_params.t_low = 0.02; + ellipse_params.maxWinSize = 5; + ellipse_params.maxL2Loss = 0.005; + ellipse_params.maxCustomLoss = 300; + ellipse_params.minIntensity = 0.12; + ellipse_params.maxDistMeanMode = 1.0; + ellipse_params.maxNegPixels = 0; + ellipse_params.minScore = 12.0; + ellipse_params.maxCC = 100; + ellipse_params.percCC = 100; + EllipseQuantParams qParams{ + opsin.xsize(), opsin.ysize(), kEllipsePosQ, + kEllipseMinSigma, kEllipseMaxSigma, kEllipseSigmaQ, + kEllipseAngleQ, kEllipseMinIntensity, kEllipseMaxIntensity, + kEllipseIntensityQ, kEllipsePosQ <= 5, cmap.YtoXRatio(0), + cmap.YtoBRatio(0)}; + + return DetectGaussianEllipses(opsin, ellipse_params, qParams, pool); + } + return {}; +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h new file mode 100644 index 0000000000..2ba4393f30 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h @@ -0,0 +1,34 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_DOT_DICTIONARY_H_ +#define LIB_JXL_ENC_DOT_DICTIONARY_H_ + +// Dots are stored in a dictionary to avoid storing similar dots multiple +// times. + +#include + +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/image.h" + +namespace jxl { + +std::vector FindDotDictionary(const CompressParams& cparams, + const Image3F& opsin, + const ColorCorrelationMap& cmap, + ThreadPool* pool); + +} // namespace jxl + +#endif // LIB_JXL_ENC_DOT_DICTIONARY_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc new file mode 100644 index 0000000000..c634445e83 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc @@ -0,0 +1,274 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_entropy_coder.h" + +#include +#include + +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_entropy_coder.cc" +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_context_map.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::AndNot; +using hwy::HWY_NAMESPACE::Eq; +using hwy::HWY_NAMESPACE::GetLane; + +// Returns number of non-zero coefficients (but skip LLF). +// We cannot rely on block[] being all-zero bits, so first truncate to integer. +// Also writes the per-8x8 block nzeros starting at nzeros_pos. +int32_t NumNonZeroExceptLLF(const size_t cx, const size_t cy, + const AcStrategy acs, const size_t covered_blocks, + const size_t log2_covered_blocks, + const int32_t* JXL_RESTRICT block, + const size_t nzeros_stride, + int32_t* JXL_RESTRICT nzeros_pos) { + const HWY_CAPPED(int32_t, kBlockDim) di; + + const auto zero = Zero(di); + // Add FF..FF for every zero coefficient, negate to get #zeros. + auto neg_sum_zero = zero; + + { + // Mask sufficient for one row of coefficients. + HWY_ALIGN const int32_t + llf_mask_lanes[AcStrategy::kMaxCoeffBlocks * (1 + kBlockDim)] = { + -1, -1, -1, -1}; + // First cx=1,2,4 elements are FF..FF, others 0. + const int32_t* llf_mask_pos = + llf_mask_lanes + AcStrategy::kMaxCoeffBlocks - cx; + + // Rows with LLF: mask out the LLF + for (size_t y = 0; y < cy; y++) { + for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) { + const auto llf_mask = LoadU(di, llf_mask_pos + x); + + // LLF counts as zero so we don't include it in nzeros. + const auto coef = + AndNot(llf_mask, Load(di, &block[y * cx * kBlockDim + x])); + + neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero))); + } + } + } + + // Remaining rows: no mask + for (size_t y = cy; y < cy * kBlockDim; y++) { + for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) { + const auto coef = Load(di, &block[y * cx * kBlockDim + x]); + neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero))); + } + } + + // We want area - sum_zero, add because neg_sum_zero is already negated. + const int32_t nzeros = + int32_t(cx * cy * kDCTBlockSize) + GetLane(SumOfLanes(di, neg_sum_zero)); + + const int32_t shifted_nzeros = static_cast( + (nzeros + covered_blocks - 1) >> log2_covered_blocks); + // Need non-canonicalized dimensions! + for (size_t y = 0; y < acs.covered_blocks_y(); y++) { + for (size_t x = 0; x < acs.covered_blocks_x(); x++) { + nzeros_pos[x + y * nzeros_stride] = shifted_nzeros; + } + } + + return nzeros; +} + +// Specialization for 8x8, where only top-left is LLF/DC. +// About 1% overall speedup vs. NumNonZeroExceptLLF. +int32_t NumNonZero8x8ExceptDC(const int32_t* JXL_RESTRICT block, + int32_t* JXL_RESTRICT nzeros_pos) { + const HWY_CAPPED(int32_t, kBlockDim) di; + + const auto zero = Zero(di); + // Add FF..FF for every zero coefficient, negate to get #zeros. + auto neg_sum_zero = zero; + + { + // First row has DC, so mask + const size_t y = 0; + HWY_ALIGN const int32_t dc_mask_lanes[kBlockDim] = {-1}; + + for (size_t x = 0; x < kBlockDim; x += Lanes(di)) { + const auto dc_mask = Load(di, dc_mask_lanes + x); + + // DC counts as zero so we don't include it in nzeros. + const auto coef = AndNot(dc_mask, Load(di, &block[y * kBlockDim + x])); + + neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero))); + } + } + + // Remaining rows: no mask + for (size_t y = 1; y < kBlockDim; y++) { + for (size_t x = 0; x < kBlockDim; x += Lanes(di)) { + const auto coef = Load(di, &block[y * kBlockDim + x]); + neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero))); + } + } + + // We want 64 - sum_zero, add because neg_sum_zero is already negated. + const int32_t nzeros = + int32_t(kDCTBlockSize) + GetLane(SumOfLanes(di, neg_sum_zero)); + + *nzeros_pos = nzeros; + + return nzeros; +} + +// The number of nonzeros of each block is predicted from the top and the left +// blocks, with opportune scaling to take into account the number of blocks of +// each strategy. The predicted number of nonzeros divided by two is used as a +// context; if this number is above 63, a specific context is used. If the +// number of nonzeros of a strategy is above 63, it is written directly using a +// fixed number of bits (that depends on the size of the strategy). +void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders, + const Rect& rect, + const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows, + const AcStrategyImage& ac_strategy, + YCbCrChromaSubsampling cs, + Image3I* JXL_RESTRICT tmp_num_nzeroes, + std::vector* JXL_RESTRICT output, + const ImageB& qdc, const ImageI& qf, + const BlockCtxMap& block_ctx_map) { + const size_t xsize_blocks = rect.xsize(); + const size_t ysize_blocks = rect.ysize(); + + // TODO(user): update the estimate: usually less coefficients are used. + output->reserve(output->size() + + 3 * xsize_blocks * ysize_blocks * kDCTBlockSize); + + size_t offset[3] = {}; + const size_t nzeros_stride = tmp_num_nzeroes->PixelsPerRow(); + for (size_t by = 0; by < ysize_blocks; ++by) { + size_t sby[3] = {by >> cs.VShift(0), by >> cs.VShift(1), + by >> cs.VShift(2)}; + int32_t* JXL_RESTRICT row_nzeros[3] = { + tmp_num_nzeroes->PlaneRow(0, sby[0]), + tmp_num_nzeroes->PlaneRow(1, sby[1]), + tmp_num_nzeroes->PlaneRow(2, sby[2]), + }; + const int32_t* JXL_RESTRICT row_nzeros_top[3] = { + sby[0] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(0, sby[0] - 1), + sby[1] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(1, sby[1] - 1), + sby[2] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(2, sby[2] - 1), + }; + const uint8_t* JXL_RESTRICT row_qdc = + qdc.ConstRow(rect.y0() + by) + rect.x0(); + const int32_t* JXL_RESTRICT row_qf = rect.ConstRow(qf, by); + AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by); + for (size_t bx = 0; bx < xsize_blocks; ++bx) { + AcStrategy acs = acs_row[bx]; + if (!acs.IsFirstBlock()) continue; + size_t sbx[3] = {bx >> cs.HShift(0), bx >> cs.HShift(1), + bx >> cs.HShift(2)}; + size_t cx = acs.covered_blocks_x(); + size_t cy = acs.covered_blocks_y(); + const size_t covered_blocks = cx * cy; // = #LLF coefficients + const size_t log2_covered_blocks = + Num0BitsBelowLS1Bit_Nonzero(covered_blocks); + const size_t size = covered_blocks * kDCTBlockSize; + + CoefficientLayout(&cy, &cx); // swap cx/cy to canonical order + + for (int c : {1, 0, 2}) { + if (sbx[c] << cs.HShift(c) != bx) continue; + if (sby[c] << cs.VShift(c) != by) continue; + const int32_t* JXL_RESTRICT block = ac_rows[c] + offset[c]; + + int32_t nzeros = + (covered_blocks == 1) + ? NumNonZero8x8ExceptDC(block, row_nzeros[c] + sbx[c]) + : NumNonZeroExceptLLF(cx, cy, acs, covered_blocks, + log2_covered_blocks, block, nzeros_stride, + row_nzeros[c] + sbx[c]); + + int ord = kStrategyOrder[acs.RawStrategy()]; + const coeff_order_t* JXL_RESTRICT order = + &orders[CoeffOrderOffset(ord, c)]; + + int32_t predicted_nzeros = + PredictFromTopAndLeft(row_nzeros_top[c], row_nzeros[c], sbx[c], 32); + size_t block_ctx = + block_ctx_map.Context(row_qdc[bx], row_qf[sbx[c]], ord, c); + const int32_t nzero_ctx = + block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx); + + output->emplace_back(nzero_ctx, nzeros); + const size_t histo_offset = + block_ctx_map.ZeroDensityContextsOffset(block_ctx); + // Skip LLF. + size_t prev = (nzeros > static_cast(size / 16) ? 0 : 1); + for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) { + int32_t coeff = block[order[k]]; + size_t ctx = + histo_offset + ZeroDensityContext(nzeros, k, covered_blocks, + log2_covered_blocks, prev); + uint32_t u_coeff = PackSigned(coeff); + output->emplace_back(ctx, u_coeff); + prev = coeff != 0; + nzeros -= prev; + } + JXL_DASSERT(nzeros == 0); + offset[c] += size; + } + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(TokenizeCoefficients); +void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders, + const Rect& rect, + const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows, + const AcStrategyImage& ac_strategy, + YCbCrChromaSubsampling cs, + Image3I* JXL_RESTRICT tmp_num_nzeroes, + std::vector* JXL_RESTRICT output, + const ImageB& qdc, const ImageI& qf, + const BlockCtxMap& block_ctx_map) { + return HWY_DYNAMIC_DISPATCH(TokenizeCoefficients)( + orders, rect, ac_rows, ac_strategy, cs, tmp_num_nzeroes, output, qdc, qf, + block_ctx_map); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h new file mode 100644 index 0000000000..7dfc71c726 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h @@ -0,0 +1,46 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_ENTROPY_CODER_H_ +#define LIB_JXL_ENC_ENTROPY_CODER_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/ac_context.h" // BlockCtxMap +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/frame_header.h" // YCbCrChromaSubsampling +#include "lib/jxl/image.h" + +// Entropy coding and context modeling of DC and AC coefficients, as well as AC +// strategy and quantization field. + +namespace jxl { + +// Generate DCT NxN quantized AC values tokens. +// Only the subset "rect" [in units of blocks] within all images. +// See also DecodeACVarBlock. +void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders, + const Rect& rect, + const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows, + const AcStrategyImage& ac_strategy, + YCbCrChromaSubsampling cs, + Image3I* JXL_RESTRICT tmp_num_nzeroes, + std::vector* JXL_RESTRICT output, + const ImageB& qdc, const ImageI& qf, + const BlockCtxMap& block_ctx_map); + +} // namespace jxl + +#endif // LIB_JXL_ENC_ENTROPY_CODER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image.cc b/third_party/jpeg-xl/lib/jxl/enc_external_image.cc new file mode 100644 index 0000000000..1408746476 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_external_image.cc @@ -0,0 +1,183 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_external_image.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/float.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" + +namespace jxl { +namespace { + +size_t JxlDataTypeBytes(JxlDataType data_type) { + switch (data_type) { + case JXL_TYPE_UINT8: + return 1; + case JXL_TYPE_UINT16: + return 2; + case JXL_TYPE_FLOAT16: + return 2; + case JXL_TYPE_FLOAT: + return 4; + default: + return 0; + } +} + +} // namespace + +Status ConvertFromExternal(Span bytes, size_t xsize, + size_t ysize, size_t bits_per_sample, + JxlPixelFormat format, size_t c, ThreadPool* pool, + ImageF* channel) { + if (format.data_type == JXL_TYPE_UINT8) { + JXL_RETURN_IF_ERROR(bits_per_sample > 0 && bits_per_sample <= 8); + } else if (format.data_type == JXL_TYPE_UINT16) { + JXL_RETURN_IF_ERROR(bits_per_sample > 8 && bits_per_sample <= 16); + } else if (format.data_type == JXL_TYPE_FLOAT16) { + JXL_RETURN_IF_ERROR(bits_per_sample == 16); + } else if (format.data_type == JXL_TYPE_FLOAT) { + JXL_RETURN_IF_ERROR(bits_per_sample == 32); + } else { + JXL_FAILURE("unsupported pixel format data type %d", format.data_type); + } + size_t bytes_per_channel = JxlDataTypeBytes(format.data_type); + size_t bytes_per_pixel = format.num_channels * bytes_per_channel; + size_t pixel_offset = c * bytes_per_channel; + // Only for uint8/16. + float scale = 1. / ((1ull << bits_per_sample) - 1); + + const size_t last_row_size = xsize * bytes_per_pixel; + const size_t align = format.align; + const size_t row_size = + (align > 1 ? jxl::DivCeil(last_row_size, align) * align : last_row_size); + const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size; + if (xsize == 0 || ysize == 0) return JXL_FAILURE("Empty image"); + if (bytes.size() < bytes_to_read) { + return JXL_FAILURE("Buffer size is too small, expected: %" PRIuS + " got: %" PRIuS " (Image: %" PRIuS "x%" PRIuS + "x%u, bytes_per_channel: %" PRIuS ")", + bytes_to_read, bytes.size(), xsize, ysize, + format.num_channels, bytes_per_channel); + } + JXL_ASSERT(channel->xsize() == xsize); + JXL_ASSERT(channel->ysize() == ysize); + // Too large buffer is likely an application bug, so also fail for that. + // Do allow padding to stride in last row though. + if (bytes.size() > row_size * ysize) { + return JXL_FAILURE("Buffer size is too large"); + } + + const bool little_endian = + format.endianness == JXL_LITTLE_ENDIAN || + (format.endianness == JXL_NATIVE_ENDIAN && IsLittleEndian()); + + const uint8_t* const in = bytes.data(); + + std::atomic error_count = {0}; + + const auto convert_row = [&](const uint32_t task, size_t /*thread*/) { + const size_t y = task; + size_t offset = row_size * task + pixel_offset; + float* JXL_RESTRICT row_out = channel->Row(y); + const auto save_value = [&](size_t index, float value) { + row_out[index] = value; + }; + if (!LoadFloatRow(in + offset, xsize, bytes_per_pixel, format.data_type, + little_endian, scale, save_value)) { + error_count++; + } + }; + JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast(ysize), + ThreadPool::NoInit, convert_row, + "ConvertExtraChannel")); + + if (error_count) { + JXL_FAILURE("unsupported pixel format data type"); + } + + return true; +} +Status ConvertFromExternal(Span bytes, size_t xsize, + size_t ysize, const ColorEncoding& c_current, + size_t bits_per_sample, JxlPixelFormat format, + ThreadPool* pool, ImageBundle* ib) { + const size_t color_channels = c_current.Channels(); + bool has_alpha = format.num_channels == 2 || format.num_channels == 4; + if (format.num_channels < color_channels) { + return JXL_FAILURE("Expected %" PRIuS + " color channels, received only %u channels", + color_channels, format.num_channels); + } + + Image3F color(xsize, ysize); + for (size_t c = 0; c < color_channels; ++c) { + JXL_RETURN_IF_ERROR(ConvertFromExternal(bytes, xsize, ysize, + bits_per_sample, format, c, pool, + &color.Plane(c))); + } + if (color_channels == 1) { + CopyImageTo(color.Plane(0), &color.Plane(1)); + CopyImageTo(color.Plane(0), &color.Plane(2)); + } + ib->SetFromImage(std::move(color), c_current); + + // Passing an interleaved image with an alpha channel to an image that doesn't + // have alpha channel just discards the passed alpha channel. + if (has_alpha && ib->HasAlpha()) { + ImageF alpha(xsize, ysize); + JXL_RETURN_IF_ERROR( + ConvertFromExternal(bytes, xsize, ysize, bits_per_sample, format, + format.num_channels - 1, pool, &alpha)); + ib->SetAlpha(std::move(alpha)); + } else if (!has_alpha && ib->HasAlpha()) { + // if alpha is not passed, but it is expected, then assume + // it is all-opaque + ImageF alpha(xsize, ysize); + FillImage(1.0f, &alpha); + ib->SetAlpha(std::move(alpha)); + } + + return true; +} + +Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize, + size_t ysize, const void* buffer, size_t size, + ThreadPool* pool, ImageF* channel) { + size_t bitdepth = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte; + return ConvertFromExternal( + jxl::Span(static_cast(buffer), size), + xsize, ysize, bitdepth, pixel_format, 0, pool, channel); +} + +Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize, + uint32_t ysize, const void* buffer, size_t size, + jxl::ThreadPool* pool, + const jxl::ColorEncoding& c_current, + jxl::ImageBundle* ib) { + size_t bitdepth = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte; + JXL_RETURN_IF_ERROR(ConvertFromExternal( + jxl::Span(static_cast(buffer), size), + xsize, ysize, c_current, bitdepth, pixel_format, pool, ib)); + ib->VerifyMetadata(); + + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image.h b/third_party/jpeg-xl/lib/jxl/enc_external_image.h new file mode 100644 index 0000000000..3b2b295076 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_external_image.h @@ -0,0 +1,45 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_EXTERNAL_IMAGE_H_ +#define LIB_JXL_ENC_EXTERNAL_IMAGE_H_ + +// Interleaved image for color transforms and Codec. + +#include +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { +Status ConvertFromExternal(Span bytes, size_t xsize, + size_t ysize, size_t bits_per_sample, + JxlPixelFormat format, size_t c, ThreadPool* pool, + ImageF* channel); + +// Convert an interleaved pixel buffer to the internal ImageBundle +// representation. This is the opposite of ConvertToExternal(). +Status ConvertFromExternal(Span bytes, size_t xsize, + size_t ysize, const ColorEncoding& c_current, + size_t bits_per_sample, JxlPixelFormat format, + ThreadPool* pool, ImageBundle* ib); +Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize, + size_t ysize, const void* buffer, size_t size, + ThreadPool* pool, ImageF* channel); +Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize, + uint32_t ysize, const void* buffer, size_t size, + jxl::ThreadPool* pool, + const jxl::ColorEncoding& c_current, + jxl::ImageBundle* ib); + +} // namespace jxl + +#endif // LIB_JXL_ENC_EXTERNAL_IMAGE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc b/third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc new file mode 100644 index 0000000000..4b7147817a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc @@ -0,0 +1,46 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "benchmark/benchmark.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { +namespace { + +// Encoder case, deinterleaves a buffer. +void BM_EncExternalImage_ConvertImageRGBA(benchmark::State& state) { + const size_t kNumIter = 5; + size_t xsize = state.range(); + size_t ysize = state.range(); + + ImageMetadata im; + im.SetAlphaBits(8); + ImageBundle ib(&im); + + std::vector interleaved(xsize * ysize * 4); + JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0}; + for (auto _ : state) { + for (size_t i = 0; i < kNumIter; ++i) { + JXL_CHECK(ConvertFromExternal( + Span(interleaved.data(), interleaved.size()), xsize, + ysize, + /*c_current=*/ColorEncoding::SRGB(), + /*bits_per_sample=*/8, format, + /*pool=*/nullptr, &ib)); + } + } + + // Pixels per second. + state.SetItemsProcessed(kNumIter * state.iterations() * xsize * ysize); + state.SetBytesProcessed(kNumIter * state.iterations() * interleaved.size()); +} + +BENCHMARK(BM_EncExternalImage_ConvertImageRGBA) + ->RangeMultiplier(2) + ->Range(256, 2048); + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc b/third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc new file mode 100644 index 0000000000..7be8d45f2d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc @@ -0,0 +1,79 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_external_image.h" + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +#if !defined(JXL_CRASH_ON_ERROR) +TEST(ExternalImageTest, InvalidSize) { + ImageMetadata im; + im.SetAlphaBits(8); + ImageBundle ib(&im); + + JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + const uint8_t buf[10 * 100 * 8] = {}; + EXPECT_FALSE(ConvertFromExternal( + Span(buf, 10), /*xsize=*/10, /*ysize=*/100, + /*c_current=*/ColorEncoding::SRGB(), + /*bits_per_sample=*/16, format, nullptr, &ib)); + EXPECT_FALSE(ConvertFromExternal( + Span(buf, sizeof(buf) - 1), /*xsize=*/10, /*ysize=*/100, + /*c_current=*/ColorEncoding::SRGB(), + /*bits_per_sample=*/16, format, nullptr, &ib)); + EXPECT_TRUE( + ConvertFromExternal(Span(buf, sizeof(buf)), /*xsize=*/10, + /*ysize=*/100, /*c_current=*/ColorEncoding::SRGB(), + /*bits_per_sample=*/16, format, nullptr, &ib)); +} +#endif + +TEST(ExternalImageTest, AlphaMissing) { + ImageMetadata im; + im.SetAlphaBits(0); // No alpha + ImageBundle ib(&im); + + const size_t xsize = 10; + const size_t ysize = 20; + const uint8_t buf[xsize * ysize * 4] = {}; + + JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0}; + // has_alpha is true but the ImageBundle has no alpha. Alpha channel should + // be ignored. + EXPECT_TRUE(ConvertFromExternal(Span(buf, sizeof(buf)), xsize, + ysize, + /*c_current=*/ColorEncoding::SRGB(), + /*bits_per_sample=*/8, format, nullptr, &ib)); + EXPECT_FALSE(ib.HasAlpha()); +} + +TEST(ExternalImageTest, AlphaPremultiplied) { + ImageMetadata im; + im.SetAlphaBits(8, true); + + ImageBundle ib(&im); + const size_t xsize = 10; + const size_t ysize = 20; + const size_t size = xsize * ysize * 8; + const uint8_t buf[size] = {}; + + JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + EXPECT_TRUE(BufferToImageBundle(format, xsize, ysize, buf, size, nullptr, + ColorEncoding::SRGB(), &ib)); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc new file mode 100644 index 0000000000..286990ee8a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc @@ -0,0 +1,3860 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef FJXL_SELF_INCLUDE + +#include "lib/jxl/enc_fast_lossless.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compilers +// support it. +#if defined(__aarch64__) || defined(_M_ARM64) +#include + +#ifndef FJXL_ENABLE_NEON +#define FJXL_ENABLE_NEON 1 +#endif + +#elif (defined(__x86_64__) || defined(_M_X64)) && !defined(_MSC_VER) +#include + +// manually add _mm512_cvtsi512_si32 definition if missing +// (e.g. with Xcode on macOS Mojave) +// copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373 +#if defined(__clang__) && \ + ((!defined(__apple_build_version__) && __clang_major__ < 10) || \ + (defined(__apple_build_version__) && __apple_build_version__ < 12000032)) +inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsi512_si32(__m512i __A) { + __v16si __B = (__v16si)__A; + return __B[0]; +} +#endif + +// TODO(veluca): MSVC support for dynamic dispatch. +#if defined(__clang__) || defined(__GNUC__) + +#ifndef FJXL_ENABLE_AVX2 +#define FJXL_ENABLE_AVX2 1 +#endif + +#ifndef FJXL_ENABLE_AVX512 +// On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken. +#if (defined(__clang__) && \ + (!defined(__apple_build_version__) && __clang_major__ > 7) || \ + (defined(__apple_build_version__) && \ + __apple_build_version__ > 10010046)) || \ + (defined(__GNUC__) && __GNUC__ > 10) +#define FJXL_ENABLE_AVX512 1 +#endif +#endif + +#endif + +#endif + +#ifndef FJXL_ENABLE_NEON +#define FJXL_ENABLE_NEON 0 +#endif + +#ifndef FJXL_ENABLE_AVX2 +#define FJXL_ENABLE_AVX2 0 +#endif + +#ifndef FJXL_ENABLE_AVX512 +#define FJXL_ENABLE_AVX512 0 +#endif + +namespace { +#if defined(_MSC_VER) && !defined(__clang__) +#define FJXL_INLINE __forceinline +FJXL_INLINE uint32_t FloorLog2(uint32_t v) { + unsigned long index; + _BitScanReverse(&index, v); + return index; +} +FJXL_INLINE uint32_t CtzNonZero(uint64_t v) { + unsigned long index; + _BitScanForward(&index, v); + return index; +} +#else +#define FJXL_INLINE inline __attribute__((always_inline)) +FJXL_INLINE uint32_t FloorLog2(uint32_t v) { + return v ? 31 - __builtin_clz(v) : 0; +} +FJXL_INLINE uint32_t CtzNonZero(uint64_t v) { return __builtin_ctzll(v); } +#endif + +// Compiles to a memcpy on little-endian systems. +FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) { +#if (!defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__)) + for (int i = 0; i < 8; i++) { + tgt[i] = (data >> (i * 8)) & 0xFF; + } +#else + memcpy(tgt, &data, 8); +#endif +} + +FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf, + size_t& bits_in_buffer, uint64_t& bit_buffer) { + bit_buffer |= bits << bits_in_buffer; + bits_in_buffer += count; + StoreLE64(data_buf, bit_buffer); + size_t bytes_in_buffer = bits_in_buffer / 8; + bits_in_buffer -= bytes_in_buffer * 8; + bit_buffer >>= bytes_in_buffer * 8; + return bytes_in_buffer; +} + +struct BitWriter { + void Allocate(size_t maximum_bit_size) { + assert(data == nullptr); + // Leave some padding. + data.reset(static_cast(malloc(maximum_bit_size / 8 + 64))); + } + + void Write(uint32_t count, uint64_t bits) { + bytes_written += AddBits(count, bits, data.get() + bytes_written, + bits_in_buffer, buffer); + } + + void ZeroPadToByte() { + if (bits_in_buffer != 0) { + Write(8 - bits_in_buffer, 0); + } + } + + FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits, + size_t n) { + // Necessary because Write() is only guaranteed to work with <=56 bits. + // Trying to SIMD-fy this code results in lower speed (and definitely less + // clarity). + { + for (size_t i = 0; i < n; i++) { + this->buffer |= bits[i] << this->bits_in_buffer; + memcpy(this->data.get() + this->bytes_written, &this->buffer, 8); + uint64_t shift = 64 - this->bits_in_buffer; + this->bits_in_buffer += nbits[i]; + // This `if` seems to be faster than using ternaries. + if (this->bits_in_buffer >= 64) { + uint64_t next_buffer = bits[i] >> shift; + this->buffer = next_buffer; + this->bits_in_buffer -= 64; + this->bytes_written += 8; + } + } + memcpy(this->data.get() + this->bytes_written, &this->buffer, 8); + size_t bytes_in_buffer = this->bits_in_buffer / 8; + this->bits_in_buffer -= bytes_in_buffer * 8; + this->buffer >>= bytes_in_buffer * 8; + this->bytes_written += bytes_in_buffer; + } + } + + std::unique_ptr data = {nullptr, free}; + size_t bytes_written = 0; + size_t bits_in_buffer = 0; + uint64_t buffer = 0; +}; + +} // namespace + +extern "C" { + +struct JxlFastLosslessFrameState { + size_t width; + size_t height; + size_t nb_chans; + size_t bitdepth; + BitWriter header; + std::vector> group_data; + size_t current_bit_writer = 0; + size_t bit_writer_byte_pos = 0; + size_t bits_in_buffer = 0; + uint64_t bit_buffer = 0; +}; + +size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) { + size_t total_size_groups = 0; + for (size_t i = 0; i < frame->group_data.size(); i++) { + size_t sz = 0; + for (size_t j = 0; j < frame->nb_chans; j++) { + const auto& writer = frame->group_data[i][j]; + sz += writer.bytes_written * 8 + writer.bits_in_buffer; + } + sz = (sz + 7) / 8; + total_size_groups += sz; + } + return frame->header.bytes_written + total_size_groups; +} + +size_t JxlFastLosslessMaxRequiredOutput( + const JxlFastLosslessFrameState* frame) { + return JxlFastLosslessOutputSize(frame) + 32; +} + +void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame, + int add_image_header, int is_last) { + BitWriter* output = &frame->header; + output->Allocate(1000 + frame->group_data.size() * 32); + + std::vector group_sizes(frame->group_data.size()); + for (size_t i = 0; i < frame->group_data.size(); i++) { + size_t sz = 0; + for (size_t j = 0; j < frame->nb_chans; j++) { + const auto& writer = frame->group_data[i][j]; + sz += writer.bytes_written * 8 + writer.bits_in_buffer; + } + sz = (sz + 7) / 8; + group_sizes[i] = sz; + } + + bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4); + + if (add_image_header) { + // Signature + output->Write(16, 0x0AFF); + + // Size header, hand-crafted. + // Not small + output->Write(1, 0); + + auto wsz = [output](size_t size) { + if (size - 1 < (1 << 9)) { + output->Write(2, 0b00); + output->Write(9, size - 1); + } else if (size - 1 < (1 << 13)) { + output->Write(2, 0b01); + output->Write(13, size - 1); + } else if (size - 1 < (1 << 18)) { + output->Write(2, 0b10); + output->Write(18, size - 1); + } else { + output->Write(2, 0b11); + output->Write(30, size - 1); + } + }; + + wsz(frame->height); + + // No special ratio. + output->Write(3, 0); + + wsz(frame->width); + + // Hand-crafted ImageMetadata. + output->Write(1, 0); // all_default + output->Write(1, 0); // extra_fields + output->Write(1, 0); // bit_depth.floating_point_sample + if (frame->bitdepth == 8) { + output->Write(2, 0b00); // bit_depth.bits_per_sample = 8 + } else if (frame->bitdepth == 10) { + output->Write(2, 0b01); // bit_depth.bits_per_sample = 10 + } else if (frame->bitdepth == 12) { + output->Write(2, 0b10); // bit_depth.bits_per_sample = 12 + } else { + output->Write(2, 0b11); // 1 + u(6) + output->Write(6, frame->bitdepth - 1); + } + if (frame->bitdepth <= 14) { + output->Write(1, 1); // 16-bit-buffer sufficient + } else { + output->Write(1, 0); // 16-bit-buffer NOT sufficient + } + if (have_alpha) { + output->Write(2, 0b01); // One extra channel + output->Write(1, 1); // ... all_default (ie. 8-bit alpha) + } else { + output->Write(2, 0b00); // No extra channel + } + output->Write(1, 0); // Not XYB + if (frame->nb_chans > 2) { + output->Write(1, 1); // color_encoding.all_default (sRGB) + } else { + output->Write(1, 0); // color_encoding.all_default false + output->Write(1, 0); // color_encoding.want_icc false + output->Write(2, 1); // grayscale + output->Write(2, 1); // D65 + output->Write(1, 0); // no gamma transfer function + output->Write(2, 0b10); // tf: 2 + u(4) + output->Write(4, 11); // tf of sRGB + output->Write(2, 1); // relative rendering intent + } + output->Write(2, 0b00); // No extensions. + + output->Write(1, 1); // all_default transform data + + // No ICC, no preview. Frame should start at byte boundery. + output->ZeroPadToByte(); + } + + // Handcrafted frame header. + output->Write(1, 0); // all_default + output->Write(2, 0b00); // regular frame + output->Write(1, 1); // modular + output->Write(2, 0b00); // default flags + output->Write(1, 0); // not YCbCr + output->Write(2, 0b00); // no upsampling + if (have_alpha) { + output->Write(2, 0b00); // no alpha upsampling + } + output->Write(2, 0b01); // default group size + output->Write(2, 0b00); // exactly one pass + output->Write(1, 0); // no custom size or origin + output->Write(2, 0b00); // kReplace blending mode + if (have_alpha) { + output->Write(2, 0b00); // kReplace blending mode for alpha channel + } + output->Write(1, is_last); // is_last + output->Write(2, 0b00); // a frame has no name + output->Write(1, 0); // loop filter is not all_default + output->Write(1, 0); // no gaborish + output->Write(2, 0); // 0 EPF iters + output->Write(2, 0b00); // No LF extensions + output->Write(2, 0b00); // No FH extensions + + output->Write(1, 0); // No TOC permutation + output->ZeroPadToByte(); // TOC is byte-aligned. + for (size_t i = 0; i < frame->group_data.size(); i++) { + size_t sz = group_sizes[i]; + if (sz < (1 << 10)) { + output->Write(2, 0b00); + output->Write(10, sz); + } else if (sz - 1024 < (1 << 14)) { + output->Write(2, 0b01); + output->Write(14, sz - 1024); + } else if (sz - 17408 < (1 << 22)) { + output->Write(2, 0b10); + output->Write(22, sz - 17408); + } else { + output->Write(2, 0b11); + output->Write(30, sz - 4211712); + } + } + output->ZeroPadToByte(); // Groups are byte-aligned. +} + +#if FJXL_ENABLE_AVX512 +__attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset( + const uint8_t* data, size_t n, size_t bit_buffer_nbits, + unsigned char* output, uint64_t& bit_buffer) { + if (n < 128) { + return 0; + } + + size_t i = 0; + __m512i shift = _mm512_set1_epi64(64 - bit_buffer_nbits); + __m512i carry = _mm512_set1_epi64(bit_buffer << (64 - bit_buffer_nbits)); + + for (; i + 64 <= n; i += 64) { + __m512i current = _mm512_loadu_si512(data + i); + __m512i previous_u64 = _mm512_alignr_epi64(current, carry, 7); + carry = current; + __m512i out = _mm512_shrdv_epi64(previous_u64, current, shift); + _mm512_storeu_si512(output + i, out); + } + + bit_buffer = data[i - 1] >> (8 - bit_buffer_nbits); + + return i; +} +#endif + +size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame, + unsigned char* output, size_t output_size) { + assert(output_size >= 32); + unsigned char* initial_output = output; + size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t, + unsigned char*, uint64_t&) = nullptr; + +#if FJXL_ENABLE_AVX512 + if (__builtin_cpu_supports("avx512vbmi2")) { + append_bytes_with_bit_offset = AppendBytesWithBitOffset; + } +#endif + + while (true) { + size_t& cur = frame->current_bit_writer; + size_t& bw_pos = frame->bit_writer_byte_pos; + if (cur >= 1 + frame->group_data.size() * frame->nb_chans) { + return output - initial_output; + } + if (output_size <= 8) { + return output - initial_output; + } + size_t nbc = frame->nb_chans; + const BitWriter& writer = + cur == 0 ? frame->header + : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc]; + size_t full_byte_count = + std::min(output_size - 8, writer.bytes_written - bw_pos); + if (frame->bits_in_buffer == 0) { + memcpy(output, writer.data.get() + bw_pos, full_byte_count); + } else { + size_t i = 0; + if (append_bytes_with_bit_offset) { + i += append_bytes_with_bit_offset( + writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer, + output, frame->bit_buffer); + } +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + // Copy 8 bytes at a time until we reach the border. + for (; i + 8 < full_byte_count; i += 8) { + uint64_t chunk; + memcpy(&chunk, writer.data.get() + bw_pos + i, 8); + uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer); + memcpy(output + i, &out, 8); + frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer); + } +#endif + for (; i < full_byte_count; i++) { + AddBits(8, writer.data.get()[bw_pos + i], output + i, + frame->bits_in_buffer, frame->bit_buffer); + } + } + output += full_byte_count; + output_size -= full_byte_count; + bw_pos += full_byte_count; + if (bw_pos == writer.bytes_written) { + auto write = [&](size_t num, uint64_t bits) { + size_t n = AddBits(num, bits, output, frame->bits_in_buffer, + frame->bit_buffer); + output += n; + output_size -= n; + }; + if (writer.bits_in_buffer) { + write(writer.bits_in_buffer, writer.buffer); + } + bw_pos = 0; + cur++; + if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) { + write(8 - frame->bits_in_buffer, 0); + } + } + } +} + +void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) { + delete frame; +} + +} // extern "C" + +#endif + +#ifdef FJXL_SELF_INCLUDE + +namespace { + +constexpr size_t kNumRawSymbols = 19; +constexpr size_t kNumLZ77 = 33; +constexpr size_t kLZ77CacheSize = 32; + +constexpr size_t kLZ77Offset = 224; +constexpr size_t kLZ77MinLength = 7; + +void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits, + uint32_t* bits) { + // 400 config + uint32_t n = FloorLog2(value); + *token = value < 16 ? value : 16 + n - 4; + *nbits = value < 16 ? 0 : n; + *bits = value < 16 ? 0 : value - (1 << *nbits); +} + +struct PrefixCode { + uint8_t raw_nbits[kNumRawSymbols] = {}; + uint8_t raw_bits[kNumRawSymbols] = {}; + + alignas(64) uint8_t raw_nbits_simd[16] = {}; + alignas(64) uint8_t raw_bits_simd[16] = {}; + + uint8_t lz77_nbits[kNumLZ77] = {}; + uint16_t lz77_bits[kNumLZ77] = {}; + + uint64_t lz77_cache_bits[kLZ77CacheSize] = {}; + uint8_t lz77_cache_nbits[kLZ77CacheSize] = {}; + + static uint16_t BitReverse(size_t nbits, uint16_t bits) { + constexpr uint16_t kNibbleLookup[16] = { + 0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110, + 0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111, + }; + uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) | + (kNibbleLookup[(bits >> 4) & 0xF] << 8) | + (kNibbleLookup[(bits >> 8) & 0xF] << 4) | + (kNibbleLookup[bits >> 12]); + return rev16 >> (16 - nbits); + } + + // Create the prefix codes given the code lengths. + // Supports the code lengths being split into two halves. + static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits, + uint8_t* first_chunk_bits, + size_t first_chunk_size, + const uint8_t* second_chunk_nbits, + uint16_t* second_chunk_bits, + size_t second_chunk_size) { + constexpr size_t kMaxCodeLength = 15; + uint8_t code_length_counts[kMaxCodeLength + 1] = {}; + for (size_t i = 0; i < first_chunk_size; i++) { + code_length_counts[first_chunk_nbits[i]]++; + assert(first_chunk_nbits[i] <= kMaxCodeLength); + assert(first_chunk_nbits[i] <= 8); + assert(first_chunk_nbits[i] > 0); + } + for (size_t i = 0; i < second_chunk_size; i++) { + code_length_counts[second_chunk_nbits[i]]++; + assert(second_chunk_nbits[i] <= kMaxCodeLength); + } + + uint16_t next_code[kMaxCodeLength + 1] = {}; + + uint16_t code = 0; + for (size_t i = 1; i < kMaxCodeLength + 1; i++) { + code = (code + code_length_counts[i - 1]) << 1; + next_code[i] = code; + } + + for (size_t i = 0; i < first_chunk_size; i++) { + first_chunk_bits[i] = + BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++); + } + for (size_t i = 0; i < second_chunk_size; i++) { + second_chunk_bits[i] = + BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++); + } + } + + template + static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n, + size_t precision, T infty, + uint8_t* min_limit, + uint8_t* max_limit, + uint8_t* nbits) { + std::vector dynp(((1U << precision) + 1) * (n + 1), infty); + auto d = [&](size_t sym, size_t off) -> T& { + return dynp[sym * ((1 << precision) + 1) + off]; + }; + d(0, 0) = 0; + for (size_t sym = 0; sym < n; sym++) { + for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) { + size_t off_delta = 1U << (precision - bits); + for (size_t off = 0; off + off_delta <= (1U << precision); off++) { + d(sym + 1, off + off_delta) = + std::min(d(sym, off) + static_cast(freqs[sym]) * bits, + d(sym + 1, off + off_delta)); + } + } + } + + size_t sym = n; + size_t off = 1U << precision; + + assert(d(sym, off) != infty); + + while (sym-- > 0) { + assert(off > 0); + for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) { + size_t off_delta = 1U << (precision - bits); + if (off_delta <= off && + d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) { + off -= off_delta; + nbits[sym] = bits; + break; + } + } + } + } + + // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <= + // max_limit[i] and sum 2**-nbits[i] == 1, so to minimize sum(nbits[i] * + // freqs[i]). + static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n, + uint8_t* min_limit, uint8_t* max_limit, + uint8_t* nbits) { + size_t precision = 0; + size_t shortest_length = 255; + uint64_t freqsum = 0; + for (size_t i = 0; i < n; i++) { + assert(freqs[i] != 0); + freqsum += freqs[i]; + if (min_limit[i] < 1) min_limit[i] = 1; + assert(min_limit[i] <= max_limit[i]); + precision = std::max(max_limit[i], precision); + shortest_length = std::min(min_limit[i], shortest_length); + } + // If all the minimum limits are greater than 1, shift precision so that we + // behave as if the shortest was 1. + precision -= shortest_length - 1; + uint64_t infty = freqsum * precision; + if (infty < std::numeric_limits::max() / 2) { + ComputeCodeLengthsNonZeroImpl(freqs, n, precision, + static_cast(infty), min_limit, + max_limit, nbits); + } else { + ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit, + max_limit, nbits); + } + } + + static constexpr size_t kMaxNumSymbols = + kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1; + static void ComputeCodeLengths(const uint64_t* freqs, size_t n, + const uint8_t* min_limit_in, + const uint8_t* max_limit_in, uint8_t* nbits) { + assert(n <= kMaxNumSymbols); + uint64_t compact_freqs[kMaxNumSymbols]; + uint8_t min_limit[kMaxNumSymbols]; + uint8_t max_limit[kMaxNumSymbols]; + size_t ni = 0; + for (size_t i = 0; i < n; i++) { + if (freqs[i]) { + compact_freqs[ni] = freqs[i]; + min_limit[ni] = min_limit_in[i]; + max_limit[ni] = max_limit_in[i]; + ni++; + } + } + uint8_t num_bits[kMaxNumSymbols] = {}; + ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit, + num_bits); + ni = 0; + for (size_t i = 0; i < n; i++) { + nbits[i] = 0; + if (freqs[i]) { + nbits[i] = num_bits[ni++]; + } + } + } + + // Invalid code, used to construct arrays. + PrefixCode() {} + + template + PrefixCode(BitDepth, uint64_t* raw_counts, uint64_t* lz77_counts) { + // "merge" together all the lz77 counts in a single symbol for the level 1 + // table (containing just the raw symbols, up to length 7). + uint64_t level1_counts[kNumRawSymbols + 1]; + memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t)); + size_t numraw = kNumRawSymbols; + while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--; + + level1_counts[numraw] = 0; + for (size_t i = 0; i < kNumLZ77; i++) { + level1_counts[numraw] += lz77_counts[i]; + } + uint8_t level1_nbits[kNumRawSymbols + 1] = {}; + ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength, + BitDepth::kMaxRawLength, level1_nbits); + + uint8_t level2_nbits[kNumLZ77] = {}; + uint8_t min_lengths[kNumLZ77] = {}; + uint8_t l = 15 - level1_nbits[numraw]; + uint8_t max_lengths[kNumLZ77]; + for (size_t i = 0; i < kNumLZ77; i++) { + max_lengths[i] = l; + } + size_t num_lz77 = kNumLZ77; + while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--; + ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths, + level2_nbits); + for (size_t i = 0; i < numraw; i++) { + raw_nbits[i] = level1_nbits[i]; + } + for (size_t i = 0; i < num_lz77; i++) { + lz77_nbits[i] = + level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0; + } + + ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits, + kNumLZ77); + BitDepth::PrepareForSimd(raw_nbits, raw_bits, numraw, raw_nbits_simd, + raw_bits_simd); + + // Prepare lz77 cache + for (size_t count = 0; count < kLZ77CacheSize; count++) { + unsigned token, nbits, bits; + EncodeHybridUintLZ77(count, &token, &nbits, &bits); + lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0]; + lz77_cache_bits[count] = + (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) | + raw_bits[0]; + } + } + + void WriteTo(BitWriter* writer) const { + uint64_t code_length_counts[18] = {}; + code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1); + for (size_t i = 0; i < kNumRawSymbols; i++) { + code_length_counts[raw_nbits[i]]++; + } + for (size_t i = 0; i < kNumLZ77; i++) { + code_length_counts[lz77_nbits[i]]++; + } + uint8_t code_length_nbits[18] = {}; + uint8_t code_length_nbits_min[18] = {}; + uint8_t code_length_nbits_max[18] = { + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + }; + ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min, + code_length_nbits_max, code_length_nbits); + writer->Write(2, 0b00); // HSKIP = 0, i.e. don't skip code lengths. + + // As per Brotli RFC. + uint8_t code_length_order[18] = {1, 2, 3, 4, 0, 5, 17, 6, 16, + 7, 8, 9, 10, 11, 12, 13, 14, 15}; + uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4}; + uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15}; + + // Encode lengths of code lengths. + size_t num_code_lengths = 18; + while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) { + num_code_lengths--; + } + for (size_t i = 0; i < num_code_lengths; i++) { + int symbol = code_length_nbits[code_length_order[i]]; + writer->Write(code_length_length_nbits[symbol], + code_length_length_bits[symbol]); + } + + // Compute the canonical codes for the codes that represent the lengths of + // the actual codes for data. + uint16_t code_length_bits[18] = {}; + ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits, + code_length_bits, 18); + // Encode raw bit code lengths. + for (size_t i = 0; i < kNumRawSymbols; i++) { + writer->Write(code_length_nbits[raw_nbits[i]], + code_length_bits[raw_nbits[i]]); + } + size_t num_lz77 = kNumLZ77; + while (lz77_nbits[num_lz77 - 1] == 0) { + num_lz77--; + } + // Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 = + // 205. + static_assert(kLZ77Offset == 224, ""); + static_assert(kNumRawSymbols == 19, ""); + writer->Write(code_length_nbits[17], code_length_bits[17]); + writer->Write(3, 0b010); // 5 + writer->Write(code_length_nbits[17], code_length_bits[17]); + writer->Write(3, 0b000); // (5-2)*8 + 3 = 27 + writer->Write(code_length_nbits[17], code_length_bits[17]); + writer->Write(3, 0b010); // (27-2)*8 + 5 = 205 + // Encode LZ77 symbols, with values 224+i. + for (size_t i = 0; i < num_lz77; i++) { + writer->Write(code_length_nbits[lz77_nbits[i]], + code_length_bits[lz77_nbits[i]]); + } + } +}; + +template +struct VecPair { + T low; + T hi; +}; + +#ifdef FJXL_GENERIC_SIMD +#undef FJXL_GENERIC_SIMD +#endif + +#ifdef FJXL_AVX512 +#define FJXL_GENERIC_SIMD +struct SIMDVec32; +struct Mask32 { + __mmask16 mask; + SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false); + size_t CountPrefix() const { + return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)}); + } +}; + +struct SIMDVec32 { + __m512i vec; + + static constexpr size_t kLanes = 16; + + FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) { + return SIMDVec32{_mm512_loadu_si512((__m512i*)data)}; + } + FJXL_INLINE void Store(uint32_t* data) { + _mm512_storeu_si512((__m512i*)data, vec); + } + FJXL_INLINE static SIMDVec32 Val(uint32_t v) { + return SIMDVec32{_mm512_set1_epi32(v)}; + } + FJXL_INLINE SIMDVec32 ValToToken() const { + return SIMDVec32{ + _mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))}; + } + FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const { + return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec), + to_subtract.vec)}; + } + FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const { + return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)}; + } + FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const { + return SIMDVec32{_mm512_add_epi32(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const { + return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)}; + } + FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const { + return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)}; + } + FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const { + return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec32 Pow2() const { + return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)}; + } + template + FJXL_INLINE SIMDVec32 SignedShiftRight() const { + return SIMDVec32{_mm512_srai_epi32(vec, i)}; + } +}; + +struct SIMDVec16; + +struct Mask16 { + __mmask32 mask; + SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false); + Mask16 And(const Mask16& oth) const { + return Mask16{_kand_mask32(mask, oth.mask)}; + } + size_t CountPrefix() const { + return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)}); + } +}; + +struct SIMDVec16 { + __m512i vec; + + static constexpr size_t kLanes = 32; + + FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) { + return SIMDVec16{_mm512_loadu_si512((__m512i*)data)}; + } + FJXL_INLINE void Store(uint16_t* data) { + _mm512_storeu_si512((__m512i*)data, vec); + } + FJXL_INLINE static SIMDVec16 Val(uint16_t v) { + return SIMDVec16{_mm512_set1_epi16(v)}; + } + FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo, + const SIMDVec32& hi) { + auto tmp = _mm512_packus_epi32(lo.vec, hi.vec); + alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + return SIMDVec16{ + _mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)}; + } + + FJXL_INLINE SIMDVec16 ValToToken() const { + auto c16 = _mm512_set1_epi32(16); + auto c32 = _mm512_set1_epi32(32); + auto low16bit = _mm512_set1_epi32(0x0000FFFF); + auto lzhi = + _mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec))); + auto lzlo = _mm512_sub_epi32( + c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec))); + return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))}; + } + + FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const { + return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)}; + } + FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const { + return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)}; + } + FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const { + return SIMDVec16{_mm512_add_epi16(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const { + return SIMDVec16{_mm512_min_epu16(vec, oth.vec)}; + } + FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const { + return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)}; + } + FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const { + return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 Pow2() const { + return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)}; + } + FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const { + return SIMDVec16{_mm512_or_si512(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const { + return SIMDVec16{_mm512_xor_si512(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const { + return SIMDVec16{_mm512_and_si512(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const { + return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)}; + } + FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const { + return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))}; + } + FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const { + return SIMDVec16{_mm512_shuffle_epi8( + _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)}; + } + FJXL_INLINE VecPair Interleave(const SIMDVec16& low) const { + auto lo = _mm512_unpacklo_epi16(low.vec, vec); + auto hi = _mm512_unpackhi_epi16(low.vec, vec); + alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11}; + alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15}; + return {SIMDVec16{_mm512_permutex2var_epi64( + lo, _mm512_load_si512((__m512i*)perm1), hi)}, + SIMDVec16{_mm512_permutex2var_epi64( + lo, _mm512_load_si512((__m512i*)perm2), hi)}}; + } + FJXL_INLINE VecPair Upcast() const { + auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512()); + auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512()); + alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11}; + alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15}; + return {SIMDVec32{_mm512_permutex2var_epi64( + lo, _mm512_load_si512((__m512i*)perm1), hi)}, + SIMDVec32{_mm512_permutex2var_epi64( + lo, _mm512_load_si512((__m512i*)perm2), hi)}}; + } + template + FJXL_INLINE SIMDVec16 SignedShiftRight() const { + return SIMDVec16{_mm512_srai_epi16(vec, i)}; + } + + static std::array LoadG8(const unsigned char* data) { + __m256i bytes = _mm256_loadu_si256((__m256i*)data); + return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}}; + } + static std::array LoadG16(const unsigned char* data) { + return {Load((const uint16_t*)data)}; + } + + static std::array LoadGA8(const unsigned char* data) { + __m512i bytes = _mm512_loadu_si512((__m512i*)data); + __m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF)); + __m512i alpha = _mm512_srli_epi16(bytes, 8); + return {SIMDVec16{gray}, SIMDVec16{alpha}}; + } + static std::array LoadGA16(const unsigned char* data) { + __m512i bytes1 = _mm512_loadu_si512((__m512i*)data); + __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64)); + __m512i g_mask = _mm512_set1_epi32(0xFFFF); + __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0); + __m512i g = _mm512_permutexvar_epi64( + permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask), + _mm512_and_si512(bytes2, g_mask))); + __m512i a = _mm512_permutexvar_epi64( + permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16), + _mm512_srli_epi32(bytes2, 16))); + return {SIMDVec16{g}, SIMDVec16{a}}; + } + + static std::array LoadRGB8(const unsigned char* data) { + __m512i bytes0 = _mm512_loadu_si512((__m512i*)data); + __m512i bytes1 = + _mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64))); + + // 0x7A = element of upper half of second vector = 0 after lookup; still in + // the upper half once we add 1 or 2. + uint8_t z = 0x7A; + __m512i ridx = + _mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72, + z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48, + z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24, + z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0); + __m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1)); + __m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1)); + __m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1); + __m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1); + __m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1); + return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}}; + } + static std::array LoadRGB16(const unsigned char* data) { + __m512i bytes0 = _mm512_loadu_si512((__m512i*)data); + __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64)); + __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128)); + + __m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57, + 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, + 24, 21, 18, 15, 12, 9, 6, 3, 0); + // -1 is such that when adding 1 or 2, we get the correct index for + // green/blue. + __m512i ridx_hi = + _mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + __m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1)); + __m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1)); + __m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1)); + __m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1)); + + __mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000); + __mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000); + + __m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1); + __m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1); + __m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1); + __m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2); + __m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2); + __m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2); + return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}}; + } + + static std::array LoadRGBA8(const unsigned char* data) { + __m512i bytes1 = _mm512_loadu_si512((__m512i*)data); + __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64)); + __m512i rg_mask = _mm512_set1_epi32(0xFFFF); + __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0); + __m512i rg = _mm512_permutexvar_epi64( + permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask), + _mm512_and_si512(bytes2, rg_mask))); + __m512i ba = _mm512_permutexvar_epi64( + permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16), + _mm512_srli_epi32(bytes2, 16))); + __m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF)); + __m512i g = _mm512_srli_epi16(rg, 8); + __m512i b = _mm512_and_si512(ba, _mm512_set1_epi16(0xFF)); + __m512i a = _mm512_srli_epi16(ba, 8); + return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}}; + } + static std::array LoadRGBA16(const unsigned char* data) { + __m512i bytes0 = _mm512_loadu_si512((__m512i*)data); + __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64)); + __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128)); + __m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192)); + + auto pack32 = [](__m512i a, __m512i b) { + __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0); + return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b)); + }; + auto packlow32 = [&pack32](__m512i a, __m512i b) { + __m512i mask = _mm512_set1_epi32(0xFFFF); + return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask)); + }; + auto packhi32 = [&pack32](__m512i a, __m512i b) { + return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16)); + }; + + __m512i rb0 = packlow32(bytes0, bytes1); + __m512i rb1 = packlow32(bytes2, bytes3); + __m512i ga0 = packhi32(bytes0, bytes1); + __m512i ga1 = packhi32(bytes2, bytes3); + + __m512i r = packlow32(rb0, rb1); + __m512i g = packlow32(ga0, ga1); + __m512i b = packhi32(rb0, rb1); + __m512i a = packhi32(ga0, ga1); + return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}}; + } + + void SwapEndian() { + auto indices = _mm512_broadcast_i32x4( + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14)); + vec = _mm512_shuffle_epi8(vec, indices); + } +}; + +SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true, + const SIMDVec16& if_false) { + return SIMDVec16{_mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec)}; +} + +SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true, + const SIMDVec32& if_false) { + return SIMDVec32{_mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec)}; +} + +struct Bits64 { + static constexpr size_t kLanes = 8; + + __m512i nbits; + __m512i bits; + + FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) { + _mm512_storeu_si512((__m512i*)nbits_out, nbits); + _mm512_storeu_si512((__m512i*)bits_out, bits); + } +}; + +struct Bits32 { + __m512i nbits; + __m512i bits; + + static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) { + return Bits32{nbits.vec, bits.vec}; + } + + Bits64 Merge() const { + auto nbits_hi32 = _mm512_srli_epi64(nbits, 32); + auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF)); + auto bits_hi32 = _mm512_srli_epi64(bits, 32); + auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF)); + + auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32); + auto bits64 = + _mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32); + return Bits64{nbits64, bits64}; + } + + void Interleave(const Bits32& low) { + bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits); + nbits = _mm512_add_epi32(nbits, low.nbits); + } + + void ClipTo(size_t n) { + n = std::min(n, 16); + constexpr uint32_t kMask[32] = { + ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, + ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n)); + nbits = _mm512_and_si512(mask, nbits); + bits = _mm512_and_si512(mask, bits); + } + void Skip(size_t n) { + n = std::min(n, 16); + constexpr uint32_t kMask[32] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, + ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, + }; + __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n)); + nbits = _mm512_and_si512(mask, nbits); + bits = _mm512_and_si512(mask, bits); + } +}; + +struct Bits16 { + __m512i nbits; + __m512i bits; + + static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) { + return Bits16{nbits.vec, bits.vec}; + } + + Bits32 Merge() const { + auto nbits_hi16 = _mm512_srli_epi32(nbits, 16); + auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF)); + auto bits_hi16 = _mm512_srli_epi32(bits, 16); + auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF)); + + auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16); + auto bits32 = + _mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16); + return Bits32{nbits32, bits32}; + } + + void Interleave(const Bits16& low) { + bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits); + nbits = _mm512_add_epi16(nbits, low.nbits); + } + + void ClipTo(size_t n) { + n = std::min(n, 32); + constexpr uint16_t kMask[64] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n)); + nbits = _mm512_and_si512(mask, nbits); + bits = _mm512_and_si512(mask, bits); + } + void Skip(size_t n) { + n = std::min(n, 32); + constexpr uint16_t kMask[64] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + }; + __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n)); + nbits = _mm512_and_si512(mask, nbits); + bits = _mm512_and_si512(mask, bits); + } +}; + +#endif + +#ifdef FJXL_AVX2 +#define FJXL_GENERIC_SIMD + +struct SIMDVec32; + +struct Mask32 { + __m256i mask; + SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false); + size_t CountPrefix() const { + return CtzNonZero(~static_cast( + (uint8_t)_mm256_movemask_ps(_mm256_castsi256_ps(mask)))); + } +}; + +struct SIMDVec32 { + __m256i vec; + + static constexpr size_t kLanes = 8; + + FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) { + return SIMDVec32{_mm256_loadu_si256((__m256i*)data)}; + } + FJXL_INLINE void Store(uint32_t* data) { + _mm256_storeu_si256((__m256i*)data, vec); + } + FJXL_INLINE static SIMDVec32 Val(uint32_t v) { + return SIMDVec32{_mm256_set1_epi32(v)}; + } + FJXL_INLINE SIMDVec32 ValToToken() const { + // we know that each value has at most 20 bits, so we just need 5 nibbles + // and don't need to mask the fifth. However we do need to set the higher + // bytes to 0xFF, which will make table lookups return 0. + auto nibble0 = + _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi32(0xF)), + _mm256_set1_epi32(0xFFFFFF00)); + auto nibble1 = _mm256_or_si256( + _mm256_and_si256(_mm256_srli_epi32(vec, 4), _mm256_set1_epi32(0xF)), + _mm256_set1_epi32(0xFFFFFF00)); + auto nibble2 = _mm256_or_si256( + _mm256_and_si256(_mm256_srli_epi32(vec, 8), _mm256_set1_epi32(0xF)), + _mm256_set1_epi32(0xFFFFFF00)); + auto nibble3 = _mm256_or_si256( + _mm256_and_si256(_mm256_srli_epi32(vec, 12), _mm256_set1_epi32(0xF)), + _mm256_set1_epi32(0xFFFFFF00)); + auto nibble4 = _mm256_or_si256(_mm256_srli_epi32(vec, 16), + _mm256_set1_epi32(0xFFFFFF00)); + + auto lut0 = _mm256_broadcastsi128_si256( + _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4)); + auto lut1 = _mm256_broadcastsi128_si256( + _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8)); + auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8( + 0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12)); + auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8( + 0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16)); + auto lut4 = _mm256_broadcastsi128_si256(_mm_setr_epi8( + 0, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20)); + + auto token0 = _mm256_shuffle_epi8(lut0, nibble0); + auto token1 = _mm256_shuffle_epi8(lut1, nibble1); + auto token2 = _mm256_shuffle_epi8(lut2, nibble2); + auto token3 = _mm256_shuffle_epi8(lut3, nibble3); + auto token4 = _mm256_shuffle_epi8(lut4, nibble4); + + auto token = + _mm256_max_epi32(_mm256_max_epi32(_mm256_max_epi32(token0, token1), + _mm256_max_epi32(token2, token3)), + token4); + return SIMDVec32{token}; + } + FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const { + return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec), + to_subtract.vec)}; + } + FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const { + return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)}; + } + FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const { + return SIMDVec32{_mm256_add_epi32(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const { + return SIMDVec32{_mm256_xor_si256(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec32 Pow2() const { + return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)}; + } + FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const { + return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)}; + } + FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const { + return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)}; + } + template + FJXL_INLINE SIMDVec32 SignedShiftRight() const { + return SIMDVec32{_mm256_srai_epi32(vec, i)}; + } +}; + +struct SIMDVec16; + +struct Mask16 { + __m256i mask; + SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false); + Mask16 And(const Mask16& oth) const { + return Mask16{_mm256_and_si256(mask, oth.mask)}; + } + size_t CountPrefix() const { + return CtzNonZero( + ~static_cast((uint32_t)_mm256_movemask_epi8(mask))) / + 2; + } +}; + +struct SIMDVec16 { + __m256i vec; + + static constexpr size_t kLanes = 16; + + FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) { + return SIMDVec16{_mm256_loadu_si256((__m256i*)data)}; + } + FJXL_INLINE void Store(uint16_t* data) { + _mm256_storeu_si256((__m256i*)data, vec); + } + FJXL_INLINE static SIMDVec16 Val(uint16_t v) { + return SIMDVec16{_mm256_set1_epi16(v)}; + } + FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo, + const SIMDVec32& hi) { + auto tmp = _mm256_packus_epi32(lo.vec, hi.vec); + return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)}; + } + + FJXL_INLINE SIMDVec16 ValToToken() const { + auto nibble0 = + _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)), + _mm256_set1_epi16(0xFF00)); + auto nibble1 = _mm256_or_si256( + _mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)), + _mm256_set1_epi16(0xFF00)); + auto nibble2 = _mm256_or_si256( + _mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)), + _mm256_set1_epi16(0xFF00)); + auto nibble3 = + _mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00)); + + auto lut0 = _mm256_broadcastsi128_si256( + _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4)); + auto lut1 = _mm256_broadcastsi128_si256( + _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8)); + auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8( + 0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12)); + auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8( + 0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16)); + + auto token0 = _mm256_shuffle_epi8(lut0, nibble0); + auto token1 = _mm256_shuffle_epi8(lut1, nibble1); + auto token2 = _mm256_shuffle_epi8(lut2, nibble2); + auto token3 = _mm256_shuffle_epi8(lut3, nibble3); + + auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1), + _mm256_max_epi16(token2, token3)); + return SIMDVec16{token}; + } + + FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const { + return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)}; + } + FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const { + return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)}; + } + FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const { + return SIMDVec16{_mm256_add_epi16(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const { + return SIMDVec16{_mm256_min_epu16(vec, oth.vec)}; + } + FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const { + return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)}; + } + FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const { + return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 Pow2() const { + auto pow2_lo_lut = _mm256_broadcastsi128_si256( + _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, + 1u << 7, 0, 0, 0, 0, 0, 0, 0, 0)); + auto pow2_hi_lut = _mm256_broadcastsi128_si256( + _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3, + 1 << 4, 1 << 5, 1 << 6, 1u << 7)); + + auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00)); + + auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked); + auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked); + + auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo); + return SIMDVec16{pow2}; + } + FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const { + return SIMDVec16{_mm256_or_si256(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const { + return SIMDVec16{_mm256_xor_si256(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const { + return SIMDVec16{_mm256_and_si256(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const { + return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)}; + } + FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const { + return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))}; + } + FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const { + return SIMDVec16{_mm256_shuffle_epi8( + _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)}; + } + FJXL_INLINE VecPair Interleave(const SIMDVec16& low) const { + auto v02 = _mm256_unpacklo_epi16(low.vec, vec); + auto v13 = _mm256_unpackhi_epi16(low.vec, vec); + return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)}, + SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}}; + } + FJXL_INLINE VecPair Upcast() const { + auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256()); + auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256()); + return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)}, + SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}}; + } + template + FJXL_INLINE SIMDVec16 SignedShiftRight() const { + return SIMDVec16{_mm256_srai_epi16(vec, i)}; + } + + static std::array LoadG8(const unsigned char* data) { + __m128i bytes = _mm_loadu_si128((__m128i*)data); + return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}}; + } + static std::array LoadG16(const unsigned char* data) { + return {Load((const uint16_t*)data)}; + } + + static std::array LoadGA8(const unsigned char* data) { + __m256i bytes = _mm256_loadu_si256((__m256i*)data); + __m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF)); + __m256i alpha = _mm256_srli_epi16(bytes, 8); + return {SIMDVec16{gray}, SIMDVec16{alpha}}; + } + static std::array LoadGA16(const unsigned char* data) { + __m256i bytes1 = _mm256_loadu_si256((__m256i*)data); + __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32)); + __m256i g_mask = _mm256_set1_epi32(0xFFFF); + __m256i g = _mm256_permute4x64_epi64( + _mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask), + _mm256_and_si256(bytes2, g_mask)), + 0b11011000); + __m256i a = _mm256_permute4x64_epi64( + _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16), + _mm256_srli_epi32(bytes2, 16)), + 0b11011000); + return {SIMDVec16{g}, SIMDVec16{a}}; + } + + static std::array LoadRGB8(const unsigned char* data) { + __m128i bytes0 = _mm_loadu_si128((__m128i*)data); + __m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16)); + __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32)); + + __m128i idx = + _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13); + + __m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx); + __m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx); + __m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx); + + __m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0, 0, 0, 0, 0); + __m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF); + + __m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001); + __m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010); + + __m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010); + __m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001); + + __m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001); + __m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010); + + __m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11); + __m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6); + + return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)}, + SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)}, + SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}}; + } + static std::array LoadRGB16(const unsigned char* data) { + auto load_and_split_lohi = [](const unsigned char* data) { + // LHLHLH... + __m256i bytes = _mm256_loadu_si256((__m256i*)data); + // L0L0L0... + __m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF)); + // H0H0H0... + __m256i hi = _mm256_srli_epi16(bytes, 8); + // LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH + __m256i packed = _mm256_packus_epi16(lo, hi); + return _mm256_permute4x64_epi64(packed, 0b11011000); + }; + __m256i bytes0 = load_and_split_lohi(data); + __m256i bytes1 = load_and_split_lohi(data + 32); + __m256i bytes2 = load_and_split_lohi(data + 64); + + __m256i idx = _mm256_broadcastsi128_si256( + _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13)); + + __m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx); + __m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx); + __m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx); + + __m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8( + 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0)); + __m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)); + + __m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001); + __m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010); + + __m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010); + __m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001); + + __m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001); + __m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010); + + __m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11); + __m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6); + + // Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their + // lower half, and the high bytes in their upper half. + + auto combine_low_hi = [](__m256i v) { + __m128i low = _mm256_extracti128_si256(v, 0); + __m128i hi = _mm256_extracti128_si256(v, 1); + __m256i low16 = _mm256_cvtepu8_epi16(low); + __m256i hi16 = _mm256_cvtepu8_epi16(hi); + return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16); + }; + + return {SIMDVec16{combine_low_hi(r0r1r2)}, + SIMDVec16{combine_low_hi(g0g1g2)}, + SIMDVec16{combine_low_hi(b0b1b2)}}; + } + + static std::array LoadRGBA8(const unsigned char* data) { + __m256i bytes1 = _mm256_loadu_si256((__m256i*)data); + __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32)); + __m256i rg_mask = _mm256_set1_epi32(0xFFFF); + __m256i rg = _mm256_permute4x64_epi64( + _mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask), + _mm256_and_si256(bytes2, rg_mask)), + 0b11011000); + __m256i ba = _mm256_permute4x64_epi64( + _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16), + _mm256_srli_epi32(bytes2, 16)), + 0b11011000); + __m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF)); + __m256i g = _mm256_srli_epi16(rg, 8); + __m256i b = _mm256_and_si256(ba, _mm256_set1_epi16(0xFF)); + __m256i a = _mm256_srli_epi16(ba, 8); + return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}}; + } + static std::array LoadRGBA16(const unsigned char* data) { + __m256i bytes0 = _mm256_loadu_si256((__m256i*)data); + __m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32)); + __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64)); + __m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96)); + + auto pack32 = [](__m256i a, __m256i b) { + return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000); + }; + auto packlow32 = [&pack32](__m256i a, __m256i b) { + __m256i mask = _mm256_set1_epi32(0xFFFF); + return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask)); + }; + auto packhi32 = [&pack32](__m256i a, __m256i b) { + return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16)); + }; + + __m256i rb0 = packlow32(bytes0, bytes1); + __m256i rb1 = packlow32(bytes2, bytes3); + __m256i ga0 = packhi32(bytes0, bytes1); + __m256i ga1 = packhi32(bytes2, bytes3); + + __m256i r = packlow32(rb0, rb1); + __m256i g = packlow32(ga0, ga1); + __m256i b = packhi32(rb0, rb1); + __m256i a = packhi32(ga0, ga1); + return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}}; + } + + void SwapEndian() { + auto indices = _mm256_broadcastsi128_si256( + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14)); + vec = _mm256_shuffle_epi8(vec, indices); + } +}; + +SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true, + const SIMDVec16& if_false) { + return SIMDVec16{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)}; +} + +SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true, + const SIMDVec32& if_false) { + return SIMDVec32{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)}; +} + +struct Bits64 { + static constexpr size_t kLanes = 4; + + __m256i nbits; + __m256i bits; + + FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) { + _mm256_storeu_si256((__m256i*)nbits_out, nbits); + _mm256_storeu_si256((__m256i*)bits_out, bits); + } +}; + +struct Bits32 { + __m256i nbits; + __m256i bits; + + static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) { + return Bits32{nbits.vec, bits.vec}; + } + + Bits64 Merge() const { + auto nbits_hi32 = _mm256_srli_epi64(nbits, 32); + auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF)); + auto bits_hi32 = _mm256_srli_epi64(bits, 32); + auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF)); + + auto nbits64 = _mm256_add_epi64(nbits_hi32, nbits_lo32); + auto bits64 = + _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32); + return Bits64{nbits64, bits64}; + } + + void Interleave(const Bits32& low) { + bits = _mm256_or_si256(_mm256_sllv_epi32(bits, low.nbits), low.bits); + nbits = _mm256_add_epi32(nbits, low.nbits); + } + + void ClipTo(size_t n) { + n = std::min(n, 8); + constexpr uint32_t kMask[16] = { + ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0, + }; + __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n)); + nbits = _mm256_and_si256(mask, nbits); + bits = _mm256_and_si256(mask, bits); + } + void Skip(size_t n) { + n = std::min(n, 8); + constexpr uint32_t kMask[16] = { + 0, 0, 0, 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, + }; + __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n)); + nbits = _mm256_and_si256(mask, nbits); + bits = _mm256_and_si256(mask, bits); + } +}; + +struct Bits16 { + __m256i nbits; + __m256i bits; + + static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) { + return Bits16{nbits.vec, bits.vec}; + } + + Bits32 Merge() const { + auto nbits_hi16 = _mm256_srli_epi32(nbits, 16); + auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF)); + auto bits_hi16 = _mm256_srli_epi32(bits, 16); + auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF)); + + auto nbits32 = _mm256_add_epi32(nbits_hi16, nbits_lo16); + auto bits32 = + _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16); + return Bits32{nbits32, bits32}; + } + + void Interleave(const Bits16& low) { + auto pow2_lo_lut = _mm256_broadcastsi128_si256( + _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, + 1u << 7, 0, 0, 0, 0, 0, 0, 0, 0)); + auto low_nbits_masked = + _mm256_or_si256(low.nbits, _mm256_set1_epi16(0xFF00)); + + auto bits_shifted = _mm256_mullo_epi16( + bits, _mm256_shuffle_epi8(pow2_lo_lut, low_nbits_masked)); + + nbits = _mm256_add_epi16(nbits, low.nbits); + bits = _mm256_or_si256(bits_shifted, low.bits); + } + + void ClipTo(size_t n) { + n = std::min(n, 16); + constexpr uint16_t kMask[32] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n)); + nbits = _mm256_and_si256(mask, nbits); + bits = _mm256_and_si256(mask, bits); + } + + void Skip(size_t n) { + n = std::min(n, 16); + constexpr uint16_t kMask[32] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + }; + __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n)); + nbits = _mm256_and_si256(mask, nbits); + bits = _mm256_and_si256(mask, bits); + } +}; + +#endif + +#ifdef FJXL_NEON +#define FJXL_GENERIC_SIMD + +struct SIMDVec32; + +struct Mask32 { + uint32x4_t mask; + SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false); + Mask32 And(const Mask32& oth) const { + return Mask32{vandq_u32(mask, oth.mask)}; + } + size_t CountPrefix() const { + uint32_t val_unset[4] = {0, 1, 2, 3}; + uint32_t val_set[4] = {4, 4, 4, 4}; + uint32x4_t val = vbslq_u32(mask, vld1q_u32(val_set), vld1q_u32(val_unset)); + return vminvq_u32(val); + } +}; + +struct SIMDVec32 { + uint32x4_t vec; + + static constexpr size_t kLanes = 4; + + FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) { + return SIMDVec32{vld1q_u32(data)}; + } + FJXL_INLINE void Store(uint32_t* data) { vst1q_u32(data, vec); } + FJXL_INLINE static SIMDVec32 Val(uint32_t v) { + return SIMDVec32{vdupq_n_u32(v)}; + } + FJXL_INLINE SIMDVec32 ValToToken() const { + return SIMDVec32{vsubq_u32(vdupq_n_u32(32), vclzq_u32(vec))}; + } + FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const { + return SIMDVec32{vqsubq_u32(vec, to_subtract.vec)}; + } + FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const { + return SIMDVec32{vsubq_u32(vec, to_subtract.vec)}; + } + FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const { + return SIMDVec32{vaddq_u32(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const { + return SIMDVec32{veorq_u32(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec32 Pow2() const { + return SIMDVec32{vshlq_u32(vdupq_n_u32(1), vreinterpretq_s32_u32(vec))}; + } + FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const { + return Mask32{vceqq_u32(vec, oth.vec)}; + } + FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const { + return Mask32{ + vcgtq_s32(vreinterpretq_s32_u32(vec), vreinterpretq_s32_u32(oth.vec))}; + } + template + FJXL_INLINE SIMDVec32 SignedShiftRight() const { + return SIMDVec32{ + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(vec), i))}; + } +}; + +struct SIMDVec16; + +struct Mask16 { + uint16x8_t mask; + SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false); + Mask16 And(const Mask16& oth) const { + return Mask16{vandq_u16(mask, oth.mask)}; + } + size_t CountPrefix() const { + uint16_t val_unset[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + uint16_t val_set[8] = {8, 8, 8, 8, 8, 8, 8, 8}; + uint16x8_t val = vbslq_u16(mask, vld1q_u16(val_set), vld1q_u16(val_unset)); + return vminvq_u16(val); + } +}; + +struct SIMDVec16 { + uint16x8_t vec; + + static constexpr size_t kLanes = 8; + + FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) { + return SIMDVec16{vld1q_u16(data)}; + } + FJXL_INLINE void Store(uint16_t* data) { vst1q_u16(data, vec); } + FJXL_INLINE static SIMDVec16 Val(uint16_t v) { + return SIMDVec16{vdupq_n_u16(v)}; + } + FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo, + const SIMDVec32& hi) { + return SIMDVec16{vmovn_high_u32(vmovn_u32(lo.vec), hi.vec)}; + } + + FJXL_INLINE SIMDVec16 ValToToken() const { + return SIMDVec16{vsubq_u16(vdupq_n_u16(16), vclzq_u16(vec))}; + } + FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const { + return SIMDVec16{vqsubq_u16(vec, to_subtract.vec)}; + } + FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const { + return SIMDVec16{vsubq_u16(vec, to_subtract.vec)}; + } + FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const { + return SIMDVec16{vaddq_u16(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const { + return SIMDVec16{vminq_u16(vec, oth.vec)}; + } + FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const { + return Mask16{vceqq_u16(vec, oth.vec)}; + } + FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const { + return Mask16{ + vcgtq_s16(vreinterpretq_s16_u16(vec), vreinterpretq_s16_u16(oth.vec))}; + } + FJXL_INLINE SIMDVec16 Pow2() const { + return SIMDVec16{vshlq_u16(vdupq_n_u16(1), vreinterpretq_s16_u16(vec))}; + } + FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const { + return SIMDVec16{vorrq_u16(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const { + return SIMDVec16{veorq_u16(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const { + return SIMDVec16{vandq_u16(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const { + return SIMDVec16{vhaddq_u16(vec, oth.vec)}; + } + FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const { + return SIMDVec16{vorrq_u16(vec, vdupq_n_u16(0xFF00))}; + } + FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const { + uint8x16_t tbl = vld1q_u8(table); + uint8x16_t indices = vreinterpretq_u8_u16(vec); + return SIMDVec16{vreinterpretq_u16_u8(vqtbl1q_u8(tbl, indices))}; + } + FJXL_INLINE VecPair Interleave(const SIMDVec16& low) const { + return {SIMDVec16{vzip1q_u16(low.vec, vec)}, + SIMDVec16{vzip2q_u16(low.vec, vec)}}; + } + FJXL_INLINE VecPair Upcast() const { + uint32x4_t lo = vmovl_u16(vget_low_u16(vec)); + uint32x4_t hi = vmovl_high_u16(vec); + return {SIMDVec32{lo}, SIMDVec32{hi}}; + } + template + FJXL_INLINE SIMDVec16 SignedShiftRight() const { + return SIMDVec16{ + vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(vec), i))}; + } + + static std::array LoadG8(const unsigned char* data) { + uint8x8_t v = vld1_u8(data); + return {SIMDVec16{vmovl_u8(v)}}; + } + static std::array LoadG16(const unsigned char* data) { + return {Load((const uint16_t*)data)}; + } + + static std::array LoadGA8(const unsigned char* data) { + uint8x8x2_t v = vld2_u8(data); + return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}}; + } + static std::array LoadGA16(const unsigned char* data) { + uint16x8x2_t v = vld2q_u16((const uint16_t*)data); + return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}}; + } + + static std::array LoadRGB8(const unsigned char* data) { + uint8x8x3_t v = vld3_u8(data); + return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}, + SIMDVec16{vmovl_u8(v.val[2])}}; + } + static std::array LoadRGB16(const unsigned char* data) { + uint16x8x3_t v = vld3q_u16((const uint16_t*)data); + return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}}; + } + + static std::array LoadRGBA8(const unsigned char* data) { + uint8x8x4_t v = vld4_u8(data); + return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}, + SIMDVec16{vmovl_u8(v.val[2])}, SIMDVec16{vmovl_u8(v.val[3])}}; + } + static std::array LoadRGBA16(const unsigned char* data) { + uint16x8x4_t v = vld4q_u16((const uint16_t*)data); + return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}, + SIMDVec16{v.val[3]}}; + } + + void SwapEndian() { + vec = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(vec))); + } +}; + +SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true, + const SIMDVec16& if_false) { + return SIMDVec16{vbslq_u16(mask, if_true.vec, if_false.vec)}; +} + +SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true, + const SIMDVec32& if_false) { + return SIMDVec32{vbslq_u32(mask, if_true.vec, if_false.vec)}; +} + +struct Bits64 { + static constexpr size_t kLanes = 2; + + uint64x2_t nbits; + uint64x2_t bits; + + FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) { + vst1q_u64(nbits_out, nbits); + vst1q_u64(bits_out, bits); + } +}; + +struct Bits32 { + uint32x4_t nbits; + uint32x4_t bits; + + static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) { + return Bits32{nbits.vec, bits.vec}; + } + + Bits64 Merge() const { + // TODO(veluca): can probably be optimized. + uint64x2_t nbits_lo32 = + vandq_u64(vreinterpretq_u64_u32(nbits), vdupq_n_u64(0xFFFFFFFF)); + uint64x2_t bits_hi32 = + vshlq_u64(vshrq_n_u64(vreinterpretq_u64_u32(bits), 32), + vreinterpretq_s64_u64(nbits_lo32)); + uint64x2_t bits_lo32 = + vandq_u64(vreinterpretq_u64_u32(bits), vdupq_n_u64(0xFFFFFFFF)); + uint64x2_t nbits64 = + vsraq_n_u64(nbits_lo32, vreinterpretq_u64_u32(nbits), 32); + uint64x2_t bits64 = vorrq_u64(bits_hi32, bits_lo32); + return Bits64{nbits64, bits64}; + } + + void Interleave(const Bits32& low) { + bits = + vorrq_u32(vshlq_u32(bits, vreinterpretq_s32_u32(low.nbits)), low.bits); + nbits = vaddq_u32(nbits, low.nbits); + } + + void ClipTo(size_t n) { + n = std::min(n, 4); + constexpr uint32_t kMask[8] = { + ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, + }; + uint32x4_t mask = vld1q_u32(kMask + 4 - n); + nbits = vandq_u32(mask, nbits); + bits = vandq_u32(mask, bits); + } + void Skip(size_t n) { + n = std::min(n, 4); + constexpr uint32_t kMask[8] = { + 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, + }; + uint32x4_t mask = vld1q_u32(kMask + 4 - n); + nbits = vandq_u32(mask, nbits); + bits = vandq_u32(mask, bits); + } +}; + +struct Bits16 { + uint16x8_t nbits; + uint16x8_t bits; + + static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) { + return Bits16{nbits.vec, bits.vec}; + } + + Bits32 Merge() const { + // TODO(veluca): can probably be optimized. + uint32x4_t nbits_lo16 = + vandq_u32(vreinterpretq_u32_u16(nbits), vdupq_n_u32(0xFFFF)); + uint32x4_t bits_hi16 = + vshlq_u32(vshrq_n_u32(vreinterpretq_u32_u16(bits), 16), + vreinterpretq_s32_u32(nbits_lo16)); + uint32x4_t bits_lo16 = + vandq_u32(vreinterpretq_u32_u16(bits), vdupq_n_u32(0xFFFF)); + uint32x4_t nbits32 = + vsraq_n_u32(nbits_lo16, vreinterpretq_u32_u16(nbits), 16); + uint32x4_t bits32 = vorrq_u32(bits_hi16, bits_lo16); + return Bits32{nbits32, bits32}; + } + + void Interleave(const Bits16& low) { + bits = + vorrq_u16(vshlq_u16(bits, vreinterpretq_s16_u16(low.nbits)), low.bits); + nbits = vaddq_u16(nbits, low.nbits); + } + + void ClipTo(size_t n) { + n = std::min(n, 8); + constexpr uint16_t kMask[16] = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + uint16x8_t mask = vld1q_u16(kMask + 8 - n); + nbits = vandq_u16(mask, nbits); + bits = vandq_u16(mask, bits); + } + void Skip(size_t n) { + n = std::min(n, 8); + constexpr uint16_t kMask[16] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + }; + uint16x8_t mask = vld1q_u16(kMask + 8 - n); + nbits = vandq_u16(mask, nbits); + bits = vandq_u16(mask, bits); + } +}; + +#endif + +#ifdef FJXL_GENERIC_SIMD +constexpr size_t SIMDVec32::kLanes; +constexpr size_t SIMDVec16::kLanes; + +//  Each of these functions will process SIMDVec16::kLanes worth of values. + +FJXL_INLINE void TokenizeSIMD(const uint16_t* residuals, uint16_t* token_out, + uint16_t* nbits_out, uint16_t* bits_out) { + SIMDVec16 res = SIMDVec16::Load(residuals); + SIMDVec16 token = res.ValToToken(); + SIMDVec16 nbits = token.SatSubU(SIMDVec16::Val(1)); + SIMDVec16 bits = res.SatSubU(nbits.Pow2()); + token.Store(token_out); + nbits.Store(nbits_out); + bits.Store(bits_out); +} + +FJXL_INLINE void TokenizeSIMD(const uint32_t* residuals, uint16_t* token_out, + uint32_t* nbits_out, uint32_t* bits_out) { + static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes, ""); + SIMDVec32 res_lo = SIMDVec32::Load(residuals); + SIMDVec32 res_hi = SIMDVec32::Load(residuals + SIMDVec32::kLanes); + SIMDVec32 token_lo = res_lo.ValToToken(); + SIMDVec32 token_hi = res_hi.ValToToken(); + SIMDVec32 nbits_lo = token_lo.SatSubU(SIMDVec32::Val(1)); + SIMDVec32 nbits_hi = token_hi.SatSubU(SIMDVec32::Val(1)); + SIMDVec32 bits_lo = res_lo.SatSubU(nbits_lo.Pow2()); + SIMDVec32 bits_hi = res_hi.SatSubU(nbits_hi.Pow2()); + SIMDVec16 token = SIMDVec16::FromTwo32(token_lo, token_hi); + token.Store(token_out); + nbits_lo.Store(nbits_out); + nbits_hi.Store(nbits_out + SIMDVec32::kLanes); + bits_lo.Store(bits_out); + bits_hi.Store(bits_out + SIMDVec32::kLanes); +} + +FJXL_INLINE void HuffmanSIMDUpTo13(const uint16_t* tokens, + const PrefixCode& code, uint16_t* nbits_out, + uint16_t* bits_out) { + SIMDVec16 tok = SIMDVec16::Load(tokens).PrepareForU8Lookup(); + tok.U8Lookup(code.raw_nbits_simd).Store(nbits_out); + tok.U8Lookup(code.raw_bits_simd).Store(bits_out); +} + +FJXL_INLINE void HuffmanSIMD14(const uint16_t* tokens, const PrefixCode& code, + uint16_t* nbits_out, uint16_t* bits_out) { + SIMDVec16 token_cap = SIMDVec16::Val(15); + SIMDVec16 tok = SIMDVec16::Load(tokens); + SIMDVec16 tok_index = tok.Min(token_cap).PrepareForU8Lookup(); + SIMDVec16 huff_bits_pre = tok_index.U8Lookup(code.raw_bits_simd); + // Set the highest bit when token == 16; the Huffman code is constructed in + // such a way that the code for token 15 is the same as the code for 16, + // except for the highest bit. + Mask16 needs_high_bit = tok.Eq(SIMDVec16::Val(16)); + SIMDVec16 huff_bits = needs_high_bit.IfThenElse( + huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre); + huff_bits.Store(bits_out); + tok_index.U8Lookup(code.raw_nbits_simd).Store(nbits_out); +} + +FJXL_INLINE void HuffmanSIMDAbove14(const uint16_t* tokens, + const PrefixCode& code, uint16_t* nbits_out, + uint16_t* bits_out) { + SIMDVec16 tok = SIMDVec16::Load(tokens); + // We assume `tok` fits in a *signed* 16-bit integer. + Mask16 above = tok.Gt(SIMDVec16::Val(12)); + // 13, 14 -> 13 + // 15, 16 -> 14 + // 17, 18 -> 15 + SIMDVec16 remap_tok = above.IfThenElse(tok.HAdd(SIMDVec16::Val(13)), tok); + SIMDVec16 tok_index = remap_tok.PrepareForU8Lookup(); + SIMDVec16 huff_bits_pre = tok_index.U8Lookup(code.raw_bits_simd); + // Set the highest bit when token == 14, 16, 18. + Mask16 needs_high_bit = above.And(tok.Eq(tok.And(SIMDVec16::Val(0xFFFE)))); + SIMDVec16 huff_bits = needs_high_bit.IfThenElse( + huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre); + huff_bits.Store(bits_out); + tok_index.U8Lookup(code.raw_nbits_simd).Store(nbits_out); +} + +FJXL_INLINE void StoreSIMDUpTo8(const uint16_t* nbits_tok, + const uint16_t* bits_tok, + const uint16_t* nbits_huff, + const uint16_t* bits_huff, size_t n, + size_t skip, Bits32* bits_out) { + Bits16 bits = + Bits16::FromRaw(SIMDVec16::Load(nbits_tok), SIMDVec16::Load(bits_tok)); + Bits16 huff_bits = + Bits16::FromRaw(SIMDVec16::Load(nbits_huff), SIMDVec16::Load(bits_huff)); + bits.Interleave(huff_bits); + bits.ClipTo(n); + bits.Skip(skip); + bits_out[0] = bits.Merge(); +} + +// Huffman and raw bits don't necessarily fit in a single u16 here. +FJXL_INLINE void StoreSIMDUpTo14(const uint16_t* nbits_tok, + const uint16_t* bits_tok, + const uint16_t* nbits_huff, + const uint16_t* bits_huff, size_t n, + size_t skip, Bits32* bits_out) { + VecPair bits = + SIMDVec16::Load(bits_tok).Interleave(SIMDVec16::Load(bits_huff)); + VecPair nbits = + SIMDVec16::Load(nbits_tok).Interleave(SIMDVec16::Load(nbits_huff)); + Bits16 low = Bits16::FromRaw(nbits.low, bits.low); + Bits16 hi = Bits16::FromRaw(nbits.hi, bits.hi); + low.ClipTo(2 * n); + low.Skip(2 * skip); + hi.ClipTo(std::max(2 * n, SIMDVec16::kLanes) - SIMDVec16::kLanes); + hi.Skip(std::max(2 * skip, SIMDVec16::kLanes) - SIMDVec16::kLanes); + + bits_out[0] = low.Merge(); + bits_out[1] = hi.Merge(); +} + +FJXL_INLINE void StoreSIMDAbove14(const uint32_t* nbits_tok, + const uint32_t* bits_tok, + const uint16_t* nbits_huff, + const uint16_t* bits_huff, size_t n, + size_t skip, Bits32* bits_out) { + static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes, ""); + Bits32 bits_low = + Bits32::FromRaw(SIMDVec32::Load(nbits_tok), SIMDVec32::Load(bits_tok)); + Bits32 bits_hi = + Bits32::FromRaw(SIMDVec32::Load(nbits_tok + SIMDVec32::kLanes), + SIMDVec32::Load(bits_tok + SIMDVec32::kLanes)); + + VecPair huff_bits = SIMDVec16::Load(bits_huff).Upcast(); + VecPair huff_nbits = SIMDVec16::Load(nbits_huff).Upcast(); + + Bits32 huff_low = Bits32::FromRaw(huff_nbits.low, huff_bits.low); + Bits32 huff_hi = Bits32::FromRaw(huff_nbits.hi, huff_bits.hi); + + bits_low.Interleave(huff_low); + bits_low.ClipTo(n); + bits_low.Skip(skip); + bits_out[0] = bits_low; + bits_hi.Interleave(huff_hi); + bits_hi.ClipTo(std::max(n, SIMDVec32::kLanes) - SIMDVec32::kLanes); + bits_hi.Skip(std::max(skip, SIMDVec32::kLanes) - SIMDVec32::kLanes); + bits_out[1] = bits_hi; +} + +#ifdef FJXL_AVX512 +FJXL_INLINE void StoreToWriterAVX512(const Bits32& bits32, BitWriter& output) { + __m512i bits = bits32.bits; + __m512i nbits = bits32.nbits; + + // Insert the leftover bits from the bit buffer at the bottom of the vector + // and extract the top of the vector. + uint64_t trail_bits = + _mm512_cvtsi512_si32(_mm512_alignr_epi32(bits, bits, 15)); + uint64_t trail_nbits = + _mm512_cvtsi512_si32(_mm512_alignr_epi32(nbits, nbits, 15)); + __m512i lead_bits = _mm512_set1_epi32(output.buffer); + __m512i lead_nbits = _mm512_set1_epi32(output.bits_in_buffer); + bits = _mm512_alignr_epi32(bits, lead_bits, 15); + nbits = _mm512_alignr_epi32(nbits, lead_nbits, 15); + + // Merge 32 -> 64 bits. + Bits32 b{nbits, bits}; + Bits64 b64 = b.Merge(); + bits = b64.bits; + nbits = b64.nbits; + + __m512i zero = _mm512_setzero_si512(); + + auto sh1 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 7); }; + auto sh2 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 6); }; + auto sh4 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 4); }; + + // Compute first-past-end-bit-position. + __m512i end_interm0 = _mm512_add_epi64(nbits, sh1(nbits)); + __m512i end_interm1 = _mm512_add_epi64(end_interm0, sh2(end_interm0)); + __m512i end = _mm512_add_epi64(end_interm1, sh4(end_interm1)); + + uint64_t simd_nbits = _mm512_cvtsi512_si32(_mm512_alignr_epi64(end, end, 7)); + + // Compute begin-bit-position. + __m512i begin = _mm512_sub_epi64(end, nbits); + + // Index of the last bit in the chunk, or the end bit if nbits==0. + __m512i last = _mm512_mask_sub_epi64( + end, _mm512_cmpneq_epi64_mask(nbits, zero), end, _mm512_set1_epi64(1)); + + __m512i lane_offset_mask = _mm512_set1_epi64(63); + + // Starting position of the chunk that each lane will ultimately belong to. + __m512i chunk_start = _mm512_andnot_si512(lane_offset_mask, last); + + // For all lanes that contain bits belonging to two different 64-bit chunks, + // compute the number of bits that belong to the first chunk. + // total # of bits fit in a u16, so we can satsub_u16 here. + __m512i first_chunk_nbits = _mm512_subs_epu16(chunk_start, begin); + + // Move all the previous-chunk-bits to the previous lane. + __m512i negnbits = _mm512_sub_epi64(_mm512_set1_epi64(64), first_chunk_nbits); + __m512i first_chunk_bits = + _mm512_srlv_epi64(_mm512_sllv_epi64(bits, negnbits), negnbits); + __m512i first_chunk_bits_down = + _mm512_alignr_epi32(zero, first_chunk_bits, 2); + bits = _mm512_srlv_epi64(bits, first_chunk_nbits); + nbits = _mm512_sub_epi64(nbits, first_chunk_nbits); + bits = _mm512_or_si512(bits, _mm512_sllv_epi64(first_chunk_bits_down, nbits)); + begin = _mm512_add_epi64(begin, first_chunk_nbits); + + // We now know that every lane should give bits to only one chunk. We can + // shift the bits and then horizontally-or-reduce them within the same chunk. + __m512i offset = _mm512_and_si512(begin, lane_offset_mask); + __m512i aligned_bits = _mm512_sllv_epi64(bits, offset); + // h-or-reduce within same chunk + __m512i red0 = _mm512_mask_or_epi64( + aligned_bits, _mm512_cmpeq_epi64_mask(sh1(chunk_start), chunk_start), + sh1(aligned_bits), aligned_bits); + __m512i red1 = _mm512_mask_or_epi64( + red0, _mm512_cmpeq_epi64_mask(sh2(chunk_start), chunk_start), sh2(red0), + red0); + __m512i reduced = _mm512_mask_or_epi64( + red1, _mm512_cmpeq_epi64_mask(sh4(chunk_start), chunk_start), sh4(red1), + red1); + // Extract the highest lane that belongs to each chunk (the lane that ends up + // with the OR-ed value of all the other lanes of that chunk). + __m512i next_chunk_start = + _mm512_alignr_epi32(_mm512_set1_epi64(~0), chunk_start, 2); + __m512i result = _mm512_maskz_compress_epi64( + _mm512_cmpneq_epi64_mask(chunk_start, next_chunk_start), reduced); + + _mm512_storeu_si512((__m512i*)(output.data.get() + output.bytes_written), + result); + + // Update the bit writer and add the last 32-bit lane. + // Note that since trail_nbits was at most 32 to begin with, operating on + // trail_bits does not risk overflowing. + output.bytes_written += simd_nbits / 8; + // Here we are implicitly relying on the fact that simd_nbits < 512 to know + // that the byte of bitreader data we access is initialized. This is + // guaranteed because the remaining bits in the bitreader buffer are at most + // 7, so simd_nbits <= 505 always. + trail_bits = (trail_bits << (simd_nbits % 8)) + + output.data.get()[output.bytes_written]; + trail_nbits += simd_nbits % 8; + StoreLE64(output.data.get() + output.bytes_written, trail_bits); + size_t trail_bytes = trail_nbits / 8; + output.bits_in_buffer = trail_nbits % 8; + output.buffer = trail_bits >> (trail_bytes * 8); + output.bytes_written += trail_bytes; +} + +#endif + +template +FJXL_INLINE void StoreToWriter(const Bits32* bits, BitWriter& output) { +#ifdef FJXL_AVX512 + static_assert(n <= 2, ""); + StoreToWriterAVX512(bits[0], output); + if (n == 2) { + StoreToWriterAVX512(bits[1], output); + } + return; +#endif + static_assert(n <= 4, ""); + alignas(64) uint64_t nbits64[Bits64::kLanes * n]; + alignas(64) uint64_t bits64[Bits64::kLanes * n]; + bits[0].Merge().Store(nbits64, bits64); + if (n > 1) { + bits[1].Merge().Store(nbits64 + Bits64::kLanes, bits64 + Bits64::kLanes); + } + if (n > 2) { + bits[2].Merge().Store(nbits64 + 2 * Bits64::kLanes, + bits64 + 2 * Bits64::kLanes); + } + if (n > 3) { + bits[3].Merge().Store(nbits64 + 3 * Bits64::kLanes, + bits64 + 3 * Bits64::kLanes); + } + output.WriteMultiple(nbits64, bits64, Bits64::kLanes * n); +} + +namespace detail { +template +struct IntegerTypes; + +template <> +struct IntegerTypes { + using signed_ = int16_t; + using unsigned_ = uint16_t; +}; + +template <> +struct IntegerTypes { + using signed_ = int32_t; + using unsigned_ = uint32_t; +}; + +template +struct SIMDType; + +template <> +struct SIMDType { + using type = SIMDVec16; +}; + +template <> +struct SIMDType { + using type = SIMDVec32; +}; + +} // namespace detail + +template +using signed_t = typename detail::IntegerTypes::signed_; + +template +using unsigned_t = typename detail::IntegerTypes::unsigned_; + +template +using simd_t = typename detail::SIMDType::type; + +// This function will process exactly one vector worth of pixels. + +template +size_t PredictPixels(const signed_t* pixels, const signed_t* pixels_left, + const signed_t* pixels_top, + const signed_t* pixels_topleft, + unsigned_t* residuals) { + T px = T::Load((unsigned_t*)pixels); + T left = T::Load((unsigned_t*)pixels_left); + T top = T::Load((unsigned_t*)pixels_top); + T topleft = T::Load((unsigned_t*)pixels_topleft); + T ac = left.Sub(topleft); + T ab = left.Sub(top); + T bc = top.Sub(topleft); + T grad = ac.Add(top); + T d = ab.Xor(bc); + T zero = T::Val(0); + T clamp = zero.Gt(d).IfThenElse(top, left); + T s = ac.Xor(bc); + T pred = zero.Gt(s).IfThenElse(grad, clamp); + T res = px.Sub(pred); + T res_times_2 = res.Add(res); + res = zero.Gt(res).IfThenElse(T::Val(-1).Sub(res_times_2), res_times_2); + res.Store(residuals); + return res.Eq(T::Val(0)).CountPrefix(); +} + +#endif + +void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits, + uint32_t* bits) { + uint32_t n = FloorLog2(value); + *token = value ? n + 1 : 0; + *nbits = value ? n : 0; + *bits = value ? value - (1 << n) : 0; +} + +#ifdef FJXL_AVX512 +constexpr static size_t kLogChunkSize = 5; +#elif defined(FJXL_AVX2) || defined(FJXL_NEON) +// Even if NEON only has 128-bit lanes, it is still significantly (~1.3x) faster +// to process two vectors at a time. +constexpr static size_t kLogChunkSize = 4; +#else +constexpr static size_t kLogChunkSize = 3; +#endif + +constexpr static size_t kChunkSize = 1 << kLogChunkSize; + +template +void GenericEncodeChunk(const Residual* residuals, size_t n, size_t skip, + const PrefixCode& code, BitWriter& output) { + for (size_t ix = skip; ix < n; ix++) { + unsigned token, nbits, bits; + EncodeHybridUint000(residuals[ix], &token, &nbits, &bits); + output.Write(code.raw_nbits[token] + nbits, + code.raw_bits[token] | bits << code.raw_nbits[token]); + } +} + +struct UpTo8Bits { + size_t bitdepth; + explicit UpTo8Bits(size_t bitdepth) : bitdepth(bitdepth) { + assert(bitdepth <= 8); + } + // Here we can fit up to 9 extra bits + 7 Huffman bits in a u16; for all other + // symbols, we could actually go up to 8 Huffman bits as we have at most 8 + // extra bits; however, the SIMD bit merging logic for AVX2 assumes that no + // Huffman length is 8 or more, so we cap at 8 anyway. Last symbol is used for + // LZ77 lengths and has no limitations except allowing to represent 32 symbols + // in total. + static constexpr uint8_t kMinRawLength[12] = {}; + static constexpr uint8_t kMaxRawLength[12] = { + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, + }; + static size_t MaxEncodedBitsPerSample() { return 16; } + static constexpr size_t kInputBytes = 1; + using pixel_t = int16_t; + using upixel_t = uint16_t; + + static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits, + size_t n, uint8_t* nbits_simd, + uint8_t* bits_simd) { + assert(n <= 16); + memcpy(nbits_simd, nbits, 16); + memcpy(bits_simd, bits, 16); + } + + static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip, + const PrefixCode& code, BitWriter& output) { +#ifdef FJXL_GENERIC_SIMD + Bits32 bits32[kChunkSize / SIMDVec16::kLanes]; + alignas(64) uint16_t bits[SIMDVec16::kLanes]; + alignas(64) uint16_t nbits[SIMDVec16::kLanes]; + alignas(64) uint16_t bits_huff[SIMDVec16::kLanes]; + alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes]; + alignas(64) uint16_t token[SIMDVec16::kLanes]; + for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) { + TokenizeSIMD(residuals + i, token, nbits, bits); + HuffmanSIMDUpTo13(token, code, nbits_huff, bits_huff); + StoreSIMDUpTo8(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i, + std::max(skip, i) - i, bits32 + i / SIMDVec16::kLanes); + } + StoreToWriter(bits32, output); + return; +#endif + GenericEncodeChunk(residuals, n, skip, code, output); + } + + size_t NumSymbols(bool doing_ycocg) const { + // values gain 1 bit for YCoCg, 1 bit for prediction. + // Maximum symbol is 1 + effective bit depth of residuals. + if (doing_ycocg) { + return bitdepth + 3; + } else { + return bitdepth + 2; + } + } +}; +constexpr uint8_t UpTo8Bits::kMinRawLength[]; +constexpr uint8_t UpTo8Bits::kMaxRawLength[]; + +struct From9To13Bits { + size_t bitdepth; + explicit From9To13Bits(size_t bitdepth) : bitdepth(bitdepth) { + assert(bitdepth <= 13 && bitdepth >= 9); + } + // Last symbol is used for LZ77 lengths and has no limitations except allowing + // to represent 32 symbols in total. + // We cannot fit all the bits in a u16, so do not even try and use up to 8 + // bits per raw symbol. + // There are at most 16 raw symbols, so Huffman coding can be SIMDfied without + // any special tricks. + static constexpr uint8_t kMinRawLength[17] = {}; + static constexpr uint8_t kMaxRawLength[17] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, + }; + static size_t MaxEncodedBitsPerSample() { return 21; } + static constexpr size_t kInputBytes = 2; + using pixel_t = int16_t; + using upixel_t = uint16_t; + + static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits, + size_t n, uint8_t* nbits_simd, + uint8_t* bits_simd) { + assert(n <= 16); + memcpy(nbits_simd, nbits, 16); + memcpy(bits_simd, bits, 16); + } + + static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip, + const PrefixCode& code, BitWriter& output) { +#ifdef FJXL_GENERIC_SIMD + Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes]; + alignas(64) uint16_t bits[SIMDVec16::kLanes]; + alignas(64) uint16_t nbits[SIMDVec16::kLanes]; + alignas(64) uint16_t bits_huff[SIMDVec16::kLanes]; + alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes]; + alignas(64) uint16_t token[SIMDVec16::kLanes]; + for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) { + TokenizeSIMD(residuals + i, token, nbits, bits); + HuffmanSIMDUpTo13(token, code, nbits_huff, bits_huff); + StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i, + std::max(skip, i) - i, + bits32 + 2 * i / SIMDVec16::kLanes); + } + StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output); + return; +#endif + GenericEncodeChunk(residuals, n, skip, code, output); + } + + size_t NumSymbols(bool doing_ycocg) const { + // values gain 1 bit for YCoCg, 1 bit for prediction. + // Maximum symbol is 1 + effective bit depth of residuals. + if (doing_ycocg) { + return bitdepth + 3; + } else { + return bitdepth + 2; + } + } +}; +constexpr uint8_t From9To13Bits::kMinRawLength[]; +constexpr uint8_t From9To13Bits::kMaxRawLength[]; + +void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) { + assert(nbits1 == 8); + assert(nbits2 == 8); + assert(bits2 == (bits1 | 128)); +} + +struct Exactly14Bits { + explicit Exactly14Bits(size_t bitdepth) { assert(bitdepth == 14); } + // Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to + // have exactly 8, and no other symbol to have 8 or more. This ensures that + // the representation for 15 and 16 is identical up to one bit. + static constexpr uint8_t kMinRawLength[18] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 7, + }; + static constexpr uint8_t kMaxRawLength[18] = { + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 10, + }; + static constexpr size_t bitdepth = 14; + static size_t MaxEncodedBitsPerSample() { return 22; } + static constexpr size_t kInputBytes = 2; + using pixel_t = int16_t; + using upixel_t = uint16_t; + + static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits, + size_t n, uint8_t* nbits_simd, + uint8_t* bits_simd) { + assert(n == 17); + CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]); + memcpy(nbits_simd, nbits, 16); + memcpy(bits_simd, bits, 16); + } + + static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip, + const PrefixCode& code, BitWriter& output) { +#ifdef FJXL_GENERIC_SIMD + Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes]; + alignas(64) uint16_t bits[SIMDVec16::kLanes]; + alignas(64) uint16_t nbits[SIMDVec16::kLanes]; + alignas(64) uint16_t bits_huff[SIMDVec16::kLanes]; + alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes]; + alignas(64) uint16_t token[SIMDVec16::kLanes]; + for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) { + TokenizeSIMD(residuals + i, token, nbits, bits); + HuffmanSIMD14(token, code, nbits_huff, bits_huff); + StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i, + std::max(skip, i) - i, + bits32 + 2 * i / SIMDVec16::kLanes); + } + StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output); + return; +#endif + GenericEncodeChunk(residuals, n, skip, code, output); + } + + size_t NumSymbols(bool) const { return 17; } +}; +constexpr uint8_t Exactly14Bits::kMinRawLength[]; +constexpr uint8_t Exactly14Bits::kMaxRawLength[]; + +struct MoreThan14Bits { + size_t bitdepth; + explicit MoreThan14Bits(size_t bitdepth) : bitdepth(bitdepth) { + assert(bitdepth > 14); + assert(bitdepth <= 16); + } + // Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to + // have exactly 8, and no other symbol to have 8 or more. This ensures that + // the representation for (13, 14), (15, 16), (17, 18) is identical up to one + // bit. + static constexpr uint8_t kMinRawLength[20] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 7, + }; + static constexpr uint8_t kMaxRawLength[20] = { + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 10, + }; + static size_t MaxEncodedBitsPerSample() { return 24; } + static constexpr size_t kInputBytes = 2; + using pixel_t = int32_t; + using upixel_t = uint32_t; + + static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits, + size_t n, uint8_t* nbits_simd, + uint8_t* bits_simd) { + assert(n == 19); + CheckHuffmanBitsSIMD(bits[13], nbits[13], bits[14], nbits[14]); + CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]); + CheckHuffmanBitsSIMD(bits[17], nbits[17], bits[18], nbits[18]); + for (size_t i = 0; i < 14; i++) { + nbits_simd[i] = nbits[i]; + bits_simd[i] = bits[i]; + } + nbits_simd[14] = nbits[15]; + bits_simd[14] = bits[15]; + nbits_simd[15] = nbits[17]; + bits_simd[15] = bits[17]; + } + + static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip, + const PrefixCode& code, BitWriter& output) { +#ifdef FJXL_GENERIC_SIMD + Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes]; + alignas(64) uint32_t bits[SIMDVec16::kLanes]; + alignas(64) uint32_t nbits[SIMDVec16::kLanes]; + alignas(64) uint16_t bits_huff[SIMDVec16::kLanes]; + alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes]; + alignas(64) uint16_t token[SIMDVec16::kLanes]; + for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) { + TokenizeSIMD(residuals + i, token, nbits, bits); + HuffmanSIMDAbove14(token, code, nbits_huff, bits_huff); + StoreSIMDAbove14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i, + std::max(skip, i) - i, + bits32 + 2 * i / SIMDVec16::kLanes); + } + StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output); + return; +#endif + GenericEncodeChunk(residuals, n, skip, code, output); + } + size_t NumSymbols(bool) const { return 19; } +}; +constexpr uint8_t MoreThan14Bits::kMinRawLength[]; +constexpr uint8_t MoreThan14Bits::kMaxRawLength[]; + +void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height, + const PrefixCode code[4], BitWriter* output) { + output->Allocate(100000 + (is_single_group ? width * height * 16 : 0)); + // No patches, spline or noise. + output->Write(1, 1); // default DC dequantization factors (?) + output->Write(1, 1); // use global tree / histograms + output->Write(1, 0); // no lz77 for the tree + + output->Write(1, 1); // simple code for the tree's context map + output->Write(2, 0); // all contexts clustered together + output->Write(1, 1); // use prefix code for tree + output->Write(4, 0); // 000 hybrid uint + output->Write(6, 0b100011); // Alphabet size is 4 (var16) + output->Write(2, 1); // simple prefix code + output->Write(2, 3); // with 4 symbols + output->Write(2, 0); + output->Write(2, 1); + output->Write(2, 2); + output->Write(2, 3); + output->Write(1, 0); // First tree encoding option + // Huffman table + extra bits for the tree. + uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111}; + uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4}; + // Write a tree with a leaf per channel, and gradient predictor for every + // leaf. + for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5, + 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) { + output->Write(symbol_nbits[v], symbol_bits[v]); + } + + output->Write(1, 1); // Enable lz77 for the main bitstream + output->Write(2, 0b00); // lz77 offset 224 + static_assert(kLZ77Offset == 224, ""); + output->Write(4, 0b1010); // lz77 min length 7 + // 400 hybrid uint config for lz77 + output->Write(4, 4); + output->Write(3, 0); + output->Write(3, 0); + + output->Write(1, 1); // simple code for the context map + output->Write(2, 3); // 3 bits per entry + output->Write(3, 4); // channel 3 + output->Write(3, 3); // channel 2 + output->Write(3, 2); // channel 1 + output->Write(3, 1); // channel 0 + output->Write(3, 0); // distance histogram first + + output->Write(1, 1); // use prefix codes + output->Write(4, 0); // 000 hybrid uint config for distances (only need 0) + for (size_t i = 0; i < 4; i++) { + output->Write(4, 0); // 000 hybrid uint config for symbols (only <= 10) + } + + // Distance alphabet size: + output->Write(5, 0b00001); // 2: just need 1 for RLE (i.e. distance 1) + // Symbol + LZ77 alphabet size: + for (size_t i = 0; i < 4; i++) { + output->Write(1, 1); // > 1 + output->Write(4, 8); // <= 512 + output->Write(8, 256); // == 512 + } + + // Distance histogram: + output->Write(2, 1); // simple prefix code + output->Write(2, 0); // with one symbol + output->Write(1, 1); // 1 + + // Symbol + lz77 histogram: + for (size_t i = 0; i < 4; i++) { + code[i].WriteTo(output); + } + + // Group header for global modular image. + output->Write(1, 1); // Global tree + output->Write(1, 1); // All default wp +} + +void PrepareDCGlobal(bool is_single_group, size_t width, size_t height, + size_t nb_chans, const PrefixCode code[4], + BitWriter* output) { + PrepareDCGlobalCommon(is_single_group, width, height, code, output); + if (nb_chans > 2) { + output->Write(2, 0b01); // 1 transform + output->Write(2, 0b00); // RCT + output->Write(5, 0b00000); // Starting from ch 0 + output->Write(2, 0b00); // YCoCg + } else { + output->Write(2, 0b00); // no transforms + } + if (!is_single_group) { + output->ZeroPadToByte(); + } +} + +template +struct ChunkEncoder { + FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode& code, + BitWriter& output) { + if (count == 0) return; + count -= kLZ77MinLength + 1; + if (count < kLZ77CacheSize) { + output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]); + } else { + unsigned token, nbits, bits; + EncodeHybridUintLZ77(count, &token, &nbits, &bits); + uint64_t wbits = bits; + wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token]; + wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0]; + output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits); + } + } + + FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals, + size_t skip, size_t n) { + EncodeRle(run, *code, *output); + BitDepth::EncodeChunk(residuals, n, skip, *code, *output); + } + + inline void Finalize(size_t run) { EncodeRle(run, *code, *output); } + + const PrefixCode* code; + BitWriter* output; +}; + +template +struct ChunkSampleCollector { + FJXL_INLINE void Rle(size_t count, uint64_t* lz77_counts) { + if (count == 0) return; + raw_counts[0] += 1; + count -= kLZ77MinLength + 1; + unsigned token, nbits, bits; + EncodeHybridUintLZ77(count, &token, &nbits, &bits); + lz77_counts[token]++; + } + + FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals, + size_t skip, size_t n) { + // Run is broken. Encode the run and encode the individual vector. + Rle(run, lz77_counts); + for (size_t ix = skip; ix < n; ix++) { + unsigned token, nbits, bits; + EncodeHybridUint000(residuals[ix], &token, &nbits, &bits); + raw_counts[token]++; + } + } + + // don't count final run since we don't know how long it really is + void Finalize(size_t run) {} + + uint64_t* raw_counts; + uint64_t* lz77_counts; +}; + +constexpr uint32_t PackSigned(int32_t value) { + return (static_cast(value) << 1) ^ + ((static_cast(~value) >> 31) - 1); +} + +template +struct ChannelRowProcessor { + using upixel_t = typename BitDepth::upixel_t; + using pixel_t = typename BitDepth::pixel_t; + T* t; + void ProcessChunk(const pixel_t* row, const pixel_t* row_left, + const pixel_t* row_top, const pixel_t* row_topleft, + size_t n) { + alignas(64) upixel_t residuals[kChunkSize] = {}; + size_t prefix_size = 0; + size_t required_prefix_size = 0; +#ifdef FJXL_GENERIC_SIMD + constexpr size_t kNum = + sizeof(pixel_t) == 2 ? SIMDVec16::kLanes : SIMDVec32::kLanes; + for (size_t ix = 0; ix < kChunkSize; ix += kNum) { + size_t c = + PredictPixels>(row + ix, row_left + ix, row_top + ix, + row_topleft + ix, residuals + ix); + prefix_size = + prefix_size == required_prefix_size ? prefix_size + c : prefix_size; + required_prefix_size += kNum; + } +#else + for (size_t ix = 0; ix < kChunkSize; ix++) { + pixel_t px = row[ix]; + pixel_t left = row_left[ix]; + pixel_t top = row_top[ix]; + pixel_t topleft = row_topleft[ix]; + pixel_t ac = left - topleft; + pixel_t ab = left - top; + pixel_t bc = top - topleft; + pixel_t grad = static_cast(static_cast(ac) + + static_cast(top)); + pixel_t d = ab ^ bc; + pixel_t clamp = d < 0 ? top : left; + pixel_t s = ac ^ bc; + pixel_t pred = s < 0 ? grad : clamp; + residuals[ix] = PackSigned(px - pred); + prefix_size = prefix_size == required_prefix_size + ? prefix_size + (residuals[ix] == 0) + : prefix_size; + required_prefix_size += 1; + } +#endif + prefix_size = std::min(n, prefix_size); + if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) { + // Run continues, nothing to do. + run += prefix_size; + } else if (prefix_size + run > kLZ77MinLength) { + // Run is broken. Encode the run and encode the individual vector. + t->Chunk(run + prefix_size, residuals, prefix_size, n); + run = 0; + } else { + // There was no run to begin with. + t->Chunk(0, residuals, 0, n); + } + } + + void ProcessRow(const pixel_t* row, const pixel_t* row_left, + const pixel_t* row_top, const pixel_t* row_topleft, + size_t xs) { + for (size_t x = 0; x < xs; x += kChunkSize) { + ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x, + std::min(kChunkSize, xs - x)); + } + } + + void Finalize() { t->Finalize(run); } + // Invariant: run == 0 or run > kLZ77MinLength. + size_t run = 0; +}; + +uint16_t LoadLE16(const unsigned char* ptr) { + return uint16_t{ptr[0]} | (uint16_t{ptr[1]} << 8); +} + +uint16_t SwapEndian(uint16_t in) { return (in >> 8) | (in << 8); } + +#ifdef FJXL_GENERIC_SIMD +void StorePixels(SIMDVec16 p, int16_t* dest) { p.Store((uint16_t*)dest); } + +void StorePixels(SIMDVec16 p, int32_t* dest) { + VecPair p_up = p.Upcast(); + p_up.low.Store((uint32_t*)dest); + p_up.hi.Store((uint32_t*)dest + SIMDVec32::kLanes); +} +#endif + +template +void FillRowG8(const unsigned char* rgba, size_t oxs, pixel_t* luma) { + size_t x = 0; +#ifdef FJXL_GENERIC_SIMD + for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { + auto rgb = SIMDVec16::LoadG8(rgba + x); + StorePixels(rgb[0], luma + x); + } +#endif + for (; x < oxs; x++) { + luma[x] = rgba[x]; + } +} + +template +void FillRowG16(const unsigned char* rgba, size_t oxs, pixel_t* luma) { + size_t x = 0; +#ifdef FJXL_GENERIC_SIMD + for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { + auto rgb = SIMDVec16::LoadG16(rgba + 2 * x); + if (big_endian) { + rgb[0].SwapEndian(); + } + StorePixels(rgb[0], luma + x); + } +#endif + for (; x < oxs; x++) { + uint16_t val = LoadLE16(rgba + 2 * x); + if (big_endian) { + val = SwapEndian(val); + } + luma[x] = val; + } +} + +template +void FillRowGA8(const unsigned char* rgba, size_t oxs, pixel_t* luma, + pixel_t* alpha) { + size_t x = 0; +#ifdef FJXL_GENERIC_SIMD + for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { + auto rgb = SIMDVec16::LoadGA8(rgba + 2 * x); + StorePixels(rgb[0], luma + x); + StorePixels(rgb[1], alpha + x); + } +#endif + for (; x < oxs; x++) { + luma[x] = rgba[2 * x]; + alpha[x] = rgba[2 * x + 1]; + } +} + +template +void FillRowGA16(const unsigned char* rgba, size_t oxs, pixel_t* luma, + pixel_t* alpha) { + size_t x = 0; +#ifdef FJXL_GENERIC_SIMD + for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { + auto rgb = SIMDVec16::LoadGA16(rgba + 4 * x); + if (big_endian) { + rgb[0].SwapEndian(); + rgb[1].SwapEndian(); + } + StorePixels(rgb[0], luma + x); + StorePixels(rgb[1], alpha + x); + } +#endif + for (; x < oxs; x++) { + uint16_t l = LoadLE16(rgba + 4 * x); + uint16_t a = LoadLE16(rgba + 4 * x + 2); + if (big_endian) { + l = SwapEndian(l); + a = SwapEndian(a); + } + luma[x] = l; + alpha[x] = a; + } +} + +template +void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t* y, pixel_t* co, + pixel_t* cg) { + *co = r - b; + pixel_t tmp = b + (*co >> 1); + *cg = g - tmp; + *y = tmp + (*cg >> 1); +} + +#ifdef FJXL_GENERIC_SIMD +void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int16_t* y, int16_t* co, + int16_t* cg) { + SIMDVec16 co_v = r.Sub(b); + SIMDVec16 tmp = b.Add(co_v.SignedShiftRight<1>()); + SIMDVec16 cg_v = g.Sub(tmp); + SIMDVec16 y_v = tmp.Add(cg_v.SignedShiftRight<1>()); + y_v.Store((uint16_t*)y); + co_v.Store((uint16_t*)co); + cg_v.Store((uint16_t*)cg); +} + +void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int32_t* y, int32_t* co, + int32_t* cg) { + VecPair r_up = r.Upcast(); + VecPair g_up = g.Upcast(); + VecPair b_up = b.Upcast(); + SIMDVec32 co_lo_v = r_up.low.Sub(b_up.low); + SIMDVec32 tmp_lo = b_up.low.Add(co_lo_v.SignedShiftRight<1>()); + SIMDVec32 cg_lo_v = g_up.low.Sub(tmp_lo); + SIMDVec32 y_lo_v = tmp_lo.Add(cg_lo_v.SignedShiftRight<1>()); + SIMDVec32 co_hi_v = r_up.hi.Sub(b_up.hi); + SIMDVec32 tmp_hi = b_up.hi.Add(co_hi_v.SignedShiftRight<1>()); + SIMDVec32 cg_hi_v = g_up.hi.Sub(tmp_hi); + SIMDVec32 y_hi_v = tmp_hi.Add(cg_hi_v.SignedShiftRight<1>()); + y_lo_v.Store((uint32_t*)y); + co_lo_v.Store((uint32_t*)co); + cg_lo_v.Store((uint32_t*)cg); + y_hi_v.Store((uint32_t*)y + SIMDVec32::kLanes); + co_hi_v.Store((uint32_t*)co + SIMDVec32::kLanes); + cg_hi_v.Store((uint32_t*)cg + SIMDVec32::kLanes); +} +#endif + +template +void FillRowRGB8(const unsigned char* rgba, size_t oxs, pixel_t* y, pixel_t* co, + pixel_t* cg) { + size_t x = 0; +#ifdef FJXL_GENERIC_SIMD + for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { + auto rgb = SIMDVec16::LoadRGB8(rgba + 3 * x); + StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x); + } +#endif + for (; x < oxs; x++) { + uint16_t r = rgba[3 * x]; + uint16_t g = rgba[3 * x + 1]; + uint16_t b = rgba[3 * x + 2]; + StoreYCoCg(r, g, b, y + x, co + x, cg + x); + } +} + +template +void FillRowRGB16(const unsigned char* rgba, size_t oxs, pixel_t* y, + pixel_t* co, pixel_t* cg) { + size_t x = 0; +#ifdef FJXL_GENERIC_SIMD + for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { + auto rgb = SIMDVec16::LoadRGB16(rgba + 6 * x); + if (big_endian) { + rgb[0].SwapEndian(); + rgb[1].SwapEndian(); + rgb[2].SwapEndian(); + } + StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x); + } +#endif + for (; x < oxs; x++) { + uint16_t r = LoadLE16(rgba + 6 * x); + uint16_t g = LoadLE16(rgba + 6 * x + 2); + uint16_t b = LoadLE16(rgba + 6 * x + 4); + if (big_endian) { + r = SwapEndian(r); + g = SwapEndian(g); + b = SwapEndian(b); + } + StoreYCoCg(r, g, b, y + x, co + x, cg + x); + } +} + +template +void FillRowRGBA8(const unsigned char* rgba, size_t oxs, pixel_t* y, + pixel_t* co, pixel_t* cg, pixel_t* alpha) { + size_t x = 0; +#ifdef FJXL_GENERIC_SIMD + for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { + auto rgb = SIMDVec16::LoadRGBA8(rgba + 4 * x); + StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x); + StorePixels(rgb[3], alpha + x); + } +#endif + for (; x < oxs; x++) { + uint16_t r = rgba[4 * x]; + uint16_t g = rgba[4 * x + 1]; + uint16_t b = rgba[4 * x + 2]; + uint16_t a = rgba[4 * x + 3]; + StoreYCoCg(r, g, b, y + x, co + x, cg + x); + alpha[x] = a; + } +} + +template +void FillRowRGBA16(const unsigned char* rgba, size_t oxs, pixel_t* y, + pixel_t* co, pixel_t* cg, pixel_t* alpha) { + size_t x = 0; +#ifdef FJXL_GENERIC_SIMD + for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) { + auto rgb = SIMDVec16::LoadRGBA16(rgba + 8 * x); + if (big_endian) { + rgb[0].SwapEndian(); + rgb[1].SwapEndian(); + rgb[2].SwapEndian(); + rgb[3].SwapEndian(); + } + StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x); + StorePixels(rgb[3], alpha + x); + } +#endif + for (; x < oxs; x++) { + uint16_t r = LoadLE16(rgba + 8 * x); + uint16_t g = LoadLE16(rgba + 8 * x + 2); + uint16_t b = LoadLE16(rgba + 8 * x + 4); + uint16_t a = LoadLE16(rgba + 8 * x + 6); + if (big_endian) { + r = SwapEndian(r); + g = SwapEndian(g); + b = SwapEndian(b); + a = SwapEndian(a); + } + StoreYCoCg(r, g, b, y + x, co + x, cg + x); + alpha[x] = a; + } +} + +template +void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0, + size_t xs, size_t yskip, size_t ys, size_t row_stride, + BitDepth bitdepth, size_t nb_chans, bool big_endian, + Processor* processors) { + constexpr size_t kPadding = 32; + + using pixel_t = typename BitDepth::pixel_t; + + constexpr size_t kAlign = 64; + constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t); + + auto align = [=](pixel_t* ptr) { + size_t offset = reinterpret_cast(ptr) % kAlign; + if (offset) { + ptr += offset / sizeof(pixel_t); + } + return ptr; + }; + + constexpr size_t kNumPx = + (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels * + kAlignPixels; + + std::vector, 2>> group_data(nb_chans); + + for (size_t y = 0; y < ys; y++) { + const auto rgba_row = + rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes; + pixel_t* crow[4] = {}; + pixel_t* prow[4] = {}; + for (size_t i = 0; i < nb_chans; i++) { + crow[i] = align(&group_data[i][y & 1][kPadding]); + prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]); + } + + // Pre-fill rows with YCoCg converted pixels. + if (nb_chans == 1) { + if (BitDepth::kInputBytes == 1) { + FillRowG8(rgba_row, xs, crow[0]); + } else if (big_endian) { + FillRowG16(rgba_row, xs, crow[0]); + } else { + FillRowG16(rgba_row, xs, crow[0]); + } + } else if (nb_chans == 2) { + if (BitDepth::kInputBytes == 1) { + FillRowGA8(rgba_row, xs, crow[0], crow[1]); + } else if (big_endian) { + FillRowGA16(rgba_row, xs, crow[0], crow[1]); + } else { + FillRowGA16(rgba_row, xs, crow[0], crow[1]); + } + } else if (nb_chans == 3) { + if (BitDepth::kInputBytes == 1) { + FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]); + } else if (big_endian) { + FillRowRGB16(rgba_row, xs, crow[0], crow[1], + crow[2]); + } else { + FillRowRGB16(rgba_row, xs, crow[0], crow[1], + crow[2]); + } + } else { + if (BitDepth::kInputBytes == 1) { + FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]); + } else if (big_endian) { + FillRowRGBA16(rgba_row, xs, crow[0], crow[1], + crow[2], crow[3]); + } else { + FillRowRGBA16(rgba_row, xs, crow[0], crow[1], + crow[2], crow[3]); + } + } + // Deal with x == 0. + for (size_t c = 0; c < nb_chans; c++) { + *(crow[c] - 1) = y > 0 ? *(prow[c]) : 0; + // Fix topleft. + *(prow[c] - 1) = y > 0 ? *(prow[c]) : 0; + } + if (y < yskip) continue; + for (size_t c = 0; c < nb_chans; c++) { + // Get pointers to px/left/top/topleft data to speedup loop. + const pixel_t* row = crow[c]; + const pixel_t* row_left = crow[c] - 1; + const pixel_t* row_top = y == 0 ? row_left : prow[c]; + const pixel_t* row_topleft = y == 0 ? row_left : prow[c] - 1; + + processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs); + } + } + for (size_t c = 0; c < nb_chans; c++) { + processors[c].Finalize(); + } +} + +template +void WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t xs, + size_t ys, size_t row_stride, bool is_single_group, + BitDepth bitdepth, size_t nb_chans, bool big_endian, + const PrefixCode code[4], + std::array& output) { + for (size_t i = 0; i < nb_chans; i++) { + if (is_single_group && i == 0) continue; + output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4); + } + if (!is_single_group) { + // Group header for modular image. + // When the image is single-group, the global modular image is the one + // that contains the pixel data, and there is no group header. + output[0].Write(1, 1); // Global tree + output[0].Write(1, 1); // All default wp + output[0].Write(2, 0b00); // 0 transforms + } + + ChunkEncoder encoders[4]; + ChannelRowProcessor, BitDepth> row_encoders[4]; + for (size_t c = 0; c < nb_chans; c++) { + row_encoders[c].t = &encoders[c]; + encoders[c].output = &output[c]; + encoders[c].code = &code[c]; + } + ProcessImageArea, BitDepth>>( + rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian, + row_encoders); +} + +constexpr int kHashExp = 16; +constexpr uint32_t kHashSize = 1 << kHashExp; +constexpr uint32_t kHashMultiplier = 2654435761; +constexpr int kMaxColors = 512; + +// can be any function that returns a value in 0 .. kHashSize-1 +// has to map 0 to 0 +inline uint32_t pixel_hash(uint32_t p) { + return (p * kHashMultiplier) >> (32 - kHashExp); +} + +template +void FillRowPalette(const unsigned char* inrow, size_t xs, + const int16_t* lookup, int16_t* out) { + for (size_t x = 0; x < xs; x++) { + uint32_t p = 0; + memcpy(&p, inrow + x * nb_chans, nb_chans); + out[x] = lookup[pixel_hash(p)]; + } +} + +template +void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0, + size_t xs, size_t yskip, size_t ys, + size_t row_stride, const int16_t* lookup, + size_t nb_chans, Processor* processors) { + constexpr size_t kPadding = 32; + + std::vector> group_data(2); + Processor& row_encoder = processors[0]; + + for (size_t y = 0; y < ys; y++) { + // Pre-fill rows with palette converted pixels. + const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans; + int16_t* outrow = &group_data[y & 1][kPadding]; + if (nb_chans == 1) { + FillRowPalette<1>(inrow, xs, lookup, outrow); + } else if (nb_chans == 2) { + FillRowPalette<2>(inrow, xs, lookup, outrow); + } else if (nb_chans == 3) { + FillRowPalette<3>(inrow, xs, lookup, outrow); + } else if (nb_chans == 4) { + FillRowPalette<4>(inrow, xs, lookup, outrow); + } + // Deal with x == 0. + group_data[y & 1][kPadding - 1] = + y > 0 ? group_data[(y - 1) & 1][kPadding] : 0; + // Fix topleft. + group_data[(y - 1) & 1][kPadding - 1] = + y > 0 ? group_data[(y - 1) & 1][kPadding] : 0; + // Get pointers to px/left/top/topleft data to speedup loop. + const int16_t* row = &group_data[y & 1][kPadding]; + const int16_t* row_left = &group_data[y & 1][kPadding - 1]; + const int16_t* row_top = + y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding]; + const int16_t* row_topleft = + y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1]; + + row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs); + } + row_encoder.Finalize(); +} + +void WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0, + size_t xs, size_t ys, size_t row_stride, + bool is_single_group, const PrefixCode code[4], + const int16_t* lookup, size_t nb_chans, + BitWriter& output) { + if (!is_single_group) { + output.Allocate(16 * xs * ys + 4); + // Group header for modular image. + // When the image is single-group, the global modular image is the one + // that contains the pixel data, and there is no group header. + output.Write(1, 1); // Global tree + output.Write(1, 1); // All default wp + output.Write(2, 0b00); // 0 transforms + } + + ChunkEncoder encoder; + ChannelRowProcessor, UpTo8Bits> row_encoder; + + row_encoder.t = &encoder; + encoder.output = &output; + encoder.code = &code[is_single_group ? 1 : 0]; + ProcessImageAreaPalette< + ChannelRowProcessor, UpTo8Bits>>( + rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder); +} + +template +void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs, + size_t row_stride, size_t row_count, + uint64_t raw_counts[4][kNumRawSymbols], + uint64_t lz77_counts[4][kNumLZ77], bool is_single_group, + bool palette, BitDepth bitdepth, size_t nb_chans, + bool big_endian, const int16_t* lookup) { + if (palette) { + ChunkSampleCollector sample_collectors[4]; + ChannelRowProcessor, UpTo8Bits> + row_sample_collectors[4]; + for (size_t c = 0; c < nb_chans; c++) { + row_sample_collectors[c].t = &sample_collectors[c]; + sample_collectors[c].raw_counts = raw_counts[is_single_group ? 1 : 0]; + sample_collectors[c].lz77_counts = lz77_counts[is_single_group ? 1 : 0]; + } + ProcessImageAreaPalette< + ChannelRowProcessor, UpTo8Bits>>( + rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans, + row_sample_collectors); + } else { + ChunkSampleCollector sample_collectors[4]; + ChannelRowProcessor, BitDepth> + row_sample_collectors[4]; + for (size_t c = 0; c < nb_chans; c++) { + row_sample_collectors[c].t = &sample_collectors[c]; + sample_collectors[c].raw_counts = raw_counts[c]; + sample_collectors[c].lz77_counts = lz77_counts[c]; + } + ProcessImageArea< + ChannelRowProcessor, BitDepth>>( + rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans, + big_endian, row_sample_collectors); + } +} + +void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height, + const PrefixCode code[4], + const std::vector& palette, + size_t pcolors, BitWriter* output) { + PrepareDCGlobalCommon(is_single_group, width, height, code, output); + output->Write(2, 0b01); // 1 transform + output->Write(2, 0b01); // Palette + output->Write(5, 0b00000); // Starting from ch 0 + output->Write(2, 0b10); // 4-channel palette (RGBA) + // pcolors <= kMaxColors + kChunkSize - 1 + static_assert(kMaxColors + kChunkSize < 1281, + "add code to signal larger palette sizes"); + if (pcolors < 256) { + output->Write(2, 0b00); + output->Write(8, pcolors); + } else { + output->Write(2, 0b01); + output->Write(10, pcolors - 256); + } + + output->Write(2, 0b00); // nb_deltas == 0 + output->Write(4, 0); // Zero predictor for delta palette + // Encode palette + ChunkEncoder encoder; + ChannelRowProcessor, UpTo8Bits> row_encoder; + row_encoder.t = &encoder; + encoder.output = output; + encoder.code = &code[0]; + int16_t p[4][32 + 1024] = {}; + uint8_t prgba[4]; + size_t i = 0; + size_t have_zero = 0; + if (palette[pcolors - 1] == 0) have_zero = 1; + for (; i < pcolors; i++) { + memcpy(prgba, &palette[i], 4); + p[0][16 + i + have_zero] = prgba[0]; + p[1][16 + i + have_zero] = prgba[1]; + p[2][16 + i + have_zero] = prgba[2]; + p[3][16 + i + have_zero] = prgba[3]; + } + p[0][15] = 0; + row_encoder.ProcessRow(p[0] + 16, p[0] + 15, p[0] + 15, p[0] + 15, pcolors); + p[1][15] = p[0][16]; + p[0][15] = p[0][16]; + row_encoder.ProcessRow(p[1] + 16, p[1] + 15, p[0] + 16, p[0] + 15, pcolors); + p[2][15] = p[1][16]; + p[1][15] = p[1][16]; + row_encoder.ProcessRow(p[2] + 16, p[2] + 15, p[1] + 16, p[1] + 15, pcolors); + p[3][15] = p[2][16]; + p[2][15] = p[2][16]; + row_encoder.ProcessRow(p[3] + 16, p[3] + 15, p[2] + 16, p[2] + 15, pcolors); + row_encoder.Finalize(); + + if (!is_single_group) { + output->ZeroPadToByte(); + } +} + +template +JxlFastLosslessFrameState* LLEnc(const unsigned char* rgba, size_t width, + size_t stride, size_t height, + BitDepth bitdepth, size_t nb_chans, + bool big_endian, int effort, + void* runner_opaque, + FJxlParallelRunner runner) { + assert(width != 0); + assert(height != 0); + assert(stride >= nb_chans * BitDepth::kInputBytes * width); + + // Count colors to try palette + std::vector palette(kHashSize); + palette[0] = 1; + std::vector lookup(kHashSize); + lookup[0] = 0; + int pcolors = 0; + bool collided = effort < 2 || bitdepth.bitdepth != 8 || + nb_chans < 4; // todo: also do rgb palette + for (size_t y = 0; y < height && !collided; y++) { + const unsigned char* r = rgba + stride * y; + size_t x = 0; + if (nb_chans == 4) { + // this is just an unrolling of the next loop + for (; x + 7 < width; x += 8) { + uint32_t p[8], index[8]; + memcpy(p, r + x * 4, 32); + for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]); + for (int i = 0; i < 8; i++) { + uint32_t init_entry = index[i] ? 0 : 1; + if (init_entry != palette[index[i]] && p[i] != palette[index[i]]) { + collided = true; + } + } + for (int i = 0; i < 8; i++) palette[index[i]] = p[i]; + } + for (; x < width; x++) { + uint32_t p; + memcpy(&p, r + x * 4, 4); + uint32_t index = pixel_hash(p); + uint32_t init_entry = index ? 0 : 1; + if (init_entry != palette[index] && p != palette[index]) { + collided = true; + } + palette[index] = p; + } + } else { + for (; x < width; x++) { + uint32_t p = 0; + memcpy(&p, r + x * nb_chans, nb_chans); + uint32_t index = pixel_hash(p); + uint32_t init_entry = index ? 0 : 1; + if (init_entry != palette[index] && p != palette[index]) { + collided = true; + } + palette[index] = p; + } + } + } + + int nb_entries = 0; + if (!collided) { + if (palette[0] == 0) pcolors = 1; + if (palette[0] == 1) palette[0] = 0; + bool have_color = false; + uint8_t minG = 255, maxG = 0; + for (uint32_t k = 0; k < kHashSize; k++) { + if (palette[k] == 0) continue; + uint8_t p[4]; + memcpy(p, &palette[k], 4); + // move entries to front so sort has less work + palette[nb_entries] = palette[k]; + if (p[0] != p[1] || p[0] != p[2]) have_color = true; + if (p[1] < minG) minG = p[1]; + if (p[1] > maxG) maxG = p[1]; + nb_entries++; + // don't do palette if too many colors are needed + if (nb_entries + pcolors > kMaxColors) { + collided = true; + break; + } + } + if (!have_color) { + // don't do palette if it's just grayscale without many holes + if (maxG - minG < nb_entries * 1.4f) collided = true; + } + } + if (!collided) { + std::sort( + palette.begin(), palette.begin() + nb_entries, + [](uint32_t ap, uint32_t bp) { + if (ap == 0) return false; + if (bp == 0) return true; + uint8_t a[4], b[4]; + memcpy(a, &ap, 4); + memcpy(b, &bp, 4); + float ay, by; + ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3]; + by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3]; + return ay < by; // sort on alpha*luma + }); + for (int k = 0; k < nb_entries; k++) { + if (palette[k] == 0) break; + lookup[pixel_hash(palette[k])] = pcolors++; + } + } + + size_t num_groups_x = (width + 255) / 256; + size_t num_groups_y = (height + 255) / 256; + size_t num_dc_groups_x = (width + 2047) / 2048; + size_t num_dc_groups_y = (height + 2047) / 2048; + + uint64_t raw_counts[4][kNumRawSymbols] = {}; + uint64_t lz77_counts[4][kNumLZ77] = {}; + + bool onegroup = num_groups_x == 1 && num_groups_y == 1; + + // sample the middle (effort * 2) rows of every group + for (size_t g = 0; g < num_groups_y * num_groups_x; g++) { + size_t xg = g % num_groups_x; + size_t yg = g / num_groups_x; + int y_offset = yg * 256; + int y_max = std::min(height - yg * 256, 256); + int y_begin = y_offset + std::max(0, y_max - 2 * effort) / 2; + int y_count = + std::min(2 * effort * y_max / 256, y_offset + y_max - y_begin - 1); + int x_max = + std::min(width - xg * 256, 256) / kChunkSize * kChunkSize; + CollectSamples(rgba, xg * 256, y_begin, x_max, stride, y_count, raw_counts, + lz77_counts, onegroup, !collided, bitdepth, nb_chans, + big_endian, lookup.data()); + } + + // TODO(veluca): can probably improve this and make it bitdepth-dependent. + uint64_t base_raw_counts[kNumRawSymbols] = { + 3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51, + 5, 1, 1, 1, 1, 1, 1, 1, 1}; + + bool doing_ycocg = nb_chans > 2 && collided; + for (size_t i = bitdepth.NumSymbols(doing_ycocg); i < kNumRawSymbols; i++) { + base_raw_counts[i] = 0; + } + + for (size_t c = 0; c < 4; c++) { + for (size_t i = 0; i < kNumRawSymbols; i++) { + raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i]; + } + } + + if (!collided) { + unsigned token, nbits, bits; + EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits); + // ensure all palette indices can actually be encoded + for (size_t i = 0; i < token + 1; i++) + raw_counts[0][i] = std::max(raw_counts[0][i], 1); + // these tokens are only used for the palette itself so they can get a bad + // code + for (size_t i = token + 1; i < 10; i++) raw_counts[0][i] = 1; + } + + uint64_t base_lz77_counts[kNumLZ77] = { + 29, 27, 25, 23, 21, 21, 19, 18, 21, 17, 16, 15, 15, 14, + 13, 13, 137, 98, 61, 34, 1, 1, 1, 1, 1, 1, 1, 1, + }; + + for (size_t c = 0; c < 4; c++) { + for (size_t i = 0; i < kNumLZ77; i++) { + lz77_counts[c][i] = (lz77_counts[c][i] << 8) + base_lz77_counts[i]; + } + } + + alignas(64) PrefixCode hcode[4]; + for (size_t i = 0; i < 4; i++) { + hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]); + } + + size_t num_groups = onegroup ? 1 + : (2 + num_dc_groups_x * num_dc_groups_y + + num_groups_x * num_groups_y); + + JxlFastLosslessFrameState* frame_state = new JxlFastLosslessFrameState(); + + frame_state->width = width; + frame_state->height = height; + frame_state->nb_chans = nb_chans; + frame_state->bitdepth = bitdepth.bitdepth; + + frame_state->group_data = std::vector>(num_groups); + if (collided) { + PrepareDCGlobal(onegroup, width, height, nb_chans, hcode, + &frame_state->group_data[0][0]); + } else { + PrepareDCGlobalPalette(onegroup, width, height, hcode, palette, pcolors, + &frame_state->group_data[0][0]); + } + + auto run_one = [&](size_t g) { + size_t xg = g % num_groups_x; + size_t yg = g / num_groups_x; + size_t group_id = + onegroup ? 0 : (2 + num_dc_groups_x * num_dc_groups_y + g); + size_t xs = std::min(width - xg * 256, 256); + size_t ys = std::min(height - yg * 256, 256); + size_t x0 = xg * 256; + size_t y0 = yg * 256; + auto& gd = frame_state->group_data[group_id]; + if (collided) { + WriteACSection(rgba, x0, y0, xs, ys, stride, onegroup, bitdepth, nb_chans, + big_endian, hcode, gd); + + } else { + WriteACSectionPalette(rgba, x0, y0, xs, ys, stride, onegroup, hcode, + lookup.data(), nb_chans, gd[0]); + } + }; + + runner( + runner_opaque, &run_one, + +[](void* r, size_t i) { (*reinterpret_cast(r))(i); }, + num_groups_x * num_groups_y); + + return frame_state; +} + +JxlFastLosslessFrameState* JxlFastLosslessEncodeImpl( + const unsigned char* rgba, size_t width, size_t stride, size_t height, + size_t nb_chans, size_t bitdepth, bool big_endian, int effort, + void* runner_opaque, FJxlParallelRunner runner) { + assert(bitdepth > 0); + assert(nb_chans <= 4); + assert(nb_chans != 0); + if (bitdepth <= 8) { + return LLEnc(rgba, width, stride, height, UpTo8Bits(bitdepth), nb_chans, + big_endian, effort, runner_opaque, runner); + } + if (bitdepth <= 13) { + return LLEnc(rgba, width, stride, height, From9To13Bits(bitdepth), nb_chans, + big_endian, effort, runner_opaque, runner); + } + if (bitdepth == 14) { + return LLEnc(rgba, width, stride, height, Exactly14Bits(bitdepth), nb_chans, + big_endian, effort, runner_opaque, runner); + } + return LLEnc(rgba, width, stride, height, MoreThan14Bits(bitdepth), nb_chans, + big_endian, effort, runner_opaque, runner); +} + +} // namespace + +#endif // FJXL_SELF_INCLUDE + +#ifndef FJXL_SELF_INCLUDE + +#define FJXL_SELF_INCLUDE + +// If we have NEON enabled, it is the default target. +#if FJXL_ENABLE_NEON + +namespace default_implementation { +#define FJXL_NEON +#include "lib/jxl/enc_fast_lossless.cc" +#undef FJXL_NEON +} // namespace default_implementation + +#else // FJXL_ENABLE_NEON + +namespace default_implementation { +#include "lib/jxl/enc_fast_lossless.cc" +} + +#if FJXL_ENABLE_AVX2 +#ifdef __clang__ +#pragma clang attribute push(__attribute__((target("avx,avx2"))), \ + apply_to = function) +// Causes spurious warnings on clang5. +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wmissing-braces" +#elif defined(__GNUC__) +#pragma GCC push_options +// Seems to cause spurious errors on GCC8. +#pragma GCC diagnostic ignored "-Wpsabi" +#pragma GCC target "avx,avx2" +#endif + +namespace AVX2 { +#define FJXL_AVX2 +#include "lib/jxl/enc_fast_lossless.cc" +#undef FJXL_AVX2 +} // namespace AVX2 + +#ifdef __clang__ +#pragma clang attribute pop +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#pragma GCC pop_options +#endif +#endif // FJXL_ENABLE_AVX2 + +#if FJXL_ENABLE_AVX512 +#ifdef __clang__ +#pragma clang attribute push( \ + __attribute__((target("avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"))), \ + apply_to = function) +#elif defined(__GNUC__) +#pragma GCC push_options +#pragma GCC target "avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi" +#endif + +namespace AVX512 { +#define FJXL_AVX512 +#include "lib/jxl/enc_fast_lossless.cc" +#undef FJXL_AVX512 +} // namespace AVX512 + +#ifdef __clang__ +#pragma clang attribute pop +#elif defined(__GNUC__) +#pragma GCC pop_options +#endif +#endif // FJXL_ENABLE_AVX512 + +#endif + +extern "C" { + +size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width, + size_t row_stride, size_t height, size_t nb_chans, + size_t bitdepth, int big_endian, int effort, + unsigned char** output, void* runner_opaque, + FJxlParallelRunner runner) { + auto frame_state = JxlFastLosslessPrepareFrame( + rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort, + runner_opaque, runner); + JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1, + /*is_last=*/1); + size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state); + *output = (unsigned char*)malloc(output_size); + size_t written = 0; + size_t total = 0; + while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total, + output_size - total)) != 0) { + total += written; + } + return total; +} + +JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame( + const unsigned char* rgba, size_t width, size_t row_stride, size_t height, + size_t nb_chans, size_t bitdepth, int big_endian, int effort, + void* runner_opaque, FJxlParallelRunner runner) { + auto trivial_runner = + +[](void*, void* opaque, void fun(void*, size_t), size_t count) { + for (size_t i = 0; i < count; i++) { + fun(opaque, i); + } + }; + + if (runner == nullptr) { + runner = trivial_runner; + } + +#if FJXL_ENABLE_AVX512 + if (__builtin_cpu_supports("avx512cd") && + __builtin_cpu_supports("avx512vbmi") && + __builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512f") && + __builtin_cpu_supports("avx512vl")) { + return AVX512::JxlFastLosslessEncodeImpl(rgba, width, row_stride, height, + nb_chans, bitdepth, big_endian, + effort, runner_opaque, runner); + } +#endif +#if FJXL_ENABLE_AVX2 + if (__builtin_cpu_supports("avx2")) { + return AVX2::JxlFastLosslessEncodeImpl(rgba, width, row_stride, height, + nb_chans, bitdepth, big_endian, + effort, runner_opaque, runner); + } +#endif + + return default_implementation::JxlFastLosslessEncodeImpl( + rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort, + runner_opaque, runner); +} + +} // extern "C" + +#endif // FJXL_SELF_INCLUDE diff --git a/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h new file mode 100644 index 0000000000..4ea1d4f69b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h @@ -0,0 +1,72 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_FAST_LOSSLESS_H_ +#define LIB_JXL_ENC_FAST_LOSSLESS_H_ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Simple encoding API. + +// A FJxlParallelRunner must call fun(opaque, i) for all i from 0 to count. It +// may do so in parallel. +typedef void(FJxlParallelRunner)(void* runner_opaque, void* opaque, + void fun(void*, size_t), size_t count); + +// You may pass `nullptr` as a runner: encoding will be sequential. +size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width, + size_t row_stride, size_t height, size_t nb_chans, + size_t bitdepth, int big_endian, int effort, + unsigned char** output, void* runner_opaque, + FJxlParallelRunner runner); + +// More complex API for cases in which you may want to allocate your own buffer +// and other advanced use cases. + +// Opaque struct that represents an intermediate state of the computation. +struct JxlFastLosslessFrameState; + +// Returned JxlFastLosslessFrameState must be freed by calling +// JxlFastLosslessFreeFrameState. +JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame( + const unsigned char* rgba, size_t width, size_t row_stride, size_t height, + size_t nb_chans, size_t bitdepth, int big_endian, int effort, + void* runner_opaque, FJxlParallelRunner runner); + +// Prepare the (image/frame) header. You may encode animations by concatenating +// the output of multiple frames, of which the first one has add_image_header = +// 1 and subsequent ones have add_image_header = 0, and all frames but the last +// one have is_last = 0. +void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame, + int add_image_header, int is_last); + +// Upper bound on the required output size, including any padding that may be +// required by JxlFastLosslessWriteOutput. Cannot be called before +// JxlFastLosslessPrepareHeader. +size_t JxlFastLosslessMaxRequiredOutput(const JxlFastLosslessFrameState* frame); + +// Actual size of the frame once it is encoded. This is not identical to +// JxlFastLosslessMaxRequiredOutput because JxlFastLosslessWriteOutput may +// require extra padding. +size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame); + +// Writes the frame to the given output buffer. Returns the number of bytes that +// were written, which is at least 1 unless the entire output has been written +// already. It is required that `output_size >= 32` when calling this function. +// This function must be called repeatedly until it returns 0. +size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame, + unsigned char* output, size_t output_size); + +// Frees the provided frame state. +void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // LIB_JXL_ENC_FAST_LOSSLESS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_fields.cc b/third_party/jpeg-xl/lib/jxl/enc_fields.cc new file mode 100644 index 0000000000..22c763e13f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_fields.cc @@ -0,0 +1,239 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_fields.h" + +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/fields.h" + +namespace jxl { + +namespace { +using ::jxl::fields_internal::VisitorBase; +class WriteVisitor : public VisitorBase { + public: + WriteVisitor(const size_t extension_bits, BitWriter* JXL_RESTRICT writer) + : extension_bits_(extension_bits), writer_(writer) {} + + Status Bits(const size_t bits, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + ok_ &= BitsCoder::Write(bits, *value, writer_); + return true; + } + Status U32(const U32Enc enc, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + ok_ &= U32Coder::Write(enc, *value, writer_); + return true; + } + + Status U64(const uint64_t /*default_value*/, + uint64_t* JXL_RESTRICT value) override { + ok_ &= U64Coder::Write(*value, writer_); + return true; + } + + Status F16(const float /*default_value*/, + float* JXL_RESTRICT value) override { + ok_ &= F16Coder::Write(*value, writer_); + return true; + } + + Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override { + JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions)); + if (*extensions == 0) { + JXL_ASSERT(extension_bits_ == 0); + return true; + } + // TODO(janwas): extend API to pass in array of extension_bits, one per + // extension. We currently ascribe all bits to the first extension, but + // this is only an encoder limitation. NOTE: extension_bits_ can be zero + // if an extension does not require any additional fields. + ok_ &= U64Coder::Write(extension_bits_, writer_); + // For each nonzero bit except the lowest/first (already written): + for (uint64_t remaining_extensions = *extensions & (*extensions - 1); + remaining_extensions != 0; + remaining_extensions &= remaining_extensions - 1) { + ok_ &= U64Coder::Write(0, writer_); + } + return true; + } + // EndExtensions = default. + + Status OK() const { return ok_; } + + private: + const size_t extension_bits_; + BitWriter* JXL_RESTRICT writer_; + bool ok_ = true; +}; +} // namespace + +Status Bundle::Write(const Fields& fields, BitWriter* writer, size_t layer, + AuxOut* aux_out) { + size_t extension_bits, total_bits; + JXL_RETURN_IF_ERROR(Bundle::CanEncode(fields, &extension_bits, &total_bits)); + + BitWriter::Allotment allotment(writer, total_bits); + WriteVisitor visitor(extension_bits, writer); + JXL_RETURN_IF_ERROR(visitor.VisitConst(fields)); + JXL_RETURN_IF_ERROR(visitor.OK()); + allotment.ReclaimAndCharge(writer, layer, aux_out); + return true; +} + +// Returns false if the value is too large to encode. +Status BitsCoder::Write(const size_t bits, const uint32_t value, + BitWriter* JXL_RESTRICT writer) { + if (value >= (1ULL << bits)) { + return JXL_FAILURE("Value %d too large to encode in %" PRIu64 " bits", + value, static_cast(bits)); + } + writer->Write(bits, value); + return true; +} + +// Returns false if the value is too large to encode. +Status U32Coder::Write(const U32Enc enc, const uint32_t value, + BitWriter* JXL_RESTRICT writer) { + uint32_t selector; + size_t total_bits; + JXL_RETURN_IF_ERROR(ChooseSelector(enc, value, &selector, &total_bits)); + + writer->Write(2, selector); + + const U32Distr d = enc.GetDistr(selector); + if (!d.IsDirect()) { // Nothing more to write for direct encoding + const uint32_t offset = d.Offset(); + JXL_ASSERT(value >= offset); + writer->Write(total_bits - 2, value - offset); + } + + return true; +} + +// Returns false if the value is too large to encode. +Status U64Coder::Write(uint64_t value, BitWriter* JXL_RESTRICT writer) { + if (value == 0) { + // Selector: use 0 bits, value 0 + writer->Write(2, 0); + } else if (value <= 16) { + // Selector: use 4 bits, value 1..16 + writer->Write(2, 1); + writer->Write(4, value - 1); + } else if (value <= 272) { + // Selector: use 8 bits, value 17..272 + writer->Write(2, 2); + writer->Write(8, value - 17); + } else { + // Selector: varint, first a 12-bit group, after that per 8-bit group. + writer->Write(2, 3); + writer->Write(12, value & 4095); + value >>= 12; + int shift = 12; + while (value > 0 && shift < 60) { + // Indicate varint not done + writer->Write(1, 1); + writer->Write(8, value & 255); + value >>= 8; + shift += 8; + } + if (value > 0) { + // This only could happen if shift == N - 4. + writer->Write(1, 1); + writer->Write(4, value & 15); + // Implicitly closed sequence, no extra stop bit is required. + } else { + // Indicate end of varint + writer->Write(1, 0); + } + } + + return true; +} + +Status F16Coder::Write(float value, BitWriter* JXL_RESTRICT writer) { + uint32_t bits32; + memcpy(&bits32, &value, sizeof(bits32)); + const uint32_t sign = bits32 >> 31; + const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF; + const uint32_t mantissa32 = bits32 & 0x7FFFFF; + + const int32_t exp = static_cast(biased_exp32) - 127; + if (JXL_UNLIKELY(exp > 15)) { + return JXL_FAILURE("Too big to encode, CanEncode should return false"); + } + + // Tiny or zero => zero. + if (exp < -24) { + writer->Write(16, 0); + return true; + } + + uint32_t biased_exp16, mantissa16; + + // exp = [-24, -15] => subnormal + if (JXL_UNLIKELY(exp < -14)) { + biased_exp16 = 0; + const uint32_t sub_exp = static_cast(-14 - exp); + JXL_ASSERT(1 <= sub_exp && sub_exp < 11); + mantissa16 = (1 << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp)); + } else { + // exp = [-14, 15] + biased_exp16 = static_cast(exp + 15); + JXL_ASSERT(1 <= biased_exp16 && biased_exp16 < 31); + mantissa16 = mantissa32 >> 13; + } + + JXL_ASSERT(mantissa16 < 1024); + const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; + JXL_ASSERT(bits16 < 0x10000); + writer->Write(16, bits16); + return true; +} + +Status WriteCodestreamHeaders(CodecMetadata* metadata, BitWriter* writer, + AuxOut* aux_out) { + // Marker/signature + BitWriter::Allotment allotment(writer, 16); + writer->Write(8, 0xFF); + writer->Write(8, kCodestreamMarker); + allotment.ReclaimAndCharge(writer, kLayerHeader, aux_out); + + JXL_RETURN_IF_ERROR( + WriteSizeHeader(metadata->size, writer, kLayerHeader, aux_out)); + + JXL_RETURN_IF_ERROR( + WriteImageMetadata(metadata->m, writer, kLayerHeader, aux_out)); + + metadata->transform_data.nonserialized_xyb_encoded = metadata->m.xyb_encoded; + JXL_RETURN_IF_ERROR( + Bundle::Write(metadata->transform_data, writer, kLayerHeader, aux_out)); + + return true; +} + +Status WriteFrameHeader(const FrameHeader& frame, + BitWriter* JXL_RESTRICT writer, AuxOut* aux_out) { + return Bundle::Write(frame, writer, kLayerHeader, aux_out); +} + +Status WriteImageMetadata(const ImageMetadata& metadata, + BitWriter* JXL_RESTRICT writer, size_t layer, + AuxOut* aux_out) { + return Bundle::Write(metadata, writer, layer, aux_out); +} + +Status WriteQuantizerParams(const QuantizerParams& params, + BitWriter* JXL_RESTRICT writer, size_t layer, + AuxOut* aux_out) { + return Bundle::Write(params, writer, layer, aux_out); +} + +Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer, + size_t layer, AuxOut* aux_out) { + return Bundle::Write(size, writer, layer, aux_out); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_fields.h b/third_party/jpeg-xl/lib/jxl/enc_fields.h new file mode 100644 index 0000000000..5bb179a719 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_fields.h @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_FIELDS_H_ +#define LIB_JXL_ENC_FIELDS_H_ + +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +struct AuxOut; + +// Write headers from the CodecMetadata. Also may modify nonserialized_... +// fields of the metadata. +Status WriteCodestreamHeaders(CodecMetadata* metadata, BitWriter* writer, + AuxOut* aux_out); + +Status WriteFrameHeader(const FrameHeader& frame, + BitWriter* JXL_RESTRICT writer, AuxOut* aux_out); + +Status WriteQuantizerParams(const QuantizerParams& params, + BitWriter* JXL_RESTRICT writer, size_t layer, + AuxOut* aux_out); + +Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer, + size_t layer, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_FIELDS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_file.cc b/third_party/jpeg-xl/lib/jxl/enc_file.cc new file mode 100644 index 0000000000..b1f1442cc2 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_file.cc @@ -0,0 +1,141 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_file.h" + +#include + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/enc_frame.h" +#include "lib/jxl/enc_icc_codec.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +namespace { + +Status PrepareCodecMetadataFromIO(const CompressParams& cparams, + const CodecInOut* io, + CodecMetadata* metadata) { + *metadata = io->metadata; + size_t ups = 1; + if (cparams.already_downsampled) ups = cparams.resampling; + + JXL_RETURN_IF_ERROR(metadata->size.Set(io->xsize() * ups, io->ysize() * ups)); + + // Keep ICC profile in lossless modes because a reconstructed profile may be + // slightly different (quantization). + // Also keep ICC in JPEG reconstruction mode as we need byte-exact profiles. + if (!cparams.IsLossless() && !io->Main().IsJPEG()) { + metadata->m.color_encoding.DecideIfWantICC(); + } + + metadata->m.xyb_encoded = + cparams.color_transform == ColorTransform::kXYB ? true : false; + + // TODO(firsching): move this EncodeFile to test_utils / re-implement this + // using API functions + return true; +} + +} // namespace + +Status EncodePreview(const CompressParams& cparams, const ImageBundle& ib, + const CodecMetadata* metadata, const JxlCmsInterface& cms, + ThreadPool* pool, BitWriter* JXL_RESTRICT writer) { + BitWriter preview_writer; + // TODO(janwas): also support generating preview by downsampling + if (ib.HasColor()) { + AuxOut aux_out; + PassesEncoderState passes_enc_state; + // TODO(lode): check if we want all extra channels and matching xyb_encoded + // for the preview, such that using the main ImageMetadata object for + // encoding this frame is warrented. + FrameInfo frame_info; + frame_info.is_preview = true; + JXL_RETURN_IF_ERROR(EncodeFrame(cparams, frame_info, metadata, ib, + &passes_enc_state, cms, pool, + &preview_writer, &aux_out)); + preview_writer.ZeroPadToByte(); + } + + if (preview_writer.BitsWritten() != 0) { + writer->ZeroPadToByte(); + writer->AppendByteAligned(preview_writer); + } + + return true; +} + +Status EncodeFile(const CompressParams& params, const CodecInOut* io, + PassesEncoderState* passes_enc_state, PaddedBytes* compressed, + const JxlCmsInterface& cms, AuxOut* aux_out, + ThreadPool* pool) { + io->CheckMetadata(); + BitWriter writer; + + CompressParams cparams = params; + if (io->Main().color_transform != ColorTransform::kNone) { + // Set the color transform to YCbCr or XYB if the original image is such. + cparams.color_transform = io->Main().color_transform; + } + + JXL_RETURN_IF_ERROR(ParamsPostInit(&cparams)); + + std::unique_ptr metadata = jxl::make_unique(); + JXL_RETURN_IF_ERROR(PrepareCodecMetadataFromIO(cparams, io, metadata.get())); + JXL_RETURN_IF_ERROR(WriteCodestreamHeaders(metadata.get(), &writer, aux_out)); + + // Only send ICC (at least several hundred bytes) if fields aren't enough. + if (metadata->m.color_encoding.WantICC()) { + JXL_RETURN_IF_ERROR(WriteICC(metadata->m.color_encoding.ICC(), &writer, + kLayerHeader, aux_out)); + } + + if (metadata->m.have_preview) { + JXL_RETURN_IF_ERROR(EncodePreview(cparams, io->preview_frame, + metadata.get(), cms, pool, &writer)); + } + + // Each frame should start on byte boundaries. + BitWriter::Allotment allotment(&writer, 8); + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, kLayerHeader, aux_out); + + for (size_t i = 0; i < io->frames.size(); i++) { + FrameInfo info; + info.is_last = i == io->frames.size() - 1; + if (io->frames[i].use_for_next_frame) { + info.save_as_reference = 1; + } + JXL_RETURN_IF_ERROR(EncodeFrame(cparams, info, metadata.get(), + io->frames[i], passes_enc_state, cms, pool, + &writer, aux_out)); + } + + // Clean up passes_enc_state in case it gets reused. + for (size_t i = 0; i < 4; i++) { + passes_enc_state->shared.dc_frames[i] = Image3F(); + passes_enc_state->shared.reference_frames[i].frame = ImageBundle(); + } + + *compressed = std::move(writer).TakeBytes(); + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_file.h b/third_party/jpeg-xl/lib/jxl/enc_file.h new file mode 100644 index 0000000000..ff3ad1233d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_file.h @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_FILE_H_ +#define LIB_JXL_ENC_FILE_H_ + +// Facade for JXL encoding. + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" + +namespace jxl { + +struct AuxOut; +class CodecInOut; + +// Compresses pixels from `io` (given in any ColorEncoding). +// `io->metadata.m.original` must be set. +Status EncodeFile(const CompressParams& params, const CodecInOut* io, + PassesEncoderState* passes_enc_state, PaddedBytes* compressed, + const JxlCmsInterface& cms, AuxOut* aux_out = nullptr, + ThreadPool* pool = nullptr); + +} // namespace jxl + +#endif // LIB_JXL_ENC_FILE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_frame.cc b/third_party/jpeg-xl/lib/jxl/enc_frame.cc new file mode 100644 index 0000000000..ed4088120e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_frame.cc @@ -0,0 +1,1745 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_chroma_from_luma.h" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/enc_gaborish.h" +#include "lib/jxl/enc_group.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" + +namespace jxl { +namespace { + +PassDefinition progressive_passes_dc_vlf_lf_full_ac[] = { + {/*num_coefficients=*/2, /*shift=*/0, + /*suitable_for_downsampling_of_at_least=*/4}, + {/*num_coefficients=*/3, /*shift=*/0, + /*suitable_for_downsampling_of_at_least=*/2}, + {/*num_coefficients=*/8, /*shift=*/0, + /*suitable_for_downsampling_of_at_least=*/0}, +}; + +PassDefinition progressive_passes_dc_quant_ac_full_ac[] = { + {/*num_coefficients=*/8, /*shift=*/1, + /*suitable_for_downsampling_of_at_least=*/2}, + {/*num_coefficients=*/8, /*shift=*/0, + /*suitable_for_downsampling_of_at_least=*/0}, +}; + +void ClusterGroups(PassesEncoderState* enc_state) { + if (enc_state->shared.frame_header.passes.num_passes > 1) { + // TODO(veluca): implement this for progressive modes. + return; + } + // This only considers pass 0 for now. + std::vector context_map; + EntropyEncodingData codes; + auto& ac = enc_state->passes[0].ac_tokens; + size_t limit = std::ceil(std::sqrt(ac.size())); + if (limit == 1) return; + size_t num_contexts = enc_state->shared.block_ctx_map.NumACContexts(); + std::vector costs(ac.size()); + HistogramParams params; + params.uint_method = HistogramParams::HybridUintMethod::kNone; + params.lz77_method = HistogramParams::LZ77Method::kNone; + params.ans_histogram_strategy = + HistogramParams::ANSHistogramStrategy::kApproximate; + size_t max = 0; + auto token_cost = [&](std::vector>& tokens, size_t num_ctx, + bool estimate = true) { + // TODO(veluca): not estimating is very expensive. + BitWriter writer; + size_t c = BuildAndEncodeHistograms( + params, num_ctx, tokens, &codes, &context_map, + estimate ? nullptr : &writer, 0, /*aux_out=*/0); + if (estimate) return c; + for (size_t i = 0; i < tokens.size(); i++) { + WriteTokens(tokens[i], codes, context_map, &writer, 0, nullptr); + } + return writer.BitsWritten(); + }; + for (size_t i = 0; i < ac.size(); i++) { + std::vector> tokens{ac[i]}; + costs[i] = + token_cost(tokens, enc_state->shared.block_ctx_map.NumACContexts()); + if (costs[i] > costs[max]) { + max = i; + } + } + auto dist = [&](int i, int j) { + std::vector> tokens{ac[i], ac[j]}; + return token_cost(tokens, num_contexts) - costs[i] - costs[j]; + }; + std::vector out{max}; + std::vector dists(ac.size()); + size_t farthest = 0; + for (size_t i = 0; i < ac.size(); i++) { + if (i == max) continue; + dists[i] = dist(max, i); + if (dists[i] > dists[farthest]) { + farthest = i; + } + } + + while (dists[farthest] > 0 && out.size() < limit) { + out.push_back(farthest); + dists[farthest] = 0; + enc_state->histogram_idx[farthest] = out.size() - 1; + for (size_t i = 0; i < ac.size(); i++) { + float d = dist(out.back(), i); + if (d < dists[i]) { + dists[i] = d; + enc_state->histogram_idx[i] = out.size() - 1; + } + if (dists[i] > dists[farthest]) { + farthest = i; + } + } + } + + std::vector remap(out.size()); + std::iota(remap.begin(), remap.end(), 0); + for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) { + enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]]; + } + auto remap_cost = [&](std::vector remap) { + std::vector re_remap(remap.size(), remap.size()); + size_t r = 0; + for (size_t i = 0; i < remap.size(); i++) { + if (re_remap[remap[i]] == remap.size()) { + re_remap[remap[i]] = r++; + } + remap[i] = re_remap[remap[i]]; + } + auto tokens = ac; + size_t max_hist = 0; + for (size_t i = 0; i < tokens.size(); i++) { + for (size_t j = 0; j < tokens[i].size(); j++) { + size_t hist = remap[enc_state->histogram_idx[i]]; + tokens[i][j].context += hist * num_contexts; + max_hist = std::max(hist + 1, max_hist); + } + } + return token_cost(tokens, max_hist * num_contexts, /*estimate=*/false); + }; + + for (size_t src = 0; src < out.size(); src++) { + float cost = remap_cost(remap); + size_t best = src; + for (size_t j = src + 1; j < out.size(); j++) { + if (remap[src] == remap[j]) continue; + auto remap_c = remap; + std::replace(remap_c.begin(), remap_c.end(), remap[src], remap[j]); + float c = remap_cost(remap_c); + if (c < cost) { + best = j; + cost = c; + } + } + if (src != best) { + std::replace(remap.begin(), remap.end(), remap[src], remap[best]); + } + } + std::vector re_remap(remap.size(), remap.size()); + size_t r = 0; + for (size_t i = 0; i < remap.size(); i++) { + if (re_remap[remap[i]] == remap.size()) { + re_remap[remap[i]] = r++; + } + remap[i] = re_remap[remap[i]]; + } + + enc_state->shared.num_histograms = + *std::max_element(remap.begin(), remap.end()) + 1; + for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) { + enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]]; + } + for (size_t i = 0; i < ac.size(); i++) { + for (size_t j = 0; j < ac[i].size(); j++) { + ac[i][j].context += enc_state->histogram_idx[i] * num_contexts; + } + } +} + +uint64_t FrameFlagsFromParams(const CompressParams& cparams) { + uint64_t flags = 0; + + const float dist = cparams.butteraugli_distance; + + // We don't add noise at low butteraugli distances because the original + // noise is stored within the compressed image and adding noise makes things + // worse. + if (ApplyOverride(cparams.noise, dist >= kMinButteraugliForNoise) || + cparams.photon_noise_iso > 0 || + cparams.manual_noise.size() == NoiseParams::kNumNoisePoints) { + flags |= FrameHeader::kNoise; + } + + if (cparams.progressive_dc > 0 && cparams.modular_mode == false) { + flags |= FrameHeader::kUseDcFrame; + } + + return flags; +} + +Status LoopFilterFromParams(const CompressParams& cparams, + FrameHeader* JXL_RESTRICT frame_header) { + LoopFilter* loop_filter = &frame_header->loop_filter; + + // Gaborish defaults to enabled in Hare or slower. + loop_filter->gab = ApplyOverride( + cparams.gaborish, cparams.speed_tier <= SpeedTier::kHare && + frame_header->encoding == FrameEncoding::kVarDCT && + cparams.decoding_speed_tier < 4); + + if (cparams.epf != -1) { + loop_filter->epf_iters = cparams.epf; + } else { + if (frame_header->encoding == FrameEncoding::kModular) { + loop_filter->epf_iters = 0; + } else { + constexpr float kThresholds[3] = {0.7, 1.5, 4.0}; + loop_filter->epf_iters = 0; + if (cparams.decoding_speed_tier < 3) { + for (size_t i = cparams.decoding_speed_tier == 2 ? 1 : 0; i < 3; i++) { + if (cparams.butteraugli_distance >= kThresholds[i]) { + loop_filter->epf_iters++; + } + } + } + } + } + // Strength of EPF in modular mode. + if (frame_header->encoding == FrameEncoding::kModular && + !cparams.IsLossless()) { + // TODO(veluca): this formula is nonsense. + loop_filter->epf_sigma_for_modular = cparams.butteraugli_distance; + } + if (frame_header->encoding == FrameEncoding::kModular && + cparams.lossy_palette) { + loop_filter->epf_sigma_for_modular = 1.0f; + } + + return true; +} + +Status MakeFrameHeader(const CompressParams& cparams, + const ProgressiveSplitter& progressive_splitter, + const FrameInfo& frame_info, const ImageBundle& ib, + FrameHeader* JXL_RESTRICT frame_header) { + frame_header->nonserialized_is_preview = frame_info.is_preview; + frame_header->is_last = frame_info.is_last; + frame_header->save_before_color_transform = + frame_info.save_before_color_transform; + frame_header->frame_type = frame_info.frame_type; + frame_header->name = ib.name; + + progressive_splitter.InitPasses(&frame_header->passes); + + if (cparams.modular_mode) { + frame_header->encoding = FrameEncoding::kModular; + frame_header->group_size_shift = cparams.modular_group_size_shift; + } + + frame_header->chroma_subsampling = ib.chroma_subsampling; + if (ib.IsJPEG()) { + // we are transcoding a JPEG, so we don't get to choose + frame_header->encoding = FrameEncoding::kVarDCT; + frame_header->color_transform = ib.color_transform; + } else { + frame_header->color_transform = cparams.color_transform; + if (!cparams.modular_mode && + (frame_header->chroma_subsampling.MaxHShift() != 0 || + frame_header->chroma_subsampling.MaxVShift() != 0)) { + return JXL_FAILURE( + "Chroma subsampling is not supported in VarDCT mode when not " + "recompressing JPEGs"); + } + } + if (frame_header->color_transform != ColorTransform::kYCbCr && + (frame_header->chroma_subsampling.MaxHShift() != 0 || + frame_header->chroma_subsampling.MaxVShift() != 0)) { + return JXL_FAILURE( + "Chroma subsampling is not supported when color transform is not " + "YCbCr"); + } + + frame_header->flags = FrameFlagsFromParams(cparams); + // Non-photon noise is not supported in the Modular encoder for now. + if (frame_header->encoding != FrameEncoding::kVarDCT && + cparams.photon_noise_iso == 0 && cparams.manual_noise.empty()) { + frame_header->UpdateFlag(false, FrameHeader::Flags::kNoise); + } + + JXL_RETURN_IF_ERROR(LoopFilterFromParams(cparams, frame_header)); + + frame_header->dc_level = frame_info.dc_level; + if (frame_header->dc_level > 2) { + // With 3 or more progressive_dc frames, the implementation does not yet + // work, see enc_cache.cc. + return JXL_FAILURE("progressive_dc > 2 is not yet supported"); + } + if (cparams.progressive_dc > 0 && + (cparams.ec_resampling != 1 || cparams.resampling != 1)) { + return JXL_FAILURE("Resampling not supported with DC frames"); + } + if (cparams.resampling != 1 && cparams.resampling != 2 && + cparams.resampling != 4 && cparams.resampling != 8) { + return JXL_FAILURE("Invalid resampling factor"); + } + if (cparams.ec_resampling != 1 && cparams.ec_resampling != 2 && + cparams.ec_resampling != 4 && cparams.ec_resampling != 8) { + return JXL_FAILURE("Invalid ec_resampling factor"); + } + // Resized frames. + if (frame_info.frame_type != FrameType::kDCFrame) { + frame_header->frame_origin = ib.origin; + size_t ups = 1; + if (cparams.already_downsampled) ups = cparams.resampling; + + // TODO(lode): this is not correct in case of odd original image sizes in + // combination with cparams.already_downsampled. Likely these values should + // be set to respectively frame_header->default_xsize() and + // frame_header->default_ysize() instead, the original (non downsampled) + // intended decoded image dimensions. But it may be more subtle than that + // if combined with crop. This issue causes custom_size_or_origin to be + // incorrectly set to true in case of already_downsampled with odd output + // image size when no cropping is used. + frame_header->frame_size.xsize = ib.xsize() * ups; + frame_header->frame_size.ysize = ib.ysize() * ups; + if (ib.origin.x0 != 0 || ib.origin.y0 != 0 || + frame_header->frame_size.xsize != frame_header->default_xsize() || + frame_header->frame_size.ysize != frame_header->default_ysize()) { + frame_header->custom_size_or_origin = true; + } + } + // Upsampling. + frame_header->upsampling = cparams.resampling; + const std::vector& extra_channels = + frame_header->nonserialized_metadata->m.extra_channel_info; + frame_header->extra_channel_upsampling.clear(); + frame_header->extra_channel_upsampling.resize(extra_channels.size(), + cparams.ec_resampling); + frame_header->save_as_reference = frame_info.save_as_reference; + + // Set blending-related information. + if (ib.blend || frame_header->custom_size_or_origin) { + // Set blend_channel to the first alpha channel. These values are only + // encoded in case a blend mode involving alpha is used and there are more + // than one extra channels. + size_t index = 0; + if (frame_info.alpha_channel == -1) { + if (extra_channels.size() > 1) { + for (size_t i = 0; i < extra_channels.size(); i++) { + if (extra_channels[i].type == ExtraChannel::kAlpha) { + index = i; + break; + } + } + } + } else { + index = static_cast(frame_info.alpha_channel); + JXL_ASSERT(index == 0 || index < extra_channels.size()); + } + frame_header->blending_info.alpha_channel = index; + frame_header->blending_info.mode = + ib.blend ? ib.blendmode : BlendMode::kReplace; + frame_header->blending_info.source = frame_info.source; + frame_header->blending_info.clamp = frame_info.clamp; + const auto& extra_channel_info = frame_info.extra_channel_blending_info; + for (size_t i = 0; i < extra_channels.size(); i++) { + if (i < extra_channel_info.size()) { + frame_header->extra_channel_blending_info[i] = extra_channel_info[i]; + } else { + frame_header->extra_channel_blending_info[i].alpha_channel = index; + BlendMode default_blend = ib.blendmode; + if (extra_channels[i].type != ExtraChannel::kBlack && i != index) { + // K needs to be blended, spot colors and other stuff gets added + default_blend = BlendMode::kAdd; + } + frame_header->extra_channel_blending_info[i].mode = + ib.blend ? default_blend : BlendMode::kReplace; + frame_header->extra_channel_blending_info[i].source = 1; + } + } + } + + frame_header->animation_frame.duration = ib.duration; + frame_header->animation_frame.timecode = ib.timecode; + + return true; +} + +// Invisible (alpha = 0) pixels tend to be a mess in optimized PNGs. +// Since they have no visual impact whatsoever, we can replace them with +// something that compresses better and reduces artifacts near the edges. This +// does some kind of smooth stuff that seems to work. +// Replace invisible pixels with a weighted average of the pixel to the left, +// the pixel to the topright, and non-invisible neighbours. +// Produces downward-blurry smears, with in the upwards direction only a 1px +// edge duplication but not more. It would probably be better to smear in all +// directions. That requires an alpha-weighed convolution with a large enough +// kernel though, which might be overkill... +void SimplifyInvisible(Image3F* image, const ImageF& alpha, bool lossless) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + float* JXL_RESTRICT row = image->PlaneRow(c, y); + const float* JXL_RESTRICT prow = + (y > 0 ? image->PlaneRow(c, y - 1) : nullptr); + const float* JXL_RESTRICT nrow = + (y + 1 < image->ysize() ? image->PlaneRow(c, y + 1) : nullptr); + const float* JXL_RESTRICT a = alpha.Row(y); + const float* JXL_RESTRICT pa = (y > 0 ? alpha.Row(y - 1) : nullptr); + const float* JXL_RESTRICT na = + (y + 1 < image->ysize() ? alpha.Row(y + 1) : nullptr); + for (size_t x = 0; x < image->xsize(); ++x) { + if (a[x] == 0) { + if (lossless) { + row[x] = 0; + continue; + } + float d = 0.f; + row[x] = 0; + if (x > 0) { + row[x] += row[x - 1]; + d++; + if (a[x - 1] > 0.f) { + row[x] += row[x - 1]; + d++; + } + } + if (x + 1 < image->xsize()) { + if (y > 0) { + row[x] += prow[x + 1]; + d++; + } + if (a[x + 1] > 0.f) { + row[x] += 2.f * row[x + 1]; + d += 2.f; + } + if (y > 0 && pa[x + 1] > 0.f) { + row[x] += 2.f * prow[x + 1]; + d += 2.f; + } + if (y + 1 < image->ysize() && na[x + 1] > 0.f) { + row[x] += 2.f * nrow[x + 1]; + d += 2.f; + } + } + if (y > 0 && pa[x] > 0.f) { + row[x] += 2.f * prow[x]; + d += 2.f; + } + if (y + 1 < image->ysize() && na[x] > 0.f) { + row[x] += 2.f * nrow[x]; + d += 2.f; + } + if (d > 1.f) row[x] /= d; + } + } + } + } +} + +struct PixelStatsForChromacityAdjustment { + float dx = 0; + float db = 0; + float exposed_blue = 0; + float CalcPlane(const ImageF* JXL_RESTRICT plane) const { + float xmax = 0; + float ymax = 0; + for (size_t ty = 1; ty < plane->ysize(); ++ty) { + for (size_t tx = 1; tx < plane->xsize(); ++tx) { + float cur = plane->Row(ty)[tx]; + float prev_row = plane->Row(ty - 1)[tx]; + float prev = plane->Row(ty)[tx - 1]; + xmax = std::max(xmax, std::abs(cur - prev)); + ymax = std::max(ymax, std::abs(cur - prev_row)); + } + } + return std::max(xmax, ymax); + } + void CalcExposedBlue(const ImageF* JXL_RESTRICT plane_y, + const ImageF* JXL_RESTRICT plane_b) { + float eb = 0; + float xmax = 0; + float ymax = 0; + for (size_t ty = 1; ty < plane_y->ysize(); ++ty) { + for (size_t tx = 1; tx < plane_y->xsize(); ++tx) { + float cur_y = plane_y->Row(ty)[tx]; + float cur_b = plane_b->Row(ty)[tx]; + float exposed_b = cur_b - cur_y * 1.2; + float diff_b = cur_b - cur_y; + float prev_row = plane_b->Row(ty - 1)[tx]; + float prev = plane_b->Row(ty)[tx - 1]; + float diff_prev_row = prev_row - plane_y->Row(ty - 1)[tx]; + float diff_prev = prev - plane_y->Row(ty)[tx - 1]; + xmax = std::max(xmax, std::abs(diff_b - diff_prev)); + ymax = std::max(ymax, std::abs(diff_b - diff_prev_row)); + if (exposed_b >= 0) { + exposed_b *= fabs(cur_b - prev) + fabs(cur_b - prev_row); + eb = std::max(eb, exposed_b); + } + } + } + exposed_blue = eb; + db = std::max(xmax, ymax); + } + void Calc(const Image3F* JXL_RESTRICT opsin) { + dx = CalcPlane(&opsin->Plane(0)); + CalcExposedBlue(&opsin->Plane(1), &opsin->Plane(2)); + } + int HowMuchIsXChannelPixelized() { + if (dx >= 0.03) { + return 2; + } + if (dx >= 0.017) { + return 1; + } + return 0; + } + int HowMuchIsBChannelPixelized() { + int add = exposed_blue >= 0.13 ? 1 : 0; + if (db > 0.38) { + return 2 + add; + } + if (db > 0.33) { + return 1 + add; + } + if (db > 0.28) { + return add; + } + return 0; + } +}; + +} // namespace + +class LossyFrameEncoder { + public: + LossyFrameEncoder(const CompressParams& cparams, + const FrameHeader& frame_header, + PassesEncoderState* JXL_RESTRICT enc_state, + const JxlCmsInterface& cms, ThreadPool* pool, + AuxOut* aux_out) + : enc_state_(enc_state), cms_(cms), pool_(pool), aux_out_(aux_out) { + JXL_CHECK(InitializePassesSharedState(frame_header, &enc_state_->shared, + /*encoder=*/true)); + enc_state_->cparams = cparams; + enc_state_->passes.clear(); + } + + Status ComputeEncodingData(const ImageBundle* linear, + Image3F* JXL_RESTRICT opsin, + const JxlCmsInterface& cms, ThreadPool* pool, + ModularFrameEncoder* modular_frame_encoder, + FrameHeader* frame_header) { + PROFILER_ZONE("ComputeEncodingData uninstrumented"); + JXL_ASSERT((opsin->xsize() % kBlockDim) == 0 && + (opsin->ysize() % kBlockDim) == 0); + PassesSharedState& shared = enc_state_->shared; + + if (!enc_state_->cparams.max_error_mode) { + // Compute chromacity adjustments using two approaches. + // 1) Distance based approach for chromacity adjustment: + float x_qm_scale_steps[4] = {1.25f, 7.0f, 15.0f, 24.0f}; + shared.frame_header.x_qm_scale = 2; + for (float x_qm_scale_step : x_qm_scale_steps) { + if (enc_state_->cparams.original_butteraugli_distance > + x_qm_scale_step) { + shared.frame_header.x_qm_scale++; + } + } + if (enc_state_->cparams.butteraugli_distance < 0.299f) { + // Favor chromacity preservation for making images appear more + // faithful to original even with extreme (5-10x) zooming. + shared.frame_header.x_qm_scale++; + } + // 2) Pixel-based approach for chromacity adjustment: + // look at the individual pixels and make a guess how difficult + // the image would be based on the worst case pixel. + PixelStatsForChromacityAdjustment pixel_stats; + if (enc_state_->cparams.speed_tier <= SpeedTier::kWombat) { + pixel_stats.Calc(opsin); + } + // For X take the most severe adjustment. + shared.frame_header.x_qm_scale = + std::max(shared.frame_header.x_qm_scale, + 2 + pixel_stats.HowMuchIsXChannelPixelized()); + // B only ajudsted by pixel-based approach. + shared.frame_header.b_qm_scale = + 2 + pixel_stats.HowMuchIsBChannelPixelized(); + } + + JXL_RETURN_IF_ERROR(enc_state_->heuristics->LossyFrameHeuristics( + enc_state_, modular_frame_encoder, linear, opsin, cms_, pool_, + aux_out_)); + + JXL_RETURN_IF_ERROR(InitializePassesEncoder( + *opsin, cms, pool_, enc_state_, modular_frame_encoder, aux_out_)); + + enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses()); + for (PassesEncoderState::PassData& pass : enc_state_->passes) { + pass.ac_tokens.resize(shared.frame_dim.num_groups); + } + + ComputeAllCoeffOrders(shared.frame_dim); + shared.num_histograms = 1; + + const auto tokenize_group_init = [&](const size_t num_threads) { + group_caches_.resize(num_threads); + return true; + }; + const auto tokenize_group = [&](const uint32_t group_index, + const size_t thread) { + // Tokenize coefficients. + const Rect rect = shared.BlockGroupRect(group_index); + for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size(); + idx_pass++) { + JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32); + const int32_t* JXL_RESTRICT ac_rows[3] = { + enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32, + }; + // Ensure group cache is initialized. + group_caches_[thread].InitOnce(); + TokenizeCoefficients( + &shared.coeff_orders[idx_pass * shared.coeff_order_size], rect, + ac_rows, shared.ac_strategy, frame_header->chroma_subsampling, + &group_caches_[thread].num_nzeroes, + &enc_state_->passes[idx_pass].ac_tokens[group_index], + enc_state_->shared.quant_dc, enc_state_->shared.raw_quant_field, + enc_state_->shared.block_ctx_map); + } + }; + JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, shared.frame_dim.num_groups, + tokenize_group_init, tokenize_group, + "TokenizeGroup")); + + *frame_header = shared.frame_header; + return true; + } + + Status ComputeJPEGTranscodingData(const jpeg::JPEGData& jpeg_data, + ModularFrameEncoder* modular_frame_encoder, + FrameHeader* frame_header) { + PROFILER_ZONE("ComputeJPEGTranscodingData uninstrumented"); + PassesSharedState& shared = enc_state_->shared; + + frame_header->x_qm_scale = 2; + frame_header->b_qm_scale = 2; + + FrameDimensions frame_dim = frame_header->ToFrameDimensions(); + + const size_t xsize = frame_dim.xsize_padded; + const size_t ysize = frame_dim.ysize_padded; + const size_t xsize_blocks = frame_dim.xsize_blocks; + const size_t ysize_blocks = frame_dim.ysize_blocks; + + // no-op chroma from luma + shared.cmap = ColorCorrelationMap(xsize, ysize, false); + shared.ac_strategy.FillDCT8(); + FillImage(uint8_t(0), &shared.epf_sharpness); + + enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses()); + for (PassesEncoderState::PassData& pass : enc_state_->passes) { + pass.ac_tokens.resize(shared.frame_dim.num_groups); + } + + enc_state_->coeffs.clear(); + while (enc_state_->coeffs.size() < enc_state_->passes.size()) { + enc_state_->coeffs.emplace_back(make_unique>( + kGroupDim * kGroupDim, frame_dim.num_groups)); + } + + // convert JPEG quantization table to a Quantizer object + float dcquantization[3]; + std::vector qe(DequantMatrices::kNum, + QuantEncoding::Library(0)); + + auto jpeg_c_map = JpegOrder(frame_header->color_transform, + jpeg_data.components.size() == 1); + + std::vector qt(192); + for (size_t c = 0; c < 3; c++) { + size_t jpeg_c = jpeg_c_map[c]; + const int32_t* quant = + jpeg_data.quant[jpeg_data.components[jpeg_c].quant_idx].values.data(); + + dcquantization[c] = 255 * 8.0f / quant[0]; + for (size_t y = 0; y < 8; y++) { + for (size_t x = 0; x < 8; x++) { + // JPEG XL transposes the DCT, JPEG doesn't. + qt[c * 64 + 8 * x + y] = quant[8 * y + x]; + } + } + } + DequantMatricesSetCustomDC(&shared.matrices, dcquantization); + float dcquantization_r[3] = {1.0f / dcquantization[0], + 1.0f / dcquantization[1], + 1.0f / dcquantization[2]}; + + qe[AcStrategy::Type::DCT] = QuantEncoding::RAW(qt); + DequantMatricesSetCustom(&shared.matrices, qe, modular_frame_encoder); + + // Ensure that InvGlobalScale() is 1. + shared.quantizer = Quantizer(&shared.matrices, 1, kGlobalScaleDenom); + // Recompute MulDC() and InvMulDC(). + shared.quantizer.RecomputeFromGlobalScale(); + + // Per-block dequant scaling should be 1. + FillImage(static_cast(shared.quantizer.InvGlobalScale()), + &shared.raw_quant_field); + + std::vector scaled_qtable(192); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 64; i++) { + scaled_qtable[64 * c + i] = + (1 << kCFLFixedPointPrecision) * qt[64 + i] / qt[64 * c + i]; + } + } + + auto jpeg_row = [&](size_t c, size_t y) { + return jpeg_data.components[jpeg_c_map[c]].coeffs.data() + + jpeg_data.components[jpeg_c_map[c]].width_in_blocks * + kDCTBlockSize * y; + }; + + Image3F dc = Image3F(xsize_blocks, ysize_blocks); + bool DCzero = + (shared.frame_header.color_transform == ColorTransform::kYCbCr); + // Compute chroma-from-luma for AC (doesn't seem to be useful for DC) + if (frame_header->chroma_subsampling.Is444() && + enc_state_->cparams.force_cfl_jpeg_recompression && + jpeg_data.components.size() == 3) { + for (size_t c : {0, 2}) { + ImageSB* map = (c == 0 ? &shared.cmap.ytox_map : &shared.cmap.ytob_map); + const float kScale = kDefaultColorFactor; + const int kOffset = 127; + const float kBase = + c == 0 ? shared.cmap.YtoXRatio(0) : shared.cmap.YtoBRatio(0); + const float kZeroThresh = + kScale * kZeroBiasDefault[c] * + 0.9999f; // just epsilon less for better rounding + + auto process_row = [&](const uint32_t task, const size_t thread) { + size_t ty = task; + int8_t* JXL_RESTRICT row_out = map->Row(ty); + for (size_t tx = 0; tx < map->xsize(); ++tx) { + const size_t y0 = ty * kColorTileDimInBlocks; + const size_t x0 = tx * kColorTileDimInBlocks; + const size_t y1 = std::min(frame_dim.ysize_blocks, + (ty + 1) * kColorTileDimInBlocks); + const size_t x1 = std::min(frame_dim.xsize_blocks, + (tx + 1) * kColorTileDimInBlocks); + int32_t d_num_zeros[257] = {0}; + // TODO(veluca): this needs SIMD + fixed point adaptation, and/or + // conversion to the new CfL algorithm. + for (size_t y = y0; y < y1; ++y) { + const int16_t* JXL_RESTRICT row_m = jpeg_row(1, y); + const int16_t* JXL_RESTRICT row_s = jpeg_row(c, y); + for (size_t x = x0; x < x1; ++x) { + for (size_t coeffpos = 1; coeffpos < kDCTBlockSize; + coeffpos++) { + const float scaled_m = + row_m[x * kDCTBlockSize + coeffpos] * + scaled_qtable[64 * c + coeffpos] * + (1.0f / (1 << kCFLFixedPointPrecision)); + const float scaled_s = + kScale * row_s[x * kDCTBlockSize + coeffpos] + + (kOffset - kBase * kScale) * scaled_m; + if (std::abs(scaled_m) > 1e-8f) { + float from, to; + if (scaled_m > 0) { + from = (scaled_s - kZeroThresh) / scaled_m; + to = (scaled_s + kZeroThresh) / scaled_m; + } else { + from = (scaled_s + kZeroThresh) / scaled_m; + to = (scaled_s - kZeroThresh) / scaled_m; + } + if (from < 0.0f) { + from = 0.0f; + } + if (to > 255.0f) { + to = 255.0f; + } + // Instead of clamping the both values + // we just check that range is sane. + if (from <= to) { + d_num_zeros[static_cast(std::ceil(from))]++; + d_num_zeros[static_cast(std::floor(to + 1))]--; + } + } + } + } + } + int best = 0; + int32_t best_sum = 0; + FindIndexOfSumMaximum(d_num_zeros, 256, &best, &best_sum); + int32_t offset_sum = 0; + for (int i = 0; i < 256; ++i) { + if (i <= kOffset) { + offset_sum += d_num_zeros[i]; + } + } + row_out[tx] = 0; + if (best_sum > offset_sum + 1) { + row_out[tx] = best - kOffset; + } + } + }; + + JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, map->ysize(), + ThreadPool::NoInit, process_row, + "FindCorrelation")); + } + } + + if (!frame_header->chroma_subsampling.Is444()) { + ZeroFillImage(&dc); + for (auto& coeff : enc_state_->coeffs) { + coeff->ZeroFill(); + } + } + // JPEG DC is from -1024 to 1023. + std::vector dc_counts[3] = {}; + dc_counts[0].resize(2048); + dc_counts[1].resize(2048); + dc_counts[2].resize(2048); + size_t total_dc[3] = {}; + for (size_t c : {1, 0, 2}) { + if (jpeg_data.components.size() == 1 && c != 1) { + for (auto& coeff : enc_state_->coeffs) { + coeff->ZeroFillPlane(c); + } + ZeroFillImage(&dc.Plane(c)); + // Ensure no division by 0. + dc_counts[c][1024] = 1; + total_dc[c] = 1; + continue; + } + size_t hshift = frame_header->chroma_subsampling.HShift(c); + size_t vshift = frame_header->chroma_subsampling.VShift(c); + ImageSB& map = (c == 0 ? shared.cmap.ytox_map : shared.cmap.ytob_map); + for (size_t group_index = 0; group_index < frame_dim.num_groups; + group_index++) { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + int32_t* coeffs[kMaxNumPasses]; + for (size_t i = 0; i < enc_state_->coeffs.size(); i++) { + coeffs[i] = enc_state_->coeffs[i]->PlaneRow(c, group_index, 0).ptr32; + } + int32_t block[64]; + for (size_t by = gy * kGroupDimInBlocks; + by < ysize_blocks && by < (gy + 1) * kGroupDimInBlocks; ++by) { + if ((by >> vshift) << vshift != by) continue; + const int16_t* JXL_RESTRICT inputjpeg = jpeg_row(c, by >> vshift); + const int16_t* JXL_RESTRICT inputjpegY = jpeg_row(1, by); + float* JXL_RESTRICT fdc = dc.PlaneRow(c, by >> vshift); + const int8_t* JXL_RESTRICT cm = + map.ConstRow(by / kColorTileDimInBlocks); + for (size_t bx = gx * kGroupDimInBlocks; + bx < xsize_blocks && bx < (gx + 1) * kGroupDimInBlocks; ++bx) { + if ((bx >> hshift) << hshift != bx) continue; + size_t base = (bx >> hshift) * kDCTBlockSize; + int idc; + if (DCzero) { + idc = inputjpeg[base]; + } else { + idc = inputjpeg[base] + 1024 / qt[c * 64]; + } + dc_counts[c][std::min(static_cast(idc + 1024), + uint32_t(2047))]++; + total_dc[c]++; + fdc[bx >> hshift] = idc * dcquantization_r[c]; + if (c == 1 || !enc_state_->cparams.force_cfl_jpeg_recompression || + !frame_header->chroma_subsampling.Is444()) { + for (size_t y = 0; y < 8; y++) { + for (size_t x = 0; x < 8; x++) { + block[y * 8 + x] = inputjpeg[base + x * 8 + y]; + } + } + } else { + const int32_t scale = + shared.cmap.RatioJPEG(cm[bx / kColorTileDimInBlocks]); + + for (size_t y = 0; y < 8; y++) { + for (size_t x = 0; x < 8; x++) { + int Y = inputjpegY[kDCTBlockSize * bx + x * 8 + y]; + int QChroma = inputjpeg[kDCTBlockSize * bx + x * 8 + y]; + // Fixed-point multiply of CfL scale with quant table ratio + // first, and Y value second. + int coeff_scale = (scale * scaled_qtable[64 * c + y * 8 + x] + + (1 << (kCFLFixedPointPrecision - 1))) >> + kCFLFixedPointPrecision; + int cfl_factor = (Y * coeff_scale + + (1 << (kCFLFixedPointPrecision - 1))) >> + kCFLFixedPointPrecision; + int QCR = QChroma - cfl_factor; + block[y * 8 + x] = QCR; + } + } + } + enc_state_->progressive_splitter.SplitACCoefficients( + block, AcStrategy::FromRawStrategy(AcStrategy::Type::DCT), bx, + by, coeffs); + for (size_t i = 0; i < enc_state_->coeffs.size(); i++) { + coeffs[i] += kDCTBlockSize; + } + } + } + } + } + + auto& dct = enc_state_->shared.block_ctx_map.dc_thresholds; + auto& num_dc_ctxs = enc_state_->shared.block_ctx_map.num_dc_ctxs; + num_dc_ctxs = 1; + for (size_t i = 0; i < 3; i++) { + dct[i].clear(); + int num_thresholds = (CeilLog2Nonzero(total_dc[i]) - 12) / 2; + // up to 3 buckets per channel: + // dark/medium/bright, yellow/unsat/blue, green/unsat/red + num_thresholds = std::min(std::max(num_thresholds, 0), 2); + size_t cumsum = 0; + size_t cut = total_dc[i] / (num_thresholds + 1); + for (int j = 0; j < 2048; j++) { + cumsum += dc_counts[i][j]; + if (cumsum > cut) { + dct[i].push_back(j - 1025); + cut = total_dc[i] * (dct[i].size() + 1) / (num_thresholds + 1); + } + } + num_dc_ctxs *= dct[i].size() + 1; + } + + auto& ctx_map = enc_state_->shared.block_ctx_map.ctx_map; + ctx_map.clear(); + ctx_map.resize(3 * kNumOrders * num_dc_ctxs, 0); + + int lbuckets = (dct[1].size() + 1); + for (size_t i = 0; i < num_dc_ctxs; i++) { + // up to 9 contexts for luma + ctx_map[i] = i / lbuckets; + // up to 3 contexts for chroma + ctx_map[kNumOrders * num_dc_ctxs + i] = + ctx_map[2 * kNumOrders * num_dc_ctxs + i] = + num_dc_ctxs / lbuckets + (i % lbuckets); + } + enc_state_->shared.block_ctx_map.num_ctxs = + *std::max_element(ctx_map.begin(), ctx_map.end()) + 1; + + enc_state_->histogram_idx.resize(shared.frame_dim.num_groups); + + // disable DC frame for now + shared.frame_header.UpdateFlag(false, FrameHeader::kUseDcFrame); + auto compute_dc_coeffs = [&](const uint32_t group_index, + size_t /* thread */) { + modular_frame_encoder->AddVarDCTDC(dc, group_index, /*nl_dc=*/false, + enc_state_, /*jpeg_transcode=*/true); + modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/true, + enc_state_); + }; + JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, shared.frame_dim.num_dc_groups, + ThreadPool::NoInit, compute_dc_coeffs, + "Compute DC coeffs")); + + // Must happen before WriteFrameHeader! + shared.frame_header.UpdateFlag(true, FrameHeader::kSkipAdaptiveDCSmoothing); + + ComputeAllCoeffOrders(frame_dim); + shared.num_histograms = 1; + + const auto tokenize_group_init = [&](const size_t num_threads) { + group_caches_.resize(num_threads); + return true; + }; + const auto tokenize_group = [&](const uint32_t group_index, + const size_t thread) { + // Tokenize coefficients. + const Rect rect = shared.BlockGroupRect(group_index); + for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size(); + idx_pass++) { + JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32); + const int32_t* JXL_RESTRICT ac_rows[3] = { + enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32, + }; + // Ensure group cache is initialized. + group_caches_[thread].InitOnce(); + TokenizeCoefficients( + &shared.coeff_orders[idx_pass * shared.coeff_order_size], rect, + ac_rows, shared.ac_strategy, frame_header->chroma_subsampling, + &group_caches_[thread].num_nzeroes, + &enc_state_->passes[idx_pass].ac_tokens[group_index], + enc_state_->shared.quant_dc, enc_state_->shared.raw_quant_field, + enc_state_->shared.block_ctx_map); + } + }; + JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, shared.frame_dim.num_groups, + tokenize_group_init, tokenize_group, + "TokenizeGroup")); + *frame_header = shared.frame_header; + doing_jpeg_recompression = true; + return true; + } + + Status EncodeGlobalDCInfo(const FrameHeader& frame_header, + BitWriter* writer) const { + // Encode quantizer DC and global scale. + QuantizerParams params = enc_state_->shared.quantizer.GetParams(); + JXL_RETURN_IF_ERROR( + WriteQuantizerParams(params, writer, kLayerQuant, aux_out_)); + EncodeBlockCtxMap(enc_state_->shared.block_ctx_map, writer, aux_out_); + ColorCorrelationMapEncodeDC(&enc_state_->shared.cmap, writer, kLayerDC, + aux_out_); + return true; + } + + Status EncodeGlobalACInfo(BitWriter* writer, + ModularFrameEncoder* modular_frame_encoder) { + JXL_RETURN_IF_ERROR(DequantMatricesEncode(&enc_state_->shared.matrices, + writer, kLayerQuant, aux_out_, + modular_frame_encoder)); + if (enc_state_->cparams.speed_tier <= SpeedTier::kTortoise) { + if (!doing_jpeg_recompression) ClusterGroups(enc_state_); + } + size_t num_histo_bits = + CeilLog2Nonzero(enc_state_->shared.frame_dim.num_groups); + if (num_histo_bits != 0) { + BitWriter::Allotment allotment(writer, num_histo_bits); + writer->Write(num_histo_bits, enc_state_->shared.num_histograms - 1); + allotment.ReclaimAndCharge(writer, kLayerAC, aux_out_); + } + + for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses(); + i++) { + // Encode coefficient orders. + size_t order_bits = 0; + JXL_RETURN_IF_ERROR(U32Coder::CanEncode( + kOrderEnc, enc_state_->used_orders[i], &order_bits)); + BitWriter::Allotment allotment(writer, order_bits); + JXL_CHECK(U32Coder::Write(kOrderEnc, enc_state_->used_orders[i], writer)); + allotment.ReclaimAndCharge(writer, kLayerOrder, aux_out_); + EncodeCoeffOrders( + enc_state_->used_orders[i], + &enc_state_->shared + .coeff_orders[i * enc_state_->shared.coeff_order_size], + writer, kLayerOrder, aux_out_); + + // Encode histograms. + HistogramParams hist_params( + enc_state_->cparams.speed_tier, + enc_state_->shared.block_ctx_map.NumACContexts()); + if (enc_state_->cparams.speed_tier > SpeedTier::kTortoise) { + hist_params.lz77_method = HistogramParams::LZ77Method::kNone; + } + if (enc_state_->cparams.decoding_speed_tier >= 1) { + hist_params.max_histograms = 6; + } + BuildAndEncodeHistograms( + hist_params, + enc_state_->shared.num_histograms * + enc_state_->shared.block_ctx_map.NumACContexts(), + enc_state_->passes[i].ac_tokens, &enc_state_->passes[i].codes, + &enc_state_->passes[i].context_map, writer, kLayerAC, aux_out_); + } + + return true; + } + + Status EncodeACGroup(size_t pass, size_t group_index, BitWriter* group_code, + AuxOut* local_aux_out) { + return EncodeGroupTokenizedCoefficients( + group_index, pass, enc_state_->histogram_idx[group_index], *enc_state_, + group_code, local_aux_out); + } + + PassesEncoderState* State() { return enc_state_; } + + private: + void ComputeAllCoeffOrders(const FrameDimensions& frame_dim) { + PROFILER_FUNC; + // No coefficient reordering in Falcon or faster. + auto used_orders_info = ComputeUsedOrders( + enc_state_->cparams.speed_tier, enc_state_->shared.ac_strategy, + Rect(enc_state_->shared.raw_quant_field)); + enc_state_->used_orders.clear(); + enc_state_->used_orders.resize( + enc_state_->progressive_splitter.GetNumPasses(), + used_orders_info.second); + for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses(); + i++) { + ComputeCoeffOrder( + enc_state_->cparams.speed_tier, *enc_state_->coeffs[i], + enc_state_->shared.ac_strategy, frame_dim, enc_state_->used_orders[i], + used_orders_info.first, + &enc_state_->shared + .coeff_orders[i * enc_state_->shared.coeff_order_size]); + } + } + + template + static inline void FindIndexOfSumMaximum(const V* array, const size_t len, + R* idx, V* sum) { + JXL_ASSERT(len > 0); + V maxval = 0; + V val = 0; + R maxidx = 0; + for (size_t i = 0; i < len; ++i) { + val += array[i]; + if (val > maxval) { + maxval = val; + maxidx = i; + } + } + *idx = maxidx; + *sum = maxval; + } + + PassesEncoderState* JXL_RESTRICT enc_state_; + JxlCmsInterface cms_; + ThreadPool* pool_; + AuxOut* aux_out_; + std::vector group_caches_; + bool doing_jpeg_recompression = false; +}; + +Status ParamsPostInit(CompressParams* p) { + if (!p->manual_noise.empty() && + p->manual_noise.size() != NoiseParams::kNumNoisePoints) { + return JXL_FAILURE("Invalid number of noise lut entries"); + } + if (!p->manual_xyb_factors.empty() && p->manual_xyb_factors.size() != 3) { + return JXL_FAILURE("Invalid number of XYB quantization factors"); + } + if (!p->modular_mode && p->butteraugli_distance == 0.0) { + p->butteraugli_distance = kMinButteraugliDistance; + } + if (p->original_butteraugli_distance == -1.0) { + p->original_butteraugli_distance = p->butteraugli_distance; + } + if (p->resampling <= 0) { + p->resampling = 1; + // For very low bit rates, using 2x2 resampling gives better results on + // most photographic images, with an adjusted butteraugli score chosen to + // give roughly the same amount of bits per pixel. + if (!p->already_downsampled && p->butteraugli_distance >= 20) { + p->resampling = 2; + p->butteraugli_distance = 6 + ((p->butteraugli_distance - 20) * 0.25); + } + } + if (p->ec_resampling <= 0) { + p->ec_resampling = p->resampling; + } + return true; +} + +Status EncodeFrame(const CompressParams& cparams_orig, + const FrameInfo& frame_info, const CodecMetadata* metadata, + const ImageBundle& ib, PassesEncoderState* passes_enc_state, + const JxlCmsInterface& cms, ThreadPool* pool, + BitWriter* writer, AuxOut* aux_out) { + CompressParams cparams = cparams_orig; + if (cparams.speed_tier == SpeedTier::kGlacier && !cparams.IsLossless()) { + cparams.speed_tier = SpeedTier::kTortoise; + } + if (cparams.speed_tier == SpeedTier::kGlacier) { + std::vector all_params; + std::vector size; + + CompressParams cparams_attempt = cparams_orig; + cparams_attempt.speed_tier = SpeedTier::kTortoise; + cparams_attempt.options.max_properties = 4; + + for (float x : {0.0f, 80.f}) { + cparams_attempt.channel_colors_percent = x; + for (float y : {0.0f, 95.0f}) { + cparams_attempt.channel_colors_pre_transform_percent = y; + // 70000 ensures that the number of palette colors is representable in + // modular headers. + for (int K : {0, 1 << 10, 70000}) { + cparams_attempt.palette_colors = K; + for (int tree_mode : {-1, (int)ModularOptions::TreeMode::kNoWP, + (int)ModularOptions::TreeMode::kDefault}) { + if (tree_mode == -1) { + // LZ77 only + cparams_attempt.options.nb_repeats = 0; + } else { + cparams_attempt.options.nb_repeats = 1; + cparams_attempt.options.wp_tree_mode = + static_cast(tree_mode); + } + for (Predictor pred : {Predictor::Zero, Predictor::Variable}) { + cparams_attempt.options.predictor = pred; + for (int g : {0, 1, 3}) { + cparams_attempt.modular_group_size_shift = g; + for (Override patches : {Override::kDefault, Override::kOff}) { + cparams_attempt.patches = patches; + all_params.push_back(cparams_attempt); + } + } + } + } + } + } + } + + size.resize(all_params.size()); + + std::atomic num_errors{0}; + + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, all_params.size(), ThreadPool::NoInit, + [&](size_t task, size_t) { + BitWriter w; + PassesEncoderState state; + if (!EncodeFrame(all_params[task], frame_info, metadata, ib, &state, + cms, nullptr, &w, aux_out)) { + num_errors.fetch_add(1, std::memory_order_relaxed); + return; + } + size[task] = w.BitsWritten(); + }, + "Compress kGlacier")); + JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0); + + size_t best_idx = 0; + for (size_t i = 1; i < all_params.size(); i++) { + if (size[best_idx] > size[i]) { + best_idx = i; + } + } + cparams = all_params[best_idx]; + } + + if (cparams_orig.target_bitrate > 0.0f && + frame_info.frame_type == FrameType::kRegularFrame) { + cparams.target_bitrate = 0.0f; + const float target_bitrate = cparams_orig.target_bitrate; + float bitrate = 0.0f; + float prev_bitrate = 0.0f; + float rescale = 1.0f; + size_t prev_bits = 0; + float error = 0.0f; + float best_error = 100.0f; + float best_rescale = 1.0f; + for (size_t i = 0; i < 10; ++i) { + std::unique_ptr state = + jxl::make_unique(); + BitWriter bw; + JXL_CHECK(EncodeFrame(cparams, frame_info, metadata, ib, state.get(), cms, + pool, &bw, nullptr)); + bitrate = bw.BitsWritten() * 1.0 / (ib.xsize() * ib.ysize()); + error = target_bitrate / bitrate - 1.0f; + if (std::abs(error) < std::abs(best_error)) { + best_error = error; + best_rescale = cparams.quant_ac_rescale; + } + if (bw.BitsWritten() == prev_bits || std::abs(error) < 0.0005f) { + break; + } + float lambda = 1.0f; + if (i > 0) { + lambda = (((bitrate / prev_bitrate) - 1.0f) / (rescale - 1.0f)); + } + rescale = (1.0f + ((target_bitrate / bitrate) - 1.0f) / lambda); + if (rescale < 0.0f) { + break; + } + cparams.quant_ac_rescale *= rescale; + prev_bitrate = bitrate; + prev_bits = bw.BitsWritten(); + } + if (aux_out) { + aux_out->max_quant_rescale = best_rescale; + aux_out->min_quant_rescale = best_rescale; + aux_out->min_bitrate_error = best_error; + aux_out->max_bitrate_error = best_error; + } + cparams.quant_ac_rescale = best_rescale; + } + ib.VerifyMetadata(); + + passes_enc_state->special_frames.clear(); + + if (cparams.qprogressive_mode) { + passes_enc_state->progressive_splitter.SetProgressiveMode( + ProgressiveMode{progressive_passes_dc_quant_ac_full_ac}); + } else if (cparams.progressive_mode) { + passes_enc_state->progressive_splitter.SetProgressiveMode( + ProgressiveMode{progressive_passes_dc_vlf_lf_full_ac}); + } + + JXL_RETURN_IF_ERROR(ParamsPostInit(&cparams)); + + if (cparams.progressive_dc < 0) { + if (cparams.progressive_dc != -1) { + return JXL_FAILURE("Invalid progressive DC setting value (%d)", + cparams.progressive_dc); + } + cparams.progressive_dc = 0; + } + if (cparams.ec_resampling < cparams.resampling) { + cparams.ec_resampling = cparams.resampling; + } + if (cparams.resampling > 1 || frame_info.is_preview) { + cparams.progressive_dc = 0; + } + + if (frame_info.dc_level + cparams.progressive_dc > 4) { + return JXL_FAILURE("Too many levels of progressive DC"); + } + + if (cparams.butteraugli_distance != 0 && + cparams.butteraugli_distance < kMinButteraugliDistance) { + return JXL_FAILURE("Butteraugli distance is too low (%f)", + cparams.butteraugli_distance); + } + + if (ib.IsJPEG()) { + cparams.gaborish = Override::kOff; + cparams.epf = 0; + cparams.modular_mode = false; + } + + if (ib.xsize() == 0 || ib.ysize() == 0) return JXL_FAILURE("Empty image"); + + // Assert that this metadata is correctly set up for the compression params, + // this should have been done by enc_file.cc + JXL_ASSERT(metadata->m.xyb_encoded == + (cparams.color_transform == ColorTransform::kXYB)); + std::unique_ptr frame_header = + jxl::make_unique(metadata); + JXL_RETURN_IF_ERROR(MakeFrameHeader(cparams, + passes_enc_state->progressive_splitter, + frame_info, ib, frame_header.get())); + // Check that if the codestream header says xyb_encoded, the color_transform + // matches the requirement. This is checked from the cparams here, even though + // optimally we'd be able to check this against what has actually been written + // in the main codestream header, but since ib is a const object and the data + // written to the main codestream header is (in modified form) in ib, the + // encoder cannot indicate this fact in the ib's metadata. + if (cparams_orig.color_transform == ColorTransform::kXYB) { + if (frame_header->color_transform != ColorTransform::kXYB) { + return JXL_FAILURE( + "The color transform of frames must be xyb if the codestream is xyb " + "encoded"); + } + } else { + if (frame_header->color_transform == ColorTransform::kXYB) { + return JXL_FAILURE( + "The color transform of frames cannot be xyb if the codestream is " + "not xyb encoded"); + } + } + + FrameDimensions frame_dim = frame_header->ToFrameDimensions(); + + const size_t num_groups = frame_dim.num_groups; + + Image3F opsin; + const ColorEncoding& c_linear = ColorEncoding::LinearSRGB(ib.IsGray()); + std::unique_ptr metadata_linear = + jxl::make_unique(); + metadata_linear->xyb_encoded = + (cparams.color_transform == ColorTransform::kXYB); + metadata_linear->color_encoding = c_linear; + ImageBundle linear_storage(metadata_linear.get()); + + std::vector aux_outs; + // LossyFrameEncoder stores a reference to a std::function + // so we need to keep the std::function being referenced + // alive while lossy_frame_encoder is used. We could make resize_aux_outs a + // lambda type by making LossyFrameEncoder a template instead, but this is + // simpler. + const std::function resize_aux_outs = + [&aux_outs, aux_out](const size_t num_threads) -> Status { + if (aux_out != nullptr) { + size_t old_size = aux_outs.size(); + for (size_t i = num_threads; i < old_size; i++) { + aux_out->Assimilate(aux_outs[i]); + } + aux_outs.resize(num_threads); + // Each thread needs these INPUTS. Don't copy the entire AuxOut + // because it may contain stats which would be Assimilated multiple + // times below. + for (size_t i = old_size; i < aux_outs.size(); i++) { + aux_outs[i].dump_image = aux_out->dump_image; + aux_outs[i].debug_prefix = aux_out->debug_prefix; + } + } + return true; + }; + + LossyFrameEncoder lossy_frame_encoder(cparams, *frame_header, + passes_enc_state, cms, pool, aux_out); + std::unique_ptr modular_frame_encoder = + jxl::make_unique(*frame_header, cparams); + + const std::vector* extra_channels = &ib.extra_channels(); + std::vector extra_channels_storage; + // Clear patches + passes_enc_state->shared.image_features.patches = PatchDictionary(); + passes_enc_state->shared.image_features.patches.SetPassesSharedState( + &passes_enc_state->shared); + + if (ib.IsJPEG()) { + JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeJPEGTranscodingData( + *ib.jpeg_data, modular_frame_encoder.get(), frame_header.get())); + } else if (!lossy_frame_encoder.State()->heuristics->HandlesColorConversion( + cparams, ib) || + frame_header->encoding != FrameEncoding::kVarDCT) { + // Allocating a large enough image avoids a copy when padding. + opsin = + Image3F(RoundUpToBlockDim(ib.xsize()), RoundUpToBlockDim(ib.ysize())); + opsin.ShrinkTo(ib.xsize(), ib.ysize()); + + const bool want_linear = frame_header->encoding == FrameEncoding::kVarDCT && + cparams.speed_tier <= SpeedTier::kKitten; + const ImageBundle* JXL_RESTRICT ib_or_linear = &ib; + + if (frame_header->color_transform == ColorTransform::kXYB && + frame_info.ib_needs_color_transform) { + // linear_storage would only be used by the Butteraugli loop (passing + // linear sRGB avoids a color conversion there). Otherwise, don't + // fill it to reduce memory usage. + ib_or_linear = + ToXYB(ib, pool, &opsin, cms, want_linear ? &linear_storage : nullptr); + } else { // RGB or YCbCr: don't do anything (forward YCbCr is not + // implemented, this is only used when the input is already in + // YCbCr) + // If encoding a special DC or reference frame, don't do anything: + // input is already in XYB. + CopyImageTo(ib.color(), &opsin); + } + bool lossless = cparams.IsLossless(); + if (ib.HasAlpha() && !ib.AlphaIsPremultiplied() && + frame_header->frame_type == FrameType::kRegularFrame && + !ApplyOverride(cparams.keep_invisible, lossless) && + cparams.ec_resampling == cparams.resampling) { + // simplify invisible pixels + SimplifyInvisible(&opsin, ib.alpha(), lossless); + if (want_linear) { + SimplifyInvisible(const_cast(&ib_or_linear->color()), + ib.alpha(), lossless); + } + } + if (aux_out != nullptr) { + JXL_RETURN_IF_ERROR( + aux_out->InspectImage3F("enc_frame:OpsinDynamicsImage", opsin)); + } + if (frame_header->encoding == FrameEncoding::kVarDCT) { + PadImageToBlockMultipleInPlace(&opsin); + JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData( + ib_or_linear, &opsin, cms, pool, modular_frame_encoder.get(), + frame_header.get())); + } else if (frame_header->upsampling != 1 && !cparams.already_downsampled) { + // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling + // after noise, if necessary. + DownsampleImage(&opsin, frame_header->upsampling); + } + } else { + JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData( + &ib, &opsin, cms, pool, modular_frame_encoder.get(), + frame_header.get())); + } + if (cparams.ec_resampling != 1 && !cparams.already_downsampled) { + extra_channels = &extra_channels_storage; + for (size_t i = 0; i < ib.extra_channels().size(); i++) { + extra_channels_storage.emplace_back(CopyImage(ib.extra_channels()[i])); + DownsampleImage(&extra_channels_storage.back(), cparams.ec_resampling); + } + } + // needs to happen *AFTER* VarDCT-ComputeEncodingData. + JXL_RETURN_IF_ERROR(modular_frame_encoder->ComputeEncodingData( + *frame_header, *ib.metadata(), &opsin, *extra_channels, + lossy_frame_encoder.State(), cms, pool, aux_out, + /* do_color=*/frame_header->encoding == FrameEncoding::kModular)); + + writer->AppendByteAligned(lossy_frame_encoder.State()->special_frames); + frame_header->UpdateFlag( + lossy_frame_encoder.State()->shared.image_features.patches.HasAny(), + FrameHeader::kPatches); + frame_header->UpdateFlag( + lossy_frame_encoder.State()->shared.image_features.splines.HasAny(), + FrameHeader::kSplines); + JXL_RETURN_IF_ERROR(WriteFrameHeader(*frame_header, writer, aux_out)); + + const size_t num_passes = + passes_enc_state->progressive_splitter.GetNumPasses(); + + // DC global info + DC groups + AC global info + AC groups * + // num_passes. + const bool has_ac_global = true; + std::vector group_codes(NumTocEntries(frame_dim.num_groups, + frame_dim.num_dc_groups, + num_passes, has_ac_global)); + const size_t global_ac_index = frame_dim.num_dc_groups + 1; + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + const auto get_output = [&](const size_t index) { + return &group_codes[is_small_image ? 0 : index]; + }; + auto ac_group_code = [&](size_t pass, size_t group) { + return get_output(AcGroupIndex(pass, group, frame_dim.num_groups, + frame_dim.num_dc_groups, has_ac_global)); + }; + + if (frame_header->flags & FrameHeader::kPatches) { + PatchDictionaryEncoder::Encode( + lossy_frame_encoder.State()->shared.image_features.patches, + get_output(0), kLayerDictionary, aux_out); + } + + if (frame_header->flags & FrameHeader::kSplines) { + EncodeSplines(lossy_frame_encoder.State()->shared.image_features.splines, + get_output(0), kLayerSplines, HistogramParams(), aux_out); + } + + if (cparams.photon_noise_iso > 0) { + lossy_frame_encoder.State()->shared.image_features.noise_params = + SimulatePhotonNoise(ib.xsize(), ib.ysize(), cparams.photon_noise_iso); + } + if (cparams.manual_noise.size() == NoiseParams::kNumNoisePoints) { + for (size_t i = 0; i < NoiseParams::kNumNoisePoints; i++) { + lossy_frame_encoder.State()->shared.image_features.noise_params.lut[i] = + cparams.manual_noise[i]; + } + } + if (frame_header->flags & FrameHeader::kNoise) { + EncodeNoise(lossy_frame_encoder.State()->shared.image_features.noise_params, + get_output(0), kLayerNoise, aux_out); + } + + JXL_RETURN_IF_ERROR( + DequantMatricesEncodeDC(&lossy_frame_encoder.State()->shared.matrices, + get_output(0), kLayerQuant, aux_out)); + if (frame_header->encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR( + lossy_frame_encoder.EncodeGlobalDCInfo(*frame_header, get_output(0))); + } + JXL_RETURN_IF_ERROR( + modular_frame_encoder->EncodeGlobalInfo(get_output(0), aux_out)); + JXL_RETURN_IF_ERROR(modular_frame_encoder->EncodeStream( + get_output(0), aux_out, kLayerModularGlobal, ModularStreamId::Global())); + + const auto process_dc_group = [&](const uint32_t group_index, + const size_t thread) { + AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr; + BitWriter* output = get_output(group_index + 1); + if (frame_header->encoding == FrameEncoding::kVarDCT && + !(frame_header->flags & FrameHeader::kUseDcFrame)) { + BitWriter::Allotment allotment(output, 2); + output->Write(2, modular_frame_encoder->extra_dc_precision[group_index]); + allotment.ReclaimAndCharge(output, kLayerDC, my_aux_out); + JXL_CHECK(modular_frame_encoder->EncodeStream( + output, my_aux_out, kLayerDC, + ModularStreamId::VarDCTDC(group_index))); + } + JXL_CHECK(modular_frame_encoder->EncodeStream( + output, my_aux_out, kLayerModularDcGroup, + ModularStreamId::ModularDC(group_index))); + if (frame_header->encoding == FrameEncoding::kVarDCT) { + const Rect& rect = + lossy_frame_encoder.State()->shared.DCGroupRect(group_index); + size_t nb_bits = CeilLog2Nonzero(rect.xsize() * rect.ysize()); + if (nb_bits != 0) { + BitWriter::Allotment allotment(output, nb_bits); + output->Write(nb_bits, + modular_frame_encoder->ac_metadata_size[group_index] - 1); + allotment.ReclaimAndCharge(output, kLayerControlFields, my_aux_out); + } + JXL_CHECK(modular_frame_encoder->EncodeStream( + output, my_aux_out, kLayerControlFields, + ModularStreamId::ACMetadata(group_index))); + } + }; + JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, frame_dim.num_dc_groups, + resize_aux_outs, process_dc_group, + "EncodeDCGroup")); + + if (frame_header->encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR(lossy_frame_encoder.EncodeGlobalACInfo( + get_output(global_ac_index), modular_frame_encoder.get())); + } + + std::atomic num_errors{0}; + const auto process_group = [&](const uint32_t group_index, + const size_t thread) { + AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr; + + for (size_t i = 0; i < num_passes; i++) { + if (frame_header->encoding == FrameEncoding::kVarDCT) { + if (!lossy_frame_encoder.EncodeACGroup( + i, group_index, ac_group_code(i, group_index), my_aux_out)) { + num_errors.fetch_add(1, std::memory_order_relaxed); + return; + } + } + // Write all modular encoded data (color?, alpha, depth, extra channels) + if (!modular_frame_encoder->EncodeStream( + ac_group_code(i, group_index), my_aux_out, kLayerModularAcGroup, + ModularStreamId::ModularAC(group_index, i))) { + num_errors.fetch_add(1, std::memory_order_relaxed); + return; + } + } + }; + JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, num_groups, resize_aux_outs, + process_group, "EncodeGroupCoefficients")); + + // Resizing aux_outs to 0 also Assimilates the array. + static_cast(resize_aux_outs(0)); + JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0); + + for (BitWriter& bw : group_codes) { + BitWriter::Allotment allotment(&bw, 8); + bw.ZeroPadToByte(); // end of group. + allotment.ReclaimAndCharge(&bw, kLayerAC, aux_out); + } + + std::vector* permutation_ptr = nullptr; + std::vector permutation; + if (cparams.centerfirst && !(num_passes == 1 && num_groups == 1)) { + permutation_ptr = &permutation; + // Don't permute global DC/AC or DC. + permutation.resize(global_ac_index + 1); + std::iota(permutation.begin(), permutation.end(), 0); + std::vector ac_group_order(num_groups); + std::iota(ac_group_order.begin(), ac_group_order.end(), 0); + size_t group_dim = frame_dim.group_dim; + + // The center of the image is either given by parameters or chosen + // to be the middle of the image by default if center_x, center_y resp. + // are not provided. + + int64_t imag_cx; + if (cparams.center_x != static_cast(-1)) { + JXL_RETURN_IF_ERROR(cparams.center_x < ib.xsize()); + imag_cx = cparams.center_x; + } else { + imag_cx = ib.xsize() / 2; + } + + int64_t imag_cy; + if (cparams.center_y != static_cast(-1)) { + JXL_RETURN_IF_ERROR(cparams.center_y < ib.ysize()); + imag_cy = cparams.center_y; + } else { + imag_cy = ib.ysize() / 2; + } + + // The center of the group containing the center of the image. + int64_t cx = (imag_cx / group_dim) * group_dim + group_dim / 2; + int64_t cy = (imag_cy / group_dim) * group_dim + group_dim / 2; + // This identifies in what area of the central group the center of the image + // lies in. + double direction = -std::atan2(imag_cy - cy, imag_cx - cx); + // This identifies the side of the central group the center of the image + // lies closest to. This can take values 0, 1, 2, 3 corresponding to left, + // bottom, right, top. + int64_t side = std::fmod((direction + 5 * kPi / 4), 2 * kPi) * 2 / kPi; + auto get_distance_from_center = [&](size_t gid) { + Rect r = passes_enc_state->shared.GroupRect(gid); + int64_t gcx = r.x0() + group_dim / 2; + int64_t gcy = r.y0() + group_dim / 2; + int64_t dx = gcx - cx; + int64_t dy = gcy - cy; + // The angle is determined by taking atan2 and adding an appropriate + // starting point depending on the side we want to start on. + double angle = std::remainder( + std::atan2(dy, dx) + kPi / 4 + side * (kPi / 2), 2 * kPi); + // Concentric squares in clockwise order. + return std::make_pair(std::max(std::abs(dx), std::abs(dy)), angle); + }; + std::sort(ac_group_order.begin(), ac_group_order.end(), + [&](coeff_order_t a, coeff_order_t b) { + return get_distance_from_center(a) < + get_distance_from_center(b); + }); + std::vector inv_ac_group_order(ac_group_order.size(), 0); + for (size_t i = 0; i < ac_group_order.size(); i++) { + inv_ac_group_order[ac_group_order[i]] = i; + } + for (size_t i = 0; i < num_passes; i++) { + size_t pass_start = permutation.size(); + for (coeff_order_t v : inv_ac_group_order) { + permutation.push_back(pass_start + v); + } + } + std::vector new_group_codes(group_codes.size()); + for (size_t i = 0; i < permutation.size(); i++) { + new_group_codes[permutation[i]] = std::move(group_codes[i]); + } + group_codes = std::move(new_group_codes); + } + + JXL_RETURN_IF_ERROR( + WriteGroupOffsets(group_codes, permutation_ptr, writer, aux_out)); + writer->AppendByteAligned(group_codes); + + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_frame.h b/third_party/jpeg-xl/lib/jxl/enc_frame.h new file mode 100644 index 0000000000..b1dc637eb0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_frame.h @@ -0,0 +1,78 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_FRAME_H_ +#define LIB_JXL_ENC_FRAME_H_ + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +struct AuxOut; + +// Information needed for encoding a frame that is not contained elsewhere and +// does not belong to `cparams`. +// TODO(lode): if possible, it might be better to replace FrameInfo and several +// fields from ImageBundle (such as frame name and duration) by direct usage of +// jxl::FrameHeader itself. +struct FrameInfo { + // TODO(veluca): consider adding more parameters, such as custom patches. + bool save_before_color_transform = false; + // Whether or not the input image bundle is already in the codestream + // colorspace (as deduced by cparams). + // TODO(veluca): this is a hack - ImageBundle doesn't have a simple way to say + // "this is already in XYB". + bool ib_needs_color_transform = true; + FrameType frame_type = FrameType::kRegularFrame; + size_t dc_level = 0; + // Only used for kRegularFrame. + bool is_last = true; + bool is_preview = false; + // Information for storing this frame for future use (only for non-DC frames). + size_t save_as_reference = 0; + // The source frame for blending of a next frame, matching the + // save_as_reference value of a previous frame. Animated frames can use + // save_as_reference values 1, 2 and 3, while composite still frames can use + // save_as_reference values 0, 1, 2 and 3. The current C++ encoder + // implementation is assuming and using 1 for all frames of animations, so + // using that as the default value here. + // Corresponds to BlendingInfo::source from the FrameHeader. + size_t source = 1; + // Corresponds to BlendingInfo::clamp from the FrameHeader. + size_t clamp = 1; + // Corresponds to BlendingInfo::alpha_channel from the FrameHeader, or set to + // -1 to automatically choose it as the index of the first extra channel of + // type alpha. + int alpha_channel = -1; + + // If non-empty, uses this blending info for the extra channels, otherwise + // automatically chooses it. The encoder API will fill this vector with the + // extra channel info and allows more options. The non-API cjxl leaves it + // empty and relies on the default behavior. + std::vector extra_channel_blending_info; +}; + +// Checks and adjusts CompressParams when they are all initialized. +Status ParamsPostInit(CompressParams* p); + +// Encodes a single frame (including its header) into a byte stream. Groups may +// be processed in parallel by `pool`. metadata is the ImageMetadata encoded in +// the codestream, and must be used for the FrameHeaders, do not use +// ib.metadata. +Status EncodeFrame(const CompressParams& cparams_orig, + const FrameInfo& frame_info, const CodecMetadata* metadata, + const ImageBundle& ib, PassesEncoderState* passes_enc_state, + const JxlCmsInterface& cms, ThreadPool* pool, + BitWriter* writer, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_FRAME_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc b/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc new file mode 100644 index 0000000000..d57bb68b7f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc @@ -0,0 +1,61 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_gaborish.h" + +#include + +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool) { + WeightsSymmetric5 weights[3]; + // Only an approximation. One or even two 3x3, and rank-1 (separable) 5x5 + // are insufficient. The numbers here have been obtained by butteraugli + // based optimizing the whole system and the errors produced are likely + // more favorable for good rate-distortion compromises rather than + // just using mathematical optimization to find the inverse. + static const float kGaborish[5] = { + -0.090881924078487886f, -0.043663953593472138f, 0.01392497846646211f, + 0.0036189602184591141f, 0.0030557936884763499f}; + for (int i = 0; i < 3; ++i) { + double sum = 1.0 + mul[i] * 4 * + (kGaborish[0] + kGaborish[1] + kGaborish[2] + + kGaborish[4] + 2 * kGaborish[3]); + if (sum < 1e-5) { + sum = 1e-5; + } + const float normalize = static_cast(1.0 / sum); + const float normalize_mul = mul[i] * normalize; + weights[i] = WeightsSymmetric5{{HWY_REP4(normalize)}, + {HWY_REP4(normalize_mul * kGaborish[0])}, + {HWY_REP4(normalize_mul * kGaborish[2])}, + {HWY_REP4(normalize_mul * kGaborish[1])}, + {HWY_REP4(normalize_mul * kGaborish[4])}, + {HWY_REP4(normalize_mul * kGaborish[3])}}; + } + // Reduce memory footprint by only allocating a single plane and swapping it + // into the output Image3F. Better still would be tiling. + // Note that we cannot *allocate* a plane, as doing so might cause Image3F to + // have planes of different stride. Instead, we copy one plane in a temporary + // image and reuse the existing planes of the in/out image. + ImageF temp = CopyImage(in_out->Plane(2)); + Symmetric5(in_out->Plane(0), Rect(*in_out), weights[0], pool, + &in_out->Plane(2)); + Symmetric5(in_out->Plane(1), Rect(*in_out), weights[1], pool, + &in_out->Plane(0)); + Symmetric5(temp, Rect(*in_out), weights[2], pool, &in_out->Plane(1)); + // Now planes are 1, 2, 0. + in_out->Plane(0).Swap(in_out->Plane(1)); + // 2 1 0 + in_out->Plane(0).Swap(in_out->Plane(2)); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_gaborish.h b/third_party/jpeg-xl/lib/jxl/enc_gaborish.h new file mode 100644 index 0000000000..102064f9a2 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish.h @@ -0,0 +1,26 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_GABORISH_H_ +#define LIB_JXL_GABORISH_H_ + +// Linear smoothing (3x3 convolution) for deblocking without too much blur. + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/image.h" + +namespace jxl { + +// Used in encoder to reduce the impact of the decoder's smoothing. +// This is not exact. Works in-place to reduce memory use. +// The input is typically in XYB space. +void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool); + +} // namespace jxl + +#endif // LIB_JXL_GABORISH_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc b/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc new file mode 100644 index 0000000000..57a18e3338 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc @@ -0,0 +1,77 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_gaborish.h" + +#include + +#include "lib/jxl/convolve.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +// weight1,2 need not be normalized. +WeightsSymmetric3 GaborishKernel(float weight1, float weight2) { + constexpr float weight0 = 1.0f; + + // Normalize + const float mul = 1.0f / (weight0 + 4 * (weight1 + weight2)); + const float w0 = weight0 * mul; + const float w1 = weight1 * mul; + const float w2 = weight2 * mul; + + const WeightsSymmetric3 w = {{HWY_REP4(w0)}, {HWY_REP4(w1)}, {HWY_REP4(w2)}}; + return w; +} + +void ConvolveGaborish(const ImageF& in, float weight1, float weight2, + ThreadPool* pool, ImageF* JXL_RESTRICT out) { + JXL_CHECK(SameSize(in, *out)); + Symmetric3(in, Rect(in), GaborishKernel(weight1, weight2), pool, out); +} + +void TestRoundTrip(const Image3F& in, float max_l1) { + Image3F fwd(in.xsize(), in.ysize()); + ThreadPool* null_pool = nullptr; + ConvolveGaborish(in.Plane(0), 0, 0, null_pool, &fwd.Plane(0)); + ConvolveGaborish(in.Plane(1), 0, 0, null_pool, &fwd.Plane(1)); + ConvolveGaborish(in.Plane(2), 0, 0, null_pool, &fwd.Plane(2)); + float w = 0.92718927264540152f; + float weights[3] = { + w, + w, + w, + }; + GaborishInverse(&fwd, weights, null_pool); + JXL_ASSERT_OK(VerifyRelativeError(in, fwd, max_l1, 1E-4f, _)); +} + +TEST(GaborishTest, TestZero) { + Image3F in(20, 20); + ZeroFillImage(&in); + TestRoundTrip(in, 0.0f); +} + +// Disabled: large difference. +#if 0 +TEST(GaborishTest, TestDirac) { + Image3F in(20, 20); + ZeroFillImage(&in); + in.PlaneRow(1, 10)[10] = 10.0f; + TestRoundTrip(in, 0.26f); +} +#endif + +TEST(GaborishTest, TestFlat) { + Image3F in(20, 20); + FillImage(1.0f, &in); + TestRoundTrip(in, 1E-5f); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h b/third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h new file mode 100644 index 0000000000..0db7012bbe --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h @@ -0,0 +1,36 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_GAMMA_CORRECT_H_ +#define LIB_JXL_ENC_GAMMA_CORRECT_H_ + +// Deprecated: sRGB transfer function. Use color_management.h instead. + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/transfer_functions-inl.h" + +namespace jxl { + +// Values are in [0, 1]. +static JXL_INLINE double Srgb8ToLinearDirect(double srgb) { + if (srgb <= 0.0) return 0.0; + if (srgb <= 0.04045) return srgb / 12.92; + if (srgb >= 1.0) return 1.0; + return std::pow((srgb + 0.055) / 1.055, 2.4); +} + +// Values are in [0, 1]. +static JXL_INLINE double LinearToSrgb8Direct(double linear) { + if (linear <= 0.0) return 0.0; + if (linear >= 1.0) return 1.0; + if (linear <= 0.0031308) return linear * 12.92; + return std::pow(linear, 1.0 / 2.4) * 1.055 - 0.055; +} + +} // namespace jxl + +#endif // LIB_JXL_ENC_GAMMA_CORRECT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_group.cc b/third_party/jpeg-xl/lib/jxl/enc_group.cc new file mode 100644 index 0000000000..074cf1553a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_group.cc @@ -0,0 +1,426 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_group.h" + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_group.cc" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_transforms-inl.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/image.h" +#include "lib/jxl/quantizer-inl.h" +#include "lib/jxl/quantizer.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Abs; +using hwy::HWY_NAMESPACE::Ge; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::IfThenElseZero; +using hwy::HWY_NAMESPACE::MaskFromVec; +using hwy::HWY_NAMESPACE::Round; + +// NOTE: caller takes care of extracting quant from rect of RawQuantField. +void QuantizeBlockAC(const Quantizer& quantizer, const bool error_diffusion, + size_t c, float qm_multiplier, size_t quant_kind, + size_t xsize, size_t ysize, float* thresholds, + const float* JXL_RESTRICT block_in, int32_t* quant, + int32_t* JXL_RESTRICT block_out) { + PROFILER_FUNC; + const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); + float qac = quantizer.Scale() * (*quant); + // Not SIMD-fied for now. + if (c != 1 && (xsize > 1 || ysize > 1)) { + for (int i = 0; i < 4; ++i) { + thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f); + if (thresholds[i] < 0.54) { + thresholds[i] = 0.54; + } + } + } + HWY_CAPPED(float, kBlockDim) df; + HWY_CAPPED(int32_t, kBlockDim) di; + HWY_CAPPED(uint32_t, kBlockDim) du; + const auto quantv = Set(df, qac * qm_multiplier); + for (size_t y = 0; y < ysize * kBlockDim; y++) { + size_t yfix = static_cast(y >= ysize * kBlockDim / 2) * 2; + const size_t off = y * kBlockDim * xsize; + for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) { + auto thr = Zero(df); + if (xsize == 1) { + HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u}; + const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x))); + thr = IfThenElse(mask, Set(df, thresholds[yfix + 1]), + Set(df, thresholds[yfix])); + } else { + // Same for all lanes in the vector. + thr = Set( + df, + thresholds[yfix + static_cast(x >= xsize * kBlockDim / 2)]); + } + const auto q = Mul(Load(df, qm + off + x), quantv); + const auto in = Load(df, block_in + off + x); + const auto val = Mul(q, in); + const auto nzero_mask = Ge(Abs(val), thr); + const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val))); + Store(v, di, block_out + off + x); + } + } +} + +void AdjustQuantBlockAC(const Quantizer& quantizer, size_t c, + float qm_multiplier, size_t quant_kind, size_t xsize, + size_t ysize, float* thresholds, + const float* JXL_RESTRICT block_in, int32_t* quant) { + // No quantization adjusting for these small blocks. + // Quantization adjusting attempts to fix some known issues + // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness + // when there are not many non-zeros. + constexpr size_t kPartialBlockKinds = + (1 << AcStrategy::Type::IDENTITY) | (1 << AcStrategy::Type::DCT2X2) | + (1 << AcStrategy::Type::DCT4X4) | (1 << AcStrategy::Type::DCT4X8) | + (1 << AcStrategy::Type::DCT8X4) | (1 << AcStrategy::Type::AFV0) | + (1 << AcStrategy::Type::AFV1) | (1 << AcStrategy::Type::AFV2) | + (1 << AcStrategy::Type::AFV3); + if ((1 << quant_kind) & kPartialBlockKinds) return; + + const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); + float qac = quantizer.Scale() * (*quant); + if (xsize > 1 || ysize > 1) { + for (int i = 0; i < 4; ++i) { + thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f); + if (thresholds[i] < 0.54) { + thresholds[i] = 0.54; + } + } + } + float sum_of_highest_freq_row_and_column = 0; + float hfNonZeros[4] = {}; + float hfMaxError[4] = {}; + + for (size_t y = 0; y < ysize * kBlockDim; y++) { + for (size_t x = 0; x < xsize * kBlockDim; x++) { + const size_t pos = y * kBlockDim * xsize + x; + if (x < xsize && y < ysize) { + continue; + } + const size_t hfix = (static_cast(y >= ysize * kBlockDim / 2) * 2 + + static_cast(x >= xsize * kBlockDim / 2)); + const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier); + const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val); + if (c == 1 && v == 0) { + const float error = std::abs(val); + if (hfMaxError[hfix] < error) { + hfMaxError[hfix] = error; + } + } + if (v != 0.0f) { + hfNonZeros[hfix] += std::abs(v); + if ((y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1) && + (x >= xsize * 4 && y >= ysize * 4)) { + sum_of_highest_freq_row_and_column += std::abs(val); + } + } + } + } + if (c == 1) { + static const double kLimit = 0.49f; + for (int i = 1; i < 4; ++i) { + if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit) { + thresholds[i] = 0.9999 * hfMaxError[i]; + } + } + } + // Heuristic for improving accuracy of high-frequency patterns + // occurring in an environment with no medium-frequency masking + // patterns. This should be improved later to be done in X and B + // planes too as 32x32 and larger transforms become rather ugly + // when this is not compensated for. + if (15 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) { + constexpr int inc = 5; + *quant += inc; + if (8 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) { + *quant += inc; + } + if (5 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) { + *quant += inc; + } + if (3 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) { + *quant += inc; + } + if (*quant >= Quantizer::kQuantMax) { + *quant = Quantizer::kQuantMax - 1; + } + } + if (quant_kind == AcStrategy::Type::DCT) { + // If this 8x8 block is too flat, increase the adaptive quantization level + // a bit to reduce visible block boundaries and requantize the block. + if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) { + *quant += 1; + if (*quant >= Quantizer::kQuantMax) { + *quant = Quantizer::kQuantMax - 1; + } + } + } + { + // Reduce quant in highly active areas. + int32_t div = (xsize + ysize) / 2; + int32_t activity = (hfNonZeros[0] + div / 2) / div; + int32_t orig_qp_limit = std::max(4, *quant / 2); + for (int i = 1; i < 4; ++i) { + activity = std::min(activity, (hfNonZeros[i] + div / 2) / div); + } + if (activity >= 15) { + activity = 15; + } + int32_t qp = *quant - activity; + if (qp < orig_qp_limit) { + qp = orig_qp_limit; + } + *quant = qp; + } +} + +// NOTE: caller takes care of extracting quant from rect of RawQuantField. +void QuantizeRoundtripYBlockAC(PassesEncoderState* enc_state, const size_t size, + const Quantizer& quantizer, + const bool error_diffusion, size_t quant_kind, + size_t xsize, size_t ysize, + const float* JXL_RESTRICT biases, int32_t* quant, + float* JXL_RESTRICT inout, + int32_t* JXL_RESTRICT quantized) { + float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f}; + { + int32_t max_quant = 0; + int quant_orig = *quant; + float val[3] = {enc_state->x_qm_multiplier, 1.0f, + enc_state->b_qm_multiplier}; + int clut[3] = {1, 0, 2}; + for (int ii = 0; ii < 3; ++ii) { + float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f}; + int c = clut[ii]; + *quant = quant_orig; + AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize, + &thres[0], inout + c * size, quant); + // Dead zone adjustment + if (c == 1) { + for (int k = 0; k < 4; ++k) { + thres_y[k] = thres[k]; + } + } + max_quant = std::max(*quant, max_quant); + } + *quant = max_quant; + } + + QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize, + &thres_y[0], inout + size, quant, quantized + size); + + PROFILER_ZONE("enc quant adjust bias"); + const float* JXL_RESTRICT dequant_matrix = + quantizer.DequantMatrix(quant_kind, 1); + + HWY_CAPPED(float, kDCTBlockSize) df; + HWY_CAPPED(int32_t, kDCTBlockSize) di; + const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant)); + for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) { + const auto quant = Load(di, quantized + size + k); + const auto adj_quant = AdjustQuantBias(di, 1, quant, biases); + const auto dequantm = Load(df, dequant_matrix + k); + Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k); + } +} + +void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state, + const Image3F& opsin, Image3F* dc) { + PROFILER_FUNC; + const Rect block_group_rect = enc_state->shared.BlockGroupRect(group_idx); + const Rect group_rect = enc_state->shared.GroupRect(group_idx); + const Rect cmap_rect( + block_group_rect.x0() / kColorTileDimInBlocks, + block_group_rect.y0() / kColorTileDimInBlocks, + DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks), + DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks)); + + const size_t xsize_blocks = block_group_rect.xsize(); + const size_t ysize_blocks = block_group_rect.ysize(); + + const size_t dc_stride = static_cast(dc->PixelsPerRow()); + const size_t opsin_stride = static_cast(opsin.PixelsPerRow()); + + ImageI& full_quant_field = enc_state->shared.raw_quant_field; + const CompressParams& cparams = enc_state->cparams; + + // TODO(veluca): consider strategies to reduce this memory. + auto mem = hwy::AllocateAligned(3 * AcStrategy::kMaxCoeffArea); + auto fmem = hwy::AllocateAligned(5 * AcStrategy::kMaxCoeffArea); + float* JXL_RESTRICT scratch_space = + fmem.get() + 3 * AcStrategy::kMaxCoeffArea; + { + // Only use error diffusion in Squirrel mode or slower. + const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel; + constexpr HWY_CAPPED(float, kDCTBlockSize) d; + + int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {}; + size_t num_passes = enc_state->progressive_splitter.GetNumPasses(); + JXL_DASSERT(num_passes > 0); + for (size_t i = 0; i < num_passes; i++) { + // TODO(veluca): 16-bit quantized coeffs are not implemented yet. + JXL_ASSERT(enc_state->coeffs[i]->Type() == ACType::k32); + for (size_t c = 0; c < 3; c++) { + coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32; + } + } + + HWY_ALIGN float* coeffs_in = fmem.get(); + HWY_ALIGN int32_t* quantized = mem.get(); + + for (size_t by = 0; by < ysize_blocks; ++by) { + int32_t* JXL_RESTRICT row_quant_ac = + block_group_rect.Row(&full_quant_field, by); + size_t ty = by / kColorTileDimInBlocks; + const int8_t* JXL_RESTRICT row_cmap[3] = { + cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty), + nullptr, + cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty), + }; + const float* JXL_RESTRICT opsin_rows[3] = { + group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim), + group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim), + group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim), + }; + float* JXL_RESTRICT dc_rows[3] = { + block_group_rect.PlaneRow(dc, 0, by), + block_group_rect.PlaneRow(dc, 1, by), + block_group_rect.PlaneRow(dc, 2, by), + }; + AcStrategyRow ac_strategy_row = + enc_state->shared.ac_strategy.ConstRow(block_group_rect, by); + for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks); + tx++) { + const auto x_factor = + Set(d, enc_state->shared.cmap.YtoXRatio(row_cmap[0][tx])); + const auto b_factor = + Set(d, enc_state->shared.cmap.YtoBRatio(row_cmap[2][tx])); + for (size_t bx = tx * kColorTileDimInBlocks; + bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) { + const AcStrategy acs = ac_strategy_row[bx]; + if (!acs.IsFirstBlock()) continue; + + size_t xblocks = acs.covered_blocks_x(); + size_t yblocks = acs.covered_blocks_y(); + + CoefficientLayout(&yblocks, &xblocks); + + size_t size = kDCTBlockSize * xblocks * yblocks; + + // DCT Y channel, roundtrip-quantize it and set DC. + int32_t quant_ac = row_quant_ac[bx]; + for (size_t c : {0, 1, 2}) { + TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim, + opsin_stride, coeffs_in + c * size, + scratch_space); + } + DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size, + dc_rows[1] + bx, dc_stride); + + QuantizeRoundtripYBlockAC( + enc_state, size, enc_state->shared.quantizer, error_diffusion, + acs.RawStrategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac, + coeffs_in, quantized); + + // Unapply color correlation + for (size_t k = 0; k < size; k += Lanes(d)) { + const auto in_x = Load(d, coeffs_in + k); + const auto in_y = Load(d, coeffs_in + size + k); + const auto in_b = Load(d, coeffs_in + 2 * size + k); + const auto out_x = NegMulAdd(x_factor, in_y, in_x); + const auto out_b = NegMulAdd(b_factor, in_y, in_b); + Store(out_x, d, coeffs_in + k); + Store(out_b, d, coeffs_in + 2 * size + k); + } + + // Quantize X and B channels and set DC. + for (size_t c : {0, 2}) { + float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f}; + QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c, + c == 0 ? enc_state->x_qm_multiplier + : enc_state->b_qm_multiplier, + acs.RawStrategy(), xblocks, yblocks, &thres[0], + coeffs_in + c * size, &quant_ac, + quantized + c * size); + DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size, + dc_rows[c] + bx, dc_stride); + } + row_quant_ac[bx] = quant_ac; + for (size_t c = 0; c < 3; c++) { + enc_state->progressive_splitter.SplitACCoefficients( + quantized + c * size, acs, bx, by, coeffs[c]); + for (size_t p = 0; p < num_passes; p++) { + coeffs[c][p] += size; + } + } + } + } + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ComputeCoefficients); +void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state, + const Image3F& opsin, Image3F* dc) { + return HWY_DYNAMIC_DISPATCH(ComputeCoefficients)(group_idx, enc_state, opsin, + dc); +} + +Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx, + size_t histogram_idx, + const PassesEncoderState& enc_state, + BitWriter* writer, AuxOut* aux_out) { + // Select which histogram to use among those of the current pass. + const size_t num_histograms = enc_state.shared.num_histograms; + // num_histograms is 0 only for lossless. + JXL_ASSERT(num_histograms == 0 || histogram_idx < num_histograms); + size_t histo_selector_bits = CeilLog2Nonzero(num_histograms); + + if (histo_selector_bits != 0) { + BitWriter::Allotment allotment(writer, histo_selector_bits); + writer->Write(histo_selector_bits, histogram_idx); + allotment.ReclaimAndCharge(writer, kLayerAC, aux_out); + } + WriteTokens(enc_state.passes[pass_idx].ac_tokens[group_idx], + enc_state.passes[pass_idx].codes, + enc_state.passes[pass_idx].context_map, writer, kLayerACTokens, + aux_out); + + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/enc_group.h b/third_party/jpeg-xl/lib/jxl/enc_group.h new file mode 100644 index 0000000000..0caf408a03 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_group.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_GROUP_H_ +#define LIB_JXL_ENC_GROUP_H_ + +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/image.h" + +namespace jxl { + +struct AuxOut; +struct PassesEncoderState; + +// Fills DC +void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state, + const Image3F& opsin, Image3F* dc); + +Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx, + size_t histogram_idx, + const PassesEncoderState& enc_state, + BitWriter* writer, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_GROUP_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc b/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc new file mode 100644 index 0000000000..18122fa769 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc @@ -0,0 +1,948 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_heuristics.h" + +#include +#include + +#include +#include +#include + +#include "lib/jxl/enc_ac_strategy.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ar_control_field.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_chroma_from_luma.h" +#include "lib/jxl/enc_gaborish.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_xyb.h" + +namespace jxl { + +struct AuxOut; + +namespace { +void FindBestBlockEntropyModel(PassesEncoderState& enc_state) { + if (enc_state.cparams.decoding_speed_tier >= 1) { + static constexpr uint8_t kSimpleCtxMap[] = { + // Cluster all blocks together + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // + }; + static_assert( + 3 * kNumOrders == sizeof(kSimpleCtxMap) / sizeof *kSimpleCtxMap, + "Update simple context map"); + + auto bcm = enc_state.shared.block_ctx_map; + bcm.ctx_map.assign(std::begin(kSimpleCtxMap), std::end(kSimpleCtxMap)); + bcm.num_ctxs = 2; + bcm.num_dc_ctxs = 1; + return; + } + if (enc_state.cparams.speed_tier >= SpeedTier::kFalcon) { + return; + } + const ImageI& rqf = enc_state.shared.raw_quant_field; + // No need to change context modeling for small images. + size_t tot = rqf.xsize() * rqf.ysize(); + size_t size_for_ctx_model = + (1 << 10) * enc_state.cparams.butteraugli_distance; + if (tot < size_for_ctx_model) return; + + struct OccCounters { + // count the occurrences of each qf value and each strategy type. + OccCounters(const ImageI& rqf, const AcStrategyImage& ac_strategy) { + for (size_t y = 0; y < rqf.ysize(); y++) { + const int32_t* qf_row = rqf.Row(y); + AcStrategyRow acs_row = ac_strategy.ConstRow(y); + for (size_t x = 0; x < rqf.xsize(); x++) { + int ord = kStrategyOrder[acs_row[x].RawStrategy()]; + int qf = qf_row[x] - 1; + qf_counts[qf]++; + qf_ord_counts[ord][qf]++; + ord_counts[ord]++; + } + } + } + + size_t qf_counts[256] = {}; + size_t qf_ord_counts[kNumOrders][256] = {}; + size_t ord_counts[kNumOrders] = {}; + }; + // The OccCounters struct is too big to allocate on the stack. + std::unique_ptr counters( + new OccCounters(rqf, enc_state.shared.ac_strategy)); + + // Splitting the context model according to the quantization field seems to + // mostly benefit only large images. + size_t size_for_qf_split = (1 << 13) * enc_state.cparams.butteraugli_distance; + size_t num_qf_segments = tot < size_for_qf_split ? 1 : 2; + std::vector& qft = enc_state.shared.block_ctx_map.qf_thresholds; + qft.clear(); + // Divide the quant field in up to num_qf_segments segments. + size_t cumsum = 0; + size_t next = 1; + size_t last_cut = 256; + size_t cut = tot * next / num_qf_segments; + for (uint32_t j = 0; j < 256; j++) { + cumsum += counters->qf_counts[j]; + if (cumsum > cut) { + if (j != 0) { + qft.push_back(j); + } + last_cut = j; + while (cumsum > cut) { + next++; + cut = tot * next / num_qf_segments; + } + } else if (next > qft.size() + 1) { + if (j - 1 == last_cut && j != 0) { + qft.push_back(j); + } + } + } + + // Count the occurrences of each segment. + std::vector counts(kNumOrders * (qft.size() + 1)); + size_t qft_pos = 0; + for (size_t j = 0; j < 256; j++) { + if (qft_pos < qft.size() && j == qft[qft_pos]) { + qft_pos++; + } + for (size_t i = 0; i < kNumOrders; i++) { + counts[qft_pos + i * (qft.size() + 1)] += counters->qf_ord_counts[i][j]; + } + } + + // Repeatedly merge the lowest-count pair. + std::vector remap((qft.size() + 1) * kNumOrders); + std::iota(remap.begin(), remap.end(), 0); + std::vector clusters(remap); + size_t nb_clusters = Clamp1((int)(tot / size_for_ctx_model / 2), 2, 9); + size_t nb_clusters_chroma = Clamp1((int)(tot / size_for_ctx_model / 3), 1, 5); + // This is O(n^2 log n), but n is small. + while (clusters.size() > nb_clusters) { + std::sort(clusters.begin(), clusters.end(), + [&](int a, int b) { return counts[a] > counts[b]; }); + counts[clusters[clusters.size() - 2]] += counts[clusters.back()]; + counts[clusters.back()] = 0; + remap[clusters.back()] = clusters[clusters.size() - 2]; + clusters.pop_back(); + } + for (size_t i = 0; i < remap.size(); i++) { + while (remap[remap[i]] != remap[i]) { + remap[i] = remap[remap[i]]; + } + } + // Relabel starting from 0. + std::vector remap_remap(remap.size(), remap.size()); + size_t num = 0; + for (size_t i = 0; i < remap.size(); i++) { + if (remap_remap[remap[i]] == remap.size()) { + remap_remap[remap[i]] = num++; + } + remap[i] = remap_remap[remap[i]]; + } + // Write the block context map. + auto& ctx_map = enc_state.shared.block_ctx_map.ctx_map; + ctx_map = remap; + ctx_map.resize(remap.size() * 3); + // for chroma, only use up to nb_clusters_chroma separate block contexts + // (those for the biggest clusters) + for (size_t i = remap.size(); i < remap.size() * 3; i++) { + ctx_map[i] = num + Clamp1((int)remap[i % remap.size()], 0, + (int)nb_clusters_chroma - 1); + } + enc_state.shared.block_ctx_map.num_ctxs = + *std::max_element(ctx_map.begin(), ctx_map.end()) + 1; +} + +} // namespace + +void FindBestDequantMatrices(const CompressParams& cparams, + const Image3F& opsin, + ModularFrameEncoder* modular_frame_encoder, + DequantMatrices* dequant_matrices) { + // TODO(veluca): quant matrices for no-gaborish. + // TODO(veluca): heuristics for in-bitstream quant tables. + *dequant_matrices = DequantMatrices(); + if (cparams.max_error_mode) { + // Set numerators of all quantization matrices to constant values. + float weights[3][1] = {{1.0f / cparams.max_error[0]}, + {1.0f / cparams.max_error[1]}, + {1.0f / cparams.max_error[2]}}; + DctQuantWeightParams dct_params(weights); + std::vector encodings(DequantMatrices::kNum, + QuantEncoding::DCT(dct_params)); + DequantMatricesSetCustom(dequant_matrices, encodings, + modular_frame_encoder); + float dc_weights[3] = {1.0f / cparams.max_error[0], + 1.0f / cparams.max_error[1], + 1.0f / cparams.max_error[2]}; + DequantMatricesSetCustomDC(dequant_matrices, dc_weights); + } +} + +bool DefaultEncoderHeuristics::HandlesColorConversion( + const CompressParams& cparams, const ImageBundle& ib) { + return cparams.noise != Override::kOn && cparams.patches != Override::kOn && + cparams.speed_tier >= SpeedTier::kWombat && cparams.resampling == 1 && + cparams.color_transform == ColorTransform::kXYB && + !cparams.modular_mode && !ib.HasAlpha(); +} + +namespace { + +void StoreMin2(const float v, float& min1, float& min2) { + if (v < min2) { + if (v < min1) { + min2 = min1; + min1 = v; + } else { + min2 = v; + } + } +} + +void CreateMask(const ImageF& image, ImageF& mask) { + for (size_t y = 0; y < image.ysize(); y++) { + auto* row_n = y > 0 ? image.Row(y - 1) : image.Row(y); + auto* row_in = image.Row(y); + auto* row_s = y + 1 < image.ysize() ? image.Row(y + 1) : image.Row(y); + auto* row_out = mask.Row(y); + for (size_t x = 0; x < image.xsize(); x++) { + // Center, west, east, north, south values and their absolute difference + float c = row_in[x]; + float w = x > 0 ? row_in[x - 1] : row_in[x]; + float e = x + 1 < image.xsize() ? row_in[x + 1] : row_in[x]; + float n = row_n[x]; + float s = row_s[x]; + float dw = std::abs(c - w); + float de = std::abs(c - e); + float dn = std::abs(c - n); + float ds = std::abs(c - s); + float min = std::numeric_limits::max(); + float min2 = std::numeric_limits::max(); + StoreMin2(dw, min, min2); + StoreMin2(de, min, min2); + StoreMin2(dn, min, min2); + StoreMin2(ds, min, min2); + row_out[x] = min2; + } + } +} + +// Downsamples the image by a factor of 2 with a kernel that's sharper than +// the standard 2x2 box kernel used by DownsampleImage. +// The kernel is optimized against the result of the 2x2 upsampling kernel used +// by the decoder. Ringing is slightly reduced by clamping the values of the +// resulting pixels within certain bounds of a small region in the original +// image. +void DownsampleImage2_Sharper(const ImageF& input, ImageF* output) { + const int64_t kernelx = 12; + const int64_t kernely = 12; + + static const float kernel[144] = { + -0.000314256996835, -0.000314256996835, -0.000897597057705, + -0.000562751488849, -0.000176807273646, 0.001864627368902, + 0.001864627368902, -0.000176807273646, -0.000562751488849, + -0.000897597057705, -0.000314256996835, -0.000314256996835, + -0.000314256996835, -0.001527942804748, -0.000121760530512, + 0.000191123989093, 0.010193185932466, 0.058637519197110, + 0.058637519197110, 0.010193185932466, 0.000191123989093, + -0.000121760530512, -0.001527942804748, -0.000314256996835, + -0.000897597057705, -0.000121760530512, 0.000946363683751, + 0.007113577630288, 0.000437956841058, -0.000372823835211, + -0.000372823835211, 0.000437956841058, 0.007113577630288, + 0.000946363683751, -0.000121760530512, -0.000897597057705, + -0.000562751488849, 0.000191123989093, 0.007113577630288, + 0.044592622228814, 0.000222278879007, -0.162864473015945, + -0.162864473015945, 0.000222278879007, 0.044592622228814, + 0.007113577630288, 0.000191123989093, -0.000562751488849, + -0.000176807273646, 0.010193185932466, 0.000437956841058, + 0.000222278879007, -0.000913092543974, -0.017071696107902, + -0.017071696107902, -0.000913092543974, 0.000222278879007, + 0.000437956841058, 0.010193185932466, -0.000176807273646, + 0.001864627368902, 0.058637519197110, -0.000372823835211, + -0.162864473015945, -0.017071696107902, 0.414660099370354, + 0.414660099370354, -0.017071696107902, -0.162864473015945, + -0.000372823835211, 0.058637519197110, 0.001864627368902, + 0.001864627368902, 0.058637519197110, -0.000372823835211, + -0.162864473015945, -0.017071696107902, 0.414660099370354, + 0.414660099370354, -0.017071696107902, -0.162864473015945, + -0.000372823835211, 0.058637519197110, 0.001864627368902, + -0.000176807273646, 0.010193185932466, 0.000437956841058, + 0.000222278879007, -0.000913092543974, -0.017071696107902, + -0.017071696107902, -0.000913092543974, 0.000222278879007, + 0.000437956841058, 0.010193185932466, -0.000176807273646, + -0.000562751488849, 0.000191123989093, 0.007113577630288, + 0.044592622228814, 0.000222278879007, -0.162864473015945, + -0.162864473015945, 0.000222278879007, 0.044592622228814, + 0.007113577630288, 0.000191123989093, -0.000562751488849, + -0.000897597057705, -0.000121760530512, 0.000946363683751, + 0.007113577630288, 0.000437956841058, -0.000372823835211, + -0.000372823835211, 0.000437956841058, 0.007113577630288, + 0.000946363683751, -0.000121760530512, -0.000897597057705, + -0.000314256996835, -0.001527942804748, -0.000121760530512, + 0.000191123989093, 0.010193185932466, 0.058637519197110, + 0.058637519197110, 0.010193185932466, 0.000191123989093, + -0.000121760530512, -0.001527942804748, -0.000314256996835, + -0.000314256996835, -0.000314256996835, -0.000897597057705, + -0.000562751488849, -0.000176807273646, 0.001864627368902, + 0.001864627368902, -0.000176807273646, -0.000562751488849, + -0.000897597057705, -0.000314256996835, -0.000314256996835}; + + int64_t xsize = input.xsize(); + int64_t ysize = input.ysize(); + + ImageF box_downsample = CopyImage(input); + DownsampleImage(&box_downsample, 2); + + ImageF mask(box_downsample.xsize(), box_downsample.ysize()); + CreateMask(box_downsample, mask); + + for (size_t y = 0; y < output->ysize(); y++) { + float* row_out = output->Row(y); + const float* row_in[kernely]; + const float* row_mask = mask.Row(y); + // get the rows in the support + for (size_t ky = 0; ky < kernely; ky++) { + int64_t iy = y * 2 + ky - (kernely - 1) / 2; + if (iy < 0) iy = 0; + if (iy >= ysize) iy = ysize - 1; + row_in[ky] = input.Row(iy); + } + + for (size_t x = 0; x < output->xsize(); x++) { + // get min and max values of the original image in the support + float min = std::numeric_limits::max(); + float max = std::numeric_limits::min(); + // kernelx - R and kernely - R are the radius of a rectangular region in + // which the values of a pixel are bounded to reduce ringing. + static constexpr int64_t R = 5; + for (int64_t ky = R; ky + R < kernely; ky++) { + for (int64_t kx = R; kx + R < kernelx; kx++) { + int64_t ix = x * 2 + kx - (kernelx - 1) / 2; + if (ix < 0) ix = 0; + if (ix >= xsize) ix = xsize - 1; + min = std::min(min, row_in[ky][ix]); + max = std::max(max, row_in[ky][ix]); + } + } + + float sum = 0; + for (int64_t ky = 0; ky < kernely; ky++) { + for (int64_t kx = 0; kx < kernelx; kx++) { + int64_t ix = x * 2 + kx - (kernelx - 1) / 2; + if (ix < 0) ix = 0; + if (ix >= xsize) ix = xsize - 1; + sum += row_in[ky][ix] * kernel[ky * kernelx + kx]; + } + } + + row_out[x] = sum; + + // Clamp the pixel within the value of a small area to prevent ringning. + // The mask determines how much to clamp, clamp more to reduce more + // ringing in smooth areas, clamp less in noisy areas to get more + // sharpness. Higher mask_multiplier gives less clamping, so less + // ringing reduction. + const constexpr float mask_multiplier = 1; + float a = row_mask[x] * mask_multiplier; + float clip_min = min - a; + float clip_max = max + a; + if (row_out[x] < clip_min) { + row_out[x] = clip_min; + } else if (row_out[x] > clip_max) { + row_out[x] = clip_max; + } + } + } +} + +void DownsampleImage2_Sharper(Image3F* opsin) { + // Allocate extra space to avoid a reallocation when padding. + Image3F downsampled(DivCeil(opsin->xsize(), 2) + kBlockDim, + DivCeil(opsin->ysize(), 2) + kBlockDim); + downsampled.ShrinkTo(downsampled.xsize() - kBlockDim, + downsampled.ysize() - kBlockDim); + + for (size_t c = 0; c < 3; c++) { + DownsampleImage2_Sharper(opsin->Plane(c), &downsampled.Plane(c)); + } + *opsin = std::move(downsampled); +} + +// The default upsampling kernels used by Upsampler in the decoder. +static const constexpr int64_t kSize = 5; + +static const float kernel00[25] = { + -0.01716200f, -0.03452303f, -0.04022174f, -0.02921014f, -0.00624645f, + -0.03452303f, 0.14111091f, 0.28896755f, 0.00278718f, -0.01610267f, + -0.04022174f, 0.28896755f, 0.56661550f, 0.03777607f, -0.01986694f, + -0.02921014f, 0.00278718f, 0.03777607f, -0.03144731f, -0.01185068f, + -0.00624645f, -0.01610267f, -0.01986694f, -0.01185068f, -0.00213539f, +}; +static const float kernel01[25] = { + -0.00624645f, -0.01610267f, -0.01986694f, -0.01185068f, -0.00213539f, + -0.02921014f, 0.00278718f, 0.03777607f, -0.03144731f, -0.01185068f, + -0.04022174f, 0.28896755f, 0.56661550f, 0.03777607f, -0.01986694f, + -0.03452303f, 0.14111091f, 0.28896755f, 0.00278718f, -0.01610267f, + -0.01716200f, -0.03452303f, -0.04022174f, -0.02921014f, -0.00624645f, +}; +static const float kernel10[25] = { + -0.00624645f, -0.02921014f, -0.04022174f, -0.03452303f, -0.01716200f, + -0.01610267f, 0.00278718f, 0.28896755f, 0.14111091f, -0.03452303f, + -0.01986694f, 0.03777607f, 0.56661550f, 0.28896755f, -0.04022174f, + -0.01185068f, -0.03144731f, 0.03777607f, 0.00278718f, -0.02921014f, + -0.00213539f, -0.01185068f, -0.01986694f, -0.01610267f, -0.00624645f, +}; +static const float kernel11[25] = { + -0.00213539f, -0.01185068f, -0.01986694f, -0.01610267f, -0.00624645f, + -0.01185068f, -0.03144731f, 0.03777607f, 0.00278718f, -0.02921014f, + -0.01986694f, 0.03777607f, 0.56661550f, 0.28896755f, -0.04022174f, + -0.01610267f, 0.00278718f, 0.28896755f, 0.14111091f, -0.03452303f, + -0.00624645f, -0.02921014f, -0.04022174f, -0.03452303f, -0.01716200f, +}; + +// Does exactly the same as the Upsampler in dec_upsampler for 2x2 pixels, with +// default CustomTransformData. +// TODO(lode): use Upsampler instead. However, it requires pre-initialization +// and padding on the left side of the image which requires refactoring the +// other code using this. +static void UpsampleImage(const ImageF& input, ImageF* output) { + int64_t xsize = input.xsize(); + int64_t ysize = input.ysize(); + int64_t xsize2 = output->xsize(); + int64_t ysize2 = output->ysize(); + for (int64_t y = 0; y < ysize2; y++) { + for (int64_t x = 0; x < xsize2; x++) { + auto kernel = kernel00; + if ((x & 1) && (y & 1)) { + kernel = kernel11; + } else if (x & 1) { + kernel = kernel10; + } else if (y & 1) { + kernel = kernel01; + } + float sum = 0; + int64_t x2 = x / 2; + int64_t y2 = y / 2; + + // get min and max values of the original image in the support + float min = std::numeric_limits::max(); + float max = std::numeric_limits::min(); + + for (int64_t ky = 0; ky < kSize; ky++) { + for (int64_t kx = 0; kx < kSize; kx++) { + int64_t xi = x2 - kSize / 2 + kx; + int64_t yi = y2 - kSize / 2 + ky; + if (xi < 0) xi = 0; + if (xi >= xsize) xi = input.xsize() - 1; + if (yi < 0) yi = 0; + if (yi >= ysize) yi = input.ysize() - 1; + min = std::min(min, input.Row(yi)[xi]); + max = std::max(max, input.Row(yi)[xi]); + } + } + + for (int64_t ky = 0; ky < kSize; ky++) { + for (int64_t kx = 0; kx < kSize; kx++) { + int64_t xi = x2 - kSize / 2 + kx; + int64_t yi = y2 - kSize / 2 + ky; + if (xi < 0) xi = 0; + if (xi >= xsize) xi = input.xsize() - 1; + if (yi < 0) yi = 0; + if (yi >= ysize) yi = input.ysize() - 1; + sum += input.Row(yi)[xi] * kernel[ky * kSize + kx]; + } + } + output->Row(y)[x] = sum; + if (output->Row(y)[x] < min) output->Row(y)[x] = min; + if (output->Row(y)[x] > max) output->Row(y)[x] = max; + } + } +} + +// Returns the derivative of Upsampler, with respect to input pixel x2, y2, to +// output pixel x, y (ignoring the clamping). +float UpsamplerDeriv(int64_t x2, int64_t y2, int64_t x, int64_t y) { + auto kernel = kernel00; + if ((x & 1) && (y & 1)) { + kernel = kernel11; + } else if (x & 1) { + kernel = kernel10; + } else if (y & 1) { + kernel = kernel01; + } + + int64_t ix = x / 2; + int64_t iy = y / 2; + int64_t kx = x2 - ix + kSize / 2; + int64_t ky = y2 - iy + kSize / 2; + + // This should not happen. + if (kx < 0 || kx >= kSize || ky < 0 || ky >= kSize) return 0; + + return kernel[ky * kSize + kx]; +} + +// Apply the derivative of the Upsampler to the input, reversing the effect of +// its coefficients. The output image is 2x2 times smaller than the input. +void AntiUpsample(const ImageF& input, ImageF* d) { + int64_t xsize = input.xsize(); + int64_t ysize = input.ysize(); + int64_t xsize2 = d->xsize(); + int64_t ysize2 = d->ysize(); + int64_t k0 = kSize - 1; + int64_t k1 = kSize; + for (int64_t y2 = 0; y2 < ysize2; ++y2) { + auto* row = d->Row(y2); + for (int64_t x2 = 0; x2 < xsize2; ++x2) { + int64_t x0 = x2 * 2 - k0; + if (x0 < 0) x0 = 0; + int64_t x1 = x2 * 2 + k1 + 1; + if (x1 > xsize) x1 = xsize; + int64_t y0 = y2 * 2 - k0; + if (y0 < 0) y0 = 0; + int64_t y1 = y2 * 2 + k1 + 1; + if (y1 > ysize) y1 = ysize; + + float sum = 0; + for (int64_t y = y0; y < y1; ++y) { + const auto* row_in = input.Row(y); + for (int64_t x = x0; x < x1; ++x) { + double deriv = UpsamplerDeriv(x2, y2, x, y); + sum += deriv * row_in[x]; + } + } + row[x2] = sum; + } + } +} + +// Element-wise multiplies two images. +template +void ElwiseMul(const Plane& image1, const Plane& image2, Plane* out) { + const size_t xsize = image1.xsize(); + const size_t ysize = image1.ysize(); + JXL_CHECK(xsize == image2.xsize()); + JXL_CHECK(ysize == image2.ysize()); + JXL_CHECK(xsize == out->xsize()); + JXL_CHECK(ysize == out->ysize()); + for (size_t y = 0; y < ysize; ++y) { + const T* const JXL_RESTRICT row1 = image1.Row(y); + const T* const JXL_RESTRICT row2 = image2.Row(y); + T* const JXL_RESTRICT row_out = out->Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row1[x] * row2[x]; + } + } +} + +// Element-wise divides two images. +template +void ElwiseDiv(const Plane& image1, const Plane& image2, Plane* out) { + const size_t xsize = image1.xsize(); + const size_t ysize = image1.ysize(); + JXL_CHECK(xsize == image2.xsize()); + JXL_CHECK(ysize == image2.ysize()); + JXL_CHECK(xsize == out->xsize()); + JXL_CHECK(ysize == out->ysize()); + for (size_t y = 0; y < ysize; ++y) { + const T* const JXL_RESTRICT row1 = image1.Row(y); + const T* const JXL_RESTRICT row2 = image2.Row(y); + T* const JXL_RESTRICT row_out = out->Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row1[x] / row2[x]; + } + } +} + +void ReduceRinging(const ImageF& initial, const ImageF& mask, ImageF& down) { + int64_t xsize2 = down.xsize(); + int64_t ysize2 = down.ysize(); + + for (size_t y = 0; y < down.ysize(); y++) { + const float* row_mask = mask.Row(y); + float* row_out = down.Row(y); + for (size_t x = 0; x < down.xsize(); x++) { + float v = down.Row(y)[x]; + float min = initial.Row(y)[x]; + float max = initial.Row(y)[x]; + for (int64_t yi = -1; yi < 2; yi++) { + for (int64_t xi = -1; xi < 2; xi++) { + int64_t x2 = (int64_t)x + xi; + int64_t y2 = (int64_t)y + yi; + if (x2 < 0 || y2 < 0 || x2 >= (int64_t)xsize2 || + y2 >= (int64_t)ysize2) + continue; + min = std::min(min, initial.Row(y2)[x2]); + max = std::max(max, initial.Row(y2)[x2]); + } + } + + row_out[x] = v; + + // Clamp the pixel within the value of a small area to prevent ringning. + // The mask determines how much to clamp, clamp more to reduce more + // ringing in smooth areas, clamp less in noisy areas to get more + // sharpness. Higher mask_multiplier gives less clamping, so less + // ringing reduction. + const constexpr float mask_multiplier = 2; + float a = row_mask[x] * mask_multiplier; + float clip_min = min - a; + float clip_max = max + a; + if (row_out[x] < clip_min) row_out[x] = clip_min; + if (row_out[x] > clip_max) row_out[x] = clip_max; + } + } +} + +// TODO(lode): move this to a separate file enc_downsample.cc +void DownsampleImage2_Iterative(const ImageF& orig, ImageF* output) { + int64_t xsize = orig.xsize(); + int64_t ysize = orig.ysize(); + int64_t xsize2 = DivCeil(orig.xsize(), 2); + int64_t ysize2 = DivCeil(orig.ysize(), 2); + + ImageF box_downsample = CopyImage(orig); + DownsampleImage(&box_downsample, 2); + ImageF mask(box_downsample.xsize(), box_downsample.ysize()); + CreateMask(box_downsample, mask); + + output->ShrinkTo(xsize2, ysize2); + + // Initial result image using the sharper downsampling. + // Allocate extra space to avoid a reallocation when padding. + ImageF initial(DivCeil(orig.xsize(), 2) + kBlockDim, + DivCeil(orig.ysize(), 2) + kBlockDim); + initial.ShrinkTo(initial.xsize() - kBlockDim, initial.ysize() - kBlockDim); + DownsampleImage2_Sharper(orig, &initial); + + ImageF down = CopyImage(initial); + ImageF up(xsize, ysize); + ImageF corr(xsize, ysize); + ImageF corr2(xsize2, ysize2); + + // In the weights map, relatively higher values will allow less ringing but + // also less sharpness. With all constant values, it optimizes equally + // everywhere. Even in this case, the weights2 computed from + // this is still used and differs at the borders of the image. + // TODO(lode): Make use of the weights field for anti-ringing and clamping, + // the values are all set to 1 for now, but it is intended to be used for + // reducing ringing based on the mask, and taking clamping into account. + ImageF weights(xsize, ysize); + for (size_t y = 0; y < weights.ysize(); y++) { + auto* row = weights.Row(y); + for (size_t x = 0; x < weights.xsize(); x++) { + row[x] = 1; + } + } + ImageF weights2(xsize2, ysize2); + AntiUpsample(weights, &weights2); + + const size_t num_it = 3; + for (size_t it = 0; it < num_it; ++it) { + UpsampleImage(down, &up); + corr = LinComb(1, orig, -1, up); + ElwiseMul(corr, weights, &corr); + AntiUpsample(corr, &corr2); + ElwiseDiv(corr2, weights2, &corr2); + + down = LinComb(1, down, 1, corr2); + } + + ReduceRinging(initial, mask, down); + + // can't just use CopyImage, because the output image was prepared with + // padding. + for (size_t y = 0; y < down.ysize(); y++) { + for (size_t x = 0; x < down.xsize(); x++) { + float v = down.Row(y)[x]; + output->Row(y)[x] = v; + } + } +} + +void DownsampleImage2_Iterative(Image3F* opsin) { + // Allocate extra space to avoid a reallocation when padding. + Image3F downsampled(DivCeil(opsin->xsize(), 2) + kBlockDim, + DivCeil(opsin->ysize(), 2) + kBlockDim); + downsampled.ShrinkTo(downsampled.xsize() - kBlockDim, + downsampled.ysize() - kBlockDim); + + Image3F rgb(opsin->xsize(), opsin->ysize()); + OpsinParams opsin_params; // TODO: use the ones that are actually used + opsin_params.Init(kDefaultIntensityTarget); + OpsinToLinear(*opsin, Rect(rgb), nullptr, &rgb, opsin_params); + + ImageF mask(opsin->xsize(), opsin->ysize()); + ButteraugliParams butter_params; + ButteraugliComparator butter(rgb, butter_params); + butter.Mask(&mask); + ImageF mask_fuzzy(opsin->xsize(), opsin->ysize()); + + for (size_t c = 0; c < 3; c++) { + DownsampleImage2_Iterative(opsin->Plane(c), &downsampled.Plane(c)); + } + *opsin = std::move(downsampled); +} +} // namespace + +Status DefaultEncoderHeuristics::LossyFrameHeuristics( + PassesEncoderState* enc_state, ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* original_pixels, Image3F* opsin, + const JxlCmsInterface& cms, ThreadPool* pool, AuxOut* aux_out) { + PROFILER_ZONE("JxlLossyFrameHeuristics uninstrumented"); + + CompressParams& cparams = enc_state->cparams; + PassesSharedState& shared = enc_state->shared; + + // Compute parameters for noise synthesis. + if (shared.frame_header.flags & FrameHeader::kNoise) { + PROFILER_ZONE("enc GetNoiseParam"); + if (cparams.photon_noise_iso == 0) { + // Don't start at zero amplitude since adding noise is expensive -- it + // significantly slows down decoding, and this is unlikely to + // completely go away even with advanced optimizations. After the + // kNoiseModelingRampUpDistanceRange we have reached the full level, + // i.e. noise is no longer represented by the compressed image, so we + // can add full noise by the noise modeling itself. + static const float kNoiseModelingRampUpDistanceRange = 0.6; + static const float kNoiseLevelAtStartOfRampUp = 0.25; + static const float kNoiseRampupStart = 1.0; + // TODO(user) test and properly select quality_coef with smooth + // filter + float quality_coef = 1.0f; + const float rampup = (cparams.butteraugli_distance - kNoiseRampupStart) / + kNoiseModelingRampUpDistanceRange; + if (rampup < 1.0f) { + quality_coef = kNoiseLevelAtStartOfRampUp + + (1.0f - kNoiseLevelAtStartOfRampUp) * rampup; + } + if (rampup < 0.0f) { + quality_coef = kNoiseRampupStart; + } + if (!GetNoiseParameter(*opsin, &shared.image_features.noise_params, + quality_coef)) { + shared.frame_header.flags &= ~FrameHeader::kNoise; + } + } + } + if (enc_state->shared.frame_header.upsampling != 1 && + !cparams.already_downsampled) { + // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling + // after noise, if necessary. + if (cparams.resampling == 2) { + // TODO(lode): use the regular DownsampleImage, or adapt to the custom + // coefficients, if there is are custom upscaling coefficients in + // CustomTransformData + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + // TODO(lode): DownsampleImage2_Iterative is currently too slow to + // be used for squirrel, make it faster, and / or enable it only for + // kitten. + DownsampleImage2_Iterative(opsin); + } else { + DownsampleImage2_Sharper(opsin); + } + } else { + DownsampleImage(opsin, cparams.resampling); + } + PadImageToBlockMultipleInPlace(opsin); + } + + if (cparams.butteraugli_distance < 0) { + return JXL_FAILURE("Expected non-negative distance"); + } + + // Find and subtract splines. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + // If we do already have them, they were passed upstream to EncodeFile. + if (!shared.image_features.splines.HasAny()) { + shared.image_features.splines = FindSplines(*opsin); + } + JXL_RETURN_IF_ERROR(shared.image_features.splines.InitializeDrawCache( + opsin->xsize(), opsin->ysize(), shared.cmap)); + shared.image_features.splines.SubtractFrom(opsin); + } + + // Find and subtract patches/dots. + if (ApplyOverride(cparams.patches, + cparams.speed_tier <= SpeedTier::kSquirrel)) { + FindBestPatchDictionary(*opsin, enc_state, cms, pool, aux_out); + PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, opsin); + } + + static const float kAcQuant = 0.79f; + const float quant_dc = InitialQuantDC(cparams.butteraugli_distance); + Quantizer& quantizer = enc_state->shared.quantizer; + // We don't know the quant field yet, but for computing the global scale + // assuming that it will be the same as for Falcon mode is good enough. + quantizer.ComputeGlobalScaleAndQuant( + quant_dc, kAcQuant / cparams.butteraugli_distance, 0); + + // TODO(veluca): we can now run all the code from here to FindBestQuantizer + // (excluded) one rect at a time. Do that. + + // Dependency graph: + // + // input: either XYB or input image + // + // input image -> XYB [optional] + // XYB -> initial quant field + // XYB -> Gaborished XYB + // Gaborished XYB -> CfL1 + // initial quant field, Gaborished XYB, CfL1 -> ACS + // initial quant field, ACS, Gaborished XYB -> EPF control field + // initial quant field -> adjusted initial quant field + // adjusted initial quant field, ACS -> raw quant field + // raw quant field, ACS, Gaborished XYB -> CfL2 + // + // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field. + + ArControlFieldHeuristics ar_heuristics; + AcStrategyHeuristics acs_heuristics; + CfLHeuristics cfl_heuristics; + + if (!opsin->xsize()) { + JXL_ASSERT(HandlesColorConversion(cparams, *original_pixels)); + *opsin = Image3F(RoundUpToBlockDim(original_pixels->xsize()), + RoundUpToBlockDim(original_pixels->ysize())); + opsin->ShrinkTo(original_pixels->xsize(), original_pixels->ysize()); + ToXYB(*original_pixels, pool, opsin, cms, /*linear=*/nullptr); + PadImageToBlockMultipleInPlace(opsin); + } + + // Compute an initial estimate of the quantization field. + // Call InitialQuantField only in Hare mode or slower. Otherwise, rely + // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon + // mode. + if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) { + enc_state->initial_quant_field = + ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + enc_state->initial_quant_masking = + ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + float q = cparams.uniform_quant > 0 + ? cparams.uniform_quant + : kAcQuant / cparams.butteraugli_distance; + FillImage(q, &enc_state->initial_quant_field); + FillImage(1.0f / (q + 0.001f), &enc_state->initial_quant_masking); + } else { + // Call this here, as it relies on pre-gaborish values. + float butteraugli_distance_for_iqf = cparams.butteraugli_distance; + if (!shared.frame_header.loop_filter.gab) { + butteraugli_distance_for_iqf *= 0.73f; + } + enc_state->initial_quant_field = InitialQuantField( + butteraugli_distance_for_iqf, *opsin, shared.frame_dim, pool, 1.0f, + &enc_state->initial_quant_masking); + quantizer.SetQuantField(quant_dc, enc_state->initial_quant_field, nullptr); + } + + // TODO(veluca): do something about animations. + + // Apply inverse-gaborish. + if (shared.frame_header.loop_filter.gab) { + // Unsure why better to do some more gaborish on X and B than Y. + float weight[3] = { + 1.0036278514398933f, + 0.99406123118127299f, + 0.99719338015886894f, + }; + GaborishInverse(opsin, weight, pool); + } + + FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder, + &enc_state->shared.matrices); + + cfl_heuristics.Init(*opsin); + acs_heuristics.Init(*opsin, enc_state); + + auto process_tile = [&](const uint32_t tid, const size_t thread) { + size_t n_enc_tiles = + DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, + enc_state->shared.frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, + enc_state->shared.frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + + // For speeds up to Wombat, we only compute the color correlation map + // once we know the transform type and the quantization map. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + cfl_heuristics.ComputeTile(r, *opsin, enc_state->shared.matrices, + /*ac_strategy=*/nullptr, + /*raw_quant_field=*/nullptr, + /*quantizer=*/nullptr, /*fast=*/false, thread, + &enc_state->shared.cmap); + } + + // Choose block sizes. + acs_heuristics.ProcessRect(r); + + // Choose amount of post-processing smoothing. + // TODO(veluca): should this go *after* AdjustQuantField? + ar_heuristics.RunRect(r, *opsin, enc_state, thread); + + // Always set the initial quant field, so we can compute the CfL map with + // more accuracy. The initial quant field might change in slower modes, but + // adjusting the quant field with butteraugli when all the other encoding + // parameters are fixed is likely a more reliable choice anyway. + AdjustQuantField(enc_state->shared.ac_strategy, r, + &enc_state->initial_quant_field); + quantizer.SetQuantFieldRect(enc_state->initial_quant_field, r, + &enc_state->shared.raw_quant_field); + + // Compute a non-default CfL map if we are at Hare speed, or slower. + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeTile( + r, *opsin, enc_state->shared.matrices, &enc_state->shared.ac_strategy, + &enc_state->shared.raw_quant_field, &enc_state->shared.quantizer, + /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread, + &enc_state->shared.cmap); + } + }; + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, + DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks) * + DivCeil(enc_state->shared.frame_dim.ysize_blocks, + kEncTileDimInBlocks), + [&](const size_t num_threads) { + ar_heuristics.PrepareForThreads(num_threads); + cfl_heuristics.PrepareForThreads(num_threads); + return true; + }, + process_tile, "Enc Heuristics")); + + acs_heuristics.Finalize(aux_out); + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeDC(/*fast=*/cparams.speed_tier >= SpeedTier::kWombat, + &enc_state->shared.cmap); + } + + // Refine quantization levels. + FindBestQuantizer(original_pixels, *opsin, enc_state, cms, pool, aux_out); + + // Choose a context model that depends on the amount of quantization for AC. + if (cparams.speed_tier < SpeedTier::kFalcon) { + FindBestBlockEntropyModel(*enc_state); + } + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_heuristics.h b/third_party/jpeg-xl/lib/jxl/enc_heuristics.h new file mode 100644 index 0000000000..3cb9b506a6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.h @@ -0,0 +1,81 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_HEURISTICS_H_ +#define LIB_JXL_ENC_HEURISTICS_H_ + +// Hook for custom encoder heuristics (VarDCT only for now). + +#include +#include +#include + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image.h" +#include "lib/jxl/modular/encoding/enc_ma.h" + +namespace jxl { + +struct AuxOut; +struct PassesEncoderState; +class DequantMatrices; +class ImageBundle; +class ModularFrameEncoder; + +class EncoderHeuristics { + public: + virtual ~EncoderHeuristics() = default; + // Initializes encoder structures in `enc_state` using the original image data + // in `original_pixels`, and the XYB image data in `opsin`. Also modifies the + // `opsin` image by applying Gaborish, and doing other modifications if + // necessary. `pool` is used for running the computations on multiple threads. + // `aux_out` collects statistics and can be used to print debug images. + virtual Status LossyFrameHeuristics( + PassesEncoderState* enc_state, ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* original_pixels, Image3F* opsin, + const JxlCmsInterface& cms, ThreadPool* pool, AuxOut* aux_out) = 0; + + // Custom fixed tree for lossless mode. Must set `tree` to a valid tree if + // the function returns true. + virtual bool CustomFixedTreeLossless(const FrameDimensions& frame_dim, + Tree* tree) { + return false; + } + + // If this method returns `true`, the `opsin` parameter to + // LossyFrameHeuristics will not be initialized, and should be initialized + // during the call. Moreover, `original_pixels` may not be in a linear + // colorspace (but will be the same as the `ib` value passed to this + // function). + virtual bool HandlesColorConversion(const CompressParams& cparams, + const ImageBundle& ib) { + return false; + } +}; + +class DefaultEncoderHeuristics : public EncoderHeuristics { + public: + Status LossyFrameHeuristics(PassesEncoderState* enc_state, + ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* original_pixels, + Image3F* opsin, const JxlCmsInterface& cms, + ThreadPool* pool, AuxOut* aux_out) override; + bool HandlesColorConversion(const CompressParams& cparams, + const ImageBundle& ib) override; +}; + +// Exposed here since it may be used by other EncoderHeuristics implementations +// outside this project. +void FindBestDequantMatrices(const CompressParams& cparams, + const Image3F& opsin, + ModularFrameEncoder* modular_frame_encoder, + DequantMatrices* dequant_matrices); + +} // namespace jxl + +#endif // LIB_JXL_ENC_HEURISTICS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman.cc b/third_party/jpeg-xl/lib/jxl/enc_huffman.cc new file mode 100644 index 0000000000..3eab2c218a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_huffman.cc @@ -0,0 +1,214 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_huffman.h" + +#include +#include + +#include "lib/jxl/enc_huffman_tree.h" + +namespace jxl { + +namespace { + +constexpr int kCodeLengthCodes = 18; + +void StoreHuffmanTreeOfHuffmanTreeToBitMask(const int num_codes, + const uint8_t* code_length_bitdepth, + BitWriter* writer) { + static const uint8_t kStorageOrder[kCodeLengthCodes] = { + 1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + // The bit lengths of the Huffman code over the code length alphabet + // are compressed with the following static Huffman code: + // Symbol Code + // ------ ---- + // 0 00 + // 1 1110 + // 2 110 + // 3 01 + // 4 10 + // 5 1111 + static const uint8_t kHuffmanBitLengthHuffmanCodeSymbols[6] = {0, 7, 3, + 2, 1, 15}; + static const uint8_t kHuffmanBitLengthHuffmanCodeBitLengths[6] = {2, 4, 3, + 2, 2, 4}; + + // Throw away trailing zeros: + size_t codes_to_store = kCodeLengthCodes; + if (num_codes > 1) { + for (; codes_to_store > 0; --codes_to_store) { + if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) { + break; + } + } + } + size_t skip_some = 0; // skips none. + if (code_length_bitdepth[kStorageOrder[0]] == 0 && + code_length_bitdepth[kStorageOrder[1]] == 0) { + skip_some = 2; // skips two. + if (code_length_bitdepth[kStorageOrder[2]] == 0) { + skip_some = 3; // skips three. + } + } + writer->Write(2, skip_some); + for (size_t i = skip_some; i < codes_to_store; ++i) { + size_t l = code_length_bitdepth[kStorageOrder[i]]; + writer->Write(kHuffmanBitLengthHuffmanCodeBitLengths[l], + kHuffmanBitLengthHuffmanCodeSymbols[l]); + } +} + +void StoreHuffmanTreeToBitMask(const size_t huffman_tree_size, + const uint8_t* huffman_tree, + const uint8_t* huffman_tree_extra_bits, + const uint8_t* code_length_bitdepth, + const uint16_t* code_length_bitdepth_symbols, + BitWriter* writer) { + for (size_t i = 0; i < huffman_tree_size; ++i) { + size_t ix = huffman_tree[i]; + writer->Write(code_length_bitdepth[ix], code_length_bitdepth_symbols[ix]); + // Extra bits + switch (ix) { + case 16: + writer->Write(2, huffman_tree_extra_bits[i]); + break; + case 17: + writer->Write(3, huffman_tree_extra_bits[i]); + break; + } + } +} + +void StoreSimpleHuffmanTree(const uint8_t* depths, size_t symbols[4], + size_t num_symbols, size_t max_bits, + BitWriter* writer) { + // value of 1 indicates a simple Huffman code + writer->Write(2, 1); + writer->Write(2, num_symbols - 1); // NSYM - 1 + + // Sort + for (size_t i = 0; i < num_symbols; i++) { + for (size_t j = i + 1; j < num_symbols; j++) { + if (depths[symbols[j]] < depths[symbols[i]]) { + std::swap(symbols[j], symbols[i]); + } + } + } + + if (num_symbols == 2) { + writer->Write(max_bits, symbols[0]); + writer->Write(max_bits, symbols[1]); + } else if (num_symbols == 3) { + writer->Write(max_bits, symbols[0]); + writer->Write(max_bits, symbols[1]); + writer->Write(max_bits, symbols[2]); + } else { + writer->Write(max_bits, symbols[0]); + writer->Write(max_bits, symbols[1]); + writer->Write(max_bits, symbols[2]); + writer->Write(max_bits, symbols[3]); + // tree-select + writer->Write(1, depths[symbols[0]] == 1 ? 1 : 0); + } +} + +// num = alphabet size +// depths = symbol depths +void StoreHuffmanTree(const uint8_t* depths, size_t num, BitWriter* writer) { + // Write the Huffman tree into the compact representation. + std::unique_ptr arena(new uint8_t[2 * num]); + uint8_t* huffman_tree = arena.get(); + uint8_t* huffman_tree_extra_bits = arena.get() + num; + size_t huffman_tree_size = 0; + WriteHuffmanTree(depths, num, &huffman_tree_size, huffman_tree, + huffman_tree_extra_bits); + + // Calculate the statistics of the Huffman tree in the compact representation. + uint32_t huffman_tree_histogram[kCodeLengthCodes] = {0}; + for (size_t i = 0; i < huffman_tree_size; ++i) { + ++huffman_tree_histogram[huffman_tree[i]]; + } + + int num_codes = 0; + int code = 0; + for (int i = 0; i < kCodeLengthCodes; ++i) { + if (huffman_tree_histogram[i]) { + if (num_codes == 0) { + code = i; + num_codes = 1; + } else if (num_codes == 1) { + num_codes = 2; + break; + } + } + } + + // Calculate another Huffman tree to use for compressing both the + // earlier Huffman tree with. + uint8_t code_length_bitdepth[kCodeLengthCodes] = {0}; + uint16_t code_length_bitdepth_symbols[kCodeLengthCodes] = {0}; + CreateHuffmanTree(&huffman_tree_histogram[0], kCodeLengthCodes, 5, + &code_length_bitdepth[0]); + ConvertBitDepthsToSymbols(code_length_bitdepth, kCodeLengthCodes, + &code_length_bitdepth_symbols[0]); + + // Now, we have all the data, let's start storing it + StoreHuffmanTreeOfHuffmanTreeToBitMask(num_codes, code_length_bitdepth, + writer); + + if (num_codes == 1) { + code_length_bitdepth[code] = 0; + } + + // Store the real huffman tree now. + StoreHuffmanTreeToBitMask(huffman_tree_size, huffman_tree, + huffman_tree_extra_bits, &code_length_bitdepth[0], + code_length_bitdepth_symbols, writer); +} + +} // namespace + +void BuildAndStoreHuffmanTree(const uint32_t* histogram, const size_t length, + uint8_t* depth, uint16_t* bits, + BitWriter* writer) { + size_t count = 0; + size_t s4[4] = {0}; + for (size_t i = 0; i < length; i++) { + if (histogram[i]) { + if (count < 4) { + s4[count] = i; + } else if (count > 4) { + break; + } + count++; + } + } + + size_t max_bits_counter = length - 1; + size_t max_bits = 0; + while (max_bits_counter) { + max_bits_counter >>= 1; + ++max_bits; + } + + if (count <= 1) { + // Output symbol bits and depths are initialized with 0, nothing to do. + writer->Write(4, 1); + writer->Write(max_bits, s4[0]); + return; + } + + CreateHuffmanTree(histogram, length, 15, depth); + ConvertBitDepthsToSymbols(depth, length, bits); + + if (count <= 4) { + StoreSimpleHuffmanTree(depth, s4, count, max_bits, writer); + } else { + StoreHuffmanTree(depth, length, writer); + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman.h b/third_party/jpeg-xl/lib/jxl/enc_huffman.h new file mode 100644 index 0000000000..d7a66584e8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_huffman.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_HUFFMAN_H_ +#define LIB_JXL_ENC_HUFFMAN_H_ + +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +// Builds a Huffman tree for the given histogram, and encodes it into writer +// in a format that can be read by HuffmanDecodingData::ReadFromBitstream. +// An allotment for `writer` must already have been created by the caller. +void BuildAndStoreHuffmanTree(const uint32_t* histogram, size_t length, + uint8_t* depth, uint16_t* bits, + BitWriter* writer); + +} // namespace jxl + +#endif // LIB_JXL_ENC_HUFFMAN_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc new file mode 100644 index 0000000000..5c40dea770 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc @@ -0,0 +1,328 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_huffman_tree.h" + +#include +#include +#include + +#include "lib/jxl/base/status.h" + +namespace jxl { + +void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth, + uint8_t level) { + if (p.index_left >= 0) { + ++level; + SetDepth(pool[p.index_left], pool, depth, level); + SetDepth(pool[p.index_right_or_value], pool, depth, level); + } else { + depth[p.index_right_or_value] = level; + } +} + +// Sort the root nodes, least popular first. +static JXL_INLINE bool Compare(const HuffmanTree& v0, const HuffmanTree& v1) { + return v0.total_count < v1.total_count; +} + +// This function will create a Huffman tree. +// +// The catch here is that the tree cannot be arbitrarily deep. +// Brotli specifies a maximum depth of 15 bits for "code trees" +// and 7 bits for "code length code trees." +// +// count_limit is the value that is to be faked as the minimum value +// and this minimum value is raised until the tree matches the +// maximum length requirement. +// +// This algorithm is not of excellent performance for very long data blocks, +// especially when population counts are longer than 2**tree_limit, but +// we are not planning to use this with extremely long blocks. +// +// See http://en.wikipedia.org/wiki/Huffman_coding +void CreateHuffmanTree(const uint32_t* data, const size_t length, + const int tree_limit, uint8_t* depth) { + // For block sizes below 64 kB, we never need to do a second iteration + // of this loop. Probably all of our block sizes will be smaller than + // that, so this loop is mostly of academic interest. If we actually + // would need this, we would be better off with the Katajainen algorithm. + for (uint32_t count_limit = 1;; count_limit *= 2) { + std::vector tree; + tree.reserve(2 * length + 1); + + for (size_t i = length; i != 0;) { + --i; + if (data[i]) { + const uint32_t count = std::max(data[i], count_limit - 1); + tree.emplace_back(count, -1, static_cast(i)); + } + } + + const size_t n = tree.size(); + if (n == 1) { + // Fake value; will be fixed on upper level. + depth[tree[0].index_right_or_value] = 1; + break; + } + + std::stable_sort(tree.begin(), tree.end(), Compare); + + // The nodes are: + // [0, n): the sorted leaf nodes that we start with. + // [n]: we add a sentinel here. + // [n + 1, 2n): new parent nodes are added here, starting from + // (n+1). These are naturally in ascending order. + // [2n]: we add a sentinel at the end as well. + // There will be (2n+1) elements at the end. + const HuffmanTree sentinel(std::numeric_limits::max(), -1, -1); + tree.push_back(sentinel); + tree.push_back(sentinel); + + size_t i = 0; // Points to the next leaf node. + size_t j = n + 1; // Points to the next non-leaf node. + for (size_t k = n - 1; k != 0; --k) { + size_t left, right; + if (tree[i].total_count <= tree[j].total_count) { + left = i; + ++i; + } else { + left = j; + ++j; + } + if (tree[i].total_count <= tree[j].total_count) { + right = i; + ++i; + } else { + right = j; + ++j; + } + + // The sentinel node becomes the parent node. + size_t j_end = tree.size() - 1; + tree[j_end].total_count = + tree[left].total_count + tree[right].total_count; + tree[j_end].index_left = static_cast(left); + tree[j_end].index_right_or_value = static_cast(right); + + // Add back the last sentinel node. + tree.push_back(sentinel); + } + JXL_DASSERT(tree.size() == 2 * n + 1); + SetDepth(tree[2 * n - 1], &tree[0], depth, 0); + + // We need to pack the Huffman tree in tree_limit bits. + // If this was not successful, add fake entities to the lowest values + // and retry. + if (*std::max_element(&depth[0], &depth[length]) <= tree_limit) { + break; + } + } +} + +void Reverse(uint8_t* v, size_t start, size_t end) { + --end; + while (start < end) { + uint8_t tmp = v[start]; + v[start] = v[end]; + v[end] = tmp; + ++start; + --end; + } +} + +void WriteHuffmanTreeRepetitions(const uint8_t previous_value, + const uint8_t value, size_t repetitions, + size_t* tree_size, uint8_t* tree, + uint8_t* extra_bits_data) { + JXL_DASSERT(repetitions > 0); + if (previous_value != value) { + tree[*tree_size] = value; + extra_bits_data[*tree_size] = 0; + ++(*tree_size); + --repetitions; + } + if (repetitions == 7) { + tree[*tree_size] = value; + extra_bits_data[*tree_size] = 0; + ++(*tree_size); + --repetitions; + } + if (repetitions < 3) { + for (size_t i = 0; i < repetitions; ++i) { + tree[*tree_size] = value; + extra_bits_data[*tree_size] = 0; + ++(*tree_size); + } + } else { + repetitions -= 3; + size_t start = *tree_size; + while (true) { + tree[*tree_size] = 16; + extra_bits_data[*tree_size] = repetitions & 0x3; + ++(*tree_size); + repetitions >>= 2; + if (repetitions == 0) { + break; + } + --repetitions; + } + Reverse(tree, start, *tree_size); + Reverse(extra_bits_data, start, *tree_size); + } +} + +void WriteHuffmanTreeRepetitionsZeros(size_t repetitions, size_t* tree_size, + uint8_t* tree, uint8_t* extra_bits_data) { + if (repetitions == 11) { + tree[*tree_size] = 0; + extra_bits_data[*tree_size] = 0; + ++(*tree_size); + --repetitions; + } + if (repetitions < 3) { + for (size_t i = 0; i < repetitions; ++i) { + tree[*tree_size] = 0; + extra_bits_data[*tree_size] = 0; + ++(*tree_size); + } + } else { + repetitions -= 3; + size_t start = *tree_size; + while (true) { + tree[*tree_size] = 17; + extra_bits_data[*tree_size] = repetitions & 0x7; + ++(*tree_size); + repetitions >>= 3; + if (repetitions == 0) { + break; + } + --repetitions; + } + Reverse(tree, start, *tree_size); + Reverse(extra_bits_data, start, *tree_size); + } +} + +static void DecideOverRleUse(const uint8_t* depth, const size_t length, + bool* use_rle_for_non_zero, + bool* use_rle_for_zero) { + size_t total_reps_zero = 0; + size_t total_reps_non_zero = 0; + size_t count_reps_zero = 1; + size_t count_reps_non_zero = 1; + for (size_t i = 0; i < length;) { + const uint8_t value = depth[i]; + size_t reps = 1; + for (size_t k = i + 1; k < length && depth[k] == value; ++k) { + ++reps; + } + if (reps >= 3 && value == 0) { + total_reps_zero += reps; + ++count_reps_zero; + } + if (reps >= 4 && value != 0) { + total_reps_non_zero += reps; + ++count_reps_non_zero; + } + i += reps; + } + *use_rle_for_non_zero = total_reps_non_zero > count_reps_non_zero * 2; + *use_rle_for_zero = total_reps_zero > count_reps_zero * 2; +} + +void WriteHuffmanTree(const uint8_t* depth, size_t length, size_t* tree_size, + uint8_t* tree, uint8_t* extra_bits_data) { + uint8_t previous_value = 8; + + // Throw away trailing zeros. + size_t new_length = length; + for (size_t i = 0; i < length; ++i) { + if (depth[length - i - 1] == 0) { + --new_length; + } else { + break; + } + } + + // First gather statistics on if it is a good idea to do rle. + bool use_rle_for_non_zero = false; + bool use_rle_for_zero = false; + if (length > 50) { + // Find rle coding for longer codes. + // Shorter codes seem not to benefit from rle. + DecideOverRleUse(depth, new_length, &use_rle_for_non_zero, + &use_rle_for_zero); + } + + // Actual rle coding. + for (size_t i = 0; i < new_length;) { + const uint8_t value = depth[i]; + size_t reps = 1; + if ((value != 0 && use_rle_for_non_zero) || + (value == 0 && use_rle_for_zero)) { + for (size_t k = i + 1; k < new_length && depth[k] == value; ++k) { + ++reps; + } + } + if (value == 0) { + WriteHuffmanTreeRepetitionsZeros(reps, tree_size, tree, extra_bits_data); + } else { + WriteHuffmanTreeRepetitions(previous_value, value, reps, tree_size, tree, + extra_bits_data); + previous_value = value; + } + i += reps; + } +} + +namespace { + +uint16_t ReverseBits(int num_bits, uint16_t bits) { + static const size_t kLut[16] = {// Pre-reversed 4-bit values. + 0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe, + 0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf}; + size_t retval = kLut[bits & 0xf]; + for (int i = 4; i < num_bits; i += 4) { + retval <<= 4; + bits = static_cast(bits >> 4); + retval |= kLut[bits & 0xf]; + } + retval >>= (-num_bits & 0x3); + return static_cast(retval); +} + +} // namespace + +void ConvertBitDepthsToSymbols(const uint8_t* depth, size_t len, + uint16_t* bits) { + // In Brotli, all bit depths are [1..15] + // 0 bit depth means that the symbol does not exist. + const int kMaxBits = 16; // 0..15 are values for bits + uint16_t bl_count[kMaxBits] = {0}; + { + for (size_t i = 0; i < len; ++i) { + ++bl_count[depth[i]]; + } + bl_count[0] = 0; + } + uint16_t next_code[kMaxBits]; + next_code[0] = 0; + { + int code = 0; + for (size_t i = 1; i < kMaxBits; ++i) { + code = (code + bl_count[i - 1]) << 1; + next_code[i] = static_cast(code); + } + } + for (size_t i = 0; i < len; ++i) { + if (depth[i]) { + bits[i] = ReverseBits(depth[i], next_code[depth[i]]++); + } + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h new file mode 100644 index 0000000000..7d716cd3b5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Library for creating Huffman codes from population counts. + +#ifndef LIB_JXL_HUFFMAN_TREE_H_ +#define LIB_JXL_HUFFMAN_TREE_H_ + +#include +#include + +namespace jxl { + +// A node of a Huffman tree. +struct HuffmanTree { + HuffmanTree(uint32_t count, int16_t left, int16_t right) + : total_count(count), index_left(left), index_right_or_value(right) {} + uint32_t total_count; + int16_t index_left; + int16_t index_right_or_value; +}; + +void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth, + uint8_t level); + +// This function will create a Huffman tree. +// +// The (data,length) contains the population counts. +// The tree_limit is the maximum bit depth of the Huffman codes. +// +// The depth contains the tree, i.e., how many bits are used for +// the symbol. +// +// See http://en.wikipedia.org/wiki/Huffman_coding +void CreateHuffmanTree(const uint32_t* data, size_t length, int tree_limit, + uint8_t* depth); + +// Write a Huffman tree from bit depths into the bitstream representation +// of a Huffman tree. The generated Huffman tree is to be compressed once +// more using a Huffman tree +void WriteHuffmanTree(const uint8_t* depth, size_t length, size_t* tree_size, + uint8_t* tree, uint8_t* extra_bits_data); + +// Get the actual bit values for a tree of bit depths. +void ConvertBitDepthsToSymbols(const uint8_t* depth, size_t len, + uint16_t* bits); + +} // namespace jxl + +#endif // LIB_JXL_HUFFMAN_TREE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc new file mode 100644 index 0000000000..a6782f6a45 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc @@ -0,0 +1,406 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_icc_codec.h" + +#include + +#include +#include +#include + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/icc_codec_common.h" + +namespace jxl { +namespace { + +// Unshuffles or de-interleaves bytes, for example with width 2, turns +// "AaBbCcDc" into "ABCDabcd", this for example de-interleaves UTF-16 bytes into +// first all the high order bytes, then all the low order bytes. +// Transposes a matrix of width columns and ceil(size / width) rows. There are +// size elements, size may be < width * height, if so the +// last elements of the bottom row are missing, the missing spots are +// transposed along with the filled spots, and the result has the missing +// elements at the bottom of the rightmost column. The input is the input matrix +// in scanline order, the output is the result matrix in scanline order, with +// missing elements skipped over (this may occur at multiple positions). +void Unshuffle(uint8_t* data, size_t size, size_t width) { + size_t height = (size + width - 1) / width; // amount of rows of input + PaddedBytes result(size); + // i = input index, j output index + size_t s = 0, j = 0; + for (size_t i = 0; i < size; i++) { + result[j] = data[i]; + j += height; + if (j >= size) j = ++s; + } + + for (size_t i = 0; i < size; i++) { + data[i] = result[i]; + } +} + +// This is performed by the encoder, the encoder must be able to encode any +// random byte stream (not just byte streams that are a valid ICC profile), so +// an error returned by this function is an implementation error. +Status PredictAndShuffle(size_t stride, size_t width, int order, size_t num, + const uint8_t* data, size_t size, size_t* pos, + PaddedBytes* result) { + JXL_RETURN_IF_ERROR(CheckOutOfBounds(*pos, num, size)); + // Required by the specification, see decoder. stride * 4 must be < *pos. + if (!*pos || ((*pos - 1u) >> 2u) < stride) { + return JXL_FAILURE("Invalid stride"); + } + if (*pos < stride * 4) return JXL_FAILURE("Too large stride"); + size_t start = result->size(); + for (size_t i = 0; i < num; i++) { + uint8_t predicted = + LinearPredictICCValue(data, *pos, i, stride, width, order); + result->push_back(data[*pos + i] - predicted); + } + *pos += num; + if (width > 1) Unshuffle(result->data() + start, num, width); + return true; +} +} // namespace + +// Outputs a transformed form of the given icc profile. The result itself is +// not particularly smaller than the input data in bytes, but it will be in a +// form that is easier to compress (more zeroes, ...) and will compress better +// with brotli. +Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) { + PaddedBytes commands; + PaddedBytes data; + + EncodeVarInt(size, result); + + // Header + PaddedBytes header = ICCInitialHeaderPrediction(); + EncodeUint32(0, size, &header); + for (size_t i = 0; i < kICCHeaderSize && i < size; i++) { + ICCPredictHeader(icc, size, header.data(), i); + data.push_back(icc[i] - header[i]); + } + if (size <= kICCHeaderSize) { + EncodeVarInt(0, result); // 0 commands + for (size_t i = 0; i < data.size(); i++) { + result->push_back(data[i]); + } + return true; + } + + std::vector tags; + std::vector tagstarts; + std::vector tagsizes; + std::map tagmap; + + // Tag list + size_t pos = kICCHeaderSize; + if (pos + 4 <= size) { + uint64_t numtags = DecodeUint32(icc, size, pos); + pos += 4; + EncodeVarInt(numtags + 1, &commands); + uint64_t prevtagstart = kICCHeaderSize + numtags * 12; + uint32_t prevtagsize = 0; + for (size_t i = 0; i < numtags; i++) { + if (pos + 12 > size) break; + + Tag tag = DecodeKeyword(icc, size, pos + 0); + uint32_t tagstart = DecodeUint32(icc, size, pos + 4); + uint32_t tagsize = DecodeUint32(icc, size, pos + 8); + pos += 12; + + tags.push_back(tag); + tagstarts.push_back(tagstart); + tagsizes.push_back(tagsize); + tagmap[tagstart] = tags.size() - 1; + + uint8_t tagcode = kCommandTagUnknown; + for (size_t j = 0; j < kNumTagStrings; j++) { + if (tag == *kTagStrings[j]) { + tagcode = j + kCommandTagStringFirst; + break; + } + } + + if (tag == kRtrcTag && pos + 24 < size) { + bool ok = true; + ok &= DecodeKeyword(icc, size, pos + 0) == kGtrcTag; + ok &= DecodeKeyword(icc, size, pos + 12) == kBtrcTag; + if (ok) { + for (size_t kk = 0; kk < 8; kk++) { + if (icc[pos - 8 + kk] != icc[pos + 4 + kk]) ok = false; + if (icc[pos - 8 + kk] != icc[pos + 16 + kk]) ok = false; + } + } + if (ok) { + tagcode = kCommandTagTRC; + pos += 24; + i += 2; + } + } + + if (tag == kRxyzTag && pos + 24 < size) { + bool ok = true; + ok &= DecodeKeyword(icc, size, pos + 0) == kGxyzTag; + ok &= DecodeKeyword(icc, size, pos + 12) == kBxyzTag; + uint32_t offsetr = tagstart; + uint32_t offsetg = DecodeUint32(icc, size, pos + 4); + uint32_t offsetb = DecodeUint32(icc, size, pos + 16); + uint32_t sizer = tagsize; + uint32_t sizeg = DecodeUint32(icc, size, pos + 8); + uint32_t sizeb = DecodeUint32(icc, size, pos + 20); + ok &= sizer == 20; + ok &= sizeg == 20; + ok &= sizeb == 20; + ok &= (offsetg == offsetr + 20); + ok &= (offsetb == offsetr + 40); + if (ok) { + tagcode = kCommandTagXYZ; + pos += 24; + i += 2; + } + } + + uint8_t command = tagcode; + uint64_t predicted_tagstart = prevtagstart + prevtagsize; + if (predicted_tagstart != tagstart) command |= kFlagBitOffset; + size_t predicted_tagsize = prevtagsize; + if (tag == kRxyzTag || tag == kGxyzTag || tag == kBxyzTag || + tag == kKxyzTag || tag == kWtptTag || tag == kBkptTag || + tag == kLumiTag) { + predicted_tagsize = 20; + } + if (predicted_tagsize != tagsize) command |= kFlagBitSize; + commands.push_back(command); + if (tagcode == 1) { + AppendKeyword(tag, &data); + } + if (command & kFlagBitOffset) EncodeVarInt(tagstart, &commands); + if (command & kFlagBitSize) EncodeVarInt(tagsize, &commands); + + prevtagstart = tagstart; + prevtagsize = tagsize; + } + } + // Indicate end of tag list or varint indicating there's none + commands.push_back(0); + + // Main content + // The main content in a valid ICC profile contains tagged elements, with the + // tag types (4 letter names) given by the tag list above, and the tag list + // pointing to the start and indicating the size of each tagged element. It is + // allowed for tagged elements to overlap, e.g. the curve for R, G and B could + // all point to the same one. + Tag tag; + size_t tagstart = 0, tagsize = 0, clutstart = 0; + + size_t last0 = pos; + // This loop appends commands to the output, processing some sub-section of a + // current tagged element each time. We need to keep track of the tagtype of + // the current element, and update it when we encounter the boundary of a + // next one. + // It is not required that the input data is a valid ICC profile, if the + // encoder does not recognize the data it will still be able to output bytes + // but will not predict as well. + while (pos <= size) { + size_t last1 = pos; + PaddedBytes commands_add; + PaddedBytes data_add; + + // This means the loop brought the position beyond the tag end. + if (pos > tagstart + tagsize) { + tag = {{0, 0, 0, 0}}; // nonsensical value + } + + if (commands_add.empty() && data_add.empty() && tagmap.count(pos) && + pos + 4 <= size) { + size_t index = tagmap[pos]; + tag = DecodeKeyword(icc, size, pos); + tagstart = tagstarts[index]; + tagsize = tagsizes[index]; + + if (tag == kMlucTag && pos + tagsize <= size && tagsize > 8 && + icc[pos + 4] == 0 && icc[pos + 5] == 0 && icc[pos + 6] == 0 && + icc[pos + 7] == 0) { + size_t num = tagsize - 8; + commands_add.push_back(kCommandTypeStartFirst + 3); + pos += 8; + commands_add.push_back(kCommandShuffle2); + EncodeVarInt(num, &commands_add); + size_t start = data_add.size(); + for (size_t i = 0; i < num; i++) { + data_add.push_back(icc[pos]); + pos++; + } + Unshuffle(data_add.data() + start, num, 2); + } + + if (tag == kCurvTag && pos + tagsize <= size && tagsize > 8 && + icc[pos + 4] == 0 && icc[pos + 5] == 0 && icc[pos + 6] == 0 && + icc[pos + 7] == 0) { + size_t num = tagsize - 8; + if (num > 16 && num < (1 << 28) && pos + num <= size && pos > 0) { + commands_add.push_back(kCommandTypeStartFirst + 5); + pos += 8; + commands_add.push_back(kCommandPredict); + int order = 1, width = 2, stride = width; + commands_add.push_back((order << 2) | (width - 1)); + EncodeVarInt(num, &commands_add); + JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc, + size, &pos, &data_add)); + } + } + } + + if (tag == kMab_Tag || tag == kMba_Tag) { + Tag subTag = DecodeKeyword(icc, size, pos); + if (pos + 12 < size && (subTag == kCurvTag || subTag == kVcgtTag) && + DecodeUint32(icc, size, pos + 4) == 0) { + uint32_t num = DecodeUint32(icc, size, pos + 8) * 2; + if (num > 16 && num < (1 << 28) && pos + 12 + num <= size) { + pos += 12; + last1 = pos; + commands_add.push_back(kCommandPredict); + int order = 1, width = 2, stride = width; + commands_add.push_back((order << 2) | (width - 1)); + EncodeVarInt(num, &commands_add); + JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc, + size, &pos, &data_add)); + } + } + + if (pos == tagstart + 24 && pos + 4 < size) { + // Note that this value can be remembered for next iterations of the + // loop, so the "pos == clutstart" if below can trigger during a later + // iteration. + clutstart = tagstart + DecodeUint32(icc, size, pos); + } + + if (pos == clutstart && clutstart + 16 < size) { + size_t numi = icc[tagstart + 8]; + size_t numo = icc[tagstart + 9]; + size_t width = icc[clutstart + 16]; + size_t stride = width * numo; + size_t num = width * numo; + for (size_t i = 0; i < numi && clutstart + i < size; i++) { + num *= icc[clutstart + i]; + } + if ((width == 1 || width == 2) && num > 64 && num < (1 << 28) && + pos + num <= size && pos > stride * 4) { + commands_add.push_back(kCommandPredict); + int order = 1; + uint8_t flags = + (order << 2) | (width - 1) | (stride == width ? 0 : 16); + commands_add.push_back(flags); + if (flags & 16) EncodeVarInt(stride, &commands_add); + EncodeVarInt(num, &commands_add); + JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc, + size, &pos, &data_add)); + } + } + } + + if (commands_add.empty() && data_add.empty() && tag == kGbd_Tag && + pos == tagstart + 8 && pos + tagsize - 8 <= size && pos > 16 && + tagsize > 8) { + size_t width = 4, order = 0, stride = width; + size_t num = tagsize - 8; + uint8_t flags = (order << 2) | (width - 1) | (stride == width ? 0 : 16); + commands_add.push_back(kCommandPredict); + commands_add.push_back(flags); + if (flags & 16) EncodeVarInt(stride, &commands_add); + EncodeVarInt(num, &commands_add); + JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc, + size, &pos, &data_add)); + } + + if (commands_add.empty() && data_add.empty() && pos + 20 <= size) { + Tag subTag = DecodeKeyword(icc, size, pos); + if (subTag == kXyz_Tag && DecodeUint32(icc, size, pos + 4) == 0) { + commands_add.push_back(kCommandXYZ); + pos += 8; + for (size_t j = 0; j < 12; j++) data_add.push_back(icc[pos++]); + } + } + + if (commands_add.empty() && data_add.empty() && pos + 8 <= size) { + if (DecodeUint32(icc, size, pos + 4) == 0) { + Tag subTag = DecodeKeyword(icc, size, pos); + for (size_t i = 0; i < kNumTypeStrings; i++) { + if (subTag == *kTypeStrings[i]) { + commands_add.push_back(kCommandTypeStartFirst + i); + pos += 8; + break; + } + } + } + } + + if (!(commands_add.empty() && data_add.empty()) || pos == size) { + if (last0 < last1) { + commands.push_back(kCommandInsert); + EncodeVarInt(last1 - last0, &commands); + while (last0 < last1) { + data.push_back(icc[last0++]); + } + } + for (size_t i = 0; i < commands_add.size(); i++) { + commands.push_back(commands_add[i]); + } + for (size_t i = 0; i < data_add.size(); i++) { + data.push_back(data_add[i]); + } + last0 = pos; + } + if (commands_add.empty() && data_add.empty()) { + pos++; + } + } + + EncodeVarInt(commands.size(), result); + for (size_t i = 0; i < commands.size(); i++) { + result->push_back(commands[i]); + } + for (size_t i = 0; i < data.size(); i++) { + result->push_back(data[i]); + } + + return true; +} + +Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer, + size_t layer, AuxOut* JXL_RESTRICT aux_out) { + if (icc.empty()) return JXL_FAILURE("ICC must be non-empty"); + PaddedBytes enc; + JXL_RETURN_IF_ERROR(PredictICC(icc.data(), icc.size(), &enc)); + std::vector> tokens(1); + BitWriter::Allotment allotment(writer, 128); + JXL_RETURN_IF_ERROR(U64Coder::Write(enc.size(), writer)); + allotment.ReclaimAndCharge(writer, layer, aux_out); + + for (size_t i = 0; i < enc.size(); i++) { + tokens[0].emplace_back( + ICCANSContext(i, i > 0 ? enc[i - 1] : 0, i > 1 ? enc[i - 2] : 0), + enc[i]); + } + HistogramParams params; + params.lz77_method = enc.size() < 4096 ? HistogramParams::LZ77Method::kOptimal + : HistogramParams::LZ77Method::kLZ77; + EntropyEncodingData code; + std::vector context_map; + params.force_huffman = true; + BuildAndEncodeHistograms(params, kNumICCContexts, tokens, &code, &context_map, + writer, layer, aux_out); + WriteTokens(tokens[0], code, context_map, writer, layer, aux_out); + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_icc_codec.h b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.h new file mode 100644 index 0000000000..c22cf5994e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.h @@ -0,0 +1,33 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_ICC_CODEC_H_ +#define LIB_JXL_ENC_ICC_CODEC_H_ + +// Compressed representation of ICC profiles. + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +struct AuxOut; + +// Should still be called if `icc.empty()` - if so, writes only 1 bit. +Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer, + size_t layer, AuxOut* JXL_RESTRICT aux_out); + +// Exposed only for testing +Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result); + +} // namespace jxl + +#endif // LIB_JXL_ENC_ICC_CODEC_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc new file mode 100644 index 0000000000..a77d3e0743 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc @@ -0,0 +1,154 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_image_bundle.h" + +#include + +#include +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +namespace { + +// Copies ib:rect, converts, and copies into out. +Status CopyToT(const ImageMetadata* metadata, const ImageBundle* ib, + const Rect& rect, const ColorEncoding& c_desired, + const JxlCmsInterface& cms, ThreadPool* pool, Image3F* out) { + PROFILER_FUNC; + ColorSpaceTransform c_transform(cms); + // Changing IsGray is probably a bug. + JXL_CHECK(ib->IsGray() == c_desired.IsGray()); + bool is_gray = ib->IsGray(); + if (out->xsize() < rect.xsize() || out->ysize() < rect.ysize()) { + *out = Image3F(rect.xsize(), rect.ysize()); + } else { + out->ShrinkTo(rect.xsize(), rect.ysize()); + } + std::atomic ok{true}; + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, rect.ysize(), + [&](const size_t num_threads) { + return c_transform.Init(ib->c_current(), c_desired, + metadata->IntensityTarget(), rect.xsize(), + num_threads); + }, + [&](const uint32_t y, const size_t thread) { + float* mutable_src_buf = c_transform.BufSrc(thread); + const float* src_buf = mutable_src_buf; + // Interleave input. + if (is_gray) { + src_buf = rect.ConstPlaneRow(ib->color(), 0, y); + } else if (ib->c_current().IsCMYK()) { + if (!ib->HasBlack()) { + ok.store(false); + return; + } + const float* JXL_RESTRICT row_in0 = + rect.ConstPlaneRow(ib->color(), 0, y); + const float* JXL_RESTRICT row_in1 = + rect.ConstPlaneRow(ib->color(), 1, y); + const float* JXL_RESTRICT row_in2 = + rect.ConstPlaneRow(ib->color(), 2, y); + const float* JXL_RESTRICT row_in3 = rect.ConstRow(ib->black(), y); + for (size_t x = 0; x < rect.xsize(); x++) { + // CMYK convention in JXL: 0 = max ink, 1 = white + mutable_src_buf[4 * x + 0] = row_in0[x]; + mutable_src_buf[4 * x + 1] = row_in1[x]; + mutable_src_buf[4 * x + 2] = row_in2[x]; + mutable_src_buf[4 * x + 3] = row_in3[x]; + } + } else { + const float* JXL_RESTRICT row_in0 = + rect.ConstPlaneRow(ib->color(), 0, y); + const float* JXL_RESTRICT row_in1 = + rect.ConstPlaneRow(ib->color(), 1, y); + const float* JXL_RESTRICT row_in2 = + rect.ConstPlaneRow(ib->color(), 2, y); + for (size_t x = 0; x < rect.xsize(); x++) { + mutable_src_buf[3 * x + 0] = row_in0[x]; + mutable_src_buf[3 * x + 1] = row_in1[x]; + mutable_src_buf[3 * x + 2] = row_in2[x]; + } + } + float* JXL_RESTRICT dst_buf = c_transform.BufDst(thread); + if (!c_transform.Run(thread, src_buf, dst_buf)) { + ok.store(false); + return; + } + float* JXL_RESTRICT row_out0 = out->PlaneRow(0, y); + float* JXL_RESTRICT row_out1 = out->PlaneRow(1, y); + float* JXL_RESTRICT row_out2 = out->PlaneRow(2, y); + // De-interleave output and convert type. + if (is_gray) { + for (size_t x = 0; x < rect.xsize(); x++) { + row_out0[x] = dst_buf[x]; + row_out1[x] = dst_buf[x]; + row_out2[x] = dst_buf[x]; + } + } else { + for (size_t x = 0; x < rect.xsize(); x++) { + row_out0[x] = dst_buf[3 * x + 0]; + row_out1[x] = dst_buf[3 * x + 1]; + row_out2[x] = dst_buf[3 * x + 2]; + } + } + }, + "Colorspace transform")); + return ok.load(); +} + +} // namespace + +Status ImageBundle::TransformTo(const ColorEncoding& c_desired, + const JxlCmsInterface& cms, ThreadPool* pool) { + PROFILER_FUNC; + JXL_RETURN_IF_ERROR(CopyTo(Rect(color_), c_desired, cms, &color_, pool)); + c_current_ = c_desired; + return true; +} +Status ImageBundle::CopyTo(const Rect& rect, const ColorEncoding& c_desired, + const JxlCmsInterface& cms, Image3F* out, + ThreadPool* pool) const { + return CopyToT(metadata_, this, rect, c_desired, cms, pool, out); +} +Status TransformIfNeeded(const ImageBundle& in, const ColorEncoding& c_desired, + const JxlCmsInterface& cms, ThreadPool* pool, + ImageBundle* store, const ImageBundle** out) { + if (in.c_current().SameColorEncoding(c_desired) && !in.HasBlack()) { + *out = ∈ + return true; + } + // TODO(janwas): avoid copying via createExternal+copyBackToIO + // instead of copy+createExternal+copyBackToIO + store->SetFromImage(CopyImage(in.color()), in.c_current()); + + // Must at least copy the alpha channel for use by external_image. + if (in.HasExtraChannels()) { + std::vector extra_channels; + for (const ImageF& extra_channel : in.extra_channels()) { + extra_channels.emplace_back(CopyImage(extra_channel)); + } + store->SetExtraChannels(std::move(extra_channels)); + } + + if (!store->TransformTo(c_desired, cms, pool)) { + return false; + } + *out = store; + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_image_bundle.h b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.h new file mode 100644 index 0000000000..85f8e14e1c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.h @@ -0,0 +1,25 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_IMAGE_BUNDLE_H_ +#define LIB_JXL_ENC_IMAGE_BUNDLE_H_ + +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Does color transformation from in.c_current() to c_desired if the color +// encodings are different, or nothing if they are already the same. +// If color transformation is done, stores the transformed values into store and +// sets the out pointer to store, else leaves store untouched and sets the out +// pointer to &in. +// Returns false if color transform fails. +Status TransformIfNeeded(const ImageBundle& in, const ColorEncoding& c_desired, + const JxlCmsInterface& cms, ThreadPool* pool, + ImageBundle* store, const ImageBundle** out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_IMAGE_BUNDLE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h b/third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h new file mode 100644 index 0000000000..3c364e883d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h @@ -0,0 +1,54 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_JXL_SKCMS_H_ +#define LIB_JXL_ENC_JXL_SKCMS_H_ + +// skcms wrapper to rename the skcms symbols to avoid conflicting names with +// other projects using skcms as well. When using JPEGXL_BUNDLE_SKCMS the +// bundled functions will be renamed from skcms_ to jxl_skcms_ + +#ifdef SKCMS_API +#error "Must include enc_jxl_skcms.h and not skcms.h directly" +#endif // SKCMS_API + +#if JPEGXL_BUNDLE_SKCMS + +#define skcms_252_random_bytes jxl_skcms_252_random_bytes +#define skcms_AdaptToXYZD50 jxl_skcms_AdaptToXYZD50 +#define skcms_ApproximateCurve jxl_skcms_ApproximateCurve +#define skcms_ApproximatelyEqualProfiles jxl_skcms_ApproximatelyEqualProfiles +#define skcms_AreApproximateInverses jxl_skcms_AreApproximateInverses +#define skcms_GetCHAD jxl_skcms_GetCHAD +#define skcms_GetTagByIndex jxl_skcms_GetTagByIndex +#define skcms_GetTagBySignature jxl_skcms_GetTagBySignature +#define skcms_GetWTPT jxl_skcms_GetWTPT +#define skcms_Identity_TransferFunction jxl_skcms_Identity_TransferFunction +#define skcms_MakeUsableAsDestination jxl_skcms_MakeUsableAsDestination +#define skcms_MakeUsableAsDestinationWithSingleCurve \ + jxl_skcms_MakeUsableAsDestinationWithSingleCurve +#define skcms_Matrix3x3_concat jxl_skcms_Matrix3x3_concat +#define skcms_Matrix3x3_invert jxl_skcms_Matrix3x3_invert +#define skcms_MaxRoundtripError jxl_skcms_MaxRoundtripError +#define skcms_Parse jxl_skcms_Parse +#define skcms_PrimariesToXYZD50 jxl_skcms_PrimariesToXYZD50 +#define skcms_sRGB_Inverse_TransferFunction \ + jxl_skcms_sRGB_Inverse_TransferFunction +#define skcms_sRGB_profile jxl_skcms_sRGB_profile +#define skcms_sRGB_TransferFunction jxl_skcms_sRGB_TransferFunction +#define skcms_TransferFunction_eval jxl_skcms_TransferFunction_eval +#define skcms_TransferFunction_invert jxl_skcms_TransferFunction_invert +#define skcms_TransferFunction_makeHLGish jxl_skcms_TransferFunction_makeHLGish +#define skcms_TransferFunction_makePQish jxl_skcms_TransferFunction_makePQish +#define skcms_Transform jxl_skcms_Transform +#define skcms_TransformWithPalette jxl_skcms_TransformWithPalette +#define skcms_TRCs_AreApproximateInverse jxl_skcms_TRCs_AreApproximateInverse +#define skcms_XYZD50_profile jxl_skcms_XYZD50_profile + +#endif // JPEGXL_BUNDLE_SKCMS + +#include "skcms.h" + +#endif // LIB_JXL_ENC_JXL_SKCMS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_linalg.cc b/third_party/jpeg-xl/lib/jxl/enc_linalg.cc new file mode 100644 index 0000000000..fe2090a909 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_linalg.cc @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_linalg.h" + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +void ConvertToDiagonal(const ImageD& A, ImageD* const JXL_RESTRICT diag, + ImageD* const JXL_RESTRICT U) { +#if JXL_ENABLE_ASSERT + JXL_ASSERT(A.xsize() == 2); + JXL_ASSERT(A.ysize() == 2); + JXL_ASSERT(std::abs(A.Row(0)[1] - A.Row(1)[0]) < 1e-15); +#endif + + if (std::abs(A.ConstRow(0)[1]) < 1e-15) { + // Already diagonal. + diag->Row(0)[0] = A.ConstRow(0)[0]; + diag->Row(0)[1] = A.ConstRow(1)[1]; + U->Row(0)[0] = U->Row(1)[1] = 1.0; + U->Row(0)[1] = U->Row(1)[0] = 0.0; + return; + } + double b = -(A.Row(0)[0] + A.Row(1)[1]); + double c = A.Row(0)[0] * A.Row(1)[1] - A.Row(0)[1] * A.Row(0)[1]; + double d = b * b - 4.0 * c; + double sqd = std::sqrt(d); + double l1 = (-b - sqd) * 0.5; + double l2 = (-b + sqd) * 0.5; + + double v1[2] = {A.Row(0)[0] - l1, A.Row(1)[0]}; + double v1n = 1.0 / std::hypot(v1[0], v1[1]); + v1[0] = v1[0] * v1n; + v1[1] = v1[1] * v1n; + + diag->Row(0)[0] = l1; + diag->Row(0)[1] = l2; + + U->Row(0)[0] = v1[1]; + U->Row(0)[1] = -v1[0]; + U->Row(1)[0] = v1[0]; + U->Row(1)[1] = v1[1]; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_linalg.h b/third_party/jpeg-xl/lib/jxl/enc_linalg.h new file mode 100644 index 0000000000..791770d5d4 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_linalg.h @@ -0,0 +1,24 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_LINALG_H_ +#define LIB_JXL_LINALG_H_ + +// Linear algebra. + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/image.h" + +namespace jxl { + +using ImageD = Plane; + +// A is symmetric, U is orthogonal, and A = U * Diagonal(diag) * Transpose(U). +void ConvertToDiagonal(const ImageD& A, ImageD* JXL_RESTRICT diag, + ImageD* JXL_RESTRICT U); + +} // namespace jxl + +#endif // LIB_JXL_LINALG_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc b/third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc new file mode 100644 index 0000000000..967b9a3afb --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc @@ -0,0 +1,118 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_linalg.h" + +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +ImageD Identity(const size_t N) { + ImageD out(N, N); + for (size_t i = 0; i < N; ++i) { + double* JXL_RESTRICT row = out.Row(i); + std::fill(row, row + N, 0); + row[i] = 1.0; + } + return out; +} + +ImageD Diagonal(const ImageD& d) { + JXL_ASSERT(d.ysize() == 1); + ImageD out(d.xsize(), d.xsize()); + const double* JXL_RESTRICT row_diag = d.Row(0); + for (size_t k = 0; k < d.xsize(); ++k) { + double* JXL_RESTRICT row_out = out.Row(k); + std::fill(row_out, row_out + d.xsize(), 0.0); + row_out[k] = row_diag[k]; + } + return out; +} + +ImageD MatMul(const ImageD& A, const ImageD& B) { + JXL_ASSERT(A.ysize() == B.xsize()); + ImageD out(A.xsize(), B.ysize()); + for (size_t y = 0; y < B.ysize(); ++y) { + const double* const JXL_RESTRICT row_b = B.Row(y); + double* const JXL_RESTRICT row_out = out.Row(y); + for (size_t x = 0; x < A.xsize(); ++x) { + row_out[x] = 0.0; + for (size_t k = 0; k < B.xsize(); ++k) { + row_out[x] += A.Row(k)[x] * row_b[k]; + } + } + } + return out; +} + +ImageD Transpose(const ImageD& A) { + ImageD out(A.ysize(), A.xsize()); + for (size_t x = 0; x < A.xsize(); ++x) { + double* const JXL_RESTRICT row_out = out.Row(x); + for (size_t y = 0; y < A.ysize(); ++y) { + row_out[y] = A.Row(y)[x]; + } + } + return out; +} + +ImageD RandomSymmetricMatrix(const size_t N, Rng& rng, const double vmin, + const double vmax) { + ImageD A(N, N); + GenerateImage(rng, &A, vmin, vmax); + for (size_t i = 0; i < N; ++i) { + for (size_t j = 0; j < i; ++j) { + A.Row(j)[i] = A.Row(i)[j]; + } + } + return A; +} + +void VerifyMatrixEqual(const ImageD& A, const ImageD& B, const double eps) { + ASSERT_EQ(A.xsize(), B.xsize()); + ASSERT_EQ(A.ysize(), B.ysize()); + for (size_t y = 0; y < A.ysize(); ++y) { + for (size_t x = 0; x < A.xsize(); ++x) { + ASSERT_NEAR(A.Row(y)[x], B.Row(y)[x], eps); + } + } +} + +void VerifyOrthogonal(const ImageD& A, const double eps) { + VerifyMatrixEqual(Identity(A.xsize()), MatMul(Transpose(A), A), eps); +} + +TEST(LinAlgTest, ConvertToDiagonal) { + { + ImageD I = Identity(2); + ImageD U(2, 2), d(2, 1); + ConvertToDiagonal(I, &d, &U); + VerifyMatrixEqual(I, U, 1e-15); + for (size_t k = 0; k < 2; ++k) { + ASSERT_NEAR(d.Row(0)[k], 1.0, 1e-15); + } + } + { + ImageD A = Identity(2); + A.Row(0)[1] = A.Row(1)[0] = 2.0; + ImageD U(2, 2), d(2, 1); + ConvertToDiagonal(A, &d, &U); + VerifyOrthogonal(U, 1e-12); + VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12); + } + Rng rng(0); + for (size_t i = 0; i < 100; ++i) { + ImageD A = RandomSymmetricMatrix(2, rng, -1.0, 1.0); + ImageD U(2, 2), d(2, 1); + ConvertToDiagonal(A, &d, &U); + VerifyOrthogonal(U, 1e-12); + VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12); + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_modular.cc b/third_party/jpeg-xl/lib/jxl/enc_modular.cc new file mode 100644 index 0000000000..0453b34654 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_modular.cc @@ -0,0 +1,1762 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_modular.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cluster.h" +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/enc_gaborish.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/encoding/enc_debug_tree.h" +#include "lib/jxl/modular/encoding/enc_encoding.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/encoding/ma_common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/modular/transform/enc_transform.h" +#include "lib/jxl/toc.h" + +namespace jxl { + +namespace { +constexpr bool kPrintTree = false; + +// Squeeze default quantization factors +// these quantization factors are for -Q 50 (other qualities simply scale the +// factors; things are rounded down and obviously cannot get below 1) +static const float squeeze_quality_factor = + 0.35; // for easy tweaking of the quality range (decrease this number for + // higher quality) +static const float squeeze_luma_factor = + 1.1; // for easy tweaking of the balance between luma (or anything + // non-chroma) and chroma (decrease this number for higher quality + // luma) +static const float squeeze_quality_factor_xyb = 2.4f; +static const float squeeze_xyb_qtable[3][16] = { + {163.84, 81.92, 40.96, 20.48, 10.24, 5.12, 2.56, 1.28, 0.64, 0.32, 0.16, + 0.08, 0.04, 0.02, 0.01, 0.005}, // Y + {1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5, + 0.5}, // X + {2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, + 0.5}, // B-Y +}; + +static const float squeeze_luma_qtable[16] = { + 163.84, 81.92, 40.96, 20.48, 10.24, 5.12, 2.56, 1.28, + 0.64, 0.32, 0.16, 0.08, 0.04, 0.02, 0.01, 0.005}; +// for 8-bit input, the range of YCoCg chroma is -255..255 so basically this +// does 4:2:0 subsampling (two most fine grained layers get quantized away) +static const float squeeze_chroma_qtable[16] = { + 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5, 0.5}; + +// `cutoffs` must be sorted. +Tree MakeFixedTree(int property, const std::vector& cutoffs, + Predictor pred, size_t num_pixels) { + size_t log_px = CeilLog2Nonzero(num_pixels); + size_t min_gap = 0; + // Reduce fixed tree height when encoding small images. + if (log_px < 14) { + min_gap = 8 * (14 - log_px); + } + Tree tree; + struct NodeInfo { + size_t begin, end, pos; + }; + std::queue q; + // Leaf IDs will be set by roundtrip decoding the tree. + tree.push_back(PropertyDecisionNode::Leaf(pred)); + q.push(NodeInfo{0, cutoffs.size(), 0}); + while (!q.empty()) { + NodeInfo info = q.front(); + q.pop(); + if (info.begin + min_gap >= info.end) continue; + uint32_t split = (info.begin + info.end) / 2; + tree[info.pos] = + PropertyDecisionNode::Split(property, cutoffs[split], tree.size()); + q.push(NodeInfo{split + 1, info.end, tree.size()}); + tree.push_back(PropertyDecisionNode::Leaf(pred)); + q.push(NodeInfo{info.begin, split, tree.size()}); + tree.push_back(PropertyDecisionNode::Leaf(pred)); + } + return tree; +} + +Tree PredefinedTree(ModularOptions::TreeKind tree_kind, size_t total_pixels) { + if (tree_kind == ModularOptions::TreeKind::kJpegTranscodeACMeta || + tree_kind == ModularOptions::TreeKind::kTrivialTreeNoPredictor) { + // All the data is 0, so no need for a fancy tree. + return {PropertyDecisionNode::Leaf(Predictor::Zero)}; + } + if (tree_kind == ModularOptions::TreeKind::kFalconACMeta) { + // All the data is 0 except the quant field. TODO(veluca): make that 0 too. + return {PropertyDecisionNode::Leaf(Predictor::Left)}; + } + if (tree_kind == ModularOptions::TreeKind::kACMeta) { + // Small image. + if (total_pixels < 1024) { + return {PropertyDecisionNode::Leaf(Predictor::Left)}; + } + Tree tree; + // 0: c > 1 + tree.push_back(PropertyDecisionNode::Split(0, 1, 1)); + // 1: c > 2 + tree.push_back(PropertyDecisionNode::Split(0, 2, 3)); + // 2: c > 0 + tree.push_back(PropertyDecisionNode::Split(0, 0, 5)); + // 3: EPF control field (all 0 or 4), top > 0 + tree.push_back(PropertyDecisionNode::Split(6, 0, 21)); + // 4: ACS+QF, y > 0 + tree.push_back(PropertyDecisionNode::Split(2, 0, 7)); + // 5: CfL x + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient)); + // 6: CfL b + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient)); + // 7: QF: split according to the left quant value. + tree.push_back(PropertyDecisionNode::Split(7, 5, 9)); + // 8: ACS: split in 4 segments (8x8 from 0 to 3, large square 4-5, large + // rectangular 6-11, 8x8 12+), according to previous ACS value. + tree.push_back(PropertyDecisionNode::Split(7, 5, 15)); + // QF + tree.push_back(PropertyDecisionNode::Split(7, 11, 11)); + tree.push_back(PropertyDecisionNode::Split(7, 3, 13)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left)); + // ACS + tree.push_back(PropertyDecisionNode::Split(7, 11, 17)); + tree.push_back(PropertyDecisionNode::Split(7, 3, 19)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + // EPF, left > 0 + tree.push_back(PropertyDecisionNode::Split(7, 0, 23)); + tree.push_back(PropertyDecisionNode::Split(7, 0, 25)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + return tree; + } + if (tree_kind == ModularOptions::TreeKind::kWPFixedDC) { + std::vector cutoffs = { + -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15, + -11, -7, -4, -3, -1, 0, 1, 3, 5, 7, 11, + 15, 23, 31, 47, 63, 95, 127, 191, 255, 392, 500}; + return MakeFixedTree(kWPProp, cutoffs, Predictor::Weighted, total_pixels); + } + if (tree_kind == ModularOptions::TreeKind::kGradientFixedDC) { + std::vector cutoffs = { + -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15, + -11, -7, -4, -3, -1, 0, 1, 3, 5, 7, 11, + 15, 23, 31, 47, 63, 95, 127, 191, 255, 392, 500}; + return MakeFixedTree(kGradientProp, cutoffs, Predictor::Gradient, + total_pixels); + } + JXL_ABORT("Unreachable"); + return {}; +} + +// Merges the trees in `trees` using nodes that decide on stream_id, as defined +// by `tree_splits`. +void MergeTrees(const std::vector& trees, + const std::vector& tree_splits, size_t begin, + size_t end, Tree* tree) { + JXL_ASSERT(trees.size() + 1 == tree_splits.size()); + JXL_ASSERT(end > begin); + JXL_ASSERT(end <= trees.size()); + if (end == begin + 1) { + // Insert the tree, adding the opportune offset to all child nodes. + // This will make the leaf IDs wrong, but subsequent roundtripping will fix + // them. + size_t sz = tree->size(); + tree->insert(tree->end(), trees[begin].begin(), trees[begin].end()); + for (size_t i = sz; i < tree->size(); i++) { + (*tree)[i].lchild += sz; + (*tree)[i].rchild += sz; + } + return; + } + size_t mid = (begin + end) / 2; + size_t splitval = tree_splits[mid] - 1; + size_t cur = tree->size(); + tree->emplace_back(1 /*stream_id*/, splitval, 0, 0, Predictor::Zero, 0, 1); + (*tree)[cur].lchild = tree->size(); + MergeTrees(trees, tree_splits, mid, end, tree); + (*tree)[cur].rchild = tree->size(); + MergeTrees(trees, tree_splits, begin, mid, tree); +} + +void QuantizeChannel(Channel& ch, const int q) { + if (q == 1) return; + for (size_t y = 0; y < ch.plane.ysize(); y++) { + pixel_type* row = ch.plane.Row(y); + for (size_t x = 0; x < ch.plane.xsize(); x++) { + if (row[x] < 0) { + row[x] = -((-row[x] + q / 2) / q) * q; + } else { + row[x] = ((row[x] + q / 2) / q) * q; + } + } + } +} + +// convert binary32 float that corresponds to custom [bits]-bit float (with +// [exp_bits] exponent bits) to a [bits]-bit integer representation that should +// fit in pixel_type +Status float_to_int(const float* const row_in, pixel_type* const row_out, + size_t xsize, unsigned int bits, unsigned int exp_bits, + bool fp, double dfactor) { + JXL_ASSERT(sizeof(pixel_type) * 8 >= bits); + if (!fp) { + if (bits > 22) { + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row_in[x] * dfactor + (row_in[x] < 0 ? -0.5 : 0.5); + } + } else { + float factor = dfactor; + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row_in[x] * factor + (row_in[x] < 0 ? -0.5f : 0.5f); + } + } + return true; + } + if (bits == 32 && fp) { + JXL_ASSERT(exp_bits == 8); + memcpy((void*)row_out, (const void*)row_in, 4 * xsize); + return true; + } + + int exp_bias = (1 << (exp_bits - 1)) - 1; + int max_exp = (1 << exp_bits) - 1; + uint32_t sign = (1u << (bits - 1)); + int mant_bits = bits - exp_bits - 1; + int mant_shift = 23 - mant_bits; + for (size_t x = 0; x < xsize; ++x) { + uint32_t f; + memcpy(&f, &row_in[x], 4); + int signbit = (f >> 31); + f &= 0x7fffffff; + if (f == 0) { + row_out[x] = (signbit ? sign : 0); + continue; + } + int exp = (f >> 23) - 127; + if (exp == 128) return JXL_FAILURE("Inf/NaN not allowed"); + int mantissa = (f & 0x007fffff); + // broke up the binary32 into its parts, now reassemble into + // arbitrary float + exp += exp_bias; + if (exp < 0) { // will become a subnormal number + // add implicit leading 1 to mantissa + mantissa |= 0x00800000; + if (exp < -mant_bits) { + return JXL_FAILURE( + "Invalid float number: %g cannot be represented with %i " + "exp_bits and %i mant_bits (exp %i)", + row_in[x], exp_bits, mant_bits, exp); + } + mantissa >>= 1 - exp; + exp = 0; + } + // exp should be representable in exp_bits, otherwise input was + // invalid + if (exp > max_exp) return JXL_FAILURE("Invalid float exponent"); + if (mantissa & ((1 << mant_shift) - 1)) { + return JXL_FAILURE("%g is losing precision (mant: %x)", row_in[x], + mantissa); + } + mantissa >>= mant_shift; + f = (signbit ? sign : 0); + f |= (exp << mant_bits); + f |= mantissa; + row_out[x] = (pixel_type)f; + } + return true; +} +} // namespace + +ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header, + const CompressParams& cparams_orig) + : frame_dim_(frame_header.ToFrameDimensions()), cparams_(cparams_orig) { + size_t num_streams = + ModularStreamId::Num(frame_dim_, frame_header.passes.num_passes); + if (cparams_.ModularPartIsLossless()) { + switch (cparams_.decoding_speed_tier) { + case 0: + break; + case 1: + cparams_.options.wp_tree_mode = ModularOptions::TreeMode::kWPOnly; + break; + case 2: { + cparams_.options.wp_tree_mode = ModularOptions::TreeMode::kGradientOnly; + cparams_.options.predictor = Predictor::Gradient; + break; + } + case 3: { // LZ77, no Gradient. + cparams_.options.nb_repeats = 0; + cparams_.options.predictor = Predictor::Gradient; + break; + } + default: { // LZ77, no predictor. + cparams_.options.nb_repeats = 0; + cparams_.options.predictor = Predictor::Zero; + break; + } + } + } + if (cparams_.decoding_speed_tier >= 1 && cparams_.responsive && + cparams_.ModularPartIsLossless()) { + cparams_.options.tree_kind = + ModularOptions::TreeKind::kTrivialTreeNoPredictor; + cparams_.options.nb_repeats = 0; + } + stream_images_.resize(num_streams); + + // use a sensible default if nothing explicit is specified: + // Squeeze for lossy, no squeeze for lossless + if (cparams_.responsive < 0) { + if (cparams_.ModularPartIsLossless()) { + cparams_.responsive = 0; + } else { + cparams_.responsive = 1; + } + } + + if (cparams_.speed_tier > SpeedTier::kWombat) { + cparams_.options.splitting_heuristics_node_threshold = 192; + } else { + cparams_.options.splitting_heuristics_node_threshold = 96; + } + { + // Set properties. + std::vector prop_order; + if (cparams_.responsive) { + // Properties in order of their likelihood of being useful for Squeeze + // residuals. + prop_order = {0, 1, 4, 5, 6, 7, 8, 15, 9, 10, 11, 12, 13, 14, 2, 3}; + } else { + // Same, but for the non-Squeeze case. + prop_order = {0, 1, 15, 9, 10, 11, 12, 13, 14, 2, 3, 4, 5, 6, 7, 8}; + } + switch (cparams_.speed_tier) { + case SpeedTier::kSquirrel: + cparams_.options.splitting_heuristics_properties.assign( + prop_order.begin(), prop_order.begin() + 8); + cparams_.options.max_property_values = 32; + break; + case SpeedTier::kKitten: + cparams_.options.splitting_heuristics_properties.assign( + prop_order.begin(), prop_order.begin() + 10); + cparams_.options.max_property_values = 64; + break; + case SpeedTier::kTortoise: + cparams_.options.splitting_heuristics_properties = prop_order; + cparams_.options.max_property_values = 256; + break; + default: + cparams_.options.splitting_heuristics_properties.assign( + prop_order.begin(), prop_order.begin() + 6); + cparams_.options.max_property_values = 16; + break; + } + if (cparams_.speed_tier > SpeedTier::kTortoise) { + // Gradient in previous channels. + for (int i = 0; i < cparams_.options.max_properties; i++) { + cparams_.options.splitting_heuristics_properties.push_back( + kNumNonrefProperties + i * 4 + 3); + } + } else { + // All the extra properties in Tortoise mode. + for (int i = 0; i < cparams_.options.max_properties * 4; i++) { + cparams_.options.splitting_heuristics_properties.push_back( + kNumNonrefProperties + i); + } + } + } + + if (cparams_.options.predictor == static_cast(-1)) { + // no explicit predictor(s) given, set a good default + if ((cparams_.speed_tier <= SpeedTier::kTortoise || + cparams_.modular_mode == false) && + cparams_.IsLossless() && cparams_.responsive == false) { + // TODO(veluca): allow all predictors that don't break residual + // multipliers in lossy mode. + cparams_.options.predictor = Predictor::Variable; + } else if (cparams_.responsive || cparams_.lossy_palette) { + // zero predictor for Squeeze residues and lossy palette + cparams_.options.predictor = Predictor::Zero; + } else if (!cparams_.IsLossless()) { + // If not responsive and lossy. TODO(veluca): use near_lossless instead? + cparams_.options.predictor = Predictor::Gradient; + } else if (cparams_.speed_tier < SpeedTier::kFalcon) { + // try median and weighted predictor for anything else + cparams_.options.predictor = Predictor::Best; + } else if (cparams_.speed_tier == SpeedTier::kFalcon) { + // just weighted predictor in falcon mode + cparams_.options.predictor = Predictor::Weighted; + } else if (cparams_.speed_tier > SpeedTier::kFalcon) { + // just gradient predictor in thunder mode + cparams_.options.predictor = Predictor::Gradient; + } + } else { + delta_pred_ = cparams_.options.predictor; + if (cparams_.lossy_palette) cparams_.options.predictor = Predictor::Zero; + } + if (!cparams_.ModularPartIsLossless()) { + if (cparams_.options.predictor == Predictor::Weighted || + cparams_.options.predictor == Predictor::Variable || + cparams_.options.predictor == Predictor::Best) + cparams_.options.predictor = Predictor::Zero; + } + tree_splits_.push_back(0); + if (cparams_.modular_mode == false) { + cparams_.options.fast_decode_multiplier = 1.0f; + tree_splits_.push_back(ModularStreamId::VarDCTDC(0).ID(frame_dim_)); + tree_splits_.push_back(ModularStreamId::ModularDC(0).ID(frame_dim_)); + tree_splits_.push_back(ModularStreamId::ACMetadata(0).ID(frame_dim_)); + tree_splits_.push_back(ModularStreamId::QuantTable(0).ID(frame_dim_)); + tree_splits_.push_back(ModularStreamId::ModularAC(0, 0).ID(frame_dim_)); + ac_metadata_size.resize(frame_dim_.num_dc_groups); + extra_dc_precision.resize(frame_dim_.num_dc_groups); + } + tree_splits_.push_back(num_streams); + cparams_.options.max_chan_size = frame_dim_.group_dim; + cparams_.options.group_dim = frame_dim_.group_dim; + + // TODO(veluca): figure out how to use different predictor sets per channel. + stream_options_.resize(num_streams, cparams_.options); +} + +bool do_transform(Image& image, const Transform& tr, + const weighted::Header& wp_header, + jxl::ThreadPool* pool = nullptr, bool force_jxlart = false) { + Transform t = tr; + bool did_it = true; + if (force_jxlart) { + if (!t.MetaApply(image)) return false; + } else { + did_it = TransformForward(t, image, wp_header, pool); + } + if (did_it) image.transform.push_back(t); + return did_it; +} + +Status ModularFrameEncoder::ComputeEncodingData( + const FrameHeader& frame_header, const ImageMetadata& metadata, + Image3F* JXL_RESTRICT color, const std::vector& extra_channels, + PassesEncoderState* JXL_RESTRICT enc_state, const JxlCmsInterface& cms, + ThreadPool* pool, AuxOut* aux_out, bool do_color) { + JXL_DEBUG_V(6, "Computing modular encoding data for frame %s", + frame_header.DebugString().c_str()); + + if (do_color && frame_header.loop_filter.gab) { + float w = 0.9908511000000001f; + float weights[3] = {w, w, w}; + GaborishInverse(color, weights, pool); + } + + if (do_color && metadata.bit_depth.bits_per_sample <= 16 && + cparams_.speed_tier < SpeedTier::kCheetah && + cparams_.decoding_speed_tier < 2) { + FindBestPatchDictionary(*color, enc_state, cms, nullptr, aux_out, + cparams_.color_transform == ColorTransform::kXYB); + PatchDictionaryEncoder::SubtractFrom( + enc_state->shared.image_features.patches, color); + } + + // Convert ImageBundle to modular Image object + const size_t xsize = frame_dim_.xsize; + const size_t ysize = frame_dim_.ysize; + + int nb_chans = 3; + if (metadata.color_encoding.IsGray() && + cparams_.color_transform == ColorTransform::kNone) { + nb_chans = 1; + } + if (!do_color) nb_chans = 0; + + nb_chans += extra_channels.size(); + + bool fp = metadata.bit_depth.floating_point_sample && + cparams_.color_transform != ColorTransform::kXYB; + + // bits_per_sample is just metadata for XYB images. + if (metadata.bit_depth.bits_per_sample >= 32 && do_color && + cparams_.color_transform != ColorTransform::kXYB) { + if (metadata.bit_depth.bits_per_sample == 32 && fp == false) { + return JXL_FAILURE("uint32_t not supported in enc_modular"); + } else if (metadata.bit_depth.bits_per_sample > 32) { + return JXL_FAILURE("bits_per_sample > 32 not supported"); + } + } + + // in the non-float case, there is an implicit 0 sign bit + int max_bitdepth = + do_color ? metadata.bit_depth.bits_per_sample + (fp ? 0 : 1) : 0; + Image& gi = stream_images_[0]; + gi = Image(xsize, ysize, metadata.bit_depth.bits_per_sample, nb_chans); + int c = 0; + if (cparams_.color_transform == ColorTransform::kXYB && + cparams_.modular_mode == true) { + float enc_factors[3] = {32768.0f, 2048.0f, 2048.0f}; + if (cparams_.butteraugli_distance > 0 && !cparams_.responsive) { + // quantize XYB here and then treat it as a lossless image + enc_factors[0] *= 1.f / (1.f + 23.f * cparams_.butteraugli_distance); + enc_factors[1] *= 1.f / (1.f + 14.f * cparams_.butteraugli_distance); + enc_factors[2] *= 1.f / (1.f + 14.f * cparams_.butteraugli_distance); + cparams_.butteraugli_distance = 0; + } + if (cparams_.manual_xyb_factors.size() == 3) { + DequantMatricesSetCustomDC(&enc_state->shared.matrices, + cparams_.manual_xyb_factors.data()); + // TODO(jon): update max_bitdepth in this case + } else { + DequantMatricesSetCustomDC(&enc_state->shared.matrices, enc_factors); + max_bitdepth = 12; + } + } + pixel_type maxval = gi.bitdepth < 32 ? (1u << gi.bitdepth) - 1 : 0; + if (do_color) { + for (; c < 3; c++) { + if (metadata.color_encoding.IsGray() && + cparams_.color_transform == ColorTransform::kNone && + c != (cparams_.color_transform == ColorTransform::kXYB ? 1 : 0)) + continue; + int c_out = c; + // XYB is encoded as YX(B-Y) + if (cparams_.color_transform == ColorTransform::kXYB && c < 2) + c_out = 1 - c_out; + double factor = maxval; + if (cparams_.color_transform == ColorTransform::kXYB) + factor = enc_state->shared.matrices.InvDCQuant(c); + if (c == 2 && cparams_.color_transform == ColorTransform::kXYB) { + JXL_ASSERT(!fp); + for (size_t y = 0; y < ysize; ++y) { + const float* const JXL_RESTRICT row_in = color->PlaneRow(c, y); + pixel_type* const JXL_RESTRICT row_out = gi.channel[c_out].Row(y); + pixel_type* const JXL_RESTRICT row_Y = gi.channel[0].Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row_in[x] * factor + 0.5f; + row_out[x] -= row_Y[x]; + // zero the lsb of B + row_out[x] = row_out[x] / 2 * 2; + } + } + } else { + int bits = metadata.bit_depth.bits_per_sample; + int exp_bits = metadata.bit_depth.exponent_bits_per_sample; + gi.channel[c_out].hshift = + enc_state->shared.frame_header.chroma_subsampling.HShift(c); + gi.channel[c_out].vshift = + enc_state->shared.frame_header.chroma_subsampling.VShift(c); + size_t xsize_shifted = DivCeil(xsize, 1 << gi.channel[c_out].hshift); + size_t ysize_shifted = DivCeil(ysize, 1 << gi.channel[c_out].vshift); + gi.channel[c_out].shrink(xsize_shifted, ysize_shifted); + std::atomic has_error{false}; + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, ysize_shifted, ThreadPool::NoInit, + [&](const int task, const int thread) { + const size_t y = task; + const float* const JXL_RESTRICT row_in = color->PlaneRow(c, y); + pixel_type* const JXL_RESTRICT row_out = gi.channel[c_out].Row(y); + if (!float_to_int(row_in, row_out, xsize_shifted, bits, exp_bits, + fp, factor)) { + has_error = true; + }; + }, + "float2int")); + if (has_error) { + return JXL_FAILURE("Error in float to integer conversion"); + } + } + } + if (metadata.color_encoding.IsGray() && + cparams_.color_transform == ColorTransform::kNone) + c = 1; + } + + for (size_t ec = 0; ec < extra_channels.size(); ec++, c++) { + const ExtraChannelInfo& eci = metadata.extra_channel_info[ec]; + size_t ecups = frame_header.extra_channel_upsampling[ec]; + gi.channel[c].shrink(DivCeil(frame_dim_.xsize_upsampled, ecups), + DivCeil(frame_dim_.ysize_upsampled, ecups)); + gi.channel[c].hshift = gi.channel[c].vshift = + CeilLog2Nonzero(ecups) - CeilLog2Nonzero(frame_header.upsampling); + + int bits = eci.bit_depth.bits_per_sample; + int exp_bits = eci.bit_depth.exponent_bits_per_sample; + bool fp = eci.bit_depth.floating_point_sample; + double factor = (fp ? 1 : ((1u << eci.bit_depth.bits_per_sample) - 1)); + if (bits + (fp ? 0 : 1) > max_bitdepth) max_bitdepth = bits + (fp ? 0 : 1); + std::atomic has_error{false}; + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, gi.channel[c].plane.ysize(), ThreadPool::NoInit, + [&](const int task, const int thread) { + const size_t y = task; + const float* const JXL_RESTRICT row_in = extra_channels[ec].Row(y); + pixel_type* const JXL_RESTRICT row_out = gi.channel[c].Row(y); + if (!float_to_int(row_in, row_out, gi.channel[c].plane.xsize(), bits, + exp_bits, fp, factor)) { + has_error = true; + }; + }, + "float2int")); + if (has_error) return JXL_FAILURE("Error in float to integer conversion"); + } + JXL_ASSERT(c == nb_chans); + + int level_max_bitdepth = (cparams_.level == 5 ? 16 : 32); + if (max_bitdepth > level_max_bitdepth) + return JXL_FAILURE( + "Bitdepth too high for level %i (need %i bits, have only %i in this " + "level)", + cparams_.level, max_bitdepth, level_max_bitdepth); + + // Set options and apply transformations + if (!cparams_.ModularPartIsLossless()) { + if (cparams_.palette_colors != 0) { + JXL_DEBUG_V(3, "Lossy encode, not doing palette transforms"); + } + if (cparams_.color_transform == ColorTransform::kXYB) { + cparams_.channel_colors_pre_transform_percent = 0; + } + cparams_.channel_colors_percent = 0; + cparams_.palette_colors = 0; + cparams_.lossy_palette = false; + } + + // if few colors, do all-channel palette before trying channel palette + // Logic is as follows: + // - if you can make a palette with few colors (arbitrary threshold: 200), + // then you can also make channel palettes, but they will just be extra + // signaling cost for almost no benefit + // - if the palette needs more colors, then channel palette might help to + // reduce palette signaling cost + if (cparams_.palette_colors != 0 && + cparams_.speed_tier < SpeedTier::kFalcon) { + // all-channel palette (e.g. RGBA) + if (gi.channel.size() > 1) { + Transform maybe_palette(TransformId::kPalette); + maybe_palette.begin_c = gi.nb_meta_channels; + maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels; + maybe_palette.nb_colors = + std::min(std::min(200, (int)(xsize * ysize / 8)), + std::abs(cparams_.palette_colors) / 16); + maybe_palette.ordered_palette = cparams_.palette_colors >= 0; + maybe_palette.lossy_palette = false; + do_transform(gi, maybe_palette, weighted::Header(), pool); + } + } + + // Global channel palette + if (cparams_.channel_colors_pre_transform_percent > 0 && + !cparams_.lossy_palette && + (cparams_.speed_tier <= SpeedTier::kThunder || + (do_color && metadata.bit_depth.bits_per_sample > 8))) { + // single channel palette (like FLIF's ChannelCompact) + size_t nb_channels = gi.channel.size() - gi.nb_meta_channels; + int orig_bitdepth = max_bitdepth; + max_bitdepth = 0; + for (size_t i = 0; i < nb_channels; i++) { + int32_t min, max; + compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max); + int64_t colors = (int64_t)max - min + 1; + JXL_DEBUG_V(10, "Channel %" PRIuS ": range=%i..%i", i, min, max); + Transform maybe_palette_1(TransformId::kPalette); + maybe_palette_1.begin_c = i + gi.nb_meta_channels; + maybe_palette_1.num_c = 1; + // simple heuristic: if less than X percent of the values in the range + // actually occur, it is probably worth it to do a compaction + // (but only if the channel palette is less than 6% the size of the + // image itself) + maybe_palette_1.nb_colors = std::min( + (int)(xsize * ysize / 16), + (int)(cparams_.channel_colors_pre_transform_percent / 100. * colors)); + if (do_transform(gi, maybe_palette_1, weighted::Header(), pool)) { + // effective bit depth is lower, adjust quantization accordingly + compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max); + if (max < maxval) maxval = max; + int ch_bitdepth = + (max > 0 ? CeilLog2Nonzero(static_cast(max)) : 0); + if (ch_bitdepth > max_bitdepth) max_bitdepth = ch_bitdepth; + } else + max_bitdepth = orig_bitdepth; + } + } + + // Global palette + if ((cparams_.palette_colors != 0 || cparams_.lossy_palette) && + cparams_.speed_tier < SpeedTier::kFalcon) { + // all-channel palette (e.g. RGBA) + if (gi.channel.size() - gi.nb_meta_channels > 1) { + Transform maybe_palette(TransformId::kPalette); + maybe_palette.begin_c = gi.nb_meta_channels; + maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels; + maybe_palette.nb_colors = + std::min((int)(xsize * ysize / 8), std::abs(cparams_.palette_colors)); + maybe_palette.ordered_palette = cparams_.palette_colors >= 0; + maybe_palette.lossy_palette = + (cparams_.lossy_palette && maybe_palette.num_c == 3); + if (maybe_palette.lossy_palette) { + maybe_palette.predictor = delta_pred_; + } + // TODO(veluca): use a custom weighted header if using the weighted + // predictor. + do_transform(gi, maybe_palette, weighted::Header(), pool, + cparams_.options.zero_tokens); + } + // all-minus-one-channel palette (RGB with separate alpha, or CMY with + // separate K) + if (gi.channel.size() - gi.nb_meta_channels > 3) { + Transform maybe_palette_3(TransformId::kPalette); + maybe_palette_3.begin_c = gi.nb_meta_channels; + maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1; + maybe_palette_3.nb_colors = + std::min((int)(xsize * ysize / 8), std::abs(cparams_.palette_colors)); + maybe_palette_3.ordered_palette = cparams_.palette_colors >= 0; + maybe_palette_3.lossy_palette = cparams_.lossy_palette; + if (maybe_palette_3.lossy_palette) { + maybe_palette_3.predictor = delta_pred_; + } + do_transform(gi, maybe_palette_3, weighted::Header(), pool, + cparams_.options.zero_tokens); + } + } + + // don't do an RCT if we're short on bits + if (cparams_.color_transform == ColorTransform::kNone && do_color && + gi.channel.size() - gi.nb_meta_channels >= 3 && + max_bitdepth + 1 < level_max_bitdepth) { + if (cparams_.colorspace < 0 && (!cparams_.ModularPartIsLossless() || + cparams_.speed_tier > SpeedTier::kHare)) { + Transform ycocg{TransformId::kRCT}; + ycocg.rct_type = 6; + ycocg.begin_c = gi.nb_meta_channels; + do_transform(gi, ycocg, weighted::Header(), pool); + max_bitdepth++; + } else if (cparams_.colorspace > 0) { + Transform sg(TransformId::kRCT); + sg.begin_c = gi.nb_meta_channels; + sg.rct_type = cparams_.colorspace; + do_transform(gi, sg, weighted::Header(), pool); + max_bitdepth++; + } + } + + // don't do squeeze if we don't have some spare bits + if (cparams_.responsive && !gi.channel.empty() && + max_bitdepth + 2 < level_max_bitdepth) { + Transform t(TransformId::kSqueeze); + t.squeezes = cparams_.squeezes; + do_transform(gi, t, weighted::Header(), pool); + max_bitdepth += 2; + } + + if (max_bitdepth + 1 > level_max_bitdepth) { + // force no group RCTs if we don't have a spare bit + cparams_.colorspace = 0; + } + JXL_ASSERT(max_bitdepth <= level_max_bitdepth); + + std::vector quants; + + if (!cparams_.ModularPartIsLossless()) { + quants.resize(gi.channel.size(), 1); + float quantizer = 0.25f; + if (!cparams_.responsive) { + JXL_DEBUG_V(1, + "Warning: lossy compression without Squeeze " + "transform is just color quantization."); + quantizer *= 0.1f; + } + float bitdepth_correction = 1.f; + if (cparams_.color_transform != ColorTransform::kXYB) { + bitdepth_correction = maxval / 255.f; + } + std::vector quantizers; + float dist = cparams_.butteraugli_distance; + for (size_t i = 0; i < 3; i++) { + quantizers.push_back(quantizer * dist * bitdepth_correction); + } + for (size_t i = 0; i < extra_channels.size(); i++) { + int ec_bitdepth = + metadata.extra_channel_info[i].bit_depth.bits_per_sample; + pixel_type ec_maxval = ec_bitdepth < 32 ? (1u << ec_bitdepth) - 1 : 0; + bitdepth_correction = ec_maxval / 255.f; + if (i < cparams_.ec_distance.size()) dist = cparams_.ec_distance[i]; + if (dist < 0) dist = cparams_.butteraugli_distance; + quantizers.push_back(quantizer * dist * bitdepth_correction); + } + if (cparams_.options.nb_repeats == 0) { + return JXL_FAILURE("nb_repeats = 0 not supported with modular lossy!"); + } + for (uint32_t i = gi.nb_meta_channels; i < gi.channel.size(); i++) { + Channel& ch = gi.channel[i]; + int shift = ch.hshift + ch.vshift; // number of pixel halvings + if (shift > 16) shift = 16; + if (shift > 0) shift--; + int q; + // assuming default Squeeze here + int component = + (do_color ? 0 : 3) + ((i - gi.nb_meta_channels) % nb_chans); + // last 4 channels are final chroma residuals + if (nb_chans > 2 && i >= gi.channel.size() - 4 && cparams_.responsive) { + component = 1; + } + if (cparams_.color_transform == ColorTransform::kXYB && component < 3) { + q = quantizers[component] * squeeze_quality_factor_xyb * + squeeze_xyb_qtable[component][shift]; + } else { + if (cparams_.colorspace != 0 && component > 0 && component < 3) { + q = quantizers[component] * squeeze_quality_factor * + squeeze_chroma_qtable[shift]; + } else { + q = quantizers[component] * squeeze_quality_factor * + squeeze_luma_factor * squeeze_luma_qtable[shift]; + } + } + if (q < 1) q = 1; + QuantizeChannel(gi.channel[i], q); + quants[i] = q; + } + } + + // Fill other groups. + struct GroupParams { + Rect rect; + int minShift; + int maxShift; + ModularStreamId id; + }; + std::vector stream_params; + + stream_options_[0] = cparams_.options; + + // DC + for (size_t group_id = 0; group_id < frame_dim_.num_dc_groups; group_id++) { + const size_t gx = group_id % frame_dim_.xsize_dc_groups; + const size_t gy = group_id / frame_dim_.xsize_dc_groups; + const Rect rect(gx * frame_dim_.dc_group_dim, gy * frame_dim_.dc_group_dim, + frame_dim_.dc_group_dim, frame_dim_.dc_group_dim); + // minShift==3 because (frame_dim.dc_group_dim >> 3) == frame_dim.group_dim + // maxShift==1000 is infinity + stream_params.push_back( + GroupParams{rect, 3, 1000, ModularStreamId::ModularDC(group_id)}); + } + // AC global -> nothing. + // AC + for (size_t group_id = 0; group_id < frame_dim_.num_groups; group_id++) { + const size_t gx = group_id % frame_dim_.xsize_groups; + const size_t gy = group_id / frame_dim_.xsize_groups; + const Rect mrect(gx * frame_dim_.group_dim, gy * frame_dim_.group_dim, + frame_dim_.group_dim, frame_dim_.group_dim); + for (size_t i = 0; i < enc_state->progressive_splitter.GetNumPasses(); + i++) { + int maxShift, minShift; + frame_header.passes.GetDownsamplingBracket(i, minShift, maxShift); + stream_params.push_back(GroupParams{ + mrect, minShift, maxShift, ModularStreamId::ModularAC(group_id, i)}); + } + } + // if there's only one group, everything ends up in GlobalModular + // in that case, also try RCTs/WP params for the one group + if (stream_params.size() == 2) { + stream_params.push_back(GroupParams{Rect(0, 0, xsize, ysize), 0, 1000, + ModularStreamId::Global()}); + } + gi_channel_.resize(stream_images_.size()); + + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, stream_params.size(), ThreadPool::NoInit, + [&](const uint32_t i, size_t /* thread */) { + stream_options_[stream_params[i].id.ID(frame_dim_)] = cparams_.options; + JXL_CHECK(PrepareStreamParams( + stream_params[i].rect, cparams_, stream_params[i].minShift, + stream_params[i].maxShift, stream_params[i].id, do_color)); + }, + "ChooseParams")); + { + // Clear out channels that have been copied to groups. + Image& full_image = stream_images_[0]; + size_t c = full_image.nb_meta_channels; + for (; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + if (fc.w > frame_dim_.group_dim || fc.h > frame_dim_.group_dim) break; + } + for (; c < full_image.channel.size(); c++) { + full_image.channel[c].plane = ImageI(); + } + } + + if (!quants.empty()) { + for (uint32_t stream_id = 0; stream_id < stream_images_.size(); + stream_id++) { + // skip non-modular stream_ids + if (stream_id > 0 && gi_channel_[stream_id].empty()) continue; + const Image& image = stream_images_[stream_id]; + const ModularOptions& options = stream_options_[stream_id]; + for (uint32_t i = image.nb_meta_channels; i < image.channel.size(); i++) { + if (i >= image.nb_meta_channels && + (image.channel[i].w > options.max_chan_size || + image.channel[i].h > options.max_chan_size)) { + continue; + } + if (stream_id > 0 && gi_channel_[stream_id].empty()) continue; + size_t ch_id = stream_id == 0 + ? i + : gi_channel_[stream_id][i - image.nb_meta_channels]; + uint32_t q = quants[ch_id]; + // Inform the tree splitting heuristics that each channel in each group + // used this quantization factor. This will produce a tree with the + // given multipliers. + if (multiplier_info_.empty() || + multiplier_info_.back().range[1][0] != stream_id || + multiplier_info_.back().multiplier != q) { + StaticPropRange range; + range[0] = {{i, i + 1}}; + range[1] = {{stream_id, stream_id + 1}}; + multiplier_info_.push_back({range, (uint32_t)q}); + } else { + // Previous channel in the same group had the same quantization + // factor. Don't provide two different ranges, as that creates + // unnecessary nodes. + multiplier_info_.back().range[0][1] = i + 1; + } + } + } + // Merge group+channel settings that have the same channels and quantization + // factors, to avoid unnecessary nodes. + std::sort(multiplier_info_.begin(), multiplier_info_.end(), + [](ModularMultiplierInfo a, ModularMultiplierInfo b) { + return std::make_tuple(a.range, a.multiplier) < + std::make_tuple(b.range, b.multiplier); + }); + size_t new_num = 1; + for (size_t i = 1; i < multiplier_info_.size(); i++) { + ModularMultiplierInfo& prev = multiplier_info_[new_num - 1]; + ModularMultiplierInfo& cur = multiplier_info_[i]; + if (prev.range[0] == cur.range[0] && prev.multiplier == cur.multiplier && + prev.range[1][1] == cur.range[1][0]) { + prev.range[1][1] = cur.range[1][1]; + } else { + multiplier_info_[new_num++] = multiplier_info_[i]; + } + } + multiplier_info_.resize(new_num); + } + + JXL_RETURN_IF_ERROR(ValidateChannelDimensions(gi, stream_options_[0])); + + return PrepareEncoding(frame_header, pool, enc_state->heuristics.get(), + aux_out); +} + +Status ModularFrameEncoder::PrepareEncoding(const FrameHeader& frame_header, + ThreadPool* pool, + EncoderHeuristics* heuristics, + AuxOut* aux_out) { + if (!tree_.empty()) return true; + + // Compute tree. + size_t num_streams = stream_images_.size(); + stream_headers_.resize(num_streams); + tokens_.resize(num_streams); + + if (heuristics->CustomFixedTreeLossless(frame_dim_, &tree_)) { + // Using a fixed tree. + } else if (cparams_.speed_tier < SpeedTier::kFalcon || + !cparams_.modular_mode) { + // Avoid creating a tree with leaves that don't correspond to any pixels. + std::vector useful_splits; + useful_splits.reserve(tree_splits_.size()); + for (size_t chunk = 0; chunk < tree_splits_.size() - 1; chunk++) { + bool has_pixels = false; + size_t start = tree_splits_[chunk]; + size_t stop = tree_splits_[chunk + 1]; + for (size_t i = start; i < stop; i++) { + if (!stream_images_[i].empty()) has_pixels = true; + } + if (has_pixels) { + useful_splits.push_back(tree_splits_[chunk]); + } + } + // Don't do anything if modular mode does not have any pixels in this image + if (useful_splits.empty()) return true; + useful_splits.push_back(tree_splits_.back()); + + std::atomic_flag invalid_force_wp = ATOMIC_FLAG_INIT; + + std::vector trees(useful_splits.size() - 1); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, useful_splits.size() - 1, ThreadPool::NoInit, + [&](const uint32_t chunk, size_t /* thread */) { + // TODO(veluca): parallelize more. + size_t total_pixels = 0; + uint32_t start = useful_splits[chunk]; + uint32_t stop = useful_splits[chunk + 1]; + while (start < stop && stream_images_[start].empty()) ++start; + while (start < stop && stream_images_[stop - 1].empty()) --stop; + uint32_t max_c = 0; + if (stream_options_[start].tree_kind != + ModularOptions::TreeKind::kLearn) { + for (size_t i = start; i < stop; i++) { + for (const Channel& ch : stream_images_[i].channel) { + total_pixels += ch.w * ch.h; + } + } + trees[chunk] = + PredefinedTree(stream_options_[start].tree_kind, total_pixels); + return; + } + TreeSamples tree_samples; + if (!tree_samples.SetPredictor(stream_options_[start].predictor, + stream_options_[start].wp_tree_mode)) { + invalid_force_wp.test_and_set(std::memory_order_acq_rel); + return; + } + if (!tree_samples.SetProperties( + stream_options_[start].splitting_heuristics_properties, + stream_options_[start].wp_tree_mode)) { + invalid_force_wp.test_and_set(std::memory_order_acq_rel); + return; + } + std::vector pixel_samples; + std::vector diff_samples; + std::vector group_pixel_count; + std::vector channel_pixel_count; + for (size_t i = start; i < stop; i++) { + max_c = std::max(stream_images_[i].channel.size(), max_c); + CollectPixelSamples(stream_images_[i], stream_options_[i], i, + group_pixel_count, channel_pixel_count, + pixel_samples, diff_samples); + } + StaticPropRange range; + range[0] = {{0, max_c}}; + range[1] = {{start, stop}}; + auto local_multiplier_info = multiplier_info_; + + tree_samples.PreQuantizeProperties( + range, local_multiplier_info, group_pixel_count, + channel_pixel_count, pixel_samples, diff_samples, + stream_options_[start].max_property_values); + for (size_t i = start; i < stop; i++) { + JXL_CHECK(ModularGenericCompress( + stream_images_[i], stream_options_[i], /*writer=*/nullptr, + /*aux_out=*/nullptr, 0, i, &tree_samples, &total_pixels)); + } + + // TODO(veluca): parallelize more. + trees[chunk] = + LearnTree(std::move(tree_samples), total_pixels, + stream_options_[start], local_multiplier_info, range); + }, + "LearnTrees")); + if (invalid_force_wp.test_and_set(std::memory_order_acq_rel)) { + return JXL_FAILURE("PrepareEncoding: force_no_wp with {Weighted}"); + } + tree_.clear(); + MergeTrees(trees, useful_splits, 0, useful_splits.size() - 1, &tree_); + } else { + // Fixed tree. + size_t total_pixels = 0; + for (const Image& img : stream_images_) { + for (const Channel& ch : img.channel) { + total_pixels += ch.w * ch.h; + } + } + if (cparams_.speed_tier <= SpeedTier::kFalcon) { + tree_ = + PredefinedTree(ModularOptions::TreeKind::kWPFixedDC, total_pixels); + } else if (cparams_.speed_tier <= SpeedTier::kThunder) { + tree_ = PredefinedTree(ModularOptions::TreeKind::kGradientFixedDC, + total_pixels); + } else { + tree_ = {PropertyDecisionNode::Leaf(Predictor::Gradient)}; + } + } + tree_tokens_.resize(1); + tree_tokens_[0].clear(); + Tree decoded_tree; + TokenizeTree(tree_, &tree_tokens_[0], &decoded_tree); + JXL_ASSERT(tree_.size() == decoded_tree.size()); + tree_ = std::move(decoded_tree); + + if (kPrintTree && WantDebugOutput(aux_out)) { + if (frame_header.dc_level > 0) { + PrintTree(tree_, aux_out->debug_prefix + "/dc_frame_level" + + std::to_string(frame_header.dc_level) + "_tree"); + } else { + PrintTree(tree_, aux_out->debug_prefix + "/global_tree"); + } + } + + image_widths_.resize(num_streams); + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, num_streams, ThreadPool::NoInit, + [&](const uint32_t stream_id, size_t /* thread */) { + AuxOut my_aux_out; + if (aux_out) { + my_aux_out.dump_image = aux_out->dump_image; + my_aux_out.debug_prefix = aux_out->debug_prefix; + } + tokens_[stream_id].clear(); + JXL_CHECK(ModularGenericCompress( + stream_images_[stream_id], stream_options_[stream_id], + /*writer=*/nullptr, &my_aux_out, 0, stream_id, + /*tree_samples=*/nullptr, + /*total_pixels=*/nullptr, + /*tree=*/&tree_, /*header=*/&stream_headers_[stream_id], + /*tokens=*/&tokens_[stream_id], + /*widths=*/&image_widths_[stream_id])); + }, + "ComputeTokens")); + return true; +} + +Status ModularFrameEncoder::EncodeGlobalInfo(BitWriter* writer, + AuxOut* aux_out) { + BitWriter::Allotment allotment(writer, 1); + // If we are using brotli, or not using modular mode. + if (tree_tokens_.empty() || tree_tokens_[0].empty()) { + writer->Write(1, 0); + allotment.ReclaimAndCharge(writer, kLayerModularTree, aux_out); + return true; + } + writer->Write(1, 1); + allotment.ReclaimAndCharge(writer, kLayerModularTree, aux_out); + + // Write tree + HistogramParams params; + if (cparams_.speed_tier > SpeedTier::kKitten) { + params.clustering = HistogramParams::ClusteringType::kFast; + params.ans_histogram_strategy = + cparams_.speed_tier > SpeedTier::kThunder + ? HistogramParams::ANSHistogramStrategy::kFast + : HistogramParams::ANSHistogramStrategy::kApproximate; + params.lz77_method = + cparams_.decoding_speed_tier >= 3 && cparams_.modular_mode + ? (cparams_.speed_tier >= SpeedTier::kFalcon + ? HistogramParams::LZ77Method::kRLE + : HistogramParams::LZ77Method::kLZ77) + : HistogramParams::LZ77Method::kNone; + // Near-lossless DC, as well as modular mode, require choosing hybrid uint + // more carefully. + if ((!extra_dc_precision.empty() && extra_dc_precision[0] != 0) || + (cparams_.modular_mode && cparams_.speed_tier < SpeedTier::kCheetah)) { + params.uint_method = HistogramParams::HybridUintMethod::kFast; + } else { + params.uint_method = HistogramParams::HybridUintMethod::kNone; + } + } else if (cparams_.speed_tier <= SpeedTier::kTortoise) { + params.lz77_method = HistogramParams::LZ77Method::kOptimal; + } else { + params.lz77_method = HistogramParams::LZ77Method::kLZ77; + } + if (cparams_.decoding_speed_tier >= 1) { + params.max_histograms = 12; + } + if (cparams_.decoding_speed_tier >= 1 && cparams_.responsive) { + params.lz77_method = cparams_.speed_tier >= SpeedTier::kCheetah + ? HistogramParams::LZ77Method::kRLE + : cparams_.speed_tier >= SpeedTier::kKitten + ? HistogramParams::LZ77Method::kLZ77 + : HistogramParams::LZ77Method::kOptimal; + } + if (cparams_.decoding_speed_tier >= 2 && cparams_.responsive) { + params.uint_method = HistogramParams::HybridUintMethod::k000; + params.force_huffman = true; + } + BuildAndEncodeHistograms(params, kNumTreeContexts, tree_tokens_, &code_, + &context_map_, writer, kLayerModularTree, aux_out); + WriteTokens(tree_tokens_[0], code_, context_map_, writer, kLayerModularTree, + aux_out); + params.image_widths = image_widths_; + // Write histograms. + BuildAndEncodeHistograms(params, (tree_.size() + 1) / 2, tokens_, &code_, + &context_map_, writer, kLayerModularGlobal, aux_out); + return true; +} + +Status ModularFrameEncoder::EncodeStream(BitWriter* writer, AuxOut* aux_out, + size_t layer, + const ModularStreamId& stream) { + size_t stream_id = stream.ID(frame_dim_); + if (stream_images_[stream_id].channel.empty()) { + return true; // Image with no channels, header never gets decoded. + } + JXL_RETURN_IF_ERROR( + Bundle::Write(stream_headers_[stream_id], writer, layer, aux_out)); + WriteTokens(tokens_[stream_id], code_, context_map_, writer, layer, aux_out); + return true; +} + +namespace { +float EstimateWPCost(const Image& img, size_t i) { + size_t extra_bits = 0; + float histo_cost = 0; + HybridUintConfig config; + int32_t cutoffs[] = {-500, -392, -255, -191, -127, -95, -63, -47, -31, + -23, -15, -11, -7, -4, -3, -1, 0, 1, + 3, 5, 7, 11, 15, 23, 31, 47, 63, + 95, 127, 191, 255, 392, 500}; + constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1; + Histogram histo[nc] = {}; + weighted::Header wp_header; + PredictorMode(i, &wp_header); + for (const Channel& ch : img.channel) { + const intptr_t onerow = ch.plane.PixelsPerRow(); + weighted::State wp_state(wp_header, ch.w, ch.h); + Properties properties(1); + for (size_t y = 0; y < ch.h; y++) { + const pixel_type* JXL_RESTRICT r = ch.Row(y); + for (size_t x = 0; x < ch.w; x++) { + size_t offset = 0; + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + pixel_type_w topright = + (x + 1 < ch.w && y ? *(r + x + 1 - onerow) : top); + pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top); + pixel_type guess = wp_state.Predict( + x, y, ch.w, top, left, topright, topleft, toptop, &properties, + offset); + size_t ctx = 0; + for (int c : cutoffs) { + ctx += c >= properties[0]; + } + pixel_type res = r[x] - guess; + uint32_t token, nbits, bits; + config.Encode(PackSigned(res), &token, &nbits, &bits); + histo[ctx].Add(token); + extra_bits += nbits; + wp_state.UpdateErrors(r[x], x, y, ch.w); + } + } + for (size_t h = 0; h < nc; h++) { + histo_cost += histo[h].ShannonEntropy(); + histo[h].Clear(); + } + } + return histo_cost + extra_bits; +} + +float EstimateCost(const Image& img) { + // TODO(veluca): consider SIMDfication of this code. + size_t extra_bits = 0; + float histo_cost = 0; + HybridUintConfig config; + uint32_t cutoffs[] = {0, 1, 3, 5, 7, 11, 15, 23, 31, + 47, 63, 95, 127, 191, 255, 392, 500}; + constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1; + Histogram histo[nc] = {}; + for (const Channel& ch : img.channel) { + const intptr_t onerow = ch.plane.PixelsPerRow(); + for (size_t y = 0; y < ch.h; y++) { + const pixel_type* JXL_RESTRICT r = ch.Row(y); + for (size_t x = 0; x < ch.w; x++) { + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + size_t maxdiff = std::max(std::max(left, top), topleft) - + std::min(std::min(left, top), topleft); + size_t ctx = 0; + for (uint32_t c : cutoffs) { + ctx += c > maxdiff; + } + pixel_type res = r[x] - ClampedGradient(top, left, topleft); + uint32_t token, nbits, bits; + config.Encode(PackSigned(res), &token, &nbits, &bits); + histo[ctx].Add(token); + extra_bits += nbits; + } + } + for (size_t h = 0; h < nc; h++) { + histo_cost += histo[h].ShannonEntropy(); + histo[h].Clear(); + } + } + return histo_cost + extra_bits; +} + +} // namespace + +Status ModularFrameEncoder::PrepareStreamParams(const Rect& rect, + const CompressParams& cparams_, + int minShift, int maxShift, + const ModularStreamId& stream, + bool do_color) { + size_t stream_id = stream.ID(frame_dim_); + Image& full_image = stream_images_[0]; + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + Image& gi = stream_images_[stream_id]; + if (stream_id > 0) { + gi = Image(xsize, ysize, full_image.bitdepth, 0); + // start at the first bigger-than-frame_dim.group_dim non-metachannel + size_t c = full_image.nb_meta_channels; + for (; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + if (fc.w > frame_dim_.group_dim || fc.h > frame_dim_.group_dim) break; + } + for (; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + int shift = std::min(fc.hshift, fc.vshift); + if (shift > maxShift) continue; + if (shift < minShift) continue; + Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift, + rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h); + if (r.xsize() == 0 || r.ysize() == 0) continue; + gi_channel_[stream_id].push_back(c); + Channel gc(r.xsize(), r.ysize()); + gc.hshift = fc.hshift; + gc.vshift = fc.vshift; + for (size_t y = 0; y < r.ysize(); ++y) { + memcpy(gc.Row(y), r.ConstRow(fc.plane, y), + r.xsize() * sizeof(pixel_type)); + } + gi.channel.emplace_back(std::move(gc)); + } + + if (gi.channel.empty()) return true; + // Do some per-group transforms + + // Local palette + // TODO(veluca): make this work with quantize-after-prediction in lossy + // mode. + if (cparams_.butteraugli_distance == 0.f && cparams_.palette_colors != 0 && + cparams_.speed_tier < SpeedTier::kCheetah) { + // all-channel palette (e.g. RGBA) + if (gi.channel.size() - gi.nb_meta_channels > 1) { + Transform maybe_palette(TransformId::kPalette); + maybe_palette.begin_c = gi.nb_meta_channels; + maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels; + maybe_palette.nb_colors = std::abs(cparams_.palette_colors); + maybe_palette.ordered_palette = cparams_.palette_colors >= 0; + do_transform(gi, maybe_palette, weighted::Header()); + } + // all-minus-one-channel palette (RGB with separate alpha, or CMY with + // separate K) + if (gi.channel.size() - gi.nb_meta_channels > 3) { + Transform maybe_palette_3(TransformId::kPalette); + maybe_palette_3.begin_c = gi.nb_meta_channels; + maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1; + maybe_palette_3.nb_colors = std::abs(cparams_.palette_colors); + maybe_palette_3.ordered_palette = cparams_.palette_colors >= 0; + maybe_palette_3.lossy_palette = cparams_.lossy_palette; + if (maybe_palette_3.lossy_palette) { + maybe_palette_3.predictor = Predictor::Weighted; + } + do_transform(gi, maybe_palette_3, weighted::Header()); + } + } + + // Local channel palette + if (cparams_.channel_colors_percent > 0 && + cparams_.butteraugli_distance == 0.f && !cparams_.lossy_palette && + cparams_.speed_tier < SpeedTier::kCheetah && + !(cparams_.responsive && cparams_.decoding_speed_tier >= 1)) { + // single channel palette (like FLIF's ChannelCompact) + size_t nb_channels = gi.channel.size() - gi.nb_meta_channels; + for (size_t i = 0; i < nb_channels; i++) { + int32_t min, max; + compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max); + int64_t colors = (int64_t)max - min + 1; + JXL_DEBUG_V(10, "Channel %" PRIuS ": range=%i..%i", i, min, max); + Transform maybe_palette_1(TransformId::kPalette); + maybe_palette_1.begin_c = i + gi.nb_meta_channels; + maybe_palette_1.num_c = 1; + // simple heuristic: if less than X percent of the values in the range + // actually occur, it is probably worth it to do a compaction + // (but only if the channel palette is less than 80% the size of the + // image itself) + maybe_palette_1.nb_colors = + std::min((int)(xsize * ysize * 0.8), + (int)(cparams_.channel_colors_percent / 100. * colors)); + do_transform(gi, maybe_palette_1, weighted::Header()); + } + } + } + + // lossless and no specific color transform specified: try Nothing, YCoCg, + // and 17 RCTs + if (cparams_.color_transform == ColorTransform::kNone && + cparams_.IsLossless() && cparams_.colorspace < 0 && + gi.channel.size() - gi.nb_meta_channels >= 3 && + cparams_.responsive == false && do_color && + cparams_.speed_tier <= SpeedTier::kHare) { + Transform sg(TransformId::kRCT); + sg.begin_c = gi.nb_meta_channels; + size_t nb_rcts_to_try = 0; + switch (cparams_.speed_tier) { + case SpeedTier::kLightning: + case SpeedTier::kThunder: + case SpeedTier::kFalcon: + case SpeedTier::kCheetah: + nb_rcts_to_try = 0; // Just do global YCoCg + break; + case SpeedTier::kHare: + nb_rcts_to_try = 4; + break; + case SpeedTier::kWombat: + nb_rcts_to_try = 5; + break; + case SpeedTier::kSquirrel: + nb_rcts_to_try = 7; + break; + case SpeedTier::kKitten: + nb_rcts_to_try = 9; + break; + case SpeedTier::kGlacier: + case SpeedTier::kTortoise: + nb_rcts_to_try = 19; + break; + } + float best_cost = std::numeric_limits::max(); + size_t best_rct = 0; + // These should be 19 actually different transforms; the remaining ones + // are equivalent to one of these (note that the first two are do-nothing + // and YCoCg) modulo channel reordering (which only matters in the case of + // MA-with-prev-channels-properties) and/or sign (e.g. RmG vs GmR) + for (int i : {0 * 7 + 0, 0 * 7 + 6, 0 * 7 + 5, 1 * 7 + 3, 3 * 7 + 5, + 5 * 7 + 5, 1 * 7 + 5, 2 * 7 + 5, 1 * 7 + 1, 0 * 7 + 4, + 1 * 7 + 2, 2 * 7 + 1, 2 * 7 + 2, 2 * 7 + 3, 4 * 7 + 4, + 4 * 7 + 5, 0 * 7 + 2, 0 * 7 + 1, 0 * 7 + 3}) { + if (nb_rcts_to_try == 0) break; + sg.rct_type = i; + nb_rcts_to_try--; + if (do_transform(gi, sg, weighted::Header())) { + float cost = EstimateCost(gi); + if (cost < best_cost) { + best_rct = i; + best_cost = cost; + } + Transform t = gi.transform.back(); + JXL_RETURN_IF_ERROR(t.Inverse(gi, weighted::Header(), nullptr)); + gi.transform.pop_back(); + } + } + // Apply the best RCT to the image for future encoding. + sg.rct_type = best_rct; + do_transform(gi, sg, weighted::Header()); + } else { + // No need to try anything, just use the default options. + } + size_t nb_wp_modes = 1; + if (cparams_.speed_tier <= SpeedTier::kTortoise) { + nb_wp_modes = 5; + } else if (cparams_.speed_tier <= SpeedTier::kKitten) { + nb_wp_modes = 2; + } + if (nb_wp_modes > 1 && + (stream_options_[stream_id].predictor == Predictor::Weighted || + stream_options_[stream_id].predictor == Predictor::Best || + stream_options_[stream_id].predictor == Predictor::Variable)) { + float best_cost = std::numeric_limits::max(); + stream_options_[stream_id].wp_mode = 0; + for (size_t i = 0; i < nb_wp_modes; i++) { + float cost = EstimateWPCost(gi, i); + if (cost < best_cost) { + best_cost = cost; + stream_options_[stream_id].wp_mode = i; + } + } + } + return true; +} + +constexpr float q_deadzone = 0.62f; +int QuantizeWP(const int32_t* qrow, size_t onerow, size_t c, size_t x, size_t y, + size_t w, weighted::State* wp_state, float value, + float inv_factor) { + float svalue = value * inv_factor; + PredictionResult pred = + PredictNoTreeWP(w, qrow + x, onerow, x, y, Predictor::Weighted, wp_state); + svalue -= pred.guess; + if (svalue > -q_deadzone && svalue < q_deadzone) svalue = 0; + int residual = roundf(svalue); + if (residual > 2 || residual < -2) residual = roundf(svalue * 0.5) * 2; + return residual + pred.guess; +} + +int QuantizeGradient(const int32_t* qrow, size_t onerow, size_t c, size_t x, + size_t y, size_t w, float value, float inv_factor) { + float svalue = value * inv_factor; + PredictionResult pred = + PredictNoTreeNoWP(w, qrow + x, onerow, x, y, Predictor::Gradient); + svalue -= pred.guess; + if (svalue > -q_deadzone && svalue < q_deadzone) svalue = 0; + int residual = roundf(svalue); + if (residual > 2 || residual < -2) residual = roundf(svalue * 0.5) * 2; + return residual + pred.guess; +} + +void ModularFrameEncoder::AddVarDCTDC(const Image3F& dc, size_t group_index, + bool nl_dc, PassesEncoderState* enc_state, + bool jpeg_transcode) { + const Rect r = enc_state->shared.DCGroupRect(group_index); + extra_dc_precision[group_index] = nl_dc ? 1 : 0; + float mul = 1 << extra_dc_precision[group_index]; + + size_t stream_id = ModularStreamId::VarDCTDC(group_index).ID(frame_dim_); + stream_options_[stream_id].max_chan_size = 0xFFFFFF; + stream_options_[stream_id].predictor = Predictor::Weighted; + stream_options_[stream_id].wp_tree_mode = ModularOptions::TreeMode::kWPOnly; + if (cparams_.speed_tier >= SpeedTier::kSquirrel) { + stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kWPFixedDC; + } + if (cparams_.speed_tier < SpeedTier::kSquirrel && !nl_dc) { + stream_options_[stream_id].predictor = + (cparams_.speed_tier < SpeedTier::kKitten ? Predictor::Variable + : Predictor::Best); + stream_options_[stream_id].wp_tree_mode = + ModularOptions::TreeMode::kDefault; + stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kLearn; + } + if (cparams_.decoding_speed_tier >= 1) { + stream_options_[stream_id].tree_kind = + ModularOptions::TreeKind::kGradientFixedDC; + } + + stream_images_[stream_id] = Image(r.xsize(), r.ysize(), 8, 3); + if (nl_dc && stream_options_[stream_id].tree_kind == + ModularOptions::TreeKind::kGradientFixedDC) { + JXL_ASSERT(enc_state->shared.frame_header.chroma_subsampling.Is444()); + for (size_t c : {1, 0, 2}) { + float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul; + float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul; + float cfl_factor = enc_state->shared.cmap.DCFactors()[c]; + for (size_t y = 0; y < r.ysize(); y++) { + int32_t* quant_row = + stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y); + size_t stride = stream_images_[stream_id] + .channel[c < 2 ? c ^ 1 : c] + .plane.PixelsPerRow(); + const float* row = r.ConstPlaneRow(dc, c, y); + if (c == 1) { + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = QuantizeGradient(quant_row, stride, c, x, y, + r.xsize(), row[x], inv_factor); + } + } else { + int32_t* quant_row_y = + stream_images_[stream_id].channel[0].plane.Row(y); + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = QuantizeGradient( + quant_row, stride, c, x, y, r.xsize(), + row[x] - quant_row_y[x] * (y_factor * cfl_factor), inv_factor); + } + } + } + } + } else if (nl_dc) { + JXL_ASSERT(enc_state->shared.frame_header.chroma_subsampling.Is444()); + for (size_t c : {1, 0, 2}) { + float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul; + float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul; + float cfl_factor = enc_state->shared.cmap.DCFactors()[c]; + weighted::Header header; + weighted::State wp_state(header, r.xsize(), r.ysize()); + for (size_t y = 0; y < r.ysize(); y++) { + int32_t* quant_row = + stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y); + size_t stride = stream_images_[stream_id] + .channel[c < 2 ? c ^ 1 : c] + .plane.PixelsPerRow(); + const float* row = r.ConstPlaneRow(dc, c, y); + if (c == 1) { + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = QuantizeWP(quant_row, stride, c, x, y, r.xsize(), + &wp_state, row[x], inv_factor); + wp_state.UpdateErrors(quant_row[x], x, y, r.xsize()); + } + } else { + int32_t* quant_row_y = + stream_images_[stream_id].channel[0].plane.Row(y); + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = QuantizeWP( + quant_row, stride, c, x, y, r.xsize(), &wp_state, + row[x] - quant_row_y[x] * (y_factor * cfl_factor), inv_factor); + wp_state.UpdateErrors(quant_row[x], x, y, r.xsize()); + } + } + } + } + } else if (enc_state->shared.frame_header.chroma_subsampling.Is444()) { + for (size_t c : {1, 0, 2}) { + float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul; + float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul; + float cfl_factor = enc_state->shared.cmap.DCFactors()[c]; + for (size_t y = 0; y < r.ysize(); y++) { + int32_t* quant_row = + stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y); + const float* row = r.ConstPlaneRow(dc, c, y); + if (c == 1) { + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = roundf(row[x] * inv_factor); + } + } else { + int32_t* quant_row_y = + stream_images_[stream_id].channel[0].plane.Row(y); + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = + roundf((row[x] - quant_row_y[x] * (y_factor * cfl_factor)) * + inv_factor); + } + } + } + } + } else { + for (size_t c : {1, 0, 2}) { + Rect rect( + r.x0() >> enc_state->shared.frame_header.chroma_subsampling.HShift(c), + r.y0() >> enc_state->shared.frame_header.chroma_subsampling.VShift(c), + r.xsize() >> + enc_state->shared.frame_header.chroma_subsampling.HShift(c), + r.ysize() >> + enc_state->shared.frame_header.chroma_subsampling.VShift(c)); + float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul; + size_t ys = rect.ysize(); + size_t xs = rect.xsize(); + Channel& ch = stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c]; + ch.w = xs; + ch.h = ys; + ch.shrink(); + for (size_t y = 0; y < ys; y++) { + int32_t* quant_row = ch.plane.Row(y); + const float* row = rect.ConstPlaneRow(dc, c, y); + for (size_t x = 0; x < xs; x++) { + quant_row[x] = roundf(row[x] * inv_factor); + } + } + } + } + + DequantDC(r, &enc_state->shared.dc_storage, &enc_state->shared.quant_dc, + stream_images_[stream_id], enc_state->shared.quantizer.MulDC(), + 1.0 / mul, enc_state->shared.cmap.DCFactors(), + enc_state->shared.frame_header.chroma_subsampling, + enc_state->shared.block_ctx_map); +} + +void ModularFrameEncoder::AddACMetadata(size_t group_index, bool jpeg_transcode, + PassesEncoderState* enc_state) { + const Rect r = enc_state->shared.DCGroupRect(group_index); + size_t stream_id = ModularStreamId::ACMetadata(group_index).ID(frame_dim_); + stream_options_[stream_id].max_chan_size = 0xFFFFFF; + stream_options_[stream_id].wp_tree_mode = ModularOptions::TreeMode::kNoWP; + if (jpeg_transcode) { + stream_options_[stream_id].tree_kind = + ModularOptions::TreeKind::kJpegTranscodeACMeta; + } else if (cparams_.speed_tier >= SpeedTier::kFalcon) { + stream_options_[stream_id].tree_kind = + ModularOptions::TreeKind::kFalconACMeta; + } else if (cparams_.speed_tier > SpeedTier::kKitten) { + stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kACMeta; + } + // If we are using a non-constant CfL field, and are in a slow enough mode, + // re-enable tree computation for it. + if (cparams_.speed_tier < SpeedTier::kSquirrel && + cparams_.force_cfl_jpeg_recompression) { + stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kLearn; + } + // YToX, YToB, ACS + QF, EPF + Image& image = stream_images_[stream_id]; + image = Image(r.xsize(), r.ysize(), 8, 4); + static_assert(kColorTileDimInBlocks == 8, "Color tile size changed"); + Rect cr(r.x0() >> 3, r.y0() >> 3, (r.xsize() + 7) >> 3, (r.ysize() + 7) >> 3); + image.channel[0] = Channel(cr.xsize(), cr.ysize(), 3, 3); + image.channel[1] = Channel(cr.xsize(), cr.ysize(), 3, 3); + image.channel[2] = Channel(r.xsize() * r.ysize(), 2, 0, 0); + ConvertPlaneAndClamp(cr, enc_state->shared.cmap.ytox_map, + Rect(image.channel[0].plane), &image.channel[0].plane); + ConvertPlaneAndClamp(cr, enc_state->shared.cmap.ytob_map, + Rect(image.channel[1].plane), &image.channel[1].plane); + size_t num = 0; + for (size_t y = 0; y < r.ysize(); y++) { + AcStrategyRow row_acs = enc_state->shared.ac_strategy.ConstRow(r, y); + const int32_t* row_qf = r.ConstRow(enc_state->shared.raw_quant_field, y); + const uint8_t* row_epf = r.ConstRow(enc_state->shared.epf_sharpness, y); + int32_t* out_acs = image.channel[2].plane.Row(0); + int32_t* out_qf = image.channel[2].plane.Row(1); + int32_t* row_out_epf = image.channel[3].plane.Row(y); + for (size_t x = 0; x < r.xsize(); x++) { + row_out_epf[x] = row_epf[x]; + if (!row_acs[x].IsFirstBlock()) continue; + out_acs[num] = row_acs[x].RawStrategy(); + out_qf[num] = row_qf[x] - 1; + num++; + } + } + image.channel[2].w = num; + ac_metadata_size[group_index] = num; +} + +void ModularFrameEncoder::EncodeQuantTable( + size_t size_x, size_t size_y, BitWriter* writer, + const QuantEncoding& encoding, size_t idx, + ModularFrameEncoder* modular_frame_encoder) { + JXL_ASSERT(encoding.qraw.qtable != nullptr); + JXL_ASSERT(size_x * size_y * 3 == encoding.qraw.qtable->size()); + JXL_CHECK(F16Coder::Write(encoding.qraw.qtable_den, writer)); + if (modular_frame_encoder) { + JXL_CHECK(modular_frame_encoder->EncodeStream( + writer, nullptr, 0, ModularStreamId::QuantTable(idx))); + return; + } + Image image(size_x, size_y, 8, 3); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < size_y; y++) { + int32_t* JXL_RESTRICT row = image.channel[c].Row(y); + for (size_t x = 0; x < size_x; x++) { + row[x] = (*encoding.qraw.qtable)[c * size_x * size_y + y * size_x + x]; + } + } + } + ModularOptions cfopts; + JXL_CHECK(ModularGenericCompress(image, cfopts, writer)); +} + +void ModularFrameEncoder::AddQuantTable(size_t size_x, size_t size_y, + const QuantEncoding& encoding, + size_t idx) { + size_t stream_id = ModularStreamId::QuantTable(idx).ID(frame_dim_); + JXL_ASSERT(encoding.qraw.qtable != nullptr); + JXL_ASSERT(size_x * size_y * 3 == encoding.qraw.qtable->size()); + Image& image = stream_images_[stream_id]; + image = Image(size_x, size_y, 8, 3); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < size_y; y++) { + int32_t* JXL_RESTRICT row = image.channel[c].Row(y); + for (size_t x = 0; x < size_x; x++) { + row[x] = (*encoding.qraw.qtable)[c * size_x * size_y + y * size_x + x]; + } + } + } +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_modular.h b/third_party/jpeg-xl/lib/jxl/enc_modular.h new file mode 100644 index 0000000000..2af66e951f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_modular.h @@ -0,0 +1,92 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_MODULAR_H_ +#define LIB_JXL_ENC_MODULAR_H_ + +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_modular.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +struct AuxOut; + +class ModularFrameEncoder { + public: + ModularFrameEncoder(const FrameHeader& frame_header, + const CompressParams& cparams_orig); + Status ComputeEncodingData(const FrameHeader& frame_header, + const ImageMetadata& metadata, + Image3F* JXL_RESTRICT color, + const std::vector& extra_channels, + PassesEncoderState* JXL_RESTRICT enc_state, + const JxlCmsInterface& cms, ThreadPool* pool, + AuxOut* aux_out, bool do_color); + // Encodes global info (tree + histograms) in the `writer`. + Status EncodeGlobalInfo(BitWriter* writer, AuxOut* aux_out); + // Encodes a specific modular image (identified by `stream`) in the `writer`, + // assigning bits to the provided `layer`. + Status EncodeStream(BitWriter* writer, AuxOut* aux_out, size_t layer, + const ModularStreamId& stream); + // Creates a modular image for a given DC group of VarDCT mode. `dc` is the + // input DC image, not quantized; the group is specified by `group_index`, and + // `nl_dc` decides whether to apply a near-lossless processing to the DC or + // not. + void AddVarDCTDC(const Image3F& dc, size_t group_index, bool nl_dc, + PassesEncoderState* enc_state, bool jpeg_transcode); + // Creates a modular image for the AC metadata of the given group + // (`group_index`). + void AddACMetadata(size_t group_index, bool jpeg_transcode, + PassesEncoderState* enc_state); + // Encodes a RAW quantization table in `writer`. If `modular_frame_encoder` is + // null, the quantization table in `encoding` is used, with dimensions `size_x + // x size_y`. Otherwise, the table with ID `idx` is encoded from the given + // `modular_frame_encoder`. + static void EncodeQuantTable(size_t size_x, size_t size_y, BitWriter* writer, + const QuantEncoding& encoding, size_t idx, + ModularFrameEncoder* modular_frame_encoder); + // Stores a quantization table for future usage with `EncodeQuantTable`. + void AddQuantTable(size_t size_x, size_t size_y, + const QuantEncoding& encoding, size_t idx); + + std::vector ac_metadata_size; + std::vector extra_dc_precision; + + private: + Status PrepareEncoding(const FrameHeader& frame_header, ThreadPool* pool, + EncoderHeuristics* heuristics, + AuxOut* aux_out = nullptr); + Status PrepareStreamParams(const Rect& rect, const CompressParams& cparams, + int minShift, int maxShift, + const ModularStreamId& stream, bool do_color); + std::vector stream_images_; + std::vector stream_options_; + + Tree tree_; + std::vector> tree_tokens_; + std::vector stream_headers_; + std::vector> tokens_; + EntropyEncodingData code_; + std::vector context_map_; + FrameDimensions frame_dim_; + CompressParams cparams_; + std::vector tree_splits_; + std::vector multiplier_info_; + std::vector> gi_channel_; + std::vector image_widths_; + Predictor delta_pred_ = Predictor::Average4; +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_MODULAR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_noise.cc b/third_party/jpeg-xl/lib/jxl/enc_noise.cc new file mode 100644 index 0000000000..54bb4482e8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_noise.cc @@ -0,0 +1,374 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_noise.h" + +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_optimize.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { +namespace { + +using OptimizeArray = optimize::Array; + +float GetScoreSumsOfAbsoluteDifferences(const Image3F& opsin, const int x, + const int y, const int block_size) { + const int small_bl_size_x = 3; + const int small_bl_size_y = 4; + const int kNumSAD = + (block_size - small_bl_size_x) * (block_size - small_bl_size_y); + // block_size x block_size reference pixels + int counter = 0; + const int offset = 2; + + std::vector sad(kNumSAD, 0); + for (int y_bl = 0; y_bl + small_bl_size_y < block_size; ++y_bl) { + for (int x_bl = 0; x_bl + small_bl_size_x < block_size; ++x_bl) { + float sad_sum = 0; + // size of the center patch, we compare all the patches inside window with + // the center one + for (int cy = 0; cy < small_bl_size_y; ++cy) { + for (int cx = 0; cx < small_bl_size_x; ++cx) { + float wnd = 0.5f * (opsin.PlaneRow(1, y + y_bl + cy)[x + x_bl + cx] + + opsin.PlaneRow(0, y + y_bl + cy)[x + x_bl + cx]); + float center = + 0.5f * (opsin.PlaneRow(1, y + offset + cy)[x + offset + cx] + + opsin.PlaneRow(0, y + offset + cy)[x + offset + cx]); + sad_sum += std::abs(center - wnd); + } + } + sad[counter++] = sad_sum; + } + } + const int kSamples = (kNumSAD) / 2; + // As with ROAD (rank order absolute distance), we keep the smallest half of + // the values in SAD (we use here the more robust patch SAD instead of + // absolute single-pixel differences). + std::sort(sad.begin(), sad.end()); + const float total_sad_sum = + std::accumulate(sad.begin(), sad.begin() + kSamples, 0.0f); + return total_sad_sum / kSamples; +} + +class NoiseHistogram { + public: + static constexpr int kBins = 256; + + NoiseHistogram() { std::fill(bins, bins + kBins, 0); } + + void Increment(const float x) { bins[Index(x)] += 1; } + int Get(const float x) const { return bins[Index(x)]; } + int Bin(const size_t bin) const { return bins[bin]; } + + int Mode() const { + size_t max_idx = 0; + for (size_t i = 0; i < kBins; i++) { + if (bins[i] > bins[max_idx]) max_idx = i; + } + return max_idx; + } + + double Quantile(double q01) const { + const int64_t total = std::accumulate(bins, bins + kBins, int64_t{1}); + const int64_t target = static_cast(q01 * total); + // Until sum >= target: + int64_t sum = 0; + size_t i = 0; + for (; i < kBins; ++i) { + sum += bins[i]; + // Exact match: assume middle of bin i + if (sum == target) { + return i + 0.5; + } + if (sum > target) break; + } + + // Next non-empty bin (in case histogram is sparsely filled) + size_t next = i + 1; + while (next < kBins && bins[next] == 0) { + ++next; + } + + // Linear interpolation according to how far into next we went + const double excess = target - sum; + const double weight_next = bins[Index(next)] / excess; + return ClampX(next * weight_next + i * (1.0 - weight_next)); + } + + // Inter-quartile range + double IQR() const { return Quantile(0.75) - Quantile(0.25); } + + private: + template + T ClampX(const T x) const { + return std::min(std::max(T(0), x), T(kBins - 1)); + } + size_t Index(const float x) const { return ClampX(static_cast(x)); } + + uint32_t bins[kBins]; +}; + +std::vector GetSADScoresForPatches(const Image3F& opsin, + const size_t block_s, + const size_t num_bin, + NoiseHistogram* sad_histogram) { + std::vector sad_scores( + (opsin.ysize() / block_s) * (opsin.xsize() / block_s), 0.0f); + + int block_index = 0; + + for (size_t y = 0; y + block_s <= opsin.ysize(); y += block_s) { + for (size_t x = 0; x + block_s <= opsin.xsize(); x += block_s) { + float sad_sc = GetScoreSumsOfAbsoluteDifferences(opsin, x, y, block_s); + sad_scores[block_index++] = sad_sc; + sad_histogram->Increment(sad_sc * num_bin); + } + } + return sad_scores; +} + +float GetSADThreshold(const NoiseHistogram& histogram, const int num_bin) { + // Here we assume that the most patches with similar SAD value is a "flat" + // patches. However, some images might contain regular texture part and + // generate second strong peak at the histogram + // TODO(user) handle bimodal and heavy-tailed case + const int mode = histogram.Mode(); + return static_cast(mode) / NoiseHistogram::kBins; +} + +// loss = sum asym * (F(x) - nl)^2 + kReg * num_points * sum (w[i] - w[i+1])^2 +// where asym = 1 if F(x) < nl, kAsym if F(x) > nl. +struct LossFunction { + explicit LossFunction(std::vector nl0) : nl(std::move(nl0)) {} + + double Compute(const OptimizeArray& w, OptimizeArray* df, + bool skip_regularization = false) const { + constexpr double kReg = 0.005; + constexpr double kAsym = 1.1; + double loss_function = 0; + for (size_t i = 0; i < w.size(); i++) { + (*df)[i] = 0; + } + for (auto ind : nl) { + std::pair pos = IndexAndFrac(ind.intensity); + JXL_DASSERT(pos.first >= 0 && static_cast(pos.first) < + NoiseParams::kNumNoisePoints - 1); + double low = w[pos.first]; + double hi = w[pos.first + 1]; + double val = low * (1.0f - pos.second) + hi * pos.second; + double dist = val - ind.noise_level; + if (dist > 0) { + loss_function += kAsym * dist * dist; + (*df)[pos.first] -= kAsym * (1.0f - pos.second) * dist; + (*df)[pos.first + 1] -= kAsym * pos.second * dist; + } else { + loss_function += dist * dist; + (*df)[pos.first] -= (1.0f - pos.second) * dist; + (*df)[pos.first + 1] -= pos.second * dist; + } + } + if (skip_regularization) return loss_function; + for (size_t i = 0; i + 1 < w.size(); i++) { + double diff = w[i] - w[i + 1]; + loss_function += kReg * nl.size() * diff * diff; + (*df)[i] -= kReg * diff * nl.size(); + (*df)[i + 1] += kReg * diff * nl.size(); + } + return loss_function; + } + + std::vector nl; +}; + +void OptimizeNoiseParameters(const std::vector& noise_level, + NoiseParams* noise_params) { + constexpr double kMaxError = 1e-3; + static const double kPrecision = 1e-8; + static const int kMaxIter = 40; + + float avg = 0; + for (const NoiseLevel& nl : noise_level) { + avg += nl.noise_level; + } + avg /= noise_level.size(); + + LossFunction loss_function(noise_level); + OptimizeArray parameter_vector; + for (size_t i = 0; i < parameter_vector.size(); i++) { + parameter_vector[i] = avg; + } + + parameter_vector = optimize::OptimizeWithScaledConjugateGradientMethod( + loss_function, parameter_vector, kPrecision, kMaxIter); + + OptimizeArray df = parameter_vector; + float loss = loss_function.Compute(parameter_vector, &df, + /*skip_regularization=*/true) / + noise_level.size(); + + // Approximation went too badly: escape with no noise at all. + if (loss > kMaxError) { + noise_params->Clear(); + return; + } + + for (size_t i = 0; i < parameter_vector.size(); i++) { + noise_params->lut[i] = std::max(parameter_vector[i], 0.0); + } +} + +std::vector GetNoiseLevel( + const Image3F& opsin, const std::vector& texture_strength, + const float threshold, const size_t block_s) { + std::vector noise_level_per_intensity; + + const int filt_size = 1; + static const float kLaplFilter[filt_size * 2 + 1][filt_size * 2 + 1] = { + {-0.25f, -1.0f, -0.25f}, + {-1.0f, 5.0f, -1.0f}, + {-0.25f, -1.0f, -0.25f}, + }; + + // The noise model is built based on channel 0.5 * (X+Y) as we notice that it + // is similar to the model 0.5 * (Y-X) + size_t patch_index = 0; + + for (size_t y = 0; y + block_s <= opsin.ysize(); y += block_s) { + for (size_t x = 0; x + block_s <= opsin.xsize(); x += block_s) { + if (texture_strength[patch_index] <= threshold) { + // Calculate mean value + float mean_int = 0; + for (size_t y_bl = 0; y_bl < block_s; ++y_bl) { + for (size_t x_bl = 0; x_bl < block_s; ++x_bl) { + mean_int += 0.5f * (opsin.PlaneRow(1, y + y_bl)[x + x_bl] + + opsin.PlaneRow(0, y + y_bl)[x + x_bl]); + } + } + mean_int /= block_s * block_s; + + // Calculate Noise level + float noise_level = 0; + size_t count = 0; + for (size_t y_bl = 0; y_bl < block_s; ++y_bl) { + for (size_t x_bl = 0; x_bl < block_s; ++x_bl) { + float filtered_value = 0; + for (int y_f = -1 * filt_size; y_f <= filt_size; ++y_f) { + if ((static_cast(y_bl) + y_f) >= 0 && + (y_bl + y_f) < block_s) { + for (int x_f = -1 * filt_size; x_f <= filt_size; ++x_f) { + if ((static_cast(x_bl) + x_f) >= 0 && + (x_bl + x_f) < block_s) { + filtered_value += + 0.5f * + (opsin.PlaneRow(1, y + y_bl + y_f)[x + x_bl + x_f] + + opsin.PlaneRow(0, y + y_bl + y_f)[x + x_bl + x_f]) * + kLaplFilter[y_f + filt_size][x_f + filt_size]; + } else { + filtered_value += + 0.5f * + (opsin.PlaneRow(1, y + y_bl + y_f)[x + x_bl - x_f] + + opsin.PlaneRow(0, y + y_bl + y_f)[x + x_bl - x_f]) * + kLaplFilter[y_f + filt_size][x_f + filt_size]; + } + } + } else { + for (int x_f = -1 * filt_size; x_f <= filt_size; ++x_f) { + if ((static_cast(x_bl) + x_f) >= 0 && + (x_bl + x_f) < block_s) { + filtered_value += + 0.5f * + (opsin.PlaneRow(1, y + y_bl - y_f)[x + x_bl + x_f] + + opsin.PlaneRow(0, y + y_bl - y_f)[x + x_bl + x_f]) * + kLaplFilter[y_f + filt_size][x_f + filt_size]; + } else { + filtered_value += + 0.5f * + (opsin.PlaneRow(1, y + y_bl - y_f)[x + x_bl - x_f] + + opsin.PlaneRow(0, y + y_bl - y_f)[x + x_bl - x_f]) * + kLaplFilter[y_f + filt_size][x_f + filt_size]; + } + } + } + } + noise_level += std::abs(filtered_value); + ++count; + } + } + noise_level /= count; + NoiseLevel nl; + nl.intensity = mean_int; + nl.noise_level = noise_level; + noise_level_per_intensity.push_back(nl); + } + ++patch_index; + } + } + return noise_level_per_intensity; +} + +void EncodeFloatParam(float val, float precision, BitWriter* writer) { + JXL_ASSERT(val >= 0); + const int absval_quant = static_cast(val * precision + 0.5f); + JXL_ASSERT(absval_quant < (1 << 10)); + writer->Write(10, absval_quant); +} + +} // namespace + +Status GetNoiseParameter(const Image3F& opsin, NoiseParams* noise_params, + float quality_coef) { + // The size of a patch in decoder might be different from encoder's patch + // size. + // For encoder: the patch size should be big enough to estimate + // noise level, but, at the same time, it should be not too big + // to be able to estimate intensity value of the patch + const size_t block_s = 8; + const size_t kNumBin = 256; + NoiseHistogram sad_histogram; + std::vector sad_scores = + GetSADScoresForPatches(opsin, block_s, kNumBin, &sad_histogram); + float sad_threshold = GetSADThreshold(sad_histogram, kNumBin); + // If threshold is too large, the image has a strong pattern. This pattern + // fools our model and it will add too much noise. Therefore, we do not add + // noise for such images + if (sad_threshold > 0.15f || sad_threshold <= 0.0f) { + noise_params->Clear(); + return false; + } + std::vector nl = + GetNoiseLevel(opsin, sad_scores, sad_threshold, block_s); + + OptimizeNoiseParameters(nl, noise_params); + for (float& i : noise_params->lut) { + i *= quality_coef * 1.4; + } + return noise_params->HasAny(); +} + +void EncodeNoise(const NoiseParams& noise_params, BitWriter* writer, + size_t layer, AuxOut* aux_out) { + JXL_ASSERT(noise_params.HasAny()); + + BitWriter::Allotment allotment(writer, NoiseParams::kNumNoisePoints * 16); + for (float i : noise_params.lut) { + EncodeFloatParam(i, kNoisePrecision, writer); + } + allotment.ReclaimAndCharge(writer, layer, aux_out); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_noise.h b/third_party/jpeg-xl/lib/jxl/enc_noise.h new file mode 100644 index 0000000000..851fdd12db --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_noise.h @@ -0,0 +1,34 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_NOISE_H_ +#define LIB_JXL_ENC_NOISE_H_ + +// Noise parameter estimation. + +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/image.h" +#include "lib/jxl/noise.h" + +namespace jxl { + +struct AuxOut; + +// Get parameters of the noise for NoiseParams model +// Returns whether a valid noise model (with HasAny()) is set. +Status GetNoiseParameter(const Image3F& opsin, NoiseParams* noise_params, + float quality_coef); + +// Does not write anything if `noise_params` are empty. Otherwise, caller must +// set FrameHeader.flags.kNoise. +void EncodeNoise(const NoiseParams& noise_params, BitWriter* writer, + size_t layer, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_NOISE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_optimize.cc b/third_party/jpeg-xl/lib/jxl/enc_optimize.cc new file mode 100644 index 0000000000..6865ff67df --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_optimize.cc @@ -0,0 +1,163 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_optimize.h" + +#include + +#include "lib/jxl/base/status.h" + +namespace jxl { + +namespace optimize { + +namespace { + +// simplex vector must be sorted by first element of its elements +std::vector Midpoint(const std::vector>& simplex) { + JXL_CHECK(!simplex.empty()); + JXL_CHECK(simplex.size() == simplex[0].size()); + int dim = simplex.size() - 1; + std::vector result(dim + 1, 0); + for (int i = 0; i < dim; i++) { + for (int k = 0; k < dim; k++) { + result[i + 1] += simplex[k][i + 1]; + } + result[i + 1] /= dim; + } + return result; +} + +// first element ignored +std::vector Subtract(const std::vector& a, + const std::vector& b) { + JXL_CHECK(a.size() == b.size()); + std::vector result(a.size()); + result[0] = 0; + for (size_t i = 1; i < result.size(); i++) { + result[i] = a[i] - b[i]; + } + return result; +} + +// first element ignored +std::vector Add(const std::vector& a, + const std::vector& b) { + JXL_CHECK(a.size() == b.size()); + std::vector result(a.size()); + result[0] = 0; + for (size_t i = 1; i < result.size(); i++) { + result[i] = a[i] + b[i]; + } + return result; +} + +// first element ignored +std::vector Average(const std::vector& a, + const std::vector& b) { + JXL_CHECK(a.size() == b.size()); + std::vector result(a.size()); + result[0] = 0; + for (size_t i = 1; i < result.size(); i++) { + result[i] = 0.5 * (a[i] + b[i]); + } + return result; +} + +// vec: [0] will contain the objective function, [1:] will +// contain the vector position for the objective function. +// fun: the function evaluates the value. +void Eval(std::vector* vec, + const std::function&)>& fun) { + std::vector args(vec->begin() + 1, vec->end()); + (*vec)[0] = fun(args); +} + +void Sort(std::vector>* simplex) { + std::sort(simplex->begin(), simplex->end()); +} + +// Main iteration step of Nelder-Mead like optimization. +void Reflect(std::vector>* simplex, + const std::function&)>& fun) { + Sort(simplex); + const std::vector& last = simplex->back(); + std::vector mid = Midpoint(*simplex); + std::vector diff = Subtract(mid, last); + std::vector mirrored = Add(mid, diff); + Eval(&mirrored, fun); + if (mirrored[0] > (*simplex)[simplex->size() - 2][0]) { + // Still the worst, shrink towards the best. + std::vector shrinking = Average(simplex->back(), (*simplex)[0]); + Eval(&shrinking, fun); + simplex->back() = shrinking; + } else if (mirrored[0] < (*simplex)[0][0]) { + // new best + std::vector even_further = Add(mirrored, diff); + Eval(&even_further, fun); + if (even_further[0] < mirrored[0]) { + mirrored = even_further; + } + simplex->back() = mirrored; + } else { + // not a best, not a worst point + simplex->back() = mirrored; + } +} + +// Initialize the simplex at origin. +std::vector> InitialSimplex( + int dim, double amount, const std::vector& init, + const std::function&)>& fun) { + std::vector best(1 + dim, 0); + std::copy(init.begin(), init.end(), best.begin() + 1); + Eval(&best, fun); + std::vector> result{best}; + for (int i = 0; i < dim; i++) { + best = result[0]; + best[i + 1] += amount; + Eval(&best, fun); + result.push_back(best); + Sort(&result); + } + return result; +} + +// For comparing the same with the python tool +/*void RunSimplexExternal( + int dim, double amount, int max_iterations, + const std::function&))>& fun) { + vector vars; + for (int i = 0; i < dim; i++) { + vars.push_back(atof(getenv(StrCat("VAR", i).c_str()))); + } + double result = fun(vars); + std::cout << "Result=" << result; +}*/ + +} // namespace + +std::vector RunSimplex( + int dim, double amount, int max_iterations, const std::vector& init, + const std::function&)>& fun) { + std::vector> simplex = + InitialSimplex(dim, amount, init, fun); + for (int i = 0; i < max_iterations; i++) { + Sort(&simplex); + Reflect(&simplex, fun); + } + return simplex[0]; +} + +std::vector RunSimplex( + int dim, double amount, int max_iterations, + const std::function&)>& fun) { + std::vector init(dim, 0.0); + return RunSimplex(dim, amount, max_iterations, init, fun); +} + +} // namespace optimize + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_optimize.h b/third_party/jpeg-xl/lib/jxl/enc_optimize.h new file mode 100644 index 0000000000..0a60198214 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_optimize.h @@ -0,0 +1,218 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Utility functions for optimizing multi-dimensional nonlinear functions. + +#ifndef LIB_JXL_OPTIMIZE_H_ +#define LIB_JXL_OPTIMIZE_H_ + +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/status.h" + +namespace jxl { +namespace optimize { + +// An array type of numeric values that supports math operations with operator-, +// operator+, etc. +template +class Array { + public: + Array() = default; + explicit Array(T v) { + for (size_t i = 0; i < N; i++) v_[i] = v; + } + + size_t size() const { return N; } + + T& operator[](size_t index) { + JXL_DASSERT(index < N); + return v_[index]; + } + T operator[](size_t index) const { + JXL_DASSERT(index < N); + return v_[index]; + } + + private: + // The values used by this Array. + T v_[N]; +}; + +template +Array operator+(const Array& x, const Array& y) { + Array z; + for (size_t i = 0; i < N; ++i) { + z[i] = x[i] + y[i]; + } + return z; +} + +template +Array operator-(const Array& x, const Array& y) { + Array z; + for (size_t i = 0; i < N; ++i) { + z[i] = x[i] - y[i]; + } + return z; +} + +template +Array operator*(T v, const Array& x) { + Array y; + for (size_t i = 0; i < N; ++i) { + y[i] = v * x[i]; + } + return y; +} + +template +T operator*(const Array& x, const Array& y) { + T r = 0.0; + for (size_t i = 0; i < N; ++i) { + r += x[i] * y[i]; + } + return r; +} + +// Runs Nelder-Mead like optimization. Runs for max_iterations times, +// fun gets called with a vector of size dim as argument, and returns the score +// based on those parameters (lower is better). Returns a vector of dim+1 +// dimensions, where the first value is the optimal value of the function and +// the rest is the argmin value. Use init to pass an initial guess or where +// the optimal value is. +// +// Usage example: +// +// RunSimplex(2, 0.1, 100, [](const vector& v) { +// return (v[0] - 5) * (v[0] - 5) + (v[1] - 7) * (v[1] - 7); +// }); +// +// Returns (0.0, 5, 7) +std::vector RunSimplex( + int dim, double amount, int max_iterations, + const std::function&)>& fun); +std::vector RunSimplex( + int dim, double amount, int max_iterations, const std::vector& init, + const std::function&)>& fun); + +// Implementation of the Scaled Conjugate Gradient method described in the +// following paper: +// Moller, M. "A Scaled Conjugate Gradient Algorithm for Fast Supervised +// Learning", Neural Networks, Vol. 6. pp. 525-533, 1993 +// http://sci2s.ugr.es/keel/pdf/algorithm/articulo/moller1990.pdf +// +// The Function template parameter is a class that has the following method: +// +// // Returns the value of the function at point w and sets *df to be the +// // negative gradient vector of the function at point w. +// double Compute(const optimize::Array& w, +// optimize::Array* df) const; +// +// Returns a vector w, such that |df(w)| < grad_norm_threshold. +template +Array OptimizeWithScaledConjugateGradientMethod( + const Function& f, const Array& w0, const T grad_norm_threshold, + size_t max_iters) { + const size_t n = w0.size(); + const T rsq_threshold = grad_norm_threshold * grad_norm_threshold; + const T sigma0 = static_cast(0.0001); + const T l_min = static_cast(1.0e-15); + const T l_max = static_cast(1.0e15); + + Array w = w0; + Array wp; + Array r; + Array rt; + Array e; + Array p; + T psq; + T fp; + T D; + T d; + T m; + T a; + T b; + T s; + T t; + + T fw = f.Compute(w, &r); + T rsq = r * r; + e = r; + p = r; + T l = static_cast(1.0); + bool success = true; + size_t n_success = 0; + size_t k = 0; + + while (k++ < max_iters) { + if (success) { + m = -(p * r); + if (m >= 0) { + p = r; + m = -(p * r); + } + psq = p * p; + s = sigma0 / std::sqrt(psq); + f.Compute(w + (s * p), &rt); + t = (p * (r - rt)) / s; + } + + d = t + l * psq; + if (d <= 0) { + d = l * psq; + l = l - t / psq; + } + + a = -m / d; + wp = w + a * p; + fp = f.Compute(wp, &rt); + + D = 2.0 * (fp - fw) / (a * m); + if (D >= 0.0) { + success = true; + n_success++; + w = wp; + } else { + success = false; + } + + if (success) { + e = r; + r = rt; + rsq = r * r; + fw = fp; + if (rsq <= rsq_threshold) { + break; + } + } + + if (D < 0.25) { + l = std::min(4.0 * l, l_max); + } else if (D > 0.75) { + l = std::max(0.25 * l, l_min); + } + + if ((n_success % n) == 0) { + p = r; + l = 1.0; + } else if (success) { + b = ((e - r) * r) / m; + p = b * p + r; + } + } + + return w; +} + +} // namespace optimize +} // namespace jxl + +#endif // LIB_JXL_OPTIMIZE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc b/third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc new file mode 100644 index 0000000000..1c6699f99e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc @@ -0,0 +1,109 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_optimize.h" + +#include + +#include "lib/jxl/testing.h" + +namespace jxl { +namespace optimize { +namespace { + +// The maximum number of iterations for the test. +static const size_t kMaxTestIter = 100000; + +// F(w) = (w - w_min)^2. +struct SimpleQuadraticFunction { + typedef Array ArrayType; + explicit SimpleQuadraticFunction(const ArrayType& w0) : w_min(w0) {} + + double Compute(const ArrayType& w, ArrayType* df) const { + ArrayType dw = w - w_min; + *df = -2.0 * dw; + return dw * dw; + } + + ArrayType w_min; +}; + +// F(alpha, beta, gamma| x,y) = \sum_i(y_i - (alpha x_i ^ gamma + beta))^2. +struct PowerFunction { + explicit PowerFunction(const std::vector& x0, + const std::vector& y0) + : x(x0), y(y0) {} + + typedef Array ArrayType; + double Compute(const ArrayType& w, ArrayType* df) const { + double loss_function = 0; + (*df)[0] = 0; + (*df)[1] = 0; + (*df)[2] = 0; + for (size_t ind = 0; ind < y.size(); ++ind) { + if (x[ind] != 0) { + double l_f = y[ind] - (w[0] * pow(x[ind], w[1]) + w[2]); + (*df)[0] += 2.0 * l_f * pow(x[ind], w[1]); + (*df)[1] += 2.0 * l_f * w[0] * pow(x[ind], w[1]) * log(x[ind]); + (*df)[2] += 2.0 * l_f * 1; + loss_function += l_f * l_f; + } + } + return loss_function; + } + + std::vector x; + std::vector y; +}; + +TEST(OptimizeTest, SimpleQuadraticFunction) { + SimpleQuadraticFunction::ArrayType w_min; + w_min[0] = 1.0; + w_min[1] = 2.0; + SimpleQuadraticFunction f(w_min); + SimpleQuadraticFunction::ArrayType w(0.); + static const double kPrecision = 1e-8; + w = optimize::OptimizeWithScaledConjugateGradientMethod(f, w, kPrecision, + kMaxTestIter); + EXPECT_NEAR(w[0], 1.0, kPrecision); + EXPECT_NEAR(w[1], 2.0, kPrecision); +} + +TEST(OptimizeTest, PowerFunction) { + std::vector x(10); + std::vector y(10); + for (int ind = 0; ind < 10; ++ind) { + x[ind] = 1. * ind; + y[ind] = 2. * pow(x[ind], 3) + 5.; + } + PowerFunction f(x, y); + PowerFunction::ArrayType w(0.); + + static const double kPrecision = 0.01; + w = optimize::OptimizeWithScaledConjugateGradientMethod(f, w, kPrecision, + kMaxTestIter); + EXPECT_NEAR(w[0], 2.0, kPrecision); + EXPECT_NEAR(w[1], 3.0, kPrecision); + EXPECT_NEAR(w[2], 5.0, kPrecision); +} + +TEST(OptimizeTest, SimplexOptTest) { + auto f = [](const std::vector& x) -> double { + double t1 = x[0] - 1.0; + double t2 = x[1] + 1.5; + return 2.0 + t1 * t1 + t2 * t2; + }; + auto opt = RunSimplex(2, 0.01, 100, f); + EXPECT_EQ(opt.size(), 3u); + + static const double kPrecision = 0.01; + EXPECT_NEAR(opt[0], 2.0, kPrecision); + EXPECT_NEAR(opt[1], 1.0, kPrecision); + EXPECT_NEAR(opt[2], -1.5, kPrecision); +} + +} // namespace +} // namespace optimize +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_params.h b/third_party/jpeg-xl/lib/jxl/enc_params.h new file mode 100644 index 0000000000..737a951362 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_params.h @@ -0,0 +1,225 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_PARAMS_H_ +#define LIB_JXL_ENC_PARAMS_H_ + +// Parameters and flags that govern JXL compression. + +#include +#include + +#include + +#include "lib/jxl/base/override.h" +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +enum class SpeedTier { + // Try multiple combinations of Tortoise flags for modular mode. Otherwise + // like kTortoise. + kGlacier = 0, + // Turns on FindBestQuantizationHQ loop. Equivalent to "guetzli" mode. + kTortoise = 1, + // Turns on FindBestQuantization butteraugli loop. + kKitten = 2, + // Turns on dots, patches, and spline detection by default, as well as full + // context clustering. Default. + kSquirrel = 3, + // Turns on error diffusion and full AC strategy heuristics. Equivalent to + // "fast" mode. + kWombat = 4, + // Turns on gaborish by default, non-default cmap, initial quant field. + kHare = 5, + // Turns on simple heuristics for AC strategy, quant field, and clustering; + // also enables coefficient reordering. + kCheetah = 6, + // Turns off most encoder features. Does context clustering. + // Modular: uses fixed tree with Weighted predictor. + kFalcon = 7, + // Currently fastest possible setting for VarDCT. + // Modular: uses fixed tree with Gradient predictor. + kThunder = 8, + // VarDCT: same as kThunder. + // Modular: no tree, Gradient predictor, fast histograms + kLightning = 9 +}; + +// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) +struct CompressParams { + float butteraugli_distance = 1.0f; + + // explicit distances for extra channels (defaults to butteraugli_distance + // when not set; value of -1 can be used to represent 'default') + std::vector ec_distance; + size_t target_size = 0; + float target_bitrate = 0.0f; + + // 0.0 means search for the adaptive quantization map that matches the + // butteraugli distance, positive values mean quantize everywhere with that + // value. + float uniform_quant = 0.0f; + float quant_border_bias = 0.0f; + + // Try to achieve a maximum pixel-by-pixel error on each channel. + bool max_error_mode = false; + float max_error[3] = {0.0, 0.0, 0.0}; + + SpeedTier speed_tier = SpeedTier::kSquirrel; + int brotli_effort = -1; + + // 0 = default. + // 1 = slightly worse quality. + // 4 = fastest speed, lowest quality + size_t decoding_speed_tier = 0; + + int max_butteraugli_iters = 4; + + int max_butteraugli_iters_guetzli_mode = 100; + + ColorTransform color_transform = ColorTransform::kXYB; + YCbCrChromaSubsampling chroma_subsampling; + + // If true, the "modular mode options" members below are used. + bool modular_mode = false; + + // Change group size in modular mode (0=128, 1=256, 2=512, 3=1024). + size_t modular_group_size_shift = 1; + + Override preview = Override::kDefault; + Override noise = Override::kDefault; + Override dots = Override::kDefault; + Override patches = Override::kDefault; + Override gaborish = Override::kDefault; + int epf = -1; + + // Progressive mode. + bool progressive_mode = false; + + // Quantized-progressive mode. + bool qprogressive_mode = false; + + // Put center groups first in the bitstream. + bool centerfirst = false; + + // Pixel coordinates of the center. First group will contain that center. + size_t center_x = static_cast(-1); + size_t center_y = static_cast(-1); + + int progressive_dc = -1; + + // If on: preserve color of invisible pixels (if off: don't care) + // Default: on for lossless, off for lossy + Override keep_invisible = Override::kDefault; + + // Currently unused as of 2020-01. + bool clear_metadata = false; + + // Prints extra information during/after encoding. + bool verbose = false; + bool log_search_state = false; + + ButteraugliParams ba_params; + + // Force usage of CfL when doing JPEG recompression. This can have unexpected + // effects on the decoded pixels, while still being JPEG-compliant and + // allowing reconstruction of the original JPEG. + bool force_cfl_jpeg_recompression = true; + + // Use brotli compression for any boxes derived from a JPEG frame. + bool jpeg_compress_boxes = true; + + // Set the noise to what it would approximately be if shooting at the nominal + // exposure for a given ISO setting on a 35mm camera. + float photon_noise_iso = 0; + + // modular mode options below + ModularOptions options; + int responsive = -1; + // empty for default squeeze + std::vector squeezes; + int colorspace = -1; + // Use Global channel palette if #colors < this percentage of range + float channel_colors_pre_transform_percent = 95.f; + // Use Local channel palette if #colors < this percentage of range + float channel_colors_percent = 80.f; + int palette_colors = 1 << 10; // up to 10-bit palette is probably worthwhile + bool lossy_palette = false; + + // Returns whether these params are lossless as defined by SetLossless(); + bool IsLossless() const { return modular_mode && ModularPartIsLossless(); } + + bool ModularPartIsLossless() const { + if (modular_mode) { + // YCbCr is also considered lossless here since it's intended for + // source material that is already YCbCr (we don't do the fwd transform) + if (butteraugli_distance != 0 || + color_transform == jxl::ColorTransform::kXYB) + return false; + } + for (float f : ec_distance) { + if (f > 0) return false; + if (f < 0 && butteraugli_distance != 0) return false; + } + // if no explicit ec_distance given, and using vardct, then the modular part + // is empty or not lossless + if (!modular_mode && ec_distance.empty()) return false; + // all modular channels are encoded at distance 0 + return true; + } + + // Sets the parameters required to make the codec lossless. + void SetLossless() { + modular_mode = true; + butteraugli_distance = 0.0f; + for (float &f : ec_distance) f = 0.0f; + color_transform = jxl::ColorTransform::kNone; + } + + // Down/upsample the image before encoding / after decoding by this factor. + // The resampling value can also be set to <= 0 to automatically choose based + // on distance, however EncodeFrame doesn't support this, so it is + // required to call PostInit() to set a valid positive resampling + // value and altered butteraugli score if this is used. + int resampling = -1; + int ec_resampling = -1; + // Skip the downsampling before encoding if this is true. + bool already_downsampled = false; + // Butteraugli target distance on the original full size image, this can be + // different from butteraugli_distance if resampling was used. + float original_butteraugli_distance = -1.0f; + + float quant_ac_rescale = 1.0; + + // Codestream level to conform to. + // -1: don't care + int level = -1; + + std::vector manual_noise; + std::vector manual_xyb_factors; +}; + +static constexpr float kMinButteraugliForDynamicAR = 0.5f; +static constexpr float kMinButteraugliForDots = 3.0f; +static constexpr float kMinButteraugliToSubtractOriginalPatches = 3.0f; + +// Always off +static constexpr float kMinButteraugliForNoise = 99.0f; + +// Minimum butteraugli distance the encoder accepts. +static constexpr float kMinButteraugliDistance = 0.001f; + +// Tile size for encoder-side processing. Must be equal to color tile dim in the +// current implementation. +static constexpr size_t kEncTileDim = 64; +static constexpr size_t kEncTileDimInBlocks = kEncTileDim / kBlockDim; + +} // namespace jxl + +#endif // LIB_JXL_ENC_PARAMS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc new file mode 100644 index 0000000000..157e18c3a8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc @@ -0,0 +1,813 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_patch_dictionary.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/random.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_dot_dictionary.h" +#include "lib/jxl/enc_frame.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/patch_dictionary_internal.h" + +namespace jxl { + +static constexpr size_t kPatchFrameReferenceId = 3; + +// static +void PatchDictionaryEncoder::Encode(const PatchDictionary& pdic, + BitWriter* writer, size_t layer, + AuxOut* aux_out) { + JXL_ASSERT(pdic.HasAny()); + std::vector> tokens(1); + size_t num_ec = pdic.shared_->metadata->m.num_extra_channels; + + auto add_num = [&](int context, size_t num) { + tokens[0].emplace_back(context, num); + }; + size_t num_ref_patch = 0; + for (size_t i = 0; i < pdic.positions_.size();) { + size_t ref_pos_idx = pdic.positions_[i].ref_pos_idx; + while (i < pdic.positions_.size() && + pdic.positions_[i].ref_pos_idx == ref_pos_idx) { + i++; + } + num_ref_patch++; + } + add_num(kNumRefPatchContext, num_ref_patch); + size_t blend_pos = 0; + for (size_t i = 0; i < pdic.positions_.size();) { + size_t i_start = i; + size_t ref_pos_idx = pdic.positions_[i].ref_pos_idx; + const auto& ref_pos = pdic.ref_positions_[ref_pos_idx]; + while (i < pdic.positions_.size() && + pdic.positions_[i].ref_pos_idx == ref_pos_idx) { + i++; + } + size_t num = i - i_start; + JXL_ASSERT(num > 0); + add_num(kReferenceFrameContext, ref_pos.ref); + add_num(kPatchReferencePositionContext, ref_pos.x0); + add_num(kPatchReferencePositionContext, ref_pos.y0); + add_num(kPatchSizeContext, ref_pos.xsize - 1); + add_num(kPatchSizeContext, ref_pos.ysize - 1); + add_num(kPatchCountContext, num - 1); + for (size_t j = i_start; j < i; j++) { + const PatchPosition& pos = pdic.positions_[j]; + if (j == i_start) { + add_num(kPatchPositionContext, pos.x); + add_num(kPatchPositionContext, pos.y); + } else { + add_num(kPatchOffsetContext, + PackSigned(pos.x - pdic.positions_[j - 1].x)); + add_num(kPatchOffsetContext, + PackSigned(pos.y - pdic.positions_[j - 1].y)); + } + for (size_t j = 0; j < num_ec + 1; ++j, ++blend_pos) { + const PatchBlending& info = pdic.blendings_[blend_pos]; + add_num(kPatchBlendModeContext, static_cast(info.mode)); + if (UsesAlpha(info.mode) && + pdic.shared_->metadata->m.extra_channel_info.size() > 1) { + add_num(kPatchAlphaChannelContext, info.alpha_channel); + } + if (UsesClamp(info.mode)) { + add_num(kPatchClampContext, info.clamp); + } + } + } + } + + EntropyEncodingData codes; + std::vector context_map; + BuildAndEncodeHistograms(HistogramParams(), kNumPatchDictionaryContexts, + tokens, &codes, &context_map, writer, layer, + aux_out); + WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out); +} + +// static +void PatchDictionaryEncoder::SubtractFrom(const PatchDictionary& pdic, + Image3F* opsin) { + size_t num_ec = pdic.shared_->metadata->m.num_extra_channels; + // TODO(veluca): this can likely be optimized knowing it runs on full images. + for (size_t y = 0; y < opsin->ysize(); y++) { + float* JXL_RESTRICT rows[3] = { + opsin->PlaneRow(0, y), + opsin->PlaneRow(1, y), + opsin->PlaneRow(2, y), + }; + for (size_t pos_idx : pdic.GetPatchesForRow(y)) { + const size_t blending_idx = pos_idx * (num_ec + 1); + const PatchPosition& pos = pdic.positions_[pos_idx]; + const PatchReferencePosition& ref_pos = + pdic.ref_positions_[pos.ref_pos_idx]; + const PatchBlendMode mode = pdic.blendings_[blending_idx].mode; + size_t by = pos.y; + size_t bx = pos.x; + size_t xsize = ref_pos.xsize; + JXL_DASSERT(y >= by); + JXL_DASSERT(y < by + ref_pos.ysize); + size_t iy = y - by; + size_t ref = ref_pos.ref; + const float* JXL_RESTRICT ref_rows[3] = { + pdic.shared_->reference_frames[ref].frame.color().ConstPlaneRow( + 0, ref_pos.y0 + iy) + + ref_pos.x0, + pdic.shared_->reference_frames[ref].frame.color().ConstPlaneRow( + 1, ref_pos.y0 + iy) + + ref_pos.x0, + pdic.shared_->reference_frames[ref].frame.color().ConstPlaneRow( + 2, ref_pos.y0 + iy) + + ref_pos.x0, + }; + for (size_t ix = 0; ix < xsize; ix++) { + for (size_t c = 0; c < 3; c++) { + if (mode == PatchBlendMode::kAdd) { + rows[c][bx + ix] -= ref_rows[c][ix]; + } else if (mode == PatchBlendMode::kReplace) { + rows[c][bx + ix] = 0; + } else if (mode == PatchBlendMode::kNone) { + // Nothing to do. + } else { + JXL_ABORT("Blending mode %u not yet implemented", (uint32_t)mode); + } + } + } + } + } +} + +namespace { + +struct PatchColorspaceInfo { + float kChannelDequant[3]; + float kChannelWeights[3]; + + explicit PatchColorspaceInfo(bool is_xyb) { + if (is_xyb) { + kChannelDequant[0] = 0.01615; + kChannelDequant[1] = 0.08875; + kChannelDequant[2] = 0.1922; + kChannelWeights[0] = 30.0; + kChannelWeights[1] = 3.0; + kChannelWeights[2] = 1.0; + } else { + kChannelDequant[0] = 20.0f / 255; + kChannelDequant[1] = 22.0f / 255; + kChannelDequant[2] = 20.0f / 255; + kChannelWeights[0] = 0.017 * 255; + kChannelWeights[1] = 0.02 * 255; + kChannelWeights[2] = 0.017 * 255; + } + } + + float ScaleForQuantization(float val, size_t c) { + return val / kChannelDequant[c]; + } + + int Quantize(float val, size_t c) { + return truncf(ScaleForQuantization(val, c)); + } + + bool is_similar_v(const float v1[3], const float v2[3], float threshold) { + float distance = 0; + for (size_t c = 0; c < 3; c++) { + distance += std::fabs(v1[c] - v2[c]) * kChannelWeights[c]; + } + return distance <= threshold; + } +}; + +std::vector FindTextLikePatches( + const Image3F& opsin, const PassesEncoderState* JXL_RESTRICT state, + ThreadPool* pool, AuxOut* aux_out, bool is_xyb) { + if (state->cparams.patches == Override::kOff) return {}; + + PatchColorspaceInfo pci(is_xyb); + float kSimilarThreshold = 0.8f; + + auto is_similar_impl = [&pci](std::pair p1, + std::pair p2, + const float* JXL_RESTRICT rows[3], + size_t stride, float threshold) { + float v1[3], v2[3]; + for (size_t c = 0; c < 3; c++) { + v1[c] = rows[c][p1.second * stride + p1.first]; + v2[c] = rows[c][p2.second * stride + p2.first]; + } + return pci.is_similar_v(v1, v2, threshold); + }; + + std::atomic has_screenshot_areas{false}; + const size_t opsin_stride = opsin.PixelsPerRow(); + const float* JXL_RESTRICT opsin_rows[3] = {opsin.ConstPlaneRow(0, 0), + opsin.ConstPlaneRow(1, 0), + opsin.ConstPlaneRow(2, 0)}; + + auto is_same = [&opsin_rows, opsin_stride](std::pair p1, + std::pair p2) { + for (size_t c = 0; c < 3; c++) { + float v1 = opsin_rows[c][p1.second * opsin_stride + p1.first]; + float v2 = opsin_rows[c][p2.second * opsin_stride + p2.first]; + if (std::fabs(v1 - v2) > 1e-4) { + return false; + } + } + return true; + }; + + auto is_similar = [&](std::pair p1, + std::pair p2) { + return is_similar_impl(p1, p2, opsin_rows, opsin_stride, kSimilarThreshold); + }; + + constexpr int64_t kPatchSide = 4; + constexpr int64_t kExtraSide = 4; + + // Look for kPatchSide size squares, naturally aligned, that all have the same + // pixel values. + ImageB is_screenshot_like(DivCeil(opsin.xsize(), kPatchSide), + DivCeil(opsin.ysize(), kPatchSide)); + ZeroFillImage(&is_screenshot_like); + uint8_t* JXL_RESTRICT screenshot_row = is_screenshot_like.Row(0); + const size_t screenshot_stride = is_screenshot_like.PixelsPerRow(); + const auto process_row = [&](const uint32_t y, size_t /* thread */) { + for (uint64_t x = 0; x < opsin.xsize() / kPatchSide; x++) { + bool all_same = true; + for (size_t iy = 0; iy < static_cast(kPatchSide); iy++) { + for (size_t ix = 0; ix < static_cast(kPatchSide); ix++) { + size_t cx = x * kPatchSide + ix; + size_t cy = y * kPatchSide + iy; + if (!is_same({cx, cy}, {x * kPatchSide, y * kPatchSide})) { + all_same = false; + break; + } + } + } + if (!all_same) continue; + size_t num = 0; + size_t num_same = 0; + for (int64_t iy = -kExtraSide; iy < kExtraSide + kPatchSide; iy++) { + for (int64_t ix = -kExtraSide; ix < kExtraSide + kPatchSide; ix++) { + int64_t cx = x * kPatchSide + ix; + int64_t cy = y * kPatchSide + iy; + if (cx < 0 || static_cast(cx) >= opsin.xsize() || // + cy < 0 || static_cast(cy) >= opsin.ysize()) { + continue; + } + num++; + if (is_same({cx, cy}, {x * kPatchSide, y * kPatchSide})) num_same++; + } + } + // Too few equal pixels nearby. + if (num_same * 8 < num * 7) continue; + screenshot_row[y * screenshot_stride + x] = 1; + has_screenshot_areas = true; + } + }; + JXL_CHECK(RunOnPool(pool, 0, opsin.ysize() / kPatchSide, ThreadPool::NoInit, + process_row, "IsScreenshotLike")); + + // TODO(veluca): also parallelize the rest of this function. + if (WantDebugOutput(aux_out)) { + aux_out->DumpPlaneNormalized("screenshot_like", is_screenshot_like); + } + + constexpr int kSearchRadius = 1; + + if (!ApplyOverride(state->cparams.patches, has_screenshot_areas)) { + return {}; + } + + // Search for "similar enough" pixels near the screenshot-like areas. + ImageB is_background(opsin.xsize(), opsin.ysize()); + ZeroFillImage(&is_background); + Image3F background(opsin.xsize(), opsin.ysize()); + ZeroFillImage(&background); + constexpr size_t kDistanceLimit = 50; + float* JXL_RESTRICT background_rows[3] = { + background.PlaneRow(0, 0), + background.PlaneRow(1, 0), + background.PlaneRow(2, 0), + }; + const size_t background_stride = background.PixelsPerRow(); + uint8_t* JXL_RESTRICT is_background_row = is_background.Row(0); + const size_t is_background_stride = is_background.PixelsPerRow(); + std::vector< + std::pair, std::pair>> + queue; + size_t queue_front = 0; + for (size_t y = 0; y < opsin.ysize(); y++) { + for (size_t x = 0; x < opsin.xsize(); x++) { + if (!screenshot_row[screenshot_stride * (y / kPatchSide) + + (x / kPatchSide)]) + continue; + queue.push_back({{x, y}, {x, y}}); + } + } + while (queue.size() != queue_front) { + std::pair cur = queue[queue_front].first; + std::pair src = queue[queue_front].second; + queue_front++; + if (is_background_row[cur.second * is_background_stride + cur.first]) + continue; + is_background_row[cur.second * is_background_stride + cur.first] = 1; + for (size_t c = 0; c < 3; c++) { + background_rows[c][cur.second * background_stride + cur.first] = + opsin_rows[c][src.second * opsin_stride + src.first]; + } + for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) { + for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) { + if (dx == 0 && dy == 0) continue; + int next_first = cur.first + dx; + int next_second = cur.second + dy; + if (next_first < 0 || next_second < 0 || + static_cast(next_first) >= opsin.xsize() || + static_cast(next_second) >= opsin.ysize()) { + continue; + } + if (static_cast( + std::abs(next_first - static_cast(src.first)) + + std::abs(next_second - static_cast(src.second))) > + kDistanceLimit) { + continue; + } + std::pair next{next_first, next_second}; + if (is_similar(src, next)) { + if (!screenshot_row[next.second / kPatchSide * screenshot_stride + + next.first / kPatchSide] || + is_same(src, next)) { + if (!is_background_row[next.second * is_background_stride + + next.first]) + queue.emplace_back(next, src); + } + } + } + } + } + queue.clear(); + + ImageF ccs; + Rng rng(0); + bool paint_ccs = false; + if (WantDebugOutput(aux_out)) { + aux_out->DumpPlaneNormalized("is_background", is_background); + if (is_xyb) { + aux_out->DumpXybImage("background", background); + } else { + aux_out->DumpImage("background", background); + } + ccs = ImageF(opsin.xsize(), opsin.ysize()); + ZeroFillImage(&ccs); + paint_ccs = true; + } + + constexpr float kVerySimilarThreshold = 0.03f; + constexpr float kHasSimilarThreshold = 0.03f; + + const float* JXL_RESTRICT const_background_rows[3] = { + background_rows[0], background_rows[1], background_rows[2]}; + auto is_similar_b = [&](std::pair p1, std::pair p2) { + return is_similar_impl(p1, p2, const_background_rows, background_stride, + kVerySimilarThreshold); + }; + + constexpr int kMinPeak = 2; + constexpr int kHasSimilarRadius = 2; + + std::vector info; + + // Find small CC outside the "similar enough" areas, compute bounding boxes, + // and run heuristics to exclude some patches. + ImageB visited(opsin.xsize(), opsin.ysize()); + ZeroFillImage(&visited); + uint8_t* JXL_RESTRICT visited_row = visited.Row(0); + const size_t visited_stride = visited.PixelsPerRow(); + std::vector> cc; + std::vector> stack; + for (size_t y = 0; y < opsin.ysize(); y++) { + for (size_t x = 0; x < opsin.xsize(); x++) { + if (is_background_row[y * is_background_stride + x]) continue; + cc.clear(); + stack.clear(); + stack.emplace_back(x, y); + size_t min_x = x; + size_t max_x = x; + size_t min_y = y; + size_t max_y = y; + std::pair reference; + bool found_border = false; + bool all_similar = true; + while (!stack.empty()) { + std::pair cur = stack.back(); + stack.pop_back(); + if (visited_row[cur.second * visited_stride + cur.first]) continue; + visited_row[cur.second * visited_stride + cur.first] = 1; + if (cur.first < min_x) min_x = cur.first; + if (cur.first > max_x) max_x = cur.first; + if (cur.second < min_y) min_y = cur.second; + if (cur.second > max_y) max_y = cur.second; + if (paint_ccs) { + cc.push_back(cur); + } + for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) { + for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) { + if (dx == 0 && dy == 0) continue; + int next_first = static_cast(cur.first) + dx; + int next_second = static_cast(cur.second) + dy; + if (next_first < 0 || next_second < 0 || + static_cast(next_first) >= opsin.xsize() || + static_cast(next_second) >= opsin.ysize()) { + continue; + } + std::pair next{next_first, next_second}; + if (!is_background_row[next.second * is_background_stride + + next.first]) { + stack.push_back(next); + } else { + if (!found_border) { + reference = next; + found_border = true; + } else { + if (!is_similar_b(next, reference)) all_similar = false; + } + } + } + } + } + if (!found_border || !all_similar || max_x - min_x >= kMaxPatchSize || + max_y - min_y >= kMaxPatchSize) { + continue; + } + size_t bpos = background_stride * reference.second + reference.first; + float ref[3] = {background_rows[0][bpos], background_rows[1][bpos], + background_rows[2][bpos]}; + bool has_similar = false; + for (size_t iy = std::max( + static_cast(min_y) - kHasSimilarRadius, 0); + iy < std::min(max_y + kHasSimilarRadius + 1, opsin.ysize()); iy++) { + for (size_t ix = std::max( + static_cast(min_x) - kHasSimilarRadius, 0); + ix < std::min(max_x + kHasSimilarRadius + 1, opsin.xsize()); + ix++) { + size_t opos = opsin_stride * iy + ix; + float px[3] = {opsin_rows[0][opos], opsin_rows[1][opos], + opsin_rows[2][opos]}; + if (pci.is_similar_v(ref, px, kHasSimilarThreshold)) { + has_similar = true; + } + } + } + if (!has_similar) continue; + info.emplace_back(); + info.back().second.emplace_back(min_x, min_y); + QuantizedPatch& patch = info.back().first; + patch.xsize = max_x - min_x + 1; + patch.ysize = max_y - min_y + 1; + int max_value = 0; + for (size_t c : {1, 0, 2}) { + for (size_t iy = min_y; iy <= max_y; iy++) { + for (size_t ix = min_x; ix <= max_x; ix++) { + size_t offset = (iy - min_y) * patch.xsize + ix - min_x; + patch.fpixels[c][offset] = + opsin_rows[c][iy * opsin_stride + ix] - ref[c]; + int val = pci.Quantize(patch.fpixels[c][offset], c); + patch.pixels[c][offset] = val; + if (std::abs(val) > max_value) max_value = std::abs(val); + } + } + } + if (max_value < kMinPeak) { + info.pop_back(); + continue; + } + if (paint_ccs) { + float cc_color = rng.UniformF(0.5, 1.0); + for (std::pair p : cc) { + ccs.Row(p.second)[p.first] = cc_color; + } + } + } + } + + if (paint_ccs) { + JXL_ASSERT(WantDebugOutput(aux_out)); + aux_out->DumpPlaneNormalized("ccs", ccs); + } + if (info.empty()) { + return {}; + } + + // Remove duplicates. + constexpr size_t kMinPatchOccurrences = 2; + std::sort(info.begin(), info.end()); + size_t unique = 0; + for (size_t i = 1; i < info.size(); i++) { + if (info[i].first == info[unique].first) { + info[unique].second.insert(info[unique].second.end(), + info[i].second.begin(), info[i].second.end()); + } else { + if (info[unique].second.size() >= kMinPatchOccurrences) { + unique++; + } + info[unique] = info[i]; + } + } + if (info[unique].second.size() >= kMinPatchOccurrences) { + unique++; + } + info.resize(unique); + + size_t max_patch_size = 0; + + for (size_t i = 0; i < info.size(); i++) { + size_t pixels = info[i].first.xsize * info[i].first.ysize; + if (pixels > max_patch_size) max_patch_size = pixels; + } + + // don't use patches if all patches are smaller than this + constexpr size_t kMinMaxPatchSize = 20; + if (max_patch_size < kMinMaxPatchSize) return {}; + + return info; +} + +} // namespace + +void FindBestPatchDictionary(const Image3F& opsin, + PassesEncoderState* JXL_RESTRICT state, + const JxlCmsInterface& cms, ThreadPool* pool, + AuxOut* aux_out, bool is_xyb) { + std::vector info = + FindTextLikePatches(opsin, state, pool, aux_out, is_xyb); + + // TODO(veluca): this doesn't work if both dots and patches are enabled. + // For now, since dots and patches are not likely to occur in the same kind of + // images, disable dots if some patches were found. + if (info.empty() && + ApplyOverride( + state->cparams.dots, + state->cparams.speed_tier <= SpeedTier::kSquirrel && + state->cparams.butteraugli_distance >= kMinButteraugliForDots)) { + info = FindDotDictionary(state->cparams, opsin, state->shared.cmap, pool); + } + + if (info.empty()) return; + + std::sort( + info.begin(), info.end(), [&](const PatchInfo& a, const PatchInfo& b) { + return a.first.xsize * a.first.ysize > b.first.xsize * b.first.ysize; + }); + + size_t max_x_size = 0; + size_t max_y_size = 0; + size_t total_pixels = 0; + + for (size_t i = 0; i < info.size(); i++) { + size_t pixels = info[i].first.xsize * info[i].first.ysize; + if (max_x_size < info[i].first.xsize) max_x_size = info[i].first.xsize; + if (max_y_size < info[i].first.ysize) max_y_size = info[i].first.ysize; + total_pixels += pixels; + } + + // Bin-packing & conversion of patches. + constexpr float kBinPackingSlackness = 1.05f; + size_t ref_xsize = std::max(max_x_size, std::sqrt(total_pixels)); + size_t ref_ysize = std::max(max_y_size, std::sqrt(total_pixels)); + std::vector> ref_positions(info.size()); + // TODO(veluca): allow partial overlaps of patches that have the same pixels. + size_t max_y = 0; + do { + max_y = 0; + // Increase packed image size. + ref_xsize = ref_xsize * kBinPackingSlackness + 1; + ref_ysize = ref_ysize * kBinPackingSlackness + 1; + + ImageB occupied(ref_xsize, ref_ysize); + ZeroFillImage(&occupied); + uint8_t* JXL_RESTRICT occupied_rows = occupied.Row(0); + size_t occupied_stride = occupied.PixelsPerRow(); + + bool success = true; + // For every patch... + for (size_t patch = 0; patch < info.size(); patch++) { + size_t x0 = 0; + size_t y0 = 0; + size_t xsize = info[patch].first.xsize; + size_t ysize = info[patch].first.ysize; + bool found = false; + // For every possible start position ... + for (; y0 + ysize <= ref_ysize; y0++) { + x0 = 0; + for (; x0 + xsize <= ref_xsize; x0++) { + bool has_occupied_pixel = false; + size_t x = x0; + // Check if it is possible to place the patch in this position in the + // reference frame. + for (size_t y = y0; y < y0 + ysize; y++) { + x = x0; + for (; x < x0 + xsize; x++) { + if (occupied_rows[y * occupied_stride + x]) { + has_occupied_pixel = true; + break; + } + } + } // end of positioning check + if (!has_occupied_pixel) { + found = true; + break; + } + x0 = x; // Jump to next pixel after the occupied one. + } + if (found) break; + } // end of start position checking + + // We didn't find a possible position: repeat from the beginning with a + // larger reference frame size. + if (!found) { + success = false; + break; + } + + // We found a position: mark the corresponding positions in the reference + // image as used. + ref_positions[patch] = {x0, y0}; + for (size_t y = y0; y < y0 + ysize; y++) { + for (size_t x = x0; x < x0 + xsize; x++) { + occupied_rows[y * occupied_stride + x] = true; + } + } + max_y = std::max(max_y, y0 + ysize); + } + + if (success) break; + } while (true); + + JXL_ASSERT(ref_ysize >= max_y); + + ref_ysize = max_y; + + Image3F reference_frame(ref_xsize, ref_ysize); + // TODO(veluca): figure out a better way to fill the image. + ZeroFillImage(&reference_frame); + std::vector positions; + std::vector pref_positions; + std::vector blendings; + float* JXL_RESTRICT ref_rows[3] = { + reference_frame.PlaneRow(0, 0), + reference_frame.PlaneRow(1, 0), + reference_frame.PlaneRow(2, 0), + }; + size_t ref_stride = reference_frame.PixelsPerRow(); + size_t num_ec = state->shared.metadata->m.num_extra_channels; + + for (size_t i = 0; i < info.size(); i++) { + PatchReferencePosition ref_pos; + ref_pos.xsize = info[i].first.xsize; + ref_pos.ysize = info[i].first.ysize; + ref_pos.x0 = ref_positions[i].first; + ref_pos.y0 = ref_positions[i].second; + ref_pos.ref = kPatchFrameReferenceId; + for (size_t y = 0; y < ref_pos.ysize; y++) { + for (size_t x = 0; x < ref_pos.xsize; x++) { + for (size_t c = 0; c < 3; c++) { + ref_rows[c][(y + ref_pos.y0) * ref_stride + x + ref_pos.x0] = + info[i].first.fpixels[c][y * ref_pos.xsize + x]; + } + } + } + for (const auto& pos : info[i].second) { + positions.emplace_back( + PatchPosition{pos.first, pos.second, pref_positions.size()}); + // Add blending for color channels, ignore other channels. + blendings.push_back({PatchBlendMode::kAdd, 0, false}); + for (size_t j = 0; j < num_ec; ++j) { + blendings.push_back({PatchBlendMode::kNone, 0, false}); + } + } + pref_positions.emplace_back(std::move(ref_pos)); + } + + CompressParams cparams = state->cparams; + // Recursive application of patches could create very weird issues. + cparams.patches = Override::kOff; + + RoundtripPatchFrame(&reference_frame, state, kPatchFrameReferenceId, cparams, + cms, pool, aux_out, /*subtract=*/true); + + // TODO(veluca): this assumes that applying patches is commutative, which is + // not true for all blending modes. This code only produces kAdd patches, so + // this works out. + PatchDictionaryEncoder::SetPositions( + &state->shared.image_features.patches, std::move(positions), + std::move(pref_positions), std::move(blendings)); +} + +void RoundtripPatchFrame(Image3F* reference_frame, + PassesEncoderState* JXL_RESTRICT state, int idx, + CompressParams& cparams, const JxlCmsInterface& cms, + ThreadPool* pool, AuxOut* aux_out, bool subtract) { + FrameInfo patch_frame_info; + cparams.resampling = 1; + cparams.ec_resampling = 1; + cparams.dots = Override::kOff; + cparams.noise = Override::kOff; + cparams.modular_mode = true; + cparams.responsive = 0; + cparams.progressive_dc = 0; + cparams.progressive_mode = false; + cparams.qprogressive_mode = false; + // Use gradient predictor and not Predictor::Best. + cparams.options.predictor = Predictor::Gradient; + patch_frame_info.save_as_reference = idx; // always saved. + patch_frame_info.frame_type = FrameType::kReferenceOnly; + patch_frame_info.save_before_color_transform = true; + ImageBundle ib(&state->shared.metadata->m); + // TODO(veluca): metadata.color_encoding is a lie: ib is in XYB, but there is + // no simple way to express that yet. + patch_frame_info.ib_needs_color_transform = false; + ib.SetFromImage(std::move(*reference_frame), + state->shared.metadata->m.color_encoding); + if (!ib.metadata()->extra_channel_info.empty()) { + // Add dummy extra channels to the patch image: patch encoding does not yet + // support extra channels, but the codec expects that the amount of extra + // channels in frames matches that in the metadata of the codestream. + std::vector extra_channels; + extra_channels.reserve(ib.metadata()->extra_channel_info.size()); + for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) { + extra_channels.emplace_back(ib.xsize(), ib.ysize()); + // Must initialize the image with data to not affect blending with + // uninitialized memory. + // TODO(lode): patches must copy and use the real extra channels instead. + ZeroFillImage(&extra_channels.back()); + } + ib.SetExtraChannels(std::move(extra_channels)); + } + PassesEncoderState roundtrip_state; + auto special_frame = std::unique_ptr(new BitWriter()); + AuxOut patch_aux_out; + JXL_CHECK(EncodeFrame(cparams, patch_frame_info, state->shared.metadata, ib, + &roundtrip_state, cms, pool, special_frame.get(), + aux_out ? &patch_aux_out : nullptr)); + if (aux_out) { + for (const auto& l : patch_aux_out.layers) { + aux_out->layers[kLayerDictionary].Assimilate(l); + } + } + const Span encoded = special_frame->GetSpan(); + state->special_frames.emplace_back(std::move(special_frame)); + if (subtract) { + ImageBundle decoded(&state->shared.metadata->m); + PassesDecoderState dec_state; + JXL_CHECK(dec_state.output_encoding_info.SetFromMetadata( + *state->shared.metadata)); + const uint8_t* frame_start = encoded.data(); + size_t encoded_size = encoded.size(); + JXL_CHECK(DecodeFrame(&dec_state, pool, frame_start, encoded_size, &decoded, + *state->shared.metadata)); + frame_start += decoded.decoded_bytes(); + encoded_size -= decoded.decoded_bytes(); + size_t ref_xsize = + dec_state.shared_storage.reference_frames[idx].frame.color()->xsize(); + // if the frame itself uses patches, we need to decode another frame + if (!ref_xsize) { + JXL_CHECK(DecodeFrame(&dec_state, pool, frame_start, encoded_size, + &decoded, *state->shared.metadata)); + } + JXL_CHECK(encoded_size == 0); + state->shared.reference_frames[idx] = + std::move(dec_state.shared_storage.reference_frames[idx]); + } else { + state->shared.reference_frames[idx].frame = std::move(ib); + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h new file mode 100644 index 0000000000..f30881b232 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h @@ -0,0 +1,109 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_PATCH_DICTIONARY_H_ +#define LIB_JXL_ENC_PATCH_DICTIONARY_H_ + +// Chooses reference patches, and avoids encoding them once per occurrence. + +#include +#include +#include + +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { + +struct AuxOut; + +constexpr size_t kMaxPatchSize = 32; + +struct QuantizedPatch { + size_t xsize; + size_t ysize; + QuantizedPatch() { + for (size_t i = 0; i < 3; i++) { + pixels[i].resize(kMaxPatchSize * kMaxPatchSize); + fpixels[i].resize(kMaxPatchSize * kMaxPatchSize); + } + } + std::vector pixels[3] = {}; + // Not compared. Used only to retrieve original pixels to construct the + // reference image. + std::vector fpixels[3] = {}; + bool operator==(const QuantizedPatch& other) const { + if (xsize != other.xsize) return false; + if (ysize != other.ysize) return false; + for (size_t c = 0; c < 3; c++) { + if (memcmp(pixels[c].data(), other.pixels[c].data(), + sizeof(int8_t) * xsize * ysize) != 0) + return false; + } + return true; + } + + bool operator<(const QuantizedPatch& other) const { + if (xsize != other.xsize) return xsize < other.xsize; + if (ysize != other.ysize) return ysize < other.ysize; + for (size_t c = 0; c < 3; c++) { + int cmp = memcmp(pixels[c].data(), other.pixels[c].data(), + sizeof(int8_t) * xsize * ysize); + if (cmp > 0) return false; + if (cmp < 0) return true; + } + return false; + } +}; + +// Pair (patch, vector of occurrences). +using PatchInfo = + std::pair>>; + +// Friend class of PatchDictionary. +class PatchDictionaryEncoder { + public: + // Only call if HasAny(). + static void Encode(const PatchDictionary& pdic, BitWriter* writer, + size_t layer, AuxOut* aux_out); + + static void SetPositions(PatchDictionary* pdic, + std::vector positions, + std::vector ref_positions, + std::vector blendings) { + pdic->positions_ = std::move(positions); + pdic->ref_positions_ = std::move(ref_positions); + pdic->blendings_ = std::move(blendings); + pdic->ComputePatchTree(); + } + + static void SubtractFrom(const PatchDictionary& pdic, Image3F* opsin); +}; + +void FindBestPatchDictionary(const Image3F& opsin, + PassesEncoderState* JXL_RESTRICT state, + const JxlCmsInterface& cms, ThreadPool* pool, + AuxOut* aux_out, bool is_xyb = true); + +void RoundtripPatchFrame(Image3F* reference_frame, + PassesEncoderState* JXL_RESTRICT state, int idx, + CompressParams& cparams, const JxlCmsInterface& cms, + ThreadPool* pool, AuxOut* aux_out, bool subtract); + +} // namespace jxl + +#endif // LIB_JXL_ENC_PATCH_DICTIONARY_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc new file mode 100644 index 0000000000..3786ef5cf5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc @@ -0,0 +1,89 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_photon_noise.h" + +namespace jxl { + +namespace { + +// Assumes a daylight-like spectrum. +// https://www.strollswithmydog.com/effective-quantum-efficiency-of-sensor/#:~:text=11%2C260%20photons/um%5E2/lx-s +constexpr float kPhotonsPerLxSPerUm2 = 11260; + +// Order of magnitude for cameras in the 2010-2020 decade, taking the CFA into +// account. +constexpr float kEffectiveQuantumEfficiency = 0.20; + +// TODO(sboukortt): reevaluate whether these are good defaults, notably whether +// it would be worth making read noise higher at lower ISO settings. +constexpr float kPhotoResponseNonUniformity = 0.005; +constexpr float kInputReferredReadNoise = 3; + +// Assumes a 35mm sensor. +constexpr float kSensorAreaUm2 = 36000.f * 24000; + +template +inline constexpr T Square(const T x) { + return x * x; +} +template +inline constexpr T Cube(const T x) { + return x * x * x; +} + +} // namespace + +NoiseParams SimulatePhotonNoise(const size_t xsize, const size_t ysize, + const float iso) { + const float kOpsinAbsorbanceBiasCbrt = std::cbrt(kOpsinAbsorbanceBias[1]); + + // Focal plane exposure for 18% of kDefaultIntensityTarget, in lx·s. + // (ISO = 10 lx·s ÷ H) + const float h_18 = 10 / iso; + + const float pixel_area_um2 = kSensorAreaUm2 / (xsize * ysize); + + const float electrons_per_pixel_18 = kEffectiveQuantumEfficiency * + kPhotonsPerLxSPerUm2 * h_18 * + pixel_area_um2; + + NoiseParams params; + + for (size_t i = 0; i < NoiseParams::kNumNoisePoints; ++i) { + const float scaled_index = i / (NoiseParams::kNumNoisePoints - 2.f); + // scaled_index is used for XYB = (0, 2·scaled_index, 2·scaled_index) + const float y = 2 * scaled_index; + // 1 = default intensity target + const float linear = std::max( + 0.f, Cube(y - kOpsinAbsorbanceBiasCbrt) + kOpsinAbsorbanceBias[1]); + const float electrons_per_pixel = electrons_per_pixel_18 * (linear / 0.18f); + // Quadrature sum of read noise, photon shot noise (sqrt(S) so simply not + // squared here) and photo response non-uniformity. + // https://doi.org/10.1117/3.725073 + // Units are electrons rms. + const float noise = + std::sqrt(Square(kInputReferredReadNoise) + electrons_per_pixel + + Square(kPhotoResponseNonUniformity * electrons_per_pixel)); + const float linear_noise = noise * (0.18f / electrons_per_pixel_18); + const float opsin_derivative = + (1.f / 3) / Square(std::cbrt(linear - kOpsinAbsorbanceBias[1])); + const float opsin_noise = linear_noise * opsin_derivative; + + // TODO(sboukortt): verify more thoroughly whether the denominator is + // correct. + params.lut[i] = + Clamp1(opsin_noise / + (0.22f // norm_const + * std::sqrt(2.f) // red_noise + green_noise + * 1.13f // standard deviation of a plane of generated noise + ), + 0.f, 1.f); + } + + return params; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_photon_noise.h b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.h new file mode 100644 index 0000000000..f43e14d560 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_PHOTON_NOISE_H_ +#define LIB_JXL_ENC_PHOTON_NOISE_H_ + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/image.h" +#include "lib/jxl/noise.h" + +namespace jxl { + +// Constructs a NoiseParams representing the noise that would be seen at the +// selected nominal exposure on a last-decade (as of 2021) color camera with a +// 36×24mm sensor (“35mm format”). +NoiseParams SimulatePhotonNoise(size_t xsize, size_t ysize, float iso); + +} // namespace jxl + +#endif // LIB_JXL_ENC_PHOTON_NOISE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc b/third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc new file mode 100644 index 0000000000..be11b465ad --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc @@ -0,0 +1,51 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_photon_noise.h" + +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +using ::testing::FloatNear; +using ::testing::Pointwise; + +MATCHER(AreApproximatelyEqual, "") { + constexpr float kTolerance = 1e-6; + const float actual = std::get<0>(arg); + const float expected = std::get<1>(arg); + return testing::ExplainMatchResult(FloatNear(expected, kTolerance), actual, + result_listener); +} + +TEST(EncPhotonNoiseTest, LUTs) { + EXPECT_THAT( + SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/100).lut, + Pointwise(AreApproximatelyEqual(), + {0.00259652, 0.0139648, 0.00681551, 0.00632582, 0.00694917, + 0.00803922, 0.00934574, 0.0107607})); + EXPECT_THAT( + SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/800).lut, + Pointwise(AreApproximatelyEqual(), + {0.02077220, 0.0420923, 0.01820690, 0.01439020, 0.01293670, + 0.01254030, 0.01277390, 0.0134161})); + EXPECT_THAT( + SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/6400).lut, + Pointwise(AreApproximatelyEqual(), + {0.1661770, 0.1691120, 0.05309080, 0.03963960, 0.03357410, + 0.03001650, 0.02776740, 0.0263478})); + + // Lower when measured on a per-pixel basis as there are fewer of them. + EXPECT_THAT( + SimulatePhotonNoise(/*xsize=*/4000, /*ysize=*/3000, /*iso=*/6400).lut, + Pointwise(AreApproximatelyEqual(), + {0.0830886, 0.1008720, 0.0367748, 0.0280305, 0.0240236, + 0.0218040, 0.0205771, 0.0200058})); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc new file mode 100644 index 0000000000..b65319f3fd --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc @@ -0,0 +1,82 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_progressive_split.h" + +#include + +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" + +namespace jxl { + +template +void ProgressiveSplitter::SplitACCoefficients( + const T* JXL_RESTRICT block, const AcStrategy& acs, size_t bx, size_t by, + T* JXL_RESTRICT output[kMaxNumPasses]) { + size_t size = acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize; + auto shift_right_round0 = [&](T v, int shift) { + T one_if_negative = static_cast(v) >> 31; + T add = (one_if_negative << shift) - one_if_negative; + return (v + add) >> shift; + }; + // Early quit for the simple case of only one pass. + if (mode_.num_passes == 1) { + memcpy(output[0], block, sizeof(T) * size); + return; + } + size_t ncoeffs_all_done_from_earlier_passes = 1; + + int previous_pass_shift = 0; + for (size_t num_pass = 0; num_pass < mode_.num_passes; num_pass++) { // pass + // Zero out output block. + memset(output[num_pass], 0, size * sizeof(T)); + const int pass_shift = mode_.passes[num_pass].shift; + size_t frame_ncoeffs = mode_.passes[num_pass].num_coefficients; + size_t xsize = acs.covered_blocks_x(); + size_t ysize = acs.covered_blocks_y(); + CoefficientLayout(&ysize, &xsize); + for (size_t y = 0; y < ysize * frame_ncoeffs; y++) { // superblk-y + for (size_t x = 0; x < xsize * frame_ncoeffs; x++) { // superblk-x + size_t pos = y * xsize * kBlockDim + x; + if (x < xsize * ncoeffs_all_done_from_earlier_passes && + y < ysize * ncoeffs_all_done_from_earlier_passes) { + // This coefficient was already included in an earlier pass, + // which included a genuinely smaller set of coefficients. + continue; + } + T v = block[pos]; + // Previous pass discarded some bits: do not encode them again. + if (previous_pass_shift != 0) { + T previous_v = shift_right_round0(v, previous_pass_shift) * + (1 << previous_pass_shift); + v -= previous_v; + } + output[num_pass][pos] = shift_right_round0(v, pass_shift); + } // superblk-x + } // superblk-y + // We just finished a pass. + // Hence, we are now guaranteed to have included all coeffs up to + // frame_ncoeffs in every block, unless the current pass is shifted. + if (mode_.passes[num_pass].shift == 0) { + ncoeffs_all_done_from_earlier_passes = frame_ncoeffs; + } + previous_pass_shift = mode_.passes[num_pass].shift; + } // num_pass +} + +template void ProgressiveSplitter::SplitACCoefficients( + const int32_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t, + int32_t* JXL_RESTRICT[kMaxNumPasses]); + +template void ProgressiveSplitter::SplitACCoefficients( + const int16_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t, + int16_t* JXL_RESTRICT[kMaxNumPasses]); + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_progressive_split.h b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.h new file mode 100644 index 0000000000..ef25944bb7 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.h @@ -0,0 +1,131 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_PROGRESSIVE_SPLIT_H_ +#define LIB_JXL_PROGRESSIVE_SPLIT_H_ + +#include +#include + +#include +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/splines.h" + +// Functions to split DCT coefficients in multiple passes. All the passes of a +// single frame are added together. + +namespace jxl { + +constexpr size_t kNoDownsamplingFactor = std::numeric_limits::max(); + +struct PassDefinition { + // Side of the square of the coefficients that should be kept in each 8x8 + // block. Must be greater than 1, and at most 8. Should be in non-decreasing + // order. + size_t num_coefficients; + + // How much to shift the encoded values by, with rounding. + size_t shift; + + // If specified, this indicates that if the requested downsampling factor is + // sufficiently high, then it is fine to stop decoding after this pass. + // By default, passes are not marked as being suitable for any downsampling. + size_t suitable_for_downsampling_of_at_least; +}; + +struct ProgressiveMode { + size_t num_passes = 1; + PassDefinition passes[kMaxNumPasses] = { + PassDefinition{/*num_coefficients=*/8, /*shift=*/0, + /*suitable_for_downsampling_of_at_least=*/1}}; + + ProgressiveMode() = default; + + template + explicit ProgressiveMode(const PassDefinition (&p)[nump]) { + JXL_ASSERT(nump <= kMaxNumPasses); + num_passes = nump; + PassDefinition previous_pass{ + /*num_coefficients=*/1, /*shift=*/0, + /*suitable_for_downsampling_of_at_least=*/kNoDownsamplingFactor}; + size_t last_downsampling_factor = kNoDownsamplingFactor; + for (size_t i = 0; i < nump; i++) { + JXL_ASSERT(p[i].num_coefficients > previous_pass.num_coefficients || + (p[i].num_coefficients == previous_pass.num_coefficients && + p[i].shift < previous_pass.shift)); + JXL_ASSERT(p[i].suitable_for_downsampling_of_at_least == + kNoDownsamplingFactor || + p[i].suitable_for_downsampling_of_at_least <= + last_downsampling_factor); + // Only used inside assert. + (void)last_downsampling_factor; + if (p[i].suitable_for_downsampling_of_at_least != kNoDownsamplingFactor) { + last_downsampling_factor = p[i].suitable_for_downsampling_of_at_least; + } + previous_pass = passes[i] = p[i]; + } + } +}; + +class ProgressiveSplitter { + public: + void SetProgressiveMode(ProgressiveMode mode) { mode_ = mode; } + + size_t GetNumPasses() const { return mode_.num_passes; } + + void InitPasses(Passes* JXL_RESTRICT passes) const { + passes->num_passes = static_cast(GetNumPasses()); + passes->num_downsample = 0; + JXL_ASSERT(passes->num_passes != 0); + passes->shift[passes->num_passes - 1] = 0; + if (passes->num_passes == 1) return; // Done, arrays are empty + + for (uint32_t i = 0; i < mode_.num_passes - 1; ++i) { + const size_t min_downsampling_factor = + mode_.passes[i].suitable_for_downsampling_of_at_least; + passes->shift[i] = mode_.passes[i].shift; + if (1 < min_downsampling_factor && + min_downsampling_factor != kNoDownsamplingFactor) { + passes->downsample[passes->num_downsample] = min_downsampling_factor; + passes->last_pass[passes->num_downsample] = i; + if (mode_.passes[i + 1].suitable_for_downsampling_of_at_least < + min_downsampling_factor) { + passes->num_downsample += 1; + } + } + } + } + + template + void SplitACCoefficients(const T* JXL_RESTRICT block, const AcStrategy& acs, + size_t bx, size_t by, + T* JXL_RESTRICT output[kMaxNumPasses]); + + private: + ProgressiveMode mode_; +}; + +extern template void ProgressiveSplitter::SplitACCoefficients( + const int32_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t, + int32_t* JXL_RESTRICT[kMaxNumPasses]); + +extern template void ProgressiveSplitter::SplitACCoefficients( + const int16_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t, + int16_t* JXL_RESTRICT[kMaxNumPasses]); + +} // namespace jxl + +#endif // LIB_JXL_PROGRESSIVE_SPLIT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc new file mode 100644 index 0000000000..848310e75d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc @@ -0,0 +1,214 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_quant_weights.h" + +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +struct AuxOut; + +namespace { + +Status EncodeDctParams(const DctQuantWeightParams& params, BitWriter* writer) { + JXL_ASSERT(params.num_distance_bands >= 1); + writer->Write(DctQuantWeightParams::kLog2MaxDistanceBands, + params.num_distance_bands - 1); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < params.num_distance_bands; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Write( + params.distance_bands[c][i] * (i == 0 ? (1 / 64.0f) : 1.0f), writer)); + } + } + return true; +} + +Status EncodeQuant(const QuantEncoding& encoding, size_t idx, size_t size_x, + size_t size_y, BitWriter* writer, + ModularFrameEncoder* modular_frame_encoder) { + writer->Write(kLog2NumQuantModes, encoding.mode); + size_x *= kBlockDim; + size_y *= kBlockDim; + switch (encoding.mode) { + case QuantEncoding::kQuantModeLibrary: { + writer->Write(kCeilLog2NumPredefinedTables, encoding.predefined); + break; + } + case QuantEncoding::kQuantModeID: { + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 3; i++) { + JXL_RETURN_IF_ERROR( + F16Coder::Write(encoding.idweights[c][i] * (1.0f / 64), writer)); + } + } + break; + } + case QuantEncoding::kQuantModeDCT2: { + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 6; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Write( + encoding.dct2weights[c][i] * (1.0f / 64), writer)); + } + } + break; + } + case QuantEncoding::kQuantModeDCT4X8: { + for (size_t c = 0; c < 3; c++) { + JXL_RETURN_IF_ERROR( + F16Coder::Write(encoding.dct4x8multipliers[c], writer)); + } + JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer)); + break; + } + case QuantEncoding::kQuantModeDCT4: { + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 2; i++) { + JXL_RETURN_IF_ERROR( + F16Coder::Write(encoding.dct4multipliers[c][i], writer)); + } + } + JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer)); + break; + } + case QuantEncoding::kQuantModeDCT: { + JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer)); + break; + } + case QuantEncoding::kQuantModeRAW: { + ModularFrameEncoder::EncodeQuantTable(size_x, size_y, writer, encoding, + idx, modular_frame_encoder); + break; + } + case QuantEncoding::kQuantModeAFV: { + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 9; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Write( + encoding.afv_weights[c][i] * (i < 6 ? 1.0f / 64 : 1.0f), writer)); + } + } + JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer)); + JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params_afv_4x4, writer)); + break; + } + } + return true; +} + +} // namespace + +Status DequantMatricesEncode(const DequantMatrices* matrices, BitWriter* writer, + size_t layer, AuxOut* aux_out, + ModularFrameEncoder* modular_frame_encoder) { + bool all_default = true; + const std::vector& encodings = matrices->encodings(); + + for (size_t i = 0; i < encodings.size(); i++) { + if (encodings[i].mode != QuantEncoding::kQuantModeLibrary || + encodings[i].predefined != 0) { + all_default = false; + } + } + // TODO(janwas): better bound + BitWriter::Allotment allotment(writer, 512 * 1024); + writer->Write(1, all_default); + if (!all_default) { + for (size_t i = 0; i < encodings.size(); i++) { + JXL_RETURN_IF_ERROR(EncodeQuant( + encodings[i], i, DequantMatrices::required_size_x[i], + DequantMatrices::required_size_y[i], writer, modular_frame_encoder)); + } + } + allotment.ReclaimAndCharge(writer, layer, aux_out); + return true; +} + +Status DequantMatricesEncodeDC(const DequantMatrices* matrices, + BitWriter* writer, size_t layer, + AuxOut* aux_out) { + bool all_default = true; + const float* dc_quant = matrices->DCQuants(); + for (size_t c = 0; c < 3; c++) { + if (dc_quant[c] != kDCQuant[c]) { + all_default = false; + } + } + BitWriter::Allotment allotment(writer, 1 + sizeof(float) * kBitsPerByte * 3); + writer->Write(1, all_default); + if (!all_default) { + for (size_t c = 0; c < 3; c++) { + JXL_RETURN_IF_ERROR(F16Coder::Write(dc_quant[c] * 128.0f, writer)); + } + } + allotment.ReclaimAndCharge(writer, layer, aux_out); + return true; +} + +void DequantMatricesSetCustomDC(DequantMatrices* matrices, const float* dc) { + matrices->SetDCQuant(dc); + // Roundtrip encode/decode DC to ensure same values as decoder. + BitWriter writer; + JXL_CHECK(DequantMatricesEncodeDC(matrices, &writer, 0, nullptr)); + writer.ZeroPadToByte(); + BitReader br(writer.GetSpan()); + // Called only in the encoder: should fail only for programmer errors. + JXL_CHECK(matrices->DecodeDC(&br)); + JXL_CHECK(br.Close()); +} + +void DequantMatricesScaleDC(DequantMatrices* matrices, const float scale) { + float dc[3]; + for (size_t c = 0; c < 3; ++c) { + dc[c] = matrices->InvDCQuant(c) * (1.0f / scale); + } + DequantMatricesSetCustomDC(matrices, dc); +} + +void DequantMatricesRoundtrip(DequantMatrices* matrices) { + // Do not pass modular en/decoder, as they only change entropy and not + // values. + BitWriter writer; + JXL_CHECK(DequantMatricesEncode(matrices, &writer, 0, nullptr)); + writer.ZeroPadToByte(); + BitReader br(writer.GetSpan()); + // Called only in the encoder: should fail only for programmer errors. + JXL_CHECK(matrices->Decode(&br)); + JXL_CHECK(br.Close()); +} + +void DequantMatricesSetCustom(DequantMatrices* matrices, + const std::vector& encodings, + ModularFrameEncoder* encoder) { + JXL_ASSERT(encodings.size() == DequantMatrices::kNum); + matrices->SetEncodings(encodings); + for (size_t i = 0; i < encodings.size(); i++) { + if (encodings[i].mode == QuantEncodingInternal::kQuantModeRAW) { + encoder->AddQuantTable(DequantMatrices::required_size_x[i] * kBlockDim, + DequantMatrices::required_size_y[i] * kBlockDim, + encodings[i], i); + } + } + DequantMatricesRoundtrip(matrices); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_quant_weights.h b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.h new file mode 100644 index 0000000000..e0a387fed5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.h @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_QUANT_WEIGHTS_H_ +#define LIB_JXL_ENC_QUANT_WEIGHTS_H_ + +#include "lib/jxl/quant_weights.h" + +namespace jxl { + +struct AuxOut; +struct BitWriter; + +Status DequantMatricesEncode( + const DequantMatrices* matrices, BitWriter* writer, size_t layer, + AuxOut* aux_out, ModularFrameEncoder* modular_frame_encoder = nullptr); +Status DequantMatricesEncodeDC(const DequantMatrices* matrices, + BitWriter* writer, size_t layer, + AuxOut* aux_out); +// For consistency with QuantEncoding, higher values correspond to more +// precision. +void DequantMatricesSetCustomDC(DequantMatrices* matrices, const float* dc); + +void DequantMatricesScaleDC(DequantMatrices* matrices, float scale); + +void DequantMatricesSetCustom(DequantMatrices* matrices, + const std::vector& encodings, + ModularFrameEncoder* encoder); + +// Roundtrip encode/decode the matrices to ensure same values as decoder. +void DequantMatricesRoundtrip(DequantMatrices* matrices); + +} // namespace jxl + +#endif // LIB_JXL_ENC_QUANT_WEIGHTS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_splines.cc b/third_party/jpeg-xl/lib/jxl/enc_splines.cc new file mode 100644 index 0000000000..ddcd78a748 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_splines.cc @@ -0,0 +1,98 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/splines.h" + +namespace jxl { + +struct AuxOut; + +class QuantizedSplineEncoder { + public: + // Only call if HasAny(). + static void Tokenize(const QuantizedSpline& spline, + std::vector* const tokens) { + tokens->emplace_back(kNumControlPointsContext, + spline.control_points_.size()); + for (const auto& point : spline.control_points_) { + tokens->emplace_back(kControlPointsContext, PackSigned(point.first)); + tokens->emplace_back(kControlPointsContext, PackSigned(point.second)); + } + const auto encode_dct = [tokens](const int dct[32]) { + for (int i = 0; i < 32; ++i) { + tokens->emplace_back(kDCTContext, PackSigned(dct[i])); + } + }; + for (int c = 0; c < 3; ++c) { + encode_dct(spline.color_dct_[c]); + } + encode_dct(spline.sigma_dct_); + } +}; + +namespace { + +void EncodeAllStartingPoints(const std::vector& points, + std::vector* tokens) { + int64_t last_x = 0; + int64_t last_y = 0; + for (size_t i = 0; i < points.size(); i++) { + const int64_t x = lroundf(points[i].x); + const int64_t y = lroundf(points[i].y); + if (i == 0) { + tokens->emplace_back(kStartingPositionContext, x); + tokens->emplace_back(kStartingPositionContext, y); + } else { + tokens->emplace_back(kStartingPositionContext, PackSigned(x - last_x)); + tokens->emplace_back(kStartingPositionContext, PackSigned(y - last_y)); + } + last_x = x; + last_y = y; + } +} + +} // namespace + +void EncodeSplines(const Splines& splines, BitWriter* writer, + const size_t layer, const HistogramParams& histogram_params, + AuxOut* aux_out) { + JXL_ASSERT(splines.HasAny()); + + const std::vector& quantized_splines = + splines.QuantizedSplines(); + std::vector> tokens(1); + tokens[0].emplace_back(kNumSplinesContext, quantized_splines.size() - 1); + EncodeAllStartingPoints(splines.StartingPoints(), &tokens[0]); + + tokens[0].emplace_back(kQuantizationAdjustmentContext, + PackSigned(splines.GetQuantizationAdjustment())); + + for (const QuantizedSpline& spline : quantized_splines) { + QuantizedSplineEncoder::Tokenize(spline, &tokens[0]); + } + + EntropyEncodingData codes; + std::vector context_map; + BuildAndEncodeHistograms(histogram_params, kNumSplineContexts, tokens, &codes, + &context_map, writer, layer, aux_out); + WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out); +} + +Splines FindSplines(const Image3F& opsin) { + // TODO: implement spline detection. + return {}; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_splines.h b/third_party/jpeg-xl/lib/jxl/enc_splines.h new file mode 100644 index 0000000000..be700dba75 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_splines.h @@ -0,0 +1,38 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_SPLINES_H_ +#define LIB_JXL_ENC_SPLINES_H_ + +#include +#include + +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/image.h" +#include "lib/jxl/splines.h" + +namespace jxl { + +struct AuxOut; + +// Only call if splines.HasAny(). +void EncodeSplines(const Splines& splines, BitWriter* writer, size_t layer, + const HistogramParams& histogram_params, AuxOut* aux_out); + +Splines FindSplines(const Image3F& opsin); + +} // namespace jxl + +#endif // LIB_JXL_ENC_SPLINES_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_toc.cc b/third_party/jpeg-xl/lib/jxl/enc_toc.cc new file mode 100644 index 0000000000..dc75fdd9ba --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_toc.cc @@ -0,0 +1,45 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_toc.h" + +#include + +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/toc.h" + +namespace jxl { +Status WriteGroupOffsets(const std::vector& group_codes, + const std::vector* permutation, + BitWriter* JXL_RESTRICT writer, AuxOut* aux_out) { + BitWriter::Allotment allotment(writer, MaxBits(group_codes.size())); + if (permutation && !group_codes.empty()) { + // Don't write a permutation at all for an empty group_codes. + writer->Write(1, 1); // permutation + JXL_DASSERT(permutation->size() == group_codes.size()); + EncodePermutation(permutation->data(), /*skip=*/0, permutation->size(), + writer, /* layer= */ 0, aux_out); + + } else { + writer->Write(1, 0); // no permutation + } + writer->ZeroPadToByte(); // before TOC entries + + for (size_t i = 0; i < group_codes.size(); i++) { + JXL_ASSERT(group_codes[i].BitsWritten() % kBitsPerByte == 0); + const size_t group_size = group_codes[i].BitsWritten() / kBitsPerByte; + JXL_RETURN_IF_ERROR(U32Coder::Write(kTocDist, group_size, writer)); + } + writer->ZeroPadToByte(); // before first group + allotment.ReclaimAndCharge(writer, kLayerTOC, aux_out); + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_toc.h b/third_party/jpeg-xl/lib/jxl/enc_toc.h new file mode 100644 index 0000000000..242b3efccb --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_toc.h @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_TOC_H_ +#define LIB_JXL_ENC_TOC_H_ + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +struct AuxOut; + +// Writes the group offsets. If the permutation vector is nullptr, the identity +// permutation will be used. +Status WriteGroupOffsets(const std::vector& group_codes, + const std::vector* permutation, + BitWriter* JXL_RESTRICT writer, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_TOC_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h b/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h new file mode 100644 index 0000000000..ef6dc2bbd7 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h @@ -0,0 +1,827 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JXL_ENC_TRANSFORMS_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_ENC_TRANSFORMS_INL_H_ +#undef LIB_JXL_ENC_TRANSFORMS_INL_H_ +#else +#define LIB_JXL_ENC_TRANSFORMS_INL_H_ +#endif + +#include + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dct-inl.h" +#include "lib/jxl/dct_scales.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// Inverse of ReinterpretingDCT. +template +HWY_INLINE void ReinterpretingIDCT(const float* input, + const size_t input_stride, float* output, + const size_t output_stride) { + HWY_ALIGN float block[ROWS * COLS] = {}; + if (ROWS < COLS) { + for (size_t y = 0; y < LF_ROWS; y++) { + for (size_t x = 0; x < LF_COLS; x++) { + block[y * COLS + x] = input[y * input_stride + x] * + DCTTotalResampleScale(y) * + DCTTotalResampleScale(x); + } + } + } else { + for (size_t y = 0; y < LF_COLS; y++) { + for (size_t x = 0; x < LF_ROWS; x++) { + block[y * ROWS + x] = input[y * input_stride + x] * + DCTTotalResampleScale(y) * + DCTTotalResampleScale(x); + } + } + } + + // ROWS, COLS <= 8, so we can put scratch space on the stack. + HWY_ALIGN float scratch_space[ROWS * COLS]; + ComputeScaledIDCT()(block, DCTTo(output, output_stride), + scratch_space); +} + +template +void DCT2TopBlock(const float* block, size_t stride, float* out) { + static_assert(kBlockDim % S == 0, "S should be a divisor of kBlockDim"); + static_assert(S % 2 == 0, "S should be even"); + float temp[kDCTBlockSize]; + constexpr size_t num_2x2 = S / 2; + for (size_t y = 0; y < num_2x2; y++) { + for (size_t x = 0; x < num_2x2; x++) { + float c00 = block[y * 2 * stride + x * 2]; + float c01 = block[y * 2 * stride + x * 2 + 1]; + float c10 = block[(y * 2 + 1) * stride + x * 2]; + float c11 = block[(y * 2 + 1) * stride + x * 2 + 1]; + float r00 = c00 + c01 + c10 + c11; + float r01 = c00 + c01 - c10 - c11; + float r10 = c00 - c01 + c10 - c11; + float r11 = c00 - c01 - c10 + c11; + r00 *= 0.25f; + r01 *= 0.25f; + r10 *= 0.25f; + r11 *= 0.25f; + temp[y * kBlockDim + x] = r00; + temp[y * kBlockDim + num_2x2 + x] = r01; + temp[(y + num_2x2) * kBlockDim + x] = r10; + temp[(y + num_2x2) * kBlockDim + num_2x2 + x] = r11; + } + } + for (size_t y = 0; y < S; y++) { + for (size_t x = 0; x < S; x++) { + out[y * kBlockDim + x] = temp[y * kBlockDim + x]; + } + } +} + +void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs) { + HWY_ALIGN static constexpr float k4x4AFVBasisTranspose[16][16] = { + { + 0.2500000000000000, + 0.8769029297991420f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + -0.4105377591765233f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + }, + { + 0.2500000000000000, + 0.2206518106944235f, + 0.0000000000000000, + 0.0000000000000000, + -0.7071067811865474f, + 0.6235485373547691f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + }, + { + 0.2500000000000000, + -0.1014005039375376f, + 0.4067007583026075f, + -0.2125574805828875f, + 0.0000000000000000, + -0.0643507165794627f, + -0.4517556589999482f, + -0.3046847507248690f, + 0.3017929516615495f, + 0.4082482904638627f, + 0.1747866975480809f, + -0.2110560104933578f, + -0.1426608480880726f, + -0.1381354035075859f, + -0.1743760259965107f, + 0.1135498731499434f, + }, + { + 0.2500000000000000, + -0.1014005039375375f, + 0.4444481661973445f, + 0.3085497062849767f, + 0.0000000000000000f, + -0.0643507165794627f, + 0.1585450355184006f, + 0.5112616136591823f, + 0.2579236279634118f, + 0.0000000000000000, + 0.0812611176717539f, + 0.1856718091610980f, + -0.3416446842253372f, + 0.3302282550303788f, + 0.0702790691196284f, + -0.0741750459581035f, + }, + { + 0.2500000000000000, + 0.2206518106944236f, + 0.0000000000000000, + 0.0000000000000000, + 0.7071067811865476f, + 0.6235485373547694f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + }, + { + 0.2500000000000000, + -0.1014005039375378f, + 0.0000000000000000, + 0.4706702258572536f, + 0.0000000000000000, + -0.0643507165794628f, + -0.0403851516082220f, + 0.0000000000000000, + 0.1627234014286620f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.7367497537172237f, + 0.0875511500058708f, + -0.2921026642334881f, + 0.1940289303259434f, + }, + { + 0.2500000000000000, + -0.1014005039375377f, + 0.1957439937204294f, + -0.1621205195722993f, + 0.0000000000000000, + -0.0643507165794628f, + 0.0074182263792424f, + -0.2904801297289980f, + 0.0952002265347504f, + 0.0000000000000000, + -0.3675398009862027f, + 0.4921585901373873f, + 0.2462710772207515f, + -0.0794670660590957f, + 0.3623817333531167f, + -0.4351904965232280f, + }, + { + 0.2500000000000000, + -0.1014005039375376f, + 0.2929100136981264f, + 0.0000000000000000, + 0.0000000000000000, + -0.0643507165794627f, + 0.3935103426921017f, + -0.0657870154914280f, + 0.0000000000000000, + -0.4082482904638628f, + -0.3078822139579090f, + -0.3852501370925192f, + -0.0857401903551931f, + -0.4613374887461511f, + 0.0000000000000000, + 0.2191868483885747f, + }, + { + 0.2500000000000000, + -0.1014005039375376f, + -0.4067007583026072f, + -0.2125574805828705f, + 0.0000000000000000, + -0.0643507165794627f, + -0.4517556589999464f, + 0.3046847507248840f, + 0.3017929516615503f, + -0.4082482904638635f, + -0.1747866975480813f, + 0.2110560104933581f, + -0.1426608480880734f, + -0.1381354035075829f, + -0.1743760259965108f, + 0.1135498731499426f, + }, + { + 0.2500000000000000, + -0.1014005039375377f, + -0.1957439937204287f, + -0.1621205195722833f, + 0.0000000000000000, + -0.0643507165794628f, + 0.0074182263792444f, + 0.2904801297290076f, + 0.0952002265347505f, + 0.0000000000000000, + 0.3675398009862011f, + -0.4921585901373891f, + 0.2462710772207514f, + -0.0794670660591026f, + 0.3623817333531165f, + -0.4351904965232251f, + }, + { + 0.2500000000000000, + -0.1014005039375375f, + 0.0000000000000000, + -0.4706702258572528f, + 0.0000000000000000, + -0.0643507165794627f, + 0.1107416575309343f, + 0.0000000000000000, + -0.1627234014286617f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.1488339922711357f, + 0.4972464710953509f, + 0.2921026642334879f, + 0.5550443808910661f, + }, + { + 0.2500000000000000, + -0.1014005039375377f, + 0.1137907446044809f, + -0.1464291867126764f, + 0.0000000000000000, + -0.0643507165794628f, + 0.0829816309488205f, + -0.2388977352334460f, + -0.3531238544981630f, + -0.4082482904638630f, + 0.4826689115059883f, + 0.1741941265991622f, + -0.0476868035022925f, + 0.1253805944856366f, + -0.4326608024727445f, + -0.2546827712406646f, + }, + { + 0.2500000000000000, + -0.1014005039375377f, + -0.4444481661973438f, + 0.3085497062849487f, + 0.0000000000000000, + -0.0643507165794628f, + 0.1585450355183970f, + -0.5112616136592012f, + 0.2579236279634129f, + 0.0000000000000000, + -0.0812611176717504f, + -0.1856718091610990f, + -0.3416446842253373f, + 0.3302282550303805f, + 0.0702790691196282f, + -0.0741750459581023f, + }, + { + 0.2500000000000000, + -0.1014005039375376f, + -0.2929100136981264f, + 0.0000000000000000, + 0.0000000000000000, + -0.0643507165794627f, + 0.3935103426921022f, + 0.0657870154914254f, + 0.0000000000000000, + 0.4082482904638634f, + 0.3078822139579031f, + 0.3852501370925211f, + -0.0857401903551927f, + -0.4613374887461554f, + 0.0000000000000000, + 0.2191868483885728f, + }, + { + 0.2500000000000000, + -0.1014005039375376f, + -0.1137907446044814f, + -0.1464291867126654f, + 0.0000000000000000, + -0.0643507165794627f, + 0.0829816309488214f, + 0.2388977352334547f, + -0.3531238544981624f, + 0.4082482904638630f, + -0.4826689115059858f, + -0.1741941265991621f, + -0.0476868035022928f, + 0.1253805944856431f, + -0.4326608024727457f, + -0.2546827712406641f, + }, + { + 0.2500000000000000, + -0.1014005039375374f, + 0.0000000000000000, + 0.4251149611657548f, + 0.0000000000000000, + -0.0643507165794626f, + -0.4517556589999480f, + 0.0000000000000000, + -0.6035859033230976f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + -0.1426608480880724f, + -0.1381354035075845f, + 0.3487520519930227f, + 0.1135498731499429f, + }, + }; + + const HWY_CAPPED(float, 16) d; + for (size_t i = 0; i < 16; i += Lanes(d)) { + auto scalar = Zero(d); + for (size_t j = 0; j < 16; j++) { + auto px = Set(d, pixels[j]); + auto basis = Load(d, k4x4AFVBasisTranspose[j] + i); + scalar = MulAdd(px, basis, scalar); + } + Store(scalar, d, coeffs + i); + } +} + +// Coefficient layout: +// - (even, even) positions hold AFV coefficients +// - (odd, even) positions hold DCT4x4 coefficients +// - (any, odd) positions hold DCT4x8 coefficients +template +void AFVTransformFromPixels(const float* JXL_RESTRICT pixels, + size_t pixels_stride, + float* JXL_RESTRICT coefficients) { + HWY_ALIGN float scratch_space[4 * 8 * 2]; + size_t afv_x = afv_kind & 1; + size_t afv_y = afv_kind / 2; + HWY_ALIGN float block[4 * 8]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + block[(afv_y == 1 ? 3 - iy : iy) * 4 + (afv_x == 1 ? 3 - ix : ix)] = + pixels[(iy + 4 * afv_y) * pixels_stride + ix + 4 * afv_x]; + } + } + // AFV coefficients in (even, even) positions. + HWY_ALIGN float coeff[4 * 4]; + AFVDCT4x4(block, coeff); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + coefficients[iy * 2 * 8 + ix * 2] = coeff[iy * 4 + ix]; + } + } + // 4x4 DCT of the block with same y and different x. + ComputeScaledDCT<4, 4>()( + DCTFrom(pixels + afv_y * 4 * pixels_stride + (afv_x == 1 ? 0 : 4), + pixels_stride), + block, scratch_space); + // ... in (odd, even) positions. + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + coefficients[iy * 2 * 8 + ix * 2 + 1] = block[iy * 4 + ix]; + } + } + // 4x8 DCT of the other half of the block. + ComputeScaledDCT<4, 8>()( + DCTFrom(pixels + (afv_y == 1 ? 0 : 4) * pixels_stride, pixels_stride), + block, scratch_space); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + coefficients[(1 + iy * 2) * 8 + ix] = block[iy * 8 + ix]; + } + } + float block00 = coefficients[0] * 0.25f; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + coefficients[0] = (block00 + block01 + 2 * block10) * 0.25f; + coefficients[1] = (block00 - block01) * 0.5f; + coefficients[8] = (block00 + block01 - 2 * block10) * 0.25f; +} + +HWY_MAYBE_UNUSED void TransformFromPixels(const AcStrategy::Type strategy, + const float* JXL_RESTRICT pixels, + size_t pixels_stride, + float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT scratch_space) { + using Type = AcStrategy::Type; + switch (strategy) { + case Type::IDENTITY: { + PROFILER_ZONE("DCT Identity"); + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + float block_dc = 0; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + block_dc += pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix]; + } + } + block_dc *= 1.0f / 16; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 1 && iy == 1) continue; + coefficients[(y + iy * 2) * 8 + x + ix * 2] = + pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix] - + pixels[(y * 4 + 1) * pixels_stride + x * 4 + 1]; + } + } + coefficients[(y + 2) * 8 + x + 2] = coefficients[y * 8 + x]; + coefficients[y * 8 + x] = block_dc; + } + } + float block00 = coefficients[0]; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + float block11 = coefficients[9]; + coefficients[0] = (block00 + block01 + block10 + block11) * 0.25f; + coefficients[1] = (block00 + block01 - block10 - block11) * 0.25f; + coefficients[8] = (block00 - block01 + block10 - block11) * 0.25f; + coefficients[9] = (block00 - block01 - block10 + block11) * 0.25f; + break; + } + case Type::DCT8X4: { + PROFILER_ZONE("DCT 8x4"); + for (size_t x = 0; x < 2; x++) { + HWY_ALIGN float block[4 * 8]; + ComputeScaledDCT<8, 4>()(DCTFrom(pixels + x * 4, pixels_stride), block, + scratch_space); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + // Store transposed. + coefficients[(x + iy * 2) * 8 + ix] = block[iy * 8 + ix]; + } + } + } + float block0 = coefficients[0]; + float block1 = coefficients[8]; + coefficients[0] = (block0 + block1) * 0.5f; + coefficients[8] = (block0 - block1) * 0.5f; + break; + } + case Type::DCT4X8: { + PROFILER_ZONE("DCT 4x8"); + for (size_t y = 0; y < 2; y++) { + HWY_ALIGN float block[4 * 8]; + ComputeScaledDCT<4, 8>()( + DCTFrom(pixels + y * 4 * pixels_stride, pixels_stride), block, + scratch_space); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + coefficients[(y + iy * 2) * 8 + ix] = block[iy * 8 + ix]; + } + } + } + float block0 = coefficients[0]; + float block1 = coefficients[8]; + coefficients[0] = (block0 + block1) * 0.5f; + coefficients[8] = (block0 - block1) * 0.5f; + break; + } + case Type::DCT4X4: { + PROFILER_ZONE("DCT 4"); + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + HWY_ALIGN float block[4 * 4]; + ComputeScaledDCT<4, 4>()( + DCTFrom(pixels + y * 4 * pixels_stride + x * 4, pixels_stride), + block, scratch_space); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + coefficients[(y + iy * 2) * 8 + x + ix * 2] = block[iy * 4 + ix]; + } + } + } + } + float block00 = coefficients[0]; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + float block11 = coefficients[9]; + coefficients[0] = (block00 + block01 + block10 + block11) * 0.25f; + coefficients[1] = (block00 + block01 - block10 - block11) * 0.25f; + coefficients[8] = (block00 - block01 + block10 - block11) * 0.25f; + coefficients[9] = (block00 - block01 - block10 + block11) * 0.25f; + break; + } + case Type::DCT2X2: { + PROFILER_ZONE("DCT 2"); + DCT2TopBlock<8>(pixels, pixels_stride, coefficients); + DCT2TopBlock<4>(coefficients, kBlockDim, coefficients); + DCT2TopBlock<2>(coefficients, kBlockDim, coefficients); + break; + } + case Type::DCT16X16: { + PROFILER_ZONE("DCT 16"); + ComputeScaledDCT<16, 16>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT16X8: { + PROFILER_ZONE("DCT 16x8"); + ComputeScaledDCT<16, 8>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT8X16: { + PROFILER_ZONE("DCT 8x16"); + ComputeScaledDCT<8, 16>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT32X8: { + PROFILER_ZONE("DCT 32x8"); + ComputeScaledDCT<32, 8>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT8X32: { + PROFILER_ZONE("DCT 8x32"); + ComputeScaledDCT<8, 32>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT32X16: { + PROFILER_ZONE("DCT 32x16"); + ComputeScaledDCT<32, 16>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT16X32: { + PROFILER_ZONE("DCT 16x32"); + ComputeScaledDCT<16, 32>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT32X32: { + PROFILER_ZONE("DCT 32"); + ComputeScaledDCT<32, 32>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT: { + PROFILER_ZONE("DCT 8"); + ComputeScaledDCT<8, 8>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::AFV0: { + PROFILER_ZONE("AFV0"); + AFVTransformFromPixels<0>(pixels, pixels_stride, coefficients); + break; + } + case Type::AFV1: { + PROFILER_ZONE("AFV1"); + AFVTransformFromPixels<1>(pixels, pixels_stride, coefficients); + break; + } + case Type::AFV2: { + PROFILER_ZONE("AFV2"); + AFVTransformFromPixels<2>(pixels, pixels_stride, coefficients); + break; + } + case Type::AFV3: { + PROFILER_ZONE("AFV3"); + AFVTransformFromPixels<3>(pixels, pixels_stride, coefficients); + break; + } + case Type::DCT64X64: { + PROFILER_ZONE("DCT 64x64"); + ComputeScaledDCT<64, 64>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT64X32: { + PROFILER_ZONE("DCT 64x32"); + ComputeScaledDCT<64, 32>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT32X64: { + PROFILER_ZONE("DCT 32x64"); + ComputeScaledDCT<32, 64>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT128X128: { + PROFILER_ZONE("DCT 128x128"); + ComputeScaledDCT<128, 128>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT128X64: { + PROFILER_ZONE("DCT 128x64"); + ComputeScaledDCT<128, 64>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT64X128: { + PROFILER_ZONE("DCT 64x128"); + ComputeScaledDCT<64, 128>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT256X256: { + PROFILER_ZONE("DCT 256x256"); + ComputeScaledDCT<256, 256>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT256X128: { + PROFILER_ZONE("DCT 256x128"); + ComputeScaledDCT<256, 128>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT128X256: { + PROFILER_ZONE("DCT 128x256"); + ComputeScaledDCT<128, 256>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::kNumValidStrategies: + JXL_ABORT("Invalid strategy"); + } +} + +HWY_MAYBE_UNUSED void DCFromLowestFrequencies(const AcStrategy::Type strategy, + const float* block, float* dc, + size_t dc_stride) { + using Type = AcStrategy::Type; + switch (strategy) { + case Type::DCT16X8: { + ReinterpretingIDCT( + block, 2 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT8X16: { + ReinterpretingIDCT( + block, 2 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT16X16: { + ReinterpretingIDCT( + block, 2 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT32X8: { + ReinterpretingIDCT( + block, 4 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT8X32: { + ReinterpretingIDCT( + block, 4 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT32X16: { + ReinterpretingIDCT( + block, 4 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT16X32: { + ReinterpretingIDCT( + block, 4 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT32X32: { + ReinterpretingIDCT( + block, 4 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT64X32: { + ReinterpretingIDCT( + block, 8 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT32X64: { + ReinterpretingIDCT( + block, 8 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT64X64: { + ReinterpretingIDCT( + block, 8 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT128X64: { + ReinterpretingIDCT< + /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim, + /*LF_ROWS=*/16, /*LF_COLS=*/8, /*ROWS=*/16, /*COLS=*/8>( + block, 16 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT64X128: { + ReinterpretingIDCT< + /*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim, + /*LF_ROWS=*/8, /*LF_COLS=*/16, /*ROWS=*/8, /*COLS=*/16>( + block, 16 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT128X128: { + ReinterpretingIDCT< + /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim, + /*LF_ROWS=*/16, /*LF_COLS=*/16, /*ROWS=*/16, /*COLS=*/16>( + block, 16 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT256X128: { + ReinterpretingIDCT< + /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim, + /*LF_ROWS=*/32, /*LF_COLS=*/16, /*ROWS=*/32, /*COLS=*/16>( + block, 32 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT128X256: { + ReinterpretingIDCT< + /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim, + /*LF_ROWS=*/16, /*LF_COLS=*/32, /*ROWS=*/16, /*COLS=*/32>( + block, 32 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT256X256: { + ReinterpretingIDCT< + /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim, + /*LF_ROWS=*/32, /*LF_COLS=*/32, /*ROWS=*/32, /*COLS=*/32>( + block, 32 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT: + case Type::DCT2X2: + case Type::DCT4X4: + case Type::DCT4X8: + case Type::DCT8X4: + case Type::AFV0: + case Type::AFV1: + case Type::AFV2: + case Type::AFV3: + case Type::IDENTITY: + dc[0] = block[0]; + break; + case Type::kNumValidStrategies: + JXL_ABORT("Invalid strategy"); + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_ENC_TRANSFORMS_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_transforms.cc b/third_party/jpeg-xl/lib/jxl/enc_transforms.cc new file mode 100644 index 0000000000..8978ba1dcb --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_transforms.cc @@ -0,0 +1,41 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_transforms.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_transforms.cc" +#include +#include + +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/enc_transforms-inl.h" + +namespace jxl { + +#if HWY_ONCE +HWY_EXPORT(TransformFromPixels); +void TransformFromPixels(const AcStrategy::Type strategy, + const float* JXL_RESTRICT pixels, size_t pixels_stride, + float* JXL_RESTRICT coefficients, + float* scratch_space) { + return HWY_DYNAMIC_DISPATCH(TransformFromPixels)( + strategy, pixels, pixels_stride, coefficients, scratch_space); +} + +HWY_EXPORT(DCFromLowestFrequencies); +void DCFromLowestFrequencies(AcStrategy::Type strategy, const float* block, + float* dc, size_t dc_stride) { + return HWY_DYNAMIC_DISPATCH(DCFromLowestFrequencies)(strategy, block, dc, + dc_stride); +} + +HWY_EXPORT(AFVDCT4x4); +void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs) { + return HWY_DYNAMIC_DISPATCH(AFVDCT4x4)(pixels, coeffs); +} +#endif // HWY_ONCE + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/enc_transforms.h b/third_party/jpeg-xl/lib/jxl/enc_transforms.h new file mode 100644 index 0000000000..039ccc3893 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_transforms.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_TRANSFORMS_H_ +#define LIB_JXL_ENC_TRANSFORMS_H_ + +// Facade for (non-inlined) integral transforms. + +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +void TransformFromPixels(const AcStrategy::Type strategy, + const float* JXL_RESTRICT pixels, size_t pixels_stride, + float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT scratch_space); + +// Equivalent of the above for DC image. +void DCFromLowestFrequencies(AcStrategy::Type strategy, const float* block, + float* dc, size_t dc_stride); + +void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs); + +} // namespace jxl + +#endif // LIB_JXL_ENC_TRANSFORMS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/enc_xyb.cc b/third_party/jpeg-xl/lib/jxl/enc_xyb.cc new file mode 100644 index 0000000000..2ee0abf821 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_xyb.cc @@ -0,0 +1,520 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_xyb.h" + +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_xyb.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_image_bundle.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Sub; +using hwy::HWY_NAMESPACE::ZeroIfNegative; + +// 4x3 matrix * 3x1 SIMD vectors +template +JXL_INLINE void OpsinAbsorbance(const V r, const V g, const V b, + const float* JXL_RESTRICT premul_absorb, + V* JXL_RESTRICT mixed0, V* JXL_RESTRICT mixed1, + V* JXL_RESTRICT mixed2) { + const float* bias = &kOpsinAbsorbanceBias[0]; + const HWY_FULL(float) d; + const size_t N = Lanes(d); + const auto m0 = Load(d, premul_absorb + 0 * N); + const auto m1 = Load(d, premul_absorb + 1 * N); + const auto m2 = Load(d, premul_absorb + 2 * N); + const auto m3 = Load(d, premul_absorb + 3 * N); + const auto m4 = Load(d, premul_absorb + 4 * N); + const auto m5 = Load(d, premul_absorb + 5 * N); + const auto m6 = Load(d, premul_absorb + 6 * N); + const auto m7 = Load(d, premul_absorb + 7 * N); + const auto m8 = Load(d, premul_absorb + 8 * N); + *mixed0 = MulAdd(m0, r, MulAdd(m1, g, MulAdd(m2, b, Set(d, bias[0])))); + *mixed1 = MulAdd(m3, r, MulAdd(m4, g, MulAdd(m5, b, Set(d, bias[1])))); + *mixed2 = MulAdd(m6, r, MulAdd(m7, g, MulAdd(m8, b, Set(d, bias[2])))); +} + +template +void StoreXYB(const V r, V g, const V b, float* JXL_RESTRICT valx, + float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) { + const HWY_FULL(float) d; + const V half = Set(d, 0.5f); + Store(Mul(half, Sub(r, g)), d, valx); + Store(Mul(half, Add(r, g)), d, valy); + Store(b, d, valz); +} + +// Converts one RGB vector to XYB. +template +void LinearRGBToXYB(const V r, const V g, const V b, + const float* JXL_RESTRICT premul_absorb, + float* JXL_RESTRICT valx, float* JXL_RESTRICT valy, + float* JXL_RESTRICT valz) { + V mixed0, mixed1, mixed2; + OpsinAbsorbance(r, g, b, premul_absorb, &mixed0, &mixed1, &mixed2); + + // mixed* should be non-negative even for wide-gamut, so clamp to zero. + mixed0 = ZeroIfNegative(mixed0); + mixed1 = ZeroIfNegative(mixed1); + mixed2 = ZeroIfNegative(mixed2); + + const HWY_FULL(float) d; + const size_t N = Lanes(d); + mixed0 = CubeRootAndAdd(mixed0, Load(d, premul_absorb + 9 * N)); + mixed1 = CubeRootAndAdd(mixed1, Load(d, premul_absorb + 10 * N)); + mixed2 = CubeRootAndAdd(mixed2, Load(d, premul_absorb + 11 * N)); + StoreXYB(mixed0, mixed1, mixed2, valx, valy, valz); + + // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative. +} + +void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1, + float* JXL_RESTRICT row2, + const float* JXL_RESTRICT premul_absorb, size_t xsize) { + const HWY_FULL(float) d; + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto r = Load(d, row0 + x); + const auto g = Load(d, row1 + x); + const auto b = Load(d, row2 + x); + LinearRGBToXYB(r, g, b, premul_absorb, row0 + x, row1 + x, row2 + x); + } +} + +// Input/output uses the codec.h scaling: nominally 0-1 if in-gamut. +template +V LinearFromSRGB(V encoded) { + return TF_SRGB().DisplayFromEncoded(encoded); +} + +Status LinearSRGBToXYB(const Image3F& linear, + const float* JXL_RESTRICT premul_absorb, + ThreadPool* pool, Image3F* JXL_RESTRICT xyb) { + const size_t xsize = linear.xsize(); + + const HWY_FULL(float) d; + return RunOnPool( + pool, 0, static_cast(linear.ysize()), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const size_t y = static_cast(task); + const float* JXL_RESTRICT row_in0 = linear.ConstPlaneRow(0, y); + const float* JXL_RESTRICT row_in1 = linear.ConstPlaneRow(1, y); + const float* JXL_RESTRICT row_in2 = linear.ConstPlaneRow(2, y); + float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y); + float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y); + float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y); + + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto in_r = Load(d, row_in0 + x); + const auto in_g = Load(d, row_in1 + x); + const auto in_b = Load(d, row_in2 + x); + LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x, + row_xyb1 + x, row_xyb2 + x); + } + }, + "LinearToXYB"); +} + +Status SRGBToXYB(const Image3F& srgb, const float* JXL_RESTRICT premul_absorb, + ThreadPool* pool, Image3F* JXL_RESTRICT xyb) { + const size_t xsize = srgb.xsize(); + + const HWY_FULL(float) d; + return RunOnPool( + pool, 0, static_cast(srgb.ysize()), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const size_t y = static_cast(task); + const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y); + const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y); + const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y); + float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y); + float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y); + float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y); + + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x)); + const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x)); + const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x)); + LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x, + row_xyb1 + x, row_xyb2 + x); + } + }, + "SRGBToXYB"); +} + +Status SRGBToXYBAndLinear(const Image3F& srgb, + const float* JXL_RESTRICT premul_absorb, + ThreadPool* pool, Image3F* JXL_RESTRICT xyb, + Image3F* JXL_RESTRICT linear) { + const size_t xsize = srgb.xsize(); + + const HWY_FULL(float) d; + return RunOnPool( + pool, 0, static_cast(srgb.ysize()), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const size_t y = static_cast(task); + const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y); + const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y); + const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y); + + float* JXL_RESTRICT row_linear0 = linear->PlaneRow(0, y); + float* JXL_RESTRICT row_linear1 = linear->PlaneRow(1, y); + float* JXL_RESTRICT row_linear2 = linear->PlaneRow(2, y); + + float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y); + float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y); + float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y); + + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x)); + const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x)); + const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x)); + + Store(in_r, d, row_linear0 + x); + Store(in_g, d, row_linear1 + x); + Store(in_b, d, row_linear2 + x); + + LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x, + row_xyb1 + x, row_xyb2 + x); + } + }, + "SRGBToXYBAndLinear"); +} + +void ComputePremulAbsorb(float intensity_target, float* premul_absorb) { + const HWY_FULL(float) d; + const size_t N = Lanes(d); + const float mul = intensity_target / 255.0f; + for (size_t i = 0; i < 9; ++i) { + const auto absorb = Set(d, kOpsinAbsorbanceMatrix[i] * mul); + Store(absorb, d, premul_absorb + i * N); + } + for (size_t i = 0; i < 3; ++i) { + const auto neg_bias_cbrt = Set(d, -cbrtf(kOpsinAbsorbanceBias[i])); + Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N); + } +} + +Image3F TransformToLinearRGB(const Image3F& in, + const ColorEncoding& color_encoding, + float intensity_target, const JxlCmsInterface& cms, + ThreadPool* pool) { + ColorSpaceTransform c_transform(cms); + bool is_gray = color_encoding.IsGray(); + const ColorEncoding& c_desired = ColorEncoding::LinearSRGB(is_gray); + Image3F out(in.xsize(), in.ysize()); + std::atomic ok{true}; + JXL_CHECK(RunOnPool( + pool, 0, in.ysize(), + [&](const size_t num_threads) { + return c_transform.Init(color_encoding, c_desired, intensity_target, + in.xsize(), num_threads); + }, + [&](const uint32_t y, const size_t thread) { + float* mutable_src_buf = c_transform.BufSrc(thread); + const float* src_buf = mutable_src_buf; + // Interleave input. + if (is_gray) { + src_buf = in.ConstPlaneRow(0, y); + } else { + const float* JXL_RESTRICT row_in0 = in.ConstPlaneRow(0, y); + const float* JXL_RESTRICT row_in1 = in.ConstPlaneRow(1, y); + const float* JXL_RESTRICT row_in2 = in.ConstPlaneRow(2, y); + for (size_t x = 0; x < in.xsize(); x++) { + mutable_src_buf[3 * x + 0] = row_in0[x]; + mutable_src_buf[3 * x + 1] = row_in1[x]; + mutable_src_buf[3 * x + 2] = row_in2[x]; + } + } + float* JXL_RESTRICT dst_buf = c_transform.BufDst(thread); + if (!c_transform.Run(thread, src_buf, dst_buf)) { + ok.store(false); + return; + } + float* JXL_RESTRICT row_out0 = out.PlaneRow(0, y); + float* JXL_RESTRICT row_out1 = out.PlaneRow(1, y); + float* JXL_RESTRICT row_out2 = out.PlaneRow(2, y); + // De-interleave output and convert type. + if (is_gray) { + for (size_t x = 0; x < in.xsize(); x++) { + row_out0[x] = dst_buf[x]; + row_out1[x] = dst_buf[x]; + row_out2[x] = dst_buf[x]; + } + } else { + for (size_t x = 0; x < in.xsize(); x++) { + row_out0[x] = dst_buf[3 * x + 0]; + row_out1[x] = dst_buf[3 * x + 1]; + row_out2[x] = dst_buf[3 * x + 2]; + } + } + }, + "Colorspace transform")); + JXL_CHECK(ok.load()); + return out; +} + +void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding, + float intensity_target, ThreadPool* pool, + Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms) { + JXL_ASSERT(SameSize(in, *xyb)); + + const HWY_FULL(float) d; + // Pre-broadcasted constants + HWY_ALIGN float premul_absorb[MaxLanes(d) * 12]; + ComputePremulAbsorb(intensity_target, premul_absorb); + + bool is_gray = color_encoding.IsGray(); + const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(is_gray); + if (c_linear_srgb.SameColorEncoding(color_encoding)) { + JXL_CHECK(LinearSRGBToXYB(in, premul_absorb, pool, xyb)); + } else if (color_encoding.IsSRGB()) { + JXL_CHECK(SRGBToXYB(in, premul_absorb, pool, xyb)); + } else { + Image3F linear = + TransformToLinearRGB(in, color_encoding, intensity_target, cms, pool); + JXL_CHECK(LinearSRGBToXYB(linear, premul_absorb, pool, xyb)); + } +} + +// This is different from Butteraugli's OpsinDynamicsImage() in the sense that +// it does not contain a sensitivity multiplier based on the blurred image. +const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool, + Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms, + ImageBundle* const JXL_RESTRICT linear) { + PROFILER_FUNC; + + const size_t xsize = in.xsize(); + const size_t ysize = in.ysize(); + JXL_ASSERT(SameSize(in, *xyb)); + + const HWY_FULL(float) d; + // Pre-broadcasted constants + HWY_ALIGN float premul_absorb[MaxLanes(d) * 12]; + ComputePremulAbsorb(in.metadata()->IntensityTarget(), premul_absorb); + + const bool want_linear = linear != nullptr; + + const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(in.IsGray()); + // Linear sRGB inputs are rare but can be useful for the fastest encoders, for + // which undoing the sRGB transfer function would be a large part of the cost. + if (c_linear_srgb.SameColorEncoding(in.c_current())) { + JXL_CHECK(LinearSRGBToXYB(in.color(), premul_absorb, pool, xyb)); + // This only happens if kitten or slower, moving ImageBundle might be + // possible but the encoder is much slower than this copy. + if (want_linear) { + *linear = in.Copy(); + return linear; + } + return ∈ + } + + // Common case: already sRGB, can avoid the color transform + if (in.IsSRGB()) { + // Common case: can avoid allocating/copying + if (!want_linear) { + JXL_CHECK(SRGBToXYB(in.color(), premul_absorb, pool, xyb)); + return ∈ + } + + // Slow encoder also wants linear sRGB. + linear->SetFromImage(Image3F(xsize, ysize), c_linear_srgb); + JXL_CHECK(SRGBToXYBAndLinear(in.color(), premul_absorb, pool, xyb, + linear->color())); + return linear; + } + + // General case: not sRGB, need color transform. + ImageBundle linear_storage; // Local storage only used if !want_linear. + + ImageBundle* linear_storage_ptr; + if (want_linear) { + // Caller asked for linear, use that storage directly. + linear_storage_ptr = linear; + } else { + // Caller didn't ask for linear, create our own local storage + // OK to reuse metadata, it will not be changed. + linear_storage = ImageBundle(const_cast(in.metadata())); + linear_storage_ptr = &linear_storage; + } + + const ImageBundle* ptr; + JXL_CHECK(TransformIfNeeded(in, c_linear_srgb, cms, pool, linear_storage_ptr, + &ptr)); + // If no transform was necessary, should have taken the above codepath. + JXL_ASSERT(ptr == linear_storage_ptr); + + JXL_CHECK( + LinearSRGBToXYB(*linear_storage_ptr->color(), premul_absorb, pool, xyb)); + return want_linear ? linear : ∈ +} + +// Transform RGB to YCbCr. +// Could be performed in-place (i.e. Y, Cb and Cr could alias R, B and B). +Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, + const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, + ImageF* cr_plane, ThreadPool* pool) { + const HWY_FULL(float) df; + const size_t S = Lanes(df); // Step. + + const size_t xsize = r_plane.xsize(); + const size_t ysize = r_plane.ysize(); + if ((xsize == 0) || (ysize == 0)) return true; + + // Full-range BT.601 as defined by JFIF Clause 7: + // https://www.itu.int/rec/T-REC-T.871-201105-I/en + const auto k128 = Set(df, 128.0f / 255); + const auto kR = Set(df, 0.299f); // NTSC luma + const auto kG = Set(df, 0.587f); + const auto kB = Set(df, 0.114f); + const auto kAmpR = Set(df, 0.701f); + const auto kAmpB = Set(df, 0.886f); + const auto kDiffR = Add(kAmpR, kR); + const auto kDiffB = Add(kAmpB, kB); + const auto kNormR = Div(Set(df, 1.0f), (Add(kAmpR, Add(kG, kB)))); + const auto kNormB = Div(Set(df, 1.0f), (Add(kR, Add(kG, kAmpB)))); + + constexpr size_t kGroupArea = kGroupDim * kGroupDim; + const size_t lines_per_group = DivCeil(kGroupArea, xsize); + const size_t num_stripes = DivCeil(ysize, lines_per_group); + const auto transform = [&](int idx, int /* thread*/) { + const size_t y0 = idx * lines_per_group; + const size_t y1 = std::min(y0 + lines_per_group, ysize); + for (size_t y = y0; y < y1; ++y) { + const float* r_row = r_plane.ConstRow(y); + const float* g_row = g_plane.ConstRow(y); + const float* b_row = b_plane.ConstRow(y); + float* y_row = y_plane->Row(y); + float* cb_row = cb_plane->Row(y); + float* cr_row = cr_plane->Row(y); + for (size_t x = 0; x < xsize; x += S) { + const auto r = Load(df, r_row + x); + const auto g = Load(df, g_row + x); + const auto b = Load(df, b_row + x); + const auto r_base = Mul(r, kR); + const auto r_diff = Mul(r, kDiffR); + const auto g_base = Mul(g, kG); + const auto b_base = Mul(b, kB); + const auto b_diff = Mul(b, kDiffB); + const auto y_base = Add(r_base, Add(g_base, b_base)); + const auto y_vec = Sub(y_base, k128); + const auto cb_vec = Mul(Sub(b_diff, y_base), kNormB); + const auto cr_vec = Mul(Sub(r_diff, y_base), kNormR); + Store(y_vec, df, y_row + x); + Store(cb_vec, df, cb_row + x); + Store(cr_vec, df, cr_row + x); + } + } + }; + return RunOnPool(pool, 0, static_cast(num_stripes), ThreadPool::NoInit, + transform, "RgbToYcbCr"); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ToXYB); +const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool, + Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms, + ImageBundle* JXL_RESTRICT linear_storage) { + return HWY_DYNAMIC_DISPATCH(ToXYB)(in, pool, xyb, cms, linear_storage); +} + +HWY_EXPORT(LinearRGBRowToXYB); +void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1, + float* JXL_RESTRICT row2, + const float* JXL_RESTRICT premul_absorb, size_t xsize) { + HWY_DYNAMIC_DISPATCH(LinearRGBRowToXYB) + (row0, row1, row2, premul_absorb, xsize); +} + +HWY_EXPORT(ComputePremulAbsorb); +void ComputePremulAbsorb(float intensity_target, float* premul_absorb) { + HWY_DYNAMIC_DISPATCH(ComputePremulAbsorb)(intensity_target, premul_absorb); +} + +void ScaleXYBRow(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1, + float* JXL_RESTRICT row2, size_t xsize) { + for (size_t x = 0; x < xsize; x++) { + row2[x] = (row2[x] - row1[x] + kScaledXYBOffset[2]) * kScaledXYBScale[2]; + row0[x] = (row0[x] + kScaledXYBOffset[0]) * kScaledXYBScale[0]; + row1[x] = (row1[x] + kScaledXYBOffset[1]) * kScaledXYBScale[1]; + } +} + +void ScaleXYB(Image3F* opsin) { + for (size_t y = 0; y < opsin->ysize(); y++) { + float* row0 = opsin->PlaneRow(0, y); + float* row1 = opsin->PlaneRow(1, y); + float* row2 = opsin->PlaneRow(2, y); + ScaleXYBRow(row0, row1, row2, opsin->xsize()); + } +} + +HWY_EXPORT(Image3FToXYB); +void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding, + float intensity_target, ThreadPool* pool, + Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms) { + return HWY_DYNAMIC_DISPATCH(Image3FToXYB)(in, color_encoding, + intensity_target, pool, xyb, cms); +} + +HWY_EXPORT(RgbToYcbcr); +Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, + const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, + ImageF* cr_plane, ThreadPool* pool) { + return HWY_DYNAMIC_DISPATCH(RgbToYcbcr)(r_plane, g_plane, b_plane, y_plane, + cb_plane, cr_plane, pool); +} + +// DEPRECATED +Image3F OpsinDynamicsImage(const Image3B& srgb8, const JxlCmsInterface& cms) { + ImageMetadata metadata; + metadata.SetUintSamples(8); + metadata.color_encoding = ColorEncoding::SRGB(); + ImageBundle ib(&metadata); + ib.SetFromImage(ConvertToFloat(srgb8), metadata.color_encoding); + JXL_CHECK(ib.TransformTo(ColorEncoding::LinearSRGB(ib.IsGray()), cms)); + ThreadPool* null_pool = nullptr; + Image3F xyb(srgb8.xsize(), srgb8.ysize()); + + ImageBundle linear_storage(&metadata); + (void)ToXYB(ib, null_pool, &xyb, cms, &linear_storage); + return xyb; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/enc_xyb.h b/third_party/jpeg-xl/lib/jxl/enc_xyb.h new file mode 100644 index 0000000000..fc902848ee --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/enc_xyb.h @@ -0,0 +1,56 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_XYB_H_ +#define LIB_JXL_ENC_XYB_H_ + +// Converts to XYB color space. + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Converts any color space to XYB. If `linear` is not null, returns `linear` +// after filling it with a linear sRGB copy of `in`. Otherwise, returns `&in`. +// +// NOTE this return value can avoid an extra color conversion if `in` would +// later be passed to JxlButteraugliComparator. +const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool, + Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms, + ImageBundle* JXL_RESTRICT linear = nullptr); + +void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding, + float intensity_target, ThreadPool* pool, + Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms); + +void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1, + float* JXL_RESTRICT row2, + const float* JXL_RESTRICT premul_absorb, size_t xsize); + +void ComputePremulAbsorb(float intensity_target, float* premul_absorb); + +// Transforms each color component of the given XYB image into the [0.0, 1.0] +// interval with an affine transform. +void ScaleXYB(Image3F* opsin); +void ScaleXYBRow(float* row0, float* row1, float* row2, size_t xsize); + +// Bt.601 to match JPEG/JFIF. Outputs _signed_ YCbCr values suitable for DCT, +// see F.1.1.3 of T.81 (because our data type is float, there is no need to add +// a bias to make the values unsigned). +Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, + const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, + ImageF* cr_plane, ThreadPool* pool); + +// DEPRECATED, used by opsin_image_wrapper. +Image3F OpsinDynamicsImage(const Image3B& srgb8, const JxlCmsInterface& cms); + +} // namespace jxl + +#endif // LIB_JXL_ENC_XYB_H_ diff --git a/third_party/jpeg-xl/lib/jxl/encode.cc b/third_party/jpeg-xl/lib/jxl/encode.cc new file mode 100644 index 0000000000..fbd5133ae5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/encode.cc @@ -0,0 +1,2128 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/enc_fast_lossless.h" +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_icc_codec.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/encode_internal.h" +#include "lib/jxl/exif.h" +#include "lib/jxl/jpeg/enc_jpeg_data.h" +#include "lib/jxl/luminance.h" +#include "lib/jxl/sanitizers.h" + +// Debug-printing failure macro similar to JXL_FAILURE, but for the status code +// JXL_ENC_ERROR +#ifdef JXL_CRASH_ON_ERROR +#define JXL_API_ERROR(enc, error_code, format, ...) \ + (enc->error = error_code, \ + ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \ + ::jxl::Abort(), JXL_ENC_ERROR) +#define JXL_API_ERROR_NOSET(format, ...) \ + (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \ + ::jxl::Abort(), JXL_ENC_ERROR) +#else // JXL_CRASH_ON_ERROR +#define JXL_API_ERROR(enc, error_code, format, ...) \ + (enc->error = error_code, \ + ((JXL_DEBUG_ON_ERROR) && \ + ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \ + JXL_ENC_ERROR) +#define JXL_API_ERROR_NOSET(format, ...) \ + (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \ + JXL_ENC_ERROR) +#endif // JXL_CRASH_ON_ERROR + +namespace jxl {} // namespace jxl + +uint32_t JxlEncoderVersion(void) { + return JPEGXL_MAJOR_VERSION * 1000000 + JPEGXL_MINOR_VERSION * 1000 + + JPEGXL_PATCH_VERSION; +} + +namespace { +template +void AppendJxlpBoxCounter(uint32_t counter, bool last, T* output) { + if (last) counter |= 0x80000000; + for (size_t i = 0; i < 4; i++) { + output->push_back(counter >> (8 * (3 - i)) & 0xff); + } +} + +void QueueFrame( + const JxlEncoderFrameSettings* frame_settings, + jxl::MemoryManagerUniquePtr& frame) { + if (frame_settings->values.lossless) { + frame->option_values.cparams.SetLossless(); + } + + jxl::JxlEncoderQueuedInput queued_input(frame_settings->enc->memory_manager); + queued_input.frame = std::move(frame); + frame_settings->enc->input_queue.emplace_back(std::move(queued_input)); + frame_settings->enc->num_queued_frames++; +} + +void QueueFastLosslessFrame(const JxlEncoderFrameSettings* frame_settings, + JxlFastLosslessFrameState* fast_lossless_frame) { + jxl::JxlEncoderQueuedInput queued_input(frame_settings->enc->memory_manager); + queued_input.fast_lossless_frame.reset(fast_lossless_frame); + frame_settings->enc->input_queue.emplace_back(std::move(queued_input)); + frame_settings->enc->num_queued_frames++; +} + +void QueueBox(JxlEncoder* enc, + jxl::MemoryManagerUniquePtr& box) { + jxl::JxlEncoderQueuedInput queued_input(enc->memory_manager); + queued_input.box = std::move(box); + enc->input_queue.emplace_back(std::move(queued_input)); + enc->num_queued_boxes++; +} + +// TODO(lode): share this code and the Brotli compression code in enc_jpeg_data +JxlEncoderStatus BrotliCompress(int quality, const uint8_t* in, size_t in_size, + jxl::PaddedBytes* out) { + std::unique_ptr + enc(BrotliEncoderCreateInstance(nullptr, nullptr, nullptr), + BrotliEncoderDestroyInstance); + if (!enc) return JXL_API_ERROR_NOSET("BrotliEncoderCreateInstance failed"); + + BrotliEncoderSetParameter(enc.get(), BROTLI_PARAM_QUALITY, quality); + BrotliEncoderSetParameter(enc.get(), BROTLI_PARAM_SIZE_HINT, in_size); + + constexpr size_t kBufferSize = 128 * 1024; + jxl::PaddedBytes temp_buffer(kBufferSize); + + size_t avail_in = in_size; + const uint8_t* next_in = in; + + size_t total_out = 0; + + for (;;) { + size_t avail_out = kBufferSize; + uint8_t* next_out = temp_buffer.data(); + jxl::msan::MemoryIsInitialized(next_in, avail_in); + if (!BrotliEncoderCompressStream(enc.get(), BROTLI_OPERATION_FINISH, + &avail_in, &next_in, &avail_out, &next_out, + &total_out)) { + return JXL_API_ERROR_NOSET("Brotli compression failed"); + } + size_t out_size = next_out - temp_buffer.data(); + jxl::msan::UnpoisonMemory(next_out - out_size, out_size); + out->resize(out->size() + out_size); + memcpy(out->data() + out->size() - out_size, temp_buffer.data(), out_size); + if (BrotliEncoderIsFinished(enc.get())) break; + } + + return JXL_ENC_SUCCESS; +} + +// The JXL codestream can have level 5 or level 10. Levels have certain +// restrictions such as max allowed image dimensions. This function checks the +// level required to support the current encoder settings. The debug_string is +// intended to be used for developer API error messages, and may be set to +// nullptr. +int VerifyLevelSettings(const JxlEncoder* enc, std::string* debug_string) { + const auto& m = enc->metadata.m; + + uint64_t xsize = enc->metadata.size.xsize(); + uint64_t ysize = enc->metadata.size.ysize(); + // The uncompressed ICC size, if it is used. + size_t icc_size = 0; + if (m.color_encoding.WantICC()) { + icc_size = m.color_encoding.ICC().size(); + } + + // Level 10 checks + + if (xsize > (1ull << 30ull) || ysize > (1ull << 30ull) || + xsize * ysize > (1ull << 40ull)) { + if (debug_string) *debug_string = "Too large image dimensions"; + return -1; + } + if (icc_size > (1ull << 28)) { + if (debug_string) *debug_string = "Too large ICC profile size"; + return -1; + } + if (m.num_extra_channels > 256) { + if (debug_string) *debug_string = "Too many extra channels"; + return -1; + } + + // Level 5 checks + + if (!m.modular_16_bit_buffer_sufficient) { + if (debug_string) *debug_string = "Too high modular bit depth"; + return 10; + } + if (xsize > (1ull << 18ull) || ysize > (1ull << 18ull) || + xsize * ysize > (1ull << 28ull)) { + if (debug_string) *debug_string = "Too large image dimensions"; + return 10; + } + if (icc_size > (1ull << 22)) { + if (debug_string) *debug_string = "Too large ICC profile"; + return 10; + } + if (m.num_extra_channels > 4) { + if (debug_string) *debug_string = "Too many extra channels"; + return 10; + } + for (size_t i = 0; i < m.extra_channel_info.size(); ++i) { + if (m.extra_channel_info[i].type == jxl::ExtraChannel::kBlack) { + if (debug_string) *debug_string = "CMYK channel not allowed"; + return 10; + } + } + + // TODO(lode): also need to check if consecutive composite-still frames total + // pixel amount doesn't exceed 2**28 in the case of level 5. This should be + // done when adding frame and requires ability to add composite still frames + // to be added first. + + // TODO(lode): also need to check animation duration of a frame. This should + // be done when adding frame, but first requires implementing setting the + // JxlFrameHeader for a frame. + + // TODO(lode): also need to check properties such as num_splines, num_patches, + // modular_16bit_buffers and multiple properties of modular trees. However + // these are not user-set properties so cannot be checked here, but decisions + // the C++ encoder should be able to make based on the level. + + // All level 5 checks passes, so can return the more compatible level 5 + return 5; +} + +size_t BitsPerChannel(JxlDataType data_type) { + switch (data_type) { + case JXL_TYPE_UINT8: + return 8; + case JXL_TYPE_UINT16: + return 16; + case JXL_TYPE_FLOAT: + return 32; + case JXL_TYPE_FLOAT16: + return 16; + default: + return 0; // signals unhandled JxlDataType + } +} + +template +uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata, + JxlPixelFormat format) { + if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) { + return BitsPerChannel(format.data_type); + } else if (bit_depth.type == JXL_BIT_DEPTH_FROM_CODESTREAM) { + return metadata.bit_depth.bits_per_sample; + } else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) { + return bit_depth.bits_per_sample; + } else { + return 0; + } +} + +JxlEncoderStatus CheckValidBitdepth(uint32_t bits_per_sample, + uint32_t exponent_bits_per_sample) { + if (!exponent_bits_per_sample) { + // The spec allows up to 31 for bits_per_sample here, but + // the code does not (yet) support it. + if (!(bits_per_sample > 0 && bits_per_sample <= 24)) { + return JXL_API_ERROR_NOSET("Invalid value for bits_per_sample"); + } + } else if ((exponent_bits_per_sample > 8) || + (bits_per_sample > 24 + exponent_bits_per_sample) || + (bits_per_sample < 3 + exponent_bits_per_sample)) { + return JXL_API_ERROR_NOSET("Invalid float description"); + } + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus VerifyInputBitDepth(JxlBitDepth bit_depth, + JxlPixelFormat format) { + if ((format.data_type == JXL_TYPE_FLOAT || + format.data_type == JXL_TYPE_FLOAT16) && + bit_depth.type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) { + return JXL_API_ERROR_NOSET( + "Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT is " + "implemented for float types."); + } + return JXL_ENC_SUCCESS; +} + +bool EncodeFrameIndexBox(const jxl::JxlEncoderFrameIndexBox& frame_index_box, + jxl::BitWriter& writer) { + bool ok = true; + int NF = 0; + for (size_t i = 0; i < frame_index_box.entries.size(); ++i) { + if (i == 0 || frame_index_box.entries[i].to_be_indexed) { + ++NF; + } + } + // Frame index box contents varint + 8 bytes + // continue with NF * 3 * varint + // varint max length is 10 for 64 bit numbers, and these numbers + // are limited to 63 bits. + static const int kVarintMaxLength = 10; + static const int kFrameIndexBoxHeaderLength = kVarintMaxLength + 8; + static const int kFrameIndexBoxElementLength = 3 * kVarintMaxLength; + const int buffer_size = + kFrameIndexBoxHeaderLength + NF * kFrameIndexBoxElementLength; + std::vector buffer_vec(buffer_size); + uint8_t* buffer = buffer_vec.data(); + size_t output_pos = 0; + ok &= jxl::EncodeVarInt(NF, buffer_vec.size(), &output_pos, buffer); + StoreBE32(frame_index_box.TNUM, &buffer[output_pos]); + output_pos += 4; + StoreBE32(frame_index_box.TDEN, &buffer[output_pos]); + output_pos += 4; + // When we record a frame in the index, the record needs to know + // how many frames until the next indexed frame. That is why + // we store the 'prev' record. That 'prev' record needs to store + // the offset byte position to previously recorded indexed frame, + // that's why we also trace previous to the previous frame. + int prev_prev_ix = -1; // For position offset (OFFi) delta coding. + int prev_ix = 0; + int T_prev = 0; + int T = 0; + for (size_t i = 1; i < frame_index_box.entries.size(); ++i) { + if (frame_index_box.entries[i].to_be_indexed) { + // Now we can record the previous entry, since we need to store + // there how many frames until the next one. + int64_t OFFi = frame_index_box.entries[prev_ix].OFFi; + if (prev_prev_ix != -1) { + // Offi needs to be offset of start byte of this frame compared to start + // byte of previous frame from this index in the JPEG XL codestream. For + // the first frame, this is the offset from the first byte of the JPEG + // XL codestream. + OFFi -= frame_index_box.entries[prev_prev_ix].OFFi; + } + int32_t Ti = T_prev; + int32_t Fi = i - prev_ix; + ok &= jxl::EncodeVarInt(OFFi, buffer_vec.size(), &output_pos, buffer); + ok &= jxl::EncodeVarInt(Ti, buffer_vec.size(), &output_pos, buffer); + ok &= jxl::EncodeVarInt(Fi, buffer_vec.size(), &output_pos, buffer); + prev_prev_ix = prev_ix; + prev_ix = i; + T_prev = T; + T += frame_index_box.entries[i].duration; + } + } + { + // Last frame. + size_t i = frame_index_box.entries.size(); + int64_t OFFi = frame_index_box.entries[prev_ix].OFFi; + if (prev_prev_ix != -1) { + OFFi -= frame_index_box.entries[prev_prev_ix].OFFi; + } + int32_t Ti = T_prev; + int32_t Fi = i - prev_ix; + ok &= jxl::EncodeVarInt(OFFi, buffer_vec.size(), &output_pos, buffer); + ok &= jxl::EncodeVarInt(Ti, buffer_vec.size(), &output_pos, buffer); + ok &= jxl::EncodeVarInt(Fi, buffer_vec.size(), &output_pos, buffer); + } + // Enough buffer has been allocated, this function should never fail in + // writing. + JXL_ASSERT(ok); + return ok; +} + +} // namespace + +JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() { + jxl::PaddedBytes bytes; + + jxl::JxlEncoderQueuedInput& input = input_queue[0]; + + // TODO(lode): split this into 3 functions: for adding the signature and other + // initial headers (jbrd, ...), one for adding frame, and one for adding user + // box. + + if (!wrote_bytes) { + // First time encoding any data, verify the level 5 vs level 10 settings + std::string level_message; + int required_level = VerifyLevelSettings(this, &level_message); + // Only level 5 and 10 are defined, and the function can return -1 to + // indicate full incompatibility. + JXL_ASSERT(required_level == -1 || required_level == 5 || + required_level == 10); + // codestream_level == -1 means auto-set to the required level + if (codestream_level == -1) codestream_level = required_level; + if (codestream_level == 5 && required_level != 5) { + // If the required level is 10, return error rather than automatically + // setting the level to 10, to avoid inadvertently creating a level 10 + // JXL file while intending to target a level 5 decoder. + return JXL_API_ERROR( + this, JXL_ENC_ERR_API_USAGE, "%s", + ("Codestream level verification for level 5 failed: " + level_message) + .c_str()); + } + if (required_level == -1) { + return JXL_API_ERROR( + this, JXL_ENC_ERR_API_USAGE, "%s", + ("Codestream level verification for level 10 failed: " + + level_message) + .c_str()); + } + + jxl::BitWriter writer; + if (!WriteCodestreamHeaders(&metadata, &writer, nullptr)) { + return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC, + "Failed to write codestream header"); + } + // Only send ICC (at least several hundred bytes) if fields aren't enough. + if (metadata.m.color_encoding.WantICC()) { + if (!jxl::WriteICC(metadata.m.color_encoding.ICC(), &writer, + jxl::kLayerHeader, nullptr)) { + return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC, + "Failed to write ICC profile"); + } + } + // TODO(lode): preview should be added here if a preview image is added + + writer.ZeroPadToByte(); + + // Not actually the end of frame, but the end of metadata/ICC, but helps + // the next frame to start here for indexing purposes. + codestream_bytes_written_end_of_frame += + jxl::DivCeil(writer.BitsWritten(), 8); + + bytes = std::move(writer).TakeBytes(); + + if (MustUseContainer()) { + // Add "JXL " and ftyp box. + output_byte_queue.insert( + output_byte_queue.end(), jxl::kContainerHeader, + jxl::kContainerHeader + sizeof(jxl::kContainerHeader)); + if (codestream_level != 5) { + // Add jxll box directly after the ftyp box to indicate the codestream + // level. + output_byte_queue.insert( + output_byte_queue.end(), jxl::kLevelBoxHeader, + jxl::kLevelBoxHeader + sizeof(jxl::kLevelBoxHeader)); + output_byte_queue.push_back(codestream_level); + } + + // Whether to write the basic info and color profile header of the + // codestream into an early separate jxlp box, so that it comes before + // metadata or jpeg reconstruction boxes. In theory this could simply + // always be done, but there's no reason to add an extra box with box + // header overhead if the codestream will already come immediately after + // the signature and level boxes. + bool partial_header = + store_jpeg_metadata || + (use_boxes && (!input.frame && !input.fast_lossless_frame)); + + if (partial_header) { + jxl::AppendBoxHeader(jxl::MakeBoxType("jxlp"), bytes.size() + 4, + /*unbounded=*/false, &output_byte_queue); + AppendJxlpBoxCounter(jxlp_counter++, /*last=*/false, + &output_byte_queue); + output_byte_queue.insert(output_byte_queue.end(), bytes.data(), + bytes.data() + bytes.size()); + bytes.clear(); + } + + if (store_jpeg_metadata && !jpeg_metadata.empty()) { + jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_metadata.size(), + false, &output_byte_queue); + output_byte_queue.insert(output_byte_queue.end(), jpeg_metadata.begin(), + jpeg_metadata.end()); + } + } + wrote_bytes = true; + } + + // Choose frame or box processing: exactly one of the two unique pointers (box + // or frame) in the input queue item is non-null. + if (input.frame || input.fast_lossless_frame) { + jxl::MemoryManagerUniquePtr input_frame = + std::move(input.frame); + if (input.fast_lossless_frame) { + output_fast_frame_queue.push_back(std::move(input.fast_lossless_frame)); + } + input_queue.erase(input_queue.begin()); + num_queued_frames--; + if (input_frame) { + for (unsigned idx = 0; idx < input_frame->ec_initialized.size(); idx++) { + if (!input_frame->ec_initialized[idx]) { + return JXL_API_ERROR(this, JXL_ENC_ERR_API_USAGE, + "Extra channel %u is not initialized", idx); + } + } + + // TODO(zond): If the input queue is empty and the frames_closed is true, + // then mark this frame as the last. + + // TODO(zond): Handle progressive mode like EncodeFile does it. + // TODO(zond): Handle animation like EncodeFile does it, by checking if + // JxlEncoderCloseFrames has been called and if the frame + // queue is empty (to see if it's the last animation frame). + + if (metadata.m.xyb_encoded) { + input_frame->option_values.cparams.color_transform = + jxl::ColorTransform::kXYB; + } else { + // TODO(zond): Figure out when to use kYCbCr instead. + input_frame->option_values.cparams.color_transform = + jxl::ColorTransform::kNone; + } + } + + uint32_t duration; + uint32_t timecode; + if (input_frame && metadata.m.have_animation) { + duration = input_frame->option_values.header.duration; + timecode = input_frame->option_values.header.timecode; + } else { + // If have_animation is false, the encoder should ignore the duration and + // timecode values. However, assigning them to ib will cause the encoder + // to write an invalid frame header that can't be decoded so ensure + // they're the default value of 0 here. + duration = 0; + timecode = 0; + } + + bool last_frame = frames_closed && !num_queued_frames; + + size_t codestream_byte_size = 0; + + jxl::BitWriter writer; + + if (input_frame) { + jxl::PassesEncoderState enc_state; + + frame_index_box.AddFrame(codestream_bytes_written_end_of_frame, duration, + input_frame->option_values.frame_index_box); + + // EncodeFrame creates jxl::FrameHeader object internally based on the + // FrameInfo, imagebundle, cparams and metadata. Copy the information to + // these. + jxl::ImageBundle& ib = input_frame->frame; + ib.duration = duration; + ib.timecode = timecode; + ib.name = input_frame->option_values.frame_name; + ib.blendmode = static_cast( + input_frame->option_values.header.layer_info.blend_info.blendmode); + ib.blend = + input_frame->option_values.header.layer_info.blend_info.blendmode != + JXL_BLEND_REPLACE; + + size_t save_as_reference = + input_frame->option_values.header.layer_info.save_as_reference; + if (save_as_reference >= 3) { + return JXL_API_ERROR( + this, JXL_ENC_ERR_API_USAGE, + "Cannot use save_as_reference values >=3 (found: %d)", + (int)save_as_reference); + } + ib.use_for_next_frame = !!save_as_reference; + + jxl::FrameInfo frame_info; + frame_info.is_last = last_frame; + frame_info.save_as_reference = save_as_reference; + frame_info.source = + input_frame->option_values.header.layer_info.blend_info.source; + frame_info.clamp = + input_frame->option_values.header.layer_info.blend_info.clamp; + frame_info.alpha_channel = + input_frame->option_values.header.layer_info.blend_info.alpha; + frame_info.extra_channel_blending_info.resize( + metadata.m.num_extra_channels); + // If extra channel blend info has not been set, use the blend mode from + // the layer_info. + JxlBlendInfo default_blend_info = + input_frame->option_values.header.layer_info.blend_info; + for (size_t i = 0; i < metadata.m.num_extra_channels; ++i) { + auto& to = frame_info.extra_channel_blending_info[i]; + const auto& from = + i < input_frame->option_values.extra_channel_blend_info.size() + ? input_frame->option_values.extra_channel_blend_info[i] + : default_blend_info; + to.mode = static_cast(from.blendmode); + to.source = from.source; + to.alpha_channel = from.alpha; + to.clamp = (from.clamp != 0); + } + + if (input_frame->option_values.header.layer_info.have_crop) { + ib.origin.x0 = input_frame->option_values.header.layer_info.crop_x0; + ib.origin.y0 = input_frame->option_values.header.layer_info.crop_y0; + } + JXL_ASSERT(writer.BitsWritten() == 0); + if (!jxl::EncodeFrame(input_frame->option_values.cparams, frame_info, + &metadata, input_frame->frame, &enc_state, cms, + thread_pool.get(), &writer, + /*aux_out=*/nullptr)) { + return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC, + "Failed to encode frame"); + } + codestream_bytes_written_beginning_of_frame = + codestream_bytes_written_end_of_frame; + codestream_bytes_written_end_of_frame += + jxl::DivCeil(writer.BitsWritten(), 8); + + // Possibly bytes already contains the codestream header: in case this is + // the first frame, and the codestream header was not encoded as jxlp + // above. + bytes.append(std::move(writer).TakeBytes()); + codestream_byte_size = bytes.size(); + } else { + JXL_CHECK(!output_fast_frame_queue.empty()); + JxlFastLosslessPrepareHeader(output_fast_frame_queue.front().get(), + /*add_image_header=*/0, last_frame); + codestream_byte_size = + JxlFastLosslessOutputSize(output_fast_frame_queue.front().get()) + + bytes.size(); + } + + if (MustUseContainer()) { + if (last_frame && jxlp_counter == 0) { + // If this is the last frame and no jxlp boxes were used yet, it's + // slighly more efficient to write a jxlc box since it has 4 bytes + // less overhead. + jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), codestream_byte_size, + /*unbounded=*/false, &output_byte_queue); + } else { + jxl::AppendBoxHeader(jxl::MakeBoxType("jxlp"), codestream_byte_size + 4, + /*unbounded=*/false, &output_byte_queue); + AppendJxlpBoxCounter(jxlp_counter++, last_frame, &output_byte_queue); + } + } + + output_byte_queue.insert(output_byte_queue.end(), bytes.data(), + bytes.data() + bytes.size()); + + if (input_frame) { + last_used_cparams = input_frame->option_values.cparams; + } + if (last_frame && frame_index_box.StoreFrameIndexBox()) { + bytes.clear(); + EncodeFrameIndexBox(frame_index_box, writer); + jxl::AppendBoxHeader(jxl::MakeBoxType("jxli"), bytes.size(), + /*unbounded=*/false, &output_byte_queue); + } + } else { + // Not a frame, so is a box instead + jxl::MemoryManagerUniquePtr box = + std::move(input.box); + input_queue.erase(input_queue.begin()); + num_queued_boxes--; + + if (box->compress_box) { + jxl::PaddedBytes compressed(4); + // Prepend the original box type in the brob box contents + for (size_t i = 0; i < 4; i++) { + compressed[i] = static_cast(box->type[i]); + } + if (JXL_ENC_SUCCESS != + BrotliCompress((brotli_effort >= 0 ? brotli_effort : 4), + box->contents.data(), box->contents.size(), + &compressed)) { + return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC, + "Brotli compression for brob box failed"); + } + jxl::AppendBoxHeader(jxl::MakeBoxType("brob"), compressed.size(), false, + &output_byte_queue); + output_byte_queue.insert(output_byte_queue.end(), compressed.data(), + compressed.data() + compressed.size()); + } else { + jxl::AppendBoxHeader(box->type, box->contents.size(), false, + &output_byte_queue); + output_byte_queue.insert(output_byte_queue.end(), box->contents.data(), + box->contents.data() + box->contents.size()); + } + } + + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetColorEncoding(JxlEncoder* enc, + const JxlColorEncoding* color) { + if (!enc->basic_info_set) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Basic info not yet set"); + } + if (enc->color_encoding_set) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "Color encoding is already set"); + } + if (!jxl::ConvertExternalToInternalColorEncoding( + *color, &enc->metadata.m.color_encoding)) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_GENERIC, "Error in color conversion"); + } + if (enc->metadata.m.color_encoding.GetColorSpace() == + jxl::ColorSpace::kGray) { + if (enc->basic_info.num_color_channels != 1) + return JXL_API_ERROR( + enc, JXL_ENC_ERR_API_USAGE, + "Cannot use grayscale color encoding with num_color_channels != 1"); + } else { + if (enc->basic_info.num_color_channels != 3) + return JXL_API_ERROR( + enc, JXL_ENC_ERR_API_USAGE, + "Cannot use RGB color encoding with num_color_channels != 3"); + } + enc->color_encoding_set = true; + if (!enc->intensity_target_set) { + jxl::SetIntensityTarget(&enc->metadata.m); + } + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetICCProfile(JxlEncoder* enc, + const uint8_t* icc_profile, + size_t size) { + if (!enc->basic_info_set) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Basic info not yet set"); + } + if (enc->color_encoding_set) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "ICC profile is already set"); + } + jxl::PaddedBytes icc; + icc.assign(icc_profile, icc_profile + size); + if (!enc->metadata.m.color_encoding.SetICC(std::move(icc))) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_BAD_INPUT, + "ICC profile could not be set"); + } + if (enc->metadata.m.color_encoding.GetColorSpace() == + jxl::ColorSpace::kGray) { + if (enc->basic_info.num_color_channels != 1) + return JXL_API_ERROR( + enc, JXL_ENC_ERR_BAD_INPUT, + "Cannot use grayscale ICC profile with num_color_channels != 1"); + } else { + if (enc->basic_info.num_color_channels != 3) + return JXL_API_ERROR( + enc, JXL_ENC_ERR_BAD_INPUT, + "Cannot use RGB ICC profile with num_color_channels != 3"); + // TODO(jon): also check that a kBlack extra channel is provided in the CMYK + // case + } + enc->color_encoding_set = true; + if (!enc->intensity_target_set) { + jxl::SetIntensityTarget(&enc->metadata.m); + } + + if (!enc->basic_info.uses_original_profile) { + enc->metadata.m.color_encoding.DecideIfWantICC(); + } + + return JXL_ENC_SUCCESS; +} + +void JxlEncoderInitBasicInfo(JxlBasicInfo* info) { + info->have_container = JXL_FALSE; + info->xsize = 0; + info->ysize = 0; + info->bits_per_sample = 8; + info->exponent_bits_per_sample = 0; + info->intensity_target = 0.f; + info->min_nits = 0.f; + info->relative_to_max_display = JXL_FALSE; + info->linear_below = 0.f; + info->uses_original_profile = JXL_FALSE; + info->have_preview = JXL_FALSE; + info->have_animation = JXL_FALSE; + info->orientation = JXL_ORIENT_IDENTITY; + info->num_color_channels = 3; + info->num_extra_channels = 0; + info->alpha_bits = 0; + info->alpha_exponent_bits = 0; + info->alpha_premultiplied = JXL_FALSE; + info->preview.xsize = 0; + info->preview.ysize = 0; + info->intrinsic_xsize = 0; + info->intrinsic_ysize = 0; + info->animation.tps_numerator = 10; + info->animation.tps_denominator = 1; + info->animation.num_loops = 0; + info->animation.have_timecodes = JXL_FALSE; +} + +void JxlEncoderInitFrameHeader(JxlFrameHeader* frame_header) { + // For each field, the default value of the specification is used. Depending + // on whether an animation frame, or a composite still blending frame, + // is used, different fields have to be set up by the user after initing + // the frame header. + frame_header->duration = 0; + frame_header->timecode = 0; + frame_header->name_length = 0; + // In the specification, the default value of is_last is !frame_type, and the + // default frame_type is kRegularFrame which has value 0, so is_last is true + // by default. However, the encoder does not use this value (the field exists + // for the decoder to set) since last frame is determined by usage of + // JxlEncoderCloseFrames instead. + frame_header->is_last = JXL_TRUE; + frame_header->layer_info.have_crop = JXL_FALSE; + frame_header->layer_info.crop_x0 = 0; + frame_header->layer_info.crop_y0 = 0; + // These must be set if have_crop is enabled, but the default value has + // have_crop false, and these dimensions 0. The user must set these to the + // desired size after enabling have_crop (which is not yet implemented). + frame_header->layer_info.xsize = 0; + frame_header->layer_info.ysize = 0; + JxlEncoderInitBlendInfo(&frame_header->layer_info.blend_info); + frame_header->layer_info.save_as_reference = 0; +} + +void JxlEncoderInitBlendInfo(JxlBlendInfo* blend_info) { + // Default blend mode in the specification is 0. Note that combining + // blend mode of replace with a duration is not useful, but the user has to + // manually set duration in case of animation, or manually change the blend + // mode in case of composite stills, so initing to a combination that is not + // useful on its own is not an issue. + blend_info->blendmode = JXL_BLEND_REPLACE; + blend_info->source = 0; + blend_info->alpha = 0; + blend_info->clamp = 0; +} + +JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc, + const JxlBasicInfo* info) { + if (!enc->metadata.size.Set(info->xsize, info->ysize)) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid dimensions"); + } + if (JXL_ENC_SUCCESS != CheckValidBitdepth(info->bits_per_sample, + info->exponent_bits_per_sample)) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid bit depth"); + } + enc->metadata.m.bit_depth.bits_per_sample = info->bits_per_sample; + enc->metadata.m.bit_depth.exponent_bits_per_sample = + info->exponent_bits_per_sample; + enc->metadata.m.bit_depth.floating_point_sample = + (info->exponent_bits_per_sample != 0u); + enc->metadata.m.modular_16_bit_buffer_sufficient = + (!info->uses_original_profile || info->bits_per_sample <= 12) && + info->alpha_bits <= 12; + if ((info->intrinsic_xsize > 0 || info->intrinsic_ysize > 0) && + (info->intrinsic_xsize != info->xsize || + info->intrinsic_ysize != info->ysize)) { + if (info->intrinsic_xsize > (1ull << 30ull) || + info->intrinsic_ysize > (1ull << 30ull) || + !enc->metadata.m.intrinsic_size.Set(info->intrinsic_xsize, + info->intrinsic_ysize)) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "Invalid intrinsic dimensions"); + } + enc->metadata.m.have_intrinsic_size = true; + } + + // The number of extra channels includes the alpha channel, so for example and + // RGBA with no other extra channels, has exactly num_extra_channels == 1 + enc->metadata.m.num_extra_channels = info->num_extra_channels; + enc->metadata.m.extra_channel_info.resize(enc->metadata.m.num_extra_channels); + if (info->num_extra_channels == 0 && info->alpha_bits) { + return JXL_API_ERROR( + enc, JXL_ENC_ERR_API_USAGE, + "when alpha_bits is non-zero, the number of channels must be at least " + "1"); + } + // If the user provides non-zero alpha_bits, we make the channel info at index + // zero the appropriate alpha channel. + if (info->alpha_bits) { + JxlExtraChannelInfo channel_info; + JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &channel_info); + channel_info.bits_per_sample = info->alpha_bits; + channel_info.exponent_bits_per_sample = info->alpha_exponent_bits; + if (JxlEncoderSetExtraChannelInfo(enc, 0, &channel_info)) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "Problem setting extra channel info for alpha"); + } + } + + enc->metadata.m.xyb_encoded = !info->uses_original_profile; + if (info->orientation > 0 && info->orientation <= 8) { + enc->metadata.m.orientation = info->orientation; + } else { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "Invalid value for orientation field"); + } + if (info->num_color_channels != 1 && info->num_color_channels != 3) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "Invalid number of color channels"); + } + if (info->intensity_target != 0) { + enc->metadata.m.SetIntensityTarget(info->intensity_target); + enc->intensity_target_set = true; + } else if (enc->color_encoding_set) { + // If this is false, JxlEncoderSetColorEncoding will be called later and we + // will get one more chance to call jxl::SetIntensityTarget, after the color + // encoding is indeed set. + jxl::SetIntensityTarget(&enc->metadata.m); + enc->intensity_target_set = true; + } + enc->metadata.m.tone_mapping.min_nits = info->min_nits; + enc->metadata.m.tone_mapping.relative_to_max_display = + info->relative_to_max_display; + enc->metadata.m.tone_mapping.linear_below = info->linear_below; + enc->basic_info = *info; + enc->basic_info_set = true; + + enc->metadata.m.have_animation = info->have_animation; + if (info->have_animation) { + if (info->animation.tps_denominator < 1) { + return JXL_API_ERROR( + enc, JXL_ENC_ERR_API_USAGE, + "If animation is used, tps_denominator must be >= 1"); + } + if (info->animation.tps_numerator < 1) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "If animation is used, tps_numerator must be >= 1"); + } + enc->metadata.m.animation.tps_numerator = info->animation.tps_numerator; + enc->metadata.m.animation.tps_denominator = info->animation.tps_denominator; + enc->metadata.m.animation.num_loops = info->animation.num_loops; + enc->metadata.m.animation.have_timecodes = info->animation.have_timecodes; + } + std::string level_message; + int required_level = VerifyLevelSettings(enc, &level_message); + if (required_level == -1 || + (static_cast(enc->codestream_level) < required_level && + enc->codestream_level != -1)) { + return JXL_API_ERROR( + enc, JXL_ENC_ERR_API_USAGE, "%s", + ("Codestream level verification for level " + + std::to_string(enc->codestream_level) + " failed: " + level_message) + .c_str()); + } + return JXL_ENC_SUCCESS; +} + +void JxlEncoderInitExtraChannelInfo(JxlExtraChannelType type, + JxlExtraChannelInfo* info) { + info->type = type; + info->bits_per_sample = 8; + info->exponent_bits_per_sample = 0; + info->dim_shift = 0; + info->name_length = 0; + info->alpha_premultiplied = JXL_FALSE; + info->spot_color[0] = 0; + info->spot_color[1] = 0; + info->spot_color[2] = 0; + info->spot_color[3] = 0; + info->cfa_channel = 0; +} + +JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelInfo( + JxlEncoder* enc, size_t index, const JxlExtraChannelInfo* info) { + if (index >= enc->metadata.m.num_extra_channels) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "Invalid value for the index of extra channel"); + } + if (JXL_ENC_SUCCESS != CheckValidBitdepth(info->bits_per_sample, + info->exponent_bits_per_sample)) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid bit depth"); + } + + jxl::ExtraChannelInfo& channel = enc->metadata.m.extra_channel_info[index]; + channel.type = static_cast(info->type); + channel.bit_depth.bits_per_sample = info->bits_per_sample; + enc->metadata.m.modular_16_bit_buffer_sufficient &= + info->bits_per_sample <= 12; + channel.bit_depth.exponent_bits_per_sample = info->exponent_bits_per_sample; + channel.bit_depth.floating_point_sample = info->exponent_bits_per_sample != 0; + channel.dim_shift = info->dim_shift; + channel.name = ""; + channel.alpha_associated = (info->alpha_premultiplied != 0); + channel.cfa_channel = info->cfa_channel; + channel.spot_color[0] = info->spot_color[0]; + channel.spot_color[1] = info->spot_color[1]; + channel.spot_color[2] = info->spot_color[2]; + channel.spot_color[3] = info->spot_color[3]; + std::string level_message; + int required_level = VerifyLevelSettings(enc, &level_message); + if (required_level == -1 || + (static_cast(enc->codestream_level) < required_level && + enc->codestream_level != -1)) { + return JXL_API_ERROR( + enc, JXL_ENC_ERR_API_USAGE, "%s", + ("Codestream level verification for level " + + std::to_string(enc->codestream_level) + " failed: " + level_message) + .c_str()); + } + return JXL_ENC_SUCCESS; +} + +JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelName(JxlEncoder* enc, + size_t index, + const char* name, + size_t size) { + if (index >= enc->metadata.m.num_extra_channels) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "Invalid value for the index of extra channel"); + } + enc->metadata.m.extra_channel_info[index].name = + std::string(name, name + size); + return JXL_ENC_SUCCESS; +} + +JxlEncoderFrameSettings* JxlEncoderFrameSettingsCreate( + JxlEncoder* enc, const JxlEncoderFrameSettings* source) { + auto opts = jxl::MemoryManagerMakeUnique( + &enc->memory_manager); + if (!opts) return nullptr; + opts->enc = enc; + if (source != nullptr) { + opts->values = source->values; + } else { + opts->values.lossless = false; + } + opts->values.cparams.level = enc->codestream_level; + opts->values.cparams.ec_distance.resize(enc->metadata.m.num_extra_channels, + -1); + + JxlEncoderFrameSettings* ret = opts.get(); + enc->encoder_options.emplace_back(std::move(opts)); + return ret; +} + +JxlEncoderFrameSettings* JxlEncoderOptionsCreate( + JxlEncoder* enc, const JxlEncoderFrameSettings* source) { + // Deprecated function name, call the non-deprecated function + return JxlEncoderFrameSettingsCreate(enc, source); +} + +JxlEncoderStatus JxlEncoderSetFrameLossless( + JxlEncoderFrameSettings* frame_settings, const JXL_BOOL lossless) { + if (lossless && frame_settings->enc->basic_info_set && + frame_settings->enc->metadata.m.xyb_encoded) { + return JXL_API_ERROR( + frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Set uses_original_profile=true for lossless encoding"); + } + frame_settings->values.lossless = lossless; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderOptionsSetLossless( + JxlEncoderFrameSettings* frame_settings, JXL_BOOL lossless) { + // Deprecated function name, call the non-deprecated function + return JxlEncoderSetFrameLossless(frame_settings, lossless); +} + +JxlEncoderStatus JxlEncoderOptionsSetEffort( + JxlEncoderFrameSettings* frame_settings, const int effort) { + return JxlEncoderFrameSettingsSetOption(frame_settings, + JXL_ENC_FRAME_SETTING_EFFORT, effort); +} + +JxlEncoderStatus JxlEncoderSetFrameDistance( + JxlEncoderFrameSettings* frame_settings, float distance) { + if (distance < 0.f || distance > 25.f) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Distance has to be in [0.0..25.0] (corresponding to " + "quality in [0.0..100.0])"); + } + if (distance > 0.f && distance < 0.01f) { + distance = 0.01f; + } + frame_settings->values.cparams.butteraugli_distance = distance; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetExtraChannelDistance( + JxlEncoderFrameSettings* frame_settings, size_t index, float distance) { + if (index >= frame_settings->enc->metadata.m.num_extra_channels) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Invalid value for the index of extra channel"); + } + if (distance != -1.f && (distance < 0.f || distance > 25.f)) { + return JXL_API_ERROR( + frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Distance has to be -1 or in [0.0..25.0] (corresponding to " + "quality in [0.0..100.0])"); + } + if (distance > 0.f && distance < 0.01f) { + distance = 0.01f; + } + + if (index >= frame_settings->values.cparams.ec_distance.size()) { + // This can only happen if JxlEncoderFrameSettingsCreate() was called before + // JxlEncoderSetBasicInfo(). + frame_settings->values.cparams.ec_distance.resize( + frame_settings->enc->metadata.m.num_extra_channels, -1); + } + + frame_settings->values.cparams.ec_distance[index] = distance; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderOptionsSetDistance( + JxlEncoderFrameSettings* frame_settings, float distance) { + // Deprecated function name, call the non-deprecated function + return JxlEncoderSetFrameDistance(frame_settings, distance); +} + +JxlEncoderStatus JxlEncoderOptionsSetDecodingSpeed( + JxlEncoderFrameSettings* frame_settings, int tier) { + return JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_DECODING_SPEED, tier); +} + +JxlEncoderStatus JxlEncoderFrameSettingsSetOption( + JxlEncoderFrameSettings* frame_settings, JxlEncoderFrameSettingId option, + int64_t value) { + // check if value is -1, 0 or 1 for Override-type options + switch (option) { + case JXL_ENC_FRAME_SETTING_NOISE: + case JXL_ENC_FRAME_SETTING_DOTS: + case JXL_ENC_FRAME_SETTING_PATCHES: + case JXL_ENC_FRAME_SETTING_GABORISH: + case JXL_ENC_FRAME_SETTING_MODULAR: + case JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE: + case JXL_ENC_FRAME_SETTING_GROUP_ORDER: + case JXL_ENC_FRAME_SETTING_RESPONSIVE: + case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC: + case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC: + case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE: + case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL: + case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES: + if (value < -1 || value > 1) { + return JXL_API_ERROR( + frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Option value has to be -1 (default), 0 (off) or 1 (on)"); + } + break; + default: + break; + } + + switch (option) { + case JXL_ENC_FRAME_SETTING_EFFORT: + if (frame_settings->enc->allow_expert_options) { + if (value < 1 || value > 10) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED, + "Encode effort has to be in [1..10]"); + } + } else { + if (value < 1 || value > 9) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED, + "Encode effort has to be in [1..9]"); + } + } + frame_settings->values.cparams.speed_tier = + static_cast(10 - value); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_BROTLI_EFFORT: + if (value < -1 || value > 11) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Brotli effort has to be in [-1..11]"); + } + // set cparams for brotli use in JPEG frames + frame_settings->values.cparams.brotli_effort = value; + // set enc option for brotli use in brob boxes + frame_settings->enc->brotli_effort = value; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_DECODING_SPEED: + if (value < 0 || value > 4) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED, + "Decoding speed has to be in [0..4]"); + } + frame_settings->values.cparams.decoding_speed_tier = value; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_RESAMPLING: + if (value != -1 && value != 1 && value != 2 && value != 4 && value != 8) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Resampling factor has to be 1, 2, 4 or 8"); + } + frame_settings->values.cparams.resampling = value; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING: + // TODO(lode): the jxl codestream allows choosing a different resampling + // factor for each extra channel, independently per frame. Move this + // option to a JxlEncoderFrameSettings-option that can be set per extra + // channel, so needs its own function rather than + // JxlEncoderFrameSettingsSetOption due to the extra channel index + // argument required. + if (value != -1 && value != 1 && value != 2 && value != 4 && value != 8) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Resampling factor has to be 1, 2, 4 or 8"); + } + frame_settings->values.cparams.ec_resampling = value; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED: + if (value < 0 || value > 1) { + return JXL_ENC_ERROR; + } + frame_settings->values.cparams.already_downsampled = (value == 1); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_NOISE: + frame_settings->values.cparams.noise = static_cast(value); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_DOTS: + frame_settings->values.cparams.dots = static_cast(value); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_PATCHES: + frame_settings->values.cparams.patches = + static_cast(value); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_EPF: + if (value < -1 || value > 3) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "EPF value has to be in [-1..3]"); + } + frame_settings->values.cparams.epf = static_cast(value); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_GABORISH: + frame_settings->values.cparams.gaborish = + static_cast(value); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_MODULAR: + frame_settings->values.cparams.modular_mode = (value == 1); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE: + frame_settings->values.cparams.keep_invisible = + static_cast(value); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_GROUP_ORDER: + frame_settings->values.cparams.centerfirst = (value == 1); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X: + if (value < -1) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Center x coordinate has to be -1 or positive"); + } + frame_settings->values.cparams.center_x = static_cast(value); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y: + if (value < -1) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Center y coordinate has to be -1 or positive"); + } + frame_settings->values.cparams.center_y = static_cast(value); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_RESPONSIVE: + frame_settings->values.cparams.responsive = value; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC: + frame_settings->values.cparams.progressive_mode = value; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC: + frame_settings->values.cparams.qprogressive_mode = value; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC: + if (value < -1 || value > 2) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Progressive DC has to be in [-1..2]"); + } + frame_settings->values.cparams.progressive_dc = value; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_PALETTE_COLORS: + if (value < -1 || value > 70913) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Option value has to be in [-1..70913]"); + } + if (value == -1) { + frame_settings->values.cparams.palette_colors = 1 << 10; + } else { + frame_settings->values.cparams.palette_colors = value; + } + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE: + // TODO(lode): the defaults of some palette settings depend on others. + // See the logic in cjxl. Similar for other settings. This should be + // handled in the encoder during JxlEncoderProcessOutput (or, + // alternatively, in the cjxl binary like now) + frame_settings->values.cparams.lossy_palette = (value == 1); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM: + if (value < -1 || value > 2) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Option value has to be in [-1..2]"); + } + if (value == -1) { + frame_settings->values.cparams.color_transform = + jxl::ColorTransform::kXYB; + } else { + frame_settings->values.cparams.color_transform = + static_cast(value); + } + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE: + if (value < -1 || value > 41) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Option value has to be in [-1..41]"); + } + frame_settings->values.cparams.colorspace = value; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE: + if (value < -1 || value > 3) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Option value has to be in [-1..3]"); + } + // TODO(lode): the default behavior of this parameter for cjxl is + // to choose 1 or 2 depending on the situation. This behavior needs to be + // implemented either in the C++ library by allowing to set this to -1, or + // kept in cjxl and set it to 1 or 2 using this API. + if (value == -1) { + frame_settings->values.cparams.modular_group_size_shift = 1; + } else { + frame_settings->values.cparams.modular_group_size_shift = value; + } + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR: + if (value < -1 || value > 15) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Option value has to be in [-1..15]"); + } + frame_settings->values.cparams.options.predictor = + static_cast(value); + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS: + // The max allowed value can in theory be higher. However, it depends on + // the effort setting. 11 is the highest safe value that doesn't cause + // tree_samples to be >= 64 in the encoder. The specification may allow + // more than this. With more fine tuning higher values could be allowed. + // For N-channel images, the largest useful value is N-1. + if (value < -1 || value > 11) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Option value has to be in [-1..11]"); + } + if (value == -1) { + frame_settings->values.cparams.options.max_properties = 0; + } else { + frame_settings->values.cparams.options.max_properties = value; + } + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL: + if (value == -1) { + frame_settings->values.cparams.force_cfl_jpeg_recompression = true; + } else { + frame_settings->values.cparams.force_cfl_jpeg_recompression = value; + } + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_INDEX_BOX: + frame_settings->values.frame_index_box = true; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_PHOTON_NOISE: + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED, + "Float option, try setting it with " + "JxlEncoderFrameSettingsSetFloatOption"); + case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES: + frame_settings->values.cparams.jpeg_compress_boxes = value; + return JXL_ENC_SUCCESS; + default: + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED, + "Unknown option"); + } +} + +JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption( + JxlEncoderFrameSettings* frame_settings, JxlEncoderFrameSettingId option, + float value) { + switch (option) { + case JXL_ENC_FRAME_SETTING_PHOTON_NOISE: + if (value < 0) return JXL_ENC_ERROR; + // TODO(lode): add encoder setting to set the 8 floating point values of + // the noise synthesis parameters per frame for more fine grained control. + frame_settings->values.cparams.photon_noise_iso = value; + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT: + if (value < -1.f || value > 100.f) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Option value has to be smaller than 100"); + } + // This value is called "iterations" or "nb_repeats" in cjxl, but is in + // fact a fraction in range 0.0-1.0, with the default value 0.5. + // Convert from floating point percentage to floating point fraction here. + if (value < -.5f) { + // TODO(lode): for this and many other settings (also in + // JxlEncoderFrameSettingsSetOption), avoid duplicating the default + // values here and in enc_params.h and options.h, have one location + // where the defaults are specified. + frame_settings->values.cparams.options.nb_repeats = 0.5f; + } else { + frame_settings->values.cparams.options.nb_repeats = value * 0.01f; + } + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT: + if (value < -1.f || value > 100.f) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Option value has to be in [-1..100]"); + } + if (value < -.5f) { + frame_settings->values.cparams.channel_colors_pre_transform_percent = + 95.0f; + } else { + frame_settings->values.cparams.channel_colors_pre_transform_percent = + value; + } + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT: + if (value < -1.f || value > 100.f) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Option value has to be in [-1..100]"); + } + if (value < -.5f) { + frame_settings->values.cparams.channel_colors_percent = 80.0f; + } else { + frame_settings->values.cparams.channel_colors_percent = value; + } + return JXL_ENC_SUCCESS; + case JXL_ENC_FRAME_SETTING_EFFORT: + case JXL_ENC_FRAME_SETTING_DECODING_SPEED: + case JXL_ENC_FRAME_SETTING_RESAMPLING: + case JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING: + case JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED: + case JXL_ENC_FRAME_SETTING_NOISE: + case JXL_ENC_FRAME_SETTING_DOTS: + case JXL_ENC_FRAME_SETTING_PATCHES: + case JXL_ENC_FRAME_SETTING_EPF: + case JXL_ENC_FRAME_SETTING_GABORISH: + case JXL_ENC_FRAME_SETTING_MODULAR: + case JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE: + case JXL_ENC_FRAME_SETTING_GROUP_ORDER: + case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X: + case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y: + case JXL_ENC_FRAME_SETTING_RESPONSIVE: + case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC: + case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC: + case JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC: + case JXL_ENC_FRAME_SETTING_PALETTE_COLORS: + case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE: + case JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM: + case JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE: + case JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE: + case JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR: + case JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS: + case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL: + case JXL_ENC_FRAME_INDEX_BOX: + case JXL_ENC_FRAME_SETTING_BROTLI_EFFORT: + case JXL_ENC_FRAME_SETTING_FILL_ENUM: + case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES: + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED, + "Int option, try setting it with " + "JxlEncoderFrameSettingsSetOption"); + default: + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED, + "Unknown option"); + } +} +JxlEncoder* JxlEncoderCreate(const JxlMemoryManager* memory_manager) { + JxlMemoryManager local_memory_manager; + if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager)) { + return nullptr; + } + + void* alloc = + jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlEncoder)); + if (!alloc) return nullptr; + JxlEncoder* enc = new (alloc) JxlEncoder(); + enc->memory_manager = local_memory_manager; + // TODO(sboukortt): add an API function to set this. + enc->cms = jxl::GetJxlCms(); + + // Initialize all the field values. + JxlEncoderReset(enc); + + return enc; +} + +void JxlEncoderReset(JxlEncoder* enc) { + enc->thread_pool.reset(); + enc->input_queue.clear(); + enc->num_queued_frames = 0; + enc->num_queued_boxes = 0; + enc->encoder_options.clear(); + enc->output_byte_queue.clear(); + enc->output_fast_frame_queue.clear(); + enc->codestream_bytes_written_beginning_of_frame = 0; + enc->codestream_bytes_written_end_of_frame = 0; + enc->wrote_bytes = false; + enc->jxlp_counter = 0; + enc->metadata = jxl::CodecMetadata(); + enc->last_used_cparams = jxl::CompressParams(); + enc->frames_closed = false; + enc->boxes_closed = false; + enc->basic_info_set = false; + enc->color_encoding_set = false; + enc->intensity_target_set = false; + enc->use_container = false; + enc->use_boxes = false; + enc->codestream_level = -1; + JxlEncoderInitBasicInfo(&enc->basic_info); +} + +void JxlEncoderDestroy(JxlEncoder* enc) { + if (enc) { + JxlMemoryManager local_memory_manager = enc->memory_manager; + // Call destructor directly since custom free function is used. + enc->~JxlEncoder(); + jxl::MemoryManagerFree(&local_memory_manager, enc); + } +} + +JxlEncoderError JxlEncoderGetError(JxlEncoder* enc) { return enc->error; } + +JxlEncoderStatus JxlEncoderUseContainer(JxlEncoder* enc, + JXL_BOOL use_container) { + if (enc->wrote_bytes) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "this setting can only be set at the beginning"); + } + enc->use_container = static_cast(use_container); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderStoreJPEGMetadata(JxlEncoder* enc, + JXL_BOOL store_jpeg_metadata) { + if (enc->wrote_bytes) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "this setting can only be set at the beginning"); + } + enc->store_jpeg_metadata = static_cast(store_jpeg_metadata); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetCodestreamLevel(JxlEncoder* enc, int level) { + if (level != -1 && level != 5 && level != 10) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_NOT_SUPPORTED, "invalid level"); + } + if (enc->wrote_bytes) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "this setting can only be set at the beginning"); + } + enc->codestream_level = level; + return JXL_ENC_SUCCESS; +} + +int JxlEncoderGetRequiredCodestreamLevel(const JxlEncoder* enc) { + return VerifyLevelSettings(enc, nullptr); +} + +void JxlEncoderSetCms(JxlEncoder* enc, JxlCmsInterface cms) { + jxl::msan::MemoryIsInitialized(&cms, sizeof(cms)); + enc->cms = cms; +} + +JxlEncoderStatus JxlEncoderSetParallelRunner(JxlEncoder* enc, + JxlParallelRunner parallel_runner, + void* parallel_runner_opaque) { + if (enc->thread_pool) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "parallel runner already set"); + } + enc->thread_pool = jxl::MemoryManagerMakeUnique( + &enc->memory_manager, parallel_runner, parallel_runner_opaque); + if (!enc->thread_pool) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_GENERIC, + "error setting parallel runner"); + } + return JXL_ENC_SUCCESS; +} + +namespace { +JxlEncoderStatus GetCurrentDimensions( + const JxlEncoderFrameSettings* frame_settings, size_t& xsize, + size_t& ysize) { + xsize = frame_settings->enc->metadata.xsize(); + ysize = frame_settings->enc->metadata.ysize(); + if (frame_settings->values.header.layer_info.have_crop) { + xsize = frame_settings->values.header.layer_info.xsize; + ysize = frame_settings->values.header.layer_info.ysize; + } + if (frame_settings->values.cparams.already_downsampled) { + size_t factor = frame_settings->values.cparams.resampling; + xsize = jxl::DivCeil(xsize, factor); + ysize = jxl::DivCeil(ysize, factor); + } + if (xsize == 0 || ysize == 0) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "zero-sized frame is not allowed"); + } + return JXL_ENC_SUCCESS; +} +} // namespace + +JxlEncoderStatus JxlEncoderAddJPEGFrame( + const JxlEncoderFrameSettings* frame_settings, const uint8_t* buffer, + size_t size) { + if (frame_settings->enc->frames_closed) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Frame input is already closed"); + } + + jxl::CodecInOut io; + if (!jxl::jpeg::DecodeImageJPG(jxl::Span(buffer, size), &io)) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_BAD_INPUT, + "Error during decode of input JPEG"); + } + + if (!frame_settings->enc->color_encoding_set) { + if (!SetColorEncodingFromJpegData( + *io.Main().jpeg_data, + &frame_settings->enc->metadata.m.color_encoding)) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_BAD_INPUT, + "Error in input JPEG color space"); + } + } + + if (!frame_settings->enc->basic_info_set) { + JxlBasicInfo basic_info; + JxlEncoderInitBasicInfo(&basic_info); + basic_info.xsize = io.Main().jpeg_data->width; + basic_info.ysize = io.Main().jpeg_data->height; + basic_info.uses_original_profile = true; + if (JxlEncoderSetBasicInfo(frame_settings->enc, &basic_info) != + JXL_ENC_SUCCESS) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC, + "Error setting basic info"); + } + } + + if (frame_settings->enc->metadata.m.xyb_encoded) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Can't XYB encode a lossless JPEG"); + } + if (!io.blobs.exif.empty()) { + JxlOrientation orientation = static_cast( + frame_settings->enc->metadata.m.orientation); + jxl::InterpretExif(io.blobs.exif, &orientation); + frame_settings->enc->metadata.m.orientation = orientation; + + size_t exif_size = io.blobs.exif.size(); + // Exif data in JPEG is limited to 64k + if (exif_size > 0xFFFF) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC, + "Exif larger than possible in JPEG?"); + } + exif_size += 4; // prefix 4 zero bytes for tiff offset + std::vector exif(exif_size); + memcpy(exif.data() + 4, io.blobs.exif.data(), io.blobs.exif.size()); + JxlEncoderUseBoxes(frame_settings->enc); + JxlEncoderAddBox(frame_settings->enc, "Exif", exif.data(), exif_size, + frame_settings->values.cparams.jpeg_compress_boxes); + } + if (!io.blobs.xmp.empty()) { + JxlEncoderUseBoxes(frame_settings->enc); + JxlEncoderAddBox(frame_settings->enc, "xml ", io.blobs.xmp.data(), + io.blobs.xmp.size(), + frame_settings->values.cparams.jpeg_compress_boxes); + } + if (!io.blobs.jumbf.empty()) { + JxlEncoderUseBoxes(frame_settings->enc); + JxlEncoderAddBox(frame_settings->enc, "jumb", io.blobs.jumbf.data(), + io.blobs.jumbf.size(), + frame_settings->values.cparams.jpeg_compress_boxes); + } + if (frame_settings->enc->store_jpeg_metadata) { + jxl::jpeg::JPEGData data_in = *io.Main().jpeg_data; + jxl::PaddedBytes jpeg_data; + if (!jxl::jpeg::EncodeJPEGData(data_in, &jpeg_data, + frame_settings->values.cparams)) { + return JXL_API_ERROR( + frame_settings->enc, JXL_ENC_ERR_JBRD, + "JPEG bitstream reconstruction data cannot be encoded"); + } + frame_settings->enc->jpeg_metadata = std::vector( + jpeg_data.data(), jpeg_data.data() + jpeg_data.size()); + } + + auto queued_frame = jxl::MemoryManagerMakeUnique( + &frame_settings->enc->memory_manager, + // JxlEncoderQueuedFrame is a struct with no constructors, so we use the + // default move constructor there. + jxl::JxlEncoderQueuedFrame{ + frame_settings->values, + jxl::ImageBundle(&frame_settings->enc->metadata.m), + {}}); + if (!queued_frame) { + // TODO(jon): when can this happen? is this an API usage error? + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC, + "No frame queued?"); + } + queued_frame->frame.SetFromImage(std::move(*io.Main().color()), + io.Main().c_current()); + size_t xsize, ysize; + if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC, + "bad dimensions"); + } + if (xsize != static_cast(io.Main().jpeg_data->width) || + ysize != static_cast(io.Main().jpeg_data->height)) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC, + "JPEG dimensions don't match frame dimensions"); + } + std::vector extra_channels( + frame_settings->enc->metadata.m.num_extra_channels); + for (auto& extra_channel : extra_channels) { + extra_channel = jxl::ImageF(xsize, ysize); + queued_frame->ec_initialized.push_back(0); + } + queued_frame->frame.SetExtraChannels(std::move(extra_channels)); + queued_frame->frame.jpeg_data = std::move(io.Main().jpeg_data); + queued_frame->frame.color_transform = io.Main().color_transform; + queued_frame->frame.chroma_subsampling = io.Main().chroma_subsampling; + + QueueFrame(frame_settings, queued_frame); + return JXL_ENC_SUCCESS; +} + +static bool CanDoFastLossless(const JxlEncoderFrameSettings* frame_settings, + const JxlPixelFormat* pixel_format, + bool has_alpha) { + if (!frame_settings->values.lossless) { + return false; + } + // TODO(veluca): many of the following options could be made to work, but are + // just not implemented in FJXL's frame header handling yet. + if (frame_settings->values.frame_index_box) { + return false; + } + if (frame_settings->values.header.layer_info.have_crop) { + return false; + } + if (frame_settings->enc->metadata.m.have_animation) { + return false; + } + if (frame_settings->values.cparams.speed_tier != jxl::SpeedTier::kLightning) { + return false; + } + if (frame_settings->values.image_bit_depth.type == + JxlBitDepthType::JXL_BIT_DEPTH_CUSTOM && + frame_settings->values.image_bit_depth.bits_per_sample != + frame_settings->enc->metadata.m.bit_depth.bits_per_sample) { + return false; + } + // TODO(veluca): implement support for LSB-padded input in fast_lossless. + if (frame_settings->values.image_bit_depth.type == + JxlBitDepthType::JXL_BIT_DEPTH_FROM_PIXEL_FORMAT && + frame_settings->values.image_bit_depth.bits_per_sample % 8 != 0) { + return false; + } + if (!frame_settings->values.frame_name.empty()) { + return false; + } + // No extra channels other than alpha. + if (!(has_alpha && frame_settings->enc->metadata.m.num_extra_channels == 1) && + frame_settings->enc->metadata.m.num_extra_channels != 0) { + return false; + } + if (frame_settings->enc->metadata.m.bit_depth.bits_per_sample > 16) { + return false; + } + if (pixel_format->data_type != JxlDataType::JXL_TYPE_FLOAT16 && + pixel_format->data_type != JxlDataType::JXL_TYPE_UINT16 && + pixel_format->data_type != JxlDataType::JXL_TYPE_UINT8) { + return false; + } + if ((frame_settings->enc->metadata.m.bit_depth.bits_per_sample > 8) != + (pixel_format->data_type == JxlDataType::JXL_TYPE_UINT16 || + pixel_format->data_type == JxlDataType::JXL_TYPE_FLOAT16)) { + return false; + } + if (!((pixel_format->num_channels == 1 || pixel_format->num_channels == 3) && + !has_alpha) && + !((pixel_format->num_channels == 2 || pixel_format->num_channels == 4) && + has_alpha)) { + return false; + } + + return true; +} + +JxlEncoderStatus JxlEncoderAddImageFrame( + const JxlEncoderFrameSettings* frame_settings, + const JxlPixelFormat* pixel_format, const void* buffer, size_t size) { + if (!frame_settings->enc->basic_info_set || + (!frame_settings->enc->color_encoding_set && + !frame_settings->enc->metadata.m.xyb_encoded)) { + // Basic Info must be set, and color encoding must be set directly, + // or set to XYB via JxlBasicInfo.uses_original_profile = JXL_FALSE + // Otherwise, this is an API misuse. + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Basic info or color encoding not set yet"); + } + + if (frame_settings->enc->frames_closed) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Frame input already closed"); + } + if (pixel_format->num_channels < 3) { + if (frame_settings->enc->basic_info.num_color_channels != 1) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Grayscale pixel format input for an RGB image"); + } + } else { + if (frame_settings->enc->basic_info.num_color_channels != 3) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "RGB pixel format input for a grayscale image"); + } + } + + bool has_alpha = frame_settings->enc->metadata.m.HasAlpha(); + + size_t xsize, ysize; + if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC, + "bad dimensions"); + } + + // All required conditions to do fast-lossless. + if (CanDoFastLossless(frame_settings, pixel_format, has_alpha)) { + const size_t bytes_per_pixel = + pixel_format->data_type == JxlDataType::JXL_TYPE_UINT8 + ? pixel_format->num_channels + : pixel_format->num_channels * 2; + const size_t last_row_size = xsize * bytes_per_pixel; + const size_t align = pixel_format->align; + const size_t row_size = + (align > 1 ? jxl::DivCeil(last_row_size, align) * align + : last_row_size); + const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size; + if (bytes_to_read > size) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "provided image buffer too small"); + } + const bool big_endian = + pixel_format->endianness == JXL_BIG_ENDIAN || + (pixel_format->endianness == JXL_NATIVE_ENDIAN && !IsLittleEndian()); + + auto runner = +[](void* void_pool, void* opaque, void fun(void*, size_t), + size_t count) { + auto* pool = reinterpret_cast(void_pool); + JXL_CHECK(jxl::RunOnPool( + pool, 0, count, jxl::ThreadPool::NoInit, + [&](size_t i, size_t) { fun(opaque, i); }, "Encode fast lossless")); + }; + QueueFastLosslessFrame( + frame_settings, + JxlFastLosslessPrepareFrame( + reinterpret_cast(buffer), xsize, row_size, + ysize, pixel_format->num_channels, + frame_settings->enc->metadata.m.bit_depth.bits_per_sample, + big_endian, /*effort=*/2, frame_settings->enc->thread_pool.get(), + runner)); + return JXL_ENC_SUCCESS; + } + + auto queued_frame = jxl::MemoryManagerMakeUnique( + &frame_settings->enc->memory_manager, + // JxlEncoderQueuedFrame is a struct with no constructors, so we use the + // default move constructor there. + jxl::JxlEncoderQueuedFrame{ + frame_settings->values, + jxl::ImageBundle(&frame_settings->enc->metadata.m), + {}}); + + if (!queued_frame) { + // TODO(jon): when can this happen? is this an API usage error? + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC, + "No frame queued?"); + } + + jxl::ColorEncoding c_current; + if (!frame_settings->enc->color_encoding_set) { + if ((pixel_format->data_type == JXL_TYPE_FLOAT) || + (pixel_format->data_type == JXL_TYPE_FLOAT16)) { + c_current = + jxl::ColorEncoding::LinearSRGB(pixel_format->num_channels < 3); + } else { + c_current = jxl::ColorEncoding::SRGB(pixel_format->num_channels < 3); + } + } else { + c_current = frame_settings->enc->metadata.m.color_encoding; + } + uint32_t num_channels = pixel_format->num_channels; + size_t has_interleaved_alpha = + static_cast(num_channels == 2 || num_channels == 4); + if (has_interleaved_alpha > + frame_settings->enc->metadata.m.num_extra_channels) { + return JXL_API_ERROR( + frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "number of extra channels mismatch (need 1 extra channel for alpha)"); + } + std::vector extra_channels( + frame_settings->enc->metadata.m.num_extra_channels); + for (auto& extra_channel : extra_channels) { + extra_channel = jxl::ImageF(xsize, ysize); + } + queued_frame->frame.SetExtraChannels(std::move(extra_channels)); + for (auto& ec_info : frame_settings->enc->metadata.m.extra_channel_info) { + if (has_interleaved_alpha && ec_info.type == jxl::ExtraChannel::kAlpha) { + queued_frame->ec_initialized.push_back(1); + has_interleaved_alpha = 0; // only first Alpha is initialized + } else { + queued_frame->ec_initialized.push_back(0); + } + } + queued_frame->frame.origin.x0 = + frame_settings->values.header.layer_info.crop_x0; + queued_frame->frame.origin.y0 = + frame_settings->values.header.layer_info.crop_y0; + queued_frame->frame.use_for_next_frame = + (frame_settings->values.header.layer_info.save_as_reference != 0u); + queued_frame->frame.blendmode = + frame_settings->values.header.layer_info.blend_info.blendmode == + JXL_BLEND_REPLACE + ? jxl::BlendMode::kReplace + : jxl::BlendMode::kBlend; + queued_frame->frame.blend = + frame_settings->values.header.layer_info.blend_info.source > 0; + + if (JXL_ENC_SUCCESS != + VerifyInputBitDepth(frame_settings->values.image_bit_depth, + *pixel_format)) { + return JXL_API_ERROR_NOSET("Invalid input bit depth"); + } + size_t bits_per_sample = + GetBitDepth(frame_settings->values.image_bit_depth, + frame_settings->enc->metadata.m, *pixel_format); + const uint8_t* uint8_buffer = reinterpret_cast(buffer); + if (!jxl::ConvertFromExternal( + jxl::Span(uint8_buffer, size), xsize, ysize, c_current, + bits_per_sample, *pixel_format, + frame_settings->enc->thread_pool.get(), &(queued_frame->frame))) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Invalid input buffer"); + } + if (frame_settings->values.lossless && + frame_settings->enc->metadata.m.xyb_encoded) { + return JXL_API_ERROR( + frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Set uses_original_profile=true for lossless encoding"); + } + queued_frame->option_values.cparams.level = + frame_settings->enc->codestream_level; + + QueueFrame(frame_settings, queued_frame); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderUseBoxes(JxlEncoder* enc) { + if (enc->wrote_bytes) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "this setting can only be set at the beginning"); + } + enc->use_boxes = true; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc, const JxlBoxType type, + const uint8_t* contents, size_t size, + JXL_BOOL compress_box) { + if (!enc->use_boxes) { + return JXL_API_ERROR( + enc, JXL_ENC_ERR_API_USAGE, + "must set JxlEncoderUseBoxes at the beginning to add boxes"); + } + if (compress_box) { + if (memcmp("jxl", type, 3) == 0) { + return JXL_API_ERROR( + enc, JXL_ENC_ERR_API_USAGE, + "brob box may not contain a type starting with \"jxl\""); + } + if (memcmp("jbrd", type, 4) == 0) { + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "jbrd box may not be brob compressed"); + } + if (memcmp("brob", type, 4) == 0) { + // The compress_box will compress an existing non-brob box into a brob + // box. If already giving a valid brotli-compressed brob box, set + // compress_box to false since it is already compressed. + return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, + "a brob box cannot contain another brob box"); + } + } + + auto box = jxl::MemoryManagerMakeUnique( + &enc->memory_manager); + + box->type = jxl::MakeBoxType(type); + box->contents.assign(contents, contents + size); + box->compress_box = !!compress_box; + QueueBox(enc, box); + return JXL_ENC_SUCCESS; +} + +JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer( + const JxlEncoderOptions* frame_settings, const JxlPixelFormat* pixel_format, + const void* buffer, size_t size, uint32_t index) { + if (index >= frame_settings->enc->metadata.m.num_extra_channels) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Invalid value for the index of extra channel"); + } + if (!frame_settings->enc->basic_info_set || + !frame_settings->enc->color_encoding_set) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Basic info has to be set first"); + } + if (frame_settings->enc->input_queue.empty()) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "First add image frame, then extra channels"); + } + if (frame_settings->enc->frames_closed) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Frame input already closed"); + } + size_t xsize, ysize; + if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC, + "bad dimensions"); + } + JxlPixelFormat ec_format = *pixel_format; + ec_format.num_channels = 1; + if (JXL_ENC_SUCCESS != + VerifyInputBitDepth(frame_settings->values.image_bit_depth, ec_format)) { + return JXL_API_ERROR_NOSET("Invalid input bit depth"); + } + size_t bits_per_sample = GetBitDepth( + frame_settings->values.image_bit_depth, + frame_settings->enc->metadata.m.extra_channel_info[index], ec_format); + const uint8_t* uint8_buffer = reinterpret_cast(buffer); + auto queued_frame = frame_settings->enc->input_queue.back().frame.get(); + if (!jxl::ConvertFromExternal(jxl::Span(uint8_buffer, size), + xsize, ysize, bits_per_sample, ec_format, 0, + frame_settings->enc->thread_pool.get(), + &queued_frame->frame.extra_channels()[index])) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Failed to set buffer for extra channel"); + } + queued_frame->ec_initialized[index] = 1; + + return JXL_ENC_SUCCESS; +} + +void JxlEncoderCloseFrames(JxlEncoder* enc) { enc->frames_closed = true; } + +void JxlEncoderCloseBoxes(JxlEncoder* enc) { enc->boxes_closed = true; } + +void JxlEncoderCloseInput(JxlEncoder* enc) { + JxlEncoderCloseFrames(enc); + JxlEncoderCloseBoxes(enc); +} +JxlEncoderStatus JxlEncoderProcessOutput(JxlEncoder* enc, uint8_t** next_out, + size_t* avail_out) { + while (*avail_out >= 32 && + (!enc->output_byte_queue.empty() || + !enc->output_fast_frame_queue.empty() || !enc->input_queue.empty())) { + if (!enc->output_byte_queue.empty()) { + size_t to_copy = std::min(*avail_out, enc->output_byte_queue.size()); + std::copy_n(enc->output_byte_queue.begin(), to_copy, *next_out); + *next_out += to_copy; + *avail_out -= to_copy; + enc->output_byte_queue.erase(enc->output_byte_queue.begin(), + enc->output_byte_queue.begin() + to_copy); + } else if (!enc->output_fast_frame_queue.empty()) { + size_t count = JxlFastLosslessWriteOutput( + enc->output_fast_frame_queue.front().get(), *next_out, *avail_out); + *next_out += count; + *avail_out -= count; + if (count == 0) { + enc->output_fast_frame_queue.pop_front(); + } + + } else if (!enc->input_queue.empty()) { + if (enc->RefillOutputByteQueue() != JXL_ENC_SUCCESS) { + return JXL_ENC_ERROR; + } + } + } + + if (!enc->output_byte_queue.empty() || + !enc->output_fast_frame_queue.empty() || !enc->input_queue.empty()) { + return JXL_ENC_NEED_MORE_OUTPUT; + } + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetFrameHeader(JxlEncoderOptions* frame_settings, + const JxlFrameHeader* frame_header) { + if (frame_header->layer_info.blend_info.source > 3) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "invalid blending source index"); + } + // If there are no extra channels, it's ok for the value to be 0. + if (frame_header->layer_info.blend_info.alpha != 0 && + frame_header->layer_info.blend_info.alpha >= + frame_settings->enc->metadata.m.extra_channel_info.size()) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "alpha blend channel index out of bounds"); + } + + frame_settings->values.header = *frame_header; + // Setting the frame header resets the frame name, it must be set again with + // JxlEncoderSetFrameName if desired. + frame_settings->values.frame_name = ""; + + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetExtraChannelBlendInfo( + JxlEncoderOptions* frame_settings, size_t index, + const JxlBlendInfo* blend_info) { + if (index >= frame_settings->enc->metadata.m.num_extra_channels) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "Invalid value for the index of extra channel"); + } + + if (frame_settings->values.extra_channel_blend_info.size() != + frame_settings->enc->metadata.m.num_extra_channels) { + JxlBlendInfo default_blend_info; + JxlEncoderInitBlendInfo(&default_blend_info); + frame_settings->values.extra_channel_blend_info.resize( + frame_settings->enc->metadata.m.num_extra_channels, default_blend_info); + } + frame_settings->values.extra_channel_blend_info[index] = *blend_info; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetFrameName(JxlEncoderFrameSettings* frame_settings, + const char* frame_name) { + std::string str = frame_name ? frame_name : ""; + if (str.size() > 1071) { + return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE, + "frame name can be max 1071 bytes long"); + } + frame_settings->values.frame_name = str; + frame_settings->values.header.name_length = str.size(); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetFrameBitDepth( + JxlEncoderFrameSettings* frame_settings, const JxlBitDepth* bit_depth) { + if (bit_depth->type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT && + bit_depth->type != JXL_BIT_DEPTH_FROM_CODESTREAM) { + return JXL_API_ERROR_NOSET( + "Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT and " + "JXL_BIT_DEPTH_FROM_CODESTREAM is implemented " + "for input buffers."); + } + frame_settings->values.image_bit_depth = *bit_depth; + return JXL_ENC_SUCCESS; +} + +void JxlColorEncodingSetToSRGB(JxlColorEncoding* color_encoding, + JXL_BOOL is_gray) { + ConvertInternalToExternalColorEncoding(jxl::ColorEncoding::SRGB(is_gray), + color_encoding); +} + +void JxlColorEncodingSetToLinearSRGB(JxlColorEncoding* color_encoding, + JXL_BOOL is_gray) { + ConvertInternalToExternalColorEncoding( + jxl::ColorEncoding::LinearSRGB(is_gray), color_encoding); +} + +void JxlEncoderAllowExpertOptions(JxlEncoder* enc) { + enc->allow_expert_options = true; +} diff --git a/third_party/jpeg-xl/lib/jxl/encode_internal.h b/third_party/jpeg-xl/lib/jxl/encode_internal.h new file mode 100644 index 0000000000..7713c5cab6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/encode_internal.h @@ -0,0 +1,275 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +#ifndef LIB_JXL_ENCODE_INTERNAL_H_ +#define LIB_JXL_ENCODE_INTERNAL_H_ + +#include +#include +#include +#include + +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/enc_fast_lossless.h" +#include "lib/jxl/enc_frame.h" +#include "lib/jxl/memory_manager_internal.h" + +namespace jxl { + +/* Frame index box 'jxli' will start with Varint() for +NF: has type Varint(): number of frames listed in the index. +TNUM: has type u32: numerator of tick unit. +TDEN: has type u32: denominator of tick unit. Value 0 means the file is +ill-formed. per frame i listed: OFFi: has type Varint(): offset of start byte of +this frame compared to start byte of previous frame from this index in the JPEG +XL codestream. For the first frame, this is the offset from the first byte of +the JPEG XL codestream. Ti: has type Varint(): duration in ticks between the +start of this frame and the start of the next frame in the index. If this is the +last frame in the index, this is the duration in ticks between the start of this +frame and the end of the stream. A tick lasts TNUM / TDEN seconds. Fi: has type +Varint(): amount of frames the next frame in the index occurs after this frame. +If this is the last frame in the index, this is the amount of frames after this +frame in the remainder of the stream. Only frames that are presented by the +decoder are counted for this purpose, this excludes frames that are not intended +for display but for compositing with other frames, such as frames that aren't +the last frame with a duration of 0 ticks. + +All the frames listed in jxli are keyframes and the first frame is +present in the list. +There shall be either zero or one Frame Index boxes in a JPEG XL file. +The offsets OFFi per frame are given as bytes in the codestream, not as +bytes in the file format using the box structure. This means if JPEG XL Partial +Codestream boxes are used, the offset is counted within the concatenated +codestream, bytes from box headers or non-codestream boxes are not counted. +*/ + +typedef struct JxlEncoderFrameIndexBoxEntryStruct { + bool to_be_indexed; + uint32_t duration; + uint64_t OFFi; +} JxlEncoderFrameIndexBoxEntry; + +typedef struct JxlEncoderFrameIndexBoxStruct { + // We always need to record the first frame entry, so presence of the + // first entry alone is not an indication if it was requested to be + // stored. + bool index_box_requested_through_api = false; + + int64_t NF() const { return entries.size(); } + bool StoreFrameIndexBox() { + for (auto e : entries) { + if (e.to_be_indexed) { + return true; + } + } + return false; + } + int32_t TNUM = 1; + int32_t TDEN = 1000; + + std::vector entries; + + // That way we can ensure that every index box will have the first frame. + // If the API user decides to mark it as an indexed frame, we call + // the AddFrame again, this time with requested. + void AddFrame(uint64_t OFFi, uint32_t duration, bool to_be_indexed) { + // We call AddFrame to every frame. + // Recording the first frame is required by the standard. + // Knowing the last frame is required, since the last indexed frame + // needs to know how many frames until the end. + // To be able to tell how many frames there are between each index + // entry we just record every frame here. + if (entries.size() == 1) { + if (OFFi == entries[0].OFFi) { + // API use for the first frame, let's clear the already recorded first + // frame. + entries.clear(); + } + } + JxlEncoderFrameIndexBoxEntry e; + e.to_be_indexed = to_be_indexed; + e.OFFi = OFFi; + e.duration = duration; + entries.push_back(e); + } +} JxlEncoderFrameIndexBox; + +// The encoder options (such as quality, compression speed, ...) for a single +// frame, but not encoder-wide options such as box-related options. +typedef struct JxlEncoderFrameSettingsValuesStruct { + // lossless is a separate setting from cparams because it is a combination + // setting that overrides multiple settings inside of cparams. + bool lossless; + CompressParams cparams; + JxlFrameHeader header; + std::vector extra_channel_blend_info; + std::string frame_name; + JxlBitDepth image_bit_depth; + bool frame_index_box = false; +} JxlEncoderFrameSettingsValues; + +typedef std::array BoxType; + +// Utility function that makes a BoxType from a string literal. The string must +// have 4 characters, a 5th null termination character is optional. +constexpr BoxType MakeBoxType(const char* type) { + return BoxType( + {{static_cast(type[0]), static_cast(type[1]), + static_cast(type[2]), static_cast(type[3])}}); +} + +constexpr unsigned char kContainerHeader[] = { + 0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xd, 0xa, 0x87, + 0xa, 0, 0, 0, 0x14, 'f', 't', 'y', 'p', 'j', 'x', + 'l', ' ', 0, 0, 0, 0, 'j', 'x', 'l', ' '}; + +constexpr unsigned char kLevelBoxHeader[] = {0, 0, 0, 0x9, 'j', 'x', 'l', 'l'}; + +struct JxlEncoderQueuedFrame { + JxlEncoderFrameSettingsValues option_values; + ImageBundle frame; + std::vector ec_initialized; +}; + +struct JxlEncoderQueuedBox { + BoxType type; + std::vector contents; + bool compress_box; +}; + +using FJXLFrameUniquePtr = + std::unique_ptr; + +// Either a frame, or a box, not both. +// Can also be a FJXL frame. +struct JxlEncoderQueuedInput { + explicit JxlEncoderQueuedInput(const JxlMemoryManager& memory_manager) + : frame(nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)), + box(nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)) {} + MemoryManagerUniquePtr frame; + MemoryManagerUniquePtr box; + FJXLFrameUniquePtr fast_lossless_frame = {nullptr, + JxlFastLosslessFreeFrameState}; +}; + +// Appends a JXL container box header with given type, size, and unbounded +// properties to output. +template +void AppendBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded, + T* output) { + uint64_t box_size = 0; + bool large_size = false; + if (!unbounded) { + box_size = size + 8; + if (box_size >= 0x100000000ull) { + large_size = true; + } + } + + { + const uint64_t store = large_size ? 1 : box_size; + for (size_t i = 0; i < 4; i++) { + output->push_back(store >> (8 * (3 - i)) & 0xff); + } + } + for (size_t i = 0; i < 4; i++) { + output->push_back(type[i]); + } + + if (large_size) { + for (size_t i = 0; i < 8; i++) { + output->push_back(box_size >> (8 * (7 - i)) & 0xff); + } + } +} + +} // namespace jxl + +// Internal use only struct, can only be initialized correctly by +// JxlEncoderCreate. +struct JxlEncoderStruct { + JxlEncoderError error = JxlEncoderError::JXL_ENC_ERR_OK; + JxlMemoryManager memory_manager; + jxl::MemoryManagerUniquePtr thread_pool{ + nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)}; + JxlCmsInterface cms; + std::vector> + encoder_options; + + size_t num_queued_frames; + size_t num_queued_boxes; + std::vector input_queue; + std::deque output_byte_queue; + std::deque output_fast_frame_queue; + + // How many codestream bytes have been written, i.e., + // content of jxlc and jxlp boxes. Frame index box jxli + // requires position indices to point to codestream bytes, + // so we need to keep track of the total of flushed or queue + // codestream bytes. These bytes may be in a single jxlc box + // or across multiple jxlp boxes. + size_t codestream_bytes_written_beginning_of_frame; + size_t codestream_bytes_written_end_of_frame; + jxl::JxlEncoderFrameIndexBox frame_index_box; + + // Force using the container even if not needed + bool use_container; + // User declared they will add metadata boxes + bool use_boxes; + + // TODO(lode): move level into jxl::CompressParams since some C++ + // implementation decisions should be based on it: level 10 allows more + // features to be used. + int32_t codestream_level; + bool store_jpeg_metadata; + jxl::CodecMetadata metadata; + std::vector jpeg_metadata; + + // Wrote any output at all, so wrote the data before the first user added + // frame or box, such as signature, basic info, ICC profile or jpeg + // reconstruction box. + bool wrote_bytes; + jxl::CompressParams last_used_cparams; + JxlBasicInfo basic_info; + + // Encoder wrote a jxlp (partial codestream) box, so any next codestream + // parts must also be written in jxlp boxes, a single jxlc box cannot be + // used. The counter is used for the 4-byte jxlp box index header. + size_t jxlp_counter; + + bool frames_closed; + bool boxes_closed; + bool basic_info_set; + bool color_encoding_set; + bool intensity_target_set; + bool allow_expert_options = false; + int brotli_effort = -1; + + // Takes the first frame in the input_queue, encodes it, and appends + // the bytes to the output_byte_queue. + JxlEncoderStatus RefillOutputByteQueue(); + + bool MustUseContainer() const { + return use_container || (codestream_level != 5 && codestream_level != -1) || + store_jpeg_metadata || use_boxes; + } + + // Appends the bytes of a JXL box header with the provided type and size to + // the end of the output_byte_queue. If unbounded is true, the size won't be + // added to the header and the box will be assumed to continue until EOF. + void AppendBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded); +}; + +struct JxlEncoderFrameSettingsStruct { + JxlEncoder* enc; + jxl::JxlEncoderFrameSettingsValues values; +}; + +#endif // LIB_JXL_ENCODE_INTERNAL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/encode_test.cc b/third_party/jpeg-xl/lib/jxl/encode_test.cc new file mode 100644 index 0000000000..3f1d77fd62 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/encode_test.cc @@ -0,0 +1,1405 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include + +#include "lib/extras/codec.h" +#include "lib/extras/dec/jxl.h" +#include "lib/jxl/enc_butteraugli_pnorm.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/encode_internal.h" +#include "lib/jxl/jpeg/dec_jpeg_data.h" +#include "lib/jxl/jpeg/dec_jpeg_data_writer.h" +#include "lib/jxl/test_image.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +TEST(EncodeTest, AddFrameAfterCloseInputTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + JxlEncoderCloseInput(enc.get()); + + size_t xsize = 64; + size_t ysize = 64; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + jxl::CodecInOut input_io = + jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize); + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = false; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, + /*is_gray=*/pixel_format.num_channels < 3); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); +} + +TEST(EncodeTest, AddJPEGAfterCloseTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + JxlEncoderCloseInput(enc.get()); + + const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg"; + const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path); + + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size())); +} + +TEST(EncodeTest, AddFrameBeforeColorEncodingTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + size_t xsize = 64; + size_t ysize = 64; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + jxl::CodecInOut input_io = + jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize); + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = true; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); +} + +TEST(EncodeTest, AddFrameBeforeBasicInfoTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + size_t xsize = 64; + size_t ysize = 64; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + jxl::CodecInOut input_io = + jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize); + + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, + /*is_gray=*/pixel_format.num_channels < 3); + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); +} + +TEST(EncodeTest, DefaultAllocTest) { + JxlEncoder* enc = JxlEncoderCreate(nullptr); + EXPECT_NE(nullptr, enc); + JxlEncoderDestroy(enc); +} + +TEST(EncodeTest, CustomAllocTest) { + struct CalledCounters { + int allocs = 0; + int frees = 0; + } counters; + + JxlMemoryManager mm; + mm.opaque = &counters; + mm.alloc = [](void* opaque, size_t size) { + reinterpret_cast(opaque)->allocs++; + return malloc(size); + }; + mm.free = [](void* opaque, void* address) { + reinterpret_cast(opaque)->frees++; + free(address); + }; + + { + JxlEncoderPtr enc = JxlEncoderMake(&mm); + EXPECT_NE(nullptr, enc.get()); + EXPECT_LE(1, counters.allocs); + EXPECT_EQ(0, counters.frees); + } + EXPECT_LE(1, counters.frees); +} + +TEST(EncodeTest, DefaultParallelRunnerTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetParallelRunner(enc.get(), nullptr, nullptr)); +} + +void VerifyFrameEncoding(size_t xsize, size_t ysize, JxlEncoder* enc, + const JxlEncoderFrameSettings* frame_settings, + size_t max_compressed_size, + bool lossy_use_original_profile) { + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + jxl::CodecInOut input_io = + jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize); + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + if (frame_settings->values.lossless || lossy_use_original_profile) { + basic_info.uses_original_profile = true; + } else { + basic_info.uses_original_profile = false; + } + // 16-bit alpha means this requires level 10 + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, true); + EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetColorEncoding(enc, &color_encoding)); + JxlColorEncodingSetToSRGB(&color_encoding, false); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding)); + pixel_format.num_channels = 1; + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); + pixel_format.num_channels = 4; + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); + JxlEncoderCloseInput(enc); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + compressed.resize(next_out - compressed.data()); + EXPECT_LE(compressed.size(), max_compressed_size); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); + jxl::CodecInOut decoded_io; + EXPECT_TRUE(jxl::test::DecodeFile( + {}, jxl::Span(compressed.data(), compressed.size()), + &decoded_io)); + + EXPECT_LE( + ComputeDistance2(input_io.Main(), decoded_io.Main(), jxl::GetJxlCms()), +#if JXL_HIGH_PRECISION + 1.84); +#else + 8.7); +#endif +} + +void VerifyFrameEncoding(JxlEncoder* enc, + const JxlEncoderFrameSettings* frame_settings) { + VerifyFrameEncoding(63, 129, enc, frame_settings, 2700, + /*lossy_use_original_profile=*/false); +} + +TEST(EncodeTest, FrameEncodingTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + VerifyFrameEncoding(enc.get(), + JxlEncoderFrameSettingsCreate(enc.get(), nullptr)); +} + +TEST(EncodeTest, EncoderResetTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + VerifyFrameEncoding(50, 200, enc.get(), + JxlEncoderFrameSettingsCreate(enc.get(), nullptr), 4300, + false); + // Encoder should become reusable for a new image from scratch after using + // reset. + JxlEncoderReset(enc.get()); + VerifyFrameEncoding(157, 77, enc.get(), + JxlEncoderFrameSettingsCreate(enc.get(), nullptr), 2300, + false); +} + +TEST(EncodeTest, CmsTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + bool cms_called = false; + JxlCmsInterface cms = jxl::GetJxlCms(); + struct InitData { + void* original_init_data; + jpegxl_cms_init_func original_init; + bool* cms_called; + }; + InitData init_data = {/*original_init_data=*/cms.init_data, + /*original_init=*/cms.init, + /*cms_called=*/&cms_called}; + cms.init_data = &init_data; + cms.init = +[](void* raw_init_data, size_t num_threads, + size_t pixels_per_thread, const JxlColorProfile* input_profile, + const JxlColorProfile* output_profile, + float intensity_target) { + const InitData* init_data = static_cast(raw_init_data); + *init_data->cms_called = true; + return init_data->original_init(init_data->original_init_data, num_threads, + pixels_per_thread, input_profile, + output_profile, intensity_target); + }; + JxlEncoderSetCms(enc.get(), cms); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), nullptr); + JxlEncoderSetFrameLossless(frame_settings, false); + ASSERT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption(frame_settings, + JXL_ENC_FRAME_SETTING_EFFORT, 8)); + VerifyFrameEncoding(enc.get(), frame_settings); + EXPECT_TRUE(cms_called); +} + +TEST(EncodeTest, frame_settingsTest) { + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 5)); + VerifyFrameEncoding(enc.get(), frame_settings); + EXPECT_EQ(jxl::SpeedTier::kHare, enc->last_used_cparams.speed_tier); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + // Lower than currently supported values + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 0)); + // Higher than currently supported values + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 11)); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetFrameLossless(frame_settings, JXL_TRUE)); + VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3600, false); + EXPECT_EQ(true, enc->last_used_cparams.IsLossless()); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetFrameDistance(frame_settings, 0.5)); + VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3030, false); + EXPECT_EQ(0.5, enc->last_used_cparams.butteraugli_distance); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + // Disallowed negative distance + EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetFrameDistance(frame_settings, -1)); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_DECODING_SPEED, 2)); + VerifyFrameEncoding(enc.get(), frame_settings); + EXPECT_EQ(2u, enc->last_used_cparams.decoding_speed_tier); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_GROUP_ORDER, 100)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_GROUP_ORDER, 1)); + EXPECT_EQ( + JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X, 5)); + VerifyFrameEncoding(enc.get(), frame_settings); + EXPECT_EQ(true, enc->last_used_cparams.centerfirst); + EXPECT_EQ(5, enc->last_used_cparams.center_x); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_RESPONSIVE, 0)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC, -1)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2)); + VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 2830, + /*lossy_use_original_profile=*/false); + EXPECT_EQ(false, enc->last_used_cparams.responsive); + EXPECT_EQ(true, enc->last_used_cparams.progressive_mode); + EXPECT_EQ(2, enc->last_used_cparams.progressive_dc); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ( + JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetFloatOption( + frame_settings, JXL_ENC_FRAME_SETTING_PHOTON_NOISE, 1777.777)); + VerifyFrameEncoding(enc.get(), frame_settings); + EXPECT_NEAR(1777.777f, enc->last_used_cparams.photon_noise_iso, 1E-4); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetFloatOption( + frame_settings, + JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT, 55.0f)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetFloatOption( + frame_settings, + JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT, 25.0f)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_PALETTE_COLORS, 70000)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_LOSSY_PALETTE, 1)); + VerifyFrameEncoding(enc.get(), frame_settings); + EXPECT_NEAR(55.0f, + enc->last_used_cparams.channel_colors_pre_transform_percent, + 1E-6); + EXPECT_NEAR(25.0f, enc->last_used_cparams.channel_colors_percent, 1E-6); + EXPECT_EQ(70000, enc->last_used_cparams.palette_colors); + EXPECT_EQ(true, enc->last_used_cparams.lossy_palette); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ( + JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE, 30)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE, 2)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 14)); + EXPECT_EQ( + JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetFloatOption( + frame_settings, + JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT, 77.0f)); + EXPECT_EQ( + JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS, 7)); + VerifyFrameEncoding(enc.get(), frame_settings); + EXPECT_EQ(30, enc->last_used_cparams.colorspace); + EXPECT_EQ(2, enc->last_used_cparams.modular_group_size_shift); + EXPECT_EQ(jxl::Predictor::Best, enc->last_used_cparams.options.predictor); + EXPECT_NEAR(0.77f, enc->last_used_cparams.options.nb_repeats, 1E-6); + EXPECT_EQ(7, enc->last_used_cparams.options.max_properties); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL, 0)); + VerifyFrameEncoding(enc.get(), frame_settings); + EXPECT_EQ(false, enc->last_used_cparams.force_cfl_jpeg_recompression); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL, 1)); + VerifyFrameEncoding(enc.get(), frame_settings); + EXPECT_EQ(true, enc->last_used_cparams.force_cfl_jpeg_recompression); + } +} + +TEST(EncodeTest, LossyEncoderUseOriginalProfileTest) { + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + ASSERT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 7897, true); + } + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + ASSERT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2)); + VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 8310, true); + } + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + ASSERT_NE(nullptr, enc.get()); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + ASSERT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 8)); + VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 7173, true); + } +} + +namespace { +// Returns a copy of buf from offset to offset+size, or a new zeroed vector if +// the result would have been out of bounds taking integer overflow into +// account. +std::vector SliceSpan(const jxl::Span& buf, + size_t offset, size_t size) { + if (offset + size >= buf.size()) { + return std::vector(size, 0); + } + if (offset + size < offset) { + return std::vector(size, 0); + } + return std::vector(buf.data() + offset, buf.data() + offset + size); +} + +struct Box { + // The type of the box. + // If "uuid", use extended_type instead + char type[4] = {0, 0, 0, 0}; + + // The extended_type is only used when type == "uuid". + // Extended types are not used in JXL. However, the box format itself + // supports this so they are handled correctly. + char extended_type[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + // Box data. + jxl::Span data = jxl::Span(nullptr, 0); + + // If the size is not given, the datasize extends to the end of the file. + // If this field is false, the size field is not encoded when the box is + // serialized. + bool data_size_given = true; + + // If successful, returns true and sets `in` to be the rest data (if any). + // If `in` contains a box with a size larger than `in.size()`, will not + // modify `in`, and will return true but the data `Span` will + // remain set to nullptr. + // If unsuccessful, returns error and doesn't modify `in`. + jxl::Status Decode(jxl::Span* in) { + // Total box_size including this header itself. + uint64_t box_size = LoadBE32(SliceSpan(*in, 0, 4).data()); + size_t pos = 4; + + memcpy(type, SliceSpan(*in, pos, 4).data(), 4); + pos += 4; + + if (box_size == 1) { + // If the size is 1, it indicates extended size read from 64-bit integer. + box_size = LoadBE64(SliceSpan(*in, pos, 8).data()); + pos += 8; + } + + if (!memcmp("uuid", type, 4)) { + memcpy(extended_type, SliceSpan(*in, pos, 16).data(), 16); + pos += 16; + } + + // This is the end of the box header, the box data begins here. Handle + // the data size now. + const size_t header_size = pos; + + if (box_size != 0) { + if (box_size < header_size) { + return JXL_FAILURE("Invalid box size"); + } + if (box_size > in->size()) { + // The box is fine, but the input is too short. + return true; + } + data_size_given = true; + data = jxl::Span(in->data() + header_size, + box_size - header_size); + } else { + data_size_given = false; + data = jxl::Span(in->data() + header_size, + in->size() - header_size); + } + + *in = jxl::Span(in->data() + header_size + data.size(), + in->size() - header_size - data.size()); + return true; + } +}; + +struct Container { + std::vector boxes; + + // If successful, returns true and sets `in` to be the rest data (if any). + // If unsuccessful, returns error and doesn't modify `in`. + jxl::Status Decode(jxl::Span* in) { + boxes.clear(); + + Box signature_box; + JXL_RETURN_IF_ERROR(signature_box.Decode(in)); + if (memcmp("JXL ", signature_box.type, 4) != 0) { + return JXL_FAILURE("Invalid magic signature"); + } + if (signature_box.data.size() != 4) + return JXL_FAILURE("Invalid magic signature"); + if (signature_box.data[0] != 0xd || signature_box.data[1] != 0xa || + signature_box.data[2] != 0x87 || signature_box.data[3] != 0xa) { + return JXL_FAILURE("Invalid magic signature"); + } + + Box ftyp_box; + JXL_RETURN_IF_ERROR(ftyp_box.Decode(in)); + if (memcmp("ftyp", ftyp_box.type, 4) != 0) { + return JXL_FAILURE("Invalid ftyp"); + } + if (ftyp_box.data.size() != 12) return JXL_FAILURE("Invalid ftyp"); + const char* expected = "jxl \0\0\0\0jxl "; + if (memcmp(expected, ftyp_box.data.data(), 12) != 0) + return JXL_FAILURE("Invalid ftyp"); + + while (!in->empty()) { + Box box = {}; + JXL_RETURN_IF_ERROR(box.Decode(in)); + if (box.data.data() == nullptr) { + // The decoding encountered a box, but not enough data yet. + return true; + } + boxes.emplace_back(box); + } + + return true; + } +}; + +} // namespace + +TEST(EncodeTest, SingleFrameBoundedJXLCTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc.get(), true)); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + + size_t xsize = 71; + size_t ysize = 23; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = false; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, + /*is_gray=*/false); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); + JxlEncoderCloseInput(enc.get()); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + compressed.resize(next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); + + Container container = {}; + jxl::Span encoded_span = + jxl::Span(compressed.data(), compressed.size()); + EXPECT_TRUE(container.Decode(&encoded_span)); + EXPECT_EQ(0u, encoded_span.size()); + bool found_jxlc = false; + bool found_jxlp = false; + // The encoder is allowed to either emit a jxlc or one or more jxlp. + for (size_t i = 0; i < container.boxes.size(); ++i) { + if (memcmp("jxlc", container.boxes[i].type, 4) == 0) { + EXPECT_EQ(false, found_jxlc); // Max 1 jxlc + EXPECT_EQ(false, found_jxlp); // Can't mix jxlc and jxlp + found_jxlc = true; + } + if (memcmp("jxlp", container.boxes[i].type, 4) == 0) { + EXPECT_EQ(false, found_jxlc); // Can't mix jxlc and jxlp + found_jxlp = true; + } + // The encoder shouldn't create an unbounded box in this case, with the + // single frame it knows the full size in time, so can help make decoding + // more efficient by giving the full box size of the final box. + EXPECT_EQ(true, container.boxes[i].data_size_given); + } + EXPECT_EQ(true, found_jxlc || found_jxlp); +} + +TEST(EncodeTest, CodestreamLevelTest) { + size_t xsize = 64; + size_t ysize = 64; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + jxl::CodecInOut input_io = + jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize); + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = false; + + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10)); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, + /*is_gray=*/pixel_format.num_channels < 3); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); + JxlEncoderCloseInput(enc.get()); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + compressed.resize(next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); + + Container container = {}; + jxl::Span encoded_span = + jxl::Span(compressed.data(), compressed.size()); + EXPECT_TRUE(container.Decode(&encoded_span)); + EXPECT_EQ(0u, encoded_span.size()); + EXPECT_EQ(0, memcmp("jxll", container.boxes[0].type, 4)); +} + +TEST(EncodeTest, CodestreamLevelVerificationTest) { + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0}; + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = 64; + basic_info.ysize = 64; + basic_info.uses_original_profile = false; + + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + + EXPECT_EQ(5, JxlEncoderGetRequiredCodestreamLevel(enc.get())); + + // Set an image dimension that is too large for level 5, but fits in level 10 + + basic_info.xsize = 1ull << 30ull; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 5)); + EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + EXPECT_EQ(10, JxlEncoderGetRequiredCodestreamLevel(enc.get())); + + // Set an image dimension that is too large even for level 10 + + basic_info.xsize = 1ull << 31ull; + EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); +} + +TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) { + const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg"; + const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path); + + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size())); + JxlEncoderCloseInput(enc.get()); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + compressed.resize(next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); + + jxl::extras::JXLDecompressParams dparams; + std::vector decoded_jpeg_bytes; + jxl::extras::PackedPixelFile ppf; + EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams, + nullptr, &ppf, &decoded_jpeg_bytes)); + + EXPECT_EQ(decoded_jpeg_bytes.size(), orig.size()); + EXPECT_EQ(0, memcmp(decoded_jpeg_bytes.data(), orig.data(), orig.size())); +} + +TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(ProgressiveJPEGReconstructionTest)) { + const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg"; + const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path); + + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + + frame_settings->values.cparams.progressive_mode = true; + + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size())); + JxlEncoderCloseInput(enc.get()); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + compressed.resize(next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); + + jxl::extras::JXLDecompressParams dparams; + std::vector decoded_jpeg_bytes; + jxl::extras::PackedPixelFile ppf; + EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams, + nullptr, &ppf, &decoded_jpeg_bytes)); + + EXPECT_EQ(decoded_jpeg_bytes.size(), orig.size()); + EXPECT_EQ(0, memcmp(decoded_jpeg_bytes.data(), orig.data(), orig.size())); +} + +static void ProcessEncoder(JxlEncoder* enc, std::vector& compressed, + uint8_t*& next_out, size_t& avail_out) { + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + size_t offset = next_out - compressed.data(); + compressed.resize(next_out - compressed.data()); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); +} + +TEST(EncodeTest, BasicInfoTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + size_t xsize = 1; + size_t ysize = 1; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = false; + basic_info.have_animation = true; + basic_info.intensity_target = 123.4; + basic_info.min_nits = 5.0; + basic_info.linear_below = 12.7; + basic_info.orientation = JXL_ORIENT_ROTATE_90_CW; + basic_info.intrinsic_xsize = 88; + basic_info.intrinsic_ysize = 99; + basic_info.animation.tps_numerator = 55; + basic_info.animation.tps_denominator = 77; + basic_info.animation.num_loops = 10; + basic_info.animation.have_timecodes = JXL_TRUE; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); + JxlEncoderCloseFrames(enc.get()); + ProcessEncoder(enc.get(), compressed, next_out, avail_out); + + // Decode to verify the boxes, we don't decode to pixels, only the boxes. + JxlDecoderPtr dec = JxlDecoderMake(nullptr); + EXPECT_NE(nullptr, dec.get()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_BASIC_INFO)); + // Allow testing the orientation field, without this setting it will be + // overridden to identity. + JxlDecoderSetKeepOrientation(dec.get(), JXL_TRUE); + JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size()); + JxlDecoderCloseInput(dec.get()); + + for (;;) { + JxlDecoderStatus status = JxlDecoderProcessInput(dec.get()); + if (status == JXL_DEC_ERROR) { + FAIL(); + } else if (status == JXL_DEC_SUCCESS) { + break; + } else if (status == JXL_DEC_BASIC_INFO) { + JxlBasicInfo basic_info2; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetBasicInfo(dec.get(), &basic_info2)); + EXPECT_EQ(basic_info.xsize, basic_info2.xsize); + EXPECT_EQ(basic_info.ysize, basic_info2.ysize); + EXPECT_EQ(basic_info.bits_per_sample, basic_info2.bits_per_sample); + EXPECT_EQ(basic_info.exponent_bits_per_sample, + basic_info2.exponent_bits_per_sample); + EXPECT_NEAR(basic_info.intensity_target, basic_info2.intensity_target, + 0.5); + EXPECT_NEAR(basic_info.min_nits, basic_info2.min_nits, 0.5); + EXPECT_NEAR(basic_info.linear_below, basic_info2.linear_below, 0.5); + EXPECT_EQ(basic_info.relative_to_max_display, + basic_info2.relative_to_max_display); + EXPECT_EQ(basic_info.uses_original_profile, + basic_info2.uses_original_profile); + EXPECT_EQ(basic_info.orientation, basic_info2.orientation); + EXPECT_EQ(basic_info.intrinsic_xsize, basic_info2.intrinsic_xsize); + EXPECT_EQ(basic_info.intrinsic_ysize, basic_info2.intrinsic_ysize); + EXPECT_EQ(basic_info.num_color_channels, basic_info2.num_color_channels); + // TODO(lode): also test num_extra_channels, but currently there may be a + // mismatch between 0 and 1 if there is alpha, until encoder support for + // extra channels is fully implemented. + EXPECT_EQ(basic_info.alpha_bits, basic_info2.alpha_bits); + EXPECT_EQ(basic_info.alpha_exponent_bits, + basic_info2.alpha_exponent_bits); + EXPECT_EQ(basic_info.alpha_premultiplied, + basic_info2.alpha_premultiplied); + + EXPECT_EQ(basic_info.have_preview, basic_info2.have_preview); + if (basic_info.have_preview) { + EXPECT_EQ(basic_info.preview.xsize, basic_info2.preview.xsize); + EXPECT_EQ(basic_info.preview.ysize, basic_info2.preview.ysize); + } + + EXPECT_EQ(basic_info.have_animation, basic_info2.have_animation); + if (basic_info.have_animation) { + EXPECT_EQ(basic_info.animation.tps_numerator, + basic_info2.animation.tps_numerator); + EXPECT_EQ(basic_info.animation.tps_denominator, + basic_info2.animation.tps_denominator); + EXPECT_EQ(basic_info.animation.num_loops, + basic_info2.animation.num_loops); + EXPECT_EQ(basic_info.animation.have_timecodes, + basic_info2.animation.have_timecodes); + } + } else { + FAIL(); // unexpected status + } + } +} + +TEST(EncodeTest, AnimationHeaderTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + size_t xsize = 1; + size_t ysize = 1; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.have_animation = true; + basic_info.animation.tps_numerator = 1000; + basic_info.animation.tps_denominator = 1; + basic_info.animation.have_timecodes = JXL_TRUE; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + + std::string frame_name = "test frame"; + JxlFrameHeader header; + JxlEncoderInitFrameHeader(&header); + header.duration = 50; + header.timecode = 800; + header.layer_info.blend_info.blendmode = JXL_BLEND_BLEND; + header.layer_info.blend_info.source = 2; + header.layer_info.blend_info.clamp = 1; + JxlBlendInfo extra_channel_blend_info; + JxlEncoderInitBlendInfo(&extra_channel_blend_info); + extra_channel_blend_info.blendmode = JXL_BLEND_MULADD; + JxlEncoderSetFrameHeader(frame_settings, &header); + JxlEncoderSetExtraChannelBlendInfo(frame_settings, 0, + &extra_channel_blend_info); + JxlEncoderSetFrameName(frame_settings, frame_name.c_str()); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); + JxlEncoderCloseFrames(enc.get()); + ProcessEncoder(enc.get(), compressed, next_out, avail_out); + + // Decode to verify the boxes, we don't decode to pixels, only the boxes. + JxlDecoderPtr dec = JxlDecoderMake(nullptr); + EXPECT_NE(nullptr, dec.get()); + + // To test the blend_info fields, coalescing must be set to false in the + // decoder. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec.get(), JXL_FALSE)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_FRAME)); + JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size()); + JxlDecoderCloseInput(dec.get()); + + bool seen_frame = false; + + for (;;) { + JxlDecoderStatus status = JxlDecoderProcessInput(dec.get()); + if (status == JXL_DEC_ERROR) { + FAIL(); + } else if (status == JXL_DEC_SUCCESS) { + break; + } else if (status == JXL_DEC_FRAME) { + seen_frame = true; + JxlFrameHeader header2; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec.get(), &header2)); + EXPECT_EQ(header.duration, header2.duration); + EXPECT_EQ(header.timecode, header2.timecode); + EXPECT_EQ(header.layer_info.blend_info.blendmode, + header2.layer_info.blend_info.blendmode); + EXPECT_EQ(header.layer_info.blend_info.clamp, + header2.layer_info.blend_info.clamp); + EXPECT_EQ(header.layer_info.blend_info.source, + header2.layer_info.blend_info.source); + EXPECT_EQ(frame_name.size(), header2.name_length); + JxlBlendInfo extra_channel_blend_info2; + JxlDecoderGetExtraChannelBlendInfo(dec.get(), 0, + &extra_channel_blend_info2); + EXPECT_EQ(extra_channel_blend_info.blendmode, + extra_channel_blend_info2.blendmode); + if (header2.name_length > 0) { + std::string frame_name2(header2.name_length + 1, '\0'); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetFrameName(dec.get(), &frame_name2.front(), + frame_name2.size())); + frame_name2.resize(header2.name_length); + EXPECT_EQ(frame_name, frame_name2); + } + } else { + FAIL(); // unexpected status + } + } + + EXPECT_EQ(true, seen_frame); +} +TEST(EncodeTest, CroppedFrameTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + size_t xsize = 300; + size_t ysize = 300; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + std::vector pixels2(pixels.size()); + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + // Encoding a 300x300 frame in an image that is only 100x100 + basic_info.xsize = 100; + basic_info.ysize = 100; + basic_info.uses_original_profile = JXL_TRUE; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + + JxlFrameHeader header; + JxlEncoderInitFrameHeader(&header); + header.layer_info.have_crop = JXL_TRUE; + header.layer_info.xsize = xsize; + header.layer_info.ysize = ysize; + header.layer_info.crop_x0 = -50; + header.layer_info.crop_y0 = -250; + JxlEncoderSetFrameLossless(frame_settings, JXL_TRUE); + JxlEncoderSetFrameHeader(frame_settings, &header); + JxlEncoderFrameSettingsSetOption(frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, + 1); + + std::vector compressed = std::vector(100); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); + JxlEncoderCloseFrames(enc.get()); + ProcessEncoder(enc.get(), compressed, next_out, avail_out); + + JxlDecoderPtr dec = JxlDecoderMake(nullptr); + EXPECT_NE(nullptr, dec.get()); + // Non-coalesced decoding so we can get the full uncropped frame + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec.get(), JXL_FALSE)); + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size()); + JxlDecoderCloseInput(dec.get()); + + bool seen_frame = false; + bool checked_frame = false; + for (;;) { + JxlDecoderStatus status = JxlDecoderProcessInput(dec.get()); + if (status == JXL_DEC_ERROR) { + FAIL(); + } else if (status == JXL_DEC_SUCCESS) { + break; + } else if (status == JXL_DEC_FRAME) { + seen_frame = true; + JxlFrameHeader header2; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec.get(), &header2)); + EXPECT_EQ(header.layer_info.xsize, header2.layer_info.xsize); + EXPECT_EQ(header.layer_info.ysize, header2.layer_info.ysize); + EXPECT_EQ(header.layer_info.crop_x0, header2.layer_info.crop_x0); + EXPECT_EQ(header.layer_info.crop_y0, header2.layer_info.crop_y0); + } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec.get(), &pixel_format, + pixels2.data(), pixels2.size())); + } else if (status == JXL_DEC_FULL_IMAGE) { + EXPECT_EQ(0, memcmp(pixels.data(), pixels2.data(), pixels.size())); + checked_frame = true; + } else { + FAIL(); // unexpected status + } + } + EXPECT_EQ(true, checked_frame); + EXPECT_EQ(true, seen_frame); +} + +TEST(EncodeTest, JXL_BOXES_TEST(BoxTest)) { + // Test with uncompressed boxes and with brob boxes + for (int compress_box = 0; compress_box <= 1; ++compress_box) { + // Tests adding two metadata boxes with the encoder: an exif box before the + // image frame, and an xml box after the image frame. Then verifies the + // decoder can decode them, they are in the expected place, and have the + // correct content after decoding. + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseBoxes(enc.get())); + + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + size_t xsize = 50; + size_t ysize = 17; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = false; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, + /*is_gray=*/false); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + + // Add an early metadata box. Also add a valid 4-byte TIFF offset header + // before the fake exif data of these box contents. + constexpr const char* exif_test_string = "\0\0\0\0exif test data"; + const uint8_t* exif_data = + reinterpret_cast(exif_test_string); + // Skip the 4 zeroes for strlen + const size_t exif_size = 4 + strlen(exif_test_string + 4); + JxlEncoderAddBox(enc.get(), "Exif", exif_data, exif_size, compress_box); + + // Write to output + ProcessEncoder(enc.get(), compressed, next_out, avail_out); + + // Add image frame + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + pixels.data(), pixels.size())); + // Indicate this is the last frame + JxlEncoderCloseFrames(enc.get()); + + // Write to output + ProcessEncoder(enc.get(), compressed, next_out, avail_out); + + // Add a late metadata box + constexpr const char* xml_test_string = ""; + const uint8_t* xml_data = reinterpret_cast(xml_test_string); + size_t xml_size = strlen(xml_test_string); + JxlEncoderAddBox(enc.get(), "XML ", xml_data, xml_size, compress_box); + + // Indicate this is the last box + JxlEncoderCloseBoxes(enc.get()); + + // Write to output + ProcessEncoder(enc.get(), compressed, next_out, avail_out); + + // Decode to verify the boxes, we don't decode to pixels, only the boxes. + JxlDecoderPtr dec = JxlDecoderMake(nullptr); + EXPECT_NE(nullptr, dec.get()); + + if (compress_box) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetDecompressBoxes(dec.get(), JXL_TRUE)); + } + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents( + dec.get(), JXL_DEC_FRAME | JXL_DEC_BOX)); + + JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size()); + JxlDecoderCloseInput(dec.get()); + + std::vector dec_exif_box(exif_size); + std::vector dec_xml_box(xml_size); + + for (bool post_frame = false;;) { + JxlDecoderStatus status = JxlDecoderProcessInput(dec.get()); + if (status == JXL_DEC_ERROR) { + FAIL(); + } else if (status == JXL_DEC_SUCCESS) { + EXPECT_EQ(0, JxlDecoderReleaseBoxBuffer(dec.get())); + break; + } else if (status == JXL_DEC_FRAME) { + post_frame = true; + } else if (status == JXL_DEC_BOX) { + // Since we gave the exif/xml box output buffer of the exact known + // correct size, 0 bytes should be released. Same when no buffer was + // set. + EXPECT_EQ(0, JxlDecoderReleaseBoxBuffer(dec.get())); + JxlBoxType type; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec.get(), type, true)); + if (!memcmp(type, "Exif", 4)) { + // This box should have been encoded before the image frame + EXPECT_EQ(false, post_frame); + JxlDecoderSetBoxBuffer(dec.get(), dec_exif_box.data(), + dec_exif_box.size()); + } else if (!memcmp(type, "XML ", 4)) { + // This box should have been encoded after the image frame + EXPECT_EQ(true, post_frame); + JxlDecoderSetBoxBuffer(dec.get(), dec_xml_box.data(), + dec_xml_box.size()); + } + } else { + FAIL(); // unexpected status + } + } + + EXPECT_EQ(0, memcmp(exif_data, dec_exif_box.data(), exif_size)); + EXPECT_EQ(0, memcmp(xml_data, dec_xml_box.data(), xml_size)); + } +} + +#if JPEGXL_ENABLE_JPEG // Loading .jpg files requires libjpeg support. +TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGFrameTest)) { + for (int skip_basic_info = 0; skip_basic_info < 2; skip_basic_info++) { + for (int skip_color_encoding = 0; skip_color_encoding < 2; + skip_color_encoding++) { + // cannot set color encoding if basic info is not set + if (skip_basic_info && !skip_color_encoding) continue; + const std::string jpeg_path = "jxl/flower/flower_cropped.jpg"; + const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path); + jxl::CodecInOut orig_io; + ASSERT_TRUE(SetFromBytes(jxl::Span(orig), &orig_io, + /*pool=*/nullptr)); + + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + JxlEncoderFrameSettingsSetOption(frame_settings, + JXL_ENC_FRAME_SETTING_EFFORT, 1); + if (!skip_basic_info) { + JxlBasicInfo basic_info; + JxlEncoderInitBasicInfo(&basic_info); + basic_info.xsize = orig_io.xsize(); + basic_info.ysize = orig_io.ysize(); + basic_info.uses_original_profile = true; + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + } + if (!skip_color_encoding) { + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + } + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderAddJPEGFrame( + frame_settings, orig.data(), orig.size())); + JxlEncoderCloseInput(enc.get()); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = + JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + compressed.resize(next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); + + jxl::CodecInOut decoded_io; + EXPECT_TRUE(jxl::test::DecodeFile( + {}, jxl::Span(compressed.data(), compressed.size()), + &decoded_io)); + + EXPECT_LE( + ComputeDistance2(orig_io.Main(), decoded_io.Main(), jxl::GetJxlCms()), + 3.5); + } + } +} +#endif // JPEGXL_ENABLE_JPEG diff --git a/third_party/jpeg-xl/lib/jxl/entropy_coder.cc b/third_party/jpeg-xl/lib/jxl/entropy_coder.cc new file mode 100644 index 0000000000..0043c2d31e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/entropy_coder.cc @@ -0,0 +1,70 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/entropy_coder.h" + +#include +#include + +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_context_map.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +Status DecodeBlockCtxMap(BitReader* br, BlockCtxMap* block_ctx_map) { + auto& dct = block_ctx_map->dc_thresholds; + auto& qft = block_ctx_map->qf_thresholds; + auto& ctx_map = block_ctx_map->ctx_map; + bool is_default = br->ReadFixedBits<1>(); + if (is_default) { + *block_ctx_map = BlockCtxMap(); + return true; + } + block_ctx_map->num_dc_ctxs = 1; + for (int j : {0, 1, 2}) { + dct[j].resize(br->ReadFixedBits<4>()); + block_ctx_map->num_dc_ctxs *= dct[j].size() + 1; + for (int& i : dct[j]) { + i = UnpackSigned(U32Coder::Read(kDCThresholdDist, br)); + } + } + qft.resize(br->ReadFixedBits<4>()); + for (uint32_t& i : qft) { + i = U32Coder::Read(kQFThresholdDist, br) + 1; + } + + if (block_ctx_map->num_dc_ctxs * (qft.size() + 1) > 64) { + return JXL_FAILURE("Invalid block context map: too big"); + } + + ctx_map.resize(3 * kNumOrders * block_ctx_map->num_dc_ctxs * + (qft.size() + 1)); + JXL_RETURN_IF_ERROR(DecodeContextMap(&ctx_map, &block_ctx_map->num_ctxs, br)); + if (block_ctx_map->num_ctxs > 16) { + return JXL_FAILURE("Invalid block context map: too many distinct contexts"); + } + return true; +} + +constexpr uint8_t BlockCtxMap::kDefaultCtxMap[]; // from ac_context.h + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/entropy_coder.h b/third_party/jpeg-xl/lib/jxl/entropy_coder.h new file mode 100644 index 0000000000..e4afa7a631 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/entropy_coder.h @@ -0,0 +1,45 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENTROPY_CODER_H_ +#define LIB_JXL_ENTROPY_CODER_H_ + +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/field_encodings.h" + +// Entropy coding and context modeling of DC and AC coefficients, as well as AC +// strategy and quantization field. + +namespace jxl { + +static JXL_INLINE int32_t PredictFromTopAndLeft( + const int32_t* const JXL_RESTRICT row_top, + const int32_t* const JXL_RESTRICT row, size_t x, int32_t default_val) { + if (x == 0) { + return row_top == nullptr ? default_val : row_top[x]; + } + if (row_top == nullptr) { + return row[x - 1]; + } + return (row_top[x] + row[x - 1] + 1) / 2; +} + +static constexpr U32Enc kDCThresholdDist(Bits(4), BitsOffset(8, 16), + BitsOffset(16, 272), + BitsOffset(32, 65808)); + +static constexpr U32Enc kQFThresholdDist(Bits(2), BitsOffset(3, 4), + BitsOffset(5, 12), BitsOffset(8, 44)); + +Status DecodeBlockCtxMap(BitReader* br, BlockCtxMap* block_ctx_map); + +} // namespace jxl + +#endif // LIB_JXL_ENTROPY_CODER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc b/third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc new file mode 100644 index 0000000000..9dbeb137af --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc @@ -0,0 +1,68 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// TODO(deymo): Move these tests to dec_ans.h and common.h + +#include + +#include "lib/jxl/base/random.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +TEST(EntropyCoderTest, PackUnpack) { + for (int32_t i = -31; i < 32; ++i) { + uint32_t packed = PackSigned(i); + EXPECT_LT(packed, 63u); + int32_t unpacked = UnpackSigned(packed); + EXPECT_EQ(i, unpacked); + } +} + +struct DummyBitReader { + uint32_t nbits, bits; + void Consume(uint32_t nbits) {} + uint32_t PeekBits(uint32_t n) { + EXPECT_EQ(n, nbits); + return bits; + } +}; + +void HybridUintRoundtrip(HybridUintConfig config, size_t limit = 1 << 24) { + Rng rng(0); + constexpr size_t kNumIntegers = 1 << 20; + std::vector integers(kNumIntegers); + std::vector token(kNumIntegers); + std::vector nbits(kNumIntegers); + std::vector bits(kNumIntegers); + for (size_t i = 0; i < kNumIntegers; i++) { + integers[i] = rng.UniformU(0, limit + 1); + config.Encode(integers[i], &token[i], &nbits[i], &bits[i]); + } + for (size_t i = 0; i < kNumIntegers; i++) { + DummyBitReader br{nbits[i], bits[i]}; + EXPECT_EQ(integers[i], + ANSSymbolReader::ReadHybridUintConfig(config, token[i], &br)); + } +} + +TEST(HybridUintTest, Test000) { + HybridUintRoundtrip(HybridUintConfig{0, 0, 0}); +} +TEST(HybridUintTest, Test411) { + HybridUintRoundtrip(HybridUintConfig{4, 1, 1}); +} +TEST(HybridUintTest, Test420) { + HybridUintRoundtrip(HybridUintConfig{4, 2, 0}); +} +TEST(HybridUintTest, Test421) { + HybridUintRoundtrip(HybridUintConfig{4, 2, 1}, 256); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/epf.cc b/third_party/jpeg-xl/lib/jxl/epf.cc new file mode 100644 index 0000000000..7288ed9ca6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/epf.cc @@ -0,0 +1,146 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Edge-preserving smoothing: weighted average based on L1 patch similarity. + +#include "lib/jxl/epf.h" + +#include +#include +#include +#include +#include + +#include +#include +#include // std::accumulate +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +// Mirror n floats starting at *p and store them before p. +JXL_INLINE void LeftMirror(float* p, size_t n) { + for (size_t i = 0; i < n; i++) { + *(p - 1 - i) = p[i]; + } +} + +// Mirror n floats starting at *(p - n) and store them at *p. +JXL_INLINE void RightMirror(float* p, size_t n) { + for (size_t i = 0; i < n; i++) { + p[i] = *(p - 1 - i); + } +} + +void ComputeSigma(const Rect& block_rect, PassesDecoderState* state) { + const LoopFilter& lf = state->shared->frame_header.loop_filter; + JXL_CHECK(lf.epf_iters > 0); + const AcStrategyImage& ac_strategy = state->shared->ac_strategy; + const float quant_scale = state->shared->quantizer.Scale(); + + const size_t sigma_stride = state->sigma.PixelsPerRow(); + const size_t sharpness_stride = state->shared->epf_sharpness.PixelsPerRow(); + + for (size_t by = 0; by < block_rect.ysize(); ++by) { + float* JXL_RESTRICT sigma_row = block_rect.Row(&state->sigma, by); + const uint8_t* JXL_RESTRICT sharpness_row = + block_rect.ConstRow(state->shared->epf_sharpness, by); + AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by); + const int32_t* const JXL_RESTRICT row_quant = + block_rect.ConstRow(state->shared->raw_quant_field, by); + + for (size_t bx = 0; bx < block_rect.xsize(); bx++) { + AcStrategy acs = acs_row[bx]; + size_t llf_x = acs.covered_blocks_x(); + if (!acs.IsFirstBlock()) continue; + // quant_scale is smaller for low quality. + // quant_scale is roughly 0.08 / butteraugli score. + // + // row_quant is smaller for low quality. + // row_quant is a quantization multiplier of form 1.0 / + // row_quant[bx] + // + // lf.epf_quant_mul is a parameter in the format + // kInvSigmaNum is a constant + float sigma_quant = + lf.epf_quant_mul / (quant_scale * row_quant[bx] * kInvSigmaNum); + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + float sigma = + sigma_quant * + lf.epf_sharp_lut[sharpness_row[bx + ix + iy * sharpness_stride]]; + // Avoid infinities. + sigma = std::min(-1e-4f, sigma); // TODO(veluca): remove this. + sigma_row[bx + ix + kSigmaPadding + + (iy + kSigmaPadding) * sigma_stride] = 1.0f / sigma; + } + } + // TODO(veluca): remove this padding. + // Left padding with mirroring. + if (bx + block_rect.x0() == 0) { + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + LeftMirror( + sigma_row + kSigmaPadding + (iy + kSigmaPadding) * sigma_stride, + kSigmaBorder); + } + } + // Right padding with mirroring. + if (bx + block_rect.x0() + llf_x == + state->shared->frame_dim.xsize_blocks) { + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + RightMirror(sigma_row + kSigmaPadding + bx + llf_x + + (iy + kSigmaPadding) * sigma_stride, + kSigmaBorder); + } + } + // Offsets for row copying, in blocks. + size_t offset_before = bx + block_rect.x0() == 0 ? 1 : bx + kSigmaPadding; + size_t offset_after = + bx + block_rect.x0() + llf_x == state->shared->frame_dim.xsize_blocks + ? kSigmaPadding + llf_x + bx + kSigmaBorder + : kSigmaPadding + llf_x + bx; + size_t num = offset_after - offset_before; + // Above + if (by + block_rect.y0() == 0) { + for (size_t iy = 0; iy < kSigmaBorder; iy++) { + memcpy( + sigma_row + offset_before + + (kSigmaPadding - 1 - iy) * sigma_stride, + sigma_row + offset_before + (kSigmaPadding + iy) * sigma_stride, + num * sizeof(*sigma_row)); + } + } + // Below + if (by + block_rect.y0() + acs.covered_blocks_y() == + state->shared->frame_dim.ysize_blocks) { + for (size_t iy = 0; iy < kSigmaBorder; iy++) { + memcpy( + sigma_row + offset_before + + sigma_stride * (acs.covered_blocks_y() + kSigmaPadding + iy), + sigma_row + offset_before + + sigma_stride * + (acs.covered_blocks_y() + kSigmaPadding - 1 - iy), + num * sizeof(*sigma_row)); + } + } + } + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/epf.h b/third_party/jpeg-xl/lib/jxl/epf.h new file mode 100644 index 0000000000..7a0834ed97 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/epf.h @@ -0,0 +1,33 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_EPF_H_ +#define LIB_JXL_EPF_H_ + +// Fast SIMD "in-loop" edge preserving filter (adaptive, nonlinear). + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/passes_state.h" + +namespace jxl { + +// 4 * (sqrt(0.5)-1), so that Weight(sigma) = 0.5. +static constexpr float kInvSigmaNum = -1.1715728752538099024f; + +// kInvSigmaNum / 0.3 +constexpr float kMinSigma = -3.90524291751269967465540850526868f; + +// Fills the `state->filter_weights.sigma` image with the precomputed sigma +// values in the area inside `block_rect`. Accesses the AC strategy, quant field +// and epf_sharpness fields in the corresponding positions. +void ComputeSigma(const Rect& block_rect, PassesDecoderState* state); + +} // namespace jxl + +#endif // LIB_JXL_EPF_H_ diff --git a/third_party/jpeg-xl/lib/jxl/exif.h b/third_party/jpeg-xl/lib/jxl/exif.h new file mode 100644 index 0000000000..0cf493fc71 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/exif.h @@ -0,0 +1,87 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_EXIF_H_ +#define LIB_JXL_EXIF_H_ + +// Basic parsing of Exif (just enough for the render-impacting things +// like orientation) + +#include + +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/image_metadata.h" + +namespace jxl { + +constexpr uint16_t kExifOrientationTag = 274; + +// Checks if a blob looks like Exif, and if so, sets bigendian +// according to the tiff endianness +inline bool IsExif(const std::vector& exif, bool* bigendian) { + if (exif.size() < 12) return false; // not enough bytes for a valid exif blob + const uint8_t* t = exif.data(); + if (LoadLE32(t) == 0x2A004D4D) { + *bigendian = true; + return true; + } else if (LoadLE32(t) == 0x002A4949) { + *bigendian = false; + return true; + } + return false; // not a valid tiff header +} + +// Finds the position of an Exif tag, or 0 if it is not found +inline size_t FindExifTagPosition(const std::vector& exif, + uint16_t tagname) { + bool bigendian; + if (!IsExif(exif, &bigendian)) return 0; + const uint8_t* t = exif.data() + 4; + uint64_t offset = (bigendian ? LoadBE32(t) : LoadLE32(t)); + if (exif.size() < 12 + offset + 2 || offset < 8) return 0; + t += offset - 4; + if (offset + 2 >= exif.size()) return 0; + uint16_t nb_tags = (bigendian ? LoadBE16(t) : LoadLE16(t)); + t += 2; + while (nb_tags > 0) { + if (t + 12 >= exif.data() + exif.size()) return 0; + uint16_t tag = (bigendian ? LoadBE16(t) : LoadLE16(t)); + t += 2; + if (tag == tagname) return static_cast(t - exif.data()); + t += 10; + nb_tags--; + } + return 0; +} + +// TODO (jon): tag 1 can be used to represent Adobe RGB 1998 if it has value +// "R03" +// TODO (jon): set intrinsic dimensions according to +// https://discourse.wicg.io/t/proposal-exif-image-resolution-auto-and-from-image/4326/24 +// Parses the Exif data just enough to extract any render-impacting info. +// If the Exif data is invalid or could not be parsed, then it is treated +// as a no-op. +inline void InterpretExif(const std::vector& exif, + JxlOrientation* orientation) { + bool bigendian; + if (!IsExif(exif, &bigendian)) return; + size_t o_pos = FindExifTagPosition(exif, kExifOrientationTag); + if (o_pos) { + const uint8_t* t = exif.data() + o_pos; + uint16_t type = (bigendian ? LoadBE16(t) : LoadLE16(t)); + t += 2; + uint32_t count = (bigendian ? LoadBE32(t) : LoadLE32(t)); + t += 4; + uint16_t value = (bigendian ? LoadBE16(t) : LoadLE16(t)); + t += 4; + if (type == 3 && count == 1 && value >= 1 && value <= 8) { + *orientation = static_cast(value); + } + } +} + +} // namespace jxl + +#endif // LIB_JXL_EXIF_H_ diff --git a/third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h b/third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h new file mode 100644 index 0000000000..508d808cc5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h @@ -0,0 +1,79 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_ +#define LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_ + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/random.h" + +namespace jxl { + +// A parallel runner implementation that runs all the jobs in a single thread +// (the caller thread) but runs them pretending to use multiple threads and +// potentially out of order. This is useful for testing conditions that only +// occur under heavy load where the order of operations is different. +class FakeParallelRunner { + public: + FakeParallelRunner(uint32_t order_seed, uint32_t num_threads) + : order_seed_(order_seed), rng_(order_seed), num_threads_(num_threads) { + if (num_threads_ < 1) num_threads_ = 1; + } + + JxlParallelRetCode Run(void* jxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start, + uint32_t end) { + JxlParallelRetCode ret = init(jxl_opaque, num_threads_); + if (ret != 0) return ret; + + if (order_seed_ == 0) { + for (uint32_t i = start; i < end; i++) { + func(jxl_opaque, i, i % num_threads_); + } + } else { + std::vector order(end - start); + for (uint32_t i = start; i < end; i++) { + order[i - start] = i; + } + rng_.Shuffle(order.data(), order.size()); + for (uint32_t i = start; i < end; i++) { + func(jxl_opaque, order[i - start], i % num_threads_); + } + } + return ret; + } + + private: + // Seed for the RNG for defining the execution order. A value of 0 means + // sequential order from start to end. + uint32_t order_seed_; + + // The PRNG object, initialized with the order_seed_. Only used if the seed is + // not 0. + Rng rng_; + + // Number of fake threads. All the tasks are run on the same thread, but using + // different thread_id values based on this num_threads. + uint32_t num_threads_; +}; + +} // namespace jxl + +extern "C" { +// Function to pass as the parallel runner. +JXL_INLINE JxlParallelRetCode JxlFakeParallelRunner( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) { + return static_cast(runner_opaque) + ->Run(jpegxl_opaque, init, func, start_range, end_range); +} +} + +#endif // LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_ diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct-inl.h new file mode 100644 index 0000000000..d2453b0e10 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct-inl.h @@ -0,0 +1,238 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JXL_FAST_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_FAST_DCT_INL_H_ +#undef LIB_JXL_FAST_DCT_INL_H_ +#else +#define LIB_JXL_FAST_DCT_INL_H_ +#endif + +#include + +#include +#include + +#include "lib/jxl/base/status.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +#if HWY_TARGET == HWY_NEON +HWY_NOINLINE void FastTransposeBlock(const int16_t* JXL_RESTRICT data_in, + size_t stride_in, size_t N, size_t M, + int16_t* JXL_RESTRICT data_out, + size_t stride_out) { + JXL_DASSERT(N % 8 == 0); + JXL_DASSERT(M % 8 == 0); + for (size_t i = 0; i < N; i += 8) { + for (size_t j = 0; j < M; j += 8) { + // TODO(veluca): one could optimize the M==8, stride_in==8 case further + // with vld4. + // This code is about 40% faster for N == M == stride_in == + // stride_out == 8 + // Using loads + stores to reshuffle things to be able to + // use vld4 doesn't help. + /* + auto a0 = vld4q_s16(data_in); auto a1 = vld4q_s16(data_in + 32); + int16x8x4_t out0; + int16x8x4_t out1; + out0.val[0] = vuzp1q_s16(a0.val[0], a1.val[0]); + out0.val[1] = vuzp1q_s16(a0.val[1], a1.val[1]); + out0.val[2] = vuzp1q_s16(a0.val[2], a1.val[2]); + out0.val[3] = vuzp1q_s16(a0.val[3], a1.val[3]); + out1.val[0] = vuzp2q_s16(a0.val[0], a1.val[0]); + out1.val[1] = vuzp2q_s16(a0.val[1], a1.val[1]); + out1.val[2] = vuzp2q_s16(a0.val[2], a1.val[2]); + out1.val[3] = vuzp2q_s16(a0.val[3], a1.val[3]); + vst1q_s16_x4(data_out, out0); + vst1q_s16_x4(data_out + 32, out1); + */ + auto a0 = vld1q_s16(data_in + i * stride_in + j); + auto a1 = vld1q_s16(data_in + (i + 1) * stride_in + j); + auto a2 = vld1q_s16(data_in + (i + 2) * stride_in + j); + auto a3 = vld1q_s16(data_in + (i + 3) * stride_in + j); + + auto a01 = vtrnq_s16(a0, a1); + auto a23 = vtrnq_s16(a2, a3); + + auto four0 = vtrnq_s32(vreinterpretq_s32_s16(a01.val[0]), + vreinterpretq_s32_s16(a23.val[0])); + auto four1 = vtrnq_s32(vreinterpretq_s32_s16(a01.val[1]), + vreinterpretq_s32_s16(a23.val[1])); + + auto a4 = vld1q_s16(data_in + (i + 4) * stride_in + j); + auto a5 = vld1q_s16(data_in + (i + 5) * stride_in + j); + auto a6 = vld1q_s16(data_in + (i + 6) * stride_in + j); + auto a7 = vld1q_s16(data_in + (i + 7) * stride_in + j); + + auto a45 = vtrnq_s16(a4, a5); + auto a67 = vtrnq_s16(a6, a7); + + auto four2 = vtrnq_s32(vreinterpretq_s32_s16(a45.val[0]), + vreinterpretq_s32_s16(a67.val[0])); + auto four3 = vtrnq_s32(vreinterpretq_s32_s16(a45.val[1]), + vreinterpretq_s32_s16(a67.val[1])); + + auto out0 = + vcombine_s32(vget_low_s32(four0.val[0]), vget_low_s32(four2.val[0])); + auto out1 = + vcombine_s32(vget_low_s32(four1.val[0]), vget_low_s32(four3.val[0])); + auto out2 = + vcombine_s32(vget_low_s32(four0.val[1]), vget_low_s32(four2.val[1])); + auto out3 = + vcombine_s32(vget_low_s32(four1.val[1]), vget_low_s32(four3.val[1])); + auto out4 = vcombine_s32(vget_high_s32(four0.val[0]), + vget_high_s32(four2.val[0])); + auto out5 = vcombine_s32(vget_high_s32(four1.val[0]), + vget_high_s32(four3.val[0])); + auto out6 = vcombine_s32(vget_high_s32(four0.val[1]), + vget_high_s32(four2.val[1])); + auto out7 = vcombine_s32(vget_high_s32(four1.val[1]), + vget_high_s32(four3.val[1])); + vst1q_s16(data_out + j * stride_out + i, vreinterpretq_s16_s32(out0)); + vst1q_s16(data_out + (j + 1) * stride_out + i, + vreinterpretq_s16_s32(out1)); + vst1q_s16(data_out + (j + 2) * stride_out + i, + vreinterpretq_s16_s32(out2)); + vst1q_s16(data_out + (j + 3) * stride_out + i, + vreinterpretq_s16_s32(out3)); + vst1q_s16(data_out + (j + 4) * stride_out + i, + vreinterpretq_s16_s32(out4)); + vst1q_s16(data_out + (j + 5) * stride_out + i, + vreinterpretq_s16_s32(out5)); + vst1q_s16(data_out + (j + 6) * stride_out + i, + vreinterpretq_s16_s32(out6)); + vst1q_s16(data_out + (j + 7) * stride_out + i, + vreinterpretq_s16_s32(out7)); + } + } +} + +template +struct FastDCTTag {}; + +#include "lib/jxl/fast_dct128-inl.h" +#include "lib/jxl/fast_dct16-inl.h" +#include "lib/jxl/fast_dct256-inl.h" +#include "lib/jxl/fast_dct32-inl.h" +#include "lib/jxl/fast_dct64-inl.h" +#include "lib/jxl/fast_dct8-inl.h" + +template +struct ComputeFastScaledIDCT { + // scratch_space must be aligned, and should have space for ROWS*COLS + // int16_ts. + HWY_MAYBE_UNUSED void operator()(int16_t* JXL_RESTRICT from, int16_t* to, + size_t to_stride, + int16_t* JXL_RESTRICT scratch_space) { + // Reverse the steps done in ComputeScaledDCT. + if (ROWS < COLS) { + FastTransposeBlock(from, COLS, ROWS, COLS, scratch_space, ROWS); + FastIDCT(FastDCTTag(), scratch_space, ROWS, from, ROWS, ROWS); + FastTransposeBlock(from, ROWS, COLS, ROWS, scratch_space, COLS); + FastIDCT(FastDCTTag(), scratch_space, COLS, to, to_stride, COLS); + } else { + FastIDCT(FastDCTTag(), from, ROWS, scratch_space, ROWS, ROWS); + FastTransposeBlock(scratch_space, ROWS, COLS, ROWS, from, COLS); + FastIDCT(FastDCTTag(), from, COLS, to, to_stride, COLS); + } + } +}; +#endif + +template +HWY_NOINLINE void TestFastIDCT() { +#if HWY_TARGET == HWY_NEON + auto pixels_mem = hwy::AllocateAligned(N * M); + float* pixels = pixels_mem.get(); + auto dct_mem = hwy::AllocateAligned(N * M); + float* dct = dct_mem.get(); + auto dct_i_mem = hwy::AllocateAligned(N * M); + int16_t* dct_i = dct_i_mem.get(); + auto dct_in_mem = hwy::AllocateAligned(N * M); + int16_t* dct_in = dct_in_mem.get(); + auto idct_mem = hwy::AllocateAligned(N * M); + int16_t* idct = idct_mem.get(); + + auto scratch_space_mem = hwy::AllocateAligned(N * M * 2); + float* scratch_space = scratch_space_mem.get(); + auto scratch_space_i_mem = hwy::AllocateAligned(N * M * 2); + int16_t* scratch_space_i = scratch_space_i_mem.get(); + + Rng rng(0); + for (size_t i = 0; i < N * M; i++) { + pixels[i] = rng.UniformF(-1, 1); + } + ComputeScaledDCT()(DCTFrom(pixels, N), dct, scratch_space); + size_t integer_bits = std::max(FastIDCTIntegerBits(FastDCTTag()), + FastIDCTIntegerBits(FastDCTTag())); + // Enough range for [-2, 2] output values. + JXL_ASSERT(integer_bits <= 14); + float scale = (1 << (14 - integer_bits)); + for (size_t i = 0; i < N * M; i++) { + dct_i[i] = std::round(dct[i] * scale); + } + + for (size_t j = 0; j < 40000000 / (M * N); j++) { + memcpy(dct_in, dct_i, sizeof(*dct_i) * N * M); + ComputeFastScaledIDCT()(dct_in, idct, N, scratch_space_i); + } + float max_error = 0; + for (size_t i = 0; i < M * N; i++) { + float err = std::abs(idct[i] * (1.0f / scale) - pixels[i]); + if (std::abs(err) > max_error) { + max_error = std::abs(err); + } + } + printf("max error: %f mantissa bits: %d\n", max_error, + 14 - (int)integer_bits); +#endif +} + +template +HWY_NOINLINE void TestFloatIDCT() { + auto pixels_mem = hwy::AllocateAligned(N * M); + float* pixels = pixels_mem.get(); + auto dct_mem = hwy::AllocateAligned(N * M); + float* dct = dct_mem.get(); + auto idct_mem = hwy::AllocateAligned(N * M); + float* idct = idct_mem.get(); + + auto dct_in_mem = hwy::AllocateAligned(N * M); + float* dct_in = dct_mem.get(); + + auto scratch_space_mem = hwy::AllocateAligned(N * M * 2); + float* scratch_space = scratch_space_mem.get(); + + Rng rng(0); + for (size_t i = 0; i < N * M; i++) { + pixels[i] = rng.UniformF(-1, 1); + } + ComputeScaledDCT()(DCTFrom(pixels, N), dct, scratch_space); + + for (size_t j = 0; j < 40000000 / (M * N); j++) { + memcpy(dct_in, dct, sizeof(*dct) * N * M); + ComputeScaledIDCT()(dct_in, DCTTo(idct, N), scratch_space); + } + float max_error = 0; + for (size_t i = 0; i < M * N; i++) { + float err = std::abs(idct[i] - pixels[i]); + if (std::abs(err) > max_error) { + max_error = std::abs(err); + } + } + printf("max error: %e\n", max_error); +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_FAST_DCT_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct.cc b/third_party/jpeg-xl/lib/jxl/fast_dct.cc new file mode 100644 index 0000000000..d796018fd0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct.cc @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/fast_dct.cc" +#include +#include + +#include "lib/jxl/base/random.h" +#include "lib/jxl/dct-inl.h" +#include "lib/jxl/fast_dct-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { +void BenchmarkFloatIDCT32x32() { TestFloatIDCT<32, 32>(); } +void BenchmarkFastIDCT32x32() { TestFastIDCT<32, 32>(); } +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(BenchmarkFloatIDCT32x32); +HWY_EXPORT(BenchmarkFastIDCT32x32); +void BenchmarkFloatIDCT32x32() { + HWY_DYNAMIC_DISPATCH(BenchmarkFloatIDCT32x32)(); +} +void BenchmarkFastIDCT32x32() { + HWY_DYNAMIC_DISPATCH(BenchmarkFastIDCT32x32)(); +} +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct.h b/third_party/jpeg-xl/lib/jxl/fast_dct.h new file mode 100644 index 0000000000..641933d8a0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct.h @@ -0,0 +1,9 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +namespace jxl { +void BenchmarkFloatIDCT32x32(); +void BenchmarkFastIDCT32x32(); +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h new file mode 100644 index 0000000000..1a94d3ee92 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h @@ -0,0 +1,2137 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* This file is automatically generated. Do not modify it directly. */ +#if HWY_TARGET != HWY_NEON +#error "only include this file from fast_dct-inl.h" +#endif + +constexpr size_t FastIDCTIntegerBits(FastDCTTag<128>) { return 2; } + +void FastIDCT(FastDCTTag<128>, const int16_t* in, size_t in_stride, + int16_t* out, size_t out_stride, size_t count) { + JXL_ASSERT(count % 8 == 0); + for (size_t i = 0; i < count; i += 8) { + int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); + int16x8_t v1 = vld1q_s16(in + in_stride * 64 + i); + int16x8_t v2 = vaddq_s16(v0, v1); + int16x8_t v3 = vld1q_s16(in + in_stride * 32 + i); + int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); + int16x8_t v4 = vaddq_s16(v4_tmp, v3); + int16x8_t v5 = vld1q_s16(in + in_stride * 96 + i); + int16x8_t v6 = vaddq_s16(v5, v3); + int16x8_t v7 = vaddq_s16(v4, v6); + int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); + int16x8_t v9 = vaddq_s16(v2, v8); + int16x8_t v10 = vld1q_s16(in + in_stride * 16 + i); + int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); + int16x8_t v11 = vaddq_s16(v11_tmp, v10); + int16x8_t v12 = vld1q_s16(in + in_stride * 80 + i); + int16x8_t v13 = vld1q_s16(in + in_stride * 48 + i); + int16x8_t v14 = vaddq_s16(v12, v13); + int16x8_t v15 = vaddq_s16(v11, v14); + int16x8_t v16 = vaddq_s16(v13, v10); + int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573); + int16x8_t v17 = vaddq_s16(v17_tmp, v16); + int16x8_t v18 = vld1q_s16(in + in_stride * 112 + i); + int16x8_t v19 = vaddq_s16(v18, v12); + int16x8_t v20 = vaddq_s16(v19, v16); + int16x8_t v21 = vaddq_s16(v17, v20); + int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734); + int16x8_t v23 = vaddq_s16(v15, v22); + int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); + int16x8_t v25 = vaddq_s16(v9, v24); + int16x8_t v26 = vld1q_s16(in + in_stride * 8 + i); + int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573); + int16x8_t v27 = vaddq_s16(v27_tmp, v26); + int16x8_t v28 = vld1q_s16(in + in_stride * 72 + i); + int16x8_t v29 = vld1q_s16(in + in_stride * 56 + i); + int16x8_t v30 = vaddq_s16(v28, v29); + int16x8_t v31 = vaddq_s16(v27, v30); + int16x8_t v32 = vld1q_s16(in + in_stride * 40 + i); + int16x8_t v33 = vld1q_s16(in + in_stride * 24 + i); + int16x8_t v34 = vaddq_s16(v32, v33); + int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573); + int16x8_t v35 = vaddq_s16(v35_tmp, v34); + int16x8_t v36 = vld1q_s16(in + in_stride * 104 + i); + int16x8_t v37 = vld1q_s16(in + in_stride * 88 + i); + int16x8_t v38 = vaddq_s16(v36, v37); + int16x8_t v39 = vaddq_s16(v38, v34); + int16x8_t v40 = vaddq_s16(v35, v39); + int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734); + int16x8_t v42 = vaddq_s16(v31, v41); + int16x8_t v43 = vaddq_s16(v33, v26); + int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573); + int16x8_t v44 = vaddq_s16(v44_tmp, v43); + int16x8_t v45 = vaddq_s16(v37, v28); + int16x8_t v46 = vaddq_s16(v29, v32); + int16x8_t v47 = vaddq_s16(v45, v46); + int16x8_t v48 = vaddq_s16(v44, v47); + int16x8_t v49 = vaddq_s16(v46, v43); + int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573); + int16x8_t v50 = vaddq_s16(v50_tmp, v49); + int16x8_t v51 = vld1q_s16(in + in_stride * 120 + i); + int16x8_t v52 = vaddq_s16(v51, v36); + int16x8_t v53 = vaddq_s16(v52, v45); + int16x8_t v54 = vaddq_s16(v53, v49); + int16x8_t v55 = vaddq_s16(v50, v54); + int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734); + int16x8_t v57 = vaddq_s16(v48, v56); + int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705); + int16x8_t v59 = vaddq_s16(v42, v58); + int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463); + int16x8_t v61 = vaddq_s16(v25, v60); + int16x8_t v62 = vld1q_s16(in + in_stride * 4 + i); + int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573); + int16x8_t v63 = vaddq_s16(v63_tmp, v62); + int16x8_t v64 = vld1q_s16(in + in_stride * 68 + i); + int16x8_t v65 = vld1q_s16(in + in_stride * 60 + i); + int16x8_t v66 = vaddq_s16(v64, v65); + int16x8_t v67 = vaddq_s16(v63, v66); + int16x8_t v68 = vld1q_s16(in + in_stride * 36 + i); + int16x8_t v69 = vld1q_s16(in + in_stride * 28 + i); + int16x8_t v70 = vaddq_s16(v68, v69); + int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573); + int16x8_t v71 = vaddq_s16(v71_tmp, v70); + int16x8_t v72 = vld1q_s16(in + in_stride * 100 + i); + int16x8_t v73 = vld1q_s16(in + in_stride * 92 + i); + int16x8_t v74 = vaddq_s16(v72, v73); + int16x8_t v75 = vaddq_s16(v74, v70); + int16x8_t v76 = vaddq_s16(v71, v75); + int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734); + int16x8_t v78 = vaddq_s16(v67, v77); + int16x8_t v79 = vld1q_s16(in + in_stride * 20 + i); + int16x8_t v80 = vld1q_s16(in + in_stride * 12 + i); + int16x8_t v81 = vaddq_s16(v79, v80); + int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573); + int16x8_t v82 = vaddq_s16(v82_tmp, v81); + int16x8_t v83 = vld1q_s16(in + in_stride * 84 + i); + int16x8_t v84 = vld1q_s16(in + in_stride * 76 + i); + int16x8_t v85 = vaddq_s16(v83, v84); + int16x8_t v86 = vld1q_s16(in + in_stride * 52 + i); + int16x8_t v87 = vld1q_s16(in + in_stride * 44 + i); + int16x8_t v88 = vaddq_s16(v86, v87); + int16x8_t v89 = vaddq_s16(v85, v88); + int16x8_t v90 = vaddq_s16(v82, v89); + int16x8_t v91 = vaddq_s16(v88, v81); + int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573); + int16x8_t v92 = vaddq_s16(v92_tmp, v91); + int16x8_t v93 = vld1q_s16(in + in_stride * 116 + i); + int16x8_t v94 = vld1q_s16(in + in_stride * 108 + i); + int16x8_t v95 = vaddq_s16(v93, v94); + int16x8_t v96 = vaddq_s16(v95, v85); + int16x8_t v97 = vaddq_s16(v96, v91); + int16x8_t v98 = vaddq_s16(v92, v97); + int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734); + int16x8_t v100 = vaddq_s16(v90, v99); + int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705); + int16x8_t v102 = vaddq_s16(v78, v101); + int16x8_t v103 = vaddq_s16(v80, v62); + int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573); + int16x8_t v104 = vaddq_s16(v104_tmp, v103); + int16x8_t v105 = vaddq_s16(v84, v64); + int16x8_t v106 = vaddq_s16(v65, v86); + int16x8_t v107 = vaddq_s16(v105, v106); + int16x8_t v108 = vaddq_s16(v104, v107); + int16x8_t v109 = vaddq_s16(v87, v68); + int16x8_t v110 = vaddq_s16(v69, v79); + int16x8_t v111 = vaddq_s16(v109, v110); + int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573); + int16x8_t v112 = vaddq_s16(v112_tmp, v111); + int16x8_t v113 = vaddq_s16(v94, v72); + int16x8_t v114 = vaddq_s16(v73, v83); + int16x8_t v115 = vaddq_s16(v113, v114); + int16x8_t v116 = vaddq_s16(v115, v111); + int16x8_t v117 = vaddq_s16(v112, v116); + int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734); + int16x8_t v119 = vaddq_s16(v108, v118); + int16x8_t v120 = vaddq_s16(v110, v103); + int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573); + int16x8_t v121 = vaddq_s16(v121_tmp, v120); + int16x8_t v122 = vaddq_s16(v114, v105); + int16x8_t v123 = vaddq_s16(v106, v109); + int16x8_t v124 = vaddq_s16(v122, v123); + int16x8_t v125 = vaddq_s16(v121, v124); + int16x8_t v126 = vaddq_s16(v123, v120); + int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573); + int16x8_t v127 = vaddq_s16(v127_tmp, v126); + int16x8_t v128 = vld1q_s16(in + in_stride * 124 + i); + int16x8_t v129 = vaddq_s16(v128, v93); + int16x8_t v130 = vaddq_s16(v129, v113); + int16x8_t v131 = vaddq_s16(v130, v122); + int16x8_t v132 = vaddq_s16(v131, v126); + int16x8_t v133 = vaddq_s16(v127, v132); + int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734); + int16x8_t v135 = vaddq_s16(v125, v134); + int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705); + int16x8_t v137 = vaddq_s16(v119, v136); + int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463); + int16x8_t v139 = vaddq_s16(v102, v138); + int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404); + int16x8_t v141 = vaddq_s16(v61, v140); + int16x8_t v142 = vld1q_s16(in + in_stride * 2 + i); + int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573); + int16x8_t v143 = vaddq_s16(v143_tmp, v142); + int16x8_t v144 = vld1q_s16(in + in_stride * 66 + i); + int16x8_t v145 = vld1q_s16(in + in_stride * 62 + i); + int16x8_t v146 = vaddq_s16(v144, v145); + int16x8_t v147 = vaddq_s16(v143, v146); + int16x8_t v148 = vld1q_s16(in + in_stride * 34 + i); + int16x8_t v149 = vld1q_s16(in + in_stride * 30 + i); + int16x8_t v150 = vaddq_s16(v148, v149); + int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573); + int16x8_t v151 = vaddq_s16(v151_tmp, v150); + int16x8_t v152 = vld1q_s16(in + in_stride * 98 + i); + int16x8_t v153 = vld1q_s16(in + in_stride * 94 + i); + int16x8_t v154 = vaddq_s16(v152, v153); + int16x8_t v155 = vaddq_s16(v154, v150); + int16x8_t v156 = vaddq_s16(v151, v155); + int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734); + int16x8_t v158 = vaddq_s16(v147, v157); + int16x8_t v159 = vld1q_s16(in + in_stride * 18 + i); + int16x8_t v160 = vld1q_s16(in + in_stride * 14 + i); + int16x8_t v161 = vaddq_s16(v159, v160); + int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573); + int16x8_t v162 = vaddq_s16(v162_tmp, v161); + int16x8_t v163 = vld1q_s16(in + in_stride * 82 + i); + int16x8_t v164 = vld1q_s16(in + in_stride * 78 + i); + int16x8_t v165 = vaddq_s16(v163, v164); + int16x8_t v166 = vld1q_s16(in + in_stride * 50 + i); + int16x8_t v167 = vld1q_s16(in + in_stride * 46 + i); + int16x8_t v168 = vaddq_s16(v166, v167); + int16x8_t v169 = vaddq_s16(v165, v168); + int16x8_t v170 = vaddq_s16(v162, v169); + int16x8_t v171 = vaddq_s16(v168, v161); + int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573); + int16x8_t v172 = vaddq_s16(v172_tmp, v171); + int16x8_t v173 = vld1q_s16(in + in_stride * 114 + i); + int16x8_t v174 = vld1q_s16(in + in_stride * 110 + i); + int16x8_t v175 = vaddq_s16(v173, v174); + int16x8_t v176 = vaddq_s16(v175, v165); + int16x8_t v177 = vaddq_s16(v176, v171); + int16x8_t v178 = vaddq_s16(v172, v177); + int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734); + int16x8_t v180 = vaddq_s16(v170, v179); + int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705); + int16x8_t v182 = vaddq_s16(v158, v181); + int16x8_t v183 = vld1q_s16(in + in_stride * 10 + i); + int16x8_t v184 = vld1q_s16(in + in_stride * 6 + i); + int16x8_t v185 = vaddq_s16(v183, v184); + int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573); + int16x8_t v186 = vaddq_s16(v186_tmp, v185); + int16x8_t v187 = vld1q_s16(in + in_stride * 74 + i); + int16x8_t v188 = vld1q_s16(in + in_stride * 70 + i); + int16x8_t v189 = vaddq_s16(v187, v188); + int16x8_t v190 = vld1q_s16(in + in_stride * 58 + i); + int16x8_t v191 = vld1q_s16(in + in_stride * 54 + i); + int16x8_t v192 = vaddq_s16(v190, v191); + int16x8_t v193 = vaddq_s16(v189, v192); + int16x8_t v194 = vaddq_s16(v186, v193); + int16x8_t v195 = vld1q_s16(in + in_stride * 42 + i); + int16x8_t v196 = vld1q_s16(in + in_stride * 38 + i); + int16x8_t v197 = vaddq_s16(v195, v196); + int16x8_t v198 = vld1q_s16(in + in_stride * 26 + i); + int16x8_t v199 = vld1q_s16(in + in_stride * 22 + i); + int16x8_t v200 = vaddq_s16(v198, v199); + int16x8_t v201 = vaddq_s16(v197, v200); + int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573); + int16x8_t v202 = vaddq_s16(v202_tmp, v201); + int16x8_t v203 = vld1q_s16(in + in_stride * 106 + i); + int16x8_t v204 = vld1q_s16(in + in_stride * 102 + i); + int16x8_t v205 = vaddq_s16(v203, v204); + int16x8_t v206 = vld1q_s16(in + in_stride * 90 + i); + int16x8_t v207 = vld1q_s16(in + in_stride * 86 + i); + int16x8_t v208 = vaddq_s16(v206, v207); + int16x8_t v209 = vaddq_s16(v205, v208); + int16x8_t v210 = vaddq_s16(v209, v201); + int16x8_t v211 = vaddq_s16(v202, v210); + int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734); + int16x8_t v213 = vaddq_s16(v194, v212); + int16x8_t v214 = vaddq_s16(v200, v185); + int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573); + int16x8_t v215 = vaddq_s16(v215_tmp, v214); + int16x8_t v216 = vaddq_s16(v208, v189); + int16x8_t v217 = vaddq_s16(v192, v197); + int16x8_t v218 = vaddq_s16(v216, v217); + int16x8_t v219 = vaddq_s16(v215, v218); + int16x8_t v220 = vaddq_s16(v217, v214); + int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573); + int16x8_t v221 = vaddq_s16(v221_tmp, v220); + int16x8_t v222 = vld1q_s16(in + in_stride * 122 + i); + int16x8_t v223 = vld1q_s16(in + in_stride * 118 + i); + int16x8_t v224 = vaddq_s16(v222, v223); + int16x8_t v225 = vaddq_s16(v224, v205); + int16x8_t v226 = vaddq_s16(v225, v216); + int16x8_t v227 = vaddq_s16(v226, v220); + int16x8_t v228 = vaddq_s16(v221, v227); + int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734); + int16x8_t v230 = vaddq_s16(v219, v229); + int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705); + int16x8_t v232 = vaddq_s16(v213, v231); + int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463); + int16x8_t v234 = vaddq_s16(v182, v233); + int16x8_t v235 = vaddq_s16(v184, v142); + int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573); + int16x8_t v236 = vaddq_s16(v236_tmp, v235); + int16x8_t v237 = vaddq_s16(v188, v144); + int16x8_t v238 = vaddq_s16(v145, v190); + int16x8_t v239 = vaddq_s16(v237, v238); + int16x8_t v240 = vaddq_s16(v236, v239); + int16x8_t v241 = vaddq_s16(v196, v148); + int16x8_t v242 = vaddq_s16(v149, v198); + int16x8_t v243 = vaddq_s16(v241, v242); + int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573); + int16x8_t v244 = vaddq_s16(v244_tmp, v243); + int16x8_t v245 = vaddq_s16(v204, v152); + int16x8_t v246 = vaddq_s16(v153, v206); + int16x8_t v247 = vaddq_s16(v245, v246); + int16x8_t v248 = vaddq_s16(v247, v243); + int16x8_t v249 = vaddq_s16(v244, v248); + int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734); + int16x8_t v251 = vaddq_s16(v240, v250); + int16x8_t v252 = vaddq_s16(v199, v159); + int16x8_t v253 = vaddq_s16(v160, v183); + int16x8_t v254 = vaddq_s16(v252, v253); + int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573); + int16x8_t v255 = vaddq_s16(v255_tmp, v254); + int16x8_t v256 = vaddq_s16(v207, v163); + int16x8_t v257 = vaddq_s16(v164, v187); + int16x8_t v258 = vaddq_s16(v256, v257); + int16x8_t v259 = vaddq_s16(v191, v166); + int16x8_t v260 = vaddq_s16(v167, v195); + int16x8_t v261 = vaddq_s16(v259, v260); + int16x8_t v262 = vaddq_s16(v258, v261); + int16x8_t v263 = vaddq_s16(v255, v262); + int16x8_t v264 = vaddq_s16(v261, v254); + int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573); + int16x8_t v265 = vaddq_s16(v265_tmp, v264); + int16x8_t v266 = vaddq_s16(v223, v173); + int16x8_t v267 = vaddq_s16(v174, v203); + int16x8_t v268 = vaddq_s16(v266, v267); + int16x8_t v269 = vaddq_s16(v268, v258); + int16x8_t v270 = vaddq_s16(v269, v264); + int16x8_t v271 = vaddq_s16(v265, v270); + int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734); + int16x8_t v273 = vaddq_s16(v263, v272); + int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705); + int16x8_t v275 = vaddq_s16(v251, v274); + int16x8_t v276 = vaddq_s16(v253, v235); + int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573); + int16x8_t v277 = vaddq_s16(v277_tmp, v276); + int16x8_t v278 = vaddq_s16(v257, v237); + int16x8_t v279 = vaddq_s16(v238, v259); + int16x8_t v280 = vaddq_s16(v278, v279); + int16x8_t v281 = vaddq_s16(v277, v280); + int16x8_t v282 = vaddq_s16(v260, v241); + int16x8_t v283 = vaddq_s16(v242, v252); + int16x8_t v284 = vaddq_s16(v282, v283); + int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573); + int16x8_t v285 = vaddq_s16(v285_tmp, v284); + int16x8_t v286 = vaddq_s16(v267, v245); + int16x8_t v287 = vaddq_s16(v246, v256); + int16x8_t v288 = vaddq_s16(v286, v287); + int16x8_t v289 = vaddq_s16(v288, v284); + int16x8_t v290 = vaddq_s16(v285, v289); + int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734); + int16x8_t v292 = vaddq_s16(v281, v291); + int16x8_t v293 = vaddq_s16(v283, v276); + int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573); + int16x8_t v294 = vaddq_s16(v294_tmp, v293); + int16x8_t v295 = vaddq_s16(v287, v278); + int16x8_t v296 = vaddq_s16(v279, v282); + int16x8_t v297 = vaddq_s16(v295, v296); + int16x8_t v298 = vaddq_s16(v294, v297); + int16x8_t v299 = vaddq_s16(v296, v293); + int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573); + int16x8_t v300 = vaddq_s16(v300_tmp, v299); + int16x8_t v301 = vld1q_s16(in + in_stride * 126 + i); + int16x8_t v302 = vaddq_s16(v301, v222); + int16x8_t v303 = vaddq_s16(v302, v266); + int16x8_t v304 = vaddq_s16(v303, v286); + int16x8_t v305 = vaddq_s16(v304, v295); + int16x8_t v306 = vaddq_s16(v305, v299); + int16x8_t v307 = vaddq_s16(v300, v306); + int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734); + int16x8_t v309 = vaddq_s16(v298, v308); + int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705); + int16x8_t v311 = vaddq_s16(v292, v310); + int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463); + int16x8_t v313 = vaddq_s16(v275, v312); + int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404); + int16x8_t v315 = vaddq_s16(v234, v314); + int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389); + int16x8_t v317 = vaddq_s16(v141, v316); + int16x8_t v318 = vld1q_s16(in + in_stride * 1 + i); + int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573); + int16x8_t v319 = vaddq_s16(v319_tmp, v318); + int16x8_t v320 = vld1q_s16(in + in_stride * 65 + i); + int16x8_t v321 = vld1q_s16(in + in_stride * 63 + i); + int16x8_t v322 = vaddq_s16(v320, v321); + int16x8_t v323 = vaddq_s16(v319, v322); + int16x8_t v324 = vld1q_s16(in + in_stride * 33 + i); + int16x8_t v325 = vld1q_s16(in + in_stride * 31 + i); + int16x8_t v326 = vaddq_s16(v324, v325); + int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573); + int16x8_t v327 = vaddq_s16(v327_tmp, v326); + int16x8_t v328 = vld1q_s16(in + in_stride * 97 + i); + int16x8_t v329 = vld1q_s16(in + in_stride * 95 + i); + int16x8_t v330 = vaddq_s16(v328, v329); + int16x8_t v331 = vaddq_s16(v330, v326); + int16x8_t v332 = vaddq_s16(v327, v331); + int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734); + int16x8_t v334 = vaddq_s16(v323, v333); + int16x8_t v335 = vld1q_s16(in + in_stride * 17 + i); + int16x8_t v336 = vld1q_s16(in + in_stride * 15 + i); + int16x8_t v337 = vaddq_s16(v335, v336); + int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573); + int16x8_t v338 = vaddq_s16(v338_tmp, v337); + int16x8_t v339 = vld1q_s16(in + in_stride * 81 + i); + int16x8_t v340 = vld1q_s16(in + in_stride * 79 + i); + int16x8_t v341 = vaddq_s16(v339, v340); + int16x8_t v342 = vld1q_s16(in + in_stride * 49 + i); + int16x8_t v343 = vld1q_s16(in + in_stride * 47 + i); + int16x8_t v344 = vaddq_s16(v342, v343); + int16x8_t v345 = vaddq_s16(v341, v344); + int16x8_t v346 = vaddq_s16(v338, v345); + int16x8_t v347 = vaddq_s16(v344, v337); + int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573); + int16x8_t v348 = vaddq_s16(v348_tmp, v347); + int16x8_t v349 = vld1q_s16(in + in_stride * 113 + i); + int16x8_t v350 = vld1q_s16(in + in_stride * 111 + i); + int16x8_t v351 = vaddq_s16(v349, v350); + int16x8_t v352 = vaddq_s16(v351, v341); + int16x8_t v353 = vaddq_s16(v352, v347); + int16x8_t v354 = vaddq_s16(v348, v353); + int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734); + int16x8_t v356 = vaddq_s16(v346, v355); + int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705); + int16x8_t v358 = vaddq_s16(v334, v357); + int16x8_t v359 = vld1q_s16(in + in_stride * 9 + i); + int16x8_t v360 = vld1q_s16(in + in_stride * 7 + i); + int16x8_t v361 = vaddq_s16(v359, v360); + int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573); + int16x8_t v362 = vaddq_s16(v362_tmp, v361); + int16x8_t v363 = vld1q_s16(in + in_stride * 73 + i); + int16x8_t v364 = vld1q_s16(in + in_stride * 71 + i); + int16x8_t v365 = vaddq_s16(v363, v364); + int16x8_t v366 = vld1q_s16(in + in_stride * 57 + i); + int16x8_t v367 = vld1q_s16(in + in_stride * 55 + i); + int16x8_t v368 = vaddq_s16(v366, v367); + int16x8_t v369 = vaddq_s16(v365, v368); + int16x8_t v370 = vaddq_s16(v362, v369); + int16x8_t v371 = vld1q_s16(in + in_stride * 41 + i); + int16x8_t v372 = vld1q_s16(in + in_stride * 39 + i); + int16x8_t v373 = vaddq_s16(v371, v372); + int16x8_t v374 = vld1q_s16(in + in_stride * 25 + i); + int16x8_t v375 = vld1q_s16(in + in_stride * 23 + i); + int16x8_t v376 = vaddq_s16(v374, v375); + int16x8_t v377 = vaddq_s16(v373, v376); + int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573); + int16x8_t v378 = vaddq_s16(v378_tmp, v377); + int16x8_t v379 = vld1q_s16(in + in_stride * 105 + i); + int16x8_t v380 = vld1q_s16(in + in_stride * 103 + i); + int16x8_t v381 = vaddq_s16(v379, v380); + int16x8_t v382 = vld1q_s16(in + in_stride * 89 + i); + int16x8_t v383 = vld1q_s16(in + in_stride * 87 + i); + int16x8_t v384 = vaddq_s16(v382, v383); + int16x8_t v385 = vaddq_s16(v381, v384); + int16x8_t v386 = vaddq_s16(v385, v377); + int16x8_t v387 = vaddq_s16(v378, v386); + int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734); + int16x8_t v389 = vaddq_s16(v370, v388); + int16x8_t v390 = vaddq_s16(v376, v361); + int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573); + int16x8_t v391 = vaddq_s16(v391_tmp, v390); + int16x8_t v392 = vaddq_s16(v384, v365); + int16x8_t v393 = vaddq_s16(v368, v373); + int16x8_t v394 = vaddq_s16(v392, v393); + int16x8_t v395 = vaddq_s16(v391, v394); + int16x8_t v396 = vaddq_s16(v393, v390); + int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573); + int16x8_t v397 = vaddq_s16(v397_tmp, v396); + int16x8_t v398 = vld1q_s16(in + in_stride * 121 + i); + int16x8_t v399 = vld1q_s16(in + in_stride * 119 + i); + int16x8_t v400 = vaddq_s16(v398, v399); + int16x8_t v401 = vaddq_s16(v400, v381); + int16x8_t v402 = vaddq_s16(v401, v392); + int16x8_t v403 = vaddq_s16(v402, v396); + int16x8_t v404 = vaddq_s16(v397, v403); + int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734); + int16x8_t v406 = vaddq_s16(v395, v405); + int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705); + int16x8_t v408 = vaddq_s16(v389, v407); + int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463); + int16x8_t v410 = vaddq_s16(v358, v409); + int16x8_t v411 = vld1q_s16(in + in_stride * 5 + i); + int16x8_t v412 = vld1q_s16(in + in_stride * 3 + i); + int16x8_t v413 = vaddq_s16(v411, v412); + int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573); + int16x8_t v414 = vaddq_s16(v414_tmp, v413); + int16x8_t v415 = vld1q_s16(in + in_stride * 69 + i); + int16x8_t v416 = vld1q_s16(in + in_stride * 67 + i); + int16x8_t v417 = vaddq_s16(v415, v416); + int16x8_t v418 = vld1q_s16(in + in_stride * 61 + i); + int16x8_t v419 = vld1q_s16(in + in_stride * 59 + i); + int16x8_t v420 = vaddq_s16(v418, v419); + int16x8_t v421 = vaddq_s16(v417, v420); + int16x8_t v422 = vaddq_s16(v414, v421); + int16x8_t v423 = vld1q_s16(in + in_stride * 37 + i); + int16x8_t v424 = vld1q_s16(in + in_stride * 35 + i); + int16x8_t v425 = vaddq_s16(v423, v424); + int16x8_t v426 = vld1q_s16(in + in_stride * 29 + i); + int16x8_t v427 = vld1q_s16(in + in_stride * 27 + i); + int16x8_t v428 = vaddq_s16(v426, v427); + int16x8_t v429 = vaddq_s16(v425, v428); + int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573); + int16x8_t v430 = vaddq_s16(v430_tmp, v429); + int16x8_t v431 = vld1q_s16(in + in_stride * 101 + i); + int16x8_t v432 = vld1q_s16(in + in_stride * 99 + i); + int16x8_t v433 = vaddq_s16(v431, v432); + int16x8_t v434 = vld1q_s16(in + in_stride * 93 + i); + int16x8_t v435 = vld1q_s16(in + in_stride * 91 + i); + int16x8_t v436 = vaddq_s16(v434, v435); + int16x8_t v437 = vaddq_s16(v433, v436); + int16x8_t v438 = vaddq_s16(v437, v429); + int16x8_t v439 = vaddq_s16(v430, v438); + int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734); + int16x8_t v441 = vaddq_s16(v422, v440); + int16x8_t v442 = vld1q_s16(in + in_stride * 21 + i); + int16x8_t v443 = vld1q_s16(in + in_stride * 19 + i); + int16x8_t v444 = vaddq_s16(v442, v443); + int16x8_t v445 = vld1q_s16(in + in_stride * 13 + i); + int16x8_t v446 = vld1q_s16(in + in_stride * 11 + i); + int16x8_t v447 = vaddq_s16(v445, v446); + int16x8_t v448 = vaddq_s16(v444, v447); + int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573); + int16x8_t v449 = vaddq_s16(v449_tmp, v448); + int16x8_t v450 = vld1q_s16(in + in_stride * 85 + i); + int16x8_t v451 = vld1q_s16(in + in_stride * 83 + i); + int16x8_t v452 = vaddq_s16(v450, v451); + int16x8_t v453 = vld1q_s16(in + in_stride * 77 + i); + int16x8_t v454 = vld1q_s16(in + in_stride * 75 + i); + int16x8_t v455 = vaddq_s16(v453, v454); + int16x8_t v456 = vaddq_s16(v452, v455); + int16x8_t v457 = vld1q_s16(in + in_stride * 53 + i); + int16x8_t v458 = vld1q_s16(in + in_stride * 51 + i); + int16x8_t v459 = vaddq_s16(v457, v458); + int16x8_t v460 = vld1q_s16(in + in_stride * 45 + i); + int16x8_t v461 = vld1q_s16(in + in_stride * 43 + i); + int16x8_t v462 = vaddq_s16(v460, v461); + int16x8_t v463 = vaddq_s16(v459, v462); + int16x8_t v464 = vaddq_s16(v456, v463); + int16x8_t v465 = vaddq_s16(v449, v464); + int16x8_t v466 = vaddq_s16(v463, v448); + int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573); + int16x8_t v467 = vaddq_s16(v467_tmp, v466); + int16x8_t v468 = vld1q_s16(in + in_stride * 117 + i); + int16x8_t v469 = vld1q_s16(in + in_stride * 115 + i); + int16x8_t v470 = vaddq_s16(v468, v469); + int16x8_t v471 = vld1q_s16(in + in_stride * 109 + i); + int16x8_t v472 = vld1q_s16(in + in_stride * 107 + i); + int16x8_t v473 = vaddq_s16(v471, v472); + int16x8_t v474 = vaddq_s16(v470, v473); + int16x8_t v475 = vaddq_s16(v474, v456); + int16x8_t v476 = vaddq_s16(v475, v466); + int16x8_t v477 = vaddq_s16(v467, v476); + int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734); + int16x8_t v479 = vaddq_s16(v465, v478); + int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705); + int16x8_t v481 = vaddq_s16(v441, v480); + int16x8_t v482 = vaddq_s16(v447, v413); + int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573); + int16x8_t v483 = vaddq_s16(v483_tmp, v482); + int16x8_t v484 = vaddq_s16(v455, v417); + int16x8_t v485 = vaddq_s16(v420, v459); + int16x8_t v486 = vaddq_s16(v484, v485); + int16x8_t v487 = vaddq_s16(v483, v486); + int16x8_t v488 = vaddq_s16(v462, v425); + int16x8_t v489 = vaddq_s16(v428, v444); + int16x8_t v490 = vaddq_s16(v488, v489); + int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573); + int16x8_t v491 = vaddq_s16(v491_tmp, v490); + int16x8_t v492 = vaddq_s16(v473, v433); + int16x8_t v493 = vaddq_s16(v436, v452); + int16x8_t v494 = vaddq_s16(v492, v493); + int16x8_t v495 = vaddq_s16(v494, v490); + int16x8_t v496 = vaddq_s16(v491, v495); + int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734); + int16x8_t v498 = vaddq_s16(v487, v497); + int16x8_t v499 = vaddq_s16(v489, v482); + int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573); + int16x8_t v500 = vaddq_s16(v500_tmp, v499); + int16x8_t v501 = vaddq_s16(v493, v484); + int16x8_t v502 = vaddq_s16(v485, v488); + int16x8_t v503 = vaddq_s16(v501, v502); + int16x8_t v504 = vaddq_s16(v500, v503); + int16x8_t v505 = vaddq_s16(v502, v499); + int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573); + int16x8_t v506 = vaddq_s16(v506_tmp, v505); + int16x8_t v507 = vld1q_s16(in + in_stride * 125 + i); + int16x8_t v508 = vld1q_s16(in + in_stride * 123 + i); + int16x8_t v509 = vaddq_s16(v507, v508); + int16x8_t v510 = vaddq_s16(v509, v470); + int16x8_t v511 = vaddq_s16(v510, v492); + int16x8_t v512 = vaddq_s16(v511, v501); + int16x8_t v513 = vaddq_s16(v512, v505); + int16x8_t v514 = vaddq_s16(v506, v513); + int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734); + int16x8_t v516 = vaddq_s16(v504, v515); + int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705); + int16x8_t v518 = vaddq_s16(v498, v517); + int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463); + int16x8_t v520 = vaddq_s16(v481, v519); + int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404); + int16x8_t v522 = vaddq_s16(v410, v521); + int16x8_t v523 = vaddq_s16(v412, v318); + int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573); + int16x8_t v524 = vaddq_s16(v524_tmp, v523); + int16x8_t v525 = vaddq_s16(v416, v320); + int16x8_t v526 = vaddq_s16(v321, v418); + int16x8_t v527 = vaddq_s16(v525, v526); + int16x8_t v528 = vaddq_s16(v524, v527); + int16x8_t v529 = vaddq_s16(v424, v324); + int16x8_t v530 = vaddq_s16(v325, v426); + int16x8_t v531 = vaddq_s16(v529, v530); + int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573); + int16x8_t v532 = vaddq_s16(v532_tmp, v531); + int16x8_t v533 = vaddq_s16(v432, v328); + int16x8_t v534 = vaddq_s16(v329, v434); + int16x8_t v535 = vaddq_s16(v533, v534); + int16x8_t v536 = vaddq_s16(v535, v531); + int16x8_t v537 = vaddq_s16(v532, v536); + int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734); + int16x8_t v539 = vaddq_s16(v528, v538); + int16x8_t v540 = vaddq_s16(v443, v335); + int16x8_t v541 = vaddq_s16(v336, v445); + int16x8_t v542 = vaddq_s16(v540, v541); + int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573); + int16x8_t v543 = vaddq_s16(v543_tmp, v542); + int16x8_t v544 = vaddq_s16(v451, v339); + int16x8_t v545 = vaddq_s16(v340, v453); + int16x8_t v546 = vaddq_s16(v544, v545); + int16x8_t v547 = vaddq_s16(v458, v342); + int16x8_t v548 = vaddq_s16(v343, v460); + int16x8_t v549 = vaddq_s16(v547, v548); + int16x8_t v550 = vaddq_s16(v546, v549); + int16x8_t v551 = vaddq_s16(v543, v550); + int16x8_t v552 = vaddq_s16(v549, v542); + int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573); + int16x8_t v553 = vaddq_s16(v553_tmp, v552); + int16x8_t v554 = vaddq_s16(v469, v349); + int16x8_t v555 = vaddq_s16(v350, v471); + int16x8_t v556 = vaddq_s16(v554, v555); + int16x8_t v557 = vaddq_s16(v556, v546); + int16x8_t v558 = vaddq_s16(v557, v552); + int16x8_t v559 = vaddq_s16(v553, v558); + int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734); + int16x8_t v561 = vaddq_s16(v551, v560); + int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705); + int16x8_t v563 = vaddq_s16(v539, v562); + int16x8_t v564 = vaddq_s16(v446, v359); + int16x8_t v565 = vaddq_s16(v360, v411); + int16x8_t v566 = vaddq_s16(v564, v565); + int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573); + int16x8_t v567 = vaddq_s16(v567_tmp, v566); + int16x8_t v568 = vaddq_s16(v454, v363); + int16x8_t v569 = vaddq_s16(v364, v415); + int16x8_t v570 = vaddq_s16(v568, v569); + int16x8_t v571 = vaddq_s16(v419, v366); + int16x8_t v572 = vaddq_s16(v367, v457); + int16x8_t v573 = vaddq_s16(v571, v572); + int16x8_t v574 = vaddq_s16(v570, v573); + int16x8_t v575 = vaddq_s16(v567, v574); + int16x8_t v576 = vaddq_s16(v461, v371); + int16x8_t v577 = vaddq_s16(v372, v423); + int16x8_t v578 = vaddq_s16(v576, v577); + int16x8_t v579 = vaddq_s16(v427, v374); + int16x8_t v580 = vaddq_s16(v375, v442); + int16x8_t v581 = vaddq_s16(v579, v580); + int16x8_t v582 = vaddq_s16(v578, v581); + int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573); + int16x8_t v583 = vaddq_s16(v583_tmp, v582); + int16x8_t v584 = vaddq_s16(v472, v379); + int16x8_t v585 = vaddq_s16(v380, v431); + int16x8_t v586 = vaddq_s16(v584, v585); + int16x8_t v587 = vaddq_s16(v435, v382); + int16x8_t v588 = vaddq_s16(v383, v450); + int16x8_t v589 = vaddq_s16(v587, v588); + int16x8_t v590 = vaddq_s16(v586, v589); + int16x8_t v591 = vaddq_s16(v590, v582); + int16x8_t v592 = vaddq_s16(v583, v591); + int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734); + int16x8_t v594 = vaddq_s16(v575, v593); + int16x8_t v595 = vaddq_s16(v581, v566); + int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573); + int16x8_t v596 = vaddq_s16(v596_tmp, v595); + int16x8_t v597 = vaddq_s16(v589, v570); + int16x8_t v598 = vaddq_s16(v573, v578); + int16x8_t v599 = vaddq_s16(v597, v598); + int16x8_t v600 = vaddq_s16(v596, v599); + int16x8_t v601 = vaddq_s16(v598, v595); + int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573); + int16x8_t v602 = vaddq_s16(v602_tmp, v601); + int16x8_t v603 = vaddq_s16(v508, v398); + int16x8_t v604 = vaddq_s16(v399, v468); + int16x8_t v605 = vaddq_s16(v603, v604); + int16x8_t v606 = vaddq_s16(v605, v586); + int16x8_t v607 = vaddq_s16(v606, v597); + int16x8_t v608 = vaddq_s16(v607, v601); + int16x8_t v609 = vaddq_s16(v602, v608); + int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734); + int16x8_t v611 = vaddq_s16(v600, v610); + int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705); + int16x8_t v613 = vaddq_s16(v594, v612); + int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463); + int16x8_t v615 = vaddq_s16(v563, v614); + int16x8_t v616 = vaddq_s16(v565, v523); + int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573); + int16x8_t v617 = vaddq_s16(v617_tmp, v616); + int16x8_t v618 = vaddq_s16(v569, v525); + int16x8_t v619 = vaddq_s16(v526, v571); + int16x8_t v620 = vaddq_s16(v618, v619); + int16x8_t v621 = vaddq_s16(v617, v620); + int16x8_t v622 = vaddq_s16(v577, v529); + int16x8_t v623 = vaddq_s16(v530, v579); + int16x8_t v624 = vaddq_s16(v622, v623); + int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573); + int16x8_t v625 = vaddq_s16(v625_tmp, v624); + int16x8_t v626 = vaddq_s16(v585, v533); + int16x8_t v627 = vaddq_s16(v534, v587); + int16x8_t v628 = vaddq_s16(v626, v627); + int16x8_t v629 = vaddq_s16(v628, v624); + int16x8_t v630 = vaddq_s16(v625, v629); + int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734); + int16x8_t v632 = vaddq_s16(v621, v631); + int16x8_t v633 = vaddq_s16(v580, v540); + int16x8_t v634 = vaddq_s16(v541, v564); + int16x8_t v635 = vaddq_s16(v633, v634); + int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573); + int16x8_t v636 = vaddq_s16(v636_tmp, v635); + int16x8_t v637 = vaddq_s16(v588, v544); + int16x8_t v638 = vaddq_s16(v545, v568); + int16x8_t v639 = vaddq_s16(v637, v638); + int16x8_t v640 = vaddq_s16(v572, v547); + int16x8_t v641 = vaddq_s16(v548, v576); + int16x8_t v642 = vaddq_s16(v640, v641); + int16x8_t v643 = vaddq_s16(v639, v642); + int16x8_t v644 = vaddq_s16(v636, v643); + int16x8_t v645 = vaddq_s16(v642, v635); + int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573); + int16x8_t v646 = vaddq_s16(v646_tmp, v645); + int16x8_t v647 = vaddq_s16(v604, v554); + int16x8_t v648 = vaddq_s16(v555, v584); + int16x8_t v649 = vaddq_s16(v647, v648); + int16x8_t v650 = vaddq_s16(v649, v639); + int16x8_t v651 = vaddq_s16(v650, v645); + int16x8_t v652 = vaddq_s16(v646, v651); + int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734); + int16x8_t v654 = vaddq_s16(v644, v653); + int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705); + int16x8_t v656 = vaddq_s16(v632, v655); + int16x8_t v657 = vaddq_s16(v634, v616); + int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573); + int16x8_t v658 = vaddq_s16(v658_tmp, v657); + int16x8_t v659 = vaddq_s16(v638, v618); + int16x8_t v660 = vaddq_s16(v619, v640); + int16x8_t v661 = vaddq_s16(v659, v660); + int16x8_t v662 = vaddq_s16(v658, v661); + int16x8_t v663 = vaddq_s16(v641, v622); + int16x8_t v664 = vaddq_s16(v623, v633); + int16x8_t v665 = vaddq_s16(v663, v664); + int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573); + int16x8_t v666 = vaddq_s16(v666_tmp, v665); + int16x8_t v667 = vaddq_s16(v648, v626); + int16x8_t v668 = vaddq_s16(v627, v637); + int16x8_t v669 = vaddq_s16(v667, v668); + int16x8_t v670 = vaddq_s16(v669, v665); + int16x8_t v671 = vaddq_s16(v666, v670); + int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734); + int16x8_t v673 = vaddq_s16(v662, v672); + int16x8_t v674 = vaddq_s16(v664, v657); + int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573); + int16x8_t v675 = vaddq_s16(v675_tmp, v674); + int16x8_t v676 = vaddq_s16(v668, v659); + int16x8_t v677 = vaddq_s16(v660, v663); + int16x8_t v678 = vaddq_s16(v676, v677); + int16x8_t v679 = vaddq_s16(v675, v678); + int16x8_t v680 = vaddq_s16(v677, v674); + int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573); + int16x8_t v681 = vaddq_s16(v681_tmp, v680); + int16x8_t v682 = vld1q_s16(in + in_stride * 127 + i); + int16x8_t v683 = vaddq_s16(v682, v507); + int16x8_t v684 = vaddq_s16(v683, v603); + int16x8_t v685 = vaddq_s16(v684, v647); + int16x8_t v686 = vaddq_s16(v685, v667); + int16x8_t v687 = vaddq_s16(v686, v676); + int16x8_t v688 = vaddq_s16(v687, v680); + int16x8_t v689 = vaddq_s16(v681, v688); + int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734); + int16x8_t v691 = vaddq_s16(v679, v690); + int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705); + int16x8_t v693 = vaddq_s16(v673, v692); + int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463); + int16x8_t v695 = vaddq_s16(v656, v694); + int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404); + int16x8_t v697 = vaddq_s16(v615, v696); + int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389); + int16x8_t v699 = vaddq_s16(v522, v698); + int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385); + int16x8_t v701 = vaddq_s16(v317, v700); + int16x8_t v702 = vsubq_s16(v0, v1); + int16x8_t v703 = vsubq_s16(v4, v6); + int16x8_t v704_tmp = vqrdmulhq_n_s16(v703, 10045); + int16x8_t v704 = vaddq_s16(v704_tmp, v703); + int16x8_t v705 = vaddq_s16(v702, v704); + int16x8_t v706 = vsubq_s16(v11, v14); + int16x8_t v707 = vsubq_s16(v17, v20); + int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 10045); + int16x8_t v708 = vaddq_s16(v708_tmp, v707); + int16x8_t v709 = vaddq_s16(v706, v708); + int16x8_t v710 = vqrdmulhq_n_s16(v709, 19705); + int16x8_t v711 = vaddq_s16(v705, v710); + int16x8_t v712 = vsubq_s16(v27, v30); + int16x8_t v713 = vsubq_s16(v35, v39); + int16x8_t v714_tmp = vqrdmulhq_n_s16(v713, 10045); + int16x8_t v714 = vaddq_s16(v714_tmp, v713); + int16x8_t v715 = vaddq_s16(v712, v714); + int16x8_t v716 = vsubq_s16(v44, v47); + int16x8_t v717 = vsubq_s16(v50, v54); + int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 10045); + int16x8_t v718 = vaddq_s16(v718_tmp, v717); + int16x8_t v719 = vaddq_s16(v716, v718); + int16x8_t v720 = vqrdmulhq_n_s16(v719, 19705); + int16x8_t v721 = vaddq_s16(v715, v720); + int16x8_t v722 = vqrdmulhq_n_s16(v721, 17121); + int16x8_t v723 = vaddq_s16(v711, v722); + int16x8_t v724 = vsubq_s16(v63, v66); + int16x8_t v725 = vsubq_s16(v71, v75); + int16x8_t v726_tmp = vqrdmulhq_n_s16(v725, 10045); + int16x8_t v726 = vaddq_s16(v726_tmp, v725); + int16x8_t v727 = vaddq_s16(v724, v726); + int16x8_t v728 = vsubq_s16(v82, v89); + int16x8_t v729 = vsubq_s16(v92, v97); + int16x8_t v730_tmp = vqrdmulhq_n_s16(v729, 10045); + int16x8_t v730 = vaddq_s16(v730_tmp, v729); + int16x8_t v731 = vaddq_s16(v728, v730); + int16x8_t v732 = vqrdmulhq_n_s16(v731, 19705); + int16x8_t v733 = vaddq_s16(v727, v732); + int16x8_t v734 = vsubq_s16(v104, v107); + int16x8_t v735 = vsubq_s16(v112, v116); + int16x8_t v736_tmp = vqrdmulhq_n_s16(v735, 10045); + int16x8_t v736 = vaddq_s16(v736_tmp, v735); + int16x8_t v737 = vaddq_s16(v734, v736); + int16x8_t v738 = vsubq_s16(v121, v124); + int16x8_t v739 = vsubq_s16(v127, v132); + int16x8_t v740_tmp = vqrdmulhq_n_s16(v739, 10045); + int16x8_t v740 = vaddq_s16(v740_tmp, v739); + int16x8_t v741 = vaddq_s16(v738, v740); + int16x8_t v742 = vqrdmulhq_n_s16(v741, 19705); + int16x8_t v743 = vaddq_s16(v737, v742); + int16x8_t v744 = vqrdmulhq_n_s16(v743, 17121); + int16x8_t v745 = vaddq_s16(v733, v744); + int16x8_t v746 = vqrdmulhq_n_s16(v745, 16563); + int16x8_t v747 = vaddq_s16(v723, v746); + int16x8_t v748 = vsubq_s16(v143, v146); + int16x8_t v749 = vsubq_s16(v151, v155); + int16x8_t v750_tmp = vqrdmulhq_n_s16(v749, 10045); + int16x8_t v750 = vaddq_s16(v750_tmp, v749); + int16x8_t v751 = vaddq_s16(v748, v750); + int16x8_t v752 = vsubq_s16(v162, v169); + int16x8_t v753 = vqrdmulhq_n_s16(v752, 19705); + int16x8_t v754 = vsubq_s16(v172, v177); + int16x8_t v755 = vqrdmulhq_n_s16(v754, 25746); + int16x8_t v756 = vaddq_s16(v753, v755); + int16x8_t v757 = vaddq_s16(v751, v756); + int16x8_t v758 = vsubq_s16(v186, v193); + int16x8_t v759 = vsubq_s16(v202, v210); + int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 10045); + int16x8_t v760 = vaddq_s16(v760_tmp, v759); + int16x8_t v761 = vaddq_s16(v758, v760); + int16x8_t v762 = vsubq_s16(v215, v218); + int16x8_t v763 = vsubq_s16(v221, v227); + int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 10045); + int16x8_t v764 = vaddq_s16(v764_tmp, v763); + int16x8_t v765 = vaddq_s16(v762, v764); + int16x8_t v766 = vqrdmulhq_n_s16(v765, 19705); + int16x8_t v767 = vaddq_s16(v761, v766); + int16x8_t v768 = vqrdmulhq_n_s16(v767, 17121); + int16x8_t v769 = vaddq_s16(v757, v768); + int16x8_t v770 = vsubq_s16(v236, v239); + int16x8_t v771 = vsubq_s16(v244, v248); + int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 10045); + int16x8_t v772 = vaddq_s16(v772_tmp, v771); + int16x8_t v773 = vaddq_s16(v770, v772); + int16x8_t v774 = vsubq_s16(v255, v262); + int16x8_t v775 = vsubq_s16(v265, v270); + int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 10045); + int16x8_t v776 = vaddq_s16(v776_tmp, v775); + int16x8_t v777 = vaddq_s16(v774, v776); + int16x8_t v778 = vqrdmulhq_n_s16(v777, 19705); + int16x8_t v779 = vaddq_s16(v773, v778); + int16x8_t v780 = vsubq_s16(v277, v280); + int16x8_t v781 = vsubq_s16(v285, v289); + int16x8_t v782_tmp = vqrdmulhq_n_s16(v781, 10045); + int16x8_t v782 = vaddq_s16(v782_tmp, v781); + int16x8_t v783 = vaddq_s16(v780, v782); + int16x8_t v784 = vsubq_s16(v294, v297); + int16x8_t v785 = vsubq_s16(v300, v306); + int16x8_t v786_tmp = vqrdmulhq_n_s16(v785, 10045); + int16x8_t v786 = vaddq_s16(v786_tmp, v785); + int16x8_t v787 = vaddq_s16(v784, v786); + int16x8_t v788 = vqrdmulhq_n_s16(v787, 19705); + int16x8_t v789 = vaddq_s16(v783, v788); + int16x8_t v790 = vqrdmulhq_n_s16(v789, 17121); + int16x8_t v791 = vaddq_s16(v779, v790); + int16x8_t v792 = vqrdmulhq_n_s16(v791, 16563); + int16x8_t v793 = vaddq_s16(v769, v792); + int16x8_t v794 = vqrdmulhq_n_s16(v793, 16429); + int16x8_t v795 = vaddq_s16(v747, v794); + int16x8_t v796 = vsubq_s16(v319, v322); + int16x8_t v797 = vsubq_s16(v327, v331); + int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 10045); + int16x8_t v798 = vaddq_s16(v798_tmp, v797); + int16x8_t v799 = vaddq_s16(v796, v798); + int16x8_t v800 = vsubq_s16(v338, v345); + int16x8_t v801 = vsubq_s16(v348, v353); + int16x8_t v802_tmp = vqrdmulhq_n_s16(v801, 10045); + int16x8_t v802 = vaddq_s16(v802_tmp, v801); + int16x8_t v803 = vaddq_s16(v800, v802); + int16x8_t v804 = vqrdmulhq_n_s16(v803, 19705); + int16x8_t v805 = vaddq_s16(v799, v804); + int16x8_t v806 = vsubq_s16(v362, v369); + int16x8_t v807 = vsubq_s16(v378, v386); + int16x8_t v808_tmp = vqrdmulhq_n_s16(v807, 10045); + int16x8_t v808 = vaddq_s16(v808_tmp, v807); + int16x8_t v809 = vaddq_s16(v806, v808); + int16x8_t v810 = vsubq_s16(v391, v394); + int16x8_t v811 = vsubq_s16(v397, v403); + int16x8_t v812_tmp = vqrdmulhq_n_s16(v811, 10045); + int16x8_t v812 = vaddq_s16(v812_tmp, v811); + int16x8_t v813 = vaddq_s16(v810, v812); + int16x8_t v814 = vqrdmulhq_n_s16(v813, 19705); + int16x8_t v815 = vaddq_s16(v809, v814); + int16x8_t v816 = vqrdmulhq_n_s16(v815, 17121); + int16x8_t v817 = vaddq_s16(v805, v816); + int16x8_t v818 = vsubq_s16(v414, v421); + int16x8_t v819 = vsubq_s16(v430, v438); + int16x8_t v820_tmp = vqrdmulhq_n_s16(v819, 10045); + int16x8_t v820 = vaddq_s16(v820_tmp, v819); + int16x8_t v821 = vaddq_s16(v818, v820); + int16x8_t v822 = vsubq_s16(v449, v464); + int16x8_t v823 = vsubq_s16(v467, v476); + int16x8_t v824_tmp = vqrdmulhq_n_s16(v823, 10045); + int16x8_t v824 = vaddq_s16(v824_tmp, v823); + int16x8_t v825 = vaddq_s16(v822, v824); + int16x8_t v826 = vqrdmulhq_n_s16(v825, 19705); + int16x8_t v827 = vaddq_s16(v821, v826); + int16x8_t v828 = vsubq_s16(v483, v486); + int16x8_t v829 = vsubq_s16(v491, v495); + int16x8_t v830_tmp = vqrdmulhq_n_s16(v829, 10045); + int16x8_t v830 = vaddq_s16(v830_tmp, v829); + int16x8_t v831 = vaddq_s16(v828, v830); + int16x8_t v832 = vsubq_s16(v500, v503); + int16x8_t v833 = vsubq_s16(v506, v513); + int16x8_t v834_tmp = vqrdmulhq_n_s16(v833, 10045); + int16x8_t v834 = vaddq_s16(v834_tmp, v833); + int16x8_t v835 = vaddq_s16(v832, v834); + int16x8_t v836 = vqrdmulhq_n_s16(v835, 19705); + int16x8_t v837 = vaddq_s16(v831, v836); + int16x8_t v838 = vqrdmulhq_n_s16(v837, 17121); + int16x8_t v839 = vaddq_s16(v827, v838); + int16x8_t v840 = vqrdmulhq_n_s16(v839, 16563); + int16x8_t v841 = vaddq_s16(v817, v840); + int16x8_t v842 = vsubq_s16(v524, v527); + int16x8_t v843 = vsubq_s16(v532, v536); + int16x8_t v844_tmp = vqrdmulhq_n_s16(v843, 10045); + int16x8_t v844 = vaddq_s16(v844_tmp, v843); + int16x8_t v845 = vaddq_s16(v842, v844); + int16x8_t v846 = vsubq_s16(v543, v550); + int16x8_t v847 = vsubq_s16(v553, v558); + int16x8_t v848_tmp = vqrdmulhq_n_s16(v847, 10045); + int16x8_t v848 = vaddq_s16(v848_tmp, v847); + int16x8_t v849 = vaddq_s16(v846, v848); + int16x8_t v850 = vqrdmulhq_n_s16(v849, 19705); + int16x8_t v851 = vaddq_s16(v845, v850); + int16x8_t v852 = vsubq_s16(v567, v574); + int16x8_t v853 = vsubq_s16(v583, v591); + int16x8_t v854_tmp = vqrdmulhq_n_s16(v853, 10045); + int16x8_t v854 = vaddq_s16(v854_tmp, v853); + int16x8_t v855 = vaddq_s16(v852, v854); + int16x8_t v856 = vsubq_s16(v596, v599); + int16x8_t v857 = vsubq_s16(v602, v608); + int16x8_t v858_tmp = vqrdmulhq_n_s16(v857, 10045); + int16x8_t v858 = vaddq_s16(v858_tmp, v857); + int16x8_t v859 = vaddq_s16(v856, v858); + int16x8_t v860 = vqrdmulhq_n_s16(v859, 19705); + int16x8_t v861 = vaddq_s16(v855, v860); + int16x8_t v862 = vqrdmulhq_n_s16(v861, 17121); + int16x8_t v863 = vaddq_s16(v851, v862); + int16x8_t v864 = vsubq_s16(v617, v620); + int16x8_t v865 = vsubq_s16(v625, v629); + int16x8_t v866_tmp = vqrdmulhq_n_s16(v865, 10045); + int16x8_t v866 = vaddq_s16(v866_tmp, v865); + int16x8_t v867 = vaddq_s16(v864, v866); + int16x8_t v868 = vsubq_s16(v636, v643); + int16x8_t v869 = vsubq_s16(v646, v651); + int16x8_t v870_tmp = vqrdmulhq_n_s16(v869, 10045); + int16x8_t v870 = vaddq_s16(v870_tmp, v869); + int16x8_t v871 = vaddq_s16(v868, v870); + int16x8_t v872 = vqrdmulhq_n_s16(v871, 19705); + int16x8_t v873 = vaddq_s16(v867, v872); + int16x8_t v874 = vsubq_s16(v658, v661); + int16x8_t v875 = vsubq_s16(v666, v670); + int16x8_t v876_tmp = vqrdmulhq_n_s16(v875, 10045); + int16x8_t v876 = vaddq_s16(v876_tmp, v875); + int16x8_t v877 = vaddq_s16(v874, v876); + int16x8_t v878 = vsubq_s16(v675, v678); + int16x8_t v879 = vsubq_s16(v681, v688); + int16x8_t v880_tmp = vqrdmulhq_n_s16(v879, 10045); + int16x8_t v880 = vaddq_s16(v880_tmp, v879); + int16x8_t v881 = vaddq_s16(v878, v880); + int16x8_t v882 = vqrdmulhq_n_s16(v881, 19705); + int16x8_t v883 = vaddq_s16(v877, v882); + int16x8_t v884 = vqrdmulhq_n_s16(v883, 17121); + int16x8_t v885 = vaddq_s16(v873, v884); + int16x8_t v886 = vqrdmulhq_n_s16(v885, 16563); + int16x8_t v887 = vaddq_s16(v863, v886); + int16x8_t v888 = vqrdmulhq_n_s16(v887, 16429); + int16x8_t v889 = vaddq_s16(v841, v888); + int16x8_t v890 = vqrdmulhq_n_s16(v889, 16395); + int16x8_t v891 = vaddq_s16(v795, v890); + int16x8_t v892 = vsubq_s16(v702, v704); + int16x8_t v893 = vsubq_s16(v706, v708); + int16x8_t v894 = vqrdmulhq_n_s16(v893, 29490); + int16x8_t v895 = vaddq_s16(v892, v894); + int16x8_t v896 = vsubq_s16(v712, v714); + int16x8_t v897 = vsubq_s16(v716, v718); + int16x8_t v898 = vqrdmulhq_n_s16(v897, 29490); + int16x8_t v899 = vaddq_s16(v896, v898); + int16x8_t v900 = vqrdmulhq_n_s16(v899, 18578); + int16x8_t v901 = vaddq_s16(v895, v900); + int16x8_t v902 = vsubq_s16(v724, v726); + int16x8_t v903 = vsubq_s16(v728, v730); + int16x8_t v904 = vqrdmulhq_n_s16(v903, 29490); + int16x8_t v905 = vaddq_s16(v902, v904); + int16x8_t v906 = vsubq_s16(v734, v736); + int16x8_t v907 = vsubq_s16(v738, v740); + int16x8_t v908 = vqrdmulhq_n_s16(v907, 29490); + int16x8_t v909 = vaddq_s16(v906, v908); + int16x8_t v910 = vqrdmulhq_n_s16(v909, 18578); + int16x8_t v911 = vaddq_s16(v905, v910); + int16x8_t v912 = vqrdmulhq_n_s16(v911, 16890); + int16x8_t v913 = vaddq_s16(v901, v912); + int16x8_t v914 = vsubq_s16(v748, v750); + int16x8_t v915_tmp = vqrdmulhq_n_s16(v754, 10045); + int16x8_t v915 = vaddq_s16(v915_tmp, v754); + int16x8_t v916 = vsubq_s16(v752, v915); + int16x8_t v917 = vqrdmulhq_n_s16(v916, 29490); + int16x8_t v918 = vaddq_s16(v914, v917); + int16x8_t v919 = vsubq_s16(v758, v760); + int16x8_t v920 = vsubq_s16(v762, v764); + int16x8_t v921 = vqrdmulhq_n_s16(v920, 29490); + int16x8_t v922 = vaddq_s16(v919, v921); + int16x8_t v923 = vqrdmulhq_n_s16(v922, 18578); + int16x8_t v924 = vaddq_s16(v918, v923); + int16x8_t v925 = vsubq_s16(v770, v772); + int16x8_t v926 = vsubq_s16(v774, v776); + int16x8_t v927 = vqrdmulhq_n_s16(v926, 29490); + int16x8_t v928 = vaddq_s16(v925, v927); + int16x8_t v929 = vsubq_s16(v780, v782); + int16x8_t v930 = vsubq_s16(v784, v786); + int16x8_t v931 = vqrdmulhq_n_s16(v930, 29490); + int16x8_t v932 = vaddq_s16(v929, v931); + int16x8_t v933 = vqrdmulhq_n_s16(v932, 18578); + int16x8_t v934 = vaddq_s16(v928, v933); + int16x8_t v935 = vqrdmulhq_n_s16(v934, 16890); + int16x8_t v936 = vaddq_s16(v924, v935); + int16x8_t v937 = vqrdmulhq_n_s16(v936, 16508); + int16x8_t v938 = vaddq_s16(v913, v937); + int16x8_t v939 = vsubq_s16(v796, v798); + int16x8_t v940 = vsubq_s16(v800, v802); + int16x8_t v941 = vqrdmulhq_n_s16(v940, 29490); + int16x8_t v942 = vaddq_s16(v939, v941); + int16x8_t v943 = vsubq_s16(v806, v808); + int16x8_t v944 = vsubq_s16(v810, v812); + int16x8_t v945 = vqrdmulhq_n_s16(v944, 29490); + int16x8_t v946 = vaddq_s16(v943, v945); + int16x8_t v947 = vqrdmulhq_n_s16(v946, 18578); + int16x8_t v948 = vaddq_s16(v942, v947); + int16x8_t v949 = vsubq_s16(v818, v820); + int16x8_t v950 = vsubq_s16(v822, v824); + int16x8_t v951 = vqrdmulhq_n_s16(v950, 29490); + int16x8_t v952 = vaddq_s16(v949, v951); + int16x8_t v953 = vsubq_s16(v828, v830); + int16x8_t v954 = vsubq_s16(v832, v834); + int16x8_t v955 = vqrdmulhq_n_s16(v954, 29490); + int16x8_t v956 = vaddq_s16(v953, v955); + int16x8_t v957 = vqrdmulhq_n_s16(v956, 18578); + int16x8_t v958 = vaddq_s16(v952, v957); + int16x8_t v959 = vqrdmulhq_n_s16(v958, 16890); + int16x8_t v960 = vaddq_s16(v948, v959); + int16x8_t v961 = vsubq_s16(v842, v844); + int16x8_t v962 = vsubq_s16(v846, v848); + int16x8_t v963 = vqrdmulhq_n_s16(v962, 29490); + int16x8_t v964 = vaddq_s16(v961, v963); + int16x8_t v965 = vsubq_s16(v852, v854); + int16x8_t v966 = vsubq_s16(v856, v858); + int16x8_t v967 = vqrdmulhq_n_s16(v966, 29490); + int16x8_t v968 = vaddq_s16(v965, v967); + int16x8_t v969 = vqrdmulhq_n_s16(v968, 18578); + int16x8_t v970 = vaddq_s16(v964, v969); + int16x8_t v971 = vsubq_s16(v864, v866); + int16x8_t v972 = vsubq_s16(v868, v870); + int16x8_t v973 = vqrdmulhq_n_s16(v972, 29490); + int16x8_t v974 = vaddq_s16(v971, v973); + int16x8_t v975 = vsubq_s16(v874, v876); + int16x8_t v976 = vsubq_s16(v878, v880); + int16x8_t v977 = vqrdmulhq_n_s16(v976, 29490); + int16x8_t v978 = vaddq_s16(v975, v977); + int16x8_t v979 = vqrdmulhq_n_s16(v978, 18578); + int16x8_t v980 = vaddq_s16(v974, v979); + int16x8_t v981 = vqrdmulhq_n_s16(v980, 16890); + int16x8_t v982 = vaddq_s16(v970, v981); + int16x8_t v983 = vqrdmulhq_n_s16(v982, 16508); + int16x8_t v984 = vaddq_s16(v960, v983); + int16x8_t v985 = vqrdmulhq_n_s16(v984, 16415); + int16x8_t v986 = vaddq_s16(v938, v985); + int16x8_t v987 = vsubq_s16(v2, v8); + int16x8_t v988 = vsubq_s16(v15, v22); + int16x8_t v989_tmp = vqrdmulhq_n_s16(v988, 18446); + int16x8_t v989 = vmlaq_n_s16(v989_tmp, v988, 2); + int16x8_t v990 = vaddq_s16(v987, v989); + int16x8_t v991 = vsubq_s16(v31, v41); + int16x8_t v992 = vsubq_s16(v48, v56); + int16x8_t v993_tmp = vqrdmulhq_n_s16(v992, 18446); + int16x8_t v993 = vmlaq_n_s16(v993_tmp, v992, 2); + int16x8_t v994 = vaddq_s16(v991, v993); + int16x8_t v995 = vqrdmulhq_n_s16(v994, 21195); + int16x8_t v996 = vaddq_s16(v990, v995); + int16x8_t v997 = vsubq_s16(v67, v77); + int16x8_t v998 = vsubq_s16(v90, v99); + int16x8_t v999_tmp = vqrdmulhq_n_s16(v998, 18446); + int16x8_t v999 = vmlaq_n_s16(v999_tmp, v998, 2); + int16x8_t v1000 = vaddq_s16(v997, v999); + int16x8_t v1001 = vsubq_s16(v108, v118); + int16x8_t v1002 = vsubq_s16(v125, v134); + int16x8_t v1003_tmp = vqrdmulhq_n_s16(v1002, 18446); + int16x8_t v1003 = vmlaq_n_s16(v1003_tmp, v1002, 2); + int16x8_t v1004 = vaddq_s16(v1001, v1003); + int16x8_t v1005 = vqrdmulhq_n_s16(v1004, 21195); + int16x8_t v1006 = vaddq_s16(v1000, v1005); + int16x8_t v1007 = vqrdmulhq_n_s16(v1006, 17401); + int16x8_t v1008 = vaddq_s16(v996, v1007); + int16x8_t v1009 = vsubq_s16(v147, v157); + int16x8_t v1010 = vsubq_s16(v170, v179); + int16x8_t v1011_tmp = vqrdmulhq_n_s16(v1010, 18446); + int16x8_t v1011 = vmlaq_n_s16(v1011_tmp, v1010, 2); + int16x8_t v1012 = vaddq_s16(v1009, v1011); + int16x8_t v1013 = vsubq_s16(v194, v212); + int16x8_t v1014 = vsubq_s16(v219, v229); + int16x8_t v1015_tmp = vqrdmulhq_n_s16(v1014, 18446); + int16x8_t v1015 = vmlaq_n_s16(v1015_tmp, v1014, 2); + int16x8_t v1016 = vaddq_s16(v1013, v1015); + int16x8_t v1017 = vqrdmulhq_n_s16(v1016, 21195); + int16x8_t v1018 = vaddq_s16(v1012, v1017); + int16x8_t v1019 = vsubq_s16(v240, v250); + int16x8_t v1020 = vsubq_s16(v263, v272); + int16x8_t v1021_tmp = vqrdmulhq_n_s16(v1020, 18446); + int16x8_t v1021 = vmlaq_n_s16(v1021_tmp, v1020, 2); + int16x8_t v1022 = vaddq_s16(v1019, v1021); + int16x8_t v1023 = vsubq_s16(v281, v291); + int16x8_t v1024 = vsubq_s16(v298, v308); + int16x8_t v1025_tmp = vqrdmulhq_n_s16(v1024, 18446); + int16x8_t v1025 = vmlaq_n_s16(v1025_tmp, v1024, 2); + int16x8_t v1026 = vaddq_s16(v1023, v1025); + int16x8_t v1027 = vqrdmulhq_n_s16(v1026, 21195); + int16x8_t v1028 = vaddq_s16(v1022, v1027); + int16x8_t v1029 = vqrdmulhq_n_s16(v1028, 17401); + int16x8_t v1030 = vaddq_s16(v1018, v1029); + int16x8_t v1031 = vqrdmulhq_n_s16(v1030, 16629); + int16x8_t v1032 = vaddq_s16(v1008, v1031); + int16x8_t v1033 = vsubq_s16(v323, v333); + int16x8_t v1034 = vsubq_s16(v346, v355); + int16x8_t v1035_tmp = vqrdmulhq_n_s16(v1034, 18446); + int16x8_t v1035 = vmlaq_n_s16(v1035_tmp, v1034, 2); + int16x8_t v1036 = vaddq_s16(v1033, v1035); + int16x8_t v1037 = vsubq_s16(v370, v388); + int16x8_t v1038 = vsubq_s16(v395, v405); + int16x8_t v1039_tmp = vqrdmulhq_n_s16(v1038, 18446); + int16x8_t v1039 = vmlaq_n_s16(v1039_tmp, v1038, 2); + int16x8_t v1040 = vaddq_s16(v1037, v1039); + int16x8_t v1041 = vqrdmulhq_n_s16(v1040, 21195); + int16x8_t v1042 = vaddq_s16(v1036, v1041); + int16x8_t v1043 = vsubq_s16(v422, v440); + int16x8_t v1044 = vsubq_s16(v465, v478); + int16x8_t v1045_tmp = vqrdmulhq_n_s16(v1044, 18446); + int16x8_t v1045 = vmlaq_n_s16(v1045_tmp, v1044, 2); + int16x8_t v1046 = vaddq_s16(v1043, v1045); + int16x8_t v1047 = vsubq_s16(v487, v497); + int16x8_t v1048 = vsubq_s16(v504, v515); + int16x8_t v1049_tmp = vqrdmulhq_n_s16(v1048, 18446); + int16x8_t v1049 = vmlaq_n_s16(v1049_tmp, v1048, 2); + int16x8_t v1050 = vaddq_s16(v1047, v1049); + int16x8_t v1051 = vqrdmulhq_n_s16(v1050, 21195); + int16x8_t v1052 = vaddq_s16(v1046, v1051); + int16x8_t v1053 = vqrdmulhq_n_s16(v1052, 17401); + int16x8_t v1054 = vaddq_s16(v1042, v1053); + int16x8_t v1055 = vsubq_s16(v528, v538); + int16x8_t v1056 = vsubq_s16(v551, v560); + int16x8_t v1057_tmp = vqrdmulhq_n_s16(v1056, 18446); + int16x8_t v1057 = vmlaq_n_s16(v1057_tmp, v1056, 2); + int16x8_t v1058 = vaddq_s16(v1055, v1057); + int16x8_t v1059 = vsubq_s16(v575, v593); + int16x8_t v1060 = vsubq_s16(v600, v610); + int16x8_t v1061_tmp = vqrdmulhq_n_s16(v1060, 18446); + int16x8_t v1061 = vmlaq_n_s16(v1061_tmp, v1060, 2); + int16x8_t v1062 = vaddq_s16(v1059, v1061); + int16x8_t v1063 = vqrdmulhq_n_s16(v1062, 21195); + int16x8_t v1064 = vaddq_s16(v1058, v1063); + int16x8_t v1065 = vsubq_s16(v621, v631); + int16x8_t v1066 = vsubq_s16(v644, v653); + int16x8_t v1067_tmp = vqrdmulhq_n_s16(v1066, 18446); + int16x8_t v1067 = vmlaq_n_s16(v1067_tmp, v1066, 2); + int16x8_t v1068 = vaddq_s16(v1065, v1067); + int16x8_t v1069 = vsubq_s16(v662, v672); + int16x8_t v1070 = vsubq_s16(v679, v690); + int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 18446); + int16x8_t v1071 = vmlaq_n_s16(v1071_tmp, v1070, 2); + int16x8_t v1072 = vaddq_s16(v1069, v1071); + int16x8_t v1073 = vqrdmulhq_n_s16(v1072, 21195); + int16x8_t v1074 = vaddq_s16(v1068, v1073); + int16x8_t v1075 = vqrdmulhq_n_s16(v1074, 17401); + int16x8_t v1076 = vaddq_s16(v1064, v1075); + int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 16629); + int16x8_t v1078 = vaddq_s16(v1054, v1077); + int16x8_t v1079 = vqrdmulhq_n_s16(v1078, 16445); + int16x8_t v1080 = vaddq_s16(v1032, v1079); + int16x8_t v1081 = vsubq_s16(v987, v989); + int16x8_t v1082 = vsubq_s16(v991, v993); + int16x8_t v1083 = vqrdmulhq_n_s16(v1082, 25826); + int16x8_t v1084 = vaddq_s16(v1081, v1083); + int16x8_t v1085 = vsubq_s16(v997, v999); + int16x8_t v1086 = vsubq_s16(v1001, v1003); + int16x8_t v1087 = vqrdmulhq_n_s16(v1086, 25826); + int16x8_t v1088 = vaddq_s16(v1085, v1087); + int16x8_t v1089 = vqrdmulhq_n_s16(v1088, 18124); + int16x8_t v1090 = vaddq_s16(v1084, v1089); + int16x8_t v1091 = vsubq_s16(v1009, v1011); + int16x8_t v1092 = vsubq_s16(v1013, v1015); + int16x8_t v1093 = vqrdmulhq_n_s16(v1092, 25826); + int16x8_t v1094 = vaddq_s16(v1091, v1093); + int16x8_t v1095 = vsubq_s16(v1019, v1021); + int16x8_t v1096 = vsubq_s16(v1023, v1025); + int16x8_t v1097 = vqrdmulhq_n_s16(v1096, 25826); + int16x8_t v1098 = vaddq_s16(v1095, v1097); + int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 18124); + int16x8_t v1100 = vaddq_s16(v1094, v1099); + int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16792); + int16x8_t v1102 = vaddq_s16(v1090, v1101); + int16x8_t v1103 = vsubq_s16(v1033, v1035); + int16x8_t v1104 = vsubq_s16(v1037, v1039); + int16x8_t v1105 = vqrdmulhq_n_s16(v1104, 25826); + int16x8_t v1106 = vaddq_s16(v1103, v1105); + int16x8_t v1107 = vsubq_s16(v1043, v1045); + int16x8_t v1108 = vsubq_s16(v1047, v1049); + int16x8_t v1109 = vqrdmulhq_n_s16(v1108, 25826); + int16x8_t v1110 = vaddq_s16(v1107, v1109); + int16x8_t v1111 = vqrdmulhq_n_s16(v1110, 18124); + int16x8_t v1112 = vaddq_s16(v1106, v1111); + int16x8_t v1113 = vsubq_s16(v1055, v1057); + int16x8_t v1114 = vsubq_s16(v1059, v1061); + int16x8_t v1115 = vqrdmulhq_n_s16(v1114, 25826); + int16x8_t v1116 = vaddq_s16(v1113, v1115); + int16x8_t v1117 = vsubq_s16(v1065, v1067); + int16x8_t v1118 = vsubq_s16(v1069, v1071); + int16x8_t v1119 = vqrdmulhq_n_s16(v1118, 25826); + int16x8_t v1120 = vaddq_s16(v1117, v1119); + int16x8_t v1121 = vqrdmulhq_n_s16(v1120, 18124); + int16x8_t v1122 = vaddq_s16(v1116, v1121); + int16x8_t v1123 = vqrdmulhq_n_s16(v1122, 16792); + int16x8_t v1124 = vaddq_s16(v1112, v1123); + int16x8_t v1125 = vqrdmulhq_n_s16(v1124, 16484); + int16x8_t v1126 = vaddq_s16(v1102, v1125); + int16x8_t v1127 = vsubq_s16(v892, v894); + int16x8_t v1128 = vsubq_s16(v896, v898); + int16x8_t v1129_tmp = vqrdmulhq_n_s16(v1128, 1988); + int16x8_t v1129 = vaddq_s16(v1129_tmp, v1128); + int16x8_t v1130 = vaddq_s16(v1127, v1129); + int16x8_t v1131 = vsubq_s16(v902, v904); + int16x8_t v1132 = vsubq_s16(v906, v908); + int16x8_t v1133_tmp = vqrdmulhq_n_s16(v1132, 1988); + int16x8_t v1133 = vaddq_s16(v1133_tmp, v1132); + int16x8_t v1134 = vaddq_s16(v1131, v1133); + int16x8_t v1135 = vqrdmulhq_n_s16(v1134, 19102); + int16x8_t v1136 = vaddq_s16(v1130, v1135); + int16x8_t v1137 = vsubq_s16(v914, v917); + int16x8_t v1138 = vsubq_s16(v919, v921); + int16x8_t v1139_tmp = vqrdmulhq_n_s16(v1138, 1988); + int16x8_t v1139 = vaddq_s16(v1139_tmp, v1138); + int16x8_t v1140 = vaddq_s16(v1137, v1139); + int16x8_t v1141 = vsubq_s16(v925, v927); + int16x8_t v1142 = vsubq_s16(v929, v931); + int16x8_t v1143_tmp = vqrdmulhq_n_s16(v1142, 1988); + int16x8_t v1143 = vaddq_s16(v1143_tmp, v1142); + int16x8_t v1144 = vaddq_s16(v1141, v1143); + int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 19102); + int16x8_t v1146 = vaddq_s16(v1140, v1145); + int16x8_t v1147 = vqrdmulhq_n_s16(v1146, 17000); + int16x8_t v1148 = vaddq_s16(v1136, v1147); + int16x8_t v1149 = vsubq_s16(v939, v941); + int16x8_t v1150 = vsubq_s16(v943, v945); + int16x8_t v1151_tmp = vqrdmulhq_n_s16(v1150, 1988); + int16x8_t v1151 = vaddq_s16(v1151_tmp, v1150); + int16x8_t v1152 = vaddq_s16(v1149, v1151); + int16x8_t v1153 = vsubq_s16(v949, v951); + int16x8_t v1154 = vsubq_s16(v953, v955); + int16x8_t v1155_tmp = vqrdmulhq_n_s16(v1154, 1988); + int16x8_t v1155 = vaddq_s16(v1155_tmp, v1154); + int16x8_t v1156 = vaddq_s16(v1153, v1155); + int16x8_t v1157 = vqrdmulhq_n_s16(v1156, 19102); + int16x8_t v1158 = vaddq_s16(v1152, v1157); + int16x8_t v1159 = vsubq_s16(v961, v963); + int16x8_t v1160 = vsubq_s16(v965, v967); + int16x8_t v1161_tmp = vqrdmulhq_n_s16(v1160, 1988); + int16x8_t v1161 = vaddq_s16(v1161_tmp, v1160); + int16x8_t v1162 = vaddq_s16(v1159, v1161); + int16x8_t v1163 = vsubq_s16(v971, v973); + int16x8_t v1164 = vsubq_s16(v975, v977); + int16x8_t v1165_tmp = vqrdmulhq_n_s16(v1164, 1988); + int16x8_t v1165 = vaddq_s16(v1165_tmp, v1164); + int16x8_t v1166 = vaddq_s16(v1163, v1165); + int16x8_t v1167 = vqrdmulhq_n_s16(v1166, 19102); + int16x8_t v1168 = vaddq_s16(v1162, v1167); + int16x8_t v1169 = vqrdmulhq_n_s16(v1168, 17000); + int16x8_t v1170 = vaddq_s16(v1158, v1169); + int16x8_t v1171 = vqrdmulhq_n_s16(v1170, 16534); + int16x8_t v1172 = vaddq_s16(v1148, v1171); + int16x8_t v1173 = vsubq_s16(v705, v710); + int16x8_t v1174 = vsubq_s16(v715, v720); + int16x8_t v1175_tmp = vqrdmulhq_n_s16(v1174, 23673); + int16x8_t v1175 = vaddq_s16(v1175_tmp, v1174); + int16x8_t v1176 = vaddq_s16(v1173, v1175); + int16x8_t v1177 = vsubq_s16(v727, v732); + int16x8_t v1178 = vsubq_s16(v737, v742); + int16x8_t v1179_tmp = vqrdmulhq_n_s16(v1178, 23673); + int16x8_t v1179 = vaddq_s16(v1179_tmp, v1178); + int16x8_t v1180 = vaddq_s16(v1177, v1179); + int16x8_t v1181 = vqrdmulhq_n_s16(v1180, 20398); + int16x8_t v1182 = vaddq_s16(v1176, v1181); + int16x8_t v1183 = vsubq_s16(v751, v756); + int16x8_t v1184 = vsubq_s16(v761, v766); + int16x8_t v1185_tmp = vqrdmulhq_n_s16(v1184, 23673); + int16x8_t v1185 = vaddq_s16(v1185_tmp, v1184); + int16x8_t v1186 = vaddq_s16(v1183, v1185); + int16x8_t v1187 = vsubq_s16(v773, v778); + int16x8_t v1188 = vsubq_s16(v783, v788); + int16x8_t v1189_tmp = vqrdmulhq_n_s16(v1188, 23673); + int16x8_t v1189 = vaddq_s16(v1189_tmp, v1188); + int16x8_t v1190 = vaddq_s16(v1187, v1189); + int16x8_t v1191 = vqrdmulhq_n_s16(v1190, 20398); + int16x8_t v1192 = vaddq_s16(v1186, v1191); + int16x8_t v1193 = vqrdmulhq_n_s16(v1192, 17255); + int16x8_t v1194 = vaddq_s16(v1182, v1193); + int16x8_t v1195 = vsubq_s16(v799, v804); + int16x8_t v1196 = vsubq_s16(v809, v814); + int16x8_t v1197_tmp = vqrdmulhq_n_s16(v1196, 23673); + int16x8_t v1197 = vaddq_s16(v1197_tmp, v1196); + int16x8_t v1198 = vaddq_s16(v1195, v1197); + int16x8_t v1199 = vsubq_s16(v821, v826); + int16x8_t v1200 = vsubq_s16(v831, v836); + int16x8_t v1201_tmp = vqrdmulhq_n_s16(v1200, 23673); + int16x8_t v1201 = vaddq_s16(v1201_tmp, v1200); + int16x8_t v1202 = vaddq_s16(v1199, v1201); + int16x8_t v1203 = vqrdmulhq_n_s16(v1202, 20398); + int16x8_t v1204 = vaddq_s16(v1198, v1203); + int16x8_t v1205 = vsubq_s16(v845, v850); + int16x8_t v1206 = vsubq_s16(v855, v860); + int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 23673); + int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206); + int16x8_t v1208 = vaddq_s16(v1205, v1207); + int16x8_t v1209 = vsubq_s16(v867, v872); + int16x8_t v1210 = vsubq_s16(v877, v882); + int16x8_t v1211_tmp = vqrdmulhq_n_s16(v1210, 23673); + int16x8_t v1211 = vaddq_s16(v1211_tmp, v1210); + int16x8_t v1212 = vaddq_s16(v1209, v1211); + int16x8_t v1213 = vqrdmulhq_n_s16(v1212, 20398); + int16x8_t v1214 = vaddq_s16(v1208, v1213); + int16x8_t v1215 = vqrdmulhq_n_s16(v1214, 17255); + int16x8_t v1216 = vaddq_s16(v1204, v1215); + int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 16595); + int16x8_t v1218 = vaddq_s16(v1194, v1217); + int16x8_t v1219 = vsubq_s16(v9, v24); + int16x8_t v1220 = vsubq_s16(v42, v58); + int16x8_t v1221_tmp = vqrdmulhq_n_s16(v1220, 3314); + int16x8_t v1221 = vmlaq_n_s16(v1221_tmp, v1220, 5); + int16x8_t v1222 = vaddq_s16(v1219, v1221); + int16x8_t v1223 = vsubq_s16(v78, v101); + int16x8_t v1224 = vsubq_s16(v119, v136); + int16x8_t v1225_tmp = vqrdmulhq_n_s16(v1224, 3314); + int16x8_t v1225 = vmlaq_n_s16(v1225_tmp, v1224, 5); + int16x8_t v1226 = vaddq_s16(v1223, v1225); + int16x8_t v1227 = vqrdmulhq_n_s16(v1226, 22112); + int16x8_t v1228 = vaddq_s16(v1222, v1227); + int16x8_t v1229 = vsubq_s16(v158, v181); + int16x8_t v1230 = vsubq_s16(v213, v231); + int16x8_t v1231_tmp = vqrdmulhq_n_s16(v1230, 3314); + int16x8_t v1231 = vmlaq_n_s16(v1231_tmp, v1230, 5); + int16x8_t v1232 = vaddq_s16(v1229, v1231); + int16x8_t v1233 = vsubq_s16(v251, v274); + int16x8_t v1234 = vsubq_s16(v292, v310); + int16x8_t v1235_tmp = vqrdmulhq_n_s16(v1234, 3314); + int16x8_t v1235 = vmlaq_n_s16(v1235_tmp, v1234, 5); + int16x8_t v1236 = vaddq_s16(v1233, v1235); + int16x8_t v1237 = vqrdmulhq_n_s16(v1236, 22112); + int16x8_t v1238 = vaddq_s16(v1232, v1237); + int16x8_t v1239 = vqrdmulhq_n_s16(v1238, 17561); + int16x8_t v1240 = vaddq_s16(v1228, v1239); + int16x8_t v1241 = vsubq_s16(v334, v357); + int16x8_t v1242 = vsubq_s16(v389, v407); + int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 3314); + int16x8_t v1243 = vmlaq_n_s16(v1243_tmp, v1242, 5); + int16x8_t v1244 = vaddq_s16(v1241, v1243); + int16x8_t v1245 = vsubq_s16(v441, v480); + int16x8_t v1246 = vsubq_s16(v498, v517); + int16x8_t v1247_tmp = vqrdmulhq_n_s16(v1246, 3314); + int16x8_t v1247 = vmlaq_n_s16(v1247_tmp, v1246, 5); + int16x8_t v1248 = vaddq_s16(v1245, v1247); + int16x8_t v1249 = vqrdmulhq_n_s16(v1248, 22112); + int16x8_t v1250 = vaddq_s16(v1244, v1249); + int16x8_t v1251 = vsubq_s16(v539, v562); + int16x8_t v1252 = vsubq_s16(v594, v612); + int16x8_t v1253_tmp = vqrdmulhq_n_s16(v1252, 3314); + int16x8_t v1253 = vmlaq_n_s16(v1253_tmp, v1252, 5); + int16x8_t v1254 = vaddq_s16(v1251, v1253); + int16x8_t v1255 = vsubq_s16(v632, v655); + int16x8_t v1256 = vsubq_s16(v673, v692); + int16x8_t v1257_tmp = vqrdmulhq_n_s16(v1256, 3314); + int16x8_t v1257 = vmlaq_n_s16(v1257_tmp, v1256, 5); + int16x8_t v1258 = vaddq_s16(v1255, v1257); + int16x8_t v1259 = vqrdmulhq_n_s16(v1258, 22112); + int16x8_t v1260 = vaddq_s16(v1254, v1259); + int16x8_t v1261 = vqrdmulhq_n_s16(v1260, 17561); + int16x8_t v1262 = vaddq_s16(v1250, v1261); + int16x8_t v1263 = vqrdmulhq_n_s16(v1262, 16666); + int16x8_t v1264 = vaddq_s16(v1240, v1263); + int16x8_t v1265 = vsubq_s16(v1219, v1221); + int16x8_t v1266 = vsubq_s16(v1223, v1225); + int16x8_t v1267 = vqrdmulhq_n_s16(v1266, 24397); + int16x8_t v1268 = vaddq_s16(v1265, v1267); + int16x8_t v1269 = vsubq_s16(v1229, v1231); + int16x8_t v1270 = vsubq_s16(v1233, v1235); + int16x8_t v1271 = vqrdmulhq_n_s16(v1270, 24397); + int16x8_t v1272 = vaddq_s16(v1269, v1271); + int16x8_t v1273 = vqrdmulhq_n_s16(v1272, 17921); + int16x8_t v1274 = vaddq_s16(v1268, v1273); + int16x8_t v1275 = vsubq_s16(v1241, v1243); + int16x8_t v1276 = vsubq_s16(v1245, v1247); + int16x8_t v1277 = vqrdmulhq_n_s16(v1276, 24397); + int16x8_t v1278 = vaddq_s16(v1275, v1277); + int16x8_t v1279 = vsubq_s16(v1251, v1253); + int16x8_t v1280 = vsubq_s16(v1255, v1257); + int16x8_t v1281 = vqrdmulhq_n_s16(v1280, 24397); + int16x8_t v1282 = vaddq_s16(v1279, v1281); + int16x8_t v1283 = vqrdmulhq_n_s16(v1282, 17921); + int16x8_t v1284 = vaddq_s16(v1278, v1283); + int16x8_t v1285 = vqrdmulhq_n_s16(v1284, 16747); + int16x8_t v1286 = vaddq_s16(v1274, v1285); + int16x8_t v1287 = vsubq_s16(v1173, v1175); + int16x8_t v1288 = vsubq_s16(v1177, v1179); + int16x8_t v1289 = vqrdmulhq_n_s16(v1288, 27504); + int16x8_t v1290 = vaddq_s16(v1287, v1289); + int16x8_t v1291 = vsubq_s16(v1183, v1185); + int16x8_t v1292 = vsubq_s16(v1187, v1189); + int16x8_t v1293 = vqrdmulhq_n_s16(v1292, 27504); + int16x8_t v1294 = vaddq_s16(v1291, v1293); + int16x8_t v1295 = vqrdmulhq_n_s16(v1294, 18343); + int16x8_t v1296 = vaddq_s16(v1290, v1295); + int16x8_t v1297 = vsubq_s16(v1195, v1197); + int16x8_t v1298 = vsubq_s16(v1199, v1201); + int16x8_t v1299 = vqrdmulhq_n_s16(v1298, 27504); + int16x8_t v1300 = vaddq_s16(v1297, v1299); + int16x8_t v1301 = vsubq_s16(v1205, v1207); + int16x8_t v1302 = vsubq_s16(v1209, v1211); + int16x8_t v1303 = vqrdmulhq_n_s16(v1302, 27504); + int16x8_t v1304 = vaddq_s16(v1301, v1303); + int16x8_t v1305 = vqrdmulhq_n_s16(v1304, 18343); + int16x8_t v1306 = vaddq_s16(v1300, v1305); + int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 16840); + int16x8_t v1308 = vaddq_s16(v1296, v1307); + int16x8_t v1309 = vsubq_s16(v1127, v1129); + int16x8_t v1310 = vsubq_s16(v1131, v1133); + int16x8_t v1311 = vqrdmulhq_n_s16(v1310, 31869); + int16x8_t v1312 = vaddq_s16(v1309, v1311); + int16x8_t v1313 = vsubq_s16(v1137, v1139); + int16x8_t v1314 = vsubq_s16(v1141, v1143); + int16x8_t v1315 = vqrdmulhq_n_s16(v1314, 31869); + int16x8_t v1316 = vaddq_s16(v1313, v1315); + int16x8_t v1317 = vqrdmulhq_n_s16(v1316, 18830); + int16x8_t v1318 = vaddq_s16(v1312, v1317); + int16x8_t v1319 = vsubq_s16(v1149, v1151); + int16x8_t v1320 = vsubq_s16(v1153, v1155); + int16x8_t v1321 = vqrdmulhq_n_s16(v1320, 31869); + int16x8_t v1322 = vaddq_s16(v1319, v1321); + int16x8_t v1323 = vsubq_s16(v1159, v1161); + int16x8_t v1324 = vsubq_s16(v1163, v1165); + int16x8_t v1325 = vqrdmulhq_n_s16(v1324, 31869); + int16x8_t v1326 = vaddq_s16(v1323, v1325); + int16x8_t v1327 = vqrdmulhq_n_s16(v1326, 18830); + int16x8_t v1328 = vaddq_s16(v1322, v1327); + int16x8_t v1329 = vqrdmulhq_n_s16(v1328, 16944); + int16x8_t v1330 = vaddq_s16(v1318, v1329); + int16x8_t v1331 = vsubq_s16(v1081, v1083); + int16x8_t v1332 = vsubq_s16(v1085, v1087); + int16x8_t v1333_tmp = vqrdmulhq_n_s16(v1332, 5552); + int16x8_t v1333 = vaddq_s16(v1333_tmp, v1332); + int16x8_t v1334 = vaddq_s16(v1331, v1333); + int16x8_t v1335 = vsubq_s16(v1091, v1093); + int16x8_t v1336 = vsubq_s16(v1095, v1097); + int16x8_t v1337_tmp = vqrdmulhq_n_s16(v1336, 5552); + int16x8_t v1337 = vaddq_s16(v1337_tmp, v1336); + int16x8_t v1338 = vaddq_s16(v1335, v1337); + int16x8_t v1339 = vqrdmulhq_n_s16(v1338, 19393); + int16x8_t v1340 = vaddq_s16(v1334, v1339); + int16x8_t v1341 = vsubq_s16(v1103, v1105); + int16x8_t v1342 = vsubq_s16(v1107, v1109); + int16x8_t v1343_tmp = vqrdmulhq_n_s16(v1342, 5552); + int16x8_t v1343 = vaddq_s16(v1343_tmp, v1342); + int16x8_t v1344 = vaddq_s16(v1341, v1343); + int16x8_t v1345 = vsubq_s16(v1113, v1115); + int16x8_t v1346 = vsubq_s16(v1117, v1119); + int16x8_t v1347_tmp = vqrdmulhq_n_s16(v1346, 5552); + int16x8_t v1347 = vaddq_s16(v1347_tmp, v1346); + int16x8_t v1348 = vaddq_s16(v1345, v1347); + int16x8_t v1349 = vqrdmulhq_n_s16(v1348, 19393); + int16x8_t v1350 = vaddq_s16(v1344, v1349); + int16x8_t v1351 = vqrdmulhq_n_s16(v1350, 17059); + int16x8_t v1352 = vaddq_s16(v1340, v1351); + int16x8_t v1353 = vsubq_s16(v990, v995); + int16x8_t v1354 = vsubq_s16(v1000, v1005); + int16x8_t v1355_tmp = vqrdmulhq_n_s16(v1354, 15865); + int16x8_t v1355 = vaddq_s16(v1355_tmp, v1354); + int16x8_t v1356 = vaddq_s16(v1353, v1355); + int16x8_t v1357 = vsubq_s16(v1012, v1017); + int16x8_t v1358 = vsubq_s16(v1022, v1027); + int16x8_t v1359_tmp = vqrdmulhq_n_s16(v1358, 15865); + int16x8_t v1359 = vaddq_s16(v1359_tmp, v1358); + int16x8_t v1360 = vaddq_s16(v1357, v1359); + int16x8_t v1361 = vqrdmulhq_n_s16(v1360, 20040); + int16x8_t v1362 = vaddq_s16(v1356, v1361); + int16x8_t v1363 = vsubq_s16(v1036, v1041); + int16x8_t v1364 = vsubq_s16(v1046, v1051); + int16x8_t v1365_tmp = vqrdmulhq_n_s16(v1364, 15865); + int16x8_t v1365 = vaddq_s16(v1365_tmp, v1364); + int16x8_t v1366 = vaddq_s16(v1363, v1365); + int16x8_t v1367 = vsubq_s16(v1058, v1063); + int16x8_t v1368 = vsubq_s16(v1068, v1073); + int16x8_t v1369_tmp = vqrdmulhq_n_s16(v1368, 15865); + int16x8_t v1369 = vaddq_s16(v1369_tmp, v1368); + int16x8_t v1370 = vaddq_s16(v1367, v1369); + int16x8_t v1371 = vqrdmulhq_n_s16(v1370, 20040); + int16x8_t v1372 = vaddq_s16(v1366, v1371); + int16x8_t v1373 = vqrdmulhq_n_s16(v1372, 17187); + int16x8_t v1374 = vaddq_s16(v1362, v1373); + int16x8_t v1375 = vsubq_s16(v895, v900); + int16x8_t v1376 = vsubq_s16(v905, v910); + int16x8_t v1377_tmp = vqrdmulhq_n_s16(v1376, 1893); + int16x8_t v1377 = vmlaq_n_s16(v1377_tmp, v1376, 2); + int16x8_t v1378 = vaddq_s16(v1375, v1377); + int16x8_t v1379 = vsubq_s16(v918, v923); + int16x8_t v1380 = vsubq_s16(v928, v933); + int16x8_t v1381_tmp = vqrdmulhq_n_s16(v1380, 1893); + int16x8_t v1381 = vmlaq_n_s16(v1381_tmp, v1380, 2); + int16x8_t v1382 = vaddq_s16(v1379, v1381); + int16x8_t v1383 = vqrdmulhq_n_s16(v1382, 20783); + int16x8_t v1384 = vaddq_s16(v1378, v1383); + int16x8_t v1385 = vsubq_s16(v942, v947); + int16x8_t v1386 = vsubq_s16(v952, v957); + int16x8_t v1387_tmp = vqrdmulhq_n_s16(v1386, 1893); + int16x8_t v1387 = vmlaq_n_s16(v1387_tmp, v1386, 2); + int16x8_t v1388 = vaddq_s16(v1385, v1387); + int16x8_t v1389 = vsubq_s16(v964, v969); + int16x8_t v1390 = vsubq_s16(v974, v979); + int16x8_t v1391_tmp = vqrdmulhq_n_s16(v1390, 1893); + int16x8_t v1391 = vmlaq_n_s16(v1391_tmp, v1390, 2); + int16x8_t v1392 = vaddq_s16(v1389, v1391); + int16x8_t v1393 = vqrdmulhq_n_s16(v1392, 20783); + int16x8_t v1394 = vaddq_s16(v1388, v1393); + int16x8_t v1395 = vqrdmulhq_n_s16(v1394, 17326); + int16x8_t v1396 = vaddq_s16(v1384, v1395); + int16x8_t v1397 = vsubq_s16(v711, v722); + int16x8_t v1398 = vsubq_s16(v733, v744); + int16x8_t v1399_tmp = vqrdmulhq_n_s16(v1398, 13357); + int16x8_t v1399 = vmlaq_n_s16(v1399_tmp, v1398, 3); + int16x8_t v1400 = vaddq_s16(v1397, v1399); + int16x8_t v1401 = vsubq_s16(v757, v768); + int16x8_t v1402 = vsubq_s16(v779, v790); + int16x8_t v1403_tmp = vqrdmulhq_n_s16(v1402, 13357); + int16x8_t v1403 = vmlaq_n_s16(v1403_tmp, v1402, 3); + int16x8_t v1404 = vaddq_s16(v1401, v1403); + int16x8_t v1405 = vqrdmulhq_n_s16(v1404, 21637); + int16x8_t v1406 = vaddq_s16(v1400, v1405); + int16x8_t v1407 = vsubq_s16(v805, v816); + int16x8_t v1408 = vsubq_s16(v827, v838); + int16x8_t v1409_tmp = vqrdmulhq_n_s16(v1408, 13357); + int16x8_t v1409 = vmlaq_n_s16(v1409_tmp, v1408, 3); + int16x8_t v1410 = vaddq_s16(v1407, v1409); + int16x8_t v1411 = vsubq_s16(v851, v862); + int16x8_t v1412 = vsubq_s16(v873, v884); + int16x8_t v1413_tmp = vqrdmulhq_n_s16(v1412, 13357); + int16x8_t v1413 = vmlaq_n_s16(v1413_tmp, v1412, 3); + int16x8_t v1414 = vaddq_s16(v1411, v1413); + int16x8_t v1415 = vqrdmulhq_n_s16(v1414, 21637); + int16x8_t v1416 = vaddq_s16(v1410, v1415); + int16x8_t v1417 = vqrdmulhq_n_s16(v1416, 17479); + int16x8_t v1418 = vaddq_s16(v1406, v1417); + int16x8_t v1419 = vsubq_s16(v25, v60); + int16x8_t v1420 = vsubq_s16(v102, v138); + int16x8_t v1421_tmp = vqrdmulhq_n_s16(v1420, 6226); + int16x8_t v1421 = vmlaq_n_s16(v1421_tmp, v1420, 10); + int16x8_t v1422 = vaddq_s16(v1419, v1421); + int16x8_t v1423 = vsubq_s16(v182, v233); + int16x8_t v1424 = vsubq_s16(v275, v312); + int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 6226); + int16x8_t v1425 = vmlaq_n_s16(v1425_tmp, v1424, 10); + int16x8_t v1426 = vaddq_s16(v1423, v1425); + int16x8_t v1427 = vqrdmulhq_n_s16(v1426, 22622); + int16x8_t v1428 = vaddq_s16(v1422, v1427); + int16x8_t v1429 = vsubq_s16(v358, v409); + int16x8_t v1430 = vsubq_s16(v481, v519); + int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 6226); + int16x8_t v1431 = vmlaq_n_s16(v1431_tmp, v1430, 10); + int16x8_t v1432 = vaddq_s16(v1429, v1431); + int16x8_t v1433 = vsubq_s16(v563, v614); + int16x8_t v1434 = vsubq_s16(v656, v694); + int16x8_t v1435_tmp = vqrdmulhq_n_s16(v1434, 6226); + int16x8_t v1435 = vmlaq_n_s16(v1435_tmp, v1434, 10); + int16x8_t v1436 = vaddq_s16(v1433, v1435); + int16x8_t v1437 = vqrdmulhq_n_s16(v1436, 22622); + int16x8_t v1438 = vaddq_s16(v1432, v1437); + int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17646); + int16x8_t v1440 = vaddq_s16(v1428, v1439); + int16x8_t v1441 = vsubq_s16(v1419, v1421); + int16x8_t v1442 = vsubq_s16(v1423, v1425); + int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 23761); + int16x8_t v1444 = vaddq_s16(v1441, v1443); + int16x8_t v1445 = vsubq_s16(v1429, v1431); + int16x8_t v1446 = vsubq_s16(v1433, v1435); + int16x8_t v1447 = vqrdmulhq_n_s16(v1446, 23761); + int16x8_t v1448 = vaddq_s16(v1445, v1447); + int16x8_t v1449 = vqrdmulhq_n_s16(v1448, 17826); + int16x8_t v1450 = vaddq_s16(v1444, v1449); + int16x8_t v1451 = vsubq_s16(v1397, v1399); + int16x8_t v1452 = vsubq_s16(v1401, v1403); + int16x8_t v1453 = vqrdmulhq_n_s16(v1452, 25084); + int16x8_t v1454 = vaddq_s16(v1451, v1453); + int16x8_t v1455 = vsubq_s16(v1407, v1409); + int16x8_t v1456 = vsubq_s16(v1411, v1413); + int16x8_t v1457 = vqrdmulhq_n_s16(v1456, 25084); + int16x8_t v1458 = vaddq_s16(v1455, v1457); + int16x8_t v1459 = vqrdmulhq_n_s16(v1458, 18021); + int16x8_t v1460 = vaddq_s16(v1454, v1459); + int16x8_t v1461 = vsubq_s16(v1375, v1377); + int16x8_t v1462 = vsubq_s16(v1379, v1381); + int16x8_t v1463 = vqrdmulhq_n_s16(v1462, 26631); + int16x8_t v1464 = vaddq_s16(v1461, v1463); + int16x8_t v1465 = vsubq_s16(v1385, v1387); + int16x8_t v1466 = vsubq_s16(v1389, v1391); + int16x8_t v1467 = vqrdmulhq_n_s16(v1466, 26631); + int16x8_t v1468 = vaddq_s16(v1465, v1467); + int16x8_t v1469 = vqrdmulhq_n_s16(v1468, 18231); + int16x8_t v1470 = vaddq_s16(v1464, v1469); + int16x8_t v1471 = vsubq_s16(v1353, v1355); + int16x8_t v1472 = vsubq_s16(v1357, v1359); + int16x8_t v1473 = vqrdmulhq_n_s16(v1472, 28454); + int16x8_t v1474 = vaddq_s16(v1471, v1473); + int16x8_t v1475 = vsubq_s16(v1363, v1365); + int16x8_t v1476 = vsubq_s16(v1367, v1369); + int16x8_t v1477 = vqrdmulhq_n_s16(v1476, 28454); + int16x8_t v1478 = vaddq_s16(v1475, v1477); + int16x8_t v1479 = vqrdmulhq_n_s16(v1478, 18458); + int16x8_t v1480 = vaddq_s16(v1474, v1479); + int16x8_t v1481 = vsubq_s16(v1331, v1333); + int16x8_t v1482 = vsubq_s16(v1335, v1337); + int16x8_t v1483 = vqrdmulhq_n_s16(v1482, 30624); + int16x8_t v1484 = vaddq_s16(v1481, v1483); + int16x8_t v1485 = vsubq_s16(v1341, v1343); + int16x8_t v1486 = vsubq_s16(v1345, v1347); + int16x8_t v1487 = vqrdmulhq_n_s16(v1486, 30624); + int16x8_t v1488 = vaddq_s16(v1485, v1487); + int16x8_t v1489 = vqrdmulhq_n_s16(v1488, 18702); + int16x8_t v1490 = vaddq_s16(v1484, v1489); + int16x8_t v1491 = vsubq_s16(v1309, v1311); + int16x8_t v1492 = vsubq_s16(v1313, v1315); + int16x8_t v1493_tmp = vqrdmulhq_n_s16(v1492, 472); + int16x8_t v1493 = vaddq_s16(v1493_tmp, v1492); + int16x8_t v1494 = vaddq_s16(v1491, v1493); + int16x8_t v1495 = vsubq_s16(v1319, v1321); + int16x8_t v1496 = vsubq_s16(v1323, v1325); + int16x8_t v1497_tmp = vqrdmulhq_n_s16(v1496, 472); + int16x8_t v1497 = vaddq_s16(v1497_tmp, v1496); + int16x8_t v1498 = vaddq_s16(v1495, v1497); + int16x8_t v1499 = vqrdmulhq_n_s16(v1498, 18964); + int16x8_t v1500 = vaddq_s16(v1494, v1499); + int16x8_t v1501 = vsubq_s16(v1287, v1289); + int16x8_t v1502 = vsubq_s16(v1291, v1293); + int16x8_t v1503_tmp = vqrdmulhq_n_s16(v1502, 3672); + int16x8_t v1503 = vaddq_s16(v1503_tmp, v1502); + int16x8_t v1504 = vaddq_s16(v1501, v1503); + int16x8_t v1505 = vsubq_s16(v1297, v1299); + int16x8_t v1506 = vsubq_s16(v1301, v1303); + int16x8_t v1507_tmp = vqrdmulhq_n_s16(v1506, 3672); + int16x8_t v1507 = vaddq_s16(v1507_tmp, v1506); + int16x8_t v1508 = vaddq_s16(v1505, v1507); + int16x8_t v1509 = vqrdmulhq_n_s16(v1508, 19245); + int16x8_t v1510 = vaddq_s16(v1504, v1509); + int16x8_t v1511 = vsubq_s16(v1265, v1267); + int16x8_t v1512 = vsubq_s16(v1269, v1271); + int16x8_t v1513_tmp = vqrdmulhq_n_s16(v1512, 7662); + int16x8_t v1513 = vaddq_s16(v1513_tmp, v1512); + int16x8_t v1514 = vaddq_s16(v1511, v1513); + int16x8_t v1515 = vsubq_s16(v1275, v1277); + int16x8_t v1516 = vsubq_s16(v1279, v1281); + int16x8_t v1517_tmp = vqrdmulhq_n_s16(v1516, 7662); + int16x8_t v1517 = vaddq_s16(v1517_tmp, v1516); + int16x8_t v1518 = vaddq_s16(v1515, v1517); + int16x8_t v1519 = vqrdmulhq_n_s16(v1518, 19546); + int16x8_t v1520 = vaddq_s16(v1514, v1519); + int16x8_t v1521 = vsubq_s16(v1222, v1227); + int16x8_t v1522 = vsubq_s16(v1232, v1237); + int16x8_t v1523_tmp = vqrdmulhq_n_s16(v1522, 12756); + int16x8_t v1523 = vaddq_s16(v1523_tmp, v1522); + int16x8_t v1524 = vaddq_s16(v1521, v1523); + int16x8_t v1525 = vsubq_s16(v1244, v1249); + int16x8_t v1526 = vsubq_s16(v1254, v1259); + int16x8_t v1527_tmp = vqrdmulhq_n_s16(v1526, 12756); + int16x8_t v1527 = vaddq_s16(v1527_tmp, v1526); + int16x8_t v1528 = vaddq_s16(v1525, v1527); + int16x8_t v1529 = vqrdmulhq_n_s16(v1528, 19869); + int16x8_t v1530 = vaddq_s16(v1524, v1529); + int16x8_t v1531 = vsubq_s16(v1176, v1181); + int16x8_t v1532 = vsubq_s16(v1186, v1191); + int16x8_t v1533_tmp = vqrdmulhq_n_s16(v1532, 19463); + int16x8_t v1533 = vaddq_s16(v1533_tmp, v1532); + int16x8_t v1534 = vaddq_s16(v1531, v1533); + int16x8_t v1535 = vsubq_s16(v1198, v1203); + int16x8_t v1536 = vsubq_s16(v1208, v1213); + int16x8_t v1537_tmp = vqrdmulhq_n_s16(v1536, 19463); + int16x8_t v1537 = vaddq_s16(v1537_tmp, v1536); + int16x8_t v1538 = vaddq_s16(v1535, v1537); + int16x8_t v1539 = vqrdmulhq_n_s16(v1538, 20216); + int16x8_t v1540 = vaddq_s16(v1534, v1539); + int16x8_t v1541 = vsubq_s16(v1130, v1135); + int16x8_t v1542 = vsubq_s16(v1140, v1145); + int16x8_t v1543_tmp = vqrdmulhq_n_s16(v1542, 28661); + int16x8_t v1543 = vaddq_s16(v1543_tmp, v1542); + int16x8_t v1544 = vaddq_s16(v1541, v1543); + int16x8_t v1545 = vsubq_s16(v1152, v1157); + int16x8_t v1546 = vsubq_s16(v1162, v1167); + int16x8_t v1547_tmp = vqrdmulhq_n_s16(v1546, 28661); + int16x8_t v1547 = vaddq_s16(v1547_tmp, v1546); + int16x8_t v1548 = vaddq_s16(v1545, v1547); + int16x8_t v1549 = vqrdmulhq_n_s16(v1548, 20587); + int16x8_t v1550 = vaddq_s16(v1544, v1549); + int16x8_t v1551 = vsubq_s16(v1084, v1089); + int16x8_t v1552 = vsubq_s16(v1094, v1099); + int16x8_t v1553_tmp = vqrdmulhq_n_s16(v1552, 9242); + int16x8_t v1553 = vmlaq_n_s16(v1553_tmp, v1552, 2); + int16x8_t v1554 = vaddq_s16(v1551, v1553); + int16x8_t v1555 = vsubq_s16(v1106, v1111); + int16x8_t v1556 = vsubq_s16(v1116, v1121); + int16x8_t v1557_tmp = vqrdmulhq_n_s16(v1556, 9242); + int16x8_t v1557 = vmlaq_n_s16(v1557_tmp, v1556, 2); + int16x8_t v1558 = vaddq_s16(v1555, v1557); + int16x8_t v1559 = vqrdmulhq_n_s16(v1558, 20985); + int16x8_t v1560 = vaddq_s16(v1554, v1559); + int16x8_t v1561 = vsubq_s16(v996, v1007); + int16x8_t v1562 = vsubq_s16(v1018, v1029); + int16x8_t v1563_tmp = vqrdmulhq_n_s16(v1562, 30298); + int16x8_t v1563 = vmlaq_n_s16(v1563_tmp, v1562, 2); + int16x8_t v1564 = vaddq_s16(v1561, v1563); + int16x8_t v1565 = vsubq_s16(v1042, v1053); + int16x8_t v1566 = vsubq_s16(v1064, v1075); + int16x8_t v1567_tmp = vqrdmulhq_n_s16(v1566, 30298); + int16x8_t v1567 = vmlaq_n_s16(v1567_tmp, v1566, 2); + int16x8_t v1568 = vaddq_s16(v1565, v1567); + int16x8_t v1569 = vqrdmulhq_n_s16(v1568, 21412); + int16x8_t v1570 = vaddq_s16(v1564, v1569); + int16x8_t v1571 = vsubq_s16(v901, v912); + int16x8_t v1572 = vsubq_s16(v924, v935); + int16x8_t v1573_tmp = vqrdmulhq_n_s16(v1572, 2773); + int16x8_t v1573 = vmlaq_n_s16(v1573_tmp, v1572, 4); + int16x8_t v1574 = vaddq_s16(v1571, v1573); + int16x8_t v1575 = vsubq_s16(v948, v959); + int16x8_t v1576 = vsubq_s16(v970, v981); + int16x8_t v1577_tmp = vqrdmulhq_n_s16(v1576, 2773); + int16x8_t v1577 = vmlaq_n_s16(v1577_tmp, v1576, 4); + int16x8_t v1578 = vaddq_s16(v1575, v1577); + int16x8_t v1579 = vqrdmulhq_n_s16(v1578, 21871); + int16x8_t v1580 = vaddq_s16(v1574, v1579); + int16x8_t v1581 = vsubq_s16(v723, v746); + int16x8_t v1582 = vsubq_s16(v769, v792); + int16x8_t v1583_tmp = vqrdmulhq_n_s16(v1582, 26108); + int16x8_t v1583 = vmlaq_n_s16(v1583_tmp, v1582, 6); + int16x8_t v1584 = vaddq_s16(v1581, v1583); + int16x8_t v1585 = vsubq_s16(v817, v840); + int16x8_t v1586 = vsubq_s16(v863, v886); + int16x8_t v1587_tmp = vqrdmulhq_n_s16(v1586, 26108); + int16x8_t v1587 = vmlaq_n_s16(v1587_tmp, v1586, 6); + int16x8_t v1588 = vaddq_s16(v1585, v1587); + int16x8_t v1589 = vqrdmulhq_n_s16(v1588, 22363); + int16x8_t v1590 = vaddq_s16(v1584, v1589); + int16x8_t v1591 = vsubq_s16(v61, v140); + int16x8_t v1592 = vsubq_s16(v234, v314); + int16x8_t v1593_tmp = vqrdmulhq_n_s16(v1592, 12251); + int16x8_t v1593 = vmlaq_n_s16(v1593_tmp, v1592, 20); + int16x8_t v1594 = vaddq_s16(v1591, v1593); + int16x8_t v1595 = vsubq_s16(v410, v521); + int16x8_t v1596 = vsubq_s16(v615, v696); + int16x8_t v1597_tmp = vqrdmulhq_n_s16(v1596, 12251); + int16x8_t v1597 = vmlaq_n_s16(v1597_tmp, v1596, 20); + int16x8_t v1598 = vaddq_s16(v1595, v1597); + int16x8_t v1599 = vqrdmulhq_n_s16(v1598, 22891); + int16x8_t v1600 = vaddq_s16(v1594, v1599); + int16x8_t v1601 = vsubq_s16(v1591, v1593); + int16x8_t v1602 = vsubq_s16(v1595, v1597); + int16x8_t v1603 = vqrdmulhq_n_s16(v1602, 23460); + int16x8_t v1604 = vaddq_s16(v1601, v1603); + int16x8_t v1605 = vsubq_s16(v1581, v1583); + int16x8_t v1606 = vsubq_s16(v1585, v1587); + int16x8_t v1607 = vqrdmulhq_n_s16(v1606, 24073); + int16x8_t v1608 = vaddq_s16(v1605, v1607); + int16x8_t v1609 = vsubq_s16(v1571, v1573); + int16x8_t v1610 = vsubq_s16(v1575, v1577); + int16x8_t v1611 = vqrdmulhq_n_s16(v1610, 24734); + int16x8_t v1612 = vaddq_s16(v1609, v1611); + int16x8_t v1613 = vsubq_s16(v1561, v1563); + int16x8_t v1614 = vsubq_s16(v1565, v1567); + int16x8_t v1615 = vqrdmulhq_n_s16(v1614, 25448); + int16x8_t v1616 = vaddq_s16(v1613, v1615); + int16x8_t v1617 = vsubq_s16(v1551, v1553); + int16x8_t v1618 = vsubq_s16(v1555, v1557); + int16x8_t v1619 = vqrdmulhq_n_s16(v1618, 26220); + int16x8_t v1620 = vaddq_s16(v1617, v1619); + int16x8_t v1621 = vsubq_s16(v1541, v1543); + int16x8_t v1622 = vsubq_s16(v1545, v1547); + int16x8_t v1623 = vqrdmulhq_n_s16(v1622, 27058); + int16x8_t v1624 = vaddq_s16(v1621, v1623); + int16x8_t v1625 = vsubq_s16(v1531, v1533); + int16x8_t v1626 = vsubq_s16(v1535, v1537); + int16x8_t v1627 = vqrdmulhq_n_s16(v1626, 27969); + int16x8_t v1628 = vaddq_s16(v1625, v1627); + int16x8_t v1629 = vsubq_s16(v1521, v1523); + int16x8_t v1630 = vsubq_s16(v1525, v1527); + int16x8_t v1631 = vqrdmulhq_n_s16(v1630, 28961); + int16x8_t v1632 = vaddq_s16(v1629, v1631); + int16x8_t v1633 = vsubq_s16(v1511, v1513); + int16x8_t v1634 = vsubq_s16(v1515, v1517); + int16x8_t v1635 = vqrdmulhq_n_s16(v1634, 30044); + int16x8_t v1636 = vaddq_s16(v1633, v1635); + int16x8_t v1637 = vsubq_s16(v1501, v1503); + int16x8_t v1638 = vsubq_s16(v1505, v1507); + int16x8_t v1639 = vqrdmulhq_n_s16(v1638, 31232); + int16x8_t v1640 = vaddq_s16(v1637, v1639); + int16x8_t v1641 = vsubq_s16(v1491, v1493); + int16x8_t v1642 = vsubq_s16(v1495, v1497); + int16x8_t v1643 = vqrdmulhq_n_s16(v1642, 32538); + int16x8_t v1644 = vaddq_s16(v1641, v1643); + int16x8_t v1645 = vsubq_s16(v1481, v1483); + int16x8_t v1646 = vsubq_s16(v1485, v1487); + int16x8_t v1647_tmp = vqrdmulhq_n_s16(v1646, 1211); + int16x8_t v1647 = vaddq_s16(v1647_tmp, v1646); + int16x8_t v1648 = vaddq_s16(v1645, v1647); + int16x8_t v1649 = vsubq_s16(v1471, v1473); + int16x8_t v1650 = vsubq_s16(v1475, v1477); + int16x8_t v1651_tmp = vqrdmulhq_n_s16(v1650, 2808); + int16x8_t v1651 = vaddq_s16(v1651_tmp, v1650); + int16x8_t v1652 = vaddq_s16(v1649, v1651); + int16x8_t v1653 = vsubq_s16(v1461, v1463); + int16x8_t v1654 = vsubq_s16(v1465, v1467); + int16x8_t v1655_tmp = vqrdmulhq_n_s16(v1654, 4586); + int16x8_t v1655 = vaddq_s16(v1655_tmp, v1654); + int16x8_t v1656 = vaddq_s16(v1653, v1655); + int16x8_t v1657 = vsubq_s16(v1451, v1453); + int16x8_t v1658 = vsubq_s16(v1455, v1457); + int16x8_t v1659_tmp = vqrdmulhq_n_s16(v1658, 6576); + int16x8_t v1659 = vaddq_s16(v1659_tmp, v1658); + int16x8_t v1660 = vaddq_s16(v1657, v1659); + int16x8_t v1661 = vsubq_s16(v1441, v1443); + int16x8_t v1662 = vsubq_s16(v1445, v1447); + int16x8_t v1663_tmp = vqrdmulhq_n_s16(v1662, 8817); + int16x8_t v1663 = vaddq_s16(v1663_tmp, v1662); + int16x8_t v1664 = vaddq_s16(v1661, v1663); + int16x8_t v1665 = vsubq_s16(v1422, v1427); + int16x8_t v1666 = vsubq_s16(v1432, v1437); + int16x8_t v1667_tmp = vqrdmulhq_n_s16(v1666, 11356); + int16x8_t v1667 = vaddq_s16(v1667_tmp, v1666); + int16x8_t v1668 = vaddq_s16(v1665, v1667); + int16x8_t v1669 = vsubq_s16(v1400, v1405); + int16x8_t v1670 = vsubq_s16(v1410, v1415); + int16x8_t v1671_tmp = vqrdmulhq_n_s16(v1670, 14256); + int16x8_t v1671 = vaddq_s16(v1671_tmp, v1670); + int16x8_t v1672 = vaddq_s16(v1669, v1671); + int16x8_t v1673 = vsubq_s16(v1378, v1383); + int16x8_t v1674 = vsubq_s16(v1388, v1393); + int16x8_t v1675_tmp = vqrdmulhq_n_s16(v1674, 17596); + int16x8_t v1675 = vaddq_s16(v1675_tmp, v1674); + int16x8_t v1676 = vaddq_s16(v1673, v1675); + int16x8_t v1677 = vsubq_s16(v1356, v1361); + int16x8_t v1678 = vsubq_s16(v1366, v1371); + int16x8_t v1679_tmp = vqrdmulhq_n_s16(v1678, 21483); + int16x8_t v1679 = vaddq_s16(v1679_tmp, v1678); + int16x8_t v1680 = vaddq_s16(v1677, v1679); + int16x8_t v1681 = vsubq_s16(v1334, v1339); + int16x8_t v1682 = vsubq_s16(v1344, v1349); + int16x8_t v1683_tmp = vqrdmulhq_n_s16(v1682, 26057); + int16x8_t v1683 = vaddq_s16(v1683_tmp, v1682); + int16x8_t v1684 = vaddq_s16(v1681, v1683); + int16x8_t v1685 = vsubq_s16(v1312, v1317); + int16x8_t v1686 = vsubq_s16(v1322, v1327); + int16x8_t v1687_tmp = vqrdmulhq_n_s16(v1686, 31517); + int16x8_t v1687 = vaddq_s16(v1687_tmp, v1686); + int16x8_t v1688 = vaddq_s16(v1685, v1687); + int16x8_t v1689 = vsubq_s16(v1290, v1295); + int16x8_t v1690 = vsubq_s16(v1300, v1305); + int16x8_t v1691_tmp = vqrdmulhq_n_s16(v1690, 5373); + int16x8_t v1691 = vmlaq_n_s16(v1691_tmp, v1690, 2); + int16x8_t v1692 = vaddq_s16(v1689, v1691); + int16x8_t v1693 = vsubq_s16(v1268, v1273); + int16x8_t v1694 = vsubq_s16(v1278, v1283); + int16x8_t v1695_tmp = vqrdmulhq_n_s16(v1694, 13571); + int16x8_t v1695 = vmlaq_n_s16(v1695_tmp, v1694, 2); + int16x8_t v1696 = vaddq_s16(v1693, v1695); + int16x8_t v1697 = vsubq_s16(v1228, v1239); + int16x8_t v1698 = vsubq_s16(v1250, v1261); + int16x8_t v1699_tmp = vqrdmulhq_n_s16(v1698, 23975); + int16x8_t v1699 = vmlaq_n_s16(v1699_tmp, v1698, 2); + int16x8_t v1700 = vaddq_s16(v1697, v1699); + int16x8_t v1701 = vsubq_s16(v1182, v1193); + int16x8_t v1702 = vsubq_s16(v1204, v1215); + int16x8_t v1703_tmp = vqrdmulhq_n_s16(v1702, 4832); + int16x8_t v1703 = vmlaq_n_s16(v1703_tmp, v1702, 3); + int16x8_t v1704 = vaddq_s16(v1701, v1703); + int16x8_t v1705 = vsubq_s16(v1136, v1147); + int16x8_t v1706 = vsubq_s16(v1158, v1169); + int16x8_t v1707_tmp = vqrdmulhq_n_s16(v1706, 23437); + int16x8_t v1707 = vmlaq_n_s16(v1707_tmp, v1706, 3); + int16x8_t v1708 = vaddq_s16(v1705, v1707); + int16x8_t v1709 = vsubq_s16(v1090, v1101); + int16x8_t v1710 = vsubq_s16(v1112, v1123); + int16x8_t v1711_tmp = vqrdmulhq_n_s16(v1710, 17573); + int16x8_t v1711 = vmlaq_n_s16(v1711_tmp, v1710, 4); + int16x8_t v1712 = vaddq_s16(v1709, v1711); + int16x8_t v1713 = vsubq_s16(v1008, v1031); + int16x8_t v1714 = vsubq_s16(v1054, v1077); + int16x8_t v1715_tmp = vqrdmulhq_n_s16(v1714, 27122); + int16x8_t v1715 = vmlaq_n_s16(v1715_tmp, v1714, 5); + int16x8_t v1716 = vaddq_s16(v1713, v1715); + int16x8_t v1717 = vsubq_s16(v913, v937); + int16x8_t v1718 = vsubq_s16(v960, v983); + int16x8_t v1719_tmp = vqrdmulhq_n_s16(v1718, 5041); + int16x8_t v1719 = vmlaq_n_s16(v1719_tmp, v1718, 8); + int16x8_t v1720 = vaddq_s16(v1717, v1719); + int16x8_t v1721 = vsubq_s16(v747, v794); + int16x8_t v1722 = vsubq_s16(v841, v888); + int16x8_t v1723_tmp = vqrdmulhq_n_s16(v1722, 19146); + int16x8_t v1723 = vmlaq_n_s16(v1723_tmp, v1722, 13); + int16x8_t v1724 = vaddq_s16(v1721, v1723); + int16x8_t v1725 = vsubq_s16(v141, v316); + int16x8_t v1726 = vsubq_s16(v522, v698); + int16x8_t v1727_tmp = vqrdmulhq_n_s16(v1726, 24402); + int16x8_t v1727 = vmlaq_n_s16(v1727_tmp, v1726, 40); + int16x8_t v1728 = vaddq_s16(v1725, v1727); + int16x8_t v1729 = vsubq_s16(v1725, v1727); + int16x8_t v1730 = vsubq_s16(v1721, v1723); + int16x8_t v1731 = vsubq_s16(v1717, v1719); + int16x8_t v1732 = vsubq_s16(v1713, v1715); + int16x8_t v1733 = vsubq_s16(v1709, v1711); + int16x8_t v1734 = vsubq_s16(v1705, v1707); + int16x8_t v1735 = vsubq_s16(v1701, v1703); + int16x8_t v1736 = vsubq_s16(v1697, v1699); + int16x8_t v1737 = vsubq_s16(v1693, v1695); + int16x8_t v1738 = vsubq_s16(v1689, v1691); + int16x8_t v1739 = vsubq_s16(v1685, v1687); + int16x8_t v1740 = vsubq_s16(v1681, v1683); + int16x8_t v1741 = vsubq_s16(v1677, v1679); + int16x8_t v1742 = vsubq_s16(v1673, v1675); + int16x8_t v1743 = vsubq_s16(v1669, v1671); + int16x8_t v1744 = vsubq_s16(v1665, v1667); + int16x8_t v1745 = vsubq_s16(v1661, v1663); + int16x8_t v1746 = vsubq_s16(v1657, v1659); + int16x8_t v1747 = vsubq_s16(v1653, v1655); + int16x8_t v1748 = vsubq_s16(v1649, v1651); + int16x8_t v1749 = vsubq_s16(v1645, v1647); + int16x8_t v1750 = vsubq_s16(v1641, v1643); + int16x8_t v1751 = vsubq_s16(v1637, v1639); + int16x8_t v1752 = vsubq_s16(v1633, v1635); + int16x8_t v1753 = vsubq_s16(v1629, v1631); + int16x8_t v1754 = vsubq_s16(v1625, v1627); + int16x8_t v1755 = vsubq_s16(v1621, v1623); + int16x8_t v1756 = vsubq_s16(v1617, v1619); + int16x8_t v1757 = vsubq_s16(v1613, v1615); + int16x8_t v1758 = vsubq_s16(v1609, v1611); + int16x8_t v1759 = vsubq_s16(v1605, v1607); + int16x8_t v1760 = vsubq_s16(v1601, v1603); + int16x8_t v1761 = vsubq_s16(v1594, v1599); + int16x8_t v1762 = vsubq_s16(v1584, v1589); + int16x8_t v1763 = vsubq_s16(v1574, v1579); + int16x8_t v1764 = vsubq_s16(v1564, v1569); + int16x8_t v1765 = vsubq_s16(v1554, v1559); + int16x8_t v1766 = vsubq_s16(v1544, v1549); + int16x8_t v1767 = vsubq_s16(v1534, v1539); + int16x8_t v1768 = vsubq_s16(v1524, v1529); + int16x8_t v1769 = vsubq_s16(v1514, v1519); + int16x8_t v1770 = vsubq_s16(v1504, v1509); + int16x8_t v1771 = vsubq_s16(v1494, v1499); + int16x8_t v1772 = vsubq_s16(v1484, v1489); + int16x8_t v1773 = vsubq_s16(v1474, v1479); + int16x8_t v1774 = vsubq_s16(v1464, v1469); + int16x8_t v1775 = vsubq_s16(v1454, v1459); + int16x8_t v1776 = vsubq_s16(v1444, v1449); + int16x8_t v1777 = vsubq_s16(v1428, v1439); + int16x8_t v1778 = vsubq_s16(v1406, v1417); + int16x8_t v1779 = vsubq_s16(v1384, v1395); + int16x8_t v1780 = vsubq_s16(v1362, v1373); + int16x8_t v1781 = vsubq_s16(v1340, v1351); + int16x8_t v1782 = vsubq_s16(v1318, v1329); + int16x8_t v1783 = vsubq_s16(v1296, v1307); + int16x8_t v1784 = vsubq_s16(v1274, v1285); + int16x8_t v1785 = vsubq_s16(v1240, v1263); + int16x8_t v1786 = vsubq_s16(v1194, v1217); + int16x8_t v1787 = vsubq_s16(v1148, v1171); + int16x8_t v1788 = vsubq_s16(v1102, v1125); + int16x8_t v1789 = vsubq_s16(v1032, v1079); + int16x8_t v1790 = vsubq_s16(v938, v985); + int16x8_t v1791 = vsubq_s16(v795, v890); + int16x8_t v1792 = vsubq_s16(v317, v700); + vst1q_s16(out + out_stride * 0 + i, v701); + vst1q_s16(out + out_stride * 1 + i, v891); + vst1q_s16(out + out_stride * 2 + i, v986); + vst1q_s16(out + out_stride * 3 + i, v1080); + vst1q_s16(out + out_stride * 4 + i, v1126); + vst1q_s16(out + out_stride * 5 + i, v1172); + vst1q_s16(out + out_stride * 6 + i, v1218); + vst1q_s16(out + out_stride * 7 + i, v1264); + vst1q_s16(out + out_stride * 8 + i, v1286); + vst1q_s16(out + out_stride * 9 + i, v1308); + vst1q_s16(out + out_stride * 10 + i, v1330); + vst1q_s16(out + out_stride * 11 + i, v1352); + vst1q_s16(out + out_stride * 12 + i, v1374); + vst1q_s16(out + out_stride * 13 + i, v1396); + vst1q_s16(out + out_stride * 14 + i, v1418); + vst1q_s16(out + out_stride * 15 + i, v1440); + vst1q_s16(out + out_stride * 16 + i, v1450); + vst1q_s16(out + out_stride * 17 + i, v1460); + vst1q_s16(out + out_stride * 18 + i, v1470); + vst1q_s16(out + out_stride * 19 + i, v1480); + vst1q_s16(out + out_stride * 20 + i, v1490); + vst1q_s16(out + out_stride * 21 + i, v1500); + vst1q_s16(out + out_stride * 22 + i, v1510); + vst1q_s16(out + out_stride * 23 + i, v1520); + vst1q_s16(out + out_stride * 24 + i, v1530); + vst1q_s16(out + out_stride * 25 + i, v1540); + vst1q_s16(out + out_stride * 26 + i, v1550); + vst1q_s16(out + out_stride * 27 + i, v1560); + vst1q_s16(out + out_stride * 28 + i, v1570); + vst1q_s16(out + out_stride * 29 + i, v1580); + vst1q_s16(out + out_stride * 30 + i, v1590); + vst1q_s16(out + out_stride * 31 + i, v1600); + vst1q_s16(out + out_stride * 32 + i, v1604); + vst1q_s16(out + out_stride * 33 + i, v1608); + vst1q_s16(out + out_stride * 34 + i, v1612); + vst1q_s16(out + out_stride * 35 + i, v1616); + vst1q_s16(out + out_stride * 36 + i, v1620); + vst1q_s16(out + out_stride * 37 + i, v1624); + vst1q_s16(out + out_stride * 38 + i, v1628); + vst1q_s16(out + out_stride * 39 + i, v1632); + vst1q_s16(out + out_stride * 40 + i, v1636); + vst1q_s16(out + out_stride * 41 + i, v1640); + vst1q_s16(out + out_stride * 42 + i, v1644); + vst1q_s16(out + out_stride * 43 + i, v1648); + vst1q_s16(out + out_stride * 44 + i, v1652); + vst1q_s16(out + out_stride * 45 + i, v1656); + vst1q_s16(out + out_stride * 46 + i, v1660); + vst1q_s16(out + out_stride * 47 + i, v1664); + vst1q_s16(out + out_stride * 48 + i, v1668); + vst1q_s16(out + out_stride * 49 + i, v1672); + vst1q_s16(out + out_stride * 50 + i, v1676); + vst1q_s16(out + out_stride * 51 + i, v1680); + vst1q_s16(out + out_stride * 52 + i, v1684); + vst1q_s16(out + out_stride * 53 + i, v1688); + vst1q_s16(out + out_stride * 54 + i, v1692); + vst1q_s16(out + out_stride * 55 + i, v1696); + vst1q_s16(out + out_stride * 56 + i, v1700); + vst1q_s16(out + out_stride * 57 + i, v1704); + vst1q_s16(out + out_stride * 58 + i, v1708); + vst1q_s16(out + out_stride * 59 + i, v1712); + vst1q_s16(out + out_stride * 60 + i, v1716); + vst1q_s16(out + out_stride * 61 + i, v1720); + vst1q_s16(out + out_stride * 62 + i, v1724); + vst1q_s16(out + out_stride * 63 + i, v1728); + vst1q_s16(out + out_stride * 64 + i, v1729); + vst1q_s16(out + out_stride * 65 + i, v1730); + vst1q_s16(out + out_stride * 66 + i, v1731); + vst1q_s16(out + out_stride * 67 + i, v1732); + vst1q_s16(out + out_stride * 68 + i, v1733); + vst1q_s16(out + out_stride * 69 + i, v1734); + vst1q_s16(out + out_stride * 70 + i, v1735); + vst1q_s16(out + out_stride * 71 + i, v1736); + vst1q_s16(out + out_stride * 72 + i, v1737); + vst1q_s16(out + out_stride * 73 + i, v1738); + vst1q_s16(out + out_stride * 74 + i, v1739); + vst1q_s16(out + out_stride * 75 + i, v1740); + vst1q_s16(out + out_stride * 76 + i, v1741); + vst1q_s16(out + out_stride * 77 + i, v1742); + vst1q_s16(out + out_stride * 78 + i, v1743); + vst1q_s16(out + out_stride * 79 + i, v1744); + vst1q_s16(out + out_stride * 80 + i, v1745); + vst1q_s16(out + out_stride * 81 + i, v1746); + vst1q_s16(out + out_stride * 82 + i, v1747); + vst1q_s16(out + out_stride * 83 + i, v1748); + vst1q_s16(out + out_stride * 84 + i, v1749); + vst1q_s16(out + out_stride * 85 + i, v1750); + vst1q_s16(out + out_stride * 86 + i, v1751); + vst1q_s16(out + out_stride * 87 + i, v1752); + vst1q_s16(out + out_stride * 88 + i, v1753); + vst1q_s16(out + out_stride * 89 + i, v1754); + vst1q_s16(out + out_stride * 90 + i, v1755); + vst1q_s16(out + out_stride * 91 + i, v1756); + vst1q_s16(out + out_stride * 92 + i, v1757); + vst1q_s16(out + out_stride * 93 + i, v1758); + vst1q_s16(out + out_stride * 94 + i, v1759); + vst1q_s16(out + out_stride * 95 + i, v1760); + vst1q_s16(out + out_stride * 96 + i, v1761); + vst1q_s16(out + out_stride * 97 + i, v1762); + vst1q_s16(out + out_stride * 98 + i, v1763); + vst1q_s16(out + out_stride * 99 + i, v1764); + vst1q_s16(out + out_stride * 100 + i, v1765); + vst1q_s16(out + out_stride * 101 + i, v1766); + vst1q_s16(out + out_stride * 102 + i, v1767); + vst1q_s16(out + out_stride * 103 + i, v1768); + vst1q_s16(out + out_stride * 104 + i, v1769); + vst1q_s16(out + out_stride * 105 + i, v1770); + vst1q_s16(out + out_stride * 106 + i, v1771); + vst1q_s16(out + out_stride * 107 + i, v1772); + vst1q_s16(out + out_stride * 108 + i, v1773); + vst1q_s16(out + out_stride * 109 + i, v1774); + vst1q_s16(out + out_stride * 110 + i, v1775); + vst1q_s16(out + out_stride * 111 + i, v1776); + vst1q_s16(out + out_stride * 112 + i, v1777); + vst1q_s16(out + out_stride * 113 + i, v1778); + vst1q_s16(out + out_stride * 114 + i, v1779); + vst1q_s16(out + out_stride * 115 + i, v1780); + vst1q_s16(out + out_stride * 116 + i, v1781); + vst1q_s16(out + out_stride * 117 + i, v1782); + vst1q_s16(out + out_stride * 118 + i, v1783); + vst1q_s16(out + out_stride * 119 + i, v1784); + vst1q_s16(out + out_stride * 120 + i, v1785); + vst1q_s16(out + out_stride * 121 + i, v1786); + vst1q_s16(out + out_stride * 122 + i, v1787); + vst1q_s16(out + out_stride * 123 + i, v1788); + vst1q_s16(out + out_stride * 124 + i, v1789); + vst1q_s16(out + out_stride * 125 + i, v1790); + vst1q_s16(out + out_stride * 126 + i, v1791); + vst1q_s16(out + out_stride * 127 + i, v1792); + } +} diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h new file mode 100644 index 0000000000..472ec20d42 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h @@ -0,0 +1,180 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* This file is automatically generated. Do not modify it directly. */ +#if HWY_TARGET != HWY_NEON +#error "only include this file from fast_dct-inl.h" +#endif + +constexpr size_t FastIDCTIntegerBits(FastDCTTag<16>) { return 1; } + +void FastIDCT(FastDCTTag<16>, const int16_t* in, size_t in_stride, int16_t* out, + size_t out_stride, size_t count) { + JXL_ASSERT(count % 8 == 0); + for (size_t i = 0; i < count; i += 8) { + int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); + int16x8_t v1 = vld1q_s16(in + in_stride * 8 + i); + int16x8_t v2 = vaddq_s16(v0, v1); + int16x8_t v3 = vld1q_s16(in + in_stride * 4 + i); + int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); + int16x8_t v4 = vaddq_s16(v4_tmp, v3); + int16x8_t v5 = vld1q_s16(in + in_stride * 12 + i); + int16x8_t v6 = vaddq_s16(v5, v3); + int16x8_t v7 = vaddq_s16(v4, v6); + int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); + int16x8_t v9 = vaddq_s16(v2, v8); + int16x8_t v10 = vld1q_s16(in + in_stride * 2 + i); + int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); + int16x8_t v11 = vaddq_s16(v11_tmp, v10); + int16x8_t v12 = vld1q_s16(in + in_stride * 10 + i); + int16x8_t v13 = vld1q_s16(in + in_stride * 6 + i); + int16x8_t v14 = vaddq_s16(v12, v13); + int16x8_t v15 = vaddq_s16(v11, v14); + int16x8_t v16 = vaddq_s16(v13, v10); + int16x8_t v17 = vqrdmulhq_n_s16(v16, 25080); + int16x8_t v18 = vld1q_s16(in + in_stride * 14 + i); + int16x8_t v19 = vaddq_s16(v18, v12); + int16x8_t v20 = vaddq_s16(v16, v19); + int16x8_t v21 = vqrdmulhq_n_s16(v20, 17734); + int16x8_t v22 = vaddq_s16(v17, v21); + int16x8_t v23 = vaddq_s16(v15, v22); + int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); + int16x8_t v25 = vaddq_s16(v9, v24); + int16x8_t v26 = vld1q_s16(in + in_stride * 15 + i); + int16x8_t v27 = vld1q_s16(in + in_stride * 13 + i); + int16x8_t v28 = vaddq_s16(v26, v27); + int16x8_t v29 = vld1q_s16(in + in_stride * 11 + i); + int16x8_t v30 = vld1q_s16(in + in_stride * 9 + i); + int16x8_t v31 = vaddq_s16(v29, v30); + int16x8_t v32 = vaddq_s16(v28, v31); + int16x8_t v33 = vqrdmulhq_n_s16(v32, 17734); + int16x8_t v34 = vld1q_s16(in + in_stride * 3 + i); + int16x8_t v35 = vld1q_s16(in + in_stride * 1 + i); + int16x8_t v36 = vaddq_s16(v34, v35); + int16x8_t v37 = vld1q_s16(in + in_stride * 7 + i); + int16x8_t v38 = vld1q_s16(in + in_stride * 5 + i); + int16x8_t v39 = vaddq_s16(v37, v38); + int16x8_t v40 = vaddq_s16(v36, v39); + int16x8_t v41_tmp = vqrdmulhq_n_s16(v40, 10045); + int16x8_t v41 = vaddq_s16(v41_tmp, v40); + int16x8_t v42 = vaddq_s16(v33, v41); + int16x8_t v43 = vqrdmulhq_n_s16(v42, 16705); + int16x8_t v44_tmp = vqrdmulhq_n_s16(v36, 13573); + int16x8_t v44 = vaddq_s16(v44_tmp, v36); + int16x8_t v45 = vaddq_s16(v39, v31); + int16x8_t v46 = vaddq_s16(v44, v45); + int16x8_t v47 = vqrdmulhq_n_s16(v46, 16705); + int16x8_t v48 = vaddq_s16(v43, v47); + int16x8_t v49_tmp = vqrdmulhq_n_s16(v35, 13573); + int16x8_t v49 = vaddq_s16(v49_tmp, v35); + int16x8_t v50 = vaddq_s16(v30, v37); + int16x8_t v51 = vaddq_s16(v49, v50); + int16x8_t v52 = vaddq_s16(v38, v34); + int16x8_t v53 = vaddq_s16(v27, v29); + int16x8_t v54 = vaddq_s16(v52, v53); + int16x8_t v55 = vqrdmulhq_n_s16(v54, 17734); + int16x8_t v56 = vqrdmulhq_n_s16(v52, 25080); + int16x8_t v57 = vaddq_s16(v55, v56); + int16x8_t v58 = vaddq_s16(v51, v57); + int16x8_t v59 = vaddq_s16(v48, v58); + int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463); + int16x8_t v61 = vaddq_s16(v25, v60); + int16x8_t v62 = vsubq_s16(v0, v1); + int16x8_t v63 = vsubq_s16(v4, v6); + int16x8_t v64_tmp = vqrdmulhq_n_s16(v63, 10045); + int16x8_t v64 = vaddq_s16(v64_tmp, v63); + int16x8_t v65 = vaddq_s16(v62, v64); + int16x8_t v66 = vsubq_s16(v11, v14); + int16x8_t v67 = vqrdmulhq_n_s16(v16, 17734); + int16x8_t v68_tmp = vqrdmulhq_n_s16(v19, 10045); + int16x8_t v68 = vaddq_s16(v68_tmp, v19); + int16x8_t v69 = vsubq_s16(v67, v68); + int16x8_t v70 = vaddq_s16(v66, v69); + int16x8_t v71 = vqrdmulhq_n_s16(v70, 19705); + int16x8_t v72 = vaddq_s16(v65, v71); + int16x8_t v73 = vsubq_s16(v49, v50); + int16x8_t v74 = vqrdmulhq_n_s16(v52, 17734); + int16x8_t v75_tmp = vqrdmulhq_n_s16(v53, 10045); + int16x8_t v75 = vaddq_s16(v75_tmp, v53); + int16x8_t v76 = vsubq_s16(v74, v75); + int16x8_t v77 = vaddq_s16(v73, v76); + int16x8_t v78 = vsubq_s16(v44, v45); + int16x8_t v79 = vqrdmulhq_n_s16(v78, 19705); + int16x8_t v80 = vqrdmulhq_n_s16(v40, 13573); + int16x8_t v81 = vsubq_s16(v80, v32); + int16x8_t v82 = vqrdmulhq_n_s16(v81, 25746); + int16x8_t v83 = vaddq_s16(v79, v82); + int16x8_t v84 = vaddq_s16(v77, v83); + int16x8_t v85 = vqrdmulhq_n_s16(v84, 17121); + int16x8_t v86 = vaddq_s16(v72, v85); + int16x8_t v87 = vsubq_s16(v62, v64); + int16x8_t v88 = vsubq_s16(v66, v69); + int16x8_t v89 = vqrdmulhq_n_s16(v88, 29490); + int16x8_t v90 = vaddq_s16(v87, v89); + int16x8_t v91 = vsubq_s16(v73, v76); + int16x8_t v92 = vqrdmulhq_n_s16(v78, 29490); + int16x8_t v93_tmp = vqrdmulhq_n_s16(v81, 5763); + int16x8_t v93 = vaddq_s16(v93_tmp, v81); + int16x8_t v94 = vsubq_s16(v92, v93); + int16x8_t v95 = vaddq_s16(v91, v94); + int16x8_t v96 = vqrdmulhq_n_s16(v95, 18578); + int16x8_t v97 = vaddq_s16(v90, v96); + int16x8_t v98 = vsubq_s16(v46, v42); + int16x8_t v99_tmp = vqrdmulhq_n_s16(v98, 18446); + int16x8_t v99 = vmlaq_n_s16(v99_tmp, v98, 2); + int16x8_t v100 = vsubq_s16(v51, v57); + int16x8_t v101 = vaddq_s16(v99, v100); + int16x8_t v102 = vqrdmulhq_n_s16(v101, 21195); + int16x8_t v103 = vsubq_s16(v2, v8); + int16x8_t v104 = vsubq_s16(v15, v22); + int16x8_t v105_tmp = vqrdmulhq_n_s16(v104, 18446); + int16x8_t v105 = vmlaq_n_s16(v105_tmp, v104, 2); + int16x8_t v106 = vaddq_s16(v103, v105); + int16x8_t v107 = vaddq_s16(v102, v106); + int16x8_t v108 = vsubq_s16(v103, v105); + int16x8_t v109 = vsubq_s16(v100, v99); + int16x8_t v110 = vqrdmulhq_n_s16(v109, 25826); + int16x8_t v111 = vaddq_s16(v108, v110); + int16x8_t v112 = vsubq_s16(v87, v89); + int16x8_t v113 = vsubq_s16(v91, v94); + int16x8_t v114_tmp = vqrdmulhq_n_s16(v113, 1988); + int16x8_t v114 = vaddq_s16(v114_tmp, v113); + int16x8_t v115 = vaddq_s16(v112, v114); + int16x8_t v116 = vsubq_s16(v65, v71); + int16x8_t v117 = vsubq_s16(v77, v83); + int16x8_t v118_tmp = vqrdmulhq_n_s16(v117, 23673); + int16x8_t v118 = vaddq_s16(v118_tmp, v117); + int16x8_t v119 = vaddq_s16(v116, v118); + int16x8_t v120 = vsubq_s16(v58, v48); + int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 3314); + int16x8_t v121 = vmlaq_n_s16(v121_tmp, v120, 5); + int16x8_t v122 = vsubq_s16(v9, v24); + int16x8_t v123 = vaddq_s16(v121, v122); + int16x8_t v124 = vsubq_s16(v122, v121); + int16x8_t v125 = vsubq_s16(v116, v118); + int16x8_t v126 = vsubq_s16(v112, v114); + int16x8_t v127 = vsubq_s16(v108, v110); + int16x8_t v128 = vsubq_s16(v106, v102); + int16x8_t v129 = vsubq_s16(v90, v96); + int16x8_t v130 = vsubq_s16(v72, v85); + int16x8_t v131 = vsubq_s16(v25, v60); + vst1q_s16(out + out_stride * 0 + i, v61); + vst1q_s16(out + out_stride * 1 + i, v86); + vst1q_s16(out + out_stride * 2 + i, v97); + vst1q_s16(out + out_stride * 3 + i, v107); + vst1q_s16(out + out_stride * 4 + i, v111); + vst1q_s16(out + out_stride * 5 + i, v115); + vst1q_s16(out + out_stride * 6 + i, v119); + vst1q_s16(out + out_stride * 7 + i, v123); + vst1q_s16(out + out_stride * 8 + i, v124); + vst1q_s16(out + out_stride * 9 + i, v125); + vst1q_s16(out + out_stride * 10 + i, v126); + vst1q_s16(out + out_stride * 11 + i, v127); + vst1q_s16(out + out_stride * 12 + i, v128); + vst1q_s16(out + out_stride * 13 + i, v129); + vst1q_s16(out + out_stride * 14 + i, v130); + vst1q_s16(out + out_stride * 15 + i, v131); + } +} diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h new file mode 100644 index 0000000000..a823440af2 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h @@ -0,0 +1,4811 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* This file is automatically generated. Do not modify it directly. */ +#if HWY_TARGET != HWY_NEON +#error "only include this file from fast_dct-inl.h" +#endif + +constexpr size_t FastIDCTIntegerBits(FastDCTTag<256>) { return 3; } + +void FastIDCT(FastDCTTag<256>, const int16_t* in, size_t in_stride, + int16_t* out, size_t out_stride, size_t count) { + JXL_ASSERT(count % 8 == 0); + for (size_t i = 0; i < count; i += 8) { + int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); + int16x8_t v1 = vld1q_s16(in + in_stride * 128 + i); + int16x8_t v2 = vaddq_s16(v0, v1); + int16x8_t v3 = vld1q_s16(in + in_stride * 64 + i); + int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); + int16x8_t v4 = vaddq_s16(v4_tmp, v3); + int16x8_t v5 = vld1q_s16(in + in_stride * 192 + i); + int16x8_t v6 = vaddq_s16(v5, v3); + int16x8_t v7 = vaddq_s16(v4, v6); + int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); + int16x8_t v9 = vaddq_s16(v2, v8); + int16x8_t v10 = vld1q_s16(in + in_stride * 32 + i); + int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); + int16x8_t v11 = vaddq_s16(v11_tmp, v10); + int16x8_t v12 = vld1q_s16(in + in_stride * 160 + i); + int16x8_t v13 = vld1q_s16(in + in_stride * 96 + i); + int16x8_t v14 = vaddq_s16(v12, v13); + int16x8_t v15 = vaddq_s16(v11, v14); + int16x8_t v16 = vaddq_s16(v13, v10); + int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573); + int16x8_t v17 = vaddq_s16(v17_tmp, v16); + int16x8_t v18 = vld1q_s16(in + in_stride * 224 + i); + int16x8_t v19 = vaddq_s16(v18, v12); + int16x8_t v20 = vaddq_s16(v19, v16); + int16x8_t v21 = vaddq_s16(v17, v20); + int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734); + int16x8_t v23 = vaddq_s16(v15, v22); + int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); + int16x8_t v25 = vaddq_s16(v9, v24); + int16x8_t v26 = vld1q_s16(in + in_stride * 16 + i); + int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573); + int16x8_t v27 = vaddq_s16(v27_tmp, v26); + int16x8_t v28 = vld1q_s16(in + in_stride * 144 + i); + int16x8_t v29 = vld1q_s16(in + in_stride * 112 + i); + int16x8_t v30 = vaddq_s16(v28, v29); + int16x8_t v31 = vaddq_s16(v27, v30); + int16x8_t v32 = vld1q_s16(in + in_stride * 80 + i); + int16x8_t v33 = vld1q_s16(in + in_stride * 48 + i); + int16x8_t v34 = vaddq_s16(v32, v33); + int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573); + int16x8_t v35 = vaddq_s16(v35_tmp, v34); + int16x8_t v36 = vld1q_s16(in + in_stride * 208 + i); + int16x8_t v37 = vld1q_s16(in + in_stride * 176 + i); + int16x8_t v38 = vaddq_s16(v36, v37); + int16x8_t v39 = vaddq_s16(v38, v34); + int16x8_t v40 = vaddq_s16(v35, v39); + int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734); + int16x8_t v42 = vaddq_s16(v31, v41); + int16x8_t v43 = vaddq_s16(v33, v26); + int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573); + int16x8_t v44 = vaddq_s16(v44_tmp, v43); + int16x8_t v45 = vaddq_s16(v37, v28); + int16x8_t v46 = vaddq_s16(v29, v32); + int16x8_t v47 = vaddq_s16(v45, v46); + int16x8_t v48 = vaddq_s16(v44, v47); + int16x8_t v49 = vaddq_s16(v46, v43); + int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573); + int16x8_t v50 = vaddq_s16(v50_tmp, v49); + int16x8_t v51 = vld1q_s16(in + in_stride * 240 + i); + int16x8_t v52 = vaddq_s16(v51, v36); + int16x8_t v53 = vaddq_s16(v52, v45); + int16x8_t v54 = vaddq_s16(v53, v49); + int16x8_t v55 = vaddq_s16(v50, v54); + int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734); + int16x8_t v57 = vaddq_s16(v48, v56); + int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705); + int16x8_t v59 = vaddq_s16(v42, v58); + int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463); + int16x8_t v61 = vaddq_s16(v25, v60); + int16x8_t v62 = vld1q_s16(in + in_stride * 8 + i); + int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573); + int16x8_t v63 = vaddq_s16(v63_tmp, v62); + int16x8_t v64 = vld1q_s16(in + in_stride * 136 + i); + int16x8_t v65 = vld1q_s16(in + in_stride * 120 + i); + int16x8_t v66 = vaddq_s16(v64, v65); + int16x8_t v67 = vaddq_s16(v63, v66); + int16x8_t v68 = vld1q_s16(in + in_stride * 72 + i); + int16x8_t v69 = vld1q_s16(in + in_stride * 56 + i); + int16x8_t v70 = vaddq_s16(v68, v69); + int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573); + int16x8_t v71 = vaddq_s16(v71_tmp, v70); + int16x8_t v72 = vld1q_s16(in + in_stride * 200 + i); + int16x8_t v73 = vld1q_s16(in + in_stride * 184 + i); + int16x8_t v74 = vaddq_s16(v72, v73); + int16x8_t v75 = vaddq_s16(v74, v70); + int16x8_t v76 = vaddq_s16(v71, v75); + int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734); + int16x8_t v78 = vaddq_s16(v67, v77); + int16x8_t v79 = vld1q_s16(in + in_stride * 40 + i); + int16x8_t v80 = vld1q_s16(in + in_stride * 24 + i); + int16x8_t v81 = vaddq_s16(v79, v80); + int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573); + int16x8_t v82 = vaddq_s16(v82_tmp, v81); + int16x8_t v83 = vld1q_s16(in + in_stride * 168 + i); + int16x8_t v84 = vld1q_s16(in + in_stride * 152 + i); + int16x8_t v85 = vaddq_s16(v83, v84); + int16x8_t v86 = vld1q_s16(in + in_stride * 104 + i); + int16x8_t v87 = vld1q_s16(in + in_stride * 88 + i); + int16x8_t v88 = vaddq_s16(v86, v87); + int16x8_t v89 = vaddq_s16(v85, v88); + int16x8_t v90 = vaddq_s16(v82, v89); + int16x8_t v91 = vaddq_s16(v88, v81); + int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573); + int16x8_t v92 = vaddq_s16(v92_tmp, v91); + int16x8_t v93 = vld1q_s16(in + in_stride * 232 + i); + int16x8_t v94 = vld1q_s16(in + in_stride * 216 + i); + int16x8_t v95 = vaddq_s16(v93, v94); + int16x8_t v96 = vaddq_s16(v95, v85); + int16x8_t v97 = vaddq_s16(v96, v91); + int16x8_t v98 = vaddq_s16(v92, v97); + int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734); + int16x8_t v100 = vaddq_s16(v90, v99); + int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705); + int16x8_t v102 = vaddq_s16(v78, v101); + int16x8_t v103 = vaddq_s16(v80, v62); + int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573); + int16x8_t v104 = vaddq_s16(v104_tmp, v103); + int16x8_t v105 = vaddq_s16(v84, v64); + int16x8_t v106 = vaddq_s16(v65, v86); + int16x8_t v107 = vaddq_s16(v105, v106); + int16x8_t v108 = vaddq_s16(v104, v107); + int16x8_t v109 = vaddq_s16(v87, v68); + int16x8_t v110 = vaddq_s16(v69, v79); + int16x8_t v111 = vaddq_s16(v109, v110); + int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573); + int16x8_t v112 = vaddq_s16(v112_tmp, v111); + int16x8_t v113 = vaddq_s16(v94, v72); + int16x8_t v114 = vaddq_s16(v73, v83); + int16x8_t v115 = vaddq_s16(v113, v114); + int16x8_t v116 = vaddq_s16(v115, v111); + int16x8_t v117 = vaddq_s16(v112, v116); + int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734); + int16x8_t v119 = vaddq_s16(v108, v118); + int16x8_t v120 = vaddq_s16(v110, v103); + int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573); + int16x8_t v121 = vaddq_s16(v121_tmp, v120); + int16x8_t v122 = vaddq_s16(v114, v105); + int16x8_t v123 = vaddq_s16(v106, v109); + int16x8_t v124 = vaddq_s16(v122, v123); + int16x8_t v125 = vaddq_s16(v121, v124); + int16x8_t v126 = vaddq_s16(v123, v120); + int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573); + int16x8_t v127 = vaddq_s16(v127_tmp, v126); + int16x8_t v128 = vld1q_s16(in + in_stride * 248 + i); + int16x8_t v129 = vaddq_s16(v128, v93); + int16x8_t v130 = vaddq_s16(v129, v113); + int16x8_t v131 = vaddq_s16(v130, v122); + int16x8_t v132 = vaddq_s16(v131, v126); + int16x8_t v133 = vaddq_s16(v127, v132); + int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734); + int16x8_t v135 = vaddq_s16(v125, v134); + int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705); + int16x8_t v137 = vaddq_s16(v119, v136); + int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463); + int16x8_t v139 = vaddq_s16(v102, v138); + int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404); + int16x8_t v141 = vaddq_s16(v61, v140); + int16x8_t v142 = vld1q_s16(in + in_stride * 4 + i); + int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573); + int16x8_t v143 = vaddq_s16(v143_tmp, v142); + int16x8_t v144 = vld1q_s16(in + in_stride * 132 + i); + int16x8_t v145 = vld1q_s16(in + in_stride * 124 + i); + int16x8_t v146 = vaddq_s16(v144, v145); + int16x8_t v147 = vaddq_s16(v143, v146); + int16x8_t v148 = vld1q_s16(in + in_stride * 68 + i); + int16x8_t v149 = vld1q_s16(in + in_stride * 60 + i); + int16x8_t v150 = vaddq_s16(v148, v149); + int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573); + int16x8_t v151 = vaddq_s16(v151_tmp, v150); + int16x8_t v152 = vld1q_s16(in + in_stride * 196 + i); + int16x8_t v153 = vld1q_s16(in + in_stride * 188 + i); + int16x8_t v154 = vaddq_s16(v152, v153); + int16x8_t v155 = vaddq_s16(v154, v150); + int16x8_t v156 = vaddq_s16(v151, v155); + int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734); + int16x8_t v158 = vaddq_s16(v147, v157); + int16x8_t v159 = vld1q_s16(in + in_stride * 36 + i); + int16x8_t v160 = vld1q_s16(in + in_stride * 28 + i); + int16x8_t v161 = vaddq_s16(v159, v160); + int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573); + int16x8_t v162 = vaddq_s16(v162_tmp, v161); + int16x8_t v163 = vld1q_s16(in + in_stride * 164 + i); + int16x8_t v164 = vld1q_s16(in + in_stride * 156 + i); + int16x8_t v165 = vaddq_s16(v163, v164); + int16x8_t v166 = vld1q_s16(in + in_stride * 100 + i); + int16x8_t v167 = vld1q_s16(in + in_stride * 92 + i); + int16x8_t v168 = vaddq_s16(v166, v167); + int16x8_t v169 = vaddq_s16(v165, v168); + int16x8_t v170 = vaddq_s16(v162, v169); + int16x8_t v171 = vaddq_s16(v168, v161); + int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573); + int16x8_t v172 = vaddq_s16(v172_tmp, v171); + int16x8_t v173 = vld1q_s16(in + in_stride * 228 + i); + int16x8_t v174 = vld1q_s16(in + in_stride * 220 + i); + int16x8_t v175 = vaddq_s16(v173, v174); + int16x8_t v176 = vaddq_s16(v175, v165); + int16x8_t v177 = vaddq_s16(v176, v171); + int16x8_t v178 = vaddq_s16(v172, v177); + int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734); + int16x8_t v180 = vaddq_s16(v170, v179); + int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705); + int16x8_t v182 = vaddq_s16(v158, v181); + int16x8_t v183 = vld1q_s16(in + in_stride * 20 + i); + int16x8_t v184 = vld1q_s16(in + in_stride * 12 + i); + int16x8_t v185 = vaddq_s16(v183, v184); + int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573); + int16x8_t v186 = vaddq_s16(v186_tmp, v185); + int16x8_t v187 = vld1q_s16(in + in_stride * 148 + i); + int16x8_t v188 = vld1q_s16(in + in_stride * 140 + i); + int16x8_t v189 = vaddq_s16(v187, v188); + int16x8_t v190 = vld1q_s16(in + in_stride * 116 + i); + int16x8_t v191 = vld1q_s16(in + in_stride * 108 + i); + int16x8_t v192 = vaddq_s16(v190, v191); + int16x8_t v193 = vaddq_s16(v189, v192); + int16x8_t v194 = vaddq_s16(v186, v193); + int16x8_t v195 = vld1q_s16(in + in_stride * 84 + i); + int16x8_t v196 = vld1q_s16(in + in_stride * 76 + i); + int16x8_t v197 = vaddq_s16(v195, v196); + int16x8_t v198 = vld1q_s16(in + in_stride * 52 + i); + int16x8_t v199 = vld1q_s16(in + in_stride * 44 + i); + int16x8_t v200 = vaddq_s16(v198, v199); + int16x8_t v201 = vaddq_s16(v197, v200); + int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573); + int16x8_t v202 = vaddq_s16(v202_tmp, v201); + int16x8_t v203 = vld1q_s16(in + in_stride * 212 + i); + int16x8_t v204 = vld1q_s16(in + in_stride * 204 + i); + int16x8_t v205 = vaddq_s16(v203, v204); + int16x8_t v206 = vld1q_s16(in + in_stride * 180 + i); + int16x8_t v207 = vld1q_s16(in + in_stride * 172 + i); + int16x8_t v208 = vaddq_s16(v206, v207); + int16x8_t v209 = vaddq_s16(v205, v208); + int16x8_t v210 = vaddq_s16(v209, v201); + int16x8_t v211 = vaddq_s16(v202, v210); + int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734); + int16x8_t v213 = vaddq_s16(v194, v212); + int16x8_t v214 = vaddq_s16(v200, v185); + int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573); + int16x8_t v215 = vaddq_s16(v215_tmp, v214); + int16x8_t v216 = vaddq_s16(v208, v189); + int16x8_t v217 = vaddq_s16(v192, v197); + int16x8_t v218 = vaddq_s16(v216, v217); + int16x8_t v219 = vaddq_s16(v215, v218); + int16x8_t v220 = vaddq_s16(v217, v214); + int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573); + int16x8_t v221 = vaddq_s16(v221_tmp, v220); + int16x8_t v222 = vld1q_s16(in + in_stride * 244 + i); + int16x8_t v223 = vld1q_s16(in + in_stride * 236 + i); + int16x8_t v224 = vaddq_s16(v222, v223); + int16x8_t v225 = vaddq_s16(v224, v205); + int16x8_t v226 = vaddq_s16(v225, v216); + int16x8_t v227 = vaddq_s16(v226, v220); + int16x8_t v228 = vaddq_s16(v221, v227); + int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734); + int16x8_t v230 = vaddq_s16(v219, v229); + int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705); + int16x8_t v232 = vaddq_s16(v213, v231); + int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463); + int16x8_t v234 = vaddq_s16(v182, v233); + int16x8_t v235 = vaddq_s16(v184, v142); + int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573); + int16x8_t v236 = vaddq_s16(v236_tmp, v235); + int16x8_t v237 = vaddq_s16(v188, v144); + int16x8_t v238 = vaddq_s16(v145, v190); + int16x8_t v239 = vaddq_s16(v237, v238); + int16x8_t v240 = vaddq_s16(v236, v239); + int16x8_t v241 = vaddq_s16(v196, v148); + int16x8_t v242 = vaddq_s16(v149, v198); + int16x8_t v243 = vaddq_s16(v241, v242); + int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573); + int16x8_t v244 = vaddq_s16(v244_tmp, v243); + int16x8_t v245 = vaddq_s16(v204, v152); + int16x8_t v246 = vaddq_s16(v153, v206); + int16x8_t v247 = vaddq_s16(v245, v246); + int16x8_t v248 = vaddq_s16(v247, v243); + int16x8_t v249 = vaddq_s16(v244, v248); + int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734); + int16x8_t v251 = vaddq_s16(v240, v250); + int16x8_t v252 = vaddq_s16(v199, v159); + int16x8_t v253 = vaddq_s16(v160, v183); + int16x8_t v254 = vaddq_s16(v252, v253); + int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573); + int16x8_t v255 = vaddq_s16(v255_tmp, v254); + int16x8_t v256 = vaddq_s16(v207, v163); + int16x8_t v257 = vaddq_s16(v164, v187); + int16x8_t v258 = vaddq_s16(v256, v257); + int16x8_t v259 = vaddq_s16(v191, v166); + int16x8_t v260 = vaddq_s16(v167, v195); + int16x8_t v261 = vaddq_s16(v259, v260); + int16x8_t v262 = vaddq_s16(v258, v261); + int16x8_t v263 = vaddq_s16(v255, v262); + int16x8_t v264 = vaddq_s16(v261, v254); + int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573); + int16x8_t v265 = vaddq_s16(v265_tmp, v264); + int16x8_t v266 = vaddq_s16(v223, v173); + int16x8_t v267 = vaddq_s16(v174, v203); + int16x8_t v268 = vaddq_s16(v266, v267); + int16x8_t v269 = vaddq_s16(v268, v258); + int16x8_t v270 = vaddq_s16(v269, v264); + int16x8_t v271 = vaddq_s16(v265, v270); + int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734); + int16x8_t v273 = vaddq_s16(v263, v272); + int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705); + int16x8_t v275 = vaddq_s16(v251, v274); + int16x8_t v276 = vaddq_s16(v253, v235); + int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573); + int16x8_t v277 = vaddq_s16(v277_tmp, v276); + int16x8_t v278 = vaddq_s16(v257, v237); + int16x8_t v279 = vaddq_s16(v238, v259); + int16x8_t v280 = vaddq_s16(v278, v279); + int16x8_t v281 = vaddq_s16(v277, v280); + int16x8_t v282 = vaddq_s16(v260, v241); + int16x8_t v283 = vaddq_s16(v242, v252); + int16x8_t v284 = vaddq_s16(v282, v283); + int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573); + int16x8_t v285 = vaddq_s16(v285_tmp, v284); + int16x8_t v286 = vaddq_s16(v267, v245); + int16x8_t v287 = vaddq_s16(v246, v256); + int16x8_t v288 = vaddq_s16(v286, v287); + int16x8_t v289 = vaddq_s16(v288, v284); + int16x8_t v290 = vaddq_s16(v285, v289); + int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734); + int16x8_t v292 = vaddq_s16(v281, v291); + int16x8_t v293 = vaddq_s16(v283, v276); + int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573); + int16x8_t v294 = vaddq_s16(v294_tmp, v293); + int16x8_t v295 = vaddq_s16(v287, v278); + int16x8_t v296 = vaddq_s16(v279, v282); + int16x8_t v297 = vaddq_s16(v295, v296); + int16x8_t v298 = vaddq_s16(v294, v297); + int16x8_t v299 = vaddq_s16(v296, v293); + int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573); + int16x8_t v300 = vaddq_s16(v300_tmp, v299); + int16x8_t v301 = vld1q_s16(in + in_stride * 252 + i); + int16x8_t v302 = vaddq_s16(v301, v222); + int16x8_t v303 = vaddq_s16(v302, v266); + int16x8_t v304 = vaddq_s16(v303, v286); + int16x8_t v305 = vaddq_s16(v304, v295); + int16x8_t v306 = vaddq_s16(v305, v299); + int16x8_t v307 = vaddq_s16(v300, v306); + int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734); + int16x8_t v309 = vaddq_s16(v298, v308); + int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705); + int16x8_t v311 = vaddq_s16(v292, v310); + int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463); + int16x8_t v313 = vaddq_s16(v275, v312); + int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404); + int16x8_t v315 = vaddq_s16(v234, v314); + int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389); + int16x8_t v317 = vaddq_s16(v141, v316); + int16x8_t v318 = vld1q_s16(in + in_stride * 2 + i); + int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573); + int16x8_t v319 = vaddq_s16(v319_tmp, v318); + int16x8_t v320 = vld1q_s16(in + in_stride * 130 + i); + int16x8_t v321 = vld1q_s16(in + in_stride * 126 + i); + int16x8_t v322 = vaddq_s16(v320, v321); + int16x8_t v323 = vaddq_s16(v319, v322); + int16x8_t v324 = vld1q_s16(in + in_stride * 66 + i); + int16x8_t v325 = vld1q_s16(in + in_stride * 62 + i); + int16x8_t v326 = vaddq_s16(v324, v325); + int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573); + int16x8_t v327 = vaddq_s16(v327_tmp, v326); + int16x8_t v328 = vld1q_s16(in + in_stride * 194 + i); + int16x8_t v329 = vld1q_s16(in + in_stride * 190 + i); + int16x8_t v330 = vaddq_s16(v328, v329); + int16x8_t v331 = vaddq_s16(v330, v326); + int16x8_t v332 = vaddq_s16(v327, v331); + int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734); + int16x8_t v334 = vaddq_s16(v323, v333); + int16x8_t v335 = vld1q_s16(in + in_stride * 34 + i); + int16x8_t v336 = vld1q_s16(in + in_stride * 30 + i); + int16x8_t v337 = vaddq_s16(v335, v336); + int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573); + int16x8_t v338 = vaddq_s16(v338_tmp, v337); + int16x8_t v339 = vld1q_s16(in + in_stride * 162 + i); + int16x8_t v340 = vld1q_s16(in + in_stride * 158 + i); + int16x8_t v341 = vaddq_s16(v339, v340); + int16x8_t v342 = vld1q_s16(in + in_stride * 98 + i); + int16x8_t v343 = vld1q_s16(in + in_stride * 94 + i); + int16x8_t v344 = vaddq_s16(v342, v343); + int16x8_t v345 = vaddq_s16(v341, v344); + int16x8_t v346 = vaddq_s16(v338, v345); + int16x8_t v347 = vaddq_s16(v344, v337); + int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573); + int16x8_t v348 = vaddq_s16(v348_tmp, v347); + int16x8_t v349 = vld1q_s16(in + in_stride * 226 + i); + int16x8_t v350 = vld1q_s16(in + in_stride * 222 + i); + int16x8_t v351 = vaddq_s16(v349, v350); + int16x8_t v352 = vaddq_s16(v351, v341); + int16x8_t v353 = vaddq_s16(v352, v347); + int16x8_t v354 = vaddq_s16(v348, v353); + int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734); + int16x8_t v356 = vaddq_s16(v346, v355); + int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705); + int16x8_t v358 = vaddq_s16(v334, v357); + int16x8_t v359 = vld1q_s16(in + in_stride * 18 + i); + int16x8_t v360 = vld1q_s16(in + in_stride * 14 + i); + int16x8_t v361 = vaddq_s16(v359, v360); + int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573); + int16x8_t v362 = vaddq_s16(v362_tmp, v361); + int16x8_t v363 = vld1q_s16(in + in_stride * 146 + i); + int16x8_t v364 = vld1q_s16(in + in_stride * 142 + i); + int16x8_t v365 = vaddq_s16(v363, v364); + int16x8_t v366 = vld1q_s16(in + in_stride * 114 + i); + int16x8_t v367 = vld1q_s16(in + in_stride * 110 + i); + int16x8_t v368 = vaddq_s16(v366, v367); + int16x8_t v369 = vaddq_s16(v365, v368); + int16x8_t v370 = vaddq_s16(v362, v369); + int16x8_t v371 = vld1q_s16(in + in_stride * 82 + i); + int16x8_t v372 = vld1q_s16(in + in_stride * 78 + i); + int16x8_t v373 = vaddq_s16(v371, v372); + int16x8_t v374 = vld1q_s16(in + in_stride * 50 + i); + int16x8_t v375 = vld1q_s16(in + in_stride * 46 + i); + int16x8_t v376 = vaddq_s16(v374, v375); + int16x8_t v377 = vaddq_s16(v373, v376); + int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573); + int16x8_t v378 = vaddq_s16(v378_tmp, v377); + int16x8_t v379 = vld1q_s16(in + in_stride * 210 + i); + int16x8_t v380 = vld1q_s16(in + in_stride * 206 + i); + int16x8_t v381 = vaddq_s16(v379, v380); + int16x8_t v382 = vld1q_s16(in + in_stride * 178 + i); + int16x8_t v383 = vld1q_s16(in + in_stride * 174 + i); + int16x8_t v384 = vaddq_s16(v382, v383); + int16x8_t v385 = vaddq_s16(v381, v384); + int16x8_t v386 = vaddq_s16(v385, v377); + int16x8_t v387 = vaddq_s16(v378, v386); + int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734); + int16x8_t v389 = vaddq_s16(v370, v388); + int16x8_t v390 = vaddq_s16(v376, v361); + int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573); + int16x8_t v391 = vaddq_s16(v391_tmp, v390); + int16x8_t v392 = vaddq_s16(v384, v365); + int16x8_t v393 = vaddq_s16(v368, v373); + int16x8_t v394 = vaddq_s16(v392, v393); + int16x8_t v395 = vaddq_s16(v391, v394); + int16x8_t v396 = vaddq_s16(v393, v390); + int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573); + int16x8_t v397 = vaddq_s16(v397_tmp, v396); + int16x8_t v398 = vld1q_s16(in + in_stride * 242 + i); + int16x8_t v399 = vld1q_s16(in + in_stride * 238 + i); + int16x8_t v400 = vaddq_s16(v398, v399); + int16x8_t v401 = vaddq_s16(v400, v381); + int16x8_t v402 = vaddq_s16(v401, v392); + int16x8_t v403 = vaddq_s16(v402, v396); + int16x8_t v404 = vaddq_s16(v397, v403); + int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734); + int16x8_t v406 = vaddq_s16(v395, v405); + int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705); + int16x8_t v408 = vaddq_s16(v389, v407); + int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463); + int16x8_t v410 = vaddq_s16(v358, v409); + int16x8_t v411 = vld1q_s16(in + in_stride * 10 + i); + int16x8_t v412 = vld1q_s16(in + in_stride * 6 + i); + int16x8_t v413 = vaddq_s16(v411, v412); + int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573); + int16x8_t v414 = vaddq_s16(v414_tmp, v413); + int16x8_t v415 = vld1q_s16(in + in_stride * 138 + i); + int16x8_t v416 = vld1q_s16(in + in_stride * 134 + i); + int16x8_t v417 = vaddq_s16(v415, v416); + int16x8_t v418 = vld1q_s16(in + in_stride * 122 + i); + int16x8_t v419 = vld1q_s16(in + in_stride * 118 + i); + int16x8_t v420 = vaddq_s16(v418, v419); + int16x8_t v421 = vaddq_s16(v417, v420); + int16x8_t v422 = vaddq_s16(v414, v421); + int16x8_t v423 = vld1q_s16(in + in_stride * 74 + i); + int16x8_t v424 = vld1q_s16(in + in_stride * 70 + i); + int16x8_t v425 = vaddq_s16(v423, v424); + int16x8_t v426 = vld1q_s16(in + in_stride * 58 + i); + int16x8_t v427 = vld1q_s16(in + in_stride * 54 + i); + int16x8_t v428 = vaddq_s16(v426, v427); + int16x8_t v429 = vaddq_s16(v425, v428); + int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573); + int16x8_t v430 = vaddq_s16(v430_tmp, v429); + int16x8_t v431 = vld1q_s16(in + in_stride * 202 + i); + int16x8_t v432 = vld1q_s16(in + in_stride * 198 + i); + int16x8_t v433 = vaddq_s16(v431, v432); + int16x8_t v434 = vld1q_s16(in + in_stride * 186 + i); + int16x8_t v435 = vld1q_s16(in + in_stride * 182 + i); + int16x8_t v436 = vaddq_s16(v434, v435); + int16x8_t v437 = vaddq_s16(v433, v436); + int16x8_t v438 = vaddq_s16(v437, v429); + int16x8_t v439 = vaddq_s16(v430, v438); + int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734); + int16x8_t v441 = vaddq_s16(v422, v440); + int16x8_t v442 = vld1q_s16(in + in_stride * 42 + i); + int16x8_t v443 = vld1q_s16(in + in_stride * 38 + i); + int16x8_t v444 = vaddq_s16(v442, v443); + int16x8_t v445 = vld1q_s16(in + in_stride * 26 + i); + int16x8_t v446 = vld1q_s16(in + in_stride * 22 + i); + int16x8_t v447 = vaddq_s16(v445, v446); + int16x8_t v448 = vaddq_s16(v444, v447); + int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573); + int16x8_t v449 = vaddq_s16(v449_tmp, v448); + int16x8_t v450 = vld1q_s16(in + in_stride * 170 + i); + int16x8_t v451 = vld1q_s16(in + in_stride * 166 + i); + int16x8_t v452 = vaddq_s16(v450, v451); + int16x8_t v453 = vld1q_s16(in + in_stride * 154 + i); + int16x8_t v454 = vld1q_s16(in + in_stride * 150 + i); + int16x8_t v455 = vaddq_s16(v453, v454); + int16x8_t v456 = vaddq_s16(v452, v455); + int16x8_t v457 = vld1q_s16(in + in_stride * 106 + i); + int16x8_t v458 = vld1q_s16(in + in_stride * 102 + i); + int16x8_t v459 = vaddq_s16(v457, v458); + int16x8_t v460 = vld1q_s16(in + in_stride * 90 + i); + int16x8_t v461 = vld1q_s16(in + in_stride * 86 + i); + int16x8_t v462 = vaddq_s16(v460, v461); + int16x8_t v463 = vaddq_s16(v459, v462); + int16x8_t v464 = vaddq_s16(v456, v463); + int16x8_t v465 = vaddq_s16(v449, v464); + int16x8_t v466 = vaddq_s16(v463, v448); + int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573); + int16x8_t v467 = vaddq_s16(v467_tmp, v466); + int16x8_t v468 = vld1q_s16(in + in_stride * 234 + i); + int16x8_t v469 = vld1q_s16(in + in_stride * 230 + i); + int16x8_t v470 = vaddq_s16(v468, v469); + int16x8_t v471 = vld1q_s16(in + in_stride * 218 + i); + int16x8_t v472 = vld1q_s16(in + in_stride * 214 + i); + int16x8_t v473 = vaddq_s16(v471, v472); + int16x8_t v474 = vaddq_s16(v470, v473); + int16x8_t v475 = vaddq_s16(v474, v456); + int16x8_t v476 = vaddq_s16(v475, v466); + int16x8_t v477 = vaddq_s16(v467, v476); + int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734); + int16x8_t v479 = vaddq_s16(v465, v478); + int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705); + int16x8_t v481 = vaddq_s16(v441, v480); + int16x8_t v482 = vaddq_s16(v447, v413); + int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573); + int16x8_t v483 = vaddq_s16(v483_tmp, v482); + int16x8_t v484 = vaddq_s16(v455, v417); + int16x8_t v485 = vaddq_s16(v420, v459); + int16x8_t v486 = vaddq_s16(v484, v485); + int16x8_t v487 = vaddq_s16(v483, v486); + int16x8_t v488 = vaddq_s16(v462, v425); + int16x8_t v489 = vaddq_s16(v428, v444); + int16x8_t v490 = vaddq_s16(v488, v489); + int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573); + int16x8_t v491 = vaddq_s16(v491_tmp, v490); + int16x8_t v492 = vaddq_s16(v473, v433); + int16x8_t v493 = vaddq_s16(v436, v452); + int16x8_t v494 = vaddq_s16(v492, v493); + int16x8_t v495 = vaddq_s16(v494, v490); + int16x8_t v496 = vaddq_s16(v491, v495); + int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734); + int16x8_t v498 = vaddq_s16(v487, v497); + int16x8_t v499 = vaddq_s16(v489, v482); + int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573); + int16x8_t v500 = vaddq_s16(v500_tmp, v499); + int16x8_t v501 = vaddq_s16(v493, v484); + int16x8_t v502 = vaddq_s16(v485, v488); + int16x8_t v503 = vaddq_s16(v501, v502); + int16x8_t v504 = vaddq_s16(v500, v503); + int16x8_t v505 = vaddq_s16(v502, v499); + int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573); + int16x8_t v506 = vaddq_s16(v506_tmp, v505); + int16x8_t v507 = vld1q_s16(in + in_stride * 250 + i); + int16x8_t v508 = vld1q_s16(in + in_stride * 246 + i); + int16x8_t v509 = vaddq_s16(v507, v508); + int16x8_t v510 = vaddq_s16(v509, v470); + int16x8_t v511 = vaddq_s16(v510, v492); + int16x8_t v512 = vaddq_s16(v511, v501); + int16x8_t v513 = vaddq_s16(v512, v505); + int16x8_t v514 = vaddq_s16(v506, v513); + int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734); + int16x8_t v516 = vaddq_s16(v504, v515); + int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705); + int16x8_t v518 = vaddq_s16(v498, v517); + int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463); + int16x8_t v520 = vaddq_s16(v481, v519); + int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404); + int16x8_t v522 = vaddq_s16(v410, v521); + int16x8_t v523 = vaddq_s16(v412, v318); + int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573); + int16x8_t v524 = vaddq_s16(v524_tmp, v523); + int16x8_t v525 = vaddq_s16(v416, v320); + int16x8_t v526 = vaddq_s16(v321, v418); + int16x8_t v527 = vaddq_s16(v525, v526); + int16x8_t v528 = vaddq_s16(v524, v527); + int16x8_t v529 = vaddq_s16(v424, v324); + int16x8_t v530 = vaddq_s16(v325, v426); + int16x8_t v531 = vaddq_s16(v529, v530); + int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573); + int16x8_t v532 = vaddq_s16(v532_tmp, v531); + int16x8_t v533 = vaddq_s16(v432, v328); + int16x8_t v534 = vaddq_s16(v329, v434); + int16x8_t v535 = vaddq_s16(v533, v534); + int16x8_t v536 = vaddq_s16(v535, v531); + int16x8_t v537 = vaddq_s16(v532, v536); + int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734); + int16x8_t v539 = vaddq_s16(v528, v538); + int16x8_t v540 = vaddq_s16(v443, v335); + int16x8_t v541 = vaddq_s16(v336, v445); + int16x8_t v542 = vaddq_s16(v540, v541); + int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573); + int16x8_t v543 = vaddq_s16(v543_tmp, v542); + int16x8_t v544 = vaddq_s16(v451, v339); + int16x8_t v545 = vaddq_s16(v340, v453); + int16x8_t v546 = vaddq_s16(v544, v545); + int16x8_t v547 = vaddq_s16(v458, v342); + int16x8_t v548 = vaddq_s16(v343, v460); + int16x8_t v549 = vaddq_s16(v547, v548); + int16x8_t v550 = vaddq_s16(v546, v549); + int16x8_t v551 = vaddq_s16(v543, v550); + int16x8_t v552 = vaddq_s16(v549, v542); + int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573); + int16x8_t v553 = vaddq_s16(v553_tmp, v552); + int16x8_t v554 = vaddq_s16(v469, v349); + int16x8_t v555 = vaddq_s16(v350, v471); + int16x8_t v556 = vaddq_s16(v554, v555); + int16x8_t v557 = vaddq_s16(v556, v546); + int16x8_t v558 = vaddq_s16(v557, v552); + int16x8_t v559 = vaddq_s16(v553, v558); + int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734); + int16x8_t v561 = vaddq_s16(v551, v560); + int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705); + int16x8_t v563 = vaddq_s16(v539, v562); + int16x8_t v564 = vaddq_s16(v446, v359); + int16x8_t v565 = vaddq_s16(v360, v411); + int16x8_t v566 = vaddq_s16(v564, v565); + int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573); + int16x8_t v567 = vaddq_s16(v567_tmp, v566); + int16x8_t v568 = vaddq_s16(v454, v363); + int16x8_t v569 = vaddq_s16(v364, v415); + int16x8_t v570 = vaddq_s16(v568, v569); + int16x8_t v571 = vaddq_s16(v419, v366); + int16x8_t v572 = vaddq_s16(v367, v457); + int16x8_t v573 = vaddq_s16(v571, v572); + int16x8_t v574 = vaddq_s16(v570, v573); + int16x8_t v575 = vaddq_s16(v567, v574); + int16x8_t v576 = vaddq_s16(v461, v371); + int16x8_t v577 = vaddq_s16(v372, v423); + int16x8_t v578 = vaddq_s16(v576, v577); + int16x8_t v579 = vaddq_s16(v427, v374); + int16x8_t v580 = vaddq_s16(v375, v442); + int16x8_t v581 = vaddq_s16(v579, v580); + int16x8_t v582 = vaddq_s16(v578, v581); + int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573); + int16x8_t v583 = vaddq_s16(v583_tmp, v582); + int16x8_t v584 = vaddq_s16(v472, v379); + int16x8_t v585 = vaddq_s16(v380, v431); + int16x8_t v586 = vaddq_s16(v584, v585); + int16x8_t v587 = vaddq_s16(v435, v382); + int16x8_t v588 = vaddq_s16(v383, v450); + int16x8_t v589 = vaddq_s16(v587, v588); + int16x8_t v590 = vaddq_s16(v586, v589); + int16x8_t v591 = vaddq_s16(v590, v582); + int16x8_t v592 = vaddq_s16(v583, v591); + int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734); + int16x8_t v594 = vaddq_s16(v575, v593); + int16x8_t v595 = vaddq_s16(v581, v566); + int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573); + int16x8_t v596 = vaddq_s16(v596_tmp, v595); + int16x8_t v597 = vaddq_s16(v589, v570); + int16x8_t v598 = vaddq_s16(v573, v578); + int16x8_t v599 = vaddq_s16(v597, v598); + int16x8_t v600 = vaddq_s16(v596, v599); + int16x8_t v601 = vaddq_s16(v598, v595); + int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573); + int16x8_t v602 = vaddq_s16(v602_tmp, v601); + int16x8_t v603 = vaddq_s16(v508, v398); + int16x8_t v604 = vaddq_s16(v399, v468); + int16x8_t v605 = vaddq_s16(v603, v604); + int16x8_t v606 = vaddq_s16(v605, v586); + int16x8_t v607 = vaddq_s16(v606, v597); + int16x8_t v608 = vaddq_s16(v607, v601); + int16x8_t v609 = vaddq_s16(v602, v608); + int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734); + int16x8_t v611 = vaddq_s16(v600, v610); + int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705); + int16x8_t v613 = vaddq_s16(v594, v612); + int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463); + int16x8_t v615 = vaddq_s16(v563, v614); + int16x8_t v616 = vaddq_s16(v565, v523); + int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573); + int16x8_t v617 = vaddq_s16(v617_tmp, v616); + int16x8_t v618 = vaddq_s16(v569, v525); + int16x8_t v619 = vaddq_s16(v526, v571); + int16x8_t v620 = vaddq_s16(v618, v619); + int16x8_t v621 = vaddq_s16(v617, v620); + int16x8_t v622 = vaddq_s16(v577, v529); + int16x8_t v623 = vaddq_s16(v530, v579); + int16x8_t v624 = vaddq_s16(v622, v623); + int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573); + int16x8_t v625 = vaddq_s16(v625_tmp, v624); + int16x8_t v626 = vaddq_s16(v585, v533); + int16x8_t v627 = vaddq_s16(v534, v587); + int16x8_t v628 = vaddq_s16(v626, v627); + int16x8_t v629 = vaddq_s16(v628, v624); + int16x8_t v630 = vaddq_s16(v625, v629); + int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734); + int16x8_t v632 = vaddq_s16(v621, v631); + int16x8_t v633 = vaddq_s16(v580, v540); + int16x8_t v634 = vaddq_s16(v541, v564); + int16x8_t v635 = vaddq_s16(v633, v634); + int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573); + int16x8_t v636 = vaddq_s16(v636_tmp, v635); + int16x8_t v637 = vaddq_s16(v588, v544); + int16x8_t v638 = vaddq_s16(v545, v568); + int16x8_t v639 = vaddq_s16(v637, v638); + int16x8_t v640 = vaddq_s16(v572, v547); + int16x8_t v641 = vaddq_s16(v548, v576); + int16x8_t v642 = vaddq_s16(v640, v641); + int16x8_t v643 = vaddq_s16(v639, v642); + int16x8_t v644 = vaddq_s16(v636, v643); + int16x8_t v645 = vaddq_s16(v642, v635); + int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573); + int16x8_t v646 = vaddq_s16(v646_tmp, v645); + int16x8_t v647 = vaddq_s16(v604, v554); + int16x8_t v648 = vaddq_s16(v555, v584); + int16x8_t v649 = vaddq_s16(v647, v648); + int16x8_t v650 = vaddq_s16(v649, v639); + int16x8_t v651 = vaddq_s16(v650, v645); + int16x8_t v652 = vaddq_s16(v646, v651); + int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734); + int16x8_t v654 = vaddq_s16(v644, v653); + int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705); + int16x8_t v656 = vaddq_s16(v632, v655); + int16x8_t v657 = vaddq_s16(v634, v616); + int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573); + int16x8_t v658 = vaddq_s16(v658_tmp, v657); + int16x8_t v659 = vaddq_s16(v638, v618); + int16x8_t v660 = vaddq_s16(v619, v640); + int16x8_t v661 = vaddq_s16(v659, v660); + int16x8_t v662 = vaddq_s16(v658, v661); + int16x8_t v663 = vaddq_s16(v641, v622); + int16x8_t v664 = vaddq_s16(v623, v633); + int16x8_t v665 = vaddq_s16(v663, v664); + int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573); + int16x8_t v666 = vaddq_s16(v666_tmp, v665); + int16x8_t v667 = vaddq_s16(v648, v626); + int16x8_t v668 = vaddq_s16(v627, v637); + int16x8_t v669 = vaddq_s16(v667, v668); + int16x8_t v670 = vaddq_s16(v669, v665); + int16x8_t v671 = vaddq_s16(v666, v670); + int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734); + int16x8_t v673 = vaddq_s16(v662, v672); + int16x8_t v674 = vaddq_s16(v664, v657); + int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573); + int16x8_t v675 = vaddq_s16(v675_tmp, v674); + int16x8_t v676 = vaddq_s16(v668, v659); + int16x8_t v677 = vaddq_s16(v660, v663); + int16x8_t v678 = vaddq_s16(v676, v677); + int16x8_t v679 = vaddq_s16(v675, v678); + int16x8_t v680 = vaddq_s16(v677, v674); + int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573); + int16x8_t v681 = vaddq_s16(v681_tmp, v680); + int16x8_t v682 = vld1q_s16(in + in_stride * 254 + i); + int16x8_t v683 = vaddq_s16(v682, v507); + int16x8_t v684 = vaddq_s16(v683, v603); + int16x8_t v685 = vaddq_s16(v684, v647); + int16x8_t v686 = vaddq_s16(v685, v667); + int16x8_t v687 = vaddq_s16(v686, v676); + int16x8_t v688 = vaddq_s16(v687, v680); + int16x8_t v689 = vaddq_s16(v681, v688); + int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734); + int16x8_t v691 = vaddq_s16(v679, v690); + int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705); + int16x8_t v693 = vaddq_s16(v673, v692); + int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463); + int16x8_t v695 = vaddq_s16(v656, v694); + int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404); + int16x8_t v697 = vaddq_s16(v615, v696); + int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389); + int16x8_t v699 = vaddq_s16(v522, v698); + int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385); + int16x8_t v701 = vaddq_s16(v317, v700); + int16x8_t v702 = vld1q_s16(in + in_stride * 1 + i); + int16x8_t v703_tmp = vqrdmulhq_n_s16(v702, 13573); + int16x8_t v703 = vaddq_s16(v703_tmp, v702); + int16x8_t v704 = vld1q_s16(in + in_stride * 129 + i); + int16x8_t v705 = vld1q_s16(in + in_stride * 127 + i); + int16x8_t v706 = vaddq_s16(v704, v705); + int16x8_t v707 = vaddq_s16(v703, v706); + int16x8_t v708 = vld1q_s16(in + in_stride * 65 + i); + int16x8_t v709 = vld1q_s16(in + in_stride * 63 + i); + int16x8_t v710 = vaddq_s16(v708, v709); + int16x8_t v711_tmp = vqrdmulhq_n_s16(v710, 13573); + int16x8_t v711 = vaddq_s16(v711_tmp, v710); + int16x8_t v712 = vld1q_s16(in + in_stride * 193 + i); + int16x8_t v713 = vld1q_s16(in + in_stride * 191 + i); + int16x8_t v714 = vaddq_s16(v712, v713); + int16x8_t v715 = vaddq_s16(v714, v710); + int16x8_t v716 = vaddq_s16(v711, v715); + int16x8_t v717 = vqrdmulhq_n_s16(v716, 17734); + int16x8_t v718 = vaddq_s16(v707, v717); + int16x8_t v719 = vld1q_s16(in + in_stride * 33 + i); + int16x8_t v720 = vld1q_s16(in + in_stride * 31 + i); + int16x8_t v721 = vaddq_s16(v719, v720); + int16x8_t v722_tmp = vqrdmulhq_n_s16(v721, 13573); + int16x8_t v722 = vaddq_s16(v722_tmp, v721); + int16x8_t v723 = vld1q_s16(in + in_stride * 161 + i); + int16x8_t v724 = vld1q_s16(in + in_stride * 159 + i); + int16x8_t v725 = vaddq_s16(v723, v724); + int16x8_t v726 = vld1q_s16(in + in_stride * 97 + i); + int16x8_t v727 = vld1q_s16(in + in_stride * 95 + i); + int16x8_t v728 = vaddq_s16(v726, v727); + int16x8_t v729 = vaddq_s16(v725, v728); + int16x8_t v730 = vaddq_s16(v722, v729); + int16x8_t v731 = vaddq_s16(v728, v721); + int16x8_t v732_tmp = vqrdmulhq_n_s16(v731, 13573); + int16x8_t v732 = vaddq_s16(v732_tmp, v731); + int16x8_t v733 = vld1q_s16(in + in_stride * 225 + i); + int16x8_t v734 = vld1q_s16(in + in_stride * 223 + i); + int16x8_t v735 = vaddq_s16(v733, v734); + int16x8_t v736 = vaddq_s16(v735, v725); + int16x8_t v737 = vaddq_s16(v736, v731); + int16x8_t v738 = vaddq_s16(v732, v737); + int16x8_t v739 = vqrdmulhq_n_s16(v738, 17734); + int16x8_t v740 = vaddq_s16(v730, v739); + int16x8_t v741 = vqrdmulhq_n_s16(v740, 16705); + int16x8_t v742 = vaddq_s16(v718, v741); + int16x8_t v743 = vld1q_s16(in + in_stride * 17 + i); + int16x8_t v744 = vld1q_s16(in + in_stride * 15 + i); + int16x8_t v745 = vaddq_s16(v743, v744); + int16x8_t v746_tmp = vqrdmulhq_n_s16(v745, 13573); + int16x8_t v746 = vaddq_s16(v746_tmp, v745); + int16x8_t v747 = vld1q_s16(in + in_stride * 145 + i); + int16x8_t v748 = vld1q_s16(in + in_stride * 143 + i); + int16x8_t v749 = vaddq_s16(v747, v748); + int16x8_t v750 = vld1q_s16(in + in_stride * 113 + i); + int16x8_t v751 = vld1q_s16(in + in_stride * 111 + i); + int16x8_t v752 = vaddq_s16(v750, v751); + int16x8_t v753 = vaddq_s16(v749, v752); + int16x8_t v754 = vaddq_s16(v746, v753); + int16x8_t v755 = vld1q_s16(in + in_stride * 81 + i); + int16x8_t v756 = vld1q_s16(in + in_stride * 79 + i); + int16x8_t v757 = vaddq_s16(v755, v756); + int16x8_t v758 = vld1q_s16(in + in_stride * 49 + i); + int16x8_t v759 = vld1q_s16(in + in_stride * 47 + i); + int16x8_t v760 = vaddq_s16(v758, v759); + int16x8_t v761 = vaddq_s16(v757, v760); + int16x8_t v762_tmp = vqrdmulhq_n_s16(v761, 13573); + int16x8_t v762 = vaddq_s16(v762_tmp, v761); + int16x8_t v763 = vld1q_s16(in + in_stride * 209 + i); + int16x8_t v764 = vld1q_s16(in + in_stride * 207 + i); + int16x8_t v765 = vaddq_s16(v763, v764); + int16x8_t v766 = vld1q_s16(in + in_stride * 177 + i); + int16x8_t v767 = vld1q_s16(in + in_stride * 175 + i); + int16x8_t v768 = vaddq_s16(v766, v767); + int16x8_t v769 = vaddq_s16(v765, v768); + int16x8_t v770 = vaddq_s16(v769, v761); + int16x8_t v771 = vaddq_s16(v762, v770); + int16x8_t v772 = vqrdmulhq_n_s16(v771, 17734); + int16x8_t v773 = vaddq_s16(v754, v772); + int16x8_t v774 = vaddq_s16(v760, v745); + int16x8_t v775_tmp = vqrdmulhq_n_s16(v774, 13573); + int16x8_t v775 = vaddq_s16(v775_tmp, v774); + int16x8_t v776 = vaddq_s16(v768, v749); + int16x8_t v777 = vaddq_s16(v752, v757); + int16x8_t v778 = vaddq_s16(v776, v777); + int16x8_t v779 = vaddq_s16(v775, v778); + int16x8_t v780 = vaddq_s16(v777, v774); + int16x8_t v781_tmp = vqrdmulhq_n_s16(v780, 13573); + int16x8_t v781 = vaddq_s16(v781_tmp, v780); + int16x8_t v782 = vld1q_s16(in + in_stride * 241 + i); + int16x8_t v783 = vld1q_s16(in + in_stride * 239 + i); + int16x8_t v784 = vaddq_s16(v782, v783); + int16x8_t v785 = vaddq_s16(v784, v765); + int16x8_t v786 = vaddq_s16(v785, v776); + int16x8_t v787 = vaddq_s16(v786, v780); + int16x8_t v788 = vaddq_s16(v781, v787); + int16x8_t v789 = vqrdmulhq_n_s16(v788, 17734); + int16x8_t v790 = vaddq_s16(v779, v789); + int16x8_t v791 = vqrdmulhq_n_s16(v790, 16705); + int16x8_t v792 = vaddq_s16(v773, v791); + int16x8_t v793 = vqrdmulhq_n_s16(v792, 16463); + int16x8_t v794 = vaddq_s16(v742, v793); + int16x8_t v795 = vld1q_s16(in + in_stride * 9 + i); + int16x8_t v796 = vld1q_s16(in + in_stride * 7 + i); + int16x8_t v797 = vaddq_s16(v795, v796); + int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 13573); + int16x8_t v798 = vaddq_s16(v798_tmp, v797); + int16x8_t v799 = vld1q_s16(in + in_stride * 137 + i); + int16x8_t v800 = vld1q_s16(in + in_stride * 135 + i); + int16x8_t v801 = vaddq_s16(v799, v800); + int16x8_t v802 = vld1q_s16(in + in_stride * 121 + i); + int16x8_t v803 = vld1q_s16(in + in_stride * 119 + i); + int16x8_t v804 = vaddq_s16(v802, v803); + int16x8_t v805 = vaddq_s16(v801, v804); + int16x8_t v806 = vaddq_s16(v798, v805); + int16x8_t v807 = vld1q_s16(in + in_stride * 73 + i); + int16x8_t v808 = vld1q_s16(in + in_stride * 71 + i); + int16x8_t v809 = vaddq_s16(v807, v808); + int16x8_t v810 = vld1q_s16(in + in_stride * 57 + i); + int16x8_t v811 = vld1q_s16(in + in_stride * 55 + i); + int16x8_t v812 = vaddq_s16(v810, v811); + int16x8_t v813 = vaddq_s16(v809, v812); + int16x8_t v814_tmp = vqrdmulhq_n_s16(v813, 13573); + int16x8_t v814 = vaddq_s16(v814_tmp, v813); + int16x8_t v815 = vld1q_s16(in + in_stride * 201 + i); + int16x8_t v816 = vld1q_s16(in + in_stride * 199 + i); + int16x8_t v817 = vaddq_s16(v815, v816); + int16x8_t v818 = vld1q_s16(in + in_stride * 185 + i); + int16x8_t v819 = vld1q_s16(in + in_stride * 183 + i); + int16x8_t v820 = vaddq_s16(v818, v819); + int16x8_t v821 = vaddq_s16(v817, v820); + int16x8_t v822 = vaddq_s16(v821, v813); + int16x8_t v823 = vaddq_s16(v814, v822); + int16x8_t v824 = vqrdmulhq_n_s16(v823, 17734); + int16x8_t v825 = vaddq_s16(v806, v824); + int16x8_t v826 = vld1q_s16(in + in_stride * 41 + i); + int16x8_t v827 = vld1q_s16(in + in_stride * 39 + i); + int16x8_t v828 = vaddq_s16(v826, v827); + int16x8_t v829 = vld1q_s16(in + in_stride * 25 + i); + int16x8_t v830 = vld1q_s16(in + in_stride * 23 + i); + int16x8_t v831 = vaddq_s16(v829, v830); + int16x8_t v832 = vaddq_s16(v828, v831); + int16x8_t v833_tmp = vqrdmulhq_n_s16(v832, 13573); + int16x8_t v833 = vaddq_s16(v833_tmp, v832); + int16x8_t v834 = vld1q_s16(in + in_stride * 169 + i); + int16x8_t v835 = vld1q_s16(in + in_stride * 167 + i); + int16x8_t v836 = vaddq_s16(v834, v835); + int16x8_t v837 = vld1q_s16(in + in_stride * 153 + i); + int16x8_t v838 = vld1q_s16(in + in_stride * 151 + i); + int16x8_t v839 = vaddq_s16(v837, v838); + int16x8_t v840 = vaddq_s16(v836, v839); + int16x8_t v841 = vld1q_s16(in + in_stride * 105 + i); + int16x8_t v842 = vld1q_s16(in + in_stride * 103 + i); + int16x8_t v843 = vaddq_s16(v841, v842); + int16x8_t v844 = vld1q_s16(in + in_stride * 89 + i); + int16x8_t v845 = vld1q_s16(in + in_stride * 87 + i); + int16x8_t v846 = vaddq_s16(v844, v845); + int16x8_t v847 = vaddq_s16(v843, v846); + int16x8_t v848 = vaddq_s16(v840, v847); + int16x8_t v849 = vaddq_s16(v833, v848); + int16x8_t v850 = vaddq_s16(v847, v832); + int16x8_t v851_tmp = vqrdmulhq_n_s16(v850, 13573); + int16x8_t v851 = vaddq_s16(v851_tmp, v850); + int16x8_t v852 = vld1q_s16(in + in_stride * 233 + i); + int16x8_t v853 = vld1q_s16(in + in_stride * 231 + i); + int16x8_t v854 = vaddq_s16(v852, v853); + int16x8_t v855 = vld1q_s16(in + in_stride * 217 + i); + int16x8_t v856 = vld1q_s16(in + in_stride * 215 + i); + int16x8_t v857 = vaddq_s16(v855, v856); + int16x8_t v858 = vaddq_s16(v854, v857); + int16x8_t v859 = vaddq_s16(v858, v840); + int16x8_t v860 = vaddq_s16(v859, v850); + int16x8_t v861 = vaddq_s16(v851, v860); + int16x8_t v862 = vqrdmulhq_n_s16(v861, 17734); + int16x8_t v863 = vaddq_s16(v849, v862); + int16x8_t v864 = vqrdmulhq_n_s16(v863, 16705); + int16x8_t v865 = vaddq_s16(v825, v864); + int16x8_t v866 = vaddq_s16(v831, v797); + int16x8_t v867_tmp = vqrdmulhq_n_s16(v866, 13573); + int16x8_t v867 = vaddq_s16(v867_tmp, v866); + int16x8_t v868 = vaddq_s16(v839, v801); + int16x8_t v869 = vaddq_s16(v804, v843); + int16x8_t v870 = vaddq_s16(v868, v869); + int16x8_t v871 = vaddq_s16(v867, v870); + int16x8_t v872 = vaddq_s16(v846, v809); + int16x8_t v873 = vaddq_s16(v812, v828); + int16x8_t v874 = vaddq_s16(v872, v873); + int16x8_t v875_tmp = vqrdmulhq_n_s16(v874, 13573); + int16x8_t v875 = vaddq_s16(v875_tmp, v874); + int16x8_t v876 = vaddq_s16(v857, v817); + int16x8_t v877 = vaddq_s16(v820, v836); + int16x8_t v878 = vaddq_s16(v876, v877); + int16x8_t v879 = vaddq_s16(v878, v874); + int16x8_t v880 = vaddq_s16(v875, v879); + int16x8_t v881 = vqrdmulhq_n_s16(v880, 17734); + int16x8_t v882 = vaddq_s16(v871, v881); + int16x8_t v883 = vaddq_s16(v873, v866); + int16x8_t v884_tmp = vqrdmulhq_n_s16(v883, 13573); + int16x8_t v884 = vaddq_s16(v884_tmp, v883); + int16x8_t v885 = vaddq_s16(v877, v868); + int16x8_t v886 = vaddq_s16(v869, v872); + int16x8_t v887 = vaddq_s16(v885, v886); + int16x8_t v888 = vaddq_s16(v884, v887); + int16x8_t v889 = vaddq_s16(v886, v883); + int16x8_t v890_tmp = vqrdmulhq_n_s16(v889, 13573); + int16x8_t v890 = vaddq_s16(v890_tmp, v889); + int16x8_t v891 = vld1q_s16(in + in_stride * 249 + i); + int16x8_t v892 = vld1q_s16(in + in_stride * 247 + i); + int16x8_t v893 = vaddq_s16(v891, v892); + int16x8_t v894 = vaddq_s16(v893, v854); + int16x8_t v895 = vaddq_s16(v894, v876); + int16x8_t v896 = vaddq_s16(v895, v885); + int16x8_t v897 = vaddq_s16(v896, v889); + int16x8_t v898 = vaddq_s16(v890, v897); + int16x8_t v899 = vqrdmulhq_n_s16(v898, 17734); + int16x8_t v900 = vaddq_s16(v888, v899); + int16x8_t v901 = vqrdmulhq_n_s16(v900, 16705); + int16x8_t v902 = vaddq_s16(v882, v901); + int16x8_t v903 = vqrdmulhq_n_s16(v902, 16463); + int16x8_t v904 = vaddq_s16(v865, v903); + int16x8_t v905 = vqrdmulhq_n_s16(v904, 16404); + int16x8_t v906 = vaddq_s16(v794, v905); + int16x8_t v907 = vld1q_s16(in + in_stride * 5 + i); + int16x8_t v908 = vld1q_s16(in + in_stride * 3 + i); + int16x8_t v909 = vaddq_s16(v907, v908); + int16x8_t v910_tmp = vqrdmulhq_n_s16(v909, 13573); + int16x8_t v910 = vaddq_s16(v910_tmp, v909); + int16x8_t v911 = vld1q_s16(in + in_stride * 133 + i); + int16x8_t v912 = vld1q_s16(in + in_stride * 131 + i); + int16x8_t v913 = vaddq_s16(v911, v912); + int16x8_t v914 = vld1q_s16(in + in_stride * 125 + i); + int16x8_t v915 = vld1q_s16(in + in_stride * 123 + i); + int16x8_t v916 = vaddq_s16(v914, v915); + int16x8_t v917 = vaddq_s16(v913, v916); + int16x8_t v918 = vaddq_s16(v910, v917); + int16x8_t v919 = vld1q_s16(in + in_stride * 69 + i); + int16x8_t v920 = vld1q_s16(in + in_stride * 67 + i); + int16x8_t v921 = vaddq_s16(v919, v920); + int16x8_t v922 = vld1q_s16(in + in_stride * 61 + i); + int16x8_t v923 = vld1q_s16(in + in_stride * 59 + i); + int16x8_t v924 = vaddq_s16(v922, v923); + int16x8_t v925 = vaddq_s16(v921, v924); + int16x8_t v926_tmp = vqrdmulhq_n_s16(v925, 13573); + int16x8_t v926 = vaddq_s16(v926_tmp, v925); + int16x8_t v927 = vld1q_s16(in + in_stride * 197 + i); + int16x8_t v928 = vld1q_s16(in + in_stride * 195 + i); + int16x8_t v929 = vaddq_s16(v927, v928); + int16x8_t v930 = vld1q_s16(in + in_stride * 189 + i); + int16x8_t v931 = vld1q_s16(in + in_stride * 187 + i); + int16x8_t v932 = vaddq_s16(v930, v931); + int16x8_t v933 = vaddq_s16(v929, v932); + int16x8_t v934 = vaddq_s16(v933, v925); + int16x8_t v935 = vaddq_s16(v926, v934); + int16x8_t v936 = vqrdmulhq_n_s16(v935, 17734); + int16x8_t v937 = vaddq_s16(v918, v936); + int16x8_t v938 = vld1q_s16(in + in_stride * 37 + i); + int16x8_t v939 = vld1q_s16(in + in_stride * 35 + i); + int16x8_t v940 = vaddq_s16(v938, v939); + int16x8_t v941 = vld1q_s16(in + in_stride * 29 + i); + int16x8_t v942 = vld1q_s16(in + in_stride * 27 + i); + int16x8_t v943 = vaddq_s16(v941, v942); + int16x8_t v944 = vaddq_s16(v940, v943); + int16x8_t v945_tmp = vqrdmulhq_n_s16(v944, 13573); + int16x8_t v945 = vaddq_s16(v945_tmp, v944); + int16x8_t v946 = vld1q_s16(in + in_stride * 165 + i); + int16x8_t v947 = vld1q_s16(in + in_stride * 163 + i); + int16x8_t v948 = vaddq_s16(v946, v947); + int16x8_t v949 = vld1q_s16(in + in_stride * 157 + i); + int16x8_t v950 = vld1q_s16(in + in_stride * 155 + i); + int16x8_t v951 = vaddq_s16(v949, v950); + int16x8_t v952 = vaddq_s16(v948, v951); + int16x8_t v953 = vld1q_s16(in + in_stride * 101 + i); + int16x8_t v954 = vld1q_s16(in + in_stride * 99 + i); + int16x8_t v955 = vaddq_s16(v953, v954); + int16x8_t v956 = vld1q_s16(in + in_stride * 93 + i); + int16x8_t v957 = vld1q_s16(in + in_stride * 91 + i); + int16x8_t v958 = vaddq_s16(v956, v957); + int16x8_t v959 = vaddq_s16(v955, v958); + int16x8_t v960 = vaddq_s16(v952, v959); + int16x8_t v961 = vaddq_s16(v945, v960); + int16x8_t v962 = vaddq_s16(v959, v944); + int16x8_t v963_tmp = vqrdmulhq_n_s16(v962, 13573); + int16x8_t v963 = vaddq_s16(v963_tmp, v962); + int16x8_t v964 = vld1q_s16(in + in_stride * 229 + i); + int16x8_t v965 = vld1q_s16(in + in_stride * 227 + i); + int16x8_t v966 = vaddq_s16(v964, v965); + int16x8_t v967 = vld1q_s16(in + in_stride * 221 + i); + int16x8_t v968 = vld1q_s16(in + in_stride * 219 + i); + int16x8_t v969 = vaddq_s16(v967, v968); + int16x8_t v970 = vaddq_s16(v966, v969); + int16x8_t v971 = vaddq_s16(v970, v952); + int16x8_t v972 = vaddq_s16(v971, v962); + int16x8_t v973 = vaddq_s16(v963, v972); + int16x8_t v974 = vqrdmulhq_n_s16(v973, 17734); + int16x8_t v975 = vaddq_s16(v961, v974); + int16x8_t v976 = vqrdmulhq_n_s16(v975, 16705); + int16x8_t v977 = vaddq_s16(v937, v976); + int16x8_t v978 = vld1q_s16(in + in_stride * 21 + i); + int16x8_t v979 = vld1q_s16(in + in_stride * 19 + i); + int16x8_t v980 = vaddq_s16(v978, v979); + int16x8_t v981 = vld1q_s16(in + in_stride * 13 + i); + int16x8_t v982 = vld1q_s16(in + in_stride * 11 + i); + int16x8_t v983 = vaddq_s16(v981, v982); + int16x8_t v984 = vaddq_s16(v980, v983); + int16x8_t v985_tmp = vqrdmulhq_n_s16(v984, 13573); + int16x8_t v985 = vaddq_s16(v985_tmp, v984); + int16x8_t v986 = vld1q_s16(in + in_stride * 149 + i); + int16x8_t v987 = vld1q_s16(in + in_stride * 147 + i); + int16x8_t v988 = vaddq_s16(v986, v987); + int16x8_t v989 = vld1q_s16(in + in_stride * 141 + i); + int16x8_t v990 = vld1q_s16(in + in_stride * 139 + i); + int16x8_t v991 = vaddq_s16(v989, v990); + int16x8_t v992 = vaddq_s16(v988, v991); + int16x8_t v993 = vld1q_s16(in + in_stride * 117 + i); + int16x8_t v994 = vld1q_s16(in + in_stride * 115 + i); + int16x8_t v995 = vaddq_s16(v993, v994); + int16x8_t v996 = vld1q_s16(in + in_stride * 109 + i); + int16x8_t v997 = vld1q_s16(in + in_stride * 107 + i); + int16x8_t v998 = vaddq_s16(v996, v997); + int16x8_t v999 = vaddq_s16(v995, v998); + int16x8_t v1000 = vaddq_s16(v992, v999); + int16x8_t v1001 = vaddq_s16(v985, v1000); + int16x8_t v1002 = vld1q_s16(in + in_stride * 85 + i); + int16x8_t v1003 = vld1q_s16(in + in_stride * 83 + i); + int16x8_t v1004 = vaddq_s16(v1002, v1003); + int16x8_t v1005 = vld1q_s16(in + in_stride * 77 + i); + int16x8_t v1006 = vld1q_s16(in + in_stride * 75 + i); + int16x8_t v1007 = vaddq_s16(v1005, v1006); + int16x8_t v1008 = vaddq_s16(v1004, v1007); + int16x8_t v1009 = vld1q_s16(in + in_stride * 53 + i); + int16x8_t v1010 = vld1q_s16(in + in_stride * 51 + i); + int16x8_t v1011 = vaddq_s16(v1009, v1010); + int16x8_t v1012 = vld1q_s16(in + in_stride * 45 + i); + int16x8_t v1013 = vld1q_s16(in + in_stride * 43 + i); + int16x8_t v1014 = vaddq_s16(v1012, v1013); + int16x8_t v1015 = vaddq_s16(v1011, v1014); + int16x8_t v1016 = vaddq_s16(v1008, v1015); + int16x8_t v1017_tmp = vqrdmulhq_n_s16(v1016, 13573); + int16x8_t v1017 = vaddq_s16(v1017_tmp, v1016); + int16x8_t v1018 = vld1q_s16(in + in_stride * 213 + i); + int16x8_t v1019 = vld1q_s16(in + in_stride * 211 + i); + int16x8_t v1020 = vaddq_s16(v1018, v1019); + int16x8_t v1021 = vld1q_s16(in + in_stride * 205 + i); + int16x8_t v1022 = vld1q_s16(in + in_stride * 203 + i); + int16x8_t v1023 = vaddq_s16(v1021, v1022); + int16x8_t v1024 = vaddq_s16(v1020, v1023); + int16x8_t v1025 = vld1q_s16(in + in_stride * 181 + i); + int16x8_t v1026 = vld1q_s16(in + in_stride * 179 + i); + int16x8_t v1027 = vaddq_s16(v1025, v1026); + int16x8_t v1028 = vld1q_s16(in + in_stride * 173 + i); + int16x8_t v1029 = vld1q_s16(in + in_stride * 171 + i); + int16x8_t v1030 = vaddq_s16(v1028, v1029); + int16x8_t v1031 = vaddq_s16(v1027, v1030); + int16x8_t v1032 = vaddq_s16(v1024, v1031); + int16x8_t v1033 = vaddq_s16(v1032, v1016); + int16x8_t v1034 = vaddq_s16(v1017, v1033); + int16x8_t v1035 = vqrdmulhq_n_s16(v1034, 17734); + int16x8_t v1036 = vaddq_s16(v1001, v1035); + int16x8_t v1037 = vaddq_s16(v1015, v984); + int16x8_t v1038_tmp = vqrdmulhq_n_s16(v1037, 13573); + int16x8_t v1038 = vaddq_s16(v1038_tmp, v1037); + int16x8_t v1039 = vaddq_s16(v1031, v992); + int16x8_t v1040 = vaddq_s16(v999, v1008); + int16x8_t v1041 = vaddq_s16(v1039, v1040); + int16x8_t v1042 = vaddq_s16(v1038, v1041); + int16x8_t v1043 = vaddq_s16(v1040, v1037); + int16x8_t v1044_tmp = vqrdmulhq_n_s16(v1043, 13573); + int16x8_t v1044 = vaddq_s16(v1044_tmp, v1043); + int16x8_t v1045 = vld1q_s16(in + in_stride * 245 + i); + int16x8_t v1046 = vld1q_s16(in + in_stride * 243 + i); + int16x8_t v1047 = vaddq_s16(v1045, v1046); + int16x8_t v1048 = vld1q_s16(in + in_stride * 237 + i); + int16x8_t v1049 = vld1q_s16(in + in_stride * 235 + i); + int16x8_t v1050 = vaddq_s16(v1048, v1049); + int16x8_t v1051 = vaddq_s16(v1047, v1050); + int16x8_t v1052 = vaddq_s16(v1051, v1024); + int16x8_t v1053 = vaddq_s16(v1052, v1039); + int16x8_t v1054 = vaddq_s16(v1053, v1043); + int16x8_t v1055 = vaddq_s16(v1044, v1054); + int16x8_t v1056 = vqrdmulhq_n_s16(v1055, 17734); + int16x8_t v1057 = vaddq_s16(v1042, v1056); + int16x8_t v1058 = vqrdmulhq_n_s16(v1057, 16705); + int16x8_t v1059 = vaddq_s16(v1036, v1058); + int16x8_t v1060 = vqrdmulhq_n_s16(v1059, 16463); + int16x8_t v1061 = vaddq_s16(v977, v1060); + int16x8_t v1062 = vaddq_s16(v983, v909); + int16x8_t v1063_tmp = vqrdmulhq_n_s16(v1062, 13573); + int16x8_t v1063 = vaddq_s16(v1063_tmp, v1062); + int16x8_t v1064 = vaddq_s16(v991, v913); + int16x8_t v1065 = vaddq_s16(v916, v995); + int16x8_t v1066 = vaddq_s16(v1064, v1065); + int16x8_t v1067 = vaddq_s16(v1063, v1066); + int16x8_t v1068 = vaddq_s16(v1007, v921); + int16x8_t v1069 = vaddq_s16(v924, v1011); + int16x8_t v1070 = vaddq_s16(v1068, v1069); + int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 13573); + int16x8_t v1071 = vaddq_s16(v1071_tmp, v1070); + int16x8_t v1072 = vaddq_s16(v1023, v929); + int16x8_t v1073 = vaddq_s16(v932, v1027); + int16x8_t v1074 = vaddq_s16(v1072, v1073); + int16x8_t v1075 = vaddq_s16(v1074, v1070); + int16x8_t v1076 = vaddq_s16(v1071, v1075); + int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 17734); + int16x8_t v1078 = vaddq_s16(v1067, v1077); + int16x8_t v1079 = vaddq_s16(v1014, v940); + int16x8_t v1080 = vaddq_s16(v943, v980); + int16x8_t v1081 = vaddq_s16(v1079, v1080); + int16x8_t v1082_tmp = vqrdmulhq_n_s16(v1081, 13573); + int16x8_t v1082 = vaddq_s16(v1082_tmp, v1081); + int16x8_t v1083 = vaddq_s16(v1030, v948); + int16x8_t v1084 = vaddq_s16(v951, v988); + int16x8_t v1085 = vaddq_s16(v1083, v1084); + int16x8_t v1086 = vaddq_s16(v998, v955); + int16x8_t v1087 = vaddq_s16(v958, v1004); + int16x8_t v1088 = vaddq_s16(v1086, v1087); + int16x8_t v1089 = vaddq_s16(v1085, v1088); + int16x8_t v1090 = vaddq_s16(v1082, v1089); + int16x8_t v1091 = vaddq_s16(v1088, v1081); + int16x8_t v1092_tmp = vqrdmulhq_n_s16(v1091, 13573); + int16x8_t v1092 = vaddq_s16(v1092_tmp, v1091); + int16x8_t v1093 = vaddq_s16(v1050, v966); + int16x8_t v1094 = vaddq_s16(v969, v1020); + int16x8_t v1095 = vaddq_s16(v1093, v1094); + int16x8_t v1096 = vaddq_s16(v1095, v1085); + int16x8_t v1097 = vaddq_s16(v1096, v1091); + int16x8_t v1098 = vaddq_s16(v1092, v1097); + int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 17734); + int16x8_t v1100 = vaddq_s16(v1090, v1099); + int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16705); + int16x8_t v1102 = vaddq_s16(v1078, v1101); + int16x8_t v1103 = vaddq_s16(v1080, v1062); + int16x8_t v1104_tmp = vqrdmulhq_n_s16(v1103, 13573); + int16x8_t v1104 = vaddq_s16(v1104_tmp, v1103); + int16x8_t v1105 = vaddq_s16(v1084, v1064); + int16x8_t v1106 = vaddq_s16(v1065, v1086); + int16x8_t v1107 = vaddq_s16(v1105, v1106); + int16x8_t v1108 = vaddq_s16(v1104, v1107); + int16x8_t v1109 = vaddq_s16(v1087, v1068); + int16x8_t v1110 = vaddq_s16(v1069, v1079); + int16x8_t v1111 = vaddq_s16(v1109, v1110); + int16x8_t v1112_tmp = vqrdmulhq_n_s16(v1111, 13573); + int16x8_t v1112 = vaddq_s16(v1112_tmp, v1111); + int16x8_t v1113 = vaddq_s16(v1094, v1072); + int16x8_t v1114 = vaddq_s16(v1073, v1083); + int16x8_t v1115 = vaddq_s16(v1113, v1114); + int16x8_t v1116 = vaddq_s16(v1115, v1111); + int16x8_t v1117 = vaddq_s16(v1112, v1116); + int16x8_t v1118 = vqrdmulhq_n_s16(v1117, 17734); + int16x8_t v1119 = vaddq_s16(v1108, v1118); + int16x8_t v1120 = vaddq_s16(v1110, v1103); + int16x8_t v1121_tmp = vqrdmulhq_n_s16(v1120, 13573); + int16x8_t v1121 = vaddq_s16(v1121_tmp, v1120); + int16x8_t v1122 = vaddq_s16(v1114, v1105); + int16x8_t v1123 = vaddq_s16(v1106, v1109); + int16x8_t v1124 = vaddq_s16(v1122, v1123); + int16x8_t v1125 = vaddq_s16(v1121, v1124); + int16x8_t v1126 = vaddq_s16(v1123, v1120); + int16x8_t v1127_tmp = vqrdmulhq_n_s16(v1126, 13573); + int16x8_t v1127 = vaddq_s16(v1127_tmp, v1126); + int16x8_t v1128 = vld1q_s16(in + in_stride * 253 + i); + int16x8_t v1129 = vld1q_s16(in + in_stride * 251 + i); + int16x8_t v1130 = vaddq_s16(v1128, v1129); + int16x8_t v1131 = vaddq_s16(v1130, v1047); + int16x8_t v1132 = vaddq_s16(v1131, v1093); + int16x8_t v1133 = vaddq_s16(v1132, v1113); + int16x8_t v1134 = vaddq_s16(v1133, v1122); + int16x8_t v1135 = vaddq_s16(v1134, v1126); + int16x8_t v1136 = vaddq_s16(v1127, v1135); + int16x8_t v1137 = vqrdmulhq_n_s16(v1136, 17734); + int16x8_t v1138 = vaddq_s16(v1125, v1137); + int16x8_t v1139 = vqrdmulhq_n_s16(v1138, 16705); + int16x8_t v1140 = vaddq_s16(v1119, v1139); + int16x8_t v1141 = vqrdmulhq_n_s16(v1140, 16463); + int16x8_t v1142 = vaddq_s16(v1102, v1141); + int16x8_t v1143 = vqrdmulhq_n_s16(v1142, 16404); + int16x8_t v1144 = vaddq_s16(v1061, v1143); + int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 16389); + int16x8_t v1146 = vaddq_s16(v906, v1145); + int16x8_t v1147 = vaddq_s16(v908, v702); + int16x8_t v1148_tmp = vqrdmulhq_n_s16(v1147, 13573); + int16x8_t v1148 = vaddq_s16(v1148_tmp, v1147); + int16x8_t v1149 = vaddq_s16(v912, v704); + int16x8_t v1150 = vaddq_s16(v705, v914); + int16x8_t v1151 = vaddq_s16(v1149, v1150); + int16x8_t v1152 = vaddq_s16(v1148, v1151); + int16x8_t v1153 = vaddq_s16(v920, v708); + int16x8_t v1154 = vaddq_s16(v709, v922); + int16x8_t v1155 = vaddq_s16(v1153, v1154); + int16x8_t v1156_tmp = vqrdmulhq_n_s16(v1155, 13573); + int16x8_t v1156 = vaddq_s16(v1156_tmp, v1155); + int16x8_t v1157 = vaddq_s16(v928, v712); + int16x8_t v1158 = vaddq_s16(v713, v930); + int16x8_t v1159 = vaddq_s16(v1157, v1158); + int16x8_t v1160 = vaddq_s16(v1159, v1155); + int16x8_t v1161 = vaddq_s16(v1156, v1160); + int16x8_t v1162 = vqrdmulhq_n_s16(v1161, 17734); + int16x8_t v1163 = vaddq_s16(v1152, v1162); + int16x8_t v1164 = vaddq_s16(v939, v719); + int16x8_t v1165 = vaddq_s16(v720, v941); + int16x8_t v1166 = vaddq_s16(v1164, v1165); + int16x8_t v1167_tmp = vqrdmulhq_n_s16(v1166, 13573); + int16x8_t v1167 = vaddq_s16(v1167_tmp, v1166); + int16x8_t v1168 = vaddq_s16(v947, v723); + int16x8_t v1169 = vaddq_s16(v724, v949); + int16x8_t v1170 = vaddq_s16(v1168, v1169); + int16x8_t v1171 = vaddq_s16(v954, v726); + int16x8_t v1172 = vaddq_s16(v727, v956); + int16x8_t v1173 = vaddq_s16(v1171, v1172); + int16x8_t v1174 = vaddq_s16(v1170, v1173); + int16x8_t v1175 = vaddq_s16(v1167, v1174); + int16x8_t v1176 = vaddq_s16(v1173, v1166); + int16x8_t v1177_tmp = vqrdmulhq_n_s16(v1176, 13573); + int16x8_t v1177 = vaddq_s16(v1177_tmp, v1176); + int16x8_t v1178 = vaddq_s16(v965, v733); + int16x8_t v1179 = vaddq_s16(v734, v967); + int16x8_t v1180 = vaddq_s16(v1178, v1179); + int16x8_t v1181 = vaddq_s16(v1180, v1170); + int16x8_t v1182 = vaddq_s16(v1181, v1176); + int16x8_t v1183 = vaddq_s16(v1177, v1182); + int16x8_t v1184 = vqrdmulhq_n_s16(v1183, 17734); + int16x8_t v1185 = vaddq_s16(v1175, v1184); + int16x8_t v1186 = vqrdmulhq_n_s16(v1185, 16705); + int16x8_t v1187 = vaddq_s16(v1163, v1186); + int16x8_t v1188 = vaddq_s16(v979, v743); + int16x8_t v1189 = vaddq_s16(v744, v981); + int16x8_t v1190 = vaddq_s16(v1188, v1189); + int16x8_t v1191_tmp = vqrdmulhq_n_s16(v1190, 13573); + int16x8_t v1191 = vaddq_s16(v1191_tmp, v1190); + int16x8_t v1192 = vaddq_s16(v987, v747); + int16x8_t v1193 = vaddq_s16(v748, v989); + int16x8_t v1194 = vaddq_s16(v1192, v1193); + int16x8_t v1195 = vaddq_s16(v994, v750); + int16x8_t v1196 = vaddq_s16(v751, v996); + int16x8_t v1197 = vaddq_s16(v1195, v1196); + int16x8_t v1198 = vaddq_s16(v1194, v1197); + int16x8_t v1199 = vaddq_s16(v1191, v1198); + int16x8_t v1200 = vaddq_s16(v1003, v755); + int16x8_t v1201 = vaddq_s16(v756, v1005); + int16x8_t v1202 = vaddq_s16(v1200, v1201); + int16x8_t v1203 = vaddq_s16(v1010, v758); + int16x8_t v1204 = vaddq_s16(v759, v1012); + int16x8_t v1205 = vaddq_s16(v1203, v1204); + int16x8_t v1206 = vaddq_s16(v1202, v1205); + int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 13573); + int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206); + int16x8_t v1208 = vaddq_s16(v1019, v763); + int16x8_t v1209 = vaddq_s16(v764, v1021); + int16x8_t v1210 = vaddq_s16(v1208, v1209); + int16x8_t v1211 = vaddq_s16(v1026, v766); + int16x8_t v1212 = vaddq_s16(v767, v1028); + int16x8_t v1213 = vaddq_s16(v1211, v1212); + int16x8_t v1214 = vaddq_s16(v1210, v1213); + int16x8_t v1215 = vaddq_s16(v1214, v1206); + int16x8_t v1216 = vaddq_s16(v1207, v1215); + int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 17734); + int16x8_t v1218 = vaddq_s16(v1199, v1217); + int16x8_t v1219 = vaddq_s16(v1205, v1190); + int16x8_t v1220_tmp = vqrdmulhq_n_s16(v1219, 13573); + int16x8_t v1220 = vaddq_s16(v1220_tmp, v1219); + int16x8_t v1221 = vaddq_s16(v1213, v1194); + int16x8_t v1222 = vaddq_s16(v1197, v1202); + int16x8_t v1223 = vaddq_s16(v1221, v1222); + int16x8_t v1224 = vaddq_s16(v1220, v1223); + int16x8_t v1225 = vaddq_s16(v1222, v1219); + int16x8_t v1226_tmp = vqrdmulhq_n_s16(v1225, 13573); + int16x8_t v1226 = vaddq_s16(v1226_tmp, v1225); + int16x8_t v1227 = vaddq_s16(v1046, v782); + int16x8_t v1228 = vaddq_s16(v783, v1048); + int16x8_t v1229 = vaddq_s16(v1227, v1228); + int16x8_t v1230 = vaddq_s16(v1229, v1210); + int16x8_t v1231 = vaddq_s16(v1230, v1221); + int16x8_t v1232 = vaddq_s16(v1231, v1225); + int16x8_t v1233 = vaddq_s16(v1226, v1232); + int16x8_t v1234 = vqrdmulhq_n_s16(v1233, 17734); + int16x8_t v1235 = vaddq_s16(v1224, v1234); + int16x8_t v1236 = vqrdmulhq_n_s16(v1235, 16705); + int16x8_t v1237 = vaddq_s16(v1218, v1236); + int16x8_t v1238 = vqrdmulhq_n_s16(v1237, 16463); + int16x8_t v1239 = vaddq_s16(v1187, v1238); + int16x8_t v1240 = vaddq_s16(v982, v795); + int16x8_t v1241 = vaddq_s16(v796, v907); + int16x8_t v1242 = vaddq_s16(v1240, v1241); + int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 13573); + int16x8_t v1243 = vaddq_s16(v1243_tmp, v1242); + int16x8_t v1244 = vaddq_s16(v990, v799); + int16x8_t v1245 = vaddq_s16(v800, v911); + int16x8_t v1246 = vaddq_s16(v1244, v1245); + int16x8_t v1247 = vaddq_s16(v915, v802); + int16x8_t v1248 = vaddq_s16(v803, v993); + int16x8_t v1249 = vaddq_s16(v1247, v1248); + int16x8_t v1250 = vaddq_s16(v1246, v1249); + int16x8_t v1251 = vaddq_s16(v1243, v1250); + int16x8_t v1252 = vaddq_s16(v1006, v807); + int16x8_t v1253 = vaddq_s16(v808, v919); + int16x8_t v1254 = vaddq_s16(v1252, v1253); + int16x8_t v1255 = vaddq_s16(v923, v810); + int16x8_t v1256 = vaddq_s16(v811, v1009); + int16x8_t v1257 = vaddq_s16(v1255, v1256); + int16x8_t v1258 = vaddq_s16(v1254, v1257); + int16x8_t v1259_tmp = vqrdmulhq_n_s16(v1258, 13573); + int16x8_t v1259 = vaddq_s16(v1259_tmp, v1258); + int16x8_t v1260 = vaddq_s16(v1022, v815); + int16x8_t v1261 = vaddq_s16(v816, v927); + int16x8_t v1262 = vaddq_s16(v1260, v1261); + int16x8_t v1263 = vaddq_s16(v931, v818); + int16x8_t v1264 = vaddq_s16(v819, v1025); + int16x8_t v1265 = vaddq_s16(v1263, v1264); + int16x8_t v1266 = vaddq_s16(v1262, v1265); + int16x8_t v1267 = vaddq_s16(v1266, v1258); + int16x8_t v1268 = vaddq_s16(v1259, v1267); + int16x8_t v1269 = vqrdmulhq_n_s16(v1268, 17734); + int16x8_t v1270 = vaddq_s16(v1251, v1269); + int16x8_t v1271 = vaddq_s16(v1013, v826); + int16x8_t v1272 = vaddq_s16(v827, v938); + int16x8_t v1273 = vaddq_s16(v1271, v1272); + int16x8_t v1274 = vaddq_s16(v942, v829); + int16x8_t v1275 = vaddq_s16(v830, v978); + int16x8_t v1276 = vaddq_s16(v1274, v1275); + int16x8_t v1277 = vaddq_s16(v1273, v1276); + int16x8_t v1278_tmp = vqrdmulhq_n_s16(v1277, 13573); + int16x8_t v1278 = vaddq_s16(v1278_tmp, v1277); + int16x8_t v1279 = vaddq_s16(v1029, v834); + int16x8_t v1280 = vaddq_s16(v835, v946); + int16x8_t v1281 = vaddq_s16(v1279, v1280); + int16x8_t v1282 = vaddq_s16(v950, v837); + int16x8_t v1283 = vaddq_s16(v838, v986); + int16x8_t v1284 = vaddq_s16(v1282, v1283); + int16x8_t v1285 = vaddq_s16(v1281, v1284); + int16x8_t v1286 = vaddq_s16(v997, v841); + int16x8_t v1287 = vaddq_s16(v842, v953); + int16x8_t v1288 = vaddq_s16(v1286, v1287); + int16x8_t v1289 = vaddq_s16(v957, v844); + int16x8_t v1290 = vaddq_s16(v845, v1002); + int16x8_t v1291 = vaddq_s16(v1289, v1290); + int16x8_t v1292 = vaddq_s16(v1288, v1291); + int16x8_t v1293 = vaddq_s16(v1285, v1292); + int16x8_t v1294 = vaddq_s16(v1278, v1293); + int16x8_t v1295 = vaddq_s16(v1292, v1277); + int16x8_t v1296_tmp = vqrdmulhq_n_s16(v1295, 13573); + int16x8_t v1296 = vaddq_s16(v1296_tmp, v1295); + int16x8_t v1297 = vaddq_s16(v1049, v852); + int16x8_t v1298 = vaddq_s16(v853, v964); + int16x8_t v1299 = vaddq_s16(v1297, v1298); + int16x8_t v1300 = vaddq_s16(v968, v855); + int16x8_t v1301 = vaddq_s16(v856, v1018); + int16x8_t v1302 = vaddq_s16(v1300, v1301); + int16x8_t v1303 = vaddq_s16(v1299, v1302); + int16x8_t v1304 = vaddq_s16(v1303, v1285); + int16x8_t v1305 = vaddq_s16(v1304, v1295); + int16x8_t v1306 = vaddq_s16(v1296, v1305); + int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 17734); + int16x8_t v1308 = vaddq_s16(v1294, v1307); + int16x8_t v1309 = vqrdmulhq_n_s16(v1308, 16705); + int16x8_t v1310 = vaddq_s16(v1270, v1309); + int16x8_t v1311 = vaddq_s16(v1276, v1242); + int16x8_t v1312_tmp = vqrdmulhq_n_s16(v1311, 13573); + int16x8_t v1312 = vaddq_s16(v1312_tmp, v1311); + int16x8_t v1313 = vaddq_s16(v1284, v1246); + int16x8_t v1314 = vaddq_s16(v1249, v1288); + int16x8_t v1315 = vaddq_s16(v1313, v1314); + int16x8_t v1316 = vaddq_s16(v1312, v1315); + int16x8_t v1317 = vaddq_s16(v1291, v1254); + int16x8_t v1318 = vaddq_s16(v1257, v1273); + int16x8_t v1319 = vaddq_s16(v1317, v1318); + int16x8_t v1320_tmp = vqrdmulhq_n_s16(v1319, 13573); + int16x8_t v1320 = vaddq_s16(v1320_tmp, v1319); + int16x8_t v1321 = vaddq_s16(v1302, v1262); + int16x8_t v1322 = vaddq_s16(v1265, v1281); + int16x8_t v1323 = vaddq_s16(v1321, v1322); + int16x8_t v1324 = vaddq_s16(v1323, v1319); + int16x8_t v1325 = vaddq_s16(v1320, v1324); + int16x8_t v1326 = vqrdmulhq_n_s16(v1325, 17734); + int16x8_t v1327 = vaddq_s16(v1316, v1326); + int16x8_t v1328 = vaddq_s16(v1318, v1311); + int16x8_t v1329_tmp = vqrdmulhq_n_s16(v1328, 13573); + int16x8_t v1329 = vaddq_s16(v1329_tmp, v1328); + int16x8_t v1330 = vaddq_s16(v1322, v1313); + int16x8_t v1331 = vaddq_s16(v1314, v1317); + int16x8_t v1332 = vaddq_s16(v1330, v1331); + int16x8_t v1333 = vaddq_s16(v1329, v1332); + int16x8_t v1334 = vaddq_s16(v1331, v1328); + int16x8_t v1335_tmp = vqrdmulhq_n_s16(v1334, 13573); + int16x8_t v1335 = vaddq_s16(v1335_tmp, v1334); + int16x8_t v1336 = vaddq_s16(v1129, v891); + int16x8_t v1337 = vaddq_s16(v892, v1045); + int16x8_t v1338 = vaddq_s16(v1336, v1337); + int16x8_t v1339 = vaddq_s16(v1338, v1299); + int16x8_t v1340 = vaddq_s16(v1339, v1321); + int16x8_t v1341 = vaddq_s16(v1340, v1330); + int16x8_t v1342 = vaddq_s16(v1341, v1334); + int16x8_t v1343 = vaddq_s16(v1335, v1342); + int16x8_t v1344 = vqrdmulhq_n_s16(v1343, 17734); + int16x8_t v1345 = vaddq_s16(v1333, v1344); + int16x8_t v1346 = vqrdmulhq_n_s16(v1345, 16705); + int16x8_t v1347 = vaddq_s16(v1327, v1346); + int16x8_t v1348 = vqrdmulhq_n_s16(v1347, 16463); + int16x8_t v1349 = vaddq_s16(v1310, v1348); + int16x8_t v1350 = vqrdmulhq_n_s16(v1349, 16404); + int16x8_t v1351 = vaddq_s16(v1239, v1350); + int16x8_t v1352 = vaddq_s16(v1241, v1147); + int16x8_t v1353_tmp = vqrdmulhq_n_s16(v1352, 13573); + int16x8_t v1353 = vaddq_s16(v1353_tmp, v1352); + int16x8_t v1354 = vaddq_s16(v1245, v1149); + int16x8_t v1355 = vaddq_s16(v1150, v1247); + int16x8_t v1356 = vaddq_s16(v1354, v1355); + int16x8_t v1357 = vaddq_s16(v1353, v1356); + int16x8_t v1358 = vaddq_s16(v1253, v1153); + int16x8_t v1359 = vaddq_s16(v1154, v1255); + int16x8_t v1360 = vaddq_s16(v1358, v1359); + int16x8_t v1361_tmp = vqrdmulhq_n_s16(v1360, 13573); + int16x8_t v1361 = vaddq_s16(v1361_tmp, v1360); + int16x8_t v1362 = vaddq_s16(v1261, v1157); + int16x8_t v1363 = vaddq_s16(v1158, v1263); + int16x8_t v1364 = vaddq_s16(v1362, v1363); + int16x8_t v1365 = vaddq_s16(v1364, v1360); + int16x8_t v1366 = vaddq_s16(v1361, v1365); + int16x8_t v1367 = vqrdmulhq_n_s16(v1366, 17734); + int16x8_t v1368 = vaddq_s16(v1357, v1367); + int16x8_t v1369 = vaddq_s16(v1272, v1164); + int16x8_t v1370 = vaddq_s16(v1165, v1274); + int16x8_t v1371 = vaddq_s16(v1369, v1370); + int16x8_t v1372_tmp = vqrdmulhq_n_s16(v1371, 13573); + int16x8_t v1372 = vaddq_s16(v1372_tmp, v1371); + int16x8_t v1373 = vaddq_s16(v1280, v1168); + int16x8_t v1374 = vaddq_s16(v1169, v1282); + int16x8_t v1375 = vaddq_s16(v1373, v1374); + int16x8_t v1376 = vaddq_s16(v1287, v1171); + int16x8_t v1377 = vaddq_s16(v1172, v1289); + int16x8_t v1378 = vaddq_s16(v1376, v1377); + int16x8_t v1379 = vaddq_s16(v1375, v1378); + int16x8_t v1380 = vaddq_s16(v1372, v1379); + int16x8_t v1381 = vaddq_s16(v1378, v1371); + int16x8_t v1382_tmp = vqrdmulhq_n_s16(v1381, 13573); + int16x8_t v1382 = vaddq_s16(v1382_tmp, v1381); + int16x8_t v1383 = vaddq_s16(v1298, v1178); + int16x8_t v1384 = vaddq_s16(v1179, v1300); + int16x8_t v1385 = vaddq_s16(v1383, v1384); + int16x8_t v1386 = vaddq_s16(v1385, v1375); + int16x8_t v1387 = vaddq_s16(v1386, v1381); + int16x8_t v1388 = vaddq_s16(v1382, v1387); + int16x8_t v1389 = vqrdmulhq_n_s16(v1388, 17734); + int16x8_t v1390 = vaddq_s16(v1380, v1389); + int16x8_t v1391 = vqrdmulhq_n_s16(v1390, 16705); + int16x8_t v1392 = vaddq_s16(v1368, v1391); + int16x8_t v1393 = vaddq_s16(v1275, v1188); + int16x8_t v1394 = vaddq_s16(v1189, v1240); + int16x8_t v1395 = vaddq_s16(v1393, v1394); + int16x8_t v1396_tmp = vqrdmulhq_n_s16(v1395, 13573); + int16x8_t v1396 = vaddq_s16(v1396_tmp, v1395); + int16x8_t v1397 = vaddq_s16(v1283, v1192); + int16x8_t v1398 = vaddq_s16(v1193, v1244); + int16x8_t v1399 = vaddq_s16(v1397, v1398); + int16x8_t v1400 = vaddq_s16(v1248, v1195); + int16x8_t v1401 = vaddq_s16(v1196, v1286); + int16x8_t v1402 = vaddq_s16(v1400, v1401); + int16x8_t v1403 = vaddq_s16(v1399, v1402); + int16x8_t v1404 = vaddq_s16(v1396, v1403); + int16x8_t v1405 = vaddq_s16(v1290, v1200); + int16x8_t v1406 = vaddq_s16(v1201, v1252); + int16x8_t v1407 = vaddq_s16(v1405, v1406); + int16x8_t v1408 = vaddq_s16(v1256, v1203); + int16x8_t v1409 = vaddq_s16(v1204, v1271); + int16x8_t v1410 = vaddq_s16(v1408, v1409); + int16x8_t v1411 = vaddq_s16(v1407, v1410); + int16x8_t v1412_tmp = vqrdmulhq_n_s16(v1411, 13573); + int16x8_t v1412 = vaddq_s16(v1412_tmp, v1411); + int16x8_t v1413 = vaddq_s16(v1301, v1208); + int16x8_t v1414 = vaddq_s16(v1209, v1260); + int16x8_t v1415 = vaddq_s16(v1413, v1414); + int16x8_t v1416 = vaddq_s16(v1264, v1211); + int16x8_t v1417 = vaddq_s16(v1212, v1279); + int16x8_t v1418 = vaddq_s16(v1416, v1417); + int16x8_t v1419 = vaddq_s16(v1415, v1418); + int16x8_t v1420 = vaddq_s16(v1419, v1411); + int16x8_t v1421 = vaddq_s16(v1412, v1420); + int16x8_t v1422 = vqrdmulhq_n_s16(v1421, 17734); + int16x8_t v1423 = vaddq_s16(v1404, v1422); + int16x8_t v1424 = vaddq_s16(v1410, v1395); + int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 13573); + int16x8_t v1425 = vaddq_s16(v1425_tmp, v1424); + int16x8_t v1426 = vaddq_s16(v1418, v1399); + int16x8_t v1427 = vaddq_s16(v1402, v1407); + int16x8_t v1428 = vaddq_s16(v1426, v1427); + int16x8_t v1429 = vaddq_s16(v1425, v1428); + int16x8_t v1430 = vaddq_s16(v1427, v1424); + int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 13573); + int16x8_t v1431 = vaddq_s16(v1431_tmp, v1430); + int16x8_t v1432 = vaddq_s16(v1337, v1227); + int16x8_t v1433 = vaddq_s16(v1228, v1297); + int16x8_t v1434 = vaddq_s16(v1432, v1433); + int16x8_t v1435 = vaddq_s16(v1434, v1415); + int16x8_t v1436 = vaddq_s16(v1435, v1426); + int16x8_t v1437 = vaddq_s16(v1436, v1430); + int16x8_t v1438 = vaddq_s16(v1431, v1437); + int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17734); + int16x8_t v1440 = vaddq_s16(v1429, v1439); + int16x8_t v1441 = vqrdmulhq_n_s16(v1440, 16705); + int16x8_t v1442 = vaddq_s16(v1423, v1441); + int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 16463); + int16x8_t v1444 = vaddq_s16(v1392, v1443); + int16x8_t v1445 = vaddq_s16(v1394, v1352); + int16x8_t v1446_tmp = vqrdmulhq_n_s16(v1445, 13573); + int16x8_t v1446 = vaddq_s16(v1446_tmp, v1445); + int16x8_t v1447 = vaddq_s16(v1398, v1354); + int16x8_t v1448 = vaddq_s16(v1355, v1400); + int16x8_t v1449 = vaddq_s16(v1447, v1448); + int16x8_t v1450 = vaddq_s16(v1446, v1449); + int16x8_t v1451 = vaddq_s16(v1406, v1358); + int16x8_t v1452 = vaddq_s16(v1359, v1408); + int16x8_t v1453 = vaddq_s16(v1451, v1452); + int16x8_t v1454_tmp = vqrdmulhq_n_s16(v1453, 13573); + int16x8_t v1454 = vaddq_s16(v1454_tmp, v1453); + int16x8_t v1455 = vaddq_s16(v1414, v1362); + int16x8_t v1456 = vaddq_s16(v1363, v1416); + int16x8_t v1457 = vaddq_s16(v1455, v1456); + int16x8_t v1458 = vaddq_s16(v1457, v1453); + int16x8_t v1459 = vaddq_s16(v1454, v1458); + int16x8_t v1460 = vqrdmulhq_n_s16(v1459, 17734); + int16x8_t v1461 = vaddq_s16(v1450, v1460); + int16x8_t v1462 = vaddq_s16(v1409, v1369); + int16x8_t v1463 = vaddq_s16(v1370, v1393); + int16x8_t v1464 = vaddq_s16(v1462, v1463); + int16x8_t v1465_tmp = vqrdmulhq_n_s16(v1464, 13573); + int16x8_t v1465 = vaddq_s16(v1465_tmp, v1464); + int16x8_t v1466 = vaddq_s16(v1417, v1373); + int16x8_t v1467 = vaddq_s16(v1374, v1397); + int16x8_t v1468 = vaddq_s16(v1466, v1467); + int16x8_t v1469 = vaddq_s16(v1401, v1376); + int16x8_t v1470 = vaddq_s16(v1377, v1405); + int16x8_t v1471 = vaddq_s16(v1469, v1470); + int16x8_t v1472 = vaddq_s16(v1468, v1471); + int16x8_t v1473 = vaddq_s16(v1465, v1472); + int16x8_t v1474 = vaddq_s16(v1471, v1464); + int16x8_t v1475_tmp = vqrdmulhq_n_s16(v1474, 13573); + int16x8_t v1475 = vaddq_s16(v1475_tmp, v1474); + int16x8_t v1476 = vaddq_s16(v1433, v1383); + int16x8_t v1477 = vaddq_s16(v1384, v1413); + int16x8_t v1478 = vaddq_s16(v1476, v1477); + int16x8_t v1479 = vaddq_s16(v1478, v1468); + int16x8_t v1480 = vaddq_s16(v1479, v1474); + int16x8_t v1481 = vaddq_s16(v1475, v1480); + int16x8_t v1482 = vqrdmulhq_n_s16(v1481, 17734); + int16x8_t v1483 = vaddq_s16(v1473, v1482); + int16x8_t v1484 = vqrdmulhq_n_s16(v1483, 16705); + int16x8_t v1485 = vaddq_s16(v1461, v1484); + int16x8_t v1486 = vaddq_s16(v1463, v1445); + int16x8_t v1487_tmp = vqrdmulhq_n_s16(v1486, 13573); + int16x8_t v1487 = vaddq_s16(v1487_tmp, v1486); + int16x8_t v1488 = vaddq_s16(v1467, v1447); + int16x8_t v1489 = vaddq_s16(v1448, v1469); + int16x8_t v1490 = vaddq_s16(v1488, v1489); + int16x8_t v1491 = vaddq_s16(v1487, v1490); + int16x8_t v1492 = vaddq_s16(v1470, v1451); + int16x8_t v1493 = vaddq_s16(v1452, v1462); + int16x8_t v1494 = vaddq_s16(v1492, v1493); + int16x8_t v1495_tmp = vqrdmulhq_n_s16(v1494, 13573); + int16x8_t v1495 = vaddq_s16(v1495_tmp, v1494); + int16x8_t v1496 = vaddq_s16(v1477, v1455); + int16x8_t v1497 = vaddq_s16(v1456, v1466); + int16x8_t v1498 = vaddq_s16(v1496, v1497); + int16x8_t v1499 = vaddq_s16(v1498, v1494); + int16x8_t v1500 = vaddq_s16(v1495, v1499); + int16x8_t v1501 = vqrdmulhq_n_s16(v1500, 17734); + int16x8_t v1502 = vaddq_s16(v1491, v1501); + int16x8_t v1503 = vaddq_s16(v1493, v1486); + int16x8_t v1504_tmp = vqrdmulhq_n_s16(v1503, 13573); + int16x8_t v1504 = vaddq_s16(v1504_tmp, v1503); + int16x8_t v1505 = vaddq_s16(v1497, v1488); + int16x8_t v1506 = vaddq_s16(v1489, v1492); + int16x8_t v1507 = vaddq_s16(v1505, v1506); + int16x8_t v1508 = vaddq_s16(v1504, v1507); + int16x8_t v1509 = vaddq_s16(v1506, v1503); + int16x8_t v1510_tmp = vqrdmulhq_n_s16(v1509, 13573); + int16x8_t v1510 = vaddq_s16(v1510_tmp, v1509); + int16x8_t v1511 = vld1q_s16(in + in_stride * 255 + i); + int16x8_t v1512 = vaddq_s16(v1511, v1128); + int16x8_t v1513 = vaddq_s16(v1512, v1336); + int16x8_t v1514 = vaddq_s16(v1513, v1432); + int16x8_t v1515 = vaddq_s16(v1514, v1476); + int16x8_t v1516 = vaddq_s16(v1515, v1496); + int16x8_t v1517 = vaddq_s16(v1516, v1505); + int16x8_t v1518 = vaddq_s16(v1517, v1509); + int16x8_t v1519 = vaddq_s16(v1510, v1518); + int16x8_t v1520 = vqrdmulhq_n_s16(v1519, 17734); + int16x8_t v1521 = vaddq_s16(v1508, v1520); + int16x8_t v1522 = vqrdmulhq_n_s16(v1521, 16705); + int16x8_t v1523 = vaddq_s16(v1502, v1522); + int16x8_t v1524 = vqrdmulhq_n_s16(v1523, 16463); + int16x8_t v1525 = vaddq_s16(v1485, v1524); + int16x8_t v1526 = vqrdmulhq_n_s16(v1525, 16404); + int16x8_t v1527 = vaddq_s16(v1444, v1526); + int16x8_t v1528 = vqrdmulhq_n_s16(v1527, 16389); + int16x8_t v1529 = vaddq_s16(v1351, v1528); + int16x8_t v1530 = vqrdmulhq_n_s16(v1529, 16385); + int16x8_t v1531 = vaddq_s16(v1146, v1530); + int16x8_t v1532 = vqrdmulhq_n_s16(v1531, 16384); + int16x8_t v1533 = vaddq_s16(v701, v1532); + int16x8_t v1534 = vsubq_s16(v0, v1); + int16x8_t v1535 = vsubq_s16(v4, v6); + int16x8_t v1536_tmp = vqrdmulhq_n_s16(v1535, 10045); + int16x8_t v1536 = vaddq_s16(v1536_tmp, v1535); + int16x8_t v1537 = vaddq_s16(v1534, v1536); + int16x8_t v1538 = vsubq_s16(v11, v14); + int16x8_t v1539 = vsubq_s16(v17, v20); + int16x8_t v1540_tmp = vqrdmulhq_n_s16(v1539, 10045); + int16x8_t v1540 = vaddq_s16(v1540_tmp, v1539); + int16x8_t v1541 = vaddq_s16(v1538, v1540); + int16x8_t v1542 = vqrdmulhq_n_s16(v1541, 19705); + int16x8_t v1543 = vaddq_s16(v1537, v1542); + int16x8_t v1544 = vsubq_s16(v27, v30); + int16x8_t v1545 = vsubq_s16(v35, v39); + int16x8_t v1546_tmp = vqrdmulhq_n_s16(v1545, 10045); + int16x8_t v1546 = vaddq_s16(v1546_tmp, v1545); + int16x8_t v1547 = vaddq_s16(v1544, v1546); + int16x8_t v1548 = vsubq_s16(v44, v47); + int16x8_t v1549 = vsubq_s16(v50, v54); + int16x8_t v1550_tmp = vqrdmulhq_n_s16(v1549, 10045); + int16x8_t v1550 = vaddq_s16(v1550_tmp, v1549); + int16x8_t v1551 = vaddq_s16(v1548, v1550); + int16x8_t v1552 = vqrdmulhq_n_s16(v1551, 19705); + int16x8_t v1553 = vaddq_s16(v1547, v1552); + int16x8_t v1554 = vqrdmulhq_n_s16(v1553, 17121); + int16x8_t v1555 = vaddq_s16(v1543, v1554); + int16x8_t v1556 = vsubq_s16(v63, v66); + int16x8_t v1557 = vsubq_s16(v71, v75); + int16x8_t v1558_tmp = vqrdmulhq_n_s16(v1557, 10045); + int16x8_t v1558 = vaddq_s16(v1558_tmp, v1557); + int16x8_t v1559 = vaddq_s16(v1556, v1558); + int16x8_t v1560 = vsubq_s16(v82, v89); + int16x8_t v1561 = vsubq_s16(v92, v97); + int16x8_t v1562_tmp = vqrdmulhq_n_s16(v1561, 10045); + int16x8_t v1562 = vaddq_s16(v1562_tmp, v1561); + int16x8_t v1563 = vaddq_s16(v1560, v1562); + int16x8_t v1564 = vqrdmulhq_n_s16(v1563, 19705); + int16x8_t v1565 = vaddq_s16(v1559, v1564); + int16x8_t v1566 = vsubq_s16(v104, v107); + int16x8_t v1567 = vsubq_s16(v112, v116); + int16x8_t v1568_tmp = vqrdmulhq_n_s16(v1567, 10045); + int16x8_t v1568 = vaddq_s16(v1568_tmp, v1567); + int16x8_t v1569 = vaddq_s16(v1566, v1568); + int16x8_t v1570 = vsubq_s16(v121, v124); + int16x8_t v1571 = vsubq_s16(v127, v132); + int16x8_t v1572_tmp = vqrdmulhq_n_s16(v1571, 10045); + int16x8_t v1572 = vaddq_s16(v1572_tmp, v1571); + int16x8_t v1573 = vaddq_s16(v1570, v1572); + int16x8_t v1574 = vqrdmulhq_n_s16(v1573, 19705); + int16x8_t v1575 = vaddq_s16(v1569, v1574); + int16x8_t v1576 = vqrdmulhq_n_s16(v1575, 17121); + int16x8_t v1577 = vaddq_s16(v1565, v1576); + int16x8_t v1578 = vqrdmulhq_n_s16(v1577, 16563); + int16x8_t v1579 = vaddq_s16(v1555, v1578); + int16x8_t v1580 = vsubq_s16(v143, v146); + int16x8_t v1581 = vsubq_s16(v151, v155); + int16x8_t v1582_tmp = vqrdmulhq_n_s16(v1581, 10045); + int16x8_t v1582 = vaddq_s16(v1582_tmp, v1581); + int16x8_t v1583 = vaddq_s16(v1580, v1582); + int16x8_t v1584 = vsubq_s16(v162, v169); + int16x8_t v1585 = vsubq_s16(v172, v177); + int16x8_t v1586_tmp = vqrdmulhq_n_s16(v1585, 10045); + int16x8_t v1586 = vaddq_s16(v1586_tmp, v1585); + int16x8_t v1587 = vaddq_s16(v1584, v1586); + int16x8_t v1588 = vqrdmulhq_n_s16(v1587, 19705); + int16x8_t v1589 = vaddq_s16(v1583, v1588); + int16x8_t v1590 = vsubq_s16(v186, v193); + int16x8_t v1591 = vsubq_s16(v202, v210); + int16x8_t v1592_tmp = vqrdmulhq_n_s16(v1591, 10045); + int16x8_t v1592 = vaddq_s16(v1592_tmp, v1591); + int16x8_t v1593 = vaddq_s16(v1590, v1592); + int16x8_t v1594 = vsubq_s16(v215, v218); + int16x8_t v1595 = vsubq_s16(v221, v227); + int16x8_t v1596_tmp = vqrdmulhq_n_s16(v1595, 10045); + int16x8_t v1596 = vaddq_s16(v1596_tmp, v1595); + int16x8_t v1597 = vaddq_s16(v1594, v1596); + int16x8_t v1598 = vqrdmulhq_n_s16(v1597, 19705); + int16x8_t v1599 = vaddq_s16(v1593, v1598); + int16x8_t v1600 = vqrdmulhq_n_s16(v1599, 17121); + int16x8_t v1601 = vaddq_s16(v1589, v1600); + int16x8_t v1602 = vsubq_s16(v236, v239); + int16x8_t v1603 = vsubq_s16(v244, v248); + int16x8_t v1604_tmp = vqrdmulhq_n_s16(v1603, 10045); + int16x8_t v1604 = vaddq_s16(v1604_tmp, v1603); + int16x8_t v1605 = vaddq_s16(v1602, v1604); + int16x8_t v1606 = vsubq_s16(v255, v262); + int16x8_t v1607 = vsubq_s16(v265, v270); + int16x8_t v1608_tmp = vqrdmulhq_n_s16(v1607, 10045); + int16x8_t v1608 = vaddq_s16(v1608_tmp, v1607); + int16x8_t v1609 = vaddq_s16(v1606, v1608); + int16x8_t v1610 = vqrdmulhq_n_s16(v1609, 19705); + int16x8_t v1611 = vaddq_s16(v1605, v1610); + int16x8_t v1612 = vsubq_s16(v277, v280); + int16x8_t v1613 = vsubq_s16(v285, v289); + int16x8_t v1614_tmp = vqrdmulhq_n_s16(v1613, 10045); + int16x8_t v1614 = vaddq_s16(v1614_tmp, v1613); + int16x8_t v1615 = vaddq_s16(v1612, v1614); + int16x8_t v1616 = vsubq_s16(v294, v297); + int16x8_t v1617 = vsubq_s16(v300, v306); + int16x8_t v1618_tmp = vqrdmulhq_n_s16(v1617, 10045); + int16x8_t v1618 = vaddq_s16(v1618_tmp, v1617); + int16x8_t v1619 = vaddq_s16(v1616, v1618); + int16x8_t v1620 = vqrdmulhq_n_s16(v1619, 19705); + int16x8_t v1621 = vaddq_s16(v1615, v1620); + int16x8_t v1622 = vqrdmulhq_n_s16(v1621, 17121); + int16x8_t v1623 = vaddq_s16(v1611, v1622); + int16x8_t v1624 = vqrdmulhq_n_s16(v1623, 16563); + int16x8_t v1625 = vaddq_s16(v1601, v1624); + int16x8_t v1626 = vqrdmulhq_n_s16(v1625, 16429); + int16x8_t v1627 = vaddq_s16(v1579, v1626); + int16x8_t v1628 = vsubq_s16(v319, v322); + int16x8_t v1629 = vsubq_s16(v327, v331); + int16x8_t v1630_tmp = vqrdmulhq_n_s16(v1629, 10045); + int16x8_t v1630 = vaddq_s16(v1630_tmp, v1629); + int16x8_t v1631 = vaddq_s16(v1628, v1630); + int16x8_t v1632 = vsubq_s16(v338, v345); + int16x8_t v1633 = vsubq_s16(v348, v353); + int16x8_t v1634_tmp = vqrdmulhq_n_s16(v1633, 10045); + int16x8_t v1634 = vaddq_s16(v1634_tmp, v1633); + int16x8_t v1635 = vaddq_s16(v1632, v1634); + int16x8_t v1636 = vqrdmulhq_n_s16(v1635, 19705); + int16x8_t v1637 = vaddq_s16(v1631, v1636); + int16x8_t v1638 = vsubq_s16(v362, v369); + int16x8_t v1639 = vsubq_s16(v378, v386); + int16x8_t v1640_tmp = vqrdmulhq_n_s16(v1639, 10045); + int16x8_t v1640 = vaddq_s16(v1640_tmp, v1639); + int16x8_t v1641 = vaddq_s16(v1638, v1640); + int16x8_t v1642 = vsubq_s16(v391, v394); + int16x8_t v1643 = vsubq_s16(v397, v403); + int16x8_t v1644_tmp = vqrdmulhq_n_s16(v1643, 10045); + int16x8_t v1644 = vaddq_s16(v1644_tmp, v1643); + int16x8_t v1645 = vaddq_s16(v1642, v1644); + int16x8_t v1646 = vqrdmulhq_n_s16(v1645, 19705); + int16x8_t v1647 = vaddq_s16(v1641, v1646); + int16x8_t v1648 = vqrdmulhq_n_s16(v1647, 17121); + int16x8_t v1649 = vaddq_s16(v1637, v1648); + int16x8_t v1650 = vsubq_s16(v414, v421); + int16x8_t v1651 = vsubq_s16(v430, v438); + int16x8_t v1652_tmp = vqrdmulhq_n_s16(v1651, 10045); + int16x8_t v1652 = vaddq_s16(v1652_tmp, v1651); + int16x8_t v1653 = vaddq_s16(v1650, v1652); + int16x8_t v1654 = vsubq_s16(v449, v464); + int16x8_t v1655 = vsubq_s16(v467, v476); + int16x8_t v1656_tmp = vqrdmulhq_n_s16(v1655, 10045); + int16x8_t v1656 = vaddq_s16(v1656_tmp, v1655); + int16x8_t v1657 = vaddq_s16(v1654, v1656); + int16x8_t v1658 = vqrdmulhq_n_s16(v1657, 19705); + int16x8_t v1659 = vaddq_s16(v1653, v1658); + int16x8_t v1660 = vsubq_s16(v483, v486); + int16x8_t v1661 = vsubq_s16(v491, v495); + int16x8_t v1662_tmp = vqrdmulhq_n_s16(v1661, 10045); + int16x8_t v1662 = vaddq_s16(v1662_tmp, v1661); + int16x8_t v1663 = vaddq_s16(v1660, v1662); + int16x8_t v1664 = vsubq_s16(v500, v503); + int16x8_t v1665 = vsubq_s16(v506, v513); + int16x8_t v1666_tmp = vqrdmulhq_n_s16(v1665, 10045); + int16x8_t v1666 = vaddq_s16(v1666_tmp, v1665); + int16x8_t v1667 = vaddq_s16(v1664, v1666); + int16x8_t v1668 = vqrdmulhq_n_s16(v1667, 19705); + int16x8_t v1669 = vaddq_s16(v1663, v1668); + int16x8_t v1670 = vqrdmulhq_n_s16(v1669, 17121); + int16x8_t v1671 = vaddq_s16(v1659, v1670); + int16x8_t v1672 = vqrdmulhq_n_s16(v1671, 16563); + int16x8_t v1673 = vaddq_s16(v1649, v1672); + int16x8_t v1674 = vsubq_s16(v524, v527); + int16x8_t v1675 = vsubq_s16(v532, v536); + int16x8_t v1676_tmp = vqrdmulhq_n_s16(v1675, 10045); + int16x8_t v1676 = vaddq_s16(v1676_tmp, v1675); + int16x8_t v1677 = vaddq_s16(v1674, v1676); + int16x8_t v1678 = vsubq_s16(v543, v550); + int16x8_t v1679 = vsubq_s16(v553, v558); + int16x8_t v1680_tmp = vqrdmulhq_n_s16(v1679, 10045); + int16x8_t v1680 = vaddq_s16(v1680_tmp, v1679); + int16x8_t v1681 = vaddq_s16(v1678, v1680); + int16x8_t v1682 = vqrdmulhq_n_s16(v1681, 19705); + int16x8_t v1683 = vaddq_s16(v1677, v1682); + int16x8_t v1684 = vsubq_s16(v567, v574); + int16x8_t v1685 = vsubq_s16(v583, v591); + int16x8_t v1686_tmp = vqrdmulhq_n_s16(v1685, 10045); + int16x8_t v1686 = vaddq_s16(v1686_tmp, v1685); + int16x8_t v1687 = vaddq_s16(v1684, v1686); + int16x8_t v1688 = vsubq_s16(v596, v599); + int16x8_t v1689 = vsubq_s16(v602, v608); + int16x8_t v1690_tmp = vqrdmulhq_n_s16(v1689, 10045); + int16x8_t v1690 = vaddq_s16(v1690_tmp, v1689); + int16x8_t v1691 = vaddq_s16(v1688, v1690); + int16x8_t v1692 = vqrdmulhq_n_s16(v1691, 19705); + int16x8_t v1693 = vaddq_s16(v1687, v1692); + int16x8_t v1694 = vqrdmulhq_n_s16(v1693, 17121); + int16x8_t v1695 = vaddq_s16(v1683, v1694); + int16x8_t v1696 = vsubq_s16(v617, v620); + int16x8_t v1697 = vsubq_s16(v625, v629); + int16x8_t v1698_tmp = vqrdmulhq_n_s16(v1697, 10045); + int16x8_t v1698 = vaddq_s16(v1698_tmp, v1697); + int16x8_t v1699 = vaddq_s16(v1696, v1698); + int16x8_t v1700 = vsubq_s16(v636, v643); + int16x8_t v1701 = vsubq_s16(v646, v651); + int16x8_t v1702_tmp = vqrdmulhq_n_s16(v1701, 10045); + int16x8_t v1702 = vaddq_s16(v1702_tmp, v1701); + int16x8_t v1703 = vaddq_s16(v1700, v1702); + int16x8_t v1704 = vqrdmulhq_n_s16(v1703, 19705); + int16x8_t v1705 = vaddq_s16(v1699, v1704); + int16x8_t v1706 = vsubq_s16(v658, v661); + int16x8_t v1707 = vsubq_s16(v666, v670); + int16x8_t v1708_tmp = vqrdmulhq_n_s16(v1707, 10045); + int16x8_t v1708 = vaddq_s16(v1708_tmp, v1707); + int16x8_t v1709 = vaddq_s16(v1706, v1708); + int16x8_t v1710 = vsubq_s16(v675, v678); + int16x8_t v1711 = vsubq_s16(v681, v688); + int16x8_t v1712_tmp = vqrdmulhq_n_s16(v1711, 10045); + int16x8_t v1712 = vaddq_s16(v1712_tmp, v1711); + int16x8_t v1713 = vaddq_s16(v1710, v1712); + int16x8_t v1714 = vqrdmulhq_n_s16(v1713, 19705); + int16x8_t v1715 = vaddq_s16(v1709, v1714); + int16x8_t v1716 = vqrdmulhq_n_s16(v1715, 17121); + int16x8_t v1717 = vaddq_s16(v1705, v1716); + int16x8_t v1718 = vqrdmulhq_n_s16(v1717, 16563); + int16x8_t v1719 = vaddq_s16(v1695, v1718); + int16x8_t v1720 = vqrdmulhq_n_s16(v1719, 16429); + int16x8_t v1721 = vaddq_s16(v1673, v1720); + int16x8_t v1722 = vqrdmulhq_n_s16(v1721, 16395); + int16x8_t v1723 = vaddq_s16(v1627, v1722); + int16x8_t v1724 = vsubq_s16(v703, v706); + int16x8_t v1725 = vsubq_s16(v711, v715); + int16x8_t v1726_tmp = vqrdmulhq_n_s16(v1725, 10045); + int16x8_t v1726 = vaddq_s16(v1726_tmp, v1725); + int16x8_t v1727 = vaddq_s16(v1724, v1726); + int16x8_t v1728 = vsubq_s16(v722, v729); + int16x8_t v1729 = vsubq_s16(v732, v737); + int16x8_t v1730_tmp = vqrdmulhq_n_s16(v1729, 10045); + int16x8_t v1730 = vaddq_s16(v1730_tmp, v1729); + int16x8_t v1731 = vaddq_s16(v1728, v1730); + int16x8_t v1732 = vqrdmulhq_n_s16(v1731, 19705); + int16x8_t v1733 = vaddq_s16(v1727, v1732); + int16x8_t v1734 = vsubq_s16(v746, v753); + int16x8_t v1735 = vsubq_s16(v762, v770); + int16x8_t v1736_tmp = vqrdmulhq_n_s16(v1735, 10045); + int16x8_t v1736 = vaddq_s16(v1736_tmp, v1735); + int16x8_t v1737 = vaddq_s16(v1734, v1736); + int16x8_t v1738 = vsubq_s16(v775, v778); + int16x8_t v1739 = vsubq_s16(v781, v787); + int16x8_t v1740_tmp = vqrdmulhq_n_s16(v1739, 10045); + int16x8_t v1740 = vaddq_s16(v1740_tmp, v1739); + int16x8_t v1741 = vaddq_s16(v1738, v1740); + int16x8_t v1742 = vqrdmulhq_n_s16(v1741, 19705); + int16x8_t v1743 = vaddq_s16(v1737, v1742); + int16x8_t v1744 = vqrdmulhq_n_s16(v1743, 17121); + int16x8_t v1745 = vaddq_s16(v1733, v1744); + int16x8_t v1746 = vsubq_s16(v798, v805); + int16x8_t v1747 = vsubq_s16(v814, v822); + int16x8_t v1748_tmp = vqrdmulhq_n_s16(v1747, 10045); + int16x8_t v1748 = vaddq_s16(v1748_tmp, v1747); + int16x8_t v1749 = vaddq_s16(v1746, v1748); + int16x8_t v1750 = vsubq_s16(v833, v848); + int16x8_t v1751 = vsubq_s16(v851, v860); + int16x8_t v1752_tmp = vqrdmulhq_n_s16(v1751, 10045); + int16x8_t v1752 = vaddq_s16(v1752_tmp, v1751); + int16x8_t v1753 = vaddq_s16(v1750, v1752); + int16x8_t v1754 = vqrdmulhq_n_s16(v1753, 19705); + int16x8_t v1755 = vaddq_s16(v1749, v1754); + int16x8_t v1756 = vsubq_s16(v867, v870); + int16x8_t v1757 = vsubq_s16(v875, v879); + int16x8_t v1758_tmp = vqrdmulhq_n_s16(v1757, 10045); + int16x8_t v1758 = vaddq_s16(v1758_tmp, v1757); + int16x8_t v1759 = vaddq_s16(v1756, v1758); + int16x8_t v1760 = vsubq_s16(v884, v887); + int16x8_t v1761 = vsubq_s16(v890, v897); + int16x8_t v1762_tmp = vqrdmulhq_n_s16(v1761, 10045); + int16x8_t v1762 = vaddq_s16(v1762_tmp, v1761); + int16x8_t v1763 = vaddq_s16(v1760, v1762); + int16x8_t v1764 = vqrdmulhq_n_s16(v1763, 19705); + int16x8_t v1765 = vaddq_s16(v1759, v1764); + int16x8_t v1766 = vqrdmulhq_n_s16(v1765, 17121); + int16x8_t v1767 = vaddq_s16(v1755, v1766); + int16x8_t v1768 = vqrdmulhq_n_s16(v1767, 16563); + int16x8_t v1769 = vaddq_s16(v1745, v1768); + int16x8_t v1770 = vsubq_s16(v910, v917); + int16x8_t v1771 = vsubq_s16(v926, v934); + int16x8_t v1772_tmp = vqrdmulhq_n_s16(v1771, 10045); + int16x8_t v1772 = vaddq_s16(v1772_tmp, v1771); + int16x8_t v1773 = vaddq_s16(v1770, v1772); + int16x8_t v1774 = vsubq_s16(v945, v960); + int16x8_t v1775 = vsubq_s16(v963, v972); + int16x8_t v1776_tmp = vqrdmulhq_n_s16(v1775, 10045); + int16x8_t v1776 = vaddq_s16(v1776_tmp, v1775); + int16x8_t v1777 = vaddq_s16(v1774, v1776); + int16x8_t v1778 = vqrdmulhq_n_s16(v1777, 19705); + int16x8_t v1779 = vaddq_s16(v1773, v1778); + int16x8_t v1780 = vsubq_s16(v985, v1000); + int16x8_t v1781 = vsubq_s16(v1017, v1033); + int16x8_t v1782_tmp = vqrdmulhq_n_s16(v1781, 10045); + int16x8_t v1782 = vaddq_s16(v1782_tmp, v1781); + int16x8_t v1783 = vaddq_s16(v1780, v1782); + int16x8_t v1784 = vsubq_s16(v1038, v1041); + int16x8_t v1785 = vsubq_s16(v1044, v1054); + int16x8_t v1786_tmp = vqrdmulhq_n_s16(v1785, 10045); + int16x8_t v1786 = vaddq_s16(v1786_tmp, v1785); + int16x8_t v1787 = vaddq_s16(v1784, v1786); + int16x8_t v1788 = vqrdmulhq_n_s16(v1787, 19705); + int16x8_t v1789 = vaddq_s16(v1783, v1788); + int16x8_t v1790 = vqrdmulhq_n_s16(v1789, 17121); + int16x8_t v1791 = vaddq_s16(v1779, v1790); + int16x8_t v1792 = vsubq_s16(v1063, v1066); + int16x8_t v1793 = vsubq_s16(v1071, v1075); + int16x8_t v1794_tmp = vqrdmulhq_n_s16(v1793, 10045); + int16x8_t v1794 = vaddq_s16(v1794_tmp, v1793); + int16x8_t v1795 = vaddq_s16(v1792, v1794); + int16x8_t v1796 = vsubq_s16(v1082, v1089); + int16x8_t v1797 = vsubq_s16(v1092, v1097); + int16x8_t v1798_tmp = vqrdmulhq_n_s16(v1797, 10045); + int16x8_t v1798 = vaddq_s16(v1798_tmp, v1797); + int16x8_t v1799 = vaddq_s16(v1796, v1798); + int16x8_t v1800 = vqrdmulhq_n_s16(v1799, 19705); + int16x8_t v1801 = vaddq_s16(v1795, v1800); + int16x8_t v1802 = vsubq_s16(v1104, v1107); + int16x8_t v1803 = vsubq_s16(v1112, v1116); + int16x8_t v1804_tmp = vqrdmulhq_n_s16(v1803, 10045); + int16x8_t v1804 = vaddq_s16(v1804_tmp, v1803); + int16x8_t v1805 = vaddq_s16(v1802, v1804); + int16x8_t v1806 = vsubq_s16(v1121, v1124); + int16x8_t v1807 = vsubq_s16(v1127, v1135); + int16x8_t v1808_tmp = vqrdmulhq_n_s16(v1807, 10045); + int16x8_t v1808 = vaddq_s16(v1808_tmp, v1807); + int16x8_t v1809 = vaddq_s16(v1806, v1808); + int16x8_t v1810 = vqrdmulhq_n_s16(v1809, 19705); + int16x8_t v1811 = vaddq_s16(v1805, v1810); + int16x8_t v1812 = vqrdmulhq_n_s16(v1811, 17121); + int16x8_t v1813 = vaddq_s16(v1801, v1812); + int16x8_t v1814 = vqrdmulhq_n_s16(v1813, 16563); + int16x8_t v1815 = vaddq_s16(v1791, v1814); + int16x8_t v1816 = vqrdmulhq_n_s16(v1815, 16429); + int16x8_t v1817 = vaddq_s16(v1769, v1816); + int16x8_t v1818 = vsubq_s16(v1148, v1151); + int16x8_t v1819 = vsubq_s16(v1156, v1160); + int16x8_t v1820_tmp = vqrdmulhq_n_s16(v1819, 10045); + int16x8_t v1820 = vaddq_s16(v1820_tmp, v1819); + int16x8_t v1821 = vaddq_s16(v1818, v1820); + int16x8_t v1822 = vsubq_s16(v1167, v1174); + int16x8_t v1823 = vsubq_s16(v1177, v1182); + int16x8_t v1824_tmp = vqrdmulhq_n_s16(v1823, 10045); + int16x8_t v1824 = vaddq_s16(v1824_tmp, v1823); + int16x8_t v1825 = vaddq_s16(v1822, v1824); + int16x8_t v1826 = vqrdmulhq_n_s16(v1825, 19705); + int16x8_t v1827 = vaddq_s16(v1821, v1826); + int16x8_t v1828 = vsubq_s16(v1191, v1198); + int16x8_t v1829 = vsubq_s16(v1207, v1215); + int16x8_t v1830_tmp = vqrdmulhq_n_s16(v1829, 10045); + int16x8_t v1830 = vaddq_s16(v1830_tmp, v1829); + int16x8_t v1831 = vaddq_s16(v1828, v1830); + int16x8_t v1832 = vsubq_s16(v1220, v1223); + int16x8_t v1833 = vsubq_s16(v1226, v1232); + int16x8_t v1834_tmp = vqrdmulhq_n_s16(v1833, 10045); + int16x8_t v1834 = vaddq_s16(v1834_tmp, v1833); + int16x8_t v1835 = vaddq_s16(v1832, v1834); + int16x8_t v1836 = vqrdmulhq_n_s16(v1835, 19705); + int16x8_t v1837 = vaddq_s16(v1831, v1836); + int16x8_t v1838 = vqrdmulhq_n_s16(v1837, 17121); + int16x8_t v1839 = vaddq_s16(v1827, v1838); + int16x8_t v1840 = vsubq_s16(v1243, v1250); + int16x8_t v1841 = vsubq_s16(v1259, v1267); + int16x8_t v1842_tmp = vqrdmulhq_n_s16(v1841, 10045); + int16x8_t v1842 = vaddq_s16(v1842_tmp, v1841); + int16x8_t v1843 = vaddq_s16(v1840, v1842); + int16x8_t v1844 = vsubq_s16(v1278, v1293); + int16x8_t v1845 = vsubq_s16(v1296, v1305); + int16x8_t v1846_tmp = vqrdmulhq_n_s16(v1845, 10045); + int16x8_t v1846 = vaddq_s16(v1846_tmp, v1845); + int16x8_t v1847 = vaddq_s16(v1844, v1846); + int16x8_t v1848 = vqrdmulhq_n_s16(v1847, 19705); + int16x8_t v1849 = vaddq_s16(v1843, v1848); + int16x8_t v1850 = vsubq_s16(v1312, v1315); + int16x8_t v1851 = vsubq_s16(v1320, v1324); + int16x8_t v1852_tmp = vqrdmulhq_n_s16(v1851, 10045); + int16x8_t v1852 = vaddq_s16(v1852_tmp, v1851); + int16x8_t v1853 = vaddq_s16(v1850, v1852); + int16x8_t v1854 = vsubq_s16(v1329, v1332); + int16x8_t v1855 = vsubq_s16(v1335, v1342); + int16x8_t v1856_tmp = vqrdmulhq_n_s16(v1855, 10045); + int16x8_t v1856 = vaddq_s16(v1856_tmp, v1855); + int16x8_t v1857 = vaddq_s16(v1854, v1856); + int16x8_t v1858 = vqrdmulhq_n_s16(v1857, 19705); + int16x8_t v1859 = vaddq_s16(v1853, v1858); + int16x8_t v1860 = vqrdmulhq_n_s16(v1859, 17121); + int16x8_t v1861 = vaddq_s16(v1849, v1860); + int16x8_t v1862 = vqrdmulhq_n_s16(v1861, 16563); + int16x8_t v1863 = vaddq_s16(v1839, v1862); + int16x8_t v1864 = vsubq_s16(v1353, v1356); + int16x8_t v1865 = vsubq_s16(v1361, v1365); + int16x8_t v1866_tmp = vqrdmulhq_n_s16(v1865, 10045); + int16x8_t v1866 = vaddq_s16(v1866_tmp, v1865); + int16x8_t v1867 = vaddq_s16(v1864, v1866); + int16x8_t v1868 = vsubq_s16(v1372, v1379); + int16x8_t v1869 = vsubq_s16(v1382, v1387); + int16x8_t v1870_tmp = vqrdmulhq_n_s16(v1869, 10045); + int16x8_t v1870 = vaddq_s16(v1870_tmp, v1869); + int16x8_t v1871 = vaddq_s16(v1868, v1870); + int16x8_t v1872 = vqrdmulhq_n_s16(v1871, 19705); + int16x8_t v1873 = vaddq_s16(v1867, v1872); + int16x8_t v1874 = vsubq_s16(v1396, v1403); + int16x8_t v1875 = vsubq_s16(v1412, v1420); + int16x8_t v1876_tmp = vqrdmulhq_n_s16(v1875, 10045); + int16x8_t v1876 = vaddq_s16(v1876_tmp, v1875); + int16x8_t v1877 = vaddq_s16(v1874, v1876); + int16x8_t v1878 = vsubq_s16(v1425, v1428); + int16x8_t v1879 = vsubq_s16(v1431, v1437); + int16x8_t v1880_tmp = vqrdmulhq_n_s16(v1879, 10045); + int16x8_t v1880 = vaddq_s16(v1880_tmp, v1879); + int16x8_t v1881 = vaddq_s16(v1878, v1880); + int16x8_t v1882 = vqrdmulhq_n_s16(v1881, 19705); + int16x8_t v1883 = vaddq_s16(v1877, v1882); + int16x8_t v1884 = vqrdmulhq_n_s16(v1883, 17121); + int16x8_t v1885 = vaddq_s16(v1873, v1884); + int16x8_t v1886 = vsubq_s16(v1446, v1449); + int16x8_t v1887 = vsubq_s16(v1454, v1458); + int16x8_t v1888_tmp = vqrdmulhq_n_s16(v1887, 10045); + int16x8_t v1888 = vaddq_s16(v1888_tmp, v1887); + int16x8_t v1889 = vaddq_s16(v1886, v1888); + int16x8_t v1890 = vsubq_s16(v1465, v1472); + int16x8_t v1891 = vsubq_s16(v1475, v1480); + int16x8_t v1892_tmp = vqrdmulhq_n_s16(v1891, 10045); + int16x8_t v1892 = vaddq_s16(v1892_tmp, v1891); + int16x8_t v1893 = vaddq_s16(v1890, v1892); + int16x8_t v1894 = vqrdmulhq_n_s16(v1893, 19705); + int16x8_t v1895 = vaddq_s16(v1889, v1894); + int16x8_t v1896 = vsubq_s16(v1487, v1490); + int16x8_t v1897 = vsubq_s16(v1495, v1499); + int16x8_t v1898_tmp = vqrdmulhq_n_s16(v1897, 10045); + int16x8_t v1898 = vaddq_s16(v1898_tmp, v1897); + int16x8_t v1899 = vaddq_s16(v1896, v1898); + int16x8_t v1900 = vsubq_s16(v1504, v1507); + int16x8_t v1901 = vsubq_s16(v1510, v1518); + int16x8_t v1902_tmp = vqrdmulhq_n_s16(v1901, 10045); + int16x8_t v1902 = vaddq_s16(v1902_tmp, v1901); + int16x8_t v1903 = vaddq_s16(v1900, v1902); + int16x8_t v1904 = vqrdmulhq_n_s16(v1903, 19705); + int16x8_t v1905 = vaddq_s16(v1899, v1904); + int16x8_t v1906 = vqrdmulhq_n_s16(v1905, 17121); + int16x8_t v1907 = vaddq_s16(v1895, v1906); + int16x8_t v1908 = vqrdmulhq_n_s16(v1907, 16563); + int16x8_t v1909 = vaddq_s16(v1885, v1908); + int16x8_t v1910 = vqrdmulhq_n_s16(v1909, 16429); + int16x8_t v1911 = vaddq_s16(v1863, v1910); + int16x8_t v1912 = vqrdmulhq_n_s16(v1911, 16395); + int16x8_t v1913 = vaddq_s16(v1817, v1912); + int16x8_t v1914 = vqrdmulhq_n_s16(v1913, 16387); + int16x8_t v1915 = vaddq_s16(v1723, v1914); + int16x8_t v1916 = vsubq_s16(v1534, v1536); + int16x8_t v1917 = vsubq_s16(v1538, v1540); + int16x8_t v1918 = vqrdmulhq_n_s16(v1917, 29490); + int16x8_t v1919 = vaddq_s16(v1916, v1918); + int16x8_t v1920 = vsubq_s16(v1544, v1546); + int16x8_t v1921 = vsubq_s16(v1548, v1550); + int16x8_t v1922 = vqrdmulhq_n_s16(v1921, 29490); + int16x8_t v1923 = vaddq_s16(v1920, v1922); + int16x8_t v1924 = vqrdmulhq_n_s16(v1923, 18578); + int16x8_t v1925 = vaddq_s16(v1919, v1924); + int16x8_t v1926 = vsubq_s16(v1556, v1558); + int16x8_t v1927 = vsubq_s16(v1560, v1562); + int16x8_t v1928 = vqrdmulhq_n_s16(v1927, 29490); + int16x8_t v1929 = vaddq_s16(v1926, v1928); + int16x8_t v1930 = vsubq_s16(v1566, v1568); + int16x8_t v1931 = vsubq_s16(v1570, v1572); + int16x8_t v1932 = vqrdmulhq_n_s16(v1931, 29490); + int16x8_t v1933 = vaddq_s16(v1930, v1932); + int16x8_t v1934 = vqrdmulhq_n_s16(v1933, 18578); + int16x8_t v1935 = vaddq_s16(v1929, v1934); + int16x8_t v1936 = vqrdmulhq_n_s16(v1935, 16890); + int16x8_t v1937 = vaddq_s16(v1925, v1936); + int16x8_t v1938 = vsubq_s16(v1580, v1582); + int16x8_t v1939 = vsubq_s16(v1584, v1586); + int16x8_t v1940 = vqrdmulhq_n_s16(v1939, 29490); + int16x8_t v1941 = vaddq_s16(v1938, v1940); + int16x8_t v1942 = vsubq_s16(v1590, v1592); + int16x8_t v1943 = vsubq_s16(v1594, v1596); + int16x8_t v1944 = vqrdmulhq_n_s16(v1943, 29490); + int16x8_t v1945 = vaddq_s16(v1942, v1944); + int16x8_t v1946 = vqrdmulhq_n_s16(v1945, 18578); + int16x8_t v1947 = vaddq_s16(v1941, v1946); + int16x8_t v1948 = vsubq_s16(v1602, v1604); + int16x8_t v1949 = vsubq_s16(v1606, v1608); + int16x8_t v1950 = vqrdmulhq_n_s16(v1949, 29490); + int16x8_t v1951 = vaddq_s16(v1948, v1950); + int16x8_t v1952 = vsubq_s16(v1612, v1614); + int16x8_t v1953 = vsubq_s16(v1616, v1618); + int16x8_t v1954 = vqrdmulhq_n_s16(v1953, 29490); + int16x8_t v1955 = vaddq_s16(v1952, v1954); + int16x8_t v1956 = vqrdmulhq_n_s16(v1955, 18578); + int16x8_t v1957 = vaddq_s16(v1951, v1956); + int16x8_t v1958 = vqrdmulhq_n_s16(v1957, 16890); + int16x8_t v1959 = vaddq_s16(v1947, v1958); + int16x8_t v1960 = vqrdmulhq_n_s16(v1959, 16508); + int16x8_t v1961 = vaddq_s16(v1937, v1960); + int16x8_t v1962 = vsubq_s16(v1628, v1630); + int16x8_t v1963 = vsubq_s16(v1632, v1634); + int16x8_t v1964 = vqrdmulhq_n_s16(v1963, 29490); + int16x8_t v1965 = vaddq_s16(v1962, v1964); + int16x8_t v1966 = vsubq_s16(v1638, v1640); + int16x8_t v1967 = vsubq_s16(v1642, v1644); + int16x8_t v1968 = vqrdmulhq_n_s16(v1967, 29490); + int16x8_t v1969 = vaddq_s16(v1966, v1968); + int16x8_t v1970 = vqrdmulhq_n_s16(v1969, 18578); + int16x8_t v1971 = vaddq_s16(v1965, v1970); + int16x8_t v1972 = vsubq_s16(v1650, v1652); + int16x8_t v1973 = vsubq_s16(v1654, v1656); + int16x8_t v1974 = vqrdmulhq_n_s16(v1973, 29490); + int16x8_t v1975 = vaddq_s16(v1972, v1974); + int16x8_t v1976 = vsubq_s16(v1660, v1662); + int16x8_t v1977 = vsubq_s16(v1664, v1666); + int16x8_t v1978 = vqrdmulhq_n_s16(v1977, 29490); + int16x8_t v1979 = vaddq_s16(v1976, v1978); + int16x8_t v1980 = vqrdmulhq_n_s16(v1979, 18578); + int16x8_t v1981 = vaddq_s16(v1975, v1980); + int16x8_t v1982 = vqrdmulhq_n_s16(v1981, 16890); + int16x8_t v1983 = vaddq_s16(v1971, v1982); + int16x8_t v1984 = vsubq_s16(v1674, v1676); + int16x8_t v1985 = vsubq_s16(v1678, v1680); + int16x8_t v1986 = vqrdmulhq_n_s16(v1985, 29490); + int16x8_t v1987 = vaddq_s16(v1984, v1986); + int16x8_t v1988 = vsubq_s16(v1684, v1686); + int16x8_t v1989 = vsubq_s16(v1688, v1690); + int16x8_t v1990 = vqrdmulhq_n_s16(v1989, 29490); + int16x8_t v1991 = vaddq_s16(v1988, v1990); + int16x8_t v1992 = vqrdmulhq_n_s16(v1991, 18578); + int16x8_t v1993 = vaddq_s16(v1987, v1992); + int16x8_t v1994 = vsubq_s16(v1696, v1698); + int16x8_t v1995 = vsubq_s16(v1700, v1702); + int16x8_t v1996 = vqrdmulhq_n_s16(v1995, 29490); + int16x8_t v1997 = vaddq_s16(v1994, v1996); + int16x8_t v1998 = vsubq_s16(v1706, v1708); + int16x8_t v1999 = vsubq_s16(v1710, v1712); + int16x8_t v2000 = vqrdmulhq_n_s16(v1999, 29490); + int16x8_t v2001 = vaddq_s16(v1998, v2000); + int16x8_t v2002 = vqrdmulhq_n_s16(v2001, 18578); + int16x8_t v2003 = vaddq_s16(v1997, v2002); + int16x8_t v2004 = vqrdmulhq_n_s16(v2003, 16890); + int16x8_t v2005 = vaddq_s16(v1993, v2004); + int16x8_t v2006 = vqrdmulhq_n_s16(v2005, 16508); + int16x8_t v2007 = vaddq_s16(v1983, v2006); + int16x8_t v2008 = vqrdmulhq_n_s16(v2007, 16415); + int16x8_t v2009 = vaddq_s16(v1961, v2008); + int16x8_t v2010 = vsubq_s16(v1724, v1726); + int16x8_t v2011 = vsubq_s16(v1728, v1730); + int16x8_t v2012 = vqrdmulhq_n_s16(v2011, 29490); + int16x8_t v2013 = vaddq_s16(v2010, v2012); + int16x8_t v2014 = vsubq_s16(v1734, v1736); + int16x8_t v2015 = vsubq_s16(v1738, v1740); + int16x8_t v2016 = vqrdmulhq_n_s16(v2015, 29490); + int16x8_t v2017 = vaddq_s16(v2014, v2016); + int16x8_t v2018 = vqrdmulhq_n_s16(v2017, 18578); + int16x8_t v2019 = vaddq_s16(v2013, v2018); + int16x8_t v2020 = vsubq_s16(v1746, v1748); + int16x8_t v2021 = vsubq_s16(v1750, v1752); + int16x8_t v2022 = vqrdmulhq_n_s16(v2021, 29490); + int16x8_t v2023 = vaddq_s16(v2020, v2022); + int16x8_t v2024 = vsubq_s16(v1756, v1758); + int16x8_t v2025 = vsubq_s16(v1760, v1762); + int16x8_t v2026 = vqrdmulhq_n_s16(v2025, 29490); + int16x8_t v2027 = vaddq_s16(v2024, v2026); + int16x8_t v2028 = vqrdmulhq_n_s16(v2027, 18578); + int16x8_t v2029 = vaddq_s16(v2023, v2028); + int16x8_t v2030 = vqrdmulhq_n_s16(v2029, 16890); + int16x8_t v2031 = vaddq_s16(v2019, v2030); + int16x8_t v2032 = vsubq_s16(v1770, v1772); + int16x8_t v2033 = vsubq_s16(v1774, v1776); + int16x8_t v2034 = vqrdmulhq_n_s16(v2033, 29490); + int16x8_t v2035 = vaddq_s16(v2032, v2034); + int16x8_t v2036 = vsubq_s16(v1780, v1782); + int16x8_t v2037 = vsubq_s16(v1784, v1786); + int16x8_t v2038 = vqrdmulhq_n_s16(v2037, 29490); + int16x8_t v2039 = vaddq_s16(v2036, v2038); + int16x8_t v2040 = vqrdmulhq_n_s16(v2039, 18578); + int16x8_t v2041 = vaddq_s16(v2035, v2040); + int16x8_t v2042 = vsubq_s16(v1792, v1794); + int16x8_t v2043 = vsubq_s16(v1796, v1798); + int16x8_t v2044 = vqrdmulhq_n_s16(v2043, 29490); + int16x8_t v2045 = vaddq_s16(v2042, v2044); + int16x8_t v2046 = vsubq_s16(v1802, v1804); + int16x8_t v2047 = vsubq_s16(v1806, v1808); + int16x8_t v2048 = vqrdmulhq_n_s16(v2047, 29490); + int16x8_t v2049 = vaddq_s16(v2046, v2048); + int16x8_t v2050 = vqrdmulhq_n_s16(v2049, 18578); + int16x8_t v2051 = vaddq_s16(v2045, v2050); + int16x8_t v2052 = vqrdmulhq_n_s16(v2051, 16890); + int16x8_t v2053 = vaddq_s16(v2041, v2052); + int16x8_t v2054 = vqrdmulhq_n_s16(v2053, 16508); + int16x8_t v2055 = vaddq_s16(v2031, v2054); + int16x8_t v2056 = vsubq_s16(v1818, v1820); + int16x8_t v2057 = vsubq_s16(v1822, v1824); + int16x8_t v2058 = vqrdmulhq_n_s16(v2057, 29490); + int16x8_t v2059 = vaddq_s16(v2056, v2058); + int16x8_t v2060 = vsubq_s16(v1828, v1830); + int16x8_t v2061 = vsubq_s16(v1832, v1834); + int16x8_t v2062 = vqrdmulhq_n_s16(v2061, 29490); + int16x8_t v2063 = vaddq_s16(v2060, v2062); + int16x8_t v2064 = vqrdmulhq_n_s16(v2063, 18578); + int16x8_t v2065 = vaddq_s16(v2059, v2064); + int16x8_t v2066 = vsubq_s16(v1840, v1842); + int16x8_t v2067 = vsubq_s16(v1844, v1846); + int16x8_t v2068 = vqrdmulhq_n_s16(v2067, 29490); + int16x8_t v2069 = vaddq_s16(v2066, v2068); + int16x8_t v2070 = vsubq_s16(v1850, v1852); + int16x8_t v2071 = vqrdmulhq_n_s16(v2070, 18578); + int16x8_t v2072 = vsubq_s16(v1854, v1856); + int16x8_t v2073 = vqrdmulhq_n_s16(v2072, 16719); + int16x8_t v2074 = vaddq_s16(v2071, v2073); + int16x8_t v2075 = vaddq_s16(v2069, v2074); + int16x8_t v2076 = vqrdmulhq_n_s16(v2075, 16890); + int16x8_t v2077 = vaddq_s16(v2065, v2076); + int16x8_t v2078 = vsubq_s16(v1864, v1866); + int16x8_t v2079 = vsubq_s16(v1868, v1870); + int16x8_t v2080 = vqrdmulhq_n_s16(v2079, 29490); + int16x8_t v2081 = vaddq_s16(v2078, v2080); + int16x8_t v2082 = vsubq_s16(v1874, v1876); + int16x8_t v2083 = vsubq_s16(v1878, v1880); + int16x8_t v2084 = vqrdmulhq_n_s16(v2083, 29490); + int16x8_t v2085 = vaddq_s16(v2082, v2084); + int16x8_t v2086 = vqrdmulhq_n_s16(v2085, 18578); + int16x8_t v2087 = vaddq_s16(v2081, v2086); + int16x8_t v2088 = vsubq_s16(v1886, v1888); + int16x8_t v2089 = vsubq_s16(v1890, v1892); + int16x8_t v2090 = vqrdmulhq_n_s16(v2089, 29490); + int16x8_t v2091 = vaddq_s16(v2088, v2090); + int16x8_t v2092 = vsubq_s16(v1896, v1898); + int16x8_t v2093 = vsubq_s16(v1900, v1902); + int16x8_t v2094 = vqrdmulhq_n_s16(v2093, 29490); + int16x8_t v2095 = vaddq_s16(v2092, v2094); + int16x8_t v2096 = vqrdmulhq_n_s16(v2095, 18578); + int16x8_t v2097 = vaddq_s16(v2091, v2096); + int16x8_t v2098 = vqrdmulhq_n_s16(v2097, 16890); + int16x8_t v2099 = vaddq_s16(v2087, v2098); + int16x8_t v2100 = vqrdmulhq_n_s16(v2099, 16508); + int16x8_t v2101 = vaddq_s16(v2077, v2100); + int16x8_t v2102 = vqrdmulhq_n_s16(v2101, 16415); + int16x8_t v2103 = vaddq_s16(v2055, v2102); + int16x8_t v2104 = vqrdmulhq_n_s16(v2103, 16392); + int16x8_t v2105 = vaddq_s16(v2009, v2104); + int16x8_t v2106 = vsubq_s16(v2, v8); + int16x8_t v2107 = vsubq_s16(v15, v22); + int16x8_t v2108_tmp = vqrdmulhq_n_s16(v2107, 18446); + int16x8_t v2108 = vmlaq_n_s16(v2108_tmp, v2107, 2); + int16x8_t v2109 = vaddq_s16(v2106, v2108); + int16x8_t v2110 = vsubq_s16(v31, v41); + int16x8_t v2111 = vsubq_s16(v48, v56); + int16x8_t v2112_tmp = vqrdmulhq_n_s16(v2111, 18446); + int16x8_t v2112 = vmlaq_n_s16(v2112_tmp, v2111, 2); + int16x8_t v2113 = vaddq_s16(v2110, v2112); + int16x8_t v2114 = vqrdmulhq_n_s16(v2113, 21195); + int16x8_t v2115 = vaddq_s16(v2109, v2114); + int16x8_t v2116 = vsubq_s16(v67, v77); + int16x8_t v2117 = vsubq_s16(v90, v99); + int16x8_t v2118_tmp = vqrdmulhq_n_s16(v2117, 18446); + int16x8_t v2118 = vmlaq_n_s16(v2118_tmp, v2117, 2); + int16x8_t v2119 = vaddq_s16(v2116, v2118); + int16x8_t v2120 = vsubq_s16(v108, v118); + int16x8_t v2121 = vsubq_s16(v125, v134); + int16x8_t v2122_tmp = vqrdmulhq_n_s16(v2121, 18446); + int16x8_t v2122 = vmlaq_n_s16(v2122_tmp, v2121, 2); + int16x8_t v2123 = vaddq_s16(v2120, v2122); + int16x8_t v2124 = vqrdmulhq_n_s16(v2123, 21195); + int16x8_t v2125 = vaddq_s16(v2119, v2124); + int16x8_t v2126 = vqrdmulhq_n_s16(v2125, 17401); + int16x8_t v2127 = vaddq_s16(v2115, v2126); + int16x8_t v2128 = vsubq_s16(v147, v157); + int16x8_t v2129 = vsubq_s16(v170, v179); + int16x8_t v2130_tmp = vqrdmulhq_n_s16(v2129, 18446); + int16x8_t v2130 = vmlaq_n_s16(v2130_tmp, v2129, 2); + int16x8_t v2131 = vaddq_s16(v2128, v2130); + int16x8_t v2132 = vsubq_s16(v194, v212); + int16x8_t v2133 = vsubq_s16(v219, v229); + int16x8_t v2134_tmp = vqrdmulhq_n_s16(v2133, 18446); + int16x8_t v2134 = vmlaq_n_s16(v2134_tmp, v2133, 2); + int16x8_t v2135 = vaddq_s16(v2132, v2134); + int16x8_t v2136 = vqrdmulhq_n_s16(v2135, 21195); + int16x8_t v2137 = vaddq_s16(v2131, v2136); + int16x8_t v2138 = vsubq_s16(v240, v250); + int16x8_t v2139 = vsubq_s16(v263, v272); + int16x8_t v2140_tmp = vqrdmulhq_n_s16(v2139, 18446); + int16x8_t v2140 = vmlaq_n_s16(v2140_tmp, v2139, 2); + int16x8_t v2141 = vaddq_s16(v2138, v2140); + int16x8_t v2142 = vsubq_s16(v281, v291); + int16x8_t v2143 = vsubq_s16(v298, v308); + int16x8_t v2144_tmp = vqrdmulhq_n_s16(v2143, 18446); + int16x8_t v2144 = vmlaq_n_s16(v2144_tmp, v2143, 2); + int16x8_t v2145 = vaddq_s16(v2142, v2144); + int16x8_t v2146 = vqrdmulhq_n_s16(v2145, 21195); + int16x8_t v2147 = vaddq_s16(v2141, v2146); + int16x8_t v2148 = vqrdmulhq_n_s16(v2147, 17401); + int16x8_t v2149 = vaddq_s16(v2137, v2148); + int16x8_t v2150 = vqrdmulhq_n_s16(v2149, 16629); + int16x8_t v2151 = vaddq_s16(v2127, v2150); + int16x8_t v2152 = vsubq_s16(v323, v333); + int16x8_t v2153 = vsubq_s16(v346, v355); + int16x8_t v2154_tmp = vqrdmulhq_n_s16(v2153, 18446); + int16x8_t v2154 = vmlaq_n_s16(v2154_tmp, v2153, 2); + int16x8_t v2155 = vaddq_s16(v2152, v2154); + int16x8_t v2156 = vsubq_s16(v370, v388); + int16x8_t v2157 = vsubq_s16(v395, v405); + int16x8_t v2158_tmp = vqrdmulhq_n_s16(v2157, 18446); + int16x8_t v2158 = vmlaq_n_s16(v2158_tmp, v2157, 2); + int16x8_t v2159 = vaddq_s16(v2156, v2158); + int16x8_t v2160 = vqrdmulhq_n_s16(v2159, 21195); + int16x8_t v2161 = vaddq_s16(v2155, v2160); + int16x8_t v2162 = vsubq_s16(v422, v440); + int16x8_t v2163 = vsubq_s16(v465, v478); + int16x8_t v2164_tmp = vqrdmulhq_n_s16(v2163, 18446); + int16x8_t v2164 = vmlaq_n_s16(v2164_tmp, v2163, 2); + int16x8_t v2165 = vaddq_s16(v2162, v2164); + int16x8_t v2166 = vsubq_s16(v487, v497); + int16x8_t v2167 = vsubq_s16(v504, v515); + int16x8_t v2168_tmp = vqrdmulhq_n_s16(v2167, 18446); + int16x8_t v2168 = vmlaq_n_s16(v2168_tmp, v2167, 2); + int16x8_t v2169 = vaddq_s16(v2166, v2168); + int16x8_t v2170 = vqrdmulhq_n_s16(v2169, 21195); + int16x8_t v2171 = vaddq_s16(v2165, v2170); + int16x8_t v2172 = vqrdmulhq_n_s16(v2171, 17401); + int16x8_t v2173 = vaddq_s16(v2161, v2172); + int16x8_t v2174 = vsubq_s16(v528, v538); + int16x8_t v2175 = vsubq_s16(v551, v560); + int16x8_t v2176_tmp = vqrdmulhq_n_s16(v2175, 18446); + int16x8_t v2176 = vmlaq_n_s16(v2176_tmp, v2175, 2); + int16x8_t v2177 = vaddq_s16(v2174, v2176); + int16x8_t v2178 = vsubq_s16(v575, v593); + int16x8_t v2179 = vsubq_s16(v600, v610); + int16x8_t v2180_tmp = vqrdmulhq_n_s16(v2179, 18446); + int16x8_t v2180 = vmlaq_n_s16(v2180_tmp, v2179, 2); + int16x8_t v2181 = vaddq_s16(v2178, v2180); + int16x8_t v2182 = vqrdmulhq_n_s16(v2181, 21195); + int16x8_t v2183 = vaddq_s16(v2177, v2182); + int16x8_t v2184 = vsubq_s16(v621, v631); + int16x8_t v2185 = vsubq_s16(v644, v653); + int16x8_t v2186_tmp = vqrdmulhq_n_s16(v2185, 18446); + int16x8_t v2186 = vmlaq_n_s16(v2186_tmp, v2185, 2); + int16x8_t v2187 = vaddq_s16(v2184, v2186); + int16x8_t v2188 = vsubq_s16(v662, v672); + int16x8_t v2189 = vsubq_s16(v679, v690); + int16x8_t v2190_tmp = vqrdmulhq_n_s16(v2189, 18446); + int16x8_t v2190 = vmlaq_n_s16(v2190_tmp, v2189, 2); + int16x8_t v2191 = vaddq_s16(v2188, v2190); + int16x8_t v2192 = vqrdmulhq_n_s16(v2191, 21195); + int16x8_t v2193 = vaddq_s16(v2187, v2192); + int16x8_t v2194 = vqrdmulhq_n_s16(v2193, 17401); + int16x8_t v2195 = vaddq_s16(v2183, v2194); + int16x8_t v2196 = vqrdmulhq_n_s16(v2195, 16629); + int16x8_t v2197 = vaddq_s16(v2173, v2196); + int16x8_t v2198 = vqrdmulhq_n_s16(v2197, 16445); + int16x8_t v2199 = vaddq_s16(v2151, v2198); + int16x8_t v2200 = vsubq_s16(v707, v717); + int16x8_t v2201 = vsubq_s16(v730, v739); + int16x8_t v2202_tmp = vqrdmulhq_n_s16(v2201, 18446); + int16x8_t v2202 = vmlaq_n_s16(v2202_tmp, v2201, 2); + int16x8_t v2203 = vaddq_s16(v2200, v2202); + int16x8_t v2204 = vsubq_s16(v754, v772); + int16x8_t v2205 = vsubq_s16(v779, v789); + int16x8_t v2206_tmp = vqrdmulhq_n_s16(v2205, 18446); + int16x8_t v2206 = vmlaq_n_s16(v2206_tmp, v2205, 2); + int16x8_t v2207 = vaddq_s16(v2204, v2206); + int16x8_t v2208 = vqrdmulhq_n_s16(v2207, 21195); + int16x8_t v2209 = vaddq_s16(v2203, v2208); + int16x8_t v2210 = vsubq_s16(v806, v824); + int16x8_t v2211 = vsubq_s16(v849, v862); + int16x8_t v2212_tmp = vqrdmulhq_n_s16(v2211, 18446); + int16x8_t v2212 = vmlaq_n_s16(v2212_tmp, v2211, 2); + int16x8_t v2213 = vaddq_s16(v2210, v2212); + int16x8_t v2214 = vsubq_s16(v871, v881); + int16x8_t v2215 = vsubq_s16(v888, v899); + int16x8_t v2216_tmp = vqrdmulhq_n_s16(v2215, 18446); + int16x8_t v2216 = vmlaq_n_s16(v2216_tmp, v2215, 2); + int16x8_t v2217 = vaddq_s16(v2214, v2216); + int16x8_t v2218 = vqrdmulhq_n_s16(v2217, 21195); + int16x8_t v2219 = vaddq_s16(v2213, v2218); + int16x8_t v2220 = vqrdmulhq_n_s16(v2219, 17401); + int16x8_t v2221 = vaddq_s16(v2209, v2220); + int16x8_t v2222 = vsubq_s16(v918, v936); + int16x8_t v2223 = vsubq_s16(v961, v974); + int16x8_t v2224_tmp = vqrdmulhq_n_s16(v2223, 18446); + int16x8_t v2224 = vmlaq_n_s16(v2224_tmp, v2223, 2); + int16x8_t v2225 = vaddq_s16(v2222, v2224); + int16x8_t v2226 = vsubq_s16(v1001, v1035); + int16x8_t v2227 = vsubq_s16(v1042, v1056); + int16x8_t v2228_tmp = vqrdmulhq_n_s16(v2227, 18446); + int16x8_t v2228 = vmlaq_n_s16(v2228_tmp, v2227, 2); + int16x8_t v2229 = vaddq_s16(v2226, v2228); + int16x8_t v2230 = vqrdmulhq_n_s16(v2229, 21195); + int16x8_t v2231 = vaddq_s16(v2225, v2230); + int16x8_t v2232 = vsubq_s16(v1067, v1077); + int16x8_t v2233 = vsubq_s16(v1090, v1099); + int16x8_t v2234_tmp = vqrdmulhq_n_s16(v2233, 18446); + int16x8_t v2234 = vmlaq_n_s16(v2234_tmp, v2233, 2); + int16x8_t v2235 = vaddq_s16(v2232, v2234); + int16x8_t v2236 = vsubq_s16(v1108, v1118); + int16x8_t v2237 = vsubq_s16(v1125, v1137); + int16x8_t v2238_tmp = vqrdmulhq_n_s16(v2237, 18446); + int16x8_t v2238 = vmlaq_n_s16(v2238_tmp, v2237, 2); + int16x8_t v2239 = vaddq_s16(v2236, v2238); + int16x8_t v2240 = vqrdmulhq_n_s16(v2239, 21195); + int16x8_t v2241 = vaddq_s16(v2235, v2240); + int16x8_t v2242 = vqrdmulhq_n_s16(v2241, 17401); + int16x8_t v2243 = vaddq_s16(v2231, v2242); + int16x8_t v2244 = vqrdmulhq_n_s16(v2243, 16629); + int16x8_t v2245 = vaddq_s16(v2221, v2244); + int16x8_t v2246 = vsubq_s16(v1152, v1162); + int16x8_t v2247 = vsubq_s16(v1175, v1184); + int16x8_t v2248_tmp = vqrdmulhq_n_s16(v2247, 18446); + int16x8_t v2248 = vmlaq_n_s16(v2248_tmp, v2247, 2); + int16x8_t v2249 = vaddq_s16(v2246, v2248); + int16x8_t v2250 = vsubq_s16(v1199, v1217); + int16x8_t v2251 = vsubq_s16(v1224, v1234); + int16x8_t v2252_tmp = vqrdmulhq_n_s16(v2251, 18446); + int16x8_t v2252 = vmlaq_n_s16(v2252_tmp, v2251, 2); + int16x8_t v2253 = vaddq_s16(v2250, v2252); + int16x8_t v2254 = vqrdmulhq_n_s16(v2253, 21195); + int16x8_t v2255 = vaddq_s16(v2249, v2254); + int16x8_t v2256 = vsubq_s16(v1251, v1269); + int16x8_t v2257 = vsubq_s16(v1294, v1307); + int16x8_t v2258_tmp = vqrdmulhq_n_s16(v2257, 18446); + int16x8_t v2258 = vmlaq_n_s16(v2258_tmp, v2257, 2); + int16x8_t v2259 = vaddq_s16(v2256, v2258); + int16x8_t v2260 = vsubq_s16(v1316, v1326); + int16x8_t v2261 = vsubq_s16(v1333, v1344); + int16x8_t v2262_tmp = vqrdmulhq_n_s16(v2261, 18446); + int16x8_t v2262 = vmlaq_n_s16(v2262_tmp, v2261, 2); + int16x8_t v2263 = vaddq_s16(v2260, v2262); + int16x8_t v2264 = vqrdmulhq_n_s16(v2263, 21195); + int16x8_t v2265 = vaddq_s16(v2259, v2264); + int16x8_t v2266 = vqrdmulhq_n_s16(v2265, 17401); + int16x8_t v2267 = vaddq_s16(v2255, v2266); + int16x8_t v2268 = vsubq_s16(v1357, v1367); + int16x8_t v2269 = vsubq_s16(v1380, v1389); + int16x8_t v2270_tmp = vqrdmulhq_n_s16(v2269, 18446); + int16x8_t v2270 = vmlaq_n_s16(v2270_tmp, v2269, 2); + int16x8_t v2271 = vaddq_s16(v2268, v2270); + int16x8_t v2272 = vsubq_s16(v1404, v1422); + int16x8_t v2273 = vsubq_s16(v1429, v1439); + int16x8_t v2274_tmp = vqrdmulhq_n_s16(v2273, 18446); + int16x8_t v2274 = vmlaq_n_s16(v2274_tmp, v2273, 2); + int16x8_t v2275 = vaddq_s16(v2272, v2274); + int16x8_t v2276 = vqrdmulhq_n_s16(v2275, 21195); + int16x8_t v2277 = vaddq_s16(v2271, v2276); + int16x8_t v2278 = vsubq_s16(v1450, v1460); + int16x8_t v2279 = vsubq_s16(v1473, v1482); + int16x8_t v2280_tmp = vqrdmulhq_n_s16(v2279, 18446); + int16x8_t v2280 = vmlaq_n_s16(v2280_tmp, v2279, 2); + int16x8_t v2281 = vaddq_s16(v2278, v2280); + int16x8_t v2282 = vsubq_s16(v1491, v1501); + int16x8_t v2283 = vsubq_s16(v1508, v1520); + int16x8_t v2284_tmp = vqrdmulhq_n_s16(v2283, 18446); + int16x8_t v2284 = vmlaq_n_s16(v2284_tmp, v2283, 2); + int16x8_t v2285 = vaddq_s16(v2282, v2284); + int16x8_t v2286 = vqrdmulhq_n_s16(v2285, 21195); + int16x8_t v2287 = vaddq_s16(v2281, v2286); + int16x8_t v2288 = vqrdmulhq_n_s16(v2287, 17401); + int16x8_t v2289 = vaddq_s16(v2277, v2288); + int16x8_t v2290 = vqrdmulhq_n_s16(v2289, 16629); + int16x8_t v2291 = vaddq_s16(v2267, v2290); + int16x8_t v2292 = vqrdmulhq_n_s16(v2291, 16445); + int16x8_t v2293 = vaddq_s16(v2245, v2292); + int16x8_t v2294 = vqrdmulhq_n_s16(v2293, 16399); + int16x8_t v2295 = vaddq_s16(v2199, v2294); + int16x8_t v2296 = vsubq_s16(v2106, v2108); + int16x8_t v2297 = vsubq_s16(v2110, v2112); + int16x8_t v2298 = vqrdmulhq_n_s16(v2297, 25826); + int16x8_t v2299 = vaddq_s16(v2296, v2298); + int16x8_t v2300 = vsubq_s16(v2116, v2118); + int16x8_t v2301 = vsubq_s16(v2120, v2122); + int16x8_t v2302 = vqrdmulhq_n_s16(v2301, 25826); + int16x8_t v2303 = vaddq_s16(v2300, v2302); + int16x8_t v2304 = vqrdmulhq_n_s16(v2303, 18124); + int16x8_t v2305 = vaddq_s16(v2299, v2304); + int16x8_t v2306 = vsubq_s16(v2128, v2130); + int16x8_t v2307 = vsubq_s16(v2132, v2134); + int16x8_t v2308 = vqrdmulhq_n_s16(v2307, 25826); + int16x8_t v2309 = vaddq_s16(v2306, v2308); + int16x8_t v2310 = vsubq_s16(v2138, v2140); + int16x8_t v2311 = vsubq_s16(v2142, v2144); + int16x8_t v2312 = vqrdmulhq_n_s16(v2311, 25826); + int16x8_t v2313 = vaddq_s16(v2310, v2312); + int16x8_t v2314 = vqrdmulhq_n_s16(v2313, 18124); + int16x8_t v2315 = vaddq_s16(v2309, v2314); + int16x8_t v2316 = vqrdmulhq_n_s16(v2315, 16792); + int16x8_t v2317 = vaddq_s16(v2305, v2316); + int16x8_t v2318 = vsubq_s16(v2152, v2154); + int16x8_t v2319 = vsubq_s16(v2156, v2158); + int16x8_t v2320 = vqrdmulhq_n_s16(v2319, 25826); + int16x8_t v2321 = vaddq_s16(v2318, v2320); + int16x8_t v2322 = vsubq_s16(v2162, v2164); + int16x8_t v2323 = vsubq_s16(v2166, v2168); + int16x8_t v2324 = vqrdmulhq_n_s16(v2323, 25826); + int16x8_t v2325 = vaddq_s16(v2322, v2324); + int16x8_t v2326 = vqrdmulhq_n_s16(v2325, 18124); + int16x8_t v2327 = vaddq_s16(v2321, v2326); + int16x8_t v2328 = vsubq_s16(v2174, v2176); + int16x8_t v2329 = vsubq_s16(v2178, v2180); + int16x8_t v2330 = vqrdmulhq_n_s16(v2329, 25826); + int16x8_t v2331 = vaddq_s16(v2328, v2330); + int16x8_t v2332 = vsubq_s16(v2184, v2186); + int16x8_t v2333 = vsubq_s16(v2188, v2190); + int16x8_t v2334 = vqrdmulhq_n_s16(v2333, 25826); + int16x8_t v2335 = vaddq_s16(v2332, v2334); + int16x8_t v2336 = vqrdmulhq_n_s16(v2335, 18124); + int16x8_t v2337 = vaddq_s16(v2331, v2336); + int16x8_t v2338 = vqrdmulhq_n_s16(v2337, 16792); + int16x8_t v2339 = vaddq_s16(v2327, v2338); + int16x8_t v2340 = vqrdmulhq_n_s16(v2339, 16484); + int16x8_t v2341 = vaddq_s16(v2317, v2340); + int16x8_t v2342 = vsubq_s16(v2200, v2202); + int16x8_t v2343 = vsubq_s16(v2204, v2206); + int16x8_t v2344 = vqrdmulhq_n_s16(v2343, 25826); + int16x8_t v2345 = vaddq_s16(v2342, v2344); + int16x8_t v2346 = vsubq_s16(v2210, v2212); + int16x8_t v2347 = vsubq_s16(v2214, v2216); + int16x8_t v2348 = vqrdmulhq_n_s16(v2347, 25826); + int16x8_t v2349 = vaddq_s16(v2346, v2348); + int16x8_t v2350 = vqrdmulhq_n_s16(v2349, 18124); + int16x8_t v2351 = vaddq_s16(v2345, v2350); + int16x8_t v2352 = vsubq_s16(v2222, v2224); + int16x8_t v2353 = vsubq_s16(v2226, v2228); + int16x8_t v2354 = vqrdmulhq_n_s16(v2353, 25826); + int16x8_t v2355 = vaddq_s16(v2352, v2354); + int16x8_t v2356 = vsubq_s16(v2232, v2234); + int16x8_t v2357 = vsubq_s16(v2236, v2238); + int16x8_t v2358 = vqrdmulhq_n_s16(v2357, 25826); + int16x8_t v2359 = vaddq_s16(v2356, v2358); + int16x8_t v2360 = vqrdmulhq_n_s16(v2359, 18124); + int16x8_t v2361 = vaddq_s16(v2355, v2360); + int16x8_t v2362 = vqrdmulhq_n_s16(v2361, 16792); + int16x8_t v2363 = vaddq_s16(v2351, v2362); + int16x8_t v2364 = vsubq_s16(v2246, v2248); + int16x8_t v2365 = vsubq_s16(v2250, v2252); + int16x8_t v2366 = vqrdmulhq_n_s16(v2365, 25826); + int16x8_t v2367 = vaddq_s16(v2364, v2366); + int16x8_t v2368 = vsubq_s16(v2256, v2258); + int16x8_t v2369 = vsubq_s16(v2260, v2262); + int16x8_t v2370 = vqrdmulhq_n_s16(v2369, 25826); + int16x8_t v2371 = vaddq_s16(v2368, v2370); + int16x8_t v2372 = vqrdmulhq_n_s16(v2371, 18124); + int16x8_t v2373 = vaddq_s16(v2367, v2372); + int16x8_t v2374 = vsubq_s16(v2268, v2270); + int16x8_t v2375 = vsubq_s16(v2272, v2274); + int16x8_t v2376 = vqrdmulhq_n_s16(v2375, 25826); + int16x8_t v2377 = vaddq_s16(v2374, v2376); + int16x8_t v2378 = vsubq_s16(v2278, v2280); + int16x8_t v2379 = vsubq_s16(v2282, v2284); + int16x8_t v2380 = vqrdmulhq_n_s16(v2379, 25826); + int16x8_t v2381 = vaddq_s16(v2378, v2380); + int16x8_t v2382 = vqrdmulhq_n_s16(v2381, 18124); + int16x8_t v2383 = vaddq_s16(v2377, v2382); + int16x8_t v2384 = vqrdmulhq_n_s16(v2383, 16792); + int16x8_t v2385 = vaddq_s16(v2373, v2384); + int16x8_t v2386 = vqrdmulhq_n_s16(v2385, 16484); + int16x8_t v2387 = vaddq_s16(v2363, v2386); + int16x8_t v2388 = vqrdmulhq_n_s16(v2387, 16409); + int16x8_t v2389 = vaddq_s16(v2341, v2388); + int16x8_t v2390 = vsubq_s16(v1916, v1918); + int16x8_t v2391 = vsubq_s16(v1920, v1922); + int16x8_t v2392_tmp = vqrdmulhq_n_s16(v2391, 1988); + int16x8_t v2392 = vaddq_s16(v2392_tmp, v2391); + int16x8_t v2393 = vaddq_s16(v2390, v2392); + int16x8_t v2394 = vsubq_s16(v1926, v1928); + int16x8_t v2395 = vsubq_s16(v1930, v1932); + int16x8_t v2396_tmp = vqrdmulhq_n_s16(v2395, 1988); + int16x8_t v2396 = vaddq_s16(v2396_tmp, v2395); + int16x8_t v2397 = vaddq_s16(v2394, v2396); + int16x8_t v2398 = vqrdmulhq_n_s16(v2397, 19102); + int16x8_t v2399 = vaddq_s16(v2393, v2398); + int16x8_t v2400 = vsubq_s16(v1938, v1940); + int16x8_t v2401 = vsubq_s16(v1942, v1944); + int16x8_t v2402_tmp = vqrdmulhq_n_s16(v2401, 1988); + int16x8_t v2402 = vaddq_s16(v2402_tmp, v2401); + int16x8_t v2403 = vaddq_s16(v2400, v2402); + int16x8_t v2404 = vsubq_s16(v1948, v1950); + int16x8_t v2405 = vsubq_s16(v1952, v1954); + int16x8_t v2406_tmp = vqrdmulhq_n_s16(v2405, 1988); + int16x8_t v2406 = vaddq_s16(v2406_tmp, v2405); + int16x8_t v2407 = vaddq_s16(v2404, v2406); + int16x8_t v2408 = vqrdmulhq_n_s16(v2407, 19102); + int16x8_t v2409 = vaddq_s16(v2403, v2408); + int16x8_t v2410 = vqrdmulhq_n_s16(v2409, 17000); + int16x8_t v2411 = vaddq_s16(v2399, v2410); + int16x8_t v2412 = vsubq_s16(v1962, v1964); + int16x8_t v2413 = vsubq_s16(v1966, v1968); + int16x8_t v2414_tmp = vqrdmulhq_n_s16(v2413, 1988); + int16x8_t v2414 = vaddq_s16(v2414_tmp, v2413); + int16x8_t v2415 = vaddq_s16(v2412, v2414); + int16x8_t v2416 = vsubq_s16(v1972, v1974); + int16x8_t v2417 = vsubq_s16(v1976, v1978); + int16x8_t v2418_tmp = vqrdmulhq_n_s16(v2417, 1988); + int16x8_t v2418 = vaddq_s16(v2418_tmp, v2417); + int16x8_t v2419 = vaddq_s16(v2416, v2418); + int16x8_t v2420 = vqrdmulhq_n_s16(v2419, 19102); + int16x8_t v2421 = vaddq_s16(v2415, v2420); + int16x8_t v2422 = vsubq_s16(v1984, v1986); + int16x8_t v2423 = vsubq_s16(v1988, v1990); + int16x8_t v2424_tmp = vqrdmulhq_n_s16(v2423, 1988); + int16x8_t v2424 = vaddq_s16(v2424_tmp, v2423); + int16x8_t v2425 = vaddq_s16(v2422, v2424); + int16x8_t v2426 = vsubq_s16(v1994, v1996); + int16x8_t v2427 = vsubq_s16(v1998, v2000); + int16x8_t v2428_tmp = vqrdmulhq_n_s16(v2427, 1988); + int16x8_t v2428 = vaddq_s16(v2428_tmp, v2427); + int16x8_t v2429 = vaddq_s16(v2426, v2428); + int16x8_t v2430 = vqrdmulhq_n_s16(v2429, 19102); + int16x8_t v2431 = vaddq_s16(v2425, v2430); + int16x8_t v2432 = vqrdmulhq_n_s16(v2431, 17000); + int16x8_t v2433 = vaddq_s16(v2421, v2432); + int16x8_t v2434 = vqrdmulhq_n_s16(v2433, 16534); + int16x8_t v2435 = vaddq_s16(v2411, v2434); + int16x8_t v2436 = vsubq_s16(v2010, v2012); + int16x8_t v2437 = vsubq_s16(v2014, v2016); + int16x8_t v2438_tmp = vqrdmulhq_n_s16(v2437, 1988); + int16x8_t v2438 = vaddq_s16(v2438_tmp, v2437); + int16x8_t v2439 = vaddq_s16(v2436, v2438); + int16x8_t v2440 = vsubq_s16(v2020, v2022); + int16x8_t v2441 = vsubq_s16(v2024, v2026); + int16x8_t v2442_tmp = vqrdmulhq_n_s16(v2441, 1988); + int16x8_t v2442 = vaddq_s16(v2442_tmp, v2441); + int16x8_t v2443 = vaddq_s16(v2440, v2442); + int16x8_t v2444 = vqrdmulhq_n_s16(v2443, 19102); + int16x8_t v2445 = vaddq_s16(v2439, v2444); + int16x8_t v2446 = vsubq_s16(v2032, v2034); + int16x8_t v2447 = vsubq_s16(v2036, v2038); + int16x8_t v2448_tmp = vqrdmulhq_n_s16(v2447, 1988); + int16x8_t v2448 = vaddq_s16(v2448_tmp, v2447); + int16x8_t v2449 = vaddq_s16(v2446, v2448); + int16x8_t v2450 = vsubq_s16(v2042, v2044); + int16x8_t v2451 = vsubq_s16(v2046, v2048); + int16x8_t v2452_tmp = vqrdmulhq_n_s16(v2451, 1988); + int16x8_t v2452 = vaddq_s16(v2452_tmp, v2451); + int16x8_t v2453 = vaddq_s16(v2450, v2452); + int16x8_t v2454 = vqrdmulhq_n_s16(v2453, 19102); + int16x8_t v2455 = vaddq_s16(v2449, v2454); + int16x8_t v2456 = vqrdmulhq_n_s16(v2455, 17000); + int16x8_t v2457 = vaddq_s16(v2445, v2456); + int16x8_t v2458 = vsubq_s16(v2056, v2058); + int16x8_t v2459 = vsubq_s16(v2060, v2062); + int16x8_t v2460_tmp = vqrdmulhq_n_s16(v2459, 1988); + int16x8_t v2460 = vaddq_s16(v2460_tmp, v2459); + int16x8_t v2461 = vaddq_s16(v2458, v2460); + int16x8_t v2462 = vsubq_s16(v2066, v2068); + int16x8_t v2463 = vqrdmulhq_n_s16(v2072, 29490); + int16x8_t v2464 = vsubq_s16(v2070, v2463); + int16x8_t v2465_tmp = vqrdmulhq_n_s16(v2464, 1988); + int16x8_t v2465 = vaddq_s16(v2465_tmp, v2464); + int16x8_t v2466 = vaddq_s16(v2462, v2465); + int16x8_t v2467 = vqrdmulhq_n_s16(v2466, 19102); + int16x8_t v2468 = vaddq_s16(v2461, v2467); + int16x8_t v2469 = vsubq_s16(v2078, v2080); + int16x8_t v2470 = vsubq_s16(v2082, v2084); + int16x8_t v2471_tmp = vqrdmulhq_n_s16(v2470, 1988); + int16x8_t v2471 = vaddq_s16(v2471_tmp, v2470); + int16x8_t v2472 = vaddq_s16(v2469, v2471); + int16x8_t v2473 = vsubq_s16(v2088, v2090); + int16x8_t v2474 = vsubq_s16(v2092, v2094); + int16x8_t v2475_tmp = vqrdmulhq_n_s16(v2474, 1988); + int16x8_t v2475 = vaddq_s16(v2475_tmp, v2474); + int16x8_t v2476 = vaddq_s16(v2473, v2475); + int16x8_t v2477 = vqrdmulhq_n_s16(v2476, 19102); + int16x8_t v2478 = vaddq_s16(v2472, v2477); + int16x8_t v2479 = vqrdmulhq_n_s16(v2478, 17000); + int16x8_t v2480 = vaddq_s16(v2468, v2479); + int16x8_t v2481 = vqrdmulhq_n_s16(v2480, 16534); + int16x8_t v2482 = vaddq_s16(v2457, v2481); + int16x8_t v2483 = vqrdmulhq_n_s16(v2482, 16421); + int16x8_t v2484 = vaddq_s16(v2435, v2483); + int16x8_t v2485 = vsubq_s16(v1537, v1542); + int16x8_t v2486 = vsubq_s16(v1547, v1552); + int16x8_t v2487_tmp = vqrdmulhq_n_s16(v2486, 23673); + int16x8_t v2487 = vaddq_s16(v2487_tmp, v2486); + int16x8_t v2488 = vaddq_s16(v2485, v2487); + int16x8_t v2489 = vsubq_s16(v1559, v1564); + int16x8_t v2490 = vsubq_s16(v1569, v1574); + int16x8_t v2491_tmp = vqrdmulhq_n_s16(v2490, 23673); + int16x8_t v2491 = vaddq_s16(v2491_tmp, v2490); + int16x8_t v2492 = vaddq_s16(v2489, v2491); + int16x8_t v2493 = vqrdmulhq_n_s16(v2492, 20398); + int16x8_t v2494 = vaddq_s16(v2488, v2493); + int16x8_t v2495 = vsubq_s16(v1583, v1588); + int16x8_t v2496 = vsubq_s16(v1593, v1598); + int16x8_t v2497_tmp = vqrdmulhq_n_s16(v2496, 23673); + int16x8_t v2497 = vaddq_s16(v2497_tmp, v2496); + int16x8_t v2498 = vaddq_s16(v2495, v2497); + int16x8_t v2499 = vsubq_s16(v1605, v1610); + int16x8_t v2500 = vsubq_s16(v1615, v1620); + int16x8_t v2501_tmp = vqrdmulhq_n_s16(v2500, 23673); + int16x8_t v2501 = vaddq_s16(v2501_tmp, v2500); + int16x8_t v2502 = vaddq_s16(v2499, v2501); + int16x8_t v2503 = vqrdmulhq_n_s16(v2502, 20398); + int16x8_t v2504 = vaddq_s16(v2498, v2503); + int16x8_t v2505 = vqrdmulhq_n_s16(v2504, 17255); + int16x8_t v2506 = vaddq_s16(v2494, v2505); + int16x8_t v2507 = vsubq_s16(v1631, v1636); + int16x8_t v2508 = vsubq_s16(v1641, v1646); + int16x8_t v2509_tmp = vqrdmulhq_n_s16(v2508, 23673); + int16x8_t v2509 = vaddq_s16(v2509_tmp, v2508); + int16x8_t v2510 = vaddq_s16(v2507, v2509); + int16x8_t v2511 = vsubq_s16(v1653, v1658); + int16x8_t v2512 = vsubq_s16(v1663, v1668); + int16x8_t v2513_tmp = vqrdmulhq_n_s16(v2512, 23673); + int16x8_t v2513 = vaddq_s16(v2513_tmp, v2512); + int16x8_t v2514 = vaddq_s16(v2511, v2513); + int16x8_t v2515 = vqrdmulhq_n_s16(v2514, 20398); + int16x8_t v2516 = vaddq_s16(v2510, v2515); + int16x8_t v2517 = vsubq_s16(v1677, v1682); + int16x8_t v2518 = vsubq_s16(v1687, v1692); + int16x8_t v2519_tmp = vqrdmulhq_n_s16(v2518, 23673); + int16x8_t v2519 = vaddq_s16(v2519_tmp, v2518); + int16x8_t v2520 = vaddq_s16(v2517, v2519); + int16x8_t v2521 = vsubq_s16(v1699, v1704); + int16x8_t v2522 = vsubq_s16(v1709, v1714); + int16x8_t v2523_tmp = vqrdmulhq_n_s16(v2522, 23673); + int16x8_t v2523 = vaddq_s16(v2523_tmp, v2522); + int16x8_t v2524 = vaddq_s16(v2521, v2523); + int16x8_t v2525 = vqrdmulhq_n_s16(v2524, 20398); + int16x8_t v2526 = vaddq_s16(v2520, v2525); + int16x8_t v2527 = vqrdmulhq_n_s16(v2526, 17255); + int16x8_t v2528 = vaddq_s16(v2516, v2527); + int16x8_t v2529 = vqrdmulhq_n_s16(v2528, 16595); + int16x8_t v2530 = vaddq_s16(v2506, v2529); + int16x8_t v2531 = vsubq_s16(v1727, v1732); + int16x8_t v2532 = vsubq_s16(v1737, v1742); + int16x8_t v2533_tmp = vqrdmulhq_n_s16(v2532, 23673); + int16x8_t v2533 = vaddq_s16(v2533_tmp, v2532); + int16x8_t v2534 = vaddq_s16(v2531, v2533); + int16x8_t v2535 = vsubq_s16(v1749, v1754); + int16x8_t v2536 = vsubq_s16(v1759, v1764); + int16x8_t v2537_tmp = vqrdmulhq_n_s16(v2536, 23673); + int16x8_t v2537 = vaddq_s16(v2537_tmp, v2536); + int16x8_t v2538 = vaddq_s16(v2535, v2537); + int16x8_t v2539 = vqrdmulhq_n_s16(v2538, 20398); + int16x8_t v2540 = vaddq_s16(v2534, v2539); + int16x8_t v2541 = vsubq_s16(v1773, v1778); + int16x8_t v2542 = vsubq_s16(v1783, v1788); + int16x8_t v2543_tmp = vqrdmulhq_n_s16(v2542, 23673); + int16x8_t v2543 = vaddq_s16(v2543_tmp, v2542); + int16x8_t v2544 = vaddq_s16(v2541, v2543); + int16x8_t v2545 = vsubq_s16(v1795, v1800); + int16x8_t v2546 = vsubq_s16(v1805, v1810); + int16x8_t v2547_tmp = vqrdmulhq_n_s16(v2546, 23673); + int16x8_t v2547 = vaddq_s16(v2547_tmp, v2546); + int16x8_t v2548 = vaddq_s16(v2545, v2547); + int16x8_t v2549 = vqrdmulhq_n_s16(v2548, 20398); + int16x8_t v2550 = vaddq_s16(v2544, v2549); + int16x8_t v2551 = vqrdmulhq_n_s16(v2550, 17255); + int16x8_t v2552 = vaddq_s16(v2540, v2551); + int16x8_t v2553 = vsubq_s16(v1821, v1826); + int16x8_t v2554 = vsubq_s16(v1831, v1836); + int16x8_t v2555_tmp = vqrdmulhq_n_s16(v2554, 23673); + int16x8_t v2555 = vaddq_s16(v2555_tmp, v2554); + int16x8_t v2556 = vaddq_s16(v2553, v2555); + int16x8_t v2557 = vsubq_s16(v1843, v1848); + int16x8_t v2558 = vsubq_s16(v1853, v1858); + int16x8_t v2559_tmp = vqrdmulhq_n_s16(v2558, 23673); + int16x8_t v2559 = vaddq_s16(v2559_tmp, v2558); + int16x8_t v2560 = vaddq_s16(v2557, v2559); + int16x8_t v2561 = vqrdmulhq_n_s16(v2560, 20398); + int16x8_t v2562 = vaddq_s16(v2556, v2561); + int16x8_t v2563 = vsubq_s16(v1867, v1872); + int16x8_t v2564 = vsubq_s16(v1877, v1882); + int16x8_t v2565_tmp = vqrdmulhq_n_s16(v2564, 23673); + int16x8_t v2565 = vaddq_s16(v2565_tmp, v2564); + int16x8_t v2566 = vaddq_s16(v2563, v2565); + int16x8_t v2567 = vsubq_s16(v1889, v1894); + int16x8_t v2568 = vsubq_s16(v1899, v1904); + int16x8_t v2569_tmp = vqrdmulhq_n_s16(v2568, 23673); + int16x8_t v2569 = vaddq_s16(v2569_tmp, v2568); + int16x8_t v2570 = vaddq_s16(v2567, v2569); + int16x8_t v2571 = vqrdmulhq_n_s16(v2570, 20398); + int16x8_t v2572 = vaddq_s16(v2566, v2571); + int16x8_t v2573 = vqrdmulhq_n_s16(v2572, 17255); + int16x8_t v2574 = vaddq_s16(v2562, v2573); + int16x8_t v2575 = vqrdmulhq_n_s16(v2574, 16595); + int16x8_t v2576 = vaddq_s16(v2552, v2575); + int16x8_t v2577 = vqrdmulhq_n_s16(v2576, 16436); + int16x8_t v2578 = vaddq_s16(v2530, v2577); + int16x8_t v2579 = vsubq_s16(v9, v24); + int16x8_t v2580 = vsubq_s16(v42, v58); + int16x8_t v2581_tmp = vqrdmulhq_n_s16(v2580, 3314); + int16x8_t v2581 = vmlaq_n_s16(v2581_tmp, v2580, 5); + int16x8_t v2582 = vaddq_s16(v2579, v2581); + int16x8_t v2583 = vsubq_s16(v78, v101); + int16x8_t v2584 = vsubq_s16(v119, v136); + int16x8_t v2585_tmp = vqrdmulhq_n_s16(v2584, 3314); + int16x8_t v2585 = vmlaq_n_s16(v2585_tmp, v2584, 5); + int16x8_t v2586 = vaddq_s16(v2583, v2585); + int16x8_t v2587 = vqrdmulhq_n_s16(v2586, 22112); + int16x8_t v2588 = vaddq_s16(v2582, v2587); + int16x8_t v2589 = vsubq_s16(v158, v181); + int16x8_t v2590 = vsubq_s16(v213, v231); + int16x8_t v2591_tmp = vqrdmulhq_n_s16(v2590, 3314); + int16x8_t v2591 = vmlaq_n_s16(v2591_tmp, v2590, 5); + int16x8_t v2592 = vaddq_s16(v2589, v2591); + int16x8_t v2593 = vsubq_s16(v251, v274); + int16x8_t v2594 = vsubq_s16(v292, v310); + int16x8_t v2595_tmp = vqrdmulhq_n_s16(v2594, 3314); + int16x8_t v2595 = vmlaq_n_s16(v2595_tmp, v2594, 5); + int16x8_t v2596 = vaddq_s16(v2593, v2595); + int16x8_t v2597 = vqrdmulhq_n_s16(v2596, 22112); + int16x8_t v2598 = vaddq_s16(v2592, v2597); + int16x8_t v2599 = vqrdmulhq_n_s16(v2598, 17561); + int16x8_t v2600 = vaddq_s16(v2588, v2599); + int16x8_t v2601 = vsubq_s16(v334, v357); + int16x8_t v2602 = vsubq_s16(v389, v407); + int16x8_t v2603_tmp = vqrdmulhq_n_s16(v2602, 3314); + int16x8_t v2603 = vmlaq_n_s16(v2603_tmp, v2602, 5); + int16x8_t v2604 = vaddq_s16(v2601, v2603); + int16x8_t v2605 = vsubq_s16(v441, v480); + int16x8_t v2606 = vsubq_s16(v498, v517); + int16x8_t v2607_tmp = vqrdmulhq_n_s16(v2606, 3314); + int16x8_t v2607 = vmlaq_n_s16(v2607_tmp, v2606, 5); + int16x8_t v2608 = vaddq_s16(v2605, v2607); + int16x8_t v2609 = vqrdmulhq_n_s16(v2608, 22112); + int16x8_t v2610 = vaddq_s16(v2604, v2609); + int16x8_t v2611 = vsubq_s16(v539, v562); + int16x8_t v2612 = vsubq_s16(v594, v612); + int16x8_t v2613_tmp = vqrdmulhq_n_s16(v2612, 3314); + int16x8_t v2613 = vmlaq_n_s16(v2613_tmp, v2612, 5); + int16x8_t v2614 = vaddq_s16(v2611, v2613); + int16x8_t v2615 = vsubq_s16(v632, v655); + int16x8_t v2616 = vsubq_s16(v673, v692); + int16x8_t v2617_tmp = vqrdmulhq_n_s16(v2616, 3314); + int16x8_t v2617 = vmlaq_n_s16(v2617_tmp, v2616, 5); + int16x8_t v2618 = vaddq_s16(v2615, v2617); + int16x8_t v2619 = vqrdmulhq_n_s16(v2618, 22112); + int16x8_t v2620 = vaddq_s16(v2614, v2619); + int16x8_t v2621 = vqrdmulhq_n_s16(v2620, 17561); + int16x8_t v2622 = vaddq_s16(v2610, v2621); + int16x8_t v2623 = vqrdmulhq_n_s16(v2622, 16666); + int16x8_t v2624 = vaddq_s16(v2600, v2623); + int16x8_t v2625 = vsubq_s16(v718, v741); + int16x8_t v2626 = vsubq_s16(v773, v791); + int16x8_t v2627_tmp = vqrdmulhq_n_s16(v2626, 3314); + int16x8_t v2627 = vmlaq_n_s16(v2627_tmp, v2626, 5); + int16x8_t v2628 = vaddq_s16(v2625, v2627); + int16x8_t v2629 = vsubq_s16(v825, v864); + int16x8_t v2630 = vsubq_s16(v882, v901); + int16x8_t v2631_tmp = vqrdmulhq_n_s16(v2630, 3314); + int16x8_t v2631 = vmlaq_n_s16(v2631_tmp, v2630, 5); + int16x8_t v2632 = vaddq_s16(v2629, v2631); + int16x8_t v2633 = vqrdmulhq_n_s16(v2632, 22112); + int16x8_t v2634 = vaddq_s16(v2628, v2633); + int16x8_t v2635 = vsubq_s16(v937, v976); + int16x8_t v2636 = vsubq_s16(v1036, v1058); + int16x8_t v2637_tmp = vqrdmulhq_n_s16(v2636, 3314); + int16x8_t v2637 = vmlaq_n_s16(v2637_tmp, v2636, 5); + int16x8_t v2638 = vaddq_s16(v2635, v2637); + int16x8_t v2639 = vsubq_s16(v1078, v1101); + int16x8_t v2640 = vsubq_s16(v1119, v1139); + int16x8_t v2641_tmp = vqrdmulhq_n_s16(v2640, 3314); + int16x8_t v2641 = vmlaq_n_s16(v2641_tmp, v2640, 5); + int16x8_t v2642 = vaddq_s16(v2639, v2641); + int16x8_t v2643 = vqrdmulhq_n_s16(v2642, 22112); + int16x8_t v2644 = vaddq_s16(v2638, v2643); + int16x8_t v2645 = vqrdmulhq_n_s16(v2644, 17561); + int16x8_t v2646 = vaddq_s16(v2634, v2645); + int16x8_t v2647 = vsubq_s16(v1163, v1186); + int16x8_t v2648 = vsubq_s16(v1218, v1236); + int16x8_t v2649_tmp = vqrdmulhq_n_s16(v2648, 3314); + int16x8_t v2649 = vmlaq_n_s16(v2649_tmp, v2648, 5); + int16x8_t v2650 = vaddq_s16(v2647, v2649); + int16x8_t v2651 = vsubq_s16(v1270, v1309); + int16x8_t v2652 = vsubq_s16(v1327, v1346); + int16x8_t v2653_tmp = vqrdmulhq_n_s16(v2652, 3314); + int16x8_t v2653 = vmlaq_n_s16(v2653_tmp, v2652, 5); + int16x8_t v2654 = vaddq_s16(v2651, v2653); + int16x8_t v2655 = vqrdmulhq_n_s16(v2654, 22112); + int16x8_t v2656 = vaddq_s16(v2650, v2655); + int16x8_t v2657 = vsubq_s16(v1368, v1391); + int16x8_t v2658 = vsubq_s16(v1423, v1441); + int16x8_t v2659_tmp = vqrdmulhq_n_s16(v2658, 3314); + int16x8_t v2659 = vmlaq_n_s16(v2659_tmp, v2658, 5); + int16x8_t v2660 = vaddq_s16(v2657, v2659); + int16x8_t v2661 = vsubq_s16(v1461, v1484); + int16x8_t v2662 = vsubq_s16(v1502, v1522); + int16x8_t v2663_tmp = vqrdmulhq_n_s16(v2662, 3314); + int16x8_t v2663 = vmlaq_n_s16(v2663_tmp, v2662, 5); + int16x8_t v2664 = vaddq_s16(v2661, v2663); + int16x8_t v2665 = vqrdmulhq_n_s16(v2664, 22112); + int16x8_t v2666 = vaddq_s16(v2660, v2665); + int16x8_t v2667 = vqrdmulhq_n_s16(v2666, 17561); + int16x8_t v2668 = vaddq_s16(v2656, v2667); + int16x8_t v2669 = vqrdmulhq_n_s16(v2668, 16666); + int16x8_t v2670 = vaddq_s16(v2646, v2669); + int16x8_t v2671 = vqrdmulhq_n_s16(v2670, 16454); + int16x8_t v2672 = vaddq_s16(v2624, v2671); + int16x8_t v2673 = vsubq_s16(v2579, v2581); + int16x8_t v2674 = vsubq_s16(v2583, v2585); + int16x8_t v2675 = vqrdmulhq_n_s16(v2674, 24397); + int16x8_t v2676 = vaddq_s16(v2673, v2675); + int16x8_t v2677 = vsubq_s16(v2589, v2591); + int16x8_t v2678 = vsubq_s16(v2593, v2595); + int16x8_t v2679 = vqrdmulhq_n_s16(v2678, 24397); + int16x8_t v2680 = vaddq_s16(v2677, v2679); + int16x8_t v2681 = vqrdmulhq_n_s16(v2680, 17921); + int16x8_t v2682 = vaddq_s16(v2676, v2681); + int16x8_t v2683 = vsubq_s16(v2601, v2603); + int16x8_t v2684 = vsubq_s16(v2605, v2607); + int16x8_t v2685 = vqrdmulhq_n_s16(v2684, 24397); + int16x8_t v2686 = vaddq_s16(v2683, v2685); + int16x8_t v2687 = vsubq_s16(v2611, v2613); + int16x8_t v2688 = vsubq_s16(v2615, v2617); + int16x8_t v2689 = vqrdmulhq_n_s16(v2688, 24397); + int16x8_t v2690 = vaddq_s16(v2687, v2689); + int16x8_t v2691 = vqrdmulhq_n_s16(v2690, 17921); + int16x8_t v2692 = vaddq_s16(v2686, v2691); + int16x8_t v2693 = vqrdmulhq_n_s16(v2692, 16747); + int16x8_t v2694 = vaddq_s16(v2682, v2693); + int16x8_t v2695 = vsubq_s16(v2625, v2627); + int16x8_t v2696 = vsubq_s16(v2629, v2631); + int16x8_t v2697 = vqrdmulhq_n_s16(v2696, 24397); + int16x8_t v2698 = vaddq_s16(v2695, v2697); + int16x8_t v2699 = vsubq_s16(v2635, v2637); + int16x8_t v2700 = vsubq_s16(v2639, v2641); + int16x8_t v2701 = vqrdmulhq_n_s16(v2700, 24397); + int16x8_t v2702 = vaddq_s16(v2699, v2701); + int16x8_t v2703 = vqrdmulhq_n_s16(v2702, 17921); + int16x8_t v2704 = vaddq_s16(v2698, v2703); + int16x8_t v2705 = vsubq_s16(v2647, v2649); + int16x8_t v2706 = vsubq_s16(v2651, v2653); + int16x8_t v2707 = vqrdmulhq_n_s16(v2706, 24397); + int16x8_t v2708 = vaddq_s16(v2705, v2707); + int16x8_t v2709 = vsubq_s16(v2657, v2659); + int16x8_t v2710 = vsubq_s16(v2661, v2663); + int16x8_t v2711 = vqrdmulhq_n_s16(v2710, 24397); + int16x8_t v2712 = vaddq_s16(v2709, v2711); + int16x8_t v2713 = vqrdmulhq_n_s16(v2712, 17921); + int16x8_t v2714 = vaddq_s16(v2708, v2713); + int16x8_t v2715 = vqrdmulhq_n_s16(v2714, 16747); + int16x8_t v2716 = vaddq_s16(v2704, v2715); + int16x8_t v2717 = vqrdmulhq_n_s16(v2716, 16474); + int16x8_t v2718 = vaddq_s16(v2694, v2717); + int16x8_t v2719 = vsubq_s16(v2485, v2487); + int16x8_t v2720 = vsubq_s16(v2489, v2491); + int16x8_t v2721 = vqrdmulhq_n_s16(v2720, 27504); + int16x8_t v2722 = vaddq_s16(v2719, v2721); + int16x8_t v2723 = vsubq_s16(v2495, v2497); + int16x8_t v2724 = vsubq_s16(v2499, v2501); + int16x8_t v2725 = vqrdmulhq_n_s16(v2724, 27504); + int16x8_t v2726 = vaddq_s16(v2723, v2725); + int16x8_t v2727 = vqrdmulhq_n_s16(v2726, 18343); + int16x8_t v2728 = vaddq_s16(v2722, v2727); + int16x8_t v2729 = vsubq_s16(v2507, v2509); + int16x8_t v2730 = vsubq_s16(v2511, v2513); + int16x8_t v2731 = vqrdmulhq_n_s16(v2730, 27504); + int16x8_t v2732 = vaddq_s16(v2729, v2731); + int16x8_t v2733 = vsubq_s16(v2517, v2519); + int16x8_t v2734 = vsubq_s16(v2521, v2523); + int16x8_t v2735 = vqrdmulhq_n_s16(v2734, 27504); + int16x8_t v2736 = vaddq_s16(v2733, v2735); + int16x8_t v2737 = vqrdmulhq_n_s16(v2736, 18343); + int16x8_t v2738 = vaddq_s16(v2732, v2737); + int16x8_t v2739 = vqrdmulhq_n_s16(v2738, 16840); + int16x8_t v2740 = vaddq_s16(v2728, v2739); + int16x8_t v2741 = vsubq_s16(v2531, v2533); + int16x8_t v2742 = vsubq_s16(v2535, v2537); + int16x8_t v2743 = vqrdmulhq_n_s16(v2742, 27504); + int16x8_t v2744 = vaddq_s16(v2741, v2743); + int16x8_t v2745 = vsubq_s16(v2541, v2543); + int16x8_t v2746 = vsubq_s16(v2545, v2547); + int16x8_t v2747 = vqrdmulhq_n_s16(v2746, 27504); + int16x8_t v2748 = vaddq_s16(v2745, v2747); + int16x8_t v2749 = vqrdmulhq_n_s16(v2748, 18343); + int16x8_t v2750 = vaddq_s16(v2744, v2749); + int16x8_t v2751 = vsubq_s16(v2553, v2555); + int16x8_t v2752 = vsubq_s16(v2557, v2559); + int16x8_t v2753 = vqrdmulhq_n_s16(v2752, 27504); + int16x8_t v2754 = vaddq_s16(v2751, v2753); + int16x8_t v2755 = vsubq_s16(v2563, v2565); + int16x8_t v2756 = vsubq_s16(v2567, v2569); + int16x8_t v2757 = vqrdmulhq_n_s16(v2756, 27504); + int16x8_t v2758 = vaddq_s16(v2755, v2757); + int16x8_t v2759 = vqrdmulhq_n_s16(v2758, 18343); + int16x8_t v2760 = vaddq_s16(v2754, v2759); + int16x8_t v2761 = vqrdmulhq_n_s16(v2760, 16840); + int16x8_t v2762 = vaddq_s16(v2750, v2761); + int16x8_t v2763 = vqrdmulhq_n_s16(v2762, 16496); + int16x8_t v2764 = vaddq_s16(v2740, v2763); + int16x8_t v2765 = vsubq_s16(v2390, v2392); + int16x8_t v2766 = vsubq_s16(v2394, v2396); + int16x8_t v2767 = vqrdmulhq_n_s16(v2766, 31869); + int16x8_t v2768 = vaddq_s16(v2765, v2767); + int16x8_t v2769 = vsubq_s16(v2400, v2402); + int16x8_t v2770 = vsubq_s16(v2404, v2406); + int16x8_t v2771 = vqrdmulhq_n_s16(v2770, 31869); + int16x8_t v2772 = vaddq_s16(v2769, v2771); + int16x8_t v2773 = vqrdmulhq_n_s16(v2772, 18830); + int16x8_t v2774 = vaddq_s16(v2768, v2773); + int16x8_t v2775 = vsubq_s16(v2412, v2414); + int16x8_t v2776 = vsubq_s16(v2416, v2418); + int16x8_t v2777 = vqrdmulhq_n_s16(v2776, 31869); + int16x8_t v2778 = vaddq_s16(v2775, v2777); + int16x8_t v2779 = vsubq_s16(v2422, v2424); + int16x8_t v2780 = vsubq_s16(v2426, v2428); + int16x8_t v2781 = vqrdmulhq_n_s16(v2780, 31869); + int16x8_t v2782 = vaddq_s16(v2779, v2781); + int16x8_t v2783 = vqrdmulhq_n_s16(v2782, 18830); + int16x8_t v2784 = vaddq_s16(v2778, v2783); + int16x8_t v2785 = vqrdmulhq_n_s16(v2784, 16944); + int16x8_t v2786 = vaddq_s16(v2774, v2785); + int16x8_t v2787 = vsubq_s16(v2436, v2438); + int16x8_t v2788 = vsubq_s16(v2440, v2442); + int16x8_t v2789 = vqrdmulhq_n_s16(v2788, 31869); + int16x8_t v2790 = vaddq_s16(v2787, v2789); + int16x8_t v2791 = vsubq_s16(v2446, v2448); + int16x8_t v2792 = vsubq_s16(v2450, v2452); + int16x8_t v2793 = vqrdmulhq_n_s16(v2792, 31869); + int16x8_t v2794 = vaddq_s16(v2791, v2793); + int16x8_t v2795 = vqrdmulhq_n_s16(v2794, 18830); + int16x8_t v2796 = vaddq_s16(v2790, v2795); + int16x8_t v2797 = vsubq_s16(v2458, v2460); + int16x8_t v2798 = vsubq_s16(v2462, v2465); + int16x8_t v2799 = vqrdmulhq_n_s16(v2798, 31869); + int16x8_t v2800 = vaddq_s16(v2797, v2799); + int16x8_t v2801 = vsubq_s16(v2469, v2471); + int16x8_t v2802 = vsubq_s16(v2473, v2475); + int16x8_t v2803 = vqrdmulhq_n_s16(v2802, 31869); + int16x8_t v2804 = vaddq_s16(v2801, v2803); + int16x8_t v2805 = vqrdmulhq_n_s16(v2804, 18830); + int16x8_t v2806 = vaddq_s16(v2800, v2805); + int16x8_t v2807 = vqrdmulhq_n_s16(v2806, 16944); + int16x8_t v2808 = vaddq_s16(v2796, v2807); + int16x8_t v2809 = vqrdmulhq_n_s16(v2808, 16521); + int16x8_t v2810 = vaddq_s16(v2786, v2809); + int16x8_t v2811 = vsubq_s16(v2296, v2298); + int16x8_t v2812 = vsubq_s16(v2300, v2302); + int16x8_t v2813_tmp = vqrdmulhq_n_s16(v2812, 5552); + int16x8_t v2813 = vaddq_s16(v2813_tmp, v2812); + int16x8_t v2814 = vaddq_s16(v2811, v2813); + int16x8_t v2815 = vsubq_s16(v2306, v2308); + int16x8_t v2816 = vsubq_s16(v2310, v2312); + int16x8_t v2817_tmp = vqrdmulhq_n_s16(v2816, 5552); + int16x8_t v2817 = vaddq_s16(v2817_tmp, v2816); + int16x8_t v2818 = vaddq_s16(v2815, v2817); + int16x8_t v2819 = vqrdmulhq_n_s16(v2818, 19393); + int16x8_t v2820 = vaddq_s16(v2814, v2819); + int16x8_t v2821 = vsubq_s16(v2318, v2320); + int16x8_t v2822 = vsubq_s16(v2322, v2324); + int16x8_t v2823_tmp = vqrdmulhq_n_s16(v2822, 5552); + int16x8_t v2823 = vaddq_s16(v2823_tmp, v2822); + int16x8_t v2824 = vaddq_s16(v2821, v2823); + int16x8_t v2825 = vsubq_s16(v2328, v2330); + int16x8_t v2826 = vsubq_s16(v2332, v2334); + int16x8_t v2827_tmp = vqrdmulhq_n_s16(v2826, 5552); + int16x8_t v2827 = vaddq_s16(v2827_tmp, v2826); + int16x8_t v2828 = vaddq_s16(v2825, v2827); + int16x8_t v2829 = vqrdmulhq_n_s16(v2828, 19393); + int16x8_t v2830 = vaddq_s16(v2824, v2829); + int16x8_t v2831 = vqrdmulhq_n_s16(v2830, 17059); + int16x8_t v2832 = vaddq_s16(v2820, v2831); + int16x8_t v2833 = vsubq_s16(v2342, v2344); + int16x8_t v2834 = vsubq_s16(v2346, v2348); + int16x8_t v2835_tmp = vqrdmulhq_n_s16(v2834, 5552); + int16x8_t v2835 = vaddq_s16(v2835_tmp, v2834); + int16x8_t v2836 = vaddq_s16(v2833, v2835); + int16x8_t v2837 = vsubq_s16(v2352, v2354); + int16x8_t v2838 = vsubq_s16(v2356, v2358); + int16x8_t v2839_tmp = vqrdmulhq_n_s16(v2838, 5552); + int16x8_t v2839 = vaddq_s16(v2839_tmp, v2838); + int16x8_t v2840 = vaddq_s16(v2837, v2839); + int16x8_t v2841 = vqrdmulhq_n_s16(v2840, 19393); + int16x8_t v2842 = vaddq_s16(v2836, v2841); + int16x8_t v2843 = vsubq_s16(v2364, v2366); + int16x8_t v2844 = vsubq_s16(v2368, v2370); + int16x8_t v2845_tmp = vqrdmulhq_n_s16(v2844, 5552); + int16x8_t v2845 = vaddq_s16(v2845_tmp, v2844); + int16x8_t v2846 = vaddq_s16(v2843, v2845); + int16x8_t v2847 = vsubq_s16(v2374, v2376); + int16x8_t v2848 = vsubq_s16(v2378, v2380); + int16x8_t v2849_tmp = vqrdmulhq_n_s16(v2848, 5552); + int16x8_t v2849 = vaddq_s16(v2849_tmp, v2848); + int16x8_t v2850 = vaddq_s16(v2847, v2849); + int16x8_t v2851 = vqrdmulhq_n_s16(v2850, 19393); + int16x8_t v2852 = vaddq_s16(v2846, v2851); + int16x8_t v2853 = vqrdmulhq_n_s16(v2852, 17059); + int16x8_t v2854 = vaddq_s16(v2842, v2853); + int16x8_t v2855 = vqrdmulhq_n_s16(v2854, 16549); + int16x8_t v2856 = vaddq_s16(v2832, v2855); + int16x8_t v2857 = vsubq_s16(v2109, v2114); + int16x8_t v2858 = vsubq_s16(v2119, v2124); + int16x8_t v2859_tmp = vqrdmulhq_n_s16(v2858, 15865); + int16x8_t v2859 = vaddq_s16(v2859_tmp, v2858); + int16x8_t v2860 = vaddq_s16(v2857, v2859); + int16x8_t v2861 = vsubq_s16(v2131, v2136); + int16x8_t v2862 = vsubq_s16(v2141, v2146); + int16x8_t v2863_tmp = vqrdmulhq_n_s16(v2862, 15865); + int16x8_t v2863 = vaddq_s16(v2863_tmp, v2862); + int16x8_t v2864 = vaddq_s16(v2861, v2863); + int16x8_t v2865 = vqrdmulhq_n_s16(v2864, 20040); + int16x8_t v2866 = vaddq_s16(v2860, v2865); + int16x8_t v2867 = vsubq_s16(v2155, v2160); + int16x8_t v2868 = vsubq_s16(v2165, v2170); + int16x8_t v2869_tmp = vqrdmulhq_n_s16(v2868, 15865); + int16x8_t v2869 = vaddq_s16(v2869_tmp, v2868); + int16x8_t v2870 = vaddq_s16(v2867, v2869); + int16x8_t v2871 = vsubq_s16(v2177, v2182); + int16x8_t v2872 = vsubq_s16(v2187, v2192); + int16x8_t v2873_tmp = vqrdmulhq_n_s16(v2872, 15865); + int16x8_t v2873 = vaddq_s16(v2873_tmp, v2872); + int16x8_t v2874 = vaddq_s16(v2871, v2873); + int16x8_t v2875 = vqrdmulhq_n_s16(v2874, 20040); + int16x8_t v2876 = vaddq_s16(v2870, v2875); + int16x8_t v2877 = vqrdmulhq_n_s16(v2876, 17187); + int16x8_t v2878 = vaddq_s16(v2866, v2877); + int16x8_t v2879 = vsubq_s16(v2203, v2208); + int16x8_t v2880 = vsubq_s16(v2213, v2218); + int16x8_t v2881_tmp = vqrdmulhq_n_s16(v2880, 15865); + int16x8_t v2881 = vaddq_s16(v2881_tmp, v2880); + int16x8_t v2882 = vaddq_s16(v2879, v2881); + int16x8_t v2883 = vsubq_s16(v2225, v2230); + int16x8_t v2884 = vsubq_s16(v2235, v2240); + int16x8_t v2885_tmp = vqrdmulhq_n_s16(v2884, 15865); + int16x8_t v2885 = vaddq_s16(v2885_tmp, v2884); + int16x8_t v2886 = vaddq_s16(v2883, v2885); + int16x8_t v2887 = vqrdmulhq_n_s16(v2886, 20040); + int16x8_t v2888 = vaddq_s16(v2882, v2887); + int16x8_t v2889 = vsubq_s16(v2249, v2254); + int16x8_t v2890 = vsubq_s16(v2259, v2264); + int16x8_t v2891_tmp = vqrdmulhq_n_s16(v2890, 15865); + int16x8_t v2891 = vaddq_s16(v2891_tmp, v2890); + int16x8_t v2892 = vaddq_s16(v2889, v2891); + int16x8_t v2893 = vsubq_s16(v2271, v2276); + int16x8_t v2894 = vsubq_s16(v2281, v2286); + int16x8_t v2895_tmp = vqrdmulhq_n_s16(v2894, 15865); + int16x8_t v2895 = vaddq_s16(v2895_tmp, v2894); + int16x8_t v2896 = vaddq_s16(v2893, v2895); + int16x8_t v2897 = vqrdmulhq_n_s16(v2896, 20040); + int16x8_t v2898 = vaddq_s16(v2892, v2897); + int16x8_t v2899 = vqrdmulhq_n_s16(v2898, 17187); + int16x8_t v2900 = vaddq_s16(v2888, v2899); + int16x8_t v2901 = vqrdmulhq_n_s16(v2900, 16579); + int16x8_t v2902 = vaddq_s16(v2878, v2901); + int16x8_t v2903 = vsubq_s16(v1919, v1924); + int16x8_t v2904 = vsubq_s16(v1929, v1934); + int16x8_t v2905_tmp = vqrdmulhq_n_s16(v2904, 1893); + int16x8_t v2905 = vmlaq_n_s16(v2905_tmp, v2904, 2); + int16x8_t v2906 = vaddq_s16(v2903, v2905); + int16x8_t v2907 = vsubq_s16(v1941, v1946); + int16x8_t v2908 = vsubq_s16(v1951, v1956); + int16x8_t v2909_tmp = vqrdmulhq_n_s16(v2908, 1893); + int16x8_t v2909 = vmlaq_n_s16(v2909_tmp, v2908, 2); + int16x8_t v2910 = vaddq_s16(v2907, v2909); + int16x8_t v2911 = vqrdmulhq_n_s16(v2910, 20783); + int16x8_t v2912 = vaddq_s16(v2906, v2911); + int16x8_t v2913 = vsubq_s16(v1965, v1970); + int16x8_t v2914 = vsubq_s16(v1975, v1980); + int16x8_t v2915_tmp = vqrdmulhq_n_s16(v2914, 1893); + int16x8_t v2915 = vmlaq_n_s16(v2915_tmp, v2914, 2); + int16x8_t v2916 = vaddq_s16(v2913, v2915); + int16x8_t v2917 = vsubq_s16(v1987, v1992); + int16x8_t v2918 = vsubq_s16(v1997, v2002); + int16x8_t v2919_tmp = vqrdmulhq_n_s16(v2918, 1893); + int16x8_t v2919 = vmlaq_n_s16(v2919_tmp, v2918, 2); + int16x8_t v2920 = vaddq_s16(v2917, v2919); + int16x8_t v2921 = vqrdmulhq_n_s16(v2920, 20783); + int16x8_t v2922 = vaddq_s16(v2916, v2921); + int16x8_t v2923 = vqrdmulhq_n_s16(v2922, 17326); + int16x8_t v2924 = vaddq_s16(v2912, v2923); + int16x8_t v2925 = vsubq_s16(v2013, v2018); + int16x8_t v2926 = vsubq_s16(v2023, v2028); + int16x8_t v2927_tmp = vqrdmulhq_n_s16(v2926, 1893); + int16x8_t v2927 = vmlaq_n_s16(v2927_tmp, v2926, 2); + int16x8_t v2928 = vaddq_s16(v2925, v2927); + int16x8_t v2929 = vsubq_s16(v2035, v2040); + int16x8_t v2930 = vsubq_s16(v2045, v2050); + int16x8_t v2931_tmp = vqrdmulhq_n_s16(v2930, 1893); + int16x8_t v2931 = vmlaq_n_s16(v2931_tmp, v2930, 2); + int16x8_t v2932 = vaddq_s16(v2929, v2931); + int16x8_t v2933 = vqrdmulhq_n_s16(v2932, 20783); + int16x8_t v2934 = vaddq_s16(v2928, v2933); + int16x8_t v2935 = vsubq_s16(v2059, v2064); + int16x8_t v2936 = vsubq_s16(v2069, v2074); + int16x8_t v2937_tmp = vqrdmulhq_n_s16(v2936, 1893); + int16x8_t v2937 = vmlaq_n_s16(v2937_tmp, v2936, 2); + int16x8_t v2938 = vaddq_s16(v2935, v2937); + int16x8_t v2939 = vsubq_s16(v2081, v2086); + int16x8_t v2940 = vsubq_s16(v2091, v2096); + int16x8_t v2941_tmp = vqrdmulhq_n_s16(v2940, 1893); + int16x8_t v2941 = vmlaq_n_s16(v2941_tmp, v2940, 2); + int16x8_t v2942 = vaddq_s16(v2939, v2941); + int16x8_t v2943 = vqrdmulhq_n_s16(v2942, 20783); + int16x8_t v2944 = vaddq_s16(v2938, v2943); + int16x8_t v2945 = vqrdmulhq_n_s16(v2944, 17326); + int16x8_t v2946 = vaddq_s16(v2934, v2945); + int16x8_t v2947 = vqrdmulhq_n_s16(v2946, 16611); + int16x8_t v2948 = vaddq_s16(v2924, v2947); + int16x8_t v2949 = vsubq_s16(v1543, v1554); + int16x8_t v2950 = vsubq_s16(v1565, v1576); + int16x8_t v2951_tmp = vqrdmulhq_n_s16(v2950, 13357); + int16x8_t v2951 = vmlaq_n_s16(v2951_tmp, v2950, 3); + int16x8_t v2952 = vaddq_s16(v2949, v2951); + int16x8_t v2953 = vsubq_s16(v1589, v1600); + int16x8_t v2954 = vsubq_s16(v1611, v1622); + int16x8_t v2955_tmp = vqrdmulhq_n_s16(v2954, 13357); + int16x8_t v2955 = vmlaq_n_s16(v2955_tmp, v2954, 3); + int16x8_t v2956 = vaddq_s16(v2953, v2955); + int16x8_t v2957 = vqrdmulhq_n_s16(v2956, 21637); + int16x8_t v2958 = vaddq_s16(v2952, v2957); + int16x8_t v2959 = vsubq_s16(v1637, v1648); + int16x8_t v2960 = vsubq_s16(v1659, v1670); + int16x8_t v2961_tmp = vqrdmulhq_n_s16(v2960, 13357); + int16x8_t v2961 = vmlaq_n_s16(v2961_tmp, v2960, 3); + int16x8_t v2962 = vaddq_s16(v2959, v2961); + int16x8_t v2963 = vsubq_s16(v1683, v1694); + int16x8_t v2964 = vsubq_s16(v1705, v1716); + int16x8_t v2965_tmp = vqrdmulhq_n_s16(v2964, 13357); + int16x8_t v2965 = vmlaq_n_s16(v2965_tmp, v2964, 3); + int16x8_t v2966 = vaddq_s16(v2963, v2965); + int16x8_t v2967 = vqrdmulhq_n_s16(v2966, 21637); + int16x8_t v2968 = vaddq_s16(v2962, v2967); + int16x8_t v2969 = vqrdmulhq_n_s16(v2968, 17479); + int16x8_t v2970 = vaddq_s16(v2958, v2969); + int16x8_t v2971 = vsubq_s16(v1733, v1744); + int16x8_t v2972 = vsubq_s16(v1755, v1766); + int16x8_t v2973_tmp = vqrdmulhq_n_s16(v2972, 13357); + int16x8_t v2973 = vmlaq_n_s16(v2973_tmp, v2972, 3); + int16x8_t v2974 = vaddq_s16(v2971, v2973); + int16x8_t v2975 = vsubq_s16(v1779, v1790); + int16x8_t v2976 = vsubq_s16(v1801, v1812); + int16x8_t v2977_tmp = vqrdmulhq_n_s16(v2976, 13357); + int16x8_t v2977 = vmlaq_n_s16(v2977_tmp, v2976, 3); + int16x8_t v2978 = vaddq_s16(v2975, v2977); + int16x8_t v2979 = vqrdmulhq_n_s16(v2978, 21637); + int16x8_t v2980 = vaddq_s16(v2974, v2979); + int16x8_t v2981 = vsubq_s16(v1827, v1838); + int16x8_t v2982 = vsubq_s16(v1849, v1860); + int16x8_t v2983_tmp = vqrdmulhq_n_s16(v2982, 13357); + int16x8_t v2983 = vmlaq_n_s16(v2983_tmp, v2982, 3); + int16x8_t v2984 = vaddq_s16(v2981, v2983); + int16x8_t v2985 = vsubq_s16(v1873, v1884); + int16x8_t v2986 = vsubq_s16(v1895, v1906); + int16x8_t v2987_tmp = vqrdmulhq_n_s16(v2986, 13357); + int16x8_t v2987 = vmlaq_n_s16(v2987_tmp, v2986, 3); + int16x8_t v2988 = vaddq_s16(v2985, v2987); + int16x8_t v2989 = vqrdmulhq_n_s16(v2988, 21637); + int16x8_t v2990 = vaddq_s16(v2984, v2989); + int16x8_t v2991 = vqrdmulhq_n_s16(v2990, 17479); + int16x8_t v2992 = vaddq_s16(v2980, v2991); + int16x8_t v2993 = vqrdmulhq_n_s16(v2992, 16647); + int16x8_t v2994 = vaddq_s16(v2970, v2993); + int16x8_t v2995 = vsubq_s16(v25, v60); + int16x8_t v2996 = vsubq_s16(v102, v138); + int16x8_t v2997_tmp = vqrdmulhq_n_s16(v2996, 6226); + int16x8_t v2997 = vmlaq_n_s16(v2997_tmp, v2996, 10); + int16x8_t v2998 = vaddq_s16(v2995, v2997); + int16x8_t v2999 = vsubq_s16(v182, v233); + int16x8_t v3000 = vsubq_s16(v275, v312); + int16x8_t v3001_tmp = vqrdmulhq_n_s16(v3000, 6226); + int16x8_t v3001 = vmlaq_n_s16(v3001_tmp, v3000, 10); + int16x8_t v3002 = vaddq_s16(v2999, v3001); + int16x8_t v3003 = vqrdmulhq_n_s16(v3002, 22622); + int16x8_t v3004 = vaddq_s16(v2998, v3003); + int16x8_t v3005 = vsubq_s16(v358, v409); + int16x8_t v3006 = vsubq_s16(v481, v519); + int16x8_t v3007_tmp = vqrdmulhq_n_s16(v3006, 6226); + int16x8_t v3007 = vmlaq_n_s16(v3007_tmp, v3006, 10); + int16x8_t v3008 = vaddq_s16(v3005, v3007); + int16x8_t v3009 = vsubq_s16(v563, v614); + int16x8_t v3010 = vsubq_s16(v656, v694); + int16x8_t v3011_tmp = vqrdmulhq_n_s16(v3010, 6226); + int16x8_t v3011 = vmlaq_n_s16(v3011_tmp, v3010, 10); + int16x8_t v3012 = vaddq_s16(v3009, v3011); + int16x8_t v3013 = vqrdmulhq_n_s16(v3012, 22622); + int16x8_t v3014 = vaddq_s16(v3008, v3013); + int16x8_t v3015 = vqrdmulhq_n_s16(v3014, 17646); + int16x8_t v3016 = vaddq_s16(v3004, v3015); + int16x8_t v3017 = vsubq_s16(v742, v793); + int16x8_t v3018 = vsubq_s16(v865, v903); + int16x8_t v3019_tmp = vqrdmulhq_n_s16(v3018, 6226); + int16x8_t v3019 = vmlaq_n_s16(v3019_tmp, v3018, 10); + int16x8_t v3020 = vaddq_s16(v3017, v3019); + int16x8_t v3021 = vsubq_s16(v977, v1060); + int16x8_t v3022 = vsubq_s16(v1102, v1141); + int16x8_t v3023_tmp = vqrdmulhq_n_s16(v3022, 6226); + int16x8_t v3023 = vmlaq_n_s16(v3023_tmp, v3022, 10); + int16x8_t v3024 = vaddq_s16(v3021, v3023); + int16x8_t v3025 = vqrdmulhq_n_s16(v3024, 22622); + int16x8_t v3026 = vaddq_s16(v3020, v3025); + int16x8_t v3027 = vsubq_s16(v1187, v1238); + int16x8_t v3028 = vsubq_s16(v1310, v1348); + int16x8_t v3029_tmp = vqrdmulhq_n_s16(v3028, 6226); + int16x8_t v3029 = vmlaq_n_s16(v3029_tmp, v3028, 10); + int16x8_t v3030 = vaddq_s16(v3027, v3029); + int16x8_t v3031 = vsubq_s16(v1392, v1443); + int16x8_t v3032 = vsubq_s16(v1485, v1524); + int16x8_t v3033_tmp = vqrdmulhq_n_s16(v3032, 6226); + int16x8_t v3033 = vmlaq_n_s16(v3033_tmp, v3032, 10); + int16x8_t v3034 = vaddq_s16(v3031, v3033); + int16x8_t v3035 = vqrdmulhq_n_s16(v3034, 22622); + int16x8_t v3036 = vaddq_s16(v3030, v3035); + int16x8_t v3037 = vqrdmulhq_n_s16(v3036, 17646); + int16x8_t v3038 = vaddq_s16(v3026, v3037); + int16x8_t v3039 = vqrdmulhq_n_s16(v3038, 16685); + int16x8_t v3040 = vaddq_s16(v3016, v3039); + int16x8_t v3041 = vsubq_s16(v2995, v2997); + int16x8_t v3042 = vsubq_s16(v2999, v3001); + int16x8_t v3043 = vqrdmulhq_n_s16(v3042, 23761); + int16x8_t v3044 = vaddq_s16(v3041, v3043); + int16x8_t v3045 = vsubq_s16(v3005, v3007); + int16x8_t v3046 = vsubq_s16(v3009, v3011); + int16x8_t v3047 = vqrdmulhq_n_s16(v3046, 23761); + int16x8_t v3048 = vaddq_s16(v3045, v3047); + int16x8_t v3049 = vqrdmulhq_n_s16(v3048, 17826); + int16x8_t v3050 = vaddq_s16(v3044, v3049); + int16x8_t v3051 = vsubq_s16(v3017, v3019); + int16x8_t v3052 = vsubq_s16(v3021, v3023); + int16x8_t v3053 = vqrdmulhq_n_s16(v3052, 23761); + int16x8_t v3054 = vaddq_s16(v3051, v3053); + int16x8_t v3055 = vsubq_s16(v3027, v3029); + int16x8_t v3056 = vsubq_s16(v3031, v3033); + int16x8_t v3057 = vqrdmulhq_n_s16(v3056, 23761); + int16x8_t v3058 = vaddq_s16(v3055, v3057); + int16x8_t v3059 = vqrdmulhq_n_s16(v3058, 17826); + int16x8_t v3060 = vaddq_s16(v3054, v3059); + int16x8_t v3061 = vqrdmulhq_n_s16(v3060, 16726); + int16x8_t v3062 = vaddq_s16(v3050, v3061); + int16x8_t v3063 = vsubq_s16(v2949, v2951); + int16x8_t v3064 = vsubq_s16(v2953, v2955); + int16x8_t v3065 = vqrdmulhq_n_s16(v3064, 25084); + int16x8_t v3066 = vaddq_s16(v3063, v3065); + int16x8_t v3067 = vsubq_s16(v2959, v2961); + int16x8_t v3068 = vsubq_s16(v2963, v2965); + int16x8_t v3069 = vqrdmulhq_n_s16(v3068, 25084); + int16x8_t v3070 = vaddq_s16(v3067, v3069); + int16x8_t v3071 = vqrdmulhq_n_s16(v3070, 18021); + int16x8_t v3072 = vaddq_s16(v3066, v3071); + int16x8_t v3073 = vsubq_s16(v2971, v2973); + int16x8_t v3074 = vsubq_s16(v2975, v2977); + int16x8_t v3075 = vqrdmulhq_n_s16(v3074, 25084); + int16x8_t v3076 = vaddq_s16(v3073, v3075); + int16x8_t v3077 = vsubq_s16(v2981, v2983); + int16x8_t v3078 = vsubq_s16(v2985, v2987); + int16x8_t v3079 = vqrdmulhq_n_s16(v3078, 25084); + int16x8_t v3080 = vaddq_s16(v3077, v3079); + int16x8_t v3081 = vqrdmulhq_n_s16(v3080, 18021); + int16x8_t v3082 = vaddq_s16(v3076, v3081); + int16x8_t v3083 = vqrdmulhq_n_s16(v3082, 16769); + int16x8_t v3084 = vaddq_s16(v3072, v3083); + int16x8_t v3085 = vsubq_s16(v2903, v2905); + int16x8_t v3086 = vsubq_s16(v2907, v2909); + int16x8_t v3087 = vqrdmulhq_n_s16(v3086, 26631); + int16x8_t v3088 = vaddq_s16(v3085, v3087); + int16x8_t v3089 = vsubq_s16(v2913, v2915); + int16x8_t v3090 = vsubq_s16(v2917, v2919); + int16x8_t v3091 = vqrdmulhq_n_s16(v3090, 26631); + int16x8_t v3092 = vaddq_s16(v3089, v3091); + int16x8_t v3093 = vqrdmulhq_n_s16(v3092, 18231); + int16x8_t v3094 = vaddq_s16(v3088, v3093); + int16x8_t v3095 = vsubq_s16(v2925, v2927); + int16x8_t v3096 = vsubq_s16(v2929, v2931); + int16x8_t v3097 = vqrdmulhq_n_s16(v3096, 26631); + int16x8_t v3098 = vaddq_s16(v3095, v3097); + int16x8_t v3099 = vsubq_s16(v2935, v2937); + int16x8_t v3100 = vsubq_s16(v2939, v2941); + int16x8_t v3101 = vqrdmulhq_n_s16(v3100, 26631); + int16x8_t v3102 = vaddq_s16(v3099, v3101); + int16x8_t v3103 = vqrdmulhq_n_s16(v3102, 18231); + int16x8_t v3104 = vaddq_s16(v3098, v3103); + int16x8_t v3105 = vqrdmulhq_n_s16(v3104, 16815); + int16x8_t v3106 = vaddq_s16(v3094, v3105); + int16x8_t v3107 = vsubq_s16(v2857, v2859); + int16x8_t v3108 = vsubq_s16(v2861, v2863); + int16x8_t v3109 = vqrdmulhq_n_s16(v3108, 28454); + int16x8_t v3110 = vaddq_s16(v3107, v3109); + int16x8_t v3111 = vsubq_s16(v2867, v2869); + int16x8_t v3112 = vsubq_s16(v2871, v2873); + int16x8_t v3113 = vqrdmulhq_n_s16(v3112, 28454); + int16x8_t v3114 = vaddq_s16(v3111, v3113); + int16x8_t v3115 = vqrdmulhq_n_s16(v3114, 18458); + int16x8_t v3116 = vaddq_s16(v3110, v3115); + int16x8_t v3117 = vsubq_s16(v2879, v2881); + int16x8_t v3118 = vsubq_s16(v2883, v2885); + int16x8_t v3119 = vqrdmulhq_n_s16(v3118, 28454); + int16x8_t v3120 = vaddq_s16(v3117, v3119); + int16x8_t v3121 = vsubq_s16(v2889, v2891); + int16x8_t v3122 = vsubq_s16(v2893, v2895); + int16x8_t v3123 = vqrdmulhq_n_s16(v3122, 28454); + int16x8_t v3124 = vaddq_s16(v3121, v3123); + int16x8_t v3125 = vqrdmulhq_n_s16(v3124, 18458); + int16x8_t v3126 = vaddq_s16(v3120, v3125); + int16x8_t v3127 = vqrdmulhq_n_s16(v3126, 16865); + int16x8_t v3128 = vaddq_s16(v3116, v3127); + int16x8_t v3129 = vsubq_s16(v2811, v2813); + int16x8_t v3130 = vsubq_s16(v2815, v2817); + int16x8_t v3131 = vqrdmulhq_n_s16(v3130, 30624); + int16x8_t v3132 = vaddq_s16(v3129, v3131); + int16x8_t v3133 = vsubq_s16(v2821, v2823); + int16x8_t v3134 = vsubq_s16(v2825, v2827); + int16x8_t v3135 = vqrdmulhq_n_s16(v3134, 30624); + int16x8_t v3136 = vaddq_s16(v3133, v3135); + int16x8_t v3137 = vqrdmulhq_n_s16(v3136, 18702); + int16x8_t v3138 = vaddq_s16(v3132, v3137); + int16x8_t v3139 = vsubq_s16(v2833, v2835); + int16x8_t v3140 = vsubq_s16(v2837, v2839); + int16x8_t v3141 = vqrdmulhq_n_s16(v3140, 30624); + int16x8_t v3142 = vaddq_s16(v3139, v3141); + int16x8_t v3143 = vsubq_s16(v2843, v2845); + int16x8_t v3144 = vsubq_s16(v2847, v2849); + int16x8_t v3145 = vqrdmulhq_n_s16(v3144, 30624); + int16x8_t v3146 = vaddq_s16(v3143, v3145); + int16x8_t v3147 = vqrdmulhq_n_s16(v3146, 18702); + int16x8_t v3148 = vaddq_s16(v3142, v3147); + int16x8_t v3149 = vqrdmulhq_n_s16(v3148, 16916); + int16x8_t v3150 = vaddq_s16(v3138, v3149); + int16x8_t v3151 = vsubq_s16(v2765, v2767); + int16x8_t v3152 = vsubq_s16(v2769, v2771); + int16x8_t v3153_tmp = vqrdmulhq_n_s16(v3152, 472); + int16x8_t v3153 = vaddq_s16(v3153_tmp, v3152); + int16x8_t v3154 = vaddq_s16(v3151, v3153); + int16x8_t v3155 = vsubq_s16(v2775, v2777); + int16x8_t v3156 = vsubq_s16(v2779, v2781); + int16x8_t v3157_tmp = vqrdmulhq_n_s16(v3156, 472); + int16x8_t v3157 = vaddq_s16(v3157_tmp, v3156); + int16x8_t v3158 = vaddq_s16(v3155, v3157); + int16x8_t v3159 = vqrdmulhq_n_s16(v3158, 18964); + int16x8_t v3160 = vaddq_s16(v3154, v3159); + int16x8_t v3161 = vsubq_s16(v2787, v2789); + int16x8_t v3162 = vsubq_s16(v2791, v2793); + int16x8_t v3163_tmp = vqrdmulhq_n_s16(v3162, 472); + int16x8_t v3163 = vaddq_s16(v3163_tmp, v3162); + int16x8_t v3164 = vaddq_s16(v3161, v3163); + int16x8_t v3165 = vsubq_s16(v2797, v2799); + int16x8_t v3166 = vsubq_s16(v2801, v2803); + int16x8_t v3167_tmp = vqrdmulhq_n_s16(v3166, 472); + int16x8_t v3167 = vaddq_s16(v3167_tmp, v3166); + int16x8_t v3168 = vaddq_s16(v3165, v3167); + int16x8_t v3169 = vqrdmulhq_n_s16(v3168, 18964); + int16x8_t v3170 = vaddq_s16(v3164, v3169); + int16x8_t v3171 = vqrdmulhq_n_s16(v3170, 16971); + int16x8_t v3172 = vaddq_s16(v3160, v3171); + int16x8_t v3173 = vsubq_s16(v2719, v2721); + int16x8_t v3174 = vsubq_s16(v2723, v2725); + int16x8_t v3175_tmp = vqrdmulhq_n_s16(v3174, 3672); + int16x8_t v3175 = vaddq_s16(v3175_tmp, v3174); + int16x8_t v3176 = vaddq_s16(v3173, v3175); + int16x8_t v3177 = vsubq_s16(v2729, v2731); + int16x8_t v3178 = vsubq_s16(v2733, v2735); + int16x8_t v3179_tmp = vqrdmulhq_n_s16(v3178, 3672); + int16x8_t v3179 = vaddq_s16(v3179_tmp, v3178); + int16x8_t v3180 = vaddq_s16(v3177, v3179); + int16x8_t v3181 = vqrdmulhq_n_s16(v3180, 19245); + int16x8_t v3182 = vaddq_s16(v3176, v3181); + int16x8_t v3183 = vsubq_s16(v2741, v2743); + int16x8_t v3184 = vsubq_s16(v2745, v2747); + int16x8_t v3185_tmp = vqrdmulhq_n_s16(v3184, 3672); + int16x8_t v3185 = vaddq_s16(v3185_tmp, v3184); + int16x8_t v3186 = vaddq_s16(v3183, v3185); + int16x8_t v3187 = vsubq_s16(v2751, v2753); + int16x8_t v3188 = vsubq_s16(v2755, v2757); + int16x8_t v3189_tmp = vqrdmulhq_n_s16(v3188, 3672); + int16x8_t v3189 = vaddq_s16(v3189_tmp, v3188); + int16x8_t v3190 = vaddq_s16(v3187, v3189); + int16x8_t v3191 = vqrdmulhq_n_s16(v3190, 19245); + int16x8_t v3192 = vaddq_s16(v3186, v3191); + int16x8_t v3193 = vqrdmulhq_n_s16(v3192, 17029); + int16x8_t v3194 = vaddq_s16(v3182, v3193); + int16x8_t v3195 = vsubq_s16(v2673, v2675); + int16x8_t v3196 = vsubq_s16(v2677, v2679); + int16x8_t v3197_tmp = vqrdmulhq_n_s16(v3196, 7662); + int16x8_t v3197 = vaddq_s16(v3197_tmp, v3196); + int16x8_t v3198 = vaddq_s16(v3195, v3197); + int16x8_t v3199 = vsubq_s16(v2683, v2685); + int16x8_t v3200 = vsubq_s16(v2687, v2689); + int16x8_t v3201_tmp = vqrdmulhq_n_s16(v3200, 7662); + int16x8_t v3201 = vaddq_s16(v3201_tmp, v3200); + int16x8_t v3202 = vaddq_s16(v3199, v3201); + int16x8_t v3203 = vqrdmulhq_n_s16(v3202, 19546); + int16x8_t v3204 = vaddq_s16(v3198, v3203); + int16x8_t v3205 = vsubq_s16(v2695, v2697); + int16x8_t v3206 = vsubq_s16(v2699, v2701); + int16x8_t v3207_tmp = vqrdmulhq_n_s16(v3206, 7662); + int16x8_t v3207 = vaddq_s16(v3207_tmp, v3206); + int16x8_t v3208 = vaddq_s16(v3205, v3207); + int16x8_t v3209 = vsubq_s16(v2705, v2707); + int16x8_t v3210 = vsubq_s16(v2709, v2711); + int16x8_t v3211_tmp = vqrdmulhq_n_s16(v3210, 7662); + int16x8_t v3211 = vaddq_s16(v3211_tmp, v3210); + int16x8_t v3212 = vaddq_s16(v3209, v3211); + int16x8_t v3213 = vqrdmulhq_n_s16(v3212, 19546); + int16x8_t v3214 = vaddq_s16(v3208, v3213); + int16x8_t v3215 = vqrdmulhq_n_s16(v3214, 17090); + int16x8_t v3216 = vaddq_s16(v3204, v3215); + int16x8_t v3217 = vsubq_s16(v2582, v2587); + int16x8_t v3218 = vsubq_s16(v2592, v2597); + int16x8_t v3219_tmp = vqrdmulhq_n_s16(v3218, 12756); + int16x8_t v3219 = vaddq_s16(v3219_tmp, v3218); + int16x8_t v3220 = vaddq_s16(v3217, v3219); + int16x8_t v3221 = vsubq_s16(v2604, v2609); + int16x8_t v3222 = vsubq_s16(v2614, v2619); + int16x8_t v3223_tmp = vqrdmulhq_n_s16(v3222, 12756); + int16x8_t v3223 = vaddq_s16(v3223_tmp, v3222); + int16x8_t v3224 = vaddq_s16(v3221, v3223); + int16x8_t v3225 = vqrdmulhq_n_s16(v3224, 19869); + int16x8_t v3226 = vaddq_s16(v3220, v3225); + int16x8_t v3227 = vsubq_s16(v2628, v2633); + int16x8_t v3228 = vsubq_s16(v2638, v2643); + int16x8_t v3229_tmp = vqrdmulhq_n_s16(v3228, 12756); + int16x8_t v3229 = vaddq_s16(v3229_tmp, v3228); + int16x8_t v3230 = vaddq_s16(v3227, v3229); + int16x8_t v3231 = vsubq_s16(v2650, v2655); + int16x8_t v3232 = vsubq_s16(v2660, v2665); + int16x8_t v3233_tmp = vqrdmulhq_n_s16(v3232, 12756); + int16x8_t v3233 = vaddq_s16(v3233_tmp, v3232); + int16x8_t v3234 = vaddq_s16(v3231, v3233); + int16x8_t v3235 = vqrdmulhq_n_s16(v3234, 19869); + int16x8_t v3236 = vaddq_s16(v3230, v3235); + int16x8_t v3237 = vqrdmulhq_n_s16(v3236, 17153); + int16x8_t v3238 = vaddq_s16(v3226, v3237); + int16x8_t v3239 = vsubq_s16(v2488, v2493); + int16x8_t v3240 = vsubq_s16(v2498, v2503); + int16x8_t v3241_tmp = vqrdmulhq_n_s16(v3240, 19463); + int16x8_t v3241 = vaddq_s16(v3241_tmp, v3240); + int16x8_t v3242 = vaddq_s16(v3239, v3241); + int16x8_t v3243 = vsubq_s16(v2510, v2515); + int16x8_t v3244 = vsubq_s16(v2520, v2525); + int16x8_t v3245_tmp = vqrdmulhq_n_s16(v3244, 19463); + int16x8_t v3245 = vaddq_s16(v3245_tmp, v3244); + int16x8_t v3246 = vaddq_s16(v3243, v3245); + int16x8_t v3247 = vqrdmulhq_n_s16(v3246, 20216); + int16x8_t v3248 = vaddq_s16(v3242, v3247); + int16x8_t v3249 = vsubq_s16(v2534, v2539); + int16x8_t v3250 = vsubq_s16(v2544, v2549); + int16x8_t v3251_tmp = vqrdmulhq_n_s16(v3250, 19463); + int16x8_t v3251 = vaddq_s16(v3251_tmp, v3250); + int16x8_t v3252 = vaddq_s16(v3249, v3251); + int16x8_t v3253 = vsubq_s16(v2556, v2561); + int16x8_t v3254 = vsubq_s16(v2566, v2571); + int16x8_t v3255_tmp = vqrdmulhq_n_s16(v3254, 19463); + int16x8_t v3255 = vaddq_s16(v3255_tmp, v3254); + int16x8_t v3256 = vaddq_s16(v3253, v3255); + int16x8_t v3257 = vqrdmulhq_n_s16(v3256, 20216); + int16x8_t v3258 = vaddq_s16(v3252, v3257); + int16x8_t v3259 = vqrdmulhq_n_s16(v3258, 17220); + int16x8_t v3260 = vaddq_s16(v3248, v3259); + int16x8_t v3261 = vsubq_s16(v2393, v2398); + int16x8_t v3262 = vsubq_s16(v2403, v2408); + int16x8_t v3263_tmp = vqrdmulhq_n_s16(v3262, 28661); + int16x8_t v3263 = vaddq_s16(v3263_tmp, v3262); + int16x8_t v3264 = vaddq_s16(v3261, v3263); + int16x8_t v3265 = vsubq_s16(v2415, v2420); + int16x8_t v3266 = vsubq_s16(v2425, v2430); + int16x8_t v3267_tmp = vqrdmulhq_n_s16(v3266, 28661); + int16x8_t v3267 = vaddq_s16(v3267_tmp, v3266); + int16x8_t v3268 = vaddq_s16(v3265, v3267); + int16x8_t v3269 = vqrdmulhq_n_s16(v3268, 20587); + int16x8_t v3270 = vaddq_s16(v3264, v3269); + int16x8_t v3271 = vsubq_s16(v2439, v2444); + int16x8_t v3272 = vsubq_s16(v2449, v2454); + int16x8_t v3273_tmp = vqrdmulhq_n_s16(v3272, 28661); + int16x8_t v3273 = vaddq_s16(v3273_tmp, v3272); + int16x8_t v3274 = vaddq_s16(v3271, v3273); + int16x8_t v3275 = vsubq_s16(v2461, v2467); + int16x8_t v3276 = vsubq_s16(v2472, v2477); + int16x8_t v3277_tmp = vqrdmulhq_n_s16(v3276, 28661); + int16x8_t v3277 = vaddq_s16(v3277_tmp, v3276); + int16x8_t v3278 = vaddq_s16(v3275, v3277); + int16x8_t v3279 = vqrdmulhq_n_s16(v3278, 20587); + int16x8_t v3280 = vaddq_s16(v3274, v3279); + int16x8_t v3281 = vqrdmulhq_n_s16(v3280, 17290); + int16x8_t v3282 = vaddq_s16(v3270, v3281); + int16x8_t v3283 = vsubq_s16(v2299, v2304); + int16x8_t v3284 = vsubq_s16(v2309, v2314); + int16x8_t v3285_tmp = vqrdmulhq_n_s16(v3284, 9242); + int16x8_t v3285 = vmlaq_n_s16(v3285_tmp, v3284, 2); + int16x8_t v3286 = vaddq_s16(v3283, v3285); + int16x8_t v3287 = vsubq_s16(v2321, v2326); + int16x8_t v3288 = vsubq_s16(v2331, v2336); + int16x8_t v3289_tmp = vqrdmulhq_n_s16(v3288, 9242); + int16x8_t v3289 = vmlaq_n_s16(v3289_tmp, v3288, 2); + int16x8_t v3290 = vaddq_s16(v3287, v3289); + int16x8_t v3291 = vqrdmulhq_n_s16(v3290, 20985); + int16x8_t v3292 = vaddq_s16(v3286, v3291); + int16x8_t v3293 = vsubq_s16(v2345, v2350); + int16x8_t v3294 = vsubq_s16(v2355, v2360); + int16x8_t v3295_tmp = vqrdmulhq_n_s16(v3294, 9242); + int16x8_t v3295 = vmlaq_n_s16(v3295_tmp, v3294, 2); + int16x8_t v3296 = vaddq_s16(v3293, v3295); + int16x8_t v3297 = vsubq_s16(v2367, v2372); + int16x8_t v3298 = vsubq_s16(v2377, v2382); + int16x8_t v3299_tmp = vqrdmulhq_n_s16(v3298, 9242); + int16x8_t v3299 = vmlaq_n_s16(v3299_tmp, v3298, 2); + int16x8_t v3300 = vaddq_s16(v3297, v3299); + int16x8_t v3301 = vqrdmulhq_n_s16(v3300, 20985); + int16x8_t v3302 = vaddq_s16(v3296, v3301); + int16x8_t v3303 = vqrdmulhq_n_s16(v3302, 17363); + int16x8_t v3304 = vaddq_s16(v3292, v3303); + int16x8_t v3305 = vsubq_s16(v2115, v2126); + int16x8_t v3306 = vsubq_s16(v2137, v2148); + int16x8_t v3307_tmp = vqrdmulhq_n_s16(v3306, 30298); + int16x8_t v3307 = vmlaq_n_s16(v3307_tmp, v3306, 2); + int16x8_t v3308 = vaddq_s16(v3305, v3307); + int16x8_t v3309 = vsubq_s16(v2161, v2172); + int16x8_t v3310 = vsubq_s16(v2183, v2194); + int16x8_t v3311_tmp = vqrdmulhq_n_s16(v3310, 30298); + int16x8_t v3311 = vmlaq_n_s16(v3311_tmp, v3310, 2); + int16x8_t v3312 = vaddq_s16(v3309, v3311); + int16x8_t v3313 = vqrdmulhq_n_s16(v3312, 21412); + int16x8_t v3314 = vaddq_s16(v3308, v3313); + int16x8_t v3315 = vsubq_s16(v2209, v2220); + int16x8_t v3316 = vsubq_s16(v2231, v2242); + int16x8_t v3317_tmp = vqrdmulhq_n_s16(v3316, 30298); + int16x8_t v3317 = vmlaq_n_s16(v3317_tmp, v3316, 2); + int16x8_t v3318 = vaddq_s16(v3315, v3317); + int16x8_t v3319 = vsubq_s16(v2255, v2266); + int16x8_t v3320 = vsubq_s16(v2277, v2288); + int16x8_t v3321_tmp = vqrdmulhq_n_s16(v3320, 30298); + int16x8_t v3321 = vmlaq_n_s16(v3321_tmp, v3320, 2); + int16x8_t v3322 = vaddq_s16(v3319, v3321); + int16x8_t v3323 = vqrdmulhq_n_s16(v3322, 21412); + int16x8_t v3324 = vaddq_s16(v3318, v3323); + int16x8_t v3325 = vqrdmulhq_n_s16(v3324, 17440); + int16x8_t v3326 = vaddq_s16(v3314, v3325); + int16x8_t v3327 = vsubq_s16(v1925, v1936); + int16x8_t v3328 = vsubq_s16(v1947, v1958); + int16x8_t v3329_tmp = vqrdmulhq_n_s16(v3328, 2773); + int16x8_t v3329 = vmlaq_n_s16(v3329_tmp, v3328, 4); + int16x8_t v3330 = vaddq_s16(v3327, v3329); + int16x8_t v3331 = vsubq_s16(v1971, v1982); + int16x8_t v3332 = vsubq_s16(v1993, v2004); + int16x8_t v3333_tmp = vqrdmulhq_n_s16(v3332, 2773); + int16x8_t v3333 = vmlaq_n_s16(v3333_tmp, v3332, 4); + int16x8_t v3334 = vaddq_s16(v3331, v3333); + int16x8_t v3335 = vqrdmulhq_n_s16(v3334, 21871); + int16x8_t v3336 = vaddq_s16(v3330, v3335); + int16x8_t v3337 = vsubq_s16(v2019, v2030); + int16x8_t v3338 = vsubq_s16(v2041, v2052); + int16x8_t v3339_tmp = vqrdmulhq_n_s16(v3338, 2773); + int16x8_t v3339 = vmlaq_n_s16(v3339_tmp, v3338, 4); + int16x8_t v3340 = vaddq_s16(v3337, v3339); + int16x8_t v3341 = vsubq_s16(v2065, v2076); + int16x8_t v3342 = vsubq_s16(v2087, v2098); + int16x8_t v3343_tmp = vqrdmulhq_n_s16(v3342, 2773); + int16x8_t v3343 = vmlaq_n_s16(v3343_tmp, v3342, 4); + int16x8_t v3344 = vaddq_s16(v3341, v3343); + int16x8_t v3345 = vqrdmulhq_n_s16(v3344, 21871); + int16x8_t v3346 = vaddq_s16(v3340, v3345); + int16x8_t v3347 = vqrdmulhq_n_s16(v3346, 17520); + int16x8_t v3348 = vaddq_s16(v3336, v3347); + int16x8_t v3349 = vsubq_s16(v1555, v1578); + int16x8_t v3350 = vsubq_s16(v1601, v1624); + int16x8_t v3351_tmp = vqrdmulhq_n_s16(v3350, 26108); + int16x8_t v3351 = vmlaq_n_s16(v3351_tmp, v3350, 6); + int16x8_t v3352 = vaddq_s16(v3349, v3351); + int16x8_t v3353 = vsubq_s16(v1649, v1672); + int16x8_t v3354 = vsubq_s16(v1695, v1718); + int16x8_t v3355_tmp = vqrdmulhq_n_s16(v3354, 26108); + int16x8_t v3355 = vmlaq_n_s16(v3355_tmp, v3354, 6); + int16x8_t v3356 = vaddq_s16(v3353, v3355); + int16x8_t v3357 = vqrdmulhq_n_s16(v3356, 22363); + int16x8_t v3358 = vaddq_s16(v3352, v3357); + int16x8_t v3359 = vsubq_s16(v1745, v1768); + int16x8_t v3360 = vsubq_s16(v1791, v1814); + int16x8_t v3361_tmp = vqrdmulhq_n_s16(v3360, 26108); + int16x8_t v3361 = vmlaq_n_s16(v3361_tmp, v3360, 6); + int16x8_t v3362 = vaddq_s16(v3359, v3361); + int16x8_t v3363 = vsubq_s16(v1839, v1862); + int16x8_t v3364 = vsubq_s16(v1885, v1908); + int16x8_t v3365_tmp = vqrdmulhq_n_s16(v3364, 26108); + int16x8_t v3365 = vmlaq_n_s16(v3365_tmp, v3364, 6); + int16x8_t v3366 = vaddq_s16(v3363, v3365); + int16x8_t v3367 = vqrdmulhq_n_s16(v3366, 22363); + int16x8_t v3368 = vaddq_s16(v3362, v3367); + int16x8_t v3369 = vqrdmulhq_n_s16(v3368, 17603); + int16x8_t v3370 = vaddq_s16(v3358, v3369); + int16x8_t v3371 = vsubq_s16(v61, v140); + int16x8_t v3372 = vsubq_s16(v234, v314); + int16x8_t v3373_tmp = vqrdmulhq_n_s16(v3372, 12251); + int16x8_t v3373 = vmlaq_n_s16(v3373_tmp, v3372, 20); + int16x8_t v3374 = vaddq_s16(v3371, v3373); + int16x8_t v3375 = vsubq_s16(v410, v521); + int16x8_t v3376 = vsubq_s16(v615, v696); + int16x8_t v3377_tmp = vqrdmulhq_n_s16(v3376, 12251); + int16x8_t v3377 = vmlaq_n_s16(v3377_tmp, v3376, 20); + int16x8_t v3378 = vaddq_s16(v3375, v3377); + int16x8_t v3379 = vqrdmulhq_n_s16(v3378, 22891); + int16x8_t v3380 = vaddq_s16(v3374, v3379); + int16x8_t v3381 = vsubq_s16(v794, v905); + int16x8_t v3382 = vsubq_s16(v1061, v1143); + int16x8_t v3383_tmp = vqrdmulhq_n_s16(v3382, 12251); + int16x8_t v3383 = vmlaq_n_s16(v3383_tmp, v3382, 20); + int16x8_t v3384 = vaddq_s16(v3381, v3383); + int16x8_t v3385 = vsubq_s16(v1239, v1350); + int16x8_t v3386 = vsubq_s16(v1444, v1526); + int16x8_t v3387_tmp = vqrdmulhq_n_s16(v3386, 12251); + int16x8_t v3387 = vmlaq_n_s16(v3387_tmp, v3386, 20); + int16x8_t v3388 = vaddq_s16(v3385, v3387); + int16x8_t v3389 = vqrdmulhq_n_s16(v3388, 22891); + int16x8_t v3390 = vaddq_s16(v3384, v3389); + int16x8_t v3391 = vqrdmulhq_n_s16(v3390, 17689); + int16x8_t v3392 = vaddq_s16(v3380, v3391); + int16x8_t v3393 = vsubq_s16(v3371, v3373); + int16x8_t v3394 = vsubq_s16(v3375, v3377); + int16x8_t v3395 = vqrdmulhq_n_s16(v3394, 23460); + int16x8_t v3396 = vaddq_s16(v3393, v3395); + int16x8_t v3397 = vsubq_s16(v3381, v3383); + int16x8_t v3398 = vsubq_s16(v3385, v3387); + int16x8_t v3399 = vqrdmulhq_n_s16(v3398, 23460); + int16x8_t v3400 = vaddq_s16(v3397, v3399); + int16x8_t v3401 = vqrdmulhq_n_s16(v3400, 17779); + int16x8_t v3402 = vaddq_s16(v3396, v3401); + int16x8_t v3403 = vsubq_s16(v3349, v3351); + int16x8_t v3404 = vsubq_s16(v3353, v3355); + int16x8_t v3405 = vqrdmulhq_n_s16(v3404, 24073); + int16x8_t v3406 = vaddq_s16(v3403, v3405); + int16x8_t v3407 = vsubq_s16(v3359, v3361); + int16x8_t v3408 = vsubq_s16(v3363, v3365); + int16x8_t v3409 = vqrdmulhq_n_s16(v3408, 24073); + int16x8_t v3410 = vaddq_s16(v3407, v3409); + int16x8_t v3411 = vqrdmulhq_n_s16(v3410, 17873); + int16x8_t v3412 = vaddq_s16(v3406, v3411); + int16x8_t v3413 = vsubq_s16(v3327, v3329); + int16x8_t v3414 = vsubq_s16(v3331, v3333); + int16x8_t v3415 = vqrdmulhq_n_s16(v3414, 24734); + int16x8_t v3416 = vaddq_s16(v3413, v3415); + int16x8_t v3417 = vsubq_s16(v3337, v3339); + int16x8_t v3418 = vsubq_s16(v3341, v3343); + int16x8_t v3419 = vqrdmulhq_n_s16(v3418, 24734); + int16x8_t v3420 = vaddq_s16(v3417, v3419); + int16x8_t v3421 = vqrdmulhq_n_s16(v3420, 17971); + int16x8_t v3422 = vaddq_s16(v3416, v3421); + int16x8_t v3423 = vsubq_s16(v3305, v3307); + int16x8_t v3424 = vsubq_s16(v3309, v3311); + int16x8_t v3425 = vqrdmulhq_n_s16(v3424, 25448); + int16x8_t v3426 = vaddq_s16(v3423, v3425); + int16x8_t v3427 = vsubq_s16(v3315, v3317); + int16x8_t v3428 = vsubq_s16(v3319, v3321); + int16x8_t v3429 = vqrdmulhq_n_s16(v3428, 25448); + int16x8_t v3430 = vaddq_s16(v3427, v3429); + int16x8_t v3431 = vqrdmulhq_n_s16(v3430, 18072); + int16x8_t v3432 = vaddq_s16(v3426, v3431); + int16x8_t v3433 = vsubq_s16(v3283, v3285); + int16x8_t v3434 = vsubq_s16(v3287, v3289); + int16x8_t v3435 = vqrdmulhq_n_s16(v3434, 26220); + int16x8_t v3436 = vaddq_s16(v3433, v3435); + int16x8_t v3437 = vsubq_s16(v3293, v3295); + int16x8_t v3438 = vsubq_s16(v3297, v3299); + int16x8_t v3439 = vqrdmulhq_n_s16(v3438, 26220); + int16x8_t v3440 = vaddq_s16(v3437, v3439); + int16x8_t v3441 = vqrdmulhq_n_s16(v3440, 18177); + int16x8_t v3442 = vaddq_s16(v3436, v3441); + int16x8_t v3443 = vsubq_s16(v3261, v3263); + int16x8_t v3444 = vsubq_s16(v3265, v3267); + int16x8_t v3445 = vqrdmulhq_n_s16(v3444, 27058); + int16x8_t v3446 = vaddq_s16(v3443, v3445); + int16x8_t v3447 = vsubq_s16(v3271, v3273); + int16x8_t v3448 = vsubq_s16(v3275, v3277); + int16x8_t v3449 = vqrdmulhq_n_s16(v3448, 27058); + int16x8_t v3450 = vaddq_s16(v3447, v3449); + int16x8_t v3451 = vqrdmulhq_n_s16(v3450, 18286); + int16x8_t v3452 = vaddq_s16(v3446, v3451); + int16x8_t v3453 = vsubq_s16(v3239, v3241); + int16x8_t v3454 = vsubq_s16(v3243, v3245); + int16x8_t v3455 = vqrdmulhq_n_s16(v3454, 27969); + int16x8_t v3456 = vaddq_s16(v3453, v3455); + int16x8_t v3457 = vsubq_s16(v3249, v3251); + int16x8_t v3458 = vsubq_s16(v3253, v3255); + int16x8_t v3459 = vqrdmulhq_n_s16(v3458, 27969); + int16x8_t v3460 = vaddq_s16(v3457, v3459); + int16x8_t v3461 = vqrdmulhq_n_s16(v3460, 18400); + int16x8_t v3462 = vaddq_s16(v3456, v3461); + int16x8_t v3463 = vsubq_s16(v3217, v3219); + int16x8_t v3464 = vsubq_s16(v3221, v3223); + int16x8_t v3465 = vqrdmulhq_n_s16(v3464, 28961); + int16x8_t v3466 = vaddq_s16(v3463, v3465); + int16x8_t v3467 = vsubq_s16(v3227, v3229); + int16x8_t v3468 = vsubq_s16(v3231, v3233); + int16x8_t v3469 = vqrdmulhq_n_s16(v3468, 28961); + int16x8_t v3470 = vaddq_s16(v3467, v3469); + int16x8_t v3471 = vqrdmulhq_n_s16(v3470, 18517); + int16x8_t v3472 = vaddq_s16(v3466, v3471); + int16x8_t v3473 = vsubq_s16(v3195, v3197); + int16x8_t v3474 = vsubq_s16(v3199, v3201); + int16x8_t v3475 = vqrdmulhq_n_s16(v3474, 30044); + int16x8_t v3476 = vaddq_s16(v3473, v3475); + int16x8_t v3477 = vsubq_s16(v3205, v3207); + int16x8_t v3478 = vsubq_s16(v3209, v3211); + int16x8_t v3479 = vqrdmulhq_n_s16(v3478, 30044); + int16x8_t v3480 = vaddq_s16(v3477, v3479); + int16x8_t v3481 = vqrdmulhq_n_s16(v3480, 18639); + int16x8_t v3482 = vaddq_s16(v3476, v3481); + int16x8_t v3483 = vsubq_s16(v3173, v3175); + int16x8_t v3484 = vsubq_s16(v3177, v3179); + int16x8_t v3485 = vqrdmulhq_n_s16(v3484, 31232); + int16x8_t v3486 = vaddq_s16(v3483, v3485); + int16x8_t v3487 = vsubq_s16(v3183, v3185); + int16x8_t v3488 = vsubq_s16(v3187, v3189); + int16x8_t v3489 = vqrdmulhq_n_s16(v3488, 31232); + int16x8_t v3490 = vaddq_s16(v3487, v3489); + int16x8_t v3491 = vqrdmulhq_n_s16(v3490, 18765); + int16x8_t v3492 = vaddq_s16(v3486, v3491); + int16x8_t v3493 = vsubq_s16(v3151, v3153); + int16x8_t v3494 = vsubq_s16(v3155, v3157); + int16x8_t v3495 = vqrdmulhq_n_s16(v3494, 32538); + int16x8_t v3496 = vaddq_s16(v3493, v3495); + int16x8_t v3497 = vsubq_s16(v3161, v3163); + int16x8_t v3498 = vsubq_s16(v3165, v3167); + int16x8_t v3499 = vqrdmulhq_n_s16(v3498, 32538); + int16x8_t v3500 = vaddq_s16(v3497, v3499); + int16x8_t v3501 = vqrdmulhq_n_s16(v3500, 18896); + int16x8_t v3502 = vaddq_s16(v3496, v3501); + int16x8_t v3503 = vsubq_s16(v3129, v3131); + int16x8_t v3504 = vsubq_s16(v3133, v3135); + int16x8_t v3505_tmp = vqrdmulhq_n_s16(v3504, 1211); + int16x8_t v3505 = vaddq_s16(v3505_tmp, v3504); + int16x8_t v3506 = vaddq_s16(v3503, v3505); + int16x8_t v3507 = vsubq_s16(v3139, v3141); + int16x8_t v3508 = vsubq_s16(v3143, v3145); + int16x8_t v3509_tmp = vqrdmulhq_n_s16(v3508, 1211); + int16x8_t v3509 = vaddq_s16(v3509_tmp, v3508); + int16x8_t v3510 = vaddq_s16(v3507, v3509); + int16x8_t v3511 = vqrdmulhq_n_s16(v3510, 19032); + int16x8_t v3512 = vaddq_s16(v3506, v3511); + int16x8_t v3513 = vsubq_s16(v3107, v3109); + int16x8_t v3514 = vsubq_s16(v3111, v3113); + int16x8_t v3515_tmp = vqrdmulhq_n_s16(v3514, 2808); + int16x8_t v3515 = vaddq_s16(v3515_tmp, v3514); + int16x8_t v3516 = vaddq_s16(v3513, v3515); + int16x8_t v3517 = vsubq_s16(v3117, v3119); + int16x8_t v3518 = vsubq_s16(v3121, v3123); + int16x8_t v3519_tmp = vqrdmulhq_n_s16(v3518, 2808); + int16x8_t v3519 = vaddq_s16(v3519_tmp, v3518); + int16x8_t v3520 = vaddq_s16(v3517, v3519); + int16x8_t v3521 = vqrdmulhq_n_s16(v3520, 19172); + int16x8_t v3522 = vaddq_s16(v3516, v3521); + int16x8_t v3523 = vsubq_s16(v3085, v3087); + int16x8_t v3524 = vsubq_s16(v3089, v3091); + int16x8_t v3525_tmp = vqrdmulhq_n_s16(v3524, 4586); + int16x8_t v3525 = vaddq_s16(v3525_tmp, v3524); + int16x8_t v3526 = vaddq_s16(v3523, v3525); + int16x8_t v3527 = vsubq_s16(v3095, v3097); + int16x8_t v3528 = vsubq_s16(v3099, v3101); + int16x8_t v3529_tmp = vqrdmulhq_n_s16(v3528, 4586); + int16x8_t v3529 = vaddq_s16(v3529_tmp, v3528); + int16x8_t v3530 = vaddq_s16(v3527, v3529); + int16x8_t v3531 = vqrdmulhq_n_s16(v3530, 19318); + int16x8_t v3532 = vaddq_s16(v3526, v3531); + int16x8_t v3533 = vsubq_s16(v3063, v3065); + int16x8_t v3534 = vsubq_s16(v3067, v3069); + int16x8_t v3535_tmp = vqrdmulhq_n_s16(v3534, 6576); + int16x8_t v3535 = vaddq_s16(v3535_tmp, v3534); + int16x8_t v3536 = vaddq_s16(v3533, v3535); + int16x8_t v3537 = vsubq_s16(v3073, v3075); + int16x8_t v3538 = vsubq_s16(v3077, v3079); + int16x8_t v3539_tmp = vqrdmulhq_n_s16(v3538, 6576); + int16x8_t v3539 = vaddq_s16(v3539_tmp, v3538); + int16x8_t v3540 = vaddq_s16(v3537, v3539); + int16x8_t v3541 = vqrdmulhq_n_s16(v3540, 19469); + int16x8_t v3542 = vaddq_s16(v3536, v3541); + int16x8_t v3543 = vsubq_s16(v3041, v3043); + int16x8_t v3544 = vsubq_s16(v3045, v3047); + int16x8_t v3545_tmp = vqrdmulhq_n_s16(v3544, 8817); + int16x8_t v3545 = vaddq_s16(v3545_tmp, v3544); + int16x8_t v3546 = vaddq_s16(v3543, v3545); + int16x8_t v3547 = vsubq_s16(v3051, v3053); + int16x8_t v3548 = vsubq_s16(v3055, v3057); + int16x8_t v3549_tmp = vqrdmulhq_n_s16(v3548, 8817); + int16x8_t v3549 = vaddq_s16(v3549_tmp, v3548); + int16x8_t v3550 = vaddq_s16(v3547, v3549); + int16x8_t v3551 = vqrdmulhq_n_s16(v3550, 19625); + int16x8_t v3552 = vaddq_s16(v3546, v3551); + int16x8_t v3553 = vsubq_s16(v2998, v3003); + int16x8_t v3554 = vsubq_s16(v3008, v3013); + int16x8_t v3555_tmp = vqrdmulhq_n_s16(v3554, 11356); + int16x8_t v3555 = vaddq_s16(v3555_tmp, v3554); + int16x8_t v3556 = vaddq_s16(v3553, v3555); + int16x8_t v3557 = vsubq_s16(v3020, v3025); + int16x8_t v3558 = vsubq_s16(v3030, v3035); + int16x8_t v3559_tmp = vqrdmulhq_n_s16(v3558, 11356); + int16x8_t v3559 = vaddq_s16(v3559_tmp, v3558); + int16x8_t v3560 = vaddq_s16(v3557, v3559); + int16x8_t v3561 = vqrdmulhq_n_s16(v3560, 19786); + int16x8_t v3562 = vaddq_s16(v3556, v3561); + int16x8_t v3563 = vsubq_s16(v2952, v2957); + int16x8_t v3564 = vsubq_s16(v2962, v2967); + int16x8_t v3565_tmp = vqrdmulhq_n_s16(v3564, 14256); + int16x8_t v3565 = vaddq_s16(v3565_tmp, v3564); + int16x8_t v3566 = vaddq_s16(v3563, v3565); + int16x8_t v3567 = vsubq_s16(v2974, v2979); + int16x8_t v3568 = vsubq_s16(v2984, v2989); + int16x8_t v3569_tmp = vqrdmulhq_n_s16(v3568, 14256); + int16x8_t v3569 = vaddq_s16(v3569_tmp, v3568); + int16x8_t v3570 = vaddq_s16(v3567, v3569); + int16x8_t v3571 = vqrdmulhq_n_s16(v3570, 19954); + int16x8_t v3572 = vaddq_s16(v3566, v3571); + int16x8_t v3573 = vsubq_s16(v2906, v2911); + int16x8_t v3574 = vsubq_s16(v2916, v2921); + int16x8_t v3575_tmp = vqrdmulhq_n_s16(v3574, 17596); + int16x8_t v3575 = vaddq_s16(v3575_tmp, v3574); + int16x8_t v3576 = vaddq_s16(v3573, v3575); + int16x8_t v3577 = vsubq_s16(v2928, v2933); + int16x8_t v3578 = vsubq_s16(v2938, v2943); + int16x8_t v3579_tmp = vqrdmulhq_n_s16(v3578, 17596); + int16x8_t v3579 = vaddq_s16(v3579_tmp, v3578); + int16x8_t v3580 = vaddq_s16(v3577, v3579); + int16x8_t v3581 = vqrdmulhq_n_s16(v3580, 20127); + int16x8_t v3582 = vaddq_s16(v3576, v3581); + int16x8_t v3583 = vsubq_s16(v2860, v2865); + int16x8_t v3584 = vsubq_s16(v2870, v2875); + int16x8_t v3585_tmp = vqrdmulhq_n_s16(v3584, 21483); + int16x8_t v3585 = vaddq_s16(v3585_tmp, v3584); + int16x8_t v3586 = vaddq_s16(v3583, v3585); + int16x8_t v3587 = vsubq_s16(v2882, v2887); + int16x8_t v3588 = vsubq_s16(v2892, v2897); + int16x8_t v3589_tmp = vqrdmulhq_n_s16(v3588, 21483); + int16x8_t v3589 = vaddq_s16(v3589_tmp, v3588); + int16x8_t v3590 = vaddq_s16(v3587, v3589); + int16x8_t v3591 = vqrdmulhq_n_s16(v3590, 20306); + int16x8_t v3592 = vaddq_s16(v3586, v3591); + int16x8_t v3593 = vsubq_s16(v2814, v2819); + int16x8_t v3594 = vsubq_s16(v2824, v2829); + int16x8_t v3595_tmp = vqrdmulhq_n_s16(v3594, 26057); + int16x8_t v3595 = vaddq_s16(v3595_tmp, v3594); + int16x8_t v3596 = vaddq_s16(v3593, v3595); + int16x8_t v3597 = vsubq_s16(v2836, v2841); + int16x8_t v3598 = vsubq_s16(v2846, v2851); + int16x8_t v3599_tmp = vqrdmulhq_n_s16(v3598, 26057); + int16x8_t v3599 = vaddq_s16(v3599_tmp, v3598); + int16x8_t v3600 = vaddq_s16(v3597, v3599); + int16x8_t v3601 = vqrdmulhq_n_s16(v3600, 20492); + int16x8_t v3602 = vaddq_s16(v3596, v3601); + int16x8_t v3603 = vsubq_s16(v2768, v2773); + int16x8_t v3604 = vsubq_s16(v2778, v2783); + int16x8_t v3605_tmp = vqrdmulhq_n_s16(v3604, 31517); + int16x8_t v3605 = vaddq_s16(v3605_tmp, v3604); + int16x8_t v3606 = vaddq_s16(v3603, v3605); + int16x8_t v3607 = vsubq_s16(v2790, v2795); + int16x8_t v3608 = vsubq_s16(v2800, v2805); + int16x8_t v3609_tmp = vqrdmulhq_n_s16(v3608, 31517); + int16x8_t v3609 = vaddq_s16(v3609_tmp, v3608); + int16x8_t v3610 = vaddq_s16(v3607, v3609); + int16x8_t v3611 = vqrdmulhq_n_s16(v3610, 20684); + int16x8_t v3612 = vaddq_s16(v3606, v3611); + int16x8_t v3613 = vsubq_s16(v2722, v2727); + int16x8_t v3614 = vsubq_s16(v2732, v2737); + int16x8_t v3615_tmp = vqrdmulhq_n_s16(v3614, 5373); + int16x8_t v3615 = vmlaq_n_s16(v3615_tmp, v3614, 2); + int16x8_t v3616 = vaddq_s16(v3613, v3615); + int16x8_t v3617 = vsubq_s16(v2744, v2749); + int16x8_t v3618 = vsubq_s16(v2754, v2759); + int16x8_t v3619_tmp = vqrdmulhq_n_s16(v3618, 5373); + int16x8_t v3619 = vmlaq_n_s16(v3619_tmp, v3618, 2); + int16x8_t v3620 = vaddq_s16(v3617, v3619); + int16x8_t v3621 = vqrdmulhq_n_s16(v3620, 20883); + int16x8_t v3622 = vaddq_s16(v3616, v3621); + int16x8_t v3623 = vsubq_s16(v2676, v2681); + int16x8_t v3624 = vsubq_s16(v2686, v2691); + int16x8_t v3625_tmp = vqrdmulhq_n_s16(v3624, 13571); + int16x8_t v3625 = vmlaq_n_s16(v3625_tmp, v3624, 2); + int16x8_t v3626 = vaddq_s16(v3623, v3625); + int16x8_t v3627 = vsubq_s16(v2698, v2703); + int16x8_t v3628 = vsubq_s16(v2708, v2713); + int16x8_t v3629_tmp = vqrdmulhq_n_s16(v3628, 13571); + int16x8_t v3629 = vmlaq_n_s16(v3629_tmp, v3628, 2); + int16x8_t v3630 = vaddq_s16(v3627, v3629); + int16x8_t v3631 = vqrdmulhq_n_s16(v3630, 21089); + int16x8_t v3632 = vaddq_s16(v3626, v3631); + int16x8_t v3633 = vsubq_s16(v2588, v2599); + int16x8_t v3634 = vsubq_s16(v2610, v2621); + int16x8_t v3635_tmp = vqrdmulhq_n_s16(v3634, 23975); + int16x8_t v3635 = vmlaq_n_s16(v3635_tmp, v3634, 2); + int16x8_t v3636 = vaddq_s16(v3633, v3635); + int16x8_t v3637 = vsubq_s16(v2634, v2645); + int16x8_t v3638 = vsubq_s16(v2656, v2667); + int16x8_t v3639_tmp = vqrdmulhq_n_s16(v3638, 23975); + int16x8_t v3639 = vmlaq_n_s16(v3639_tmp, v3638, 2); + int16x8_t v3640 = vaddq_s16(v3637, v3639); + int16x8_t v3641 = vqrdmulhq_n_s16(v3640, 21303); + int16x8_t v3642 = vaddq_s16(v3636, v3641); + int16x8_t v3643 = vsubq_s16(v2494, v2505); + int16x8_t v3644 = vsubq_s16(v2516, v2527); + int16x8_t v3645_tmp = vqrdmulhq_n_s16(v3644, 4832); + int16x8_t v3645 = vmlaq_n_s16(v3645_tmp, v3644, 3); + int16x8_t v3646 = vaddq_s16(v3643, v3645); + int16x8_t v3647 = vsubq_s16(v2540, v2551); + int16x8_t v3648 = vsubq_s16(v2562, v2573); + int16x8_t v3649_tmp = vqrdmulhq_n_s16(v3648, 4832); + int16x8_t v3649 = vmlaq_n_s16(v3649_tmp, v3648, 3); + int16x8_t v3650 = vaddq_s16(v3647, v3649); + int16x8_t v3651 = vqrdmulhq_n_s16(v3650, 21524); + int16x8_t v3652 = vaddq_s16(v3646, v3651); + int16x8_t v3653 = vsubq_s16(v2399, v2410); + int16x8_t v3654 = vsubq_s16(v2421, v2432); + int16x8_t v3655_tmp = vqrdmulhq_n_s16(v3654, 23437); + int16x8_t v3655 = vmlaq_n_s16(v3655_tmp, v3654, 3); + int16x8_t v3656 = vaddq_s16(v3653, v3655); + int16x8_t v3657 = vsubq_s16(v2445, v2456); + int16x8_t v3658 = vsubq_s16(v2468, v2479); + int16x8_t v3659_tmp = vqrdmulhq_n_s16(v3658, 23437); + int16x8_t v3659 = vmlaq_n_s16(v3659_tmp, v3658, 3); + int16x8_t v3660 = vaddq_s16(v3657, v3659); + int16x8_t v3661 = vqrdmulhq_n_s16(v3660, 21753); + int16x8_t v3662 = vaddq_s16(v3656, v3661); + int16x8_t v3663 = vsubq_s16(v2305, v2316); + int16x8_t v3664 = vsubq_s16(v2327, v2338); + int16x8_t v3665_tmp = vqrdmulhq_n_s16(v3664, 17573); + int16x8_t v3665 = vmlaq_n_s16(v3665_tmp, v3664, 4); + int16x8_t v3666 = vaddq_s16(v3663, v3665); + int16x8_t v3667 = vsubq_s16(v2351, v2362); + int16x8_t v3668 = vsubq_s16(v2373, v2384); + int16x8_t v3669_tmp = vqrdmulhq_n_s16(v3668, 17573); + int16x8_t v3669 = vmlaq_n_s16(v3669_tmp, v3668, 4); + int16x8_t v3670 = vaddq_s16(v3667, v3669); + int16x8_t v3671 = vqrdmulhq_n_s16(v3670, 21990); + int16x8_t v3672 = vaddq_s16(v3666, v3671); + int16x8_t v3673 = vsubq_s16(v2127, v2150); + int16x8_t v3674 = vsubq_s16(v2173, v2196); + int16x8_t v3675_tmp = vqrdmulhq_n_s16(v3674, 27122); + int16x8_t v3675 = vmlaq_n_s16(v3675_tmp, v3674, 5); + int16x8_t v3676 = vaddq_s16(v3673, v3675); + int16x8_t v3677 = vsubq_s16(v2221, v2244); + int16x8_t v3678 = vsubq_s16(v2267, v2290); + int16x8_t v3679_tmp = vqrdmulhq_n_s16(v3678, 27122); + int16x8_t v3679 = vmlaq_n_s16(v3679_tmp, v3678, 5); + int16x8_t v3680 = vaddq_s16(v3677, v3679); + int16x8_t v3681 = vqrdmulhq_n_s16(v3680, 22236); + int16x8_t v3682 = vaddq_s16(v3676, v3681); + int16x8_t v3683 = vsubq_s16(v1937, v1960); + int16x8_t v3684 = vsubq_s16(v1983, v2006); + int16x8_t v3685_tmp = vqrdmulhq_n_s16(v3684, 5041); + int16x8_t v3685 = vmlaq_n_s16(v3685_tmp, v3684, 8); + int16x8_t v3686 = vaddq_s16(v3683, v3685); + int16x8_t v3687 = vsubq_s16(v2031, v2054); + int16x8_t v3688 = vsubq_s16(v2077, v2100); + int16x8_t v3689_tmp = vqrdmulhq_n_s16(v3688, 5041); + int16x8_t v3689 = vmlaq_n_s16(v3689_tmp, v3688, 8); + int16x8_t v3690 = vaddq_s16(v3687, v3689); + int16x8_t v3691 = vqrdmulhq_n_s16(v3690, 22491); + int16x8_t v3692 = vaddq_s16(v3686, v3691); + int16x8_t v3693 = vsubq_s16(v1579, v1626); + int16x8_t v3694 = vsubq_s16(v1673, v1720); + int16x8_t v3695_tmp = vqrdmulhq_n_s16(v3694, 19146); + int16x8_t v3695 = vmlaq_n_s16(v3695_tmp, v3694, 13); + int16x8_t v3696 = vaddq_s16(v3693, v3695); + int16x8_t v3697 = vsubq_s16(v1769, v1816); + int16x8_t v3698 = vsubq_s16(v1863, v1910); + int16x8_t v3699_tmp = vqrdmulhq_n_s16(v3698, 19146); + int16x8_t v3699 = vmlaq_n_s16(v3699_tmp, v3698, 13); + int16x8_t v3700 = vaddq_s16(v3697, v3699); + int16x8_t v3701 = vqrdmulhq_n_s16(v3700, 22755); + int16x8_t v3702 = vaddq_s16(v3696, v3701); + int16x8_t v3703 = vsubq_s16(v141, v316); + int16x8_t v3704 = vsubq_s16(v522, v698); + int16x8_t v3705_tmp = vqrdmulhq_n_s16(v3704, 24402); + int16x8_t v3705 = vmlaq_n_s16(v3705_tmp, v3704, 40); + int16x8_t v3706 = vaddq_s16(v3703, v3705); + int16x8_t v3707 = vsubq_s16(v906, v1145); + int16x8_t v3708 = vsubq_s16(v1351, v1528); + int16x8_t v3709_tmp = vqrdmulhq_n_s16(v3708, 24402); + int16x8_t v3709 = vmlaq_n_s16(v3709_tmp, v3708, 40); + int16x8_t v3710 = vaddq_s16(v3707, v3709); + int16x8_t v3711 = vqrdmulhq_n_s16(v3710, 23030); + int16x8_t v3712 = vaddq_s16(v3706, v3711); + int16x8_t v3713 = vsubq_s16(v3703, v3705); + int16x8_t v3714 = vsubq_s16(v3707, v3709); + int16x8_t v3715 = vqrdmulhq_n_s16(v3714, 23314); + int16x8_t v3716 = vaddq_s16(v3713, v3715); + int16x8_t v3717 = vsubq_s16(v3693, v3695); + int16x8_t v3718 = vsubq_s16(v3697, v3699); + int16x8_t v3719 = vqrdmulhq_n_s16(v3718, 23609); + int16x8_t v3720 = vaddq_s16(v3717, v3719); + int16x8_t v3721 = vsubq_s16(v3683, v3685); + int16x8_t v3722 = vsubq_s16(v3687, v3689); + int16x8_t v3723 = vqrdmulhq_n_s16(v3722, 23915); + int16x8_t v3724 = vaddq_s16(v3721, v3723); + int16x8_t v3725 = vsubq_s16(v3673, v3675); + int16x8_t v3726 = vsubq_s16(v3677, v3679); + int16x8_t v3727 = vqrdmulhq_n_s16(v3726, 24233); + int16x8_t v3728 = vaddq_s16(v3725, v3727); + int16x8_t v3729 = vsubq_s16(v3663, v3665); + int16x8_t v3730 = vsubq_s16(v3667, v3669); + int16x8_t v3731 = vqrdmulhq_n_s16(v3730, 24564); + int16x8_t v3732 = vaddq_s16(v3729, v3731); + int16x8_t v3733 = vsubq_s16(v3653, v3655); + int16x8_t v3734 = vsubq_s16(v3657, v3659); + int16x8_t v3735 = vqrdmulhq_n_s16(v3734, 24907); + int16x8_t v3736 = vaddq_s16(v3733, v3735); + int16x8_t v3737 = vsubq_s16(v3643, v3645); + int16x8_t v3738 = vsubq_s16(v3647, v3649); + int16x8_t v3739 = vqrdmulhq_n_s16(v3738, 25264); + int16x8_t v3740 = vaddq_s16(v3737, v3739); + int16x8_t v3741 = vsubq_s16(v3633, v3635); + int16x8_t v3742 = vsubq_s16(v3637, v3639); + int16x8_t v3743 = vqrdmulhq_n_s16(v3742, 25635); + int16x8_t v3744 = vaddq_s16(v3741, v3743); + int16x8_t v3745 = vsubq_s16(v3623, v3625); + int16x8_t v3746 = vsubq_s16(v3627, v3629); + int16x8_t v3747 = vqrdmulhq_n_s16(v3746, 26021); + int16x8_t v3748 = vaddq_s16(v3745, v3747); + int16x8_t v3749 = vsubq_s16(v3613, v3615); + int16x8_t v3750 = vsubq_s16(v3617, v3619); + int16x8_t v3751 = vqrdmulhq_n_s16(v3750, 26423); + int16x8_t v3752 = vaddq_s16(v3749, v3751); + int16x8_t v3753 = vsubq_s16(v3603, v3605); + int16x8_t v3754 = vsubq_s16(v3607, v3609); + int16x8_t v3755 = vqrdmulhq_n_s16(v3754, 26842); + int16x8_t v3756 = vaddq_s16(v3753, v3755); + int16x8_t v3757 = vsubq_s16(v3593, v3595); + int16x8_t v3758 = vsubq_s16(v3597, v3599); + int16x8_t v3759 = vqrdmulhq_n_s16(v3758, 27279); + int16x8_t v3760 = vaddq_s16(v3757, v3759); + int16x8_t v3761 = vsubq_s16(v3583, v3585); + int16x8_t v3762 = vsubq_s16(v3587, v3589); + int16x8_t v3763 = vqrdmulhq_n_s16(v3762, 27734); + int16x8_t v3764 = vaddq_s16(v3761, v3763); + int16x8_t v3765 = vsubq_s16(v3573, v3575); + int16x8_t v3766 = vsubq_s16(v3577, v3579); + int16x8_t v3767 = vqrdmulhq_n_s16(v3766, 28209); + int16x8_t v3768 = vaddq_s16(v3765, v3767); + int16x8_t v3769 = vsubq_s16(v3563, v3565); + int16x8_t v3770 = vsubq_s16(v3567, v3569); + int16x8_t v3771 = vqrdmulhq_n_s16(v3770, 28705); + int16x8_t v3772 = vaddq_s16(v3769, v3771); + int16x8_t v3773 = vsubq_s16(v3553, v3555); + int16x8_t v3774 = vsubq_s16(v3557, v3559); + int16x8_t v3775 = vqrdmulhq_n_s16(v3774, 29223); + int16x8_t v3776 = vaddq_s16(v3773, v3775); + int16x8_t v3777 = vsubq_s16(v3543, v3545); + int16x8_t v3778 = vsubq_s16(v3547, v3549); + int16x8_t v3779 = vqrdmulhq_n_s16(v3778, 29764); + int16x8_t v3780 = vaddq_s16(v3777, v3779); + int16x8_t v3781 = vsubq_s16(v3533, v3535); + int16x8_t v3782 = vsubq_s16(v3537, v3539); + int16x8_t v3783 = vqrdmulhq_n_s16(v3782, 30331); + int16x8_t v3784 = vaddq_s16(v3781, v3783); + int16x8_t v3785 = vsubq_s16(v3523, v3525); + int16x8_t v3786 = vsubq_s16(v3527, v3529); + int16x8_t v3787 = vqrdmulhq_n_s16(v3786, 30925); + int16x8_t v3788 = vaddq_s16(v3785, v3787); + int16x8_t v3789 = vsubq_s16(v3513, v3515); + int16x8_t v3790 = vsubq_s16(v3517, v3519); + int16x8_t v3791 = vqrdmulhq_n_s16(v3790, 31547); + int16x8_t v3792 = vaddq_s16(v3789, v3791); + int16x8_t v3793 = vsubq_s16(v3503, v3505); + int16x8_t v3794 = vsubq_s16(v3507, v3509); + int16x8_t v3795 = vqrdmulhq_n_s16(v3794, 32199); + int16x8_t v3796 = vaddq_s16(v3793, v3795); + int16x8_t v3797 = vsubq_s16(v3493, v3495); + int16x8_t v3798 = vsubq_s16(v3497, v3499); + int16x8_t v3799_tmp = vqrdmulhq_n_s16(v3798, 117); + int16x8_t v3799 = vaddq_s16(v3799_tmp, v3798); + int16x8_t v3800 = vaddq_s16(v3797, v3799); + int16x8_t v3801 = vsubq_s16(v3483, v3485); + int16x8_t v3802 = vsubq_s16(v3487, v3489); + int16x8_t v3803_tmp = vqrdmulhq_n_s16(v3802, 837); + int16x8_t v3803 = vaddq_s16(v3803_tmp, v3802); + int16x8_t v3804 = vaddq_s16(v3801, v3803); + int16x8_t v3805 = vsubq_s16(v3473, v3475); + int16x8_t v3806 = vsubq_s16(v3477, v3479); + int16x8_t v3807_tmp = vqrdmulhq_n_s16(v3806, 1594); + int16x8_t v3807 = vaddq_s16(v3807_tmp, v3806); + int16x8_t v3808 = vaddq_s16(v3805, v3807); + int16x8_t v3809 = vsubq_s16(v3463, v3465); + int16x8_t v3810 = vsubq_s16(v3467, v3469); + int16x8_t v3811_tmp = vqrdmulhq_n_s16(v3810, 2393); + int16x8_t v3811 = vaddq_s16(v3811_tmp, v3810); + int16x8_t v3812 = vaddq_s16(v3809, v3811); + int16x8_t v3813 = vsubq_s16(v3453, v3455); + int16x8_t v3814 = vsubq_s16(v3457, v3459); + int16x8_t v3815_tmp = vqrdmulhq_n_s16(v3814, 3234); + int16x8_t v3815 = vaddq_s16(v3815_tmp, v3814); + int16x8_t v3816 = vaddq_s16(v3813, v3815); + int16x8_t v3817 = vsubq_s16(v3443, v3445); + int16x8_t v3818 = vsubq_s16(v3447, v3449); + int16x8_t v3819_tmp = vqrdmulhq_n_s16(v3818, 4123); + int16x8_t v3819 = vaddq_s16(v3819_tmp, v3818); + int16x8_t v3820 = vaddq_s16(v3817, v3819); + int16x8_t v3821 = vsubq_s16(v3433, v3435); + int16x8_t v3822 = vsubq_s16(v3437, v3439); + int16x8_t v3823_tmp = vqrdmulhq_n_s16(v3822, 5062); + int16x8_t v3823 = vaddq_s16(v3823_tmp, v3822); + int16x8_t v3824 = vaddq_s16(v3821, v3823); + int16x8_t v3825 = vsubq_s16(v3423, v3425); + int16x8_t v3826 = vsubq_s16(v3427, v3429); + int16x8_t v3827_tmp = vqrdmulhq_n_s16(v3826, 6057); + int16x8_t v3827 = vaddq_s16(v3827_tmp, v3826); + int16x8_t v3828 = vaddq_s16(v3825, v3827); + int16x8_t v3829 = vsubq_s16(v3413, v3415); + int16x8_t v3830 = vsubq_s16(v3417, v3419); + int16x8_t v3831_tmp = vqrdmulhq_n_s16(v3830, 7111); + int16x8_t v3831 = vaddq_s16(v3831_tmp, v3830); + int16x8_t v3832 = vaddq_s16(v3829, v3831); + int16x8_t v3833 = vsubq_s16(v3403, v3405); + int16x8_t v3834 = vsubq_s16(v3407, v3409); + int16x8_t v3835_tmp = vqrdmulhq_n_s16(v3834, 8231); + int16x8_t v3835 = vaddq_s16(v3835_tmp, v3834); + int16x8_t v3836 = vaddq_s16(v3833, v3835); + int16x8_t v3837 = vsubq_s16(v3393, v3395); + int16x8_t v3838 = vsubq_s16(v3397, v3399); + int16x8_t v3839_tmp = vqrdmulhq_n_s16(v3838, 9421); + int16x8_t v3839 = vaddq_s16(v3839_tmp, v3838); + int16x8_t v3840 = vaddq_s16(v3837, v3839); + int16x8_t v3841 = vsubq_s16(v3374, v3379); + int16x8_t v3842 = vsubq_s16(v3384, v3389); + int16x8_t v3843_tmp = vqrdmulhq_n_s16(v3842, 10690); + int16x8_t v3843 = vaddq_s16(v3843_tmp, v3842); + int16x8_t v3844 = vaddq_s16(v3841, v3843); + int16x8_t v3845 = vsubq_s16(v3352, v3357); + int16x8_t v3846 = vsubq_s16(v3362, v3367); + int16x8_t v3847_tmp = vqrdmulhq_n_s16(v3846, 12044); + int16x8_t v3847 = vaddq_s16(v3847_tmp, v3846); + int16x8_t v3848 = vaddq_s16(v3845, v3847); + int16x8_t v3849 = vsubq_s16(v3330, v3335); + int16x8_t v3850 = vsubq_s16(v3340, v3345); + int16x8_t v3851_tmp = vqrdmulhq_n_s16(v3850, 13493); + int16x8_t v3851 = vaddq_s16(v3851_tmp, v3850); + int16x8_t v3852 = vaddq_s16(v3849, v3851); + int16x8_t v3853 = vsubq_s16(v3308, v3313); + int16x8_t v3854 = vsubq_s16(v3318, v3323); + int16x8_t v3855_tmp = vqrdmulhq_n_s16(v3854, 15046); + int16x8_t v3855 = vaddq_s16(v3855_tmp, v3854); + int16x8_t v3856 = vaddq_s16(v3853, v3855); + int16x8_t v3857 = vsubq_s16(v3286, v3291); + int16x8_t v3858 = vsubq_s16(v3296, v3301); + int16x8_t v3859_tmp = vqrdmulhq_n_s16(v3858, 16715); + int16x8_t v3859 = vaddq_s16(v3859_tmp, v3858); + int16x8_t v3860 = vaddq_s16(v3857, v3859); + int16x8_t v3861 = vsubq_s16(v3264, v3269); + int16x8_t v3862 = vsubq_s16(v3274, v3279); + int16x8_t v3863_tmp = vqrdmulhq_n_s16(v3862, 18512); + int16x8_t v3863 = vaddq_s16(v3863_tmp, v3862); + int16x8_t v3864 = vaddq_s16(v3861, v3863); + int16x8_t v3865 = vsubq_s16(v3242, v3247); + int16x8_t v3866 = vsubq_s16(v3252, v3257); + int16x8_t v3867_tmp = vqrdmulhq_n_s16(v3866, 20453); + int16x8_t v3867 = vaddq_s16(v3867_tmp, v3866); + int16x8_t v3868 = vaddq_s16(v3865, v3867); + int16x8_t v3869 = vsubq_s16(v3220, v3225); + int16x8_t v3870 = vsubq_s16(v3230, v3235); + int16x8_t v3871_tmp = vqrdmulhq_n_s16(v3870, 22555); + int16x8_t v3871 = vaddq_s16(v3871_tmp, v3870); + int16x8_t v3872 = vaddq_s16(v3869, v3871); + int16x8_t v3873 = vsubq_s16(v3198, v3203); + int16x8_t v3874 = vsubq_s16(v3208, v3213); + int16x8_t v3875_tmp = vqrdmulhq_n_s16(v3874, 24839); + int16x8_t v3875 = vaddq_s16(v3875_tmp, v3874); + int16x8_t v3876 = vaddq_s16(v3873, v3875); + int16x8_t v3877 = vsubq_s16(v3176, v3181); + int16x8_t v3878 = vsubq_s16(v3186, v3191); + int16x8_t v3879_tmp = vqrdmulhq_n_s16(v3878, 27330); + int16x8_t v3879 = vaddq_s16(v3879_tmp, v3878); + int16x8_t v3880 = vaddq_s16(v3877, v3879); + int16x8_t v3881 = vsubq_s16(v3154, v3159); + int16x8_t v3882 = vsubq_s16(v3164, v3169); + int16x8_t v3883_tmp = vqrdmulhq_n_s16(v3882, 30056); + int16x8_t v3883 = vaddq_s16(v3883_tmp, v3882); + int16x8_t v3884 = vaddq_s16(v3881, v3883); + int16x8_t v3885 = vsubq_s16(v3132, v3137); + int16x8_t v3886 = vsubq_s16(v3142, v3147); + int16x8_t v3887_tmp = vqrdmulhq_n_s16(v3886, 282); + int16x8_t v3887 = vmlaq_n_s16(v3887_tmp, v3886, 2); + int16x8_t v3888 = vaddq_s16(v3885, v3887); + int16x8_t v3889 = vsubq_s16(v3110, v3115); + int16x8_t v3890 = vsubq_s16(v3120, v3125); + int16x8_t v3891_tmp = vqrdmulhq_n_s16(v3890, 3588); + int16x8_t v3891 = vmlaq_n_s16(v3891_tmp, v3890, 2); + int16x8_t v3892 = vaddq_s16(v3889, v3891); + int16x8_t v3893 = vsubq_s16(v3088, v3093); + int16x8_t v3894 = vsubq_s16(v3098, v3103); + int16x8_t v3895_tmp = vqrdmulhq_n_s16(v3894, 7255); + int16x8_t v3895 = vmlaq_n_s16(v3895_tmp, v3894, 2); + int16x8_t v3896 = vaddq_s16(v3893, v3895); + int16x8_t v3897 = vsubq_s16(v3066, v3071); + int16x8_t v3898 = vsubq_s16(v3076, v3081); + int16x8_t v3899_tmp = vqrdmulhq_n_s16(v3898, 11344); + int16x8_t v3899 = vmlaq_n_s16(v3899_tmp, v3898, 2); + int16x8_t v3900 = vaddq_s16(v3897, v3899); + int16x8_t v3901 = vsubq_s16(v3044, v3049); + int16x8_t v3902 = vsubq_s16(v3054, v3059); + int16x8_t v3903_tmp = vqrdmulhq_n_s16(v3902, 15934); + int16x8_t v3903 = vmlaq_n_s16(v3903_tmp, v3902, 2); + int16x8_t v3904 = vaddq_s16(v3901, v3903); + int16x8_t v3905 = vsubq_s16(v3004, v3015); + int16x8_t v3906 = vsubq_s16(v3026, v3037); + int16x8_t v3907_tmp = vqrdmulhq_n_s16(v3906, 21120); + int16x8_t v3907 = vmlaq_n_s16(v3907_tmp, v3906, 2); + int16x8_t v3908 = vaddq_s16(v3905, v3907); + int16x8_t v3909 = vsubq_s16(v2958, v2969); + int16x8_t v3910 = vsubq_s16(v2980, v2991); + int16x8_t v3911_tmp = vqrdmulhq_n_s16(v3910, 27027); + int16x8_t v3911 = vmlaq_n_s16(v3911_tmp, v3910, 2); + int16x8_t v3912 = vaddq_s16(v3909, v3911); + int16x8_t v3913 = vsubq_s16(v2912, v2923); + int16x8_t v3914 = vsubq_s16(v2934, v2945); + int16x8_t v3915_tmp = vqrdmulhq_n_s16(v3914, 1045); + int16x8_t v3915 = vmlaq_n_s16(v3915_tmp, v3914, 3); + int16x8_t v3916 = vaddq_s16(v3913, v3915); + int16x8_t v3917 = vsubq_s16(v2866, v2877); + int16x8_t v3918 = vsubq_s16(v2888, v2899); + int16x8_t v3919_tmp = vqrdmulhq_n_s16(v3918, 8923); + int16x8_t v3919 = vmlaq_n_s16(v3919_tmp, v3918, 3); + int16x8_t v3920 = vaddq_s16(v3917, v3919); + int16x8_t v3921 = vsubq_s16(v2820, v2831); + int16x8_t v3922 = vsubq_s16(v2842, v2853); + int16x8_t v3923_tmp = vqrdmulhq_n_s16(v3922, 18177); + int16x8_t v3923 = vmlaq_n_s16(v3923_tmp, v3922, 3); + int16x8_t v3924 = vaddq_s16(v3921, v3923); + int16x8_t v3925 = vsubq_s16(v2774, v2785); + int16x8_t v3926 = vsubq_s16(v2796, v2807); + int16x8_t v3927_tmp = vqrdmulhq_n_s16(v3926, 29200); + int16x8_t v3927 = vmlaq_n_s16(v3927_tmp, v3926, 3); + int16x8_t v3928 = vaddq_s16(v3925, v3927); + int16x8_t v3929 = vsubq_s16(v2728, v2739); + int16x8_t v3930 = vsubq_s16(v2750, v2761); + int16x8_t v3931_tmp = vqrdmulhq_n_s16(v3930, 9782); + int16x8_t v3931 = vmlaq_n_s16(v3931_tmp, v3930, 4); + int16x8_t v3932 = vaddq_s16(v3929, v3931); + int16x8_t v3933 = vsubq_s16(v2682, v2693); + int16x8_t v3934 = vsubq_s16(v2704, v2715); + int16x8_t v3935_tmp = vqrdmulhq_n_s16(v3934, 26282); + int16x8_t v3935 = vmlaq_n_s16(v3935_tmp, v3934, 4); + int16x8_t v3936 = vaddq_s16(v3933, v3935); + int16x8_t v3937 = vsubq_s16(v2600, v2623); + int16x8_t v3938 = vsubq_s16(v2646, v2669); + int16x8_t v3939_tmp = vqrdmulhq_n_s16(v3938, 14423); + int16x8_t v3939 = vmlaq_n_s16(v3939_tmp, v3938, 5); + int16x8_t v3940 = vaddq_s16(v3937, v3939); + int16x8_t v3941 = vsubq_s16(v2506, v2529); + int16x8_t v3942 = vsubq_s16(v2552, v2575); + int16x8_t v3943_tmp = vqrdmulhq_n_s16(v3942, 9008); + int16x8_t v3943 = vmlaq_n_s16(v3943_tmp, v3942, 6); + int16x8_t v3944 = vaddq_s16(v3941, v3943); + int16x8_t v3945 = vsubq_s16(v2411, v2434); + int16x8_t v3946 = vsubq_s16(v2457, v2481); + int16x8_t v3947_tmp = vqrdmulhq_n_s16(v3946, 13552); + int16x8_t v3947 = vmlaq_n_s16(v3947_tmp, v3946, 7); + int16x8_t v3948 = vaddq_s16(v3945, v3947); + int16x8_t v3949 = vsubq_s16(v2317, v2340); + int16x8_t v3950 = vsubq_s16(v2363, v2386); + int16x8_t v3951_tmp = vqrdmulhq_n_s16(v3950, 1925); + int16x8_t v3951 = vmlaq_n_s16(v3951_tmp, v3950, 9); + int16x8_t v3952 = vaddq_s16(v3949, v3951); + int16x8_t v3953 = vsubq_s16(v2151, v2198); + int16x8_t v3954 = vsubq_s16(v2245, v2292); + int16x8_t v3955_tmp = vqrdmulhq_n_s16(v3954, 21123); + int16x8_t v3955 = vmlaq_n_s16(v3955_tmp, v3954, 11); + int16x8_t v3956 = vaddq_s16(v3953, v3955); + int16x8_t v3957 = vsubq_s16(v1961, v2008); + int16x8_t v3958 = vsubq_s16(v2055, v2102); + int16x8_t v3959_tmp = vqrdmulhq_n_s16(v3958, 9831); + int16x8_t v3959 = vmlaq_n_s16(v3959_tmp, v3958, 16); + int16x8_t v3960 = vaddq_s16(v3957, v3959); + int16x8_t v3961 = vsubq_s16(v1627, v1722); + int16x8_t v3962 = vsubq_s16(v1817, v1912); + int16x8_t v3963_tmp = vqrdmulhq_n_s16(v3962, 5373); + int16x8_t v3963 = vmlaq_n_s16(v3963_tmp, v3962, 27); + int16x8_t v3964 = vaddq_s16(v3961, v3963); + int16x8_t v3965 = vsubq_s16(v317, v700); + int16x8_t v3966 = vsubq_s16(v1146, v1530); + int16x8_t v3967_tmp = vqrdmulhq_n_s16(v3966, 15986); + int16x8_t v3967 = vmlaq_n_s16(v3967_tmp, v3966, 81); + int16x8_t v3968 = vaddq_s16(v3965, v3967); + int16x8_t v3969 = vsubq_s16(v3965, v3967); + int16x8_t v3970 = vsubq_s16(v3961, v3963); + int16x8_t v3971 = vsubq_s16(v3957, v3959); + int16x8_t v3972 = vsubq_s16(v3953, v3955); + int16x8_t v3973 = vsubq_s16(v3949, v3951); + int16x8_t v3974 = vsubq_s16(v3945, v3947); + int16x8_t v3975 = vsubq_s16(v3941, v3943); + int16x8_t v3976 = vsubq_s16(v3937, v3939); + int16x8_t v3977 = vsubq_s16(v3933, v3935); + int16x8_t v3978 = vsubq_s16(v3929, v3931); + int16x8_t v3979 = vsubq_s16(v3925, v3927); + int16x8_t v3980 = vsubq_s16(v3921, v3923); + int16x8_t v3981 = vsubq_s16(v3917, v3919); + int16x8_t v3982 = vsubq_s16(v3913, v3915); + int16x8_t v3983 = vsubq_s16(v3909, v3911); + int16x8_t v3984 = vsubq_s16(v3905, v3907); + int16x8_t v3985 = vsubq_s16(v3901, v3903); + int16x8_t v3986 = vsubq_s16(v3897, v3899); + int16x8_t v3987 = vsubq_s16(v3893, v3895); + int16x8_t v3988 = vsubq_s16(v3889, v3891); + int16x8_t v3989 = vsubq_s16(v3885, v3887); + int16x8_t v3990 = vsubq_s16(v3881, v3883); + int16x8_t v3991 = vsubq_s16(v3877, v3879); + int16x8_t v3992 = vsubq_s16(v3873, v3875); + int16x8_t v3993 = vsubq_s16(v3869, v3871); + int16x8_t v3994 = vsubq_s16(v3865, v3867); + int16x8_t v3995 = vsubq_s16(v3861, v3863); + int16x8_t v3996 = vsubq_s16(v3857, v3859); + int16x8_t v3997 = vsubq_s16(v3853, v3855); + int16x8_t v3998 = vsubq_s16(v3849, v3851); + int16x8_t v3999 = vsubq_s16(v3845, v3847); + int16x8_t v4000 = vsubq_s16(v3841, v3843); + int16x8_t v4001 = vsubq_s16(v3837, v3839); + int16x8_t v4002 = vsubq_s16(v3833, v3835); + int16x8_t v4003 = vsubq_s16(v3829, v3831); + int16x8_t v4004 = vsubq_s16(v3825, v3827); + int16x8_t v4005 = vsubq_s16(v3821, v3823); + int16x8_t v4006 = vsubq_s16(v3817, v3819); + int16x8_t v4007 = vsubq_s16(v3813, v3815); + int16x8_t v4008 = vsubq_s16(v3809, v3811); + int16x8_t v4009 = vsubq_s16(v3805, v3807); + int16x8_t v4010 = vsubq_s16(v3801, v3803); + int16x8_t v4011 = vsubq_s16(v3797, v3799); + int16x8_t v4012 = vsubq_s16(v3793, v3795); + int16x8_t v4013 = vsubq_s16(v3789, v3791); + int16x8_t v4014 = vsubq_s16(v3785, v3787); + int16x8_t v4015 = vsubq_s16(v3781, v3783); + int16x8_t v4016 = vsubq_s16(v3777, v3779); + int16x8_t v4017 = vsubq_s16(v3773, v3775); + int16x8_t v4018 = vsubq_s16(v3769, v3771); + int16x8_t v4019 = vsubq_s16(v3765, v3767); + int16x8_t v4020 = vsubq_s16(v3761, v3763); + int16x8_t v4021 = vsubq_s16(v3757, v3759); + int16x8_t v4022 = vsubq_s16(v3753, v3755); + int16x8_t v4023 = vsubq_s16(v3749, v3751); + int16x8_t v4024 = vsubq_s16(v3745, v3747); + int16x8_t v4025 = vsubq_s16(v3741, v3743); + int16x8_t v4026 = vsubq_s16(v3737, v3739); + int16x8_t v4027 = vsubq_s16(v3733, v3735); + int16x8_t v4028 = vsubq_s16(v3729, v3731); + int16x8_t v4029 = vsubq_s16(v3725, v3727); + int16x8_t v4030 = vsubq_s16(v3721, v3723); + int16x8_t v4031 = vsubq_s16(v3717, v3719); + int16x8_t v4032 = vsubq_s16(v3713, v3715); + int16x8_t v4033 = vsubq_s16(v3706, v3711); + int16x8_t v4034 = vsubq_s16(v3696, v3701); + int16x8_t v4035 = vsubq_s16(v3686, v3691); + int16x8_t v4036 = vsubq_s16(v3676, v3681); + int16x8_t v4037 = vsubq_s16(v3666, v3671); + int16x8_t v4038 = vsubq_s16(v3656, v3661); + int16x8_t v4039 = vsubq_s16(v3646, v3651); + int16x8_t v4040 = vsubq_s16(v3636, v3641); + int16x8_t v4041 = vsubq_s16(v3626, v3631); + int16x8_t v4042 = vsubq_s16(v3616, v3621); + int16x8_t v4043 = vsubq_s16(v3606, v3611); + int16x8_t v4044 = vsubq_s16(v3596, v3601); + int16x8_t v4045 = vsubq_s16(v3586, v3591); + int16x8_t v4046 = vsubq_s16(v3576, v3581); + int16x8_t v4047 = vsubq_s16(v3566, v3571); + int16x8_t v4048 = vsubq_s16(v3556, v3561); + int16x8_t v4049 = vsubq_s16(v3546, v3551); + int16x8_t v4050 = vsubq_s16(v3536, v3541); + int16x8_t v4051 = vsubq_s16(v3526, v3531); + int16x8_t v4052 = vsubq_s16(v3516, v3521); + int16x8_t v4053 = vsubq_s16(v3506, v3511); + int16x8_t v4054 = vsubq_s16(v3496, v3501); + int16x8_t v4055 = vsubq_s16(v3486, v3491); + int16x8_t v4056 = vsubq_s16(v3476, v3481); + int16x8_t v4057 = vsubq_s16(v3466, v3471); + int16x8_t v4058 = vsubq_s16(v3456, v3461); + int16x8_t v4059 = vsubq_s16(v3446, v3451); + int16x8_t v4060 = vsubq_s16(v3436, v3441); + int16x8_t v4061 = vsubq_s16(v3426, v3431); + int16x8_t v4062 = vsubq_s16(v3416, v3421); + int16x8_t v4063 = vsubq_s16(v3406, v3411); + int16x8_t v4064 = vsubq_s16(v3396, v3401); + int16x8_t v4065 = vsubq_s16(v3380, v3391); + int16x8_t v4066 = vsubq_s16(v3358, v3369); + int16x8_t v4067 = vsubq_s16(v3336, v3347); + int16x8_t v4068 = vsubq_s16(v3314, v3325); + int16x8_t v4069 = vsubq_s16(v3292, v3303); + int16x8_t v4070 = vsubq_s16(v3270, v3281); + int16x8_t v4071 = vsubq_s16(v3248, v3259); + int16x8_t v4072 = vsubq_s16(v3226, v3237); + int16x8_t v4073 = vsubq_s16(v3204, v3215); + int16x8_t v4074 = vsubq_s16(v3182, v3193); + int16x8_t v4075 = vsubq_s16(v3160, v3171); + int16x8_t v4076 = vsubq_s16(v3138, v3149); + int16x8_t v4077 = vsubq_s16(v3116, v3127); + int16x8_t v4078 = vsubq_s16(v3094, v3105); + int16x8_t v4079 = vsubq_s16(v3072, v3083); + int16x8_t v4080 = vsubq_s16(v3050, v3061); + int16x8_t v4081 = vsubq_s16(v3016, v3039); + int16x8_t v4082 = vsubq_s16(v2970, v2993); + int16x8_t v4083 = vsubq_s16(v2924, v2947); + int16x8_t v4084 = vsubq_s16(v2878, v2901); + int16x8_t v4085 = vsubq_s16(v2832, v2855); + int16x8_t v4086 = vsubq_s16(v2786, v2809); + int16x8_t v4087 = vsubq_s16(v2740, v2763); + int16x8_t v4088 = vsubq_s16(v2694, v2717); + int16x8_t v4089 = vsubq_s16(v2624, v2671); + int16x8_t v4090 = vsubq_s16(v2530, v2577); + int16x8_t v4091 = vsubq_s16(v2435, v2483); + int16x8_t v4092 = vsubq_s16(v2341, v2388); + int16x8_t v4093 = vsubq_s16(v2199, v2294); + int16x8_t v4094 = vsubq_s16(v2009, v2104); + int16x8_t v4095 = vsubq_s16(v1723, v1914); + int16x8_t v4096 = vsubq_s16(v701, v1532); + vst1q_s16(out + out_stride * 0 + i, v1533); + vst1q_s16(out + out_stride * 1 + i, v1915); + vst1q_s16(out + out_stride * 2 + i, v2105); + vst1q_s16(out + out_stride * 3 + i, v2295); + vst1q_s16(out + out_stride * 4 + i, v2389); + vst1q_s16(out + out_stride * 5 + i, v2484); + vst1q_s16(out + out_stride * 6 + i, v2578); + vst1q_s16(out + out_stride * 7 + i, v2672); + vst1q_s16(out + out_stride * 8 + i, v2718); + vst1q_s16(out + out_stride * 9 + i, v2764); + vst1q_s16(out + out_stride * 10 + i, v2810); + vst1q_s16(out + out_stride * 11 + i, v2856); + vst1q_s16(out + out_stride * 12 + i, v2902); + vst1q_s16(out + out_stride * 13 + i, v2948); + vst1q_s16(out + out_stride * 14 + i, v2994); + vst1q_s16(out + out_stride * 15 + i, v3040); + vst1q_s16(out + out_stride * 16 + i, v3062); + vst1q_s16(out + out_stride * 17 + i, v3084); + vst1q_s16(out + out_stride * 18 + i, v3106); + vst1q_s16(out + out_stride * 19 + i, v3128); + vst1q_s16(out + out_stride * 20 + i, v3150); + vst1q_s16(out + out_stride * 21 + i, v3172); + vst1q_s16(out + out_stride * 22 + i, v3194); + vst1q_s16(out + out_stride * 23 + i, v3216); + vst1q_s16(out + out_stride * 24 + i, v3238); + vst1q_s16(out + out_stride * 25 + i, v3260); + vst1q_s16(out + out_stride * 26 + i, v3282); + vst1q_s16(out + out_stride * 27 + i, v3304); + vst1q_s16(out + out_stride * 28 + i, v3326); + vst1q_s16(out + out_stride * 29 + i, v3348); + vst1q_s16(out + out_stride * 30 + i, v3370); + vst1q_s16(out + out_stride * 31 + i, v3392); + vst1q_s16(out + out_stride * 32 + i, v3402); + vst1q_s16(out + out_stride * 33 + i, v3412); + vst1q_s16(out + out_stride * 34 + i, v3422); + vst1q_s16(out + out_stride * 35 + i, v3432); + vst1q_s16(out + out_stride * 36 + i, v3442); + vst1q_s16(out + out_stride * 37 + i, v3452); + vst1q_s16(out + out_stride * 38 + i, v3462); + vst1q_s16(out + out_stride * 39 + i, v3472); + vst1q_s16(out + out_stride * 40 + i, v3482); + vst1q_s16(out + out_stride * 41 + i, v3492); + vst1q_s16(out + out_stride * 42 + i, v3502); + vst1q_s16(out + out_stride * 43 + i, v3512); + vst1q_s16(out + out_stride * 44 + i, v3522); + vst1q_s16(out + out_stride * 45 + i, v3532); + vst1q_s16(out + out_stride * 46 + i, v3542); + vst1q_s16(out + out_stride * 47 + i, v3552); + vst1q_s16(out + out_stride * 48 + i, v3562); + vst1q_s16(out + out_stride * 49 + i, v3572); + vst1q_s16(out + out_stride * 50 + i, v3582); + vst1q_s16(out + out_stride * 51 + i, v3592); + vst1q_s16(out + out_stride * 52 + i, v3602); + vst1q_s16(out + out_stride * 53 + i, v3612); + vst1q_s16(out + out_stride * 54 + i, v3622); + vst1q_s16(out + out_stride * 55 + i, v3632); + vst1q_s16(out + out_stride * 56 + i, v3642); + vst1q_s16(out + out_stride * 57 + i, v3652); + vst1q_s16(out + out_stride * 58 + i, v3662); + vst1q_s16(out + out_stride * 59 + i, v3672); + vst1q_s16(out + out_stride * 60 + i, v3682); + vst1q_s16(out + out_stride * 61 + i, v3692); + vst1q_s16(out + out_stride * 62 + i, v3702); + vst1q_s16(out + out_stride * 63 + i, v3712); + vst1q_s16(out + out_stride * 64 + i, v3716); + vst1q_s16(out + out_stride * 65 + i, v3720); + vst1q_s16(out + out_stride * 66 + i, v3724); + vst1q_s16(out + out_stride * 67 + i, v3728); + vst1q_s16(out + out_stride * 68 + i, v3732); + vst1q_s16(out + out_stride * 69 + i, v3736); + vst1q_s16(out + out_stride * 70 + i, v3740); + vst1q_s16(out + out_stride * 71 + i, v3744); + vst1q_s16(out + out_stride * 72 + i, v3748); + vst1q_s16(out + out_stride * 73 + i, v3752); + vst1q_s16(out + out_stride * 74 + i, v3756); + vst1q_s16(out + out_stride * 75 + i, v3760); + vst1q_s16(out + out_stride * 76 + i, v3764); + vst1q_s16(out + out_stride * 77 + i, v3768); + vst1q_s16(out + out_stride * 78 + i, v3772); + vst1q_s16(out + out_stride * 79 + i, v3776); + vst1q_s16(out + out_stride * 80 + i, v3780); + vst1q_s16(out + out_stride * 81 + i, v3784); + vst1q_s16(out + out_stride * 82 + i, v3788); + vst1q_s16(out + out_stride * 83 + i, v3792); + vst1q_s16(out + out_stride * 84 + i, v3796); + vst1q_s16(out + out_stride * 85 + i, v3800); + vst1q_s16(out + out_stride * 86 + i, v3804); + vst1q_s16(out + out_stride * 87 + i, v3808); + vst1q_s16(out + out_stride * 88 + i, v3812); + vst1q_s16(out + out_stride * 89 + i, v3816); + vst1q_s16(out + out_stride * 90 + i, v3820); + vst1q_s16(out + out_stride * 91 + i, v3824); + vst1q_s16(out + out_stride * 92 + i, v3828); + vst1q_s16(out + out_stride * 93 + i, v3832); + vst1q_s16(out + out_stride * 94 + i, v3836); + vst1q_s16(out + out_stride * 95 + i, v3840); + vst1q_s16(out + out_stride * 96 + i, v3844); + vst1q_s16(out + out_stride * 97 + i, v3848); + vst1q_s16(out + out_stride * 98 + i, v3852); + vst1q_s16(out + out_stride * 99 + i, v3856); + vst1q_s16(out + out_stride * 100 + i, v3860); + vst1q_s16(out + out_stride * 101 + i, v3864); + vst1q_s16(out + out_stride * 102 + i, v3868); + vst1q_s16(out + out_stride * 103 + i, v3872); + vst1q_s16(out + out_stride * 104 + i, v3876); + vst1q_s16(out + out_stride * 105 + i, v3880); + vst1q_s16(out + out_stride * 106 + i, v3884); + vst1q_s16(out + out_stride * 107 + i, v3888); + vst1q_s16(out + out_stride * 108 + i, v3892); + vst1q_s16(out + out_stride * 109 + i, v3896); + vst1q_s16(out + out_stride * 110 + i, v3900); + vst1q_s16(out + out_stride * 111 + i, v3904); + vst1q_s16(out + out_stride * 112 + i, v3908); + vst1q_s16(out + out_stride * 113 + i, v3912); + vst1q_s16(out + out_stride * 114 + i, v3916); + vst1q_s16(out + out_stride * 115 + i, v3920); + vst1q_s16(out + out_stride * 116 + i, v3924); + vst1q_s16(out + out_stride * 117 + i, v3928); + vst1q_s16(out + out_stride * 118 + i, v3932); + vst1q_s16(out + out_stride * 119 + i, v3936); + vst1q_s16(out + out_stride * 120 + i, v3940); + vst1q_s16(out + out_stride * 121 + i, v3944); + vst1q_s16(out + out_stride * 122 + i, v3948); + vst1q_s16(out + out_stride * 123 + i, v3952); + vst1q_s16(out + out_stride * 124 + i, v3956); + vst1q_s16(out + out_stride * 125 + i, v3960); + vst1q_s16(out + out_stride * 126 + i, v3964); + vst1q_s16(out + out_stride * 127 + i, v3968); + vst1q_s16(out + out_stride * 128 + i, v3969); + vst1q_s16(out + out_stride * 129 + i, v3970); + vst1q_s16(out + out_stride * 130 + i, v3971); + vst1q_s16(out + out_stride * 131 + i, v3972); + vst1q_s16(out + out_stride * 132 + i, v3973); + vst1q_s16(out + out_stride * 133 + i, v3974); + vst1q_s16(out + out_stride * 134 + i, v3975); + vst1q_s16(out + out_stride * 135 + i, v3976); + vst1q_s16(out + out_stride * 136 + i, v3977); + vst1q_s16(out + out_stride * 137 + i, v3978); + vst1q_s16(out + out_stride * 138 + i, v3979); + vst1q_s16(out + out_stride * 139 + i, v3980); + vst1q_s16(out + out_stride * 140 + i, v3981); + vst1q_s16(out + out_stride * 141 + i, v3982); + vst1q_s16(out + out_stride * 142 + i, v3983); + vst1q_s16(out + out_stride * 143 + i, v3984); + vst1q_s16(out + out_stride * 144 + i, v3985); + vst1q_s16(out + out_stride * 145 + i, v3986); + vst1q_s16(out + out_stride * 146 + i, v3987); + vst1q_s16(out + out_stride * 147 + i, v3988); + vst1q_s16(out + out_stride * 148 + i, v3989); + vst1q_s16(out + out_stride * 149 + i, v3990); + vst1q_s16(out + out_stride * 150 + i, v3991); + vst1q_s16(out + out_stride * 151 + i, v3992); + vst1q_s16(out + out_stride * 152 + i, v3993); + vst1q_s16(out + out_stride * 153 + i, v3994); + vst1q_s16(out + out_stride * 154 + i, v3995); + vst1q_s16(out + out_stride * 155 + i, v3996); + vst1q_s16(out + out_stride * 156 + i, v3997); + vst1q_s16(out + out_stride * 157 + i, v3998); + vst1q_s16(out + out_stride * 158 + i, v3999); + vst1q_s16(out + out_stride * 159 + i, v4000); + vst1q_s16(out + out_stride * 160 + i, v4001); + vst1q_s16(out + out_stride * 161 + i, v4002); + vst1q_s16(out + out_stride * 162 + i, v4003); + vst1q_s16(out + out_stride * 163 + i, v4004); + vst1q_s16(out + out_stride * 164 + i, v4005); + vst1q_s16(out + out_stride * 165 + i, v4006); + vst1q_s16(out + out_stride * 166 + i, v4007); + vst1q_s16(out + out_stride * 167 + i, v4008); + vst1q_s16(out + out_stride * 168 + i, v4009); + vst1q_s16(out + out_stride * 169 + i, v4010); + vst1q_s16(out + out_stride * 170 + i, v4011); + vst1q_s16(out + out_stride * 171 + i, v4012); + vst1q_s16(out + out_stride * 172 + i, v4013); + vst1q_s16(out + out_stride * 173 + i, v4014); + vst1q_s16(out + out_stride * 174 + i, v4015); + vst1q_s16(out + out_stride * 175 + i, v4016); + vst1q_s16(out + out_stride * 176 + i, v4017); + vst1q_s16(out + out_stride * 177 + i, v4018); + vst1q_s16(out + out_stride * 178 + i, v4019); + vst1q_s16(out + out_stride * 179 + i, v4020); + vst1q_s16(out + out_stride * 180 + i, v4021); + vst1q_s16(out + out_stride * 181 + i, v4022); + vst1q_s16(out + out_stride * 182 + i, v4023); + vst1q_s16(out + out_stride * 183 + i, v4024); + vst1q_s16(out + out_stride * 184 + i, v4025); + vst1q_s16(out + out_stride * 185 + i, v4026); + vst1q_s16(out + out_stride * 186 + i, v4027); + vst1q_s16(out + out_stride * 187 + i, v4028); + vst1q_s16(out + out_stride * 188 + i, v4029); + vst1q_s16(out + out_stride * 189 + i, v4030); + vst1q_s16(out + out_stride * 190 + i, v4031); + vst1q_s16(out + out_stride * 191 + i, v4032); + vst1q_s16(out + out_stride * 192 + i, v4033); + vst1q_s16(out + out_stride * 193 + i, v4034); + vst1q_s16(out + out_stride * 194 + i, v4035); + vst1q_s16(out + out_stride * 195 + i, v4036); + vst1q_s16(out + out_stride * 196 + i, v4037); + vst1q_s16(out + out_stride * 197 + i, v4038); + vst1q_s16(out + out_stride * 198 + i, v4039); + vst1q_s16(out + out_stride * 199 + i, v4040); + vst1q_s16(out + out_stride * 200 + i, v4041); + vst1q_s16(out + out_stride * 201 + i, v4042); + vst1q_s16(out + out_stride * 202 + i, v4043); + vst1q_s16(out + out_stride * 203 + i, v4044); + vst1q_s16(out + out_stride * 204 + i, v4045); + vst1q_s16(out + out_stride * 205 + i, v4046); + vst1q_s16(out + out_stride * 206 + i, v4047); + vst1q_s16(out + out_stride * 207 + i, v4048); + vst1q_s16(out + out_stride * 208 + i, v4049); + vst1q_s16(out + out_stride * 209 + i, v4050); + vst1q_s16(out + out_stride * 210 + i, v4051); + vst1q_s16(out + out_stride * 211 + i, v4052); + vst1q_s16(out + out_stride * 212 + i, v4053); + vst1q_s16(out + out_stride * 213 + i, v4054); + vst1q_s16(out + out_stride * 214 + i, v4055); + vst1q_s16(out + out_stride * 215 + i, v4056); + vst1q_s16(out + out_stride * 216 + i, v4057); + vst1q_s16(out + out_stride * 217 + i, v4058); + vst1q_s16(out + out_stride * 218 + i, v4059); + vst1q_s16(out + out_stride * 219 + i, v4060); + vst1q_s16(out + out_stride * 220 + i, v4061); + vst1q_s16(out + out_stride * 221 + i, v4062); + vst1q_s16(out + out_stride * 222 + i, v4063); + vst1q_s16(out + out_stride * 223 + i, v4064); + vst1q_s16(out + out_stride * 224 + i, v4065); + vst1q_s16(out + out_stride * 225 + i, v4066); + vst1q_s16(out + out_stride * 226 + i, v4067); + vst1q_s16(out + out_stride * 227 + i, v4068); + vst1q_s16(out + out_stride * 228 + i, v4069); + vst1q_s16(out + out_stride * 229 + i, v4070); + vst1q_s16(out + out_stride * 230 + i, v4071); + vst1q_s16(out + out_stride * 231 + i, v4072); + vst1q_s16(out + out_stride * 232 + i, v4073); + vst1q_s16(out + out_stride * 233 + i, v4074); + vst1q_s16(out + out_stride * 234 + i, v4075); + vst1q_s16(out + out_stride * 235 + i, v4076); + vst1q_s16(out + out_stride * 236 + i, v4077); + vst1q_s16(out + out_stride * 237 + i, v4078); + vst1q_s16(out + out_stride * 238 + i, v4079); + vst1q_s16(out + out_stride * 239 + i, v4080); + vst1q_s16(out + out_stride * 240 + i, v4081); + vst1q_s16(out + out_stride * 241 + i, v4082); + vst1q_s16(out + out_stride * 242 + i, v4083); + vst1q_s16(out + out_stride * 243 + i, v4084); + vst1q_s16(out + out_stride * 244 + i, v4085); + vst1q_s16(out + out_stride * 245 + i, v4086); + vst1q_s16(out + out_stride * 246 + i, v4087); + vst1q_s16(out + out_stride * 247 + i, v4088); + vst1q_s16(out + out_stride * 248 + i, v4089); + vst1q_s16(out + out_stride * 249 + i, v4090); + vst1q_s16(out + out_stride * 250 + i, v4091); + vst1q_s16(out + out_stride * 251 + i, v4092); + vst1q_s16(out + out_stride * 252 + i, v4093); + vst1q_s16(out + out_stride * 253 + i, v4094); + vst1q_s16(out + out_stride * 254 + i, v4095); + vst1q_s16(out + out_stride * 255 + i, v4096); + } +} diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h new file mode 100644 index 0000000000..0f3b31cfea --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h @@ -0,0 +1,419 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* This file is automatically generated. Do not modify it directly. */ +#if HWY_TARGET != HWY_NEON +#error "only include this file from fast_dct-inl.h" +#endif + +constexpr size_t FastIDCTIntegerBits(FastDCTTag<32>) { return 1; } + +void FastIDCT(FastDCTTag<32>, const int16_t* in, size_t in_stride, int16_t* out, + size_t out_stride, size_t count) { + JXL_ASSERT(count % 8 == 0); + for (size_t i = 0; i < count; i += 8) { + int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); + int16x8_t v1 = vld1q_s16(in + in_stride * 16 + i); + int16x8_t v2 = vaddq_s16(v0, v1); + int16x8_t v3 = vld1q_s16(in + in_stride * 8 + i); + int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); + int16x8_t v4 = vaddq_s16(v4_tmp, v3); + int16x8_t v5 = vld1q_s16(in + in_stride * 24 + i); + int16x8_t v6 = vaddq_s16(v5, v3); + int16x8_t v7 = vaddq_s16(v4, v6); + int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); + int16x8_t v9 = vaddq_s16(v2, v8); + int16x8_t v10 = vld1q_s16(in + in_stride * 4 + i); + int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); + int16x8_t v11 = vaddq_s16(v11_tmp, v10); + int16x8_t v12 = vld1q_s16(in + in_stride * 20 + i); + int16x8_t v13 = vld1q_s16(in + in_stride * 12 + i); + int16x8_t v14 = vaddq_s16(v12, v13); + int16x8_t v15 = vaddq_s16(v11, v14); + int16x8_t v16 = vld1q_s16(in + in_stride * 28 + i); + int16x8_t v17 = vaddq_s16(v16, v12); + int16x8_t v18 = vaddq_s16(v13, v10); + int16x8_t v19 = vaddq_s16(v17, v18); + int16x8_t v20 = vqrdmulhq_n_s16(v19, 17734); + int16x8_t v21 = vqrdmulhq_n_s16(v18, 25080); + int16x8_t v22 = vaddq_s16(v20, v21); + int16x8_t v23 = vaddq_s16(v15, v22); + int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); + int16x8_t v25 = vaddq_s16(v9, v24); + int16x8_t v26 = vld1q_s16(in + in_stride * 2 + i); + int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573); + int16x8_t v27 = vaddq_s16(v27_tmp, v26); + int16x8_t v28 = vld1q_s16(in + in_stride * 18 + i); + int16x8_t v29 = vld1q_s16(in + in_stride * 14 + i); + int16x8_t v30 = vaddq_s16(v28, v29); + int16x8_t v31 = vaddq_s16(v27, v30); + int16x8_t v32 = vld1q_s16(in + in_stride * 10 + i); + int16x8_t v33 = vld1q_s16(in + in_stride * 6 + i); + int16x8_t v34 = vaddq_s16(v32, v33); + int16x8_t v35 = vqrdmulhq_n_s16(v34, 25080); + int16x8_t v36 = vld1q_s16(in + in_stride * 26 + i); + int16x8_t v37 = vld1q_s16(in + in_stride * 22 + i); + int16x8_t v38 = vaddq_s16(v36, v37); + int16x8_t v39 = vaddq_s16(v38, v34); + int16x8_t v40 = vqrdmulhq_n_s16(v39, 17734); + int16x8_t v41 = vaddq_s16(v35, v40); + int16x8_t v42 = vaddq_s16(v31, v41); + int16x8_t v43 = vaddq_s16(v33, v26); + int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573); + int16x8_t v44 = vaddq_s16(v44_tmp, v43); + int16x8_t v45 = vaddq_s16(v29, v32); + int16x8_t v46 = vaddq_s16(v37, v28); + int16x8_t v47 = vaddq_s16(v45, v46); + int16x8_t v48 = vaddq_s16(v44, v47); + int16x8_t v49 = vqrdmulhq_n_s16(v48, 16705); + int16x8_t v50 = vld1q_s16(in + in_stride * 30 + i); + int16x8_t v51 = vaddq_s16(v50, v36); + int16x8_t v52 = vaddq_s16(v51, v46); + int16x8_t v53 = vqrdmulhq_n_s16(v52, 17734); + int16x8_t v54 = vaddq_s16(v45, v43); + int16x8_t v55_tmp = vqrdmulhq_n_s16(v54, 10045); + int16x8_t v55 = vaddq_s16(v55_tmp, v54); + int16x8_t v56 = vaddq_s16(v53, v55); + int16x8_t v57 = vqrdmulhq_n_s16(v56, 16705); + int16x8_t v58 = vaddq_s16(v49, v57); + int16x8_t v59 = vaddq_s16(v42, v58); + int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463); + int16x8_t v61 = vaddq_s16(v25, v60); + int16x8_t v62 = vld1q_s16(in + in_stride * 13 + i); + int16x8_t v63 = vld1q_s16(in + in_stride * 11 + i); + int16x8_t v64 = vaddq_s16(v62, v63); + int16x8_t v65 = vld1q_s16(in + in_stride * 5 + i); + int16x8_t v66 = vld1q_s16(in + in_stride * 3 + i); + int16x8_t v67 = vaddq_s16(v65, v66); + int16x8_t v68 = vaddq_s16(v64, v67); + int16x8_t v69_tmp = vqrdmulhq_n_s16(v68, 10045); + int16x8_t v69 = vaddq_s16(v69_tmp, v68); + int16x8_t v70 = vld1q_s16(in + in_stride * 21 + i); + int16x8_t v71 = vld1q_s16(in + in_stride * 19 + i); + int16x8_t v72 = vaddq_s16(v70, v71); + int16x8_t v73 = vld1q_s16(in + in_stride * 29 + i); + int16x8_t v74 = vld1q_s16(in + in_stride * 27 + i); + int16x8_t v75 = vaddq_s16(v73, v74); + int16x8_t v76 = vaddq_s16(v72, v75); + int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734); + int16x8_t v78 = vaddq_s16(v69, v77); + int16x8_t v79 = vqrdmulhq_n_s16(v78, 16705); + int16x8_t v80_tmp = vqrdmulhq_n_s16(v67, 13573); + int16x8_t v80 = vaddq_s16(v80_tmp, v67); + int16x8_t v81 = vaddq_s16(v64, v72); + int16x8_t v82 = vaddq_s16(v80, v81); + int16x8_t v83 = vqrdmulhq_n_s16(v82, 16705); + int16x8_t v84 = vaddq_s16(v79, v83); + int16x8_t v85 = vld1q_s16(in + in_stride * 1 + i); + int16x8_t v86_tmp = vqrdmulhq_n_s16(v85, 13573); + int16x8_t v86 = vaddq_s16(v86_tmp, v85); + int16x8_t v87 = vld1q_s16(in + in_stride * 17 + i); + int16x8_t v88 = vld1q_s16(in + in_stride * 15 + i); + int16x8_t v89 = vaddq_s16(v87, v88); + int16x8_t v90 = vaddq_s16(v86, v89); + int16x8_t v91 = vld1q_s16(in + in_stride * 9 + i); + int16x8_t v92 = vld1q_s16(in + in_stride * 7 + i); + int16x8_t v93 = vaddq_s16(v91, v92); + int16x8_t v94 = vqrdmulhq_n_s16(v93, 25080); + int16x8_t v95 = vld1q_s16(in + in_stride * 25 + i); + int16x8_t v96 = vld1q_s16(in + in_stride * 23 + i); + int16x8_t v97 = vaddq_s16(v95, v96); + int16x8_t v98 = vaddq_s16(v97, v93); + int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734); + int16x8_t v100 = vaddq_s16(v94, v99); + int16x8_t v101 = vaddq_s16(v90, v100); + int16x8_t v102 = vaddq_s16(v84, v101); + int16x8_t v103 = vaddq_s16(v92, v65); + int16x8_t v104 = vaddq_s16(v66, v85); + int16x8_t v105 = vaddq_s16(v103, v104); + int16x8_t v106_tmp = vqrdmulhq_n_s16(v105, 13573); + int16x8_t v106 = vaddq_s16(v106_tmp, v105); + int16x8_t v107 = vaddq_s16(v96, v70); + int16x8_t v108 = vaddq_s16(v71, v87); + int16x8_t v109 = vaddq_s16(v107, v108); + int16x8_t v110 = vaddq_s16(v63, v91); + int16x8_t v111 = vaddq_s16(v88, v62); + int16x8_t v112 = vaddq_s16(v110, v111); + int16x8_t v113 = vaddq_s16(v109, v112); + int16x8_t v114 = vaddq_s16(v106, v113); + int16x8_t v115 = vqrdmulhq_n_s16(v114, 16705); + int16x8_t v116 = vaddq_s16(v112, v105); + int16x8_t v117 = vqrdmulhq_n_s16(v116, 25080); + int16x8_t v118 = vqrdmulhq_n_s16(v116, 17734); + int16x8_t v119 = vaddq_s16(v74, v95); + int16x8_t v120 = vld1q_s16(in + in_stride * 31 + i); + int16x8_t v121 = vaddq_s16(v120, v73); + int16x8_t v122 = vaddq_s16(v119, v121); + int16x8_t v123 = vaddq_s16(v122, v109); + int16x8_t v124 = vqrdmulhq_n_s16(v123, 17734); + int16x8_t v125 = vaddq_s16(v118, v124); + int16x8_t v126 = vaddq_s16(v117, v125); + int16x8_t v127 = vqrdmulhq_n_s16(v126, 16705); + int16x8_t v128 = vaddq_s16(v115, v127); + int16x8_t v129 = vqrdmulhq_n_s16(v128, 16463); + int16x8_t v130_tmp = vqrdmulhq_n_s16(v104, 13573); + int16x8_t v130 = vaddq_s16(v130_tmp, v104); + int16x8_t v131 = vaddq_s16(v108, v111); + int16x8_t v132 = vaddq_s16(v130, v131); + int16x8_t v133 = vaddq_s16(v119, v107); + int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734); + int16x8_t v135 = vaddq_s16(v110, v103); + int16x8_t v136_tmp = vqrdmulhq_n_s16(v135, 10045); + int16x8_t v136 = vaddq_s16(v136_tmp, v135); + int16x8_t v137 = vaddq_s16(v134, v136); + int16x8_t v138 = vaddq_s16(v132, v137); + int16x8_t v139 = vqrdmulhq_n_s16(v138, 16463); + int16x8_t v140 = vaddq_s16(v129, v139); + int16x8_t v141 = vaddq_s16(v102, v140); + int16x8_t v142 = vqrdmulhq_n_s16(v141, 16404); + int16x8_t v143 = vaddq_s16(v61, v142); + int16x8_t v144 = vsubq_s16(v0, v1); + int16x8_t v145 = vsubq_s16(v4, v6); + int16x8_t v146_tmp = vqrdmulhq_n_s16(v145, 10045); + int16x8_t v146 = vaddq_s16(v146_tmp, v145); + int16x8_t v147 = vaddq_s16(v144, v146); + int16x8_t v148 = vsubq_s16(v11, v14); + int16x8_t v149 = vqrdmulhq_n_s16(v18, 17734); + int16x8_t v150_tmp = vqrdmulhq_n_s16(v17, 10045); + int16x8_t v150 = vaddq_s16(v150_tmp, v17); + int16x8_t v151 = vsubq_s16(v149, v150); + int16x8_t v152 = vaddq_s16(v148, v151); + int16x8_t v153 = vqrdmulhq_n_s16(v152, 19705); + int16x8_t v154 = vaddq_s16(v147, v153); + int16x8_t v155 = vsubq_s16(v27, v30); + int16x8_t v156 = vqrdmulhq_n_s16(v34, 17734); + int16x8_t v157_tmp = vqrdmulhq_n_s16(v38, 10045); + int16x8_t v157 = vaddq_s16(v157_tmp, v38); + int16x8_t v158 = vsubq_s16(v156, v157); + int16x8_t v159 = vaddq_s16(v155, v158); + int16x8_t v160 = vqrdmulhq_n_s16(v54, 13573); + int16x8_t v161 = vsubq_s16(v160, v52); + int16x8_t v162 = vqrdmulhq_n_s16(v161, 25746); + int16x8_t v163 = vsubq_s16(v44, v47); + int16x8_t v164 = vqrdmulhq_n_s16(v163, 19705); + int16x8_t v165 = vaddq_s16(v162, v164); + int16x8_t v166 = vaddq_s16(v159, v165); + int16x8_t v167 = vqrdmulhq_n_s16(v166, 17121); + int16x8_t v168 = vaddq_s16(v154, v167); + int16x8_t v169 = vsubq_s16(v86, v89); + int16x8_t v170 = vqrdmulhq_n_s16(v93, 17734); + int16x8_t v171_tmp = vqrdmulhq_n_s16(v97, 10045); + int16x8_t v171 = vaddq_s16(v171_tmp, v97); + int16x8_t v172 = vsubq_s16(v170, v171); + int16x8_t v173 = vaddq_s16(v169, v172); + int16x8_t v174 = vsubq_s16(v80, v81); + int16x8_t v175 = vqrdmulhq_n_s16(v174, 19705); + int16x8_t v176 = vqrdmulhq_n_s16(v68, 13573); + int16x8_t v177 = vsubq_s16(v176, v76); + int16x8_t v178 = vqrdmulhq_n_s16(v177, 25746); + int16x8_t v179 = vaddq_s16(v175, v178); + int16x8_t v180 = vaddq_s16(v173, v179); + int16x8_t v181 = vsubq_s16(v130, v131); + int16x8_t v182 = vqrdmulhq_n_s16(v135, 13573); + int16x8_t v183 = vsubq_s16(v182, v133); + int16x8_t v184_tmp = vqrdmulhq_n_s16(v183, 10045); + int16x8_t v184 = vaddq_s16(v184_tmp, v183); + int16x8_t v185 = vaddq_s16(v181, v184); + int16x8_t v186 = vqrdmulhq_n_s16(v185, 17121); + int16x8_t v187 = vqrdmulhq_n_s16(v105, 27867); + int16x8_t v188 = vqrdmulhq_n_s16(v113, 19705); + int16x8_t v189 = vsubq_s16(v187, v188); + int16x8_t v190 = vqrdmulhq_n_s16(v116, 13573); + int16x8_t v191 = vsubq_s16(v190, v123); + int16x8_t v192 = vqrdmulhq_n_s16(v191, 25746); + int16x8_t v193 = vaddq_s16(v189, v192); + int16x8_t v194 = vqrdmulhq_n_s16(v193, 17121); + int16x8_t v195 = vaddq_s16(v186, v194); + int16x8_t v196 = vaddq_s16(v180, v195); + int16x8_t v197 = vqrdmulhq_n_s16(v196, 16563); + int16x8_t v198 = vaddq_s16(v168, v197); + int16x8_t v199 = vsubq_s16(v144, v146); + int16x8_t v200 = vsubq_s16(v148, v151); + int16x8_t v201 = vqrdmulhq_n_s16(v200, 29490); + int16x8_t v202 = vaddq_s16(v199, v201); + int16x8_t v203 = vsubq_s16(v155, v158); + int16x8_t v204 = vqrdmulhq_n_s16(v163, 29490); + int16x8_t v205_tmp = vqrdmulhq_n_s16(v161, 5763); + int16x8_t v205 = vaddq_s16(v205_tmp, v161); + int16x8_t v206 = vsubq_s16(v204, v205); + int16x8_t v207 = vaddq_s16(v203, v206); + int16x8_t v208 = vqrdmulhq_n_s16(v207, 18578); + int16x8_t v209 = vaddq_s16(v202, v208); + int16x8_t v210 = vsubq_s16(v169, v172); + int16x8_t v211 = vqrdmulhq_n_s16(v174, 29490); + int16x8_t v212_tmp = vqrdmulhq_n_s16(v177, 5763); + int16x8_t v212 = vaddq_s16(v212_tmp, v177); + int16x8_t v213 = vsubq_s16(v211, v212); + int16x8_t v214 = vaddq_s16(v210, v213); + int16x8_t v215 = vsubq_s16(v181, v184); + int16x8_t v216 = vqrdmulhq_n_s16(v215, 18578); + int16x8_t v217 = vqrdmulhq_n_s16(v189, 27803); + int16x8_t v218 = vqrdmulhq_n_s16(v191, 21845); + int16x8_t v219 = vsubq_s16(v217, v218); + int16x8_t v220 = vaddq_s16(v216, v219); + int16x8_t v221 = vaddq_s16(v214, v220); + int16x8_t v222 = vqrdmulhq_n_s16(v221, 16890); + int16x8_t v223 = vaddq_s16(v209, v222); + int16x8_t v224 = vsubq_s16(v2, v8); + int16x8_t v225 = vsubq_s16(v15, v22); + int16x8_t v226_tmp = vqrdmulhq_n_s16(v225, 18446); + int16x8_t v226 = vmlaq_n_s16(v226_tmp, v225, 2); + int16x8_t v227 = vaddq_s16(v224, v226); + int16x8_t v228 = vsubq_s16(v31, v41); + int16x8_t v229 = vsubq_s16(v48, v56); + int16x8_t v230_tmp = vqrdmulhq_n_s16(v229, 18446); + int16x8_t v230 = vmlaq_n_s16(v230_tmp, v229, 2); + int16x8_t v231 = vaddq_s16(v228, v230); + int16x8_t v232 = vqrdmulhq_n_s16(v231, 21195); + int16x8_t v233 = vaddq_s16(v227, v232); + int16x8_t v234 = vsubq_s16(v82, v78); + int16x8_t v235_tmp = vqrdmulhq_n_s16(v234, 18446); + int16x8_t v235 = vmlaq_n_s16(v235_tmp, v234, 2); + int16x8_t v236 = vsubq_s16(v90, v100); + int16x8_t v237 = vaddq_s16(v235, v236); + int16x8_t v238 = vsubq_s16(v132, v137); + int16x8_t v239 = vsubq_s16(v114, v126); + int16x8_t v240_tmp = vqrdmulhq_n_s16(v239, 18446); + int16x8_t v240 = vmlaq_n_s16(v240_tmp, v239, 2); + int16x8_t v241 = vaddq_s16(v238, v240); + int16x8_t v242 = vqrdmulhq_n_s16(v241, 21195); + int16x8_t v243 = vaddq_s16(v237, v242); + int16x8_t v244 = vqrdmulhq_n_s16(v243, 17401); + int16x8_t v245 = vaddq_s16(v233, v244); + int16x8_t v246 = vsubq_s16(v228, v230); + int16x8_t v247 = vqrdmulhq_n_s16(v246, 25826); + int16x8_t v248 = vsubq_s16(v224, v226); + int16x8_t v249 = vaddq_s16(v247, v248); + int16x8_t v250 = vsubq_s16(v238, v240); + int16x8_t v251 = vqrdmulhq_n_s16(v250, 25826); + int16x8_t v252 = vsubq_s16(v236, v235); + int16x8_t v253 = vaddq_s16(v251, v252); + int16x8_t v254 = vqrdmulhq_n_s16(v253, 18124); + int16x8_t v255 = vaddq_s16(v249, v254); + int16x8_t v256 = vsubq_s16(v199, v201); + int16x8_t v257 = vsubq_s16(v203, v206); + int16x8_t v258_tmp = vqrdmulhq_n_s16(v257, 1988); + int16x8_t v258 = vaddq_s16(v258_tmp, v257); + int16x8_t v259 = vaddq_s16(v256, v258); + int16x8_t v260 = vsubq_s16(v210, v213); + int16x8_t v261_tmp = vqrdmulhq_n_s16(v219, 25030); + int16x8_t v261 = vaddq_s16(v261_tmp, v219); + int16x8_t v262 = vsubq_s16(v215, v261); + int16x8_t v263_tmp = vqrdmulhq_n_s16(v262, 1988); + int16x8_t v263 = vaddq_s16(v263_tmp, v262); + int16x8_t v264 = vaddq_s16(v260, v263); + int16x8_t v265 = vqrdmulhq_n_s16(v264, 19102); + int16x8_t v266 = vaddq_s16(v259, v265); + int16x8_t v267 = vsubq_s16(v147, v153); + int16x8_t v268 = vsubq_s16(v159, v165); + int16x8_t v269_tmp = vqrdmulhq_n_s16(v268, 23673); + int16x8_t v269 = vaddq_s16(v269_tmp, v268); + int16x8_t v270 = vaddq_s16(v267, v269); + int16x8_t v271 = vsubq_s16(v173, v179); + int16x8_t v272 = vsubq_s16(v185, v193); + int16x8_t v273_tmp = vqrdmulhq_n_s16(v272, 23673); + int16x8_t v273 = vaddq_s16(v273_tmp, v272); + int16x8_t v274 = vaddq_s16(v271, v273); + int16x8_t v275 = vqrdmulhq_n_s16(v274, 20398); + int16x8_t v276 = vaddq_s16(v270, v275); + int16x8_t v277 = vsubq_s16(v9, v24); + int16x8_t v278 = vsubq_s16(v42, v58); + int16x8_t v279_tmp = vqrdmulhq_n_s16(v278, 3314); + int16x8_t v279 = vmlaq_n_s16(v279_tmp, v278, 5); + int16x8_t v280 = vaddq_s16(v277, v279); + int16x8_t v281 = vsubq_s16(v138, v128); + int16x8_t v282_tmp = vqrdmulhq_n_s16(v281, 3314); + int16x8_t v282 = vmlaq_n_s16(v282_tmp, v281, 5); + int16x8_t v283 = vsubq_s16(v101, v84); + int16x8_t v284 = vaddq_s16(v282, v283); + int16x8_t v285 = vqrdmulhq_n_s16(v284, 22112); + int16x8_t v286 = vaddq_s16(v280, v285); + int16x8_t v287 = vsubq_s16(v277, v279); + int16x8_t v288 = vsubq_s16(v283, v282); + int16x8_t v289 = vqrdmulhq_n_s16(v288, 24397); + int16x8_t v290 = vaddq_s16(v287, v289); + int16x8_t v291 = vsubq_s16(v267, v269); + int16x8_t v292 = vsubq_s16(v271, v273); + int16x8_t v293 = vqrdmulhq_n_s16(v292, 27504); + int16x8_t v294 = vaddq_s16(v291, v293); + int16x8_t v295 = vsubq_s16(v260, v263); + int16x8_t v296 = vqrdmulhq_n_s16(v295, 31869); + int16x8_t v297 = vsubq_s16(v256, v258); + int16x8_t v298 = vaddq_s16(v296, v297); + int16x8_t v299 = vsubq_s16(v248, v247); + int16x8_t v300 = vsubq_s16(v252, v251); + int16x8_t v301_tmp = vqrdmulhq_n_s16(v300, 5552); + int16x8_t v301 = vaddq_s16(v301_tmp, v300); + int16x8_t v302 = vaddq_s16(v299, v301); + int16x8_t v303 = vsubq_s16(v227, v232); + int16x8_t v304 = vsubq_s16(v237, v242); + int16x8_t v305_tmp = vqrdmulhq_n_s16(v304, 15865); + int16x8_t v305 = vaddq_s16(v305_tmp, v304); + int16x8_t v306 = vaddq_s16(v303, v305); + int16x8_t v307 = vsubq_s16(v202, v208); + int16x8_t v308 = vsubq_s16(v214, v220); + int16x8_t v309_tmp = vqrdmulhq_n_s16(v308, 1893); + int16x8_t v309 = vmlaq_n_s16(v309_tmp, v308, 2); + int16x8_t v310 = vaddq_s16(v307, v309); + int16x8_t v311 = vsubq_s16(v154, v167); + int16x8_t v312 = vsubq_s16(v180, v195); + int16x8_t v313_tmp = vqrdmulhq_n_s16(v312, 13357); + int16x8_t v313 = vmlaq_n_s16(v313_tmp, v312, 3); + int16x8_t v314 = vaddq_s16(v311, v313); + int16x8_t v315 = vsubq_s16(v102, v140); + int16x8_t v316_tmp = vqrdmulhq_n_s16(v315, 6226); + int16x8_t v316 = vmlaq_n_s16(v316_tmp, v315, 10); + int16x8_t v317 = vsubq_s16(v25, v60); + int16x8_t v318 = vaddq_s16(v316, v317); + int16x8_t v319 = vsubq_s16(v317, v316); + int16x8_t v320 = vsubq_s16(v311, v313); + int16x8_t v321 = vsubq_s16(v307, v309); + int16x8_t v322 = vsubq_s16(v303, v305); + int16x8_t v323 = vsubq_s16(v299, v301); + int16x8_t v324 = vsubq_s16(v297, v296); + int16x8_t v325 = vsubq_s16(v291, v293); + int16x8_t v326 = vsubq_s16(v287, v289); + int16x8_t v327 = vsubq_s16(v280, v285); + int16x8_t v328 = vsubq_s16(v270, v275); + int16x8_t v329 = vsubq_s16(v259, v265); + int16x8_t v330 = vsubq_s16(v249, v254); + int16x8_t v331 = vsubq_s16(v233, v244); + int16x8_t v332 = vsubq_s16(v209, v222); + int16x8_t v333 = vsubq_s16(v168, v197); + int16x8_t v334 = vsubq_s16(v61, v142); + vst1q_s16(out + out_stride * 0 + i, v143); + vst1q_s16(out + out_stride * 1 + i, v198); + vst1q_s16(out + out_stride * 2 + i, v223); + vst1q_s16(out + out_stride * 3 + i, v245); + vst1q_s16(out + out_stride * 4 + i, v255); + vst1q_s16(out + out_stride * 5 + i, v266); + vst1q_s16(out + out_stride * 6 + i, v276); + vst1q_s16(out + out_stride * 7 + i, v286); + vst1q_s16(out + out_stride * 8 + i, v290); + vst1q_s16(out + out_stride * 9 + i, v294); + vst1q_s16(out + out_stride * 10 + i, v298); + vst1q_s16(out + out_stride * 11 + i, v302); + vst1q_s16(out + out_stride * 12 + i, v306); + vst1q_s16(out + out_stride * 13 + i, v310); + vst1q_s16(out + out_stride * 14 + i, v314); + vst1q_s16(out + out_stride * 15 + i, v318); + vst1q_s16(out + out_stride * 16 + i, v319); + vst1q_s16(out + out_stride * 17 + i, v320); + vst1q_s16(out + out_stride * 18 + i, v321); + vst1q_s16(out + out_stride * 19 + i, v322); + vst1q_s16(out + out_stride * 20 + i, v323); + vst1q_s16(out + out_stride * 21 + i, v324); + vst1q_s16(out + out_stride * 22 + i, v325); + vst1q_s16(out + out_stride * 23 + i, v326); + vst1q_s16(out + out_stride * 24 + i, v327); + vst1q_s16(out + out_stride * 25 + i, v328); + vst1q_s16(out + out_stride * 26 + i, v329); + vst1q_s16(out + out_stride * 27 + i, v330); + vst1q_s16(out + out_stride * 28 + i, v331); + vst1q_s16(out + out_stride * 29 + i, v332); + vst1q_s16(out + out_stride * 30 + i, v333); + vst1q_s16(out + out_stride * 31 + i, v334); + } +} diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h new file mode 100644 index 0000000000..400da1a9de --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h @@ -0,0 +1,985 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* This file is automatically generated. Do not modify it directly. */ +#if HWY_TARGET != HWY_NEON +#error "only include this file from fast_dct-inl.h" +#endif + +constexpr size_t FastIDCTIntegerBits(FastDCTTag<64>) { return 1; } + +void FastIDCT(FastDCTTag<64>, const int16_t* in, size_t in_stride, int16_t* out, + size_t out_stride, size_t count) { + JXL_ASSERT(count % 8 == 0); + for (size_t i = 0; i < count; i += 8) { + int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); + int16x8_t v1 = vld1q_s16(in + in_stride * 32 + i); + int16x8_t v2 = vaddq_s16(v0, v1); + int16x8_t v3 = vld1q_s16(in + in_stride * 16 + i); + int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); + int16x8_t v4 = vaddq_s16(v4_tmp, v3); + int16x8_t v5 = vld1q_s16(in + in_stride * 48 + i); + int16x8_t v6 = vaddq_s16(v5, v3); + int16x8_t v7 = vaddq_s16(v4, v6); + int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); + int16x8_t v9 = vaddq_s16(v2, v8); + int16x8_t v10 = vld1q_s16(in + in_stride * 8 + i); + int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); + int16x8_t v11 = vaddq_s16(v11_tmp, v10); + int16x8_t v12 = vld1q_s16(in + in_stride * 40 + i); + int16x8_t v13 = vld1q_s16(in + in_stride * 24 + i); + int16x8_t v14 = vaddq_s16(v12, v13); + int16x8_t v15 = vaddq_s16(v11, v14); + int16x8_t v16 = vld1q_s16(in + in_stride * 56 + i); + int16x8_t v17 = vaddq_s16(v16, v12); + int16x8_t v18 = vaddq_s16(v13, v10); + int16x8_t v19 = vaddq_s16(v17, v18); + int16x8_t v20 = vqrdmulhq_n_s16(v19, 17734); + int16x8_t v21 = vqrdmulhq_n_s16(v18, 25080); + int16x8_t v22 = vaddq_s16(v20, v21); + int16x8_t v23 = vaddq_s16(v15, v22); + int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); + int16x8_t v25 = vaddq_s16(v9, v24); + int16x8_t v26 = vld1q_s16(in + in_stride * 4 + i); + int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573); + int16x8_t v27 = vaddq_s16(v27_tmp, v26); + int16x8_t v28 = vld1q_s16(in + in_stride * 36 + i); + int16x8_t v29 = vld1q_s16(in + in_stride * 28 + i); + int16x8_t v30 = vaddq_s16(v28, v29); + int16x8_t v31 = vaddq_s16(v27, v30); + int16x8_t v32 = vld1q_s16(in + in_stride * 20 + i); + int16x8_t v33 = vld1q_s16(in + in_stride * 12 + i); + int16x8_t v34 = vaddq_s16(v32, v33); + int16x8_t v35 = vqrdmulhq_n_s16(v34, 25080); + int16x8_t v36 = vld1q_s16(in + in_stride * 52 + i); + int16x8_t v37 = vld1q_s16(in + in_stride * 44 + i); + int16x8_t v38 = vaddq_s16(v36, v37); + int16x8_t v39 = vaddq_s16(v38, v34); + int16x8_t v40 = vqrdmulhq_n_s16(v39, 17734); + int16x8_t v41 = vaddq_s16(v35, v40); + int16x8_t v42 = vaddq_s16(v31, v41); + int16x8_t v43 = vaddq_s16(v33, v26); + int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573); + int16x8_t v44 = vaddq_s16(v44_tmp, v43); + int16x8_t v45 = vaddq_s16(v37, v28); + int16x8_t v46 = vaddq_s16(v29, v32); + int16x8_t v47 = vaddq_s16(v45, v46); + int16x8_t v48 = vaddq_s16(v44, v47); + int16x8_t v49 = vqrdmulhq_n_s16(v48, 16705); + int16x8_t v50 = vaddq_s16(v46, v43); + int16x8_t v51_tmp = vqrdmulhq_n_s16(v50, 10045); + int16x8_t v51 = vaddq_s16(v51_tmp, v50); + int16x8_t v52 = vld1q_s16(in + in_stride * 60 + i); + int16x8_t v53 = vaddq_s16(v52, v36); + int16x8_t v54 = vaddq_s16(v53, v45); + int16x8_t v55 = vqrdmulhq_n_s16(v54, 17734); + int16x8_t v56 = vaddq_s16(v51, v55); + int16x8_t v57 = vqrdmulhq_n_s16(v56, 16705); + int16x8_t v58 = vaddq_s16(v49, v57); + int16x8_t v59 = vaddq_s16(v42, v58); + int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463); + int16x8_t v61 = vaddq_s16(v25, v60); + int16x8_t v62 = vld1q_s16(in + in_stride * 2 + i); + int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573); + int16x8_t v63 = vaddq_s16(v63_tmp, v62); + int16x8_t v64 = vld1q_s16(in + in_stride * 34 + i); + int16x8_t v65 = vld1q_s16(in + in_stride * 30 + i); + int16x8_t v66 = vaddq_s16(v64, v65); + int16x8_t v67 = vaddq_s16(v63, v66); + int16x8_t v68 = vld1q_s16(in + in_stride * 18 + i); + int16x8_t v69 = vld1q_s16(in + in_stride * 14 + i); + int16x8_t v70 = vaddq_s16(v68, v69); + int16x8_t v71 = vqrdmulhq_n_s16(v70, 25080); + int16x8_t v72 = vld1q_s16(in + in_stride * 50 + i); + int16x8_t v73 = vld1q_s16(in + in_stride * 46 + i); + int16x8_t v74 = vaddq_s16(v72, v73); + int16x8_t v75 = vaddq_s16(v74, v70); + int16x8_t v76 = vqrdmulhq_n_s16(v75, 17734); + int16x8_t v77 = vaddq_s16(v71, v76); + int16x8_t v78 = vaddq_s16(v67, v77); + int16x8_t v79 = vld1q_s16(in + in_stride * 10 + i); + int16x8_t v80 = vld1q_s16(in + in_stride * 6 + i); + int16x8_t v81 = vaddq_s16(v79, v80); + int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573); + int16x8_t v82 = vaddq_s16(v82_tmp, v81); + int16x8_t v83 = vld1q_s16(in + in_stride * 42 + i); + int16x8_t v84 = vld1q_s16(in + in_stride * 38 + i); + int16x8_t v85 = vaddq_s16(v83, v84); + int16x8_t v86 = vld1q_s16(in + in_stride * 26 + i); + int16x8_t v87 = vld1q_s16(in + in_stride * 22 + i); + int16x8_t v88 = vaddq_s16(v86, v87); + int16x8_t v89 = vaddq_s16(v85, v88); + int16x8_t v90 = vaddq_s16(v82, v89); + int16x8_t v91 = vqrdmulhq_n_s16(v90, 16705); + int16x8_t v92 = vaddq_s16(v88, v81); + int16x8_t v93_tmp = vqrdmulhq_n_s16(v92, 10045); + int16x8_t v93 = vaddq_s16(v93_tmp, v92); + int16x8_t v94 = vld1q_s16(in + in_stride * 58 + i); + int16x8_t v95 = vld1q_s16(in + in_stride * 54 + i); + int16x8_t v96 = vaddq_s16(v94, v95); + int16x8_t v97 = vaddq_s16(v96, v85); + int16x8_t v98 = vqrdmulhq_n_s16(v97, 17734); + int16x8_t v99 = vaddq_s16(v93, v98); + int16x8_t v100 = vqrdmulhq_n_s16(v99, 16705); + int16x8_t v101 = vaddq_s16(v91, v100); + int16x8_t v102 = vaddq_s16(v78, v101); + int16x8_t v103 = vaddq_s16(v69, v79); + int16x8_t v104 = vaddq_s16(v80, v62); + int16x8_t v105 = vaddq_s16(v103, v104); + int16x8_t v106_tmp = vqrdmulhq_n_s16(v105, 13573); + int16x8_t v106 = vaddq_s16(v106_tmp, v105); + int16x8_t v107 = vaddq_s16(v73, v83); + int16x8_t v108 = vaddq_s16(v84, v64); + int16x8_t v109 = vaddq_s16(v107, v108); + int16x8_t v110 = vaddq_s16(v65, v86); + int16x8_t v111 = vaddq_s16(v87, v68); + int16x8_t v112 = vaddq_s16(v110, v111); + int16x8_t v113 = vaddq_s16(v109, v112); + int16x8_t v114 = vaddq_s16(v106, v113); + int16x8_t v115 = vqrdmulhq_n_s16(v114, 16705); + int16x8_t v116 = vaddq_s16(v112, v105); + int16x8_t v117 = vqrdmulhq_n_s16(v116, 25080); + int16x8_t v118 = vqrdmulhq_n_s16(v116, 17734); + int16x8_t v119 = vld1q_s16(in + in_stride * 62 + i); + int16x8_t v120 = vaddq_s16(v119, v94); + int16x8_t v121 = vaddq_s16(v95, v72); + int16x8_t v122 = vaddq_s16(v120, v121); + int16x8_t v123 = vaddq_s16(v122, v109); + int16x8_t v124 = vqrdmulhq_n_s16(v123, 17734); + int16x8_t v125 = vaddq_s16(v118, v124); + int16x8_t v126 = vaddq_s16(v117, v125); + int16x8_t v127 = vqrdmulhq_n_s16(v126, 16705); + int16x8_t v128 = vaddq_s16(v115, v127); + int16x8_t v129 = vqrdmulhq_n_s16(v128, 16463); + int16x8_t v130_tmp = vqrdmulhq_n_s16(v104, 13573); + int16x8_t v130 = vaddq_s16(v130_tmp, v104); + int16x8_t v131 = vaddq_s16(v108, v110); + int16x8_t v132 = vaddq_s16(v130, v131); + int16x8_t v133 = vaddq_s16(v111, v103); + int16x8_t v134_tmp = vqrdmulhq_n_s16(v133, 10045); + int16x8_t v134 = vaddq_s16(v134_tmp, v133); + int16x8_t v135 = vaddq_s16(v121, v107); + int16x8_t v136 = vqrdmulhq_n_s16(v135, 17734); + int16x8_t v137 = vaddq_s16(v134, v136); + int16x8_t v138 = vaddq_s16(v132, v137); + int16x8_t v139 = vqrdmulhq_n_s16(v138, 16463); + int16x8_t v140 = vaddq_s16(v129, v139); + int16x8_t v141 = vaddq_s16(v102, v140); + int16x8_t v142 = vqrdmulhq_n_s16(v141, 16404); + int16x8_t v143 = vaddq_s16(v61, v142); + int16x8_t v144 = vld1q_s16(in + in_stride * 1 + i); + int16x8_t v145_tmp = vqrdmulhq_n_s16(v144, 13573); + int16x8_t v145 = vaddq_s16(v145_tmp, v144); + int16x8_t v146 = vld1q_s16(in + in_stride * 33 + i); + int16x8_t v147 = vld1q_s16(in + in_stride * 31 + i); + int16x8_t v148 = vaddq_s16(v146, v147); + int16x8_t v149 = vaddq_s16(v145, v148); + int16x8_t v150 = vld1q_s16(in + in_stride * 17 + i); + int16x8_t v151 = vld1q_s16(in + in_stride * 15 + i); + int16x8_t v152 = vaddq_s16(v150, v151); + int16x8_t v153 = vqrdmulhq_n_s16(v152, 25080); + int16x8_t v154 = vld1q_s16(in + in_stride * 49 + i); + int16x8_t v155 = vld1q_s16(in + in_stride * 47 + i); + int16x8_t v156 = vaddq_s16(v154, v155); + int16x8_t v157 = vaddq_s16(v156, v152); + int16x8_t v158 = vqrdmulhq_n_s16(v157, 17734); + int16x8_t v159 = vaddq_s16(v153, v158); + int16x8_t v160 = vaddq_s16(v149, v159); + int16x8_t v161 = vld1q_s16(in + in_stride * 9 + i); + int16x8_t v162 = vld1q_s16(in + in_stride * 7 + i); + int16x8_t v163 = vaddq_s16(v161, v162); + int16x8_t v164_tmp = vqrdmulhq_n_s16(v163, 13573); + int16x8_t v164 = vaddq_s16(v164_tmp, v163); + int16x8_t v165 = vld1q_s16(in + in_stride * 41 + i); + int16x8_t v166 = vld1q_s16(in + in_stride * 39 + i); + int16x8_t v167 = vaddq_s16(v165, v166); + int16x8_t v168 = vld1q_s16(in + in_stride * 25 + i); + int16x8_t v169 = vld1q_s16(in + in_stride * 23 + i); + int16x8_t v170 = vaddq_s16(v168, v169); + int16x8_t v171 = vaddq_s16(v167, v170); + int16x8_t v172 = vaddq_s16(v164, v171); + int16x8_t v173 = vqrdmulhq_n_s16(v172, 16705); + int16x8_t v174 = vaddq_s16(v170, v163); + int16x8_t v175_tmp = vqrdmulhq_n_s16(v174, 10045); + int16x8_t v175 = vaddq_s16(v175_tmp, v174); + int16x8_t v176 = vld1q_s16(in + in_stride * 57 + i); + int16x8_t v177 = vld1q_s16(in + in_stride * 55 + i); + int16x8_t v178 = vaddq_s16(v176, v177); + int16x8_t v179 = vaddq_s16(v178, v167); + int16x8_t v180 = vqrdmulhq_n_s16(v179, 17734); + int16x8_t v181 = vaddq_s16(v175, v180); + int16x8_t v182 = vqrdmulhq_n_s16(v181, 16705); + int16x8_t v183 = vaddq_s16(v173, v182); + int16x8_t v184 = vaddq_s16(v160, v183); + int16x8_t v185 = vld1q_s16(in + in_stride * 37 + i); + int16x8_t v186 = vld1q_s16(in + in_stride * 35 + i); + int16x8_t v187 = vaddq_s16(v185, v186); + int16x8_t v188 = vld1q_s16(in + in_stride * 45 + i); + int16x8_t v189 = vld1q_s16(in + in_stride * 43 + i); + int16x8_t v190 = vaddq_s16(v188, v189); + int16x8_t v191 = vaddq_s16(v187, v190); + int16x8_t v192 = vld1q_s16(in + in_stride * 29 + i); + int16x8_t v193 = vld1q_s16(in + in_stride * 27 + i); + int16x8_t v194 = vaddq_s16(v192, v193); + int16x8_t v195 = vld1q_s16(in + in_stride * 21 + i); + int16x8_t v196 = vld1q_s16(in + in_stride * 19 + i); + int16x8_t v197 = vaddq_s16(v195, v196); + int16x8_t v198 = vaddq_s16(v194, v197); + int16x8_t v199 = vaddq_s16(v191, v198); + int16x8_t v200 = vld1q_s16(in + in_stride * 5 + i); + int16x8_t v201 = vld1q_s16(in + in_stride * 3 + i); + int16x8_t v202 = vaddq_s16(v200, v201); + int16x8_t v203 = vld1q_s16(in + in_stride * 13 + i); + int16x8_t v204 = vld1q_s16(in + in_stride * 11 + i); + int16x8_t v205 = vaddq_s16(v203, v204); + int16x8_t v206 = vaddq_s16(v202, v205); + int16x8_t v207_tmp = vqrdmulhq_n_s16(v206, 13573); + int16x8_t v207 = vaddq_s16(v207_tmp, v206); + int16x8_t v208 = vaddq_s16(v199, v207); + int16x8_t v209 = vqrdmulhq_n_s16(v208, 16705); + int16x8_t v210 = vaddq_s16(v198, v206); + int16x8_t v211 = vqrdmulhq_n_s16(v210, 25080); + int16x8_t v212 = vqrdmulhq_n_s16(v210, 17734); + int16x8_t v213 = vld1q_s16(in + in_stride * 53 + i); + int16x8_t v214 = vld1q_s16(in + in_stride * 51 + i); + int16x8_t v215 = vaddq_s16(v213, v214); + int16x8_t v216 = vld1q_s16(in + in_stride * 61 + i); + int16x8_t v217 = vld1q_s16(in + in_stride * 59 + i); + int16x8_t v218 = vaddq_s16(v216, v217); + int16x8_t v219 = vaddq_s16(v215, v218); + int16x8_t v220 = vaddq_s16(v219, v191); + int16x8_t v221 = vqrdmulhq_n_s16(v220, 17734); + int16x8_t v222 = vaddq_s16(v212, v221); + int16x8_t v223 = vaddq_s16(v211, v222); + int16x8_t v224 = vqrdmulhq_n_s16(v223, 16705); + int16x8_t v225 = vaddq_s16(v209, v224); + int16x8_t v226 = vqrdmulhq_n_s16(v225, 16463); + int16x8_t v227_tmp = vqrdmulhq_n_s16(v202, 13573); + int16x8_t v227 = vaddq_s16(v227_tmp, v202); + int16x8_t v228 = vaddq_s16(v187, v194); + int16x8_t v229 = vaddq_s16(v227, v228); + int16x8_t v230 = vaddq_s16(v215, v190); + int16x8_t v231 = vqrdmulhq_n_s16(v230, 17734); + int16x8_t v232 = vaddq_s16(v197, v205); + int16x8_t v233_tmp = vqrdmulhq_n_s16(v232, 10045); + int16x8_t v233 = vaddq_s16(v233_tmp, v232); + int16x8_t v234 = vaddq_s16(v231, v233); + int16x8_t v235 = vaddq_s16(v229, v234); + int16x8_t v236 = vqrdmulhq_n_s16(v235, 16463); + int16x8_t v237 = vaddq_s16(v226, v236); + int16x8_t v238 = vaddq_s16(v184, v237); + int16x8_t v239 = vaddq_s16(v201, v144); + int16x8_t v240_tmp = vqrdmulhq_n_s16(v239, 13573); + int16x8_t v240 = vaddq_s16(v240_tmp, v239); + int16x8_t v241 = vaddq_s16(v186, v146); + int16x8_t v242 = vaddq_s16(v147, v192); + int16x8_t v243 = vaddq_s16(v241, v242); + int16x8_t v244 = vaddq_s16(v240, v243); + int16x8_t v245 = vaddq_s16(v196, v150); + int16x8_t v246 = vaddq_s16(v151, v203); + int16x8_t v247 = vaddq_s16(v245, v246); + int16x8_t v248_tmp = vqrdmulhq_n_s16(v247, 10045); + int16x8_t v248 = vaddq_s16(v248_tmp, v247); + int16x8_t v249 = vaddq_s16(v155, v188); + int16x8_t v250 = vaddq_s16(v214, v154); + int16x8_t v251 = vaddq_s16(v249, v250); + int16x8_t v252 = vqrdmulhq_n_s16(v251, 17734); + int16x8_t v253 = vaddq_s16(v248, v252); + int16x8_t v254 = vaddq_s16(v244, v253); + int16x8_t v255 = vaddq_s16(v204, v161); + int16x8_t v256 = vaddq_s16(v162, v200); + int16x8_t v257 = vaddq_s16(v255, v256); + int16x8_t v258_tmp = vqrdmulhq_n_s16(v257, 13573); + int16x8_t v258 = vaddq_s16(v258_tmp, v257); + int16x8_t v259 = vaddq_s16(v189, v165); + int16x8_t v260 = vaddq_s16(v166, v185); + int16x8_t v261 = vaddq_s16(v259, v260); + int16x8_t v262 = vaddq_s16(v169, v195); + int16x8_t v263 = vaddq_s16(v193, v168); + int16x8_t v264 = vaddq_s16(v262, v263); + int16x8_t v265 = vaddq_s16(v261, v264); + int16x8_t v266 = vaddq_s16(v258, v265); + int16x8_t v267 = vqrdmulhq_n_s16(v266, 16705); + int16x8_t v268 = vaddq_s16(v264, v257); + int16x8_t v269 = vqrdmulhq_n_s16(v268, 25080); + int16x8_t v270 = vaddq_s16(v217, v176); + int16x8_t v271 = vaddq_s16(v177, v213); + int16x8_t v272 = vaddq_s16(v270, v271); + int16x8_t v273 = vaddq_s16(v272, v261); + int16x8_t v274 = vqrdmulhq_n_s16(v273, 17734); + int16x8_t v275 = vqrdmulhq_n_s16(v268, 17734); + int16x8_t v276 = vaddq_s16(v274, v275); + int16x8_t v277 = vaddq_s16(v269, v276); + int16x8_t v278 = vqrdmulhq_n_s16(v277, 16705); + int16x8_t v279 = vaddq_s16(v267, v278); + int16x8_t v280 = vaddq_s16(v254, v279); + int16x8_t v281 = vqrdmulhq_n_s16(v280, 16404); + int16x8_t v282 = vaddq_s16(v256, v239); + int16x8_t v283_tmp = vqrdmulhq_n_s16(v282, 13573); + int16x8_t v283 = vaddq_s16(v283_tmp, v282); + int16x8_t v284 = vaddq_s16(v260, v241); + int16x8_t v285 = vaddq_s16(v242, v263); + int16x8_t v286 = vaddq_s16(v284, v285); + int16x8_t v287 = vaddq_s16(v283, v286); + int16x8_t v288 = vaddq_s16(v262, v245); + int16x8_t v289 = vaddq_s16(v246, v255); + int16x8_t v290 = vaddq_s16(v288, v289); + int16x8_t v291 = vqrdmulhq_n_s16(v290, 25080); + int16x8_t v292 = vqrdmulhq_n_s16(v290, 17734); + int16x8_t v293 = vaddq_s16(v271, v250); + int16x8_t v294 = vaddq_s16(v249, v259); + int16x8_t v295 = vaddq_s16(v293, v294); + int16x8_t v296 = vqrdmulhq_n_s16(v295, 17734); + int16x8_t v297 = vaddq_s16(v292, v296); + int16x8_t v298 = vaddq_s16(v291, v297); + int16x8_t v299 = vaddq_s16(v287, v298); + int16x8_t v300 = vqrdmulhq_n_s16(v299, 16463); + int16x8_t v301 = vaddq_s16(v289, v282); + int16x8_t v302 = vqrdmulhq_n_s16(v301, 23624); + int16x8_t v303 = vaddq_s16(v294, v284); + int16x8_t v304 = vqrdmulhq_n_s16(v303, 19705); + int16x8_t v305 = vaddq_s16(v285, v288); + int16x8_t v306 = vqrdmulhq_n_s16(v305, 19705); + int16x8_t v307 = vaddq_s16(v304, v306); + int16x8_t v308 = vqrdmulhq_n_s16(v307, 27779); + int16x8_t v309 = vaddq_s16(v302, v308); + int16x8_t v310 = vaddq_s16(v305, v301); + int16x8_t v311 = vqrdmulhq_n_s16(v310, 25080); + int16x8_t v312 = vqrdmulhq_n_s16(v310, 17734); + int16x8_t v313 = vld1q_s16(in + in_stride * 63 + i); + int16x8_t v314 = vaddq_s16(v313, v216); + int16x8_t v315 = vaddq_s16(v314, v270); + int16x8_t v316 = vaddq_s16(v315, v293); + int16x8_t v317 = vqrdmulhq_n_s16(v316, 25746); + int16x8_t v318 = vqrdmulhq_n_s16(v303, 25746); + int16x8_t v319 = vaddq_s16(v317, v318); + int16x8_t v320 = vqrdmulhq_n_s16(v319, 22571); + int16x8_t v321 = vaddq_s16(v312, v320); + int16x8_t v322 = vaddq_s16(v311, v321); + int16x8_t v323 = vqrdmulhq_n_s16(v322, 16705); + int16x8_t v324 = vaddq_s16(v309, v323); + int16x8_t v325 = vqrdmulhq_n_s16(v324, 16463); + int16x8_t v326 = vaddq_s16(v300, v325); + int16x8_t v327 = vqrdmulhq_n_s16(v326, 16404); + int16x8_t v328 = vaddq_s16(v281, v327); + int16x8_t v329 = vaddq_s16(v238, v328); + int16x8_t v330 = vqrdmulhq_n_s16(v329, 16389); + int16x8_t v331 = vaddq_s16(v143, v330); + int16x8_t v332 = vsubq_s16(v82, v89); + int16x8_t v333 = vqrdmulhq_n_s16(v332, 19705); + int16x8_t v334 = vqrdmulhq_n_s16(v92, 13573); + int16x8_t v335 = vsubq_s16(v334, v97); + int16x8_t v336 = vqrdmulhq_n_s16(v335, 25746); + int16x8_t v337 = vaddq_s16(v333, v336); + int16x8_t v338 = vsubq_s16(v63, v66); + int16x8_t v339 = vqrdmulhq_n_s16(v70, 17734); + int16x8_t v340_tmp = vqrdmulhq_n_s16(v74, 10045); + int16x8_t v340 = vaddq_s16(v340_tmp, v74); + int16x8_t v341 = vsubq_s16(v339, v340); + int16x8_t v342 = vaddq_s16(v338, v341); + int16x8_t v343 = vaddq_s16(v337, v342); + int16x8_t v344 = vsubq_s16(v130, v131); + int16x8_t v345 = vqrdmulhq_n_s16(v133, 13573); + int16x8_t v346 = vsubq_s16(v345, v135); + int16x8_t v347_tmp = vqrdmulhq_n_s16(v346, 10045); + int16x8_t v347 = vaddq_s16(v347_tmp, v346); + int16x8_t v348 = vaddq_s16(v344, v347); + int16x8_t v349 = vqrdmulhq_n_s16(v348, 17121); + int16x8_t v350 = vqrdmulhq_n_s16(v105, 27867); + int16x8_t v351 = vqrdmulhq_n_s16(v113, 19705); + int16x8_t v352 = vsubq_s16(v350, v351); + int16x8_t v353 = vqrdmulhq_n_s16(v116, 13573); + int16x8_t v354 = vsubq_s16(v353, v123); + int16x8_t v355 = vqrdmulhq_n_s16(v354, 25746); + int16x8_t v356 = vaddq_s16(v352, v355); + int16x8_t v357 = vqrdmulhq_n_s16(v356, 17121); + int16x8_t v358 = vaddq_s16(v349, v357); + int16x8_t v359 = vaddq_s16(v343, v358); + int16x8_t v360 = vqrdmulhq_n_s16(v359, 16563); + int16x8_t v361 = vsubq_s16(v27, v30); + int16x8_t v362 = vqrdmulhq_n_s16(v34, 17734); + int16x8_t v363_tmp = vqrdmulhq_n_s16(v38, 10045); + int16x8_t v363 = vaddq_s16(v363_tmp, v38); + int16x8_t v364 = vsubq_s16(v362, v363); + int16x8_t v365 = vaddq_s16(v361, v364); + int16x8_t v366 = vsubq_s16(v44, v47); + int16x8_t v367 = vqrdmulhq_n_s16(v366, 19705); + int16x8_t v368 = vqrdmulhq_n_s16(v50, 13573); + int16x8_t v369 = vsubq_s16(v368, v54); + int16x8_t v370 = vqrdmulhq_n_s16(v369, 25746); + int16x8_t v371 = vaddq_s16(v367, v370); + int16x8_t v372 = vaddq_s16(v365, v371); + int16x8_t v373 = vqrdmulhq_n_s16(v372, 17121); + int16x8_t v374 = vsubq_s16(v0, v1); + int16x8_t v375 = vsubq_s16(v4, v6); + int16x8_t v376_tmp = vqrdmulhq_n_s16(v375, 10045); + int16x8_t v376 = vaddq_s16(v376_tmp, v375); + int16x8_t v377 = vaddq_s16(v374, v376); + int16x8_t v378 = vsubq_s16(v11, v14); + int16x8_t v379 = vqrdmulhq_n_s16(v18, 17734); + int16x8_t v380_tmp = vqrdmulhq_n_s16(v17, 10045); + int16x8_t v380 = vaddq_s16(v380_tmp, v17); + int16x8_t v381 = vsubq_s16(v379, v380); + int16x8_t v382 = vaddq_s16(v378, v381); + int16x8_t v383 = vqrdmulhq_n_s16(v382, 19705); + int16x8_t v384 = vaddq_s16(v377, v383); + int16x8_t v385 = vaddq_s16(v373, v384); + int16x8_t v386 = vaddq_s16(v360, v385); + int16x8_t v387 = vsubq_s16(v145, v148); + int16x8_t v388 = vqrdmulhq_n_s16(v152, 17734); + int16x8_t v389_tmp = vqrdmulhq_n_s16(v156, 10045); + int16x8_t v389 = vaddq_s16(v389_tmp, v156); + int16x8_t v390 = vsubq_s16(v388, v389); + int16x8_t v391 = vaddq_s16(v387, v390); + int16x8_t v392 = vsubq_s16(v164, v171); + int16x8_t v393 = vqrdmulhq_n_s16(v392, 19705); + int16x8_t v394 = vqrdmulhq_n_s16(v174, 13573); + int16x8_t v395 = vsubq_s16(v394, v179); + int16x8_t v396 = vqrdmulhq_n_s16(v395, 25746); + int16x8_t v397 = vaddq_s16(v393, v396); + int16x8_t v398 = vaddq_s16(v391, v397); + int16x8_t v399 = vsubq_s16(v227, v228); + int16x8_t v400 = vqrdmulhq_n_s16(v232, 13573); + int16x8_t v401 = vsubq_s16(v400, v230); + int16x8_t v402_tmp = vqrdmulhq_n_s16(v401, 10045); + int16x8_t v402 = vaddq_s16(v402_tmp, v401); + int16x8_t v403 = vaddq_s16(v399, v402); + int16x8_t v404 = vqrdmulhq_n_s16(v403, 17121); + int16x8_t v405 = vqrdmulhq_n_s16(v206, 27867); + int16x8_t v406 = vqrdmulhq_n_s16(v199, 19705); + int16x8_t v407 = vsubq_s16(v405, v406); + int16x8_t v408 = vqrdmulhq_n_s16(v210, 13573); + int16x8_t v409 = vsubq_s16(v408, v220); + int16x8_t v410 = vqrdmulhq_n_s16(v409, 25746); + int16x8_t v411 = vaddq_s16(v407, v410); + int16x8_t v412 = vqrdmulhq_n_s16(v411, 17121); + int16x8_t v413 = vaddq_s16(v404, v412); + int16x8_t v414 = vaddq_s16(v398, v413); + int16x8_t v415 = vsubq_s16(v240, v243); + int16x8_t v416 = vqrdmulhq_n_s16(v247, 13573); + int16x8_t v417 = vsubq_s16(v416, v251); + int16x8_t v418_tmp = vqrdmulhq_n_s16(v417, 10045); + int16x8_t v418 = vaddq_s16(v418_tmp, v417); + int16x8_t v419 = vaddq_s16(v415, v418); + int16x8_t v420 = vqrdmulhq_n_s16(v257, 27867); + int16x8_t v421 = vqrdmulhq_n_s16(v265, 19705); + int16x8_t v422 = vsubq_s16(v420, v421); + int16x8_t v423 = vqrdmulhq_n_s16(v268, 13573); + int16x8_t v424 = vsubq_s16(v423, v273); + int16x8_t v425 = vqrdmulhq_n_s16(v424, 25746); + int16x8_t v426 = vaddq_s16(v422, v425); + int16x8_t v427 = vaddq_s16(v419, v426); + int16x8_t v428 = vqrdmulhq_n_s16(v427, 16563); + int16x8_t v429 = vqrdmulhq_n_s16(v301, 27867); + int16x8_t v430 = vsubq_s16(v429, v307); + int16x8_t v431 = vqrdmulhq_n_s16(v310, 10664); + int16x8_t v432 = vsubq_s16(v431, v319); + int16x8_t v433 = vaddq_s16(v430, v432); + int16x8_t v434 = vqrdmulhq_n_s16(v433, 17121); + int16x8_t v435 = vsubq_s16(v283, v286); + int16x8_t v436 = vqrdmulhq_n_s16(v290, 13573); + int16x8_t v437 = vsubq_s16(v436, v295); + int16x8_t v438_tmp = vqrdmulhq_n_s16(v437, 10045); + int16x8_t v438 = vaddq_s16(v438_tmp, v437); + int16x8_t v439 = vaddq_s16(v435, v438); + int16x8_t v440 = vqrdmulhq_n_s16(v439, 17121); + int16x8_t v441 = vaddq_s16(v434, v440); + int16x8_t v442 = vqrdmulhq_n_s16(v441, 16563); + int16x8_t v443 = vaddq_s16(v428, v442); + int16x8_t v444 = vaddq_s16(v414, v443); + int16x8_t v445 = vqrdmulhq_n_s16(v444, 16429); + int16x8_t v446 = vaddq_s16(v386, v445); + int16x8_t v447 = vsubq_s16(v374, v376); + int16x8_t v448 = vsubq_s16(v378, v381); + int16x8_t v449 = vqrdmulhq_n_s16(v448, 29490); + int16x8_t v450 = vaddq_s16(v447, v449); + int16x8_t v451 = vsubq_s16(v361, v364); + int16x8_t v452 = vqrdmulhq_n_s16(v366, 29490); + int16x8_t v453_tmp = vqrdmulhq_n_s16(v369, 5763); + int16x8_t v453 = vaddq_s16(v453_tmp, v369); + int16x8_t v454 = vsubq_s16(v452, v453); + int16x8_t v455 = vaddq_s16(v451, v454); + int16x8_t v456 = vqrdmulhq_n_s16(v455, 18578); + int16x8_t v457 = vaddq_s16(v450, v456); + int16x8_t v458 = vsubq_s16(v338, v341); + int16x8_t v459 = vqrdmulhq_n_s16(v332, 29490); + int16x8_t v460_tmp = vqrdmulhq_n_s16(v335, 5763); + int16x8_t v460 = vaddq_s16(v460_tmp, v335); + int16x8_t v461 = vsubq_s16(v459, v460); + int16x8_t v462 = vaddq_s16(v458, v461); + int16x8_t v463 = vqrdmulhq_n_s16(v352, 27803); + int16x8_t v464 = vqrdmulhq_n_s16(v354, 21845); + int16x8_t v465 = vsubq_s16(v463, v464); + int16x8_t v466 = vsubq_s16(v344, v347); + int16x8_t v467 = vqrdmulhq_n_s16(v466, 18578); + int16x8_t v468 = vaddq_s16(v465, v467); + int16x8_t v469 = vaddq_s16(v462, v468); + int16x8_t v470 = vqrdmulhq_n_s16(v469, 16890); + int16x8_t v471 = vaddq_s16(v457, v470); + int16x8_t v472 = vsubq_s16(v415, v418); + int16x8_t v473_tmp = vqrdmulhq_n_s16(v422, 16273); + int16x8_t v473 = vaddq_s16(v473_tmp, v422); + int16x8_t v474_tmp = vqrdmulhq_n_s16(v424, 5763); + int16x8_t v474 = vaddq_s16(v474_tmp, v424); + int16x8_t v475 = vsubq_s16(v473, v474); + int16x8_t v476 = vaddq_s16(v472, v475); + int16x8_t v477 = vqrdmulhq_n_s16(v476, 16890); + int16x8_t v478 = vqrdmulhq_n_s16(v435, 20261); + int16x8_t v479 = vqrdmulhq_n_s16(v437, 26472); + int16x8_t v480 = vsubq_s16(v478, v479); + int16x8_t v481 = vqrdmulhq_n_s16(v480, 30046); + int16x8_t v482 = vqrdmulhq_n_s16(v430, 30322); + int16x8_t v483 = vqrdmulhq_n_s16(v432, 30322); + int16x8_t v484 = vsubq_s16(v482, v483); + int16x8_t v485 = vqrdmulhq_n_s16(v484, 30046); + int16x8_t v486 = vaddq_s16(v481, v485); + int16x8_t v487 = vqrdmulhq_n_s16(v486, 16890); + int16x8_t v488 = vaddq_s16(v477, v487); + int16x8_t v489 = vsubq_s16(v387, v390); + int16x8_t v490 = vqrdmulhq_n_s16(v392, 29490); + int16x8_t v491_tmp = vqrdmulhq_n_s16(v395, 5763); + int16x8_t v491 = vaddq_s16(v491_tmp, v395); + int16x8_t v492 = vsubq_s16(v490, v491); + int16x8_t v493 = vaddq_s16(v489, v492); + int16x8_t v494 = vsubq_s16(v399, v402); + int16x8_t v495 = vqrdmulhq_n_s16(v494, 18578); + int16x8_t v496 = vqrdmulhq_n_s16(v407, 27803); + int16x8_t v497 = vqrdmulhq_n_s16(v409, 21845); + int16x8_t v498 = vsubq_s16(v496, v497); + int16x8_t v499 = vaddq_s16(v495, v498); + int16x8_t v500 = vaddq_s16(v493, v499); + int16x8_t v501 = vaddq_s16(v488, v500); + int16x8_t v502 = vqrdmulhq_n_s16(v501, 16508); + int16x8_t v503 = vaddq_s16(v471, v502); + int16x8_t v504 = vsubq_s16(v2, v8); + int16x8_t v505 = vsubq_s16(v15, v22); + int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 18446); + int16x8_t v506 = vmlaq_n_s16(v506_tmp, v505, 2); + int16x8_t v507 = vaddq_s16(v504, v506); + int16x8_t v508 = vsubq_s16(v31, v41); + int16x8_t v509 = vsubq_s16(v48, v56); + int16x8_t v510_tmp = vqrdmulhq_n_s16(v509, 18446); + int16x8_t v510 = vmlaq_n_s16(v510_tmp, v509, 2); + int16x8_t v511 = vaddq_s16(v508, v510); + int16x8_t v512 = vqrdmulhq_n_s16(v511, 21195); + int16x8_t v513 = vaddq_s16(v507, v512); + int16x8_t v514 = vsubq_s16(v67, v77); + int16x8_t v515 = vsubq_s16(v90, v99); + int16x8_t v516_tmp = vqrdmulhq_n_s16(v515, 18446); + int16x8_t v516 = vmlaq_n_s16(v516_tmp, v515, 2); + int16x8_t v517 = vaddq_s16(v514, v516); + int16x8_t v518 = vsubq_s16(v114, v126); + int16x8_t v519_tmp = vqrdmulhq_n_s16(v518, 18446); + int16x8_t v519 = vmlaq_n_s16(v519_tmp, v518, 2); + int16x8_t v520 = vsubq_s16(v132, v137); + int16x8_t v521 = vaddq_s16(v519, v520); + int16x8_t v522 = vqrdmulhq_n_s16(v521, 21195); + int16x8_t v523 = vaddq_s16(v517, v522); + int16x8_t v524 = vqrdmulhq_n_s16(v523, 17401); + int16x8_t v525 = vaddq_s16(v513, v524); + int16x8_t v526 = vsubq_s16(v172, v181); + int16x8_t v527_tmp = vqrdmulhq_n_s16(v526, 18446); + int16x8_t v527 = vmlaq_n_s16(v527_tmp, v526, 2); + int16x8_t v528 = vsubq_s16(v149, v159); + int16x8_t v529 = vaddq_s16(v527, v528); + int16x8_t v530 = vsubq_s16(v229, v234); + int16x8_t v531 = vsubq_s16(v208, v223); + int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 18446); + int16x8_t v532 = vmlaq_n_s16(v532_tmp, v531, 2); + int16x8_t v533 = vaddq_s16(v530, v532); + int16x8_t v534 = vqrdmulhq_n_s16(v533, 21195); + int16x8_t v535 = vaddq_s16(v529, v534); + int16x8_t v536 = vsubq_s16(v244, v253); + int16x8_t v537 = vsubq_s16(v266, v277); + int16x8_t v538_tmp = vqrdmulhq_n_s16(v537, 18446); + int16x8_t v538 = vmlaq_n_s16(v538_tmp, v537, 2); + int16x8_t v539 = vaddq_s16(v536, v538); + int16x8_t v540 = vqrdmulhq_n_s16(v539, 17401); + int16x8_t v541 = vqrdmulhq_n_s16(v287, 25826); + int16x8_t v542 = vqrdmulhq_n_s16(v298, 25826); + int16x8_t v543 = vsubq_s16(v541, v542); + int16x8_t v544 = vqrdmulhq_n_s16(v543, 14281); + int16x8_t v545_tmp = vqrdmulhq_n_s16(v309, 31509); + int16x8_t v545 = vaddq_s16(v545_tmp, v309); + int16x8_t v546 = vsubq_s16(v545, v322); + int16x8_t v547 = vqrdmulhq_n_s16(v546, 28847); + int16x8_t v548 = vaddq_s16(v544, v547); + int16x8_t v549 = vaddq_s16(v540, v548); + int16x8_t v550 = vaddq_s16(v535, v549); + int16x8_t v551 = vqrdmulhq_n_s16(v550, 16629); + int16x8_t v552 = vaddq_s16(v525, v551); + int16x8_t v553 = vsubq_s16(v504, v506); + int16x8_t v554 = vsubq_s16(v508, v510); + int16x8_t v555 = vqrdmulhq_n_s16(v554, 25826); + int16x8_t v556 = vaddq_s16(v553, v555); + int16x8_t v557 = vsubq_s16(v514, v516); + int16x8_t v558 = vsubq_s16(v520, v519); + int16x8_t v559 = vqrdmulhq_n_s16(v558, 25826); + int16x8_t v560 = vaddq_s16(v557, v559); + int16x8_t v561 = vqrdmulhq_n_s16(v560, 18124); + int16x8_t v562 = vaddq_s16(v556, v561); + int16x8_t v563 = vsubq_s16(v528, v527); + int16x8_t v564 = vsubq_s16(v530, v532); + int16x8_t v565 = vqrdmulhq_n_s16(v564, 25826); + int16x8_t v566 = vaddq_s16(v563, v565); + int16x8_t v567 = vsubq_s16(v536, v538); + int16x8_t v568 = vqrdmulhq_n_s16(v567, 18124); + int16x8_t v569_tmp = vqrdmulhq_n_s16(v546, 654); + int16x8_t v569 = vmlaq_n_s16(v569_tmp, v546, 2); + int16x8_t v570 = vsubq_s16(v543, v569); + int16x8_t v571 = vqrdmulhq_n_s16(v570, 18124); + int16x8_t v572 = vaddq_s16(v568, v571); + int16x8_t v573 = vaddq_s16(v566, v572); + int16x8_t v574 = vqrdmulhq_n_s16(v573, 16792); + int16x8_t v575 = vaddq_s16(v562, v574); + int16x8_t v576 = vsubq_s16(v458, v461); + int16x8_t v577_tmp = vqrdmulhq_n_s16(v465, 25030); + int16x8_t v577 = vaddq_s16(v577_tmp, v465); + int16x8_t v578 = vsubq_s16(v466, v577); + int16x8_t v579_tmp = vqrdmulhq_n_s16(v578, 1988); + int16x8_t v579 = vaddq_s16(v579_tmp, v578); + int16x8_t v580 = vaddq_s16(v576, v579); + int16x8_t v581 = vqrdmulhq_n_s16(v580, 19102); + int16x8_t v582 = vsubq_s16(v447, v449); + int16x8_t v583 = vsubq_s16(v451, v454); + int16x8_t v584_tmp = vqrdmulhq_n_s16(v583, 1988); + int16x8_t v584 = vaddq_s16(v584_tmp, v583); + int16x8_t v585 = vaddq_s16(v582, v584); + int16x8_t v586 = vaddq_s16(v581, v585); + int16x8_t v587 = vsubq_s16(v489, v492); + int16x8_t v588_tmp = vqrdmulhq_n_s16(v498, 25030); + int16x8_t v588 = vaddq_s16(v588_tmp, v498); + int16x8_t v589 = vsubq_s16(v494, v588); + int16x8_t v590_tmp = vqrdmulhq_n_s16(v589, 1988); + int16x8_t v590 = vaddq_s16(v590_tmp, v589); + int16x8_t v591 = vaddq_s16(v587, v590); + int16x8_t v592 = vsubq_s16(v472, v475); + int16x8_t v593 = vqrdmulhq_n_s16(v592, 19102); + int16x8_t v594 = vsubq_s16(v480, v484); + int16x8_t v595 = vaddq_s16(v593, v594); + int16x8_t v596 = vaddq_s16(v591, v595); + int16x8_t v597 = vqrdmulhq_n_s16(v596, 17000); + int16x8_t v598 = vaddq_s16(v586, v597); + int16x8_t v599 = vsubq_s16(v365, v371); + int16x8_t v600_tmp = vqrdmulhq_n_s16(v599, 23673); + int16x8_t v600 = vaddq_s16(v600_tmp, v599); + int16x8_t v601 = vsubq_s16(v377, v383); + int16x8_t v602 = vaddq_s16(v600, v601); + int16x8_t v603 = vsubq_s16(v348, v356); + int16x8_t v604_tmp = vqrdmulhq_n_s16(v603, 23673); + int16x8_t v604 = vaddq_s16(v604_tmp, v603); + int16x8_t v605 = vsubq_s16(v342, v337); + int16x8_t v606 = vaddq_s16(v604, v605); + int16x8_t v607 = vqrdmulhq_n_s16(v606, 20398); + int16x8_t v608 = vaddq_s16(v602, v607); + int16x8_t v609 = vsubq_s16(v391, v397); + int16x8_t v610 = vsubq_s16(v403, v411); + int16x8_t v611_tmp = vqrdmulhq_n_s16(v610, 23673); + int16x8_t v611 = vaddq_s16(v611_tmp, v610); + int16x8_t v612 = vaddq_s16(v609, v611); + int16x8_t v613 = vsubq_s16(v419, v426); + int16x8_t v614 = vqrdmulhq_n_s16(v613, 20398); + int16x8_t v615 = vsubq_s16(v439, v433); + int16x8_t v616_tmp = vqrdmulhq_n_s16(v615, 2367); + int16x8_t v616 = vaddq_s16(v616_tmp, v615); + int16x8_t v617 = vaddq_s16(v614, v616); + int16x8_t v618 = vaddq_s16(v612, v617); + int16x8_t v619 = vqrdmulhq_n_s16(v618, 17255); + int16x8_t v620 = vaddq_s16(v608, v619); + int16x8_t v621 = vsubq_s16(v160, v183); + int16x8_t v622 = vsubq_s16(v235, v225); + int16x8_t v623_tmp = vqrdmulhq_n_s16(v622, 3314); + int16x8_t v623 = vmlaq_n_s16(v623_tmp, v622, 5); + int16x8_t v624 = vaddq_s16(v621, v623); + int16x8_t v625 = vsubq_s16(v254, v279); + int16x8_t v626 = vsubq_s16(v299, v324); + int16x8_t v627_tmp = vqrdmulhq_n_s16(v626, 3314); + int16x8_t v627 = vmlaq_n_s16(v627_tmp, v626, 5); + int16x8_t v628 = vaddq_s16(v625, v627); + int16x8_t v629 = vqrdmulhq_n_s16(v628, 22112); + int16x8_t v630 = vaddq_s16(v624, v629); + int16x8_t v631 = vqrdmulhq_n_s16(v630, 17561); + int16x8_t v632 = vsubq_s16(v9, v24); + int16x8_t v633 = vsubq_s16(v42, v58); + int16x8_t v634_tmp = vqrdmulhq_n_s16(v633, 3314); + int16x8_t v634 = vmlaq_n_s16(v634_tmp, v633, 5); + int16x8_t v635 = vaddq_s16(v632, v634); + int16x8_t v636 = vsubq_s16(v78, v101); + int16x8_t v637 = vsubq_s16(v138, v128); + int16x8_t v638_tmp = vqrdmulhq_n_s16(v637, 3314); + int16x8_t v638 = vmlaq_n_s16(v638_tmp, v637, 5); + int16x8_t v639 = vaddq_s16(v636, v638); + int16x8_t v640 = vqrdmulhq_n_s16(v639, 22112); + int16x8_t v641 = vaddq_s16(v635, v640); + int16x8_t v642 = vaddq_s16(v631, v641); + int16x8_t v643 = vsubq_s16(v632, v634); + int16x8_t v644 = vsubq_s16(v636, v638); + int16x8_t v645 = vqrdmulhq_n_s16(v644, 24397); + int16x8_t v646 = vaddq_s16(v643, v645); + int16x8_t v647 = vsubq_s16(v621, v623); + int16x8_t v648 = vsubq_s16(v625, v627); + int16x8_t v649 = vqrdmulhq_n_s16(v648, 24397); + int16x8_t v650 = vaddq_s16(v647, v649); + int16x8_t v651 = vqrdmulhq_n_s16(v650, 17921); + int16x8_t v652 = vaddq_s16(v646, v651); + int16x8_t v653 = vsubq_s16(v601, v600); + int16x8_t v654 = vsubq_s16(v605, v604); + int16x8_t v655 = vqrdmulhq_n_s16(v654, 27504); + int16x8_t v656 = vaddq_s16(v653, v655); + int16x8_t v657 = vsubq_s16(v609, v611); + int16x8_t v658 = vqrdmulhq_n_s16(v613, 27504); + int16x8_t v659_tmp = vqrdmulhq_n_s16(v615, 14606); + int16x8_t v659 = vaddq_s16(v659_tmp, v615); + int16x8_t v660 = vsubq_s16(v658, v659); + int16x8_t v661 = vaddq_s16(v657, v660); + int16x8_t v662 = vqrdmulhq_n_s16(v661, 18343); + int16x8_t v663 = vaddq_s16(v656, v662); + int16x8_t v664 = vsubq_s16(v582, v584); + int16x8_t v665 = vsubq_s16(v576, v579); + int16x8_t v666 = vqrdmulhq_n_s16(v665, 31869); + int16x8_t v667 = vaddq_s16(v664, v666); + int16x8_t v668 = vsubq_s16(v587, v590); + int16x8_t v669_tmp = vqrdmulhq_n_s16(v594, 23444); + int16x8_t v669 = vaddq_s16(v669_tmp, v594); + int16x8_t v670 = vsubq_s16(v592, v669); + int16x8_t v671 = vqrdmulhq_n_s16(v670, 31869); + int16x8_t v672 = vaddq_s16(v668, v671); + int16x8_t v673 = vqrdmulhq_n_s16(v672, 18830); + int16x8_t v674 = vaddq_s16(v667, v673); + int16x8_t v675 = vsubq_s16(v553, v555); + int16x8_t v676 = vsubq_s16(v557, v559); + int16x8_t v677_tmp = vqrdmulhq_n_s16(v676, 5552); + int16x8_t v677 = vaddq_s16(v677_tmp, v676); + int16x8_t v678 = vaddq_s16(v675, v677); + int16x8_t v679 = vsubq_s16(v563, v565); + int16x8_t v680 = vsubq_s16(v567, v570); + int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 5552); + int16x8_t v681 = vaddq_s16(v681_tmp, v680); + int16x8_t v682 = vaddq_s16(v679, v681); + int16x8_t v683 = vqrdmulhq_n_s16(v682, 19393); + int16x8_t v684 = vaddq_s16(v678, v683); + int16x8_t v685 = vsubq_s16(v507, v512); + int16x8_t v686 = vsubq_s16(v517, v522); + int16x8_t v687_tmp = vqrdmulhq_n_s16(v686, 15865); + int16x8_t v687 = vaddq_s16(v687_tmp, v686); + int16x8_t v688 = vaddq_s16(v685, v687); + int16x8_t v689 = vsubq_s16(v529, v534); + int16x8_t v690_tmp = vqrdmulhq_n_s16(v548, 28937); + int16x8_t v690 = vaddq_s16(v690_tmp, v548); + int16x8_t v691 = vsubq_s16(v539, v690); + int16x8_t v692_tmp = vqrdmulhq_n_s16(v691, 15865); + int16x8_t v692 = vaddq_s16(v692_tmp, v691); + int16x8_t v693 = vaddq_s16(v689, v692); + int16x8_t v694 = vqrdmulhq_n_s16(v693, 20040); + int16x8_t v695 = vaddq_s16(v688, v694); + int16x8_t v696 = vsubq_s16(v476, v486); + int16x8_t v697_tmp = vqrdmulhq_n_s16(v696, 1893); + int16x8_t v697 = vmlaq_n_s16(v697_tmp, v696, 2); + int16x8_t v698 = vsubq_s16(v493, v499); + int16x8_t v699 = vaddq_s16(v697, v698); + int16x8_t v700 = vqrdmulhq_n_s16(v699, 20783); + int16x8_t v701 = vsubq_s16(v450, v456); + int16x8_t v702 = vsubq_s16(v462, v468); + int16x8_t v703_tmp = vqrdmulhq_n_s16(v702, 1893); + int16x8_t v703 = vmlaq_n_s16(v703_tmp, v702, 2); + int16x8_t v704 = vaddq_s16(v701, v703); + int16x8_t v705 = vaddq_s16(v700, v704); + int16x8_t v706 = vsubq_s16(v384, v373); + int16x8_t v707 = vsubq_s16(v343, v358); + int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 13357); + int16x8_t v708 = vmlaq_n_s16(v708_tmp, v707, 3); + int16x8_t v709 = vaddq_s16(v706, v708); + int16x8_t v710 = vsubq_s16(v398, v413); + int16x8_t v711 = vsubq_s16(v427, v441); + int16x8_t v712_tmp = vqrdmulhq_n_s16(v711, 13357); + int16x8_t v712 = vmlaq_n_s16(v712_tmp, v711, 3); + int16x8_t v713 = vaddq_s16(v710, v712); + int16x8_t v714 = vqrdmulhq_n_s16(v713, 21637); + int16x8_t v715 = vaddq_s16(v709, v714); + int16x8_t v716 = vsubq_s16(v25, v60); + int16x8_t v717 = vsubq_s16(v102, v140); + int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 6226); + int16x8_t v718 = vmlaq_n_s16(v718_tmp, v717, 10); + int16x8_t v719 = vaddq_s16(v716, v718); + int16x8_t v720 = vsubq_s16(v280, v326); + int16x8_t v721_tmp = vqrdmulhq_n_s16(v720, 6226); + int16x8_t v721 = vmlaq_n_s16(v721_tmp, v720, 10); + int16x8_t v722 = vsubq_s16(v184, v237); + int16x8_t v723 = vaddq_s16(v721, v722); + int16x8_t v724 = vqrdmulhq_n_s16(v723, 22622); + int16x8_t v725 = vaddq_s16(v719, v724); + int16x8_t v726 = vsubq_s16(v716, v718); + int16x8_t v727 = vsubq_s16(v722, v721); + int16x8_t v728 = vqrdmulhq_n_s16(v727, 23761); + int16x8_t v729 = vaddq_s16(v726, v728); + int16x8_t v730 = vsubq_s16(v706, v708); + int16x8_t v731 = vsubq_s16(v710, v712); + int16x8_t v732 = vqrdmulhq_n_s16(v731, 25084); + int16x8_t v733 = vaddq_s16(v730, v732); + int16x8_t v734 = vsubq_s16(v701, v703); + int16x8_t v735 = vsubq_s16(v698, v697); + int16x8_t v736 = vqrdmulhq_n_s16(v735, 26631); + int16x8_t v737 = vaddq_s16(v734, v736); + int16x8_t v738 = vsubq_s16(v685, v687); + int16x8_t v739 = vsubq_s16(v689, v692); + int16x8_t v740 = vqrdmulhq_n_s16(v739, 28454); + int16x8_t v741 = vaddq_s16(v738, v740); + int16x8_t v742 = vsubq_s16(v675, v677); + int16x8_t v743 = vsubq_s16(v679, v681); + int16x8_t v744 = vqrdmulhq_n_s16(v743, 30624); + int16x8_t v745 = vaddq_s16(v742, v744); + int16x8_t v746 = vsubq_s16(v664, v666); + int16x8_t v747 = vsubq_s16(v668, v671); + int16x8_t v748_tmp = vqrdmulhq_n_s16(v747, 472); + int16x8_t v748 = vaddq_s16(v748_tmp, v747); + int16x8_t v749 = vaddq_s16(v746, v748); + int16x8_t v750 = vsubq_s16(v653, v655); + int16x8_t v751 = vsubq_s16(v657, v660); + int16x8_t v752_tmp = vqrdmulhq_n_s16(v751, 3672); + int16x8_t v752 = vaddq_s16(v752_tmp, v751); + int16x8_t v753 = vaddq_s16(v750, v752); + int16x8_t v754 = vsubq_s16(v643, v645); + int16x8_t v755 = vsubq_s16(v647, v649); + int16x8_t v756_tmp = vqrdmulhq_n_s16(v755, 7662); + int16x8_t v756 = vaddq_s16(v756_tmp, v755); + int16x8_t v757 = vaddq_s16(v754, v756); + int16x8_t v758 = vsubq_s16(v635, v640); + int16x8_t v759 = vsubq_s16(v624, v629); + int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 12756); + int16x8_t v760 = vaddq_s16(v760_tmp, v759); + int16x8_t v761 = vaddq_s16(v758, v760); + int16x8_t v762 = vsubq_s16(v602, v607); + int16x8_t v763 = vsubq_s16(v612, v617); + int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 19463); + int16x8_t v764 = vaddq_s16(v764_tmp, v763); + int16x8_t v765 = vaddq_s16(v762, v764); + int16x8_t v766 = vsubq_s16(v585, v581); + int16x8_t v767 = vsubq_s16(v591, v595); + int16x8_t v768_tmp = vqrdmulhq_n_s16(v767, 28661); + int16x8_t v768 = vaddq_s16(v768_tmp, v767); + int16x8_t v769 = vaddq_s16(v766, v768); + int16x8_t v770 = vsubq_s16(v556, v561); + int16x8_t v771 = vsubq_s16(v566, v572); + int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 9242); + int16x8_t v772 = vmlaq_n_s16(v772_tmp, v771, 2); + int16x8_t v773 = vaddq_s16(v770, v772); + int16x8_t v774 = vsubq_s16(v513, v524); + int16x8_t v775 = vsubq_s16(v535, v549); + int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 30298); + int16x8_t v776 = vmlaq_n_s16(v776_tmp, v775, 2); + int16x8_t v777 = vaddq_s16(v774, v776); + int16x8_t v778 = vsubq_s16(v457, v470); + int16x8_t v779 = vsubq_s16(v500, v488); + int16x8_t v780_tmp = vqrdmulhq_n_s16(v779, 2773); + int16x8_t v780 = vmlaq_n_s16(v780_tmp, v779, 4); + int16x8_t v781 = vaddq_s16(v778, v780); + int16x8_t v782 = vsubq_s16(v385, v360); + int16x8_t v783 = vsubq_s16(v414, v443); + int16x8_t v784_tmp = vqrdmulhq_n_s16(v783, 26108); + int16x8_t v784 = vmlaq_n_s16(v784_tmp, v783, 6); + int16x8_t v785 = vaddq_s16(v782, v784); + int16x8_t v786 = vsubq_s16(v61, v142); + int16x8_t v787 = vsubq_s16(v238, v328); + int16x8_t v788_tmp = vqrdmulhq_n_s16(v787, 12251); + int16x8_t v788 = vmlaq_n_s16(v788_tmp, v787, 20); + int16x8_t v789 = vaddq_s16(v786, v788); + int16x8_t v790 = vsubq_s16(v786, v788); + int16x8_t v791 = vsubq_s16(v782, v784); + int16x8_t v792 = vsubq_s16(v778, v780); + int16x8_t v793 = vsubq_s16(v774, v776); + int16x8_t v794 = vsubq_s16(v770, v772); + int16x8_t v795 = vsubq_s16(v766, v768); + int16x8_t v796 = vsubq_s16(v762, v764); + int16x8_t v797 = vsubq_s16(v758, v760); + int16x8_t v798 = vsubq_s16(v754, v756); + int16x8_t v799 = vsubq_s16(v750, v752); + int16x8_t v800 = vsubq_s16(v746, v748); + int16x8_t v801 = vsubq_s16(v742, v744); + int16x8_t v802 = vsubq_s16(v738, v740); + int16x8_t v803 = vsubq_s16(v734, v736); + int16x8_t v804 = vsubq_s16(v730, v732); + int16x8_t v805 = vsubq_s16(v726, v728); + int16x8_t v806 = vsubq_s16(v719, v724); + int16x8_t v807 = vsubq_s16(v709, v714); + int16x8_t v808 = vsubq_s16(v704, v700); + int16x8_t v809 = vsubq_s16(v688, v694); + int16x8_t v810 = vsubq_s16(v678, v683); + int16x8_t v811 = vsubq_s16(v667, v673); + int16x8_t v812 = vsubq_s16(v656, v662); + int16x8_t v813 = vsubq_s16(v646, v651); + int16x8_t v814 = vsubq_s16(v641, v631); + int16x8_t v815 = vsubq_s16(v608, v619); + int16x8_t v816 = vsubq_s16(v586, v597); + int16x8_t v817 = vsubq_s16(v562, v574); + int16x8_t v818 = vsubq_s16(v525, v551); + int16x8_t v819 = vsubq_s16(v471, v502); + int16x8_t v820 = vsubq_s16(v386, v445); + int16x8_t v821 = vsubq_s16(v143, v330); + vst1q_s16(out + out_stride * 0 + i, v331); + vst1q_s16(out + out_stride * 1 + i, v446); + vst1q_s16(out + out_stride * 2 + i, v503); + vst1q_s16(out + out_stride * 3 + i, v552); + vst1q_s16(out + out_stride * 4 + i, v575); + vst1q_s16(out + out_stride * 5 + i, v598); + vst1q_s16(out + out_stride * 6 + i, v620); + vst1q_s16(out + out_stride * 7 + i, v642); + vst1q_s16(out + out_stride * 8 + i, v652); + vst1q_s16(out + out_stride * 9 + i, v663); + vst1q_s16(out + out_stride * 10 + i, v674); + vst1q_s16(out + out_stride * 11 + i, v684); + vst1q_s16(out + out_stride * 12 + i, v695); + vst1q_s16(out + out_stride * 13 + i, v705); + vst1q_s16(out + out_stride * 14 + i, v715); + vst1q_s16(out + out_stride * 15 + i, v725); + vst1q_s16(out + out_stride * 16 + i, v729); + vst1q_s16(out + out_stride * 17 + i, v733); + vst1q_s16(out + out_stride * 18 + i, v737); + vst1q_s16(out + out_stride * 19 + i, v741); + vst1q_s16(out + out_stride * 20 + i, v745); + vst1q_s16(out + out_stride * 21 + i, v749); + vst1q_s16(out + out_stride * 22 + i, v753); + vst1q_s16(out + out_stride * 23 + i, v757); + vst1q_s16(out + out_stride * 24 + i, v761); + vst1q_s16(out + out_stride * 25 + i, v765); + vst1q_s16(out + out_stride * 26 + i, v769); + vst1q_s16(out + out_stride * 27 + i, v773); + vst1q_s16(out + out_stride * 28 + i, v777); + vst1q_s16(out + out_stride * 29 + i, v781); + vst1q_s16(out + out_stride * 30 + i, v785); + vst1q_s16(out + out_stride * 31 + i, v789); + vst1q_s16(out + out_stride * 32 + i, v790); + vst1q_s16(out + out_stride * 33 + i, v791); + vst1q_s16(out + out_stride * 34 + i, v792); + vst1q_s16(out + out_stride * 35 + i, v793); + vst1q_s16(out + out_stride * 36 + i, v794); + vst1q_s16(out + out_stride * 37 + i, v795); + vst1q_s16(out + out_stride * 38 + i, v796); + vst1q_s16(out + out_stride * 39 + i, v797); + vst1q_s16(out + out_stride * 40 + i, v798); + vst1q_s16(out + out_stride * 41 + i, v799); + vst1q_s16(out + out_stride * 42 + i, v800); + vst1q_s16(out + out_stride * 43 + i, v801); + vst1q_s16(out + out_stride * 44 + i, v802); + vst1q_s16(out + out_stride * 45 + i, v803); + vst1q_s16(out + out_stride * 46 + i, v804); + vst1q_s16(out + out_stride * 47 + i, v805); + vst1q_s16(out + out_stride * 48 + i, v806); + vst1q_s16(out + out_stride * 49 + i, v807); + vst1q_s16(out + out_stride * 50 + i, v808); + vst1q_s16(out + out_stride * 51 + i, v809); + vst1q_s16(out + out_stride * 52 + i, v810); + vst1q_s16(out + out_stride * 53 + i, v811); + vst1q_s16(out + out_stride * 54 + i, v812); + vst1q_s16(out + out_stride * 55 + i, v813); + vst1q_s16(out + out_stride * 56 + i, v814); + vst1q_s16(out + out_stride * 57 + i, v815); + vst1q_s16(out + out_stride * 58 + i, v816); + vst1q_s16(out + out_stride * 59 + i, v817); + vst1q_s16(out + out_stride * 60 + i, v818); + vst1q_s16(out + out_stride * 61 + i, v819); + vst1q_s16(out + out_stride * 62 + i, v820); + vst1q_s16(out + out_stride * 63 + i, v821); + } +} diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h new file mode 100644 index 0000000000..946ace4a0c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h @@ -0,0 +1,80 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* This file is automatically generated. Do not modify it directly. */ +#if HWY_TARGET != HWY_NEON +#error "only include this file from fast_dct-inl.h" +#endif + +constexpr size_t FastIDCTIntegerBits(FastDCTTag<8>) { return 1; } + +void FastIDCT(FastDCTTag<8>, const int16_t* in, size_t in_stride, int16_t* out, + size_t out_stride, size_t count) { + JXL_ASSERT(count % 8 == 0); + for (size_t i = 0; i < count; i += 8) { + int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); + int16x8_t v1 = vld1q_s16(in + in_stride * 4 + i); + int16x8_t v2 = vaddq_s16(v0, v1); + int16x8_t v3 = vld1q_s16(in + in_stride * 2 + i); + int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); + int16x8_t v4 = vaddq_s16(v4_tmp, v3); + int16x8_t v5 = vld1q_s16(in + in_stride * 6 + i); + int16x8_t v6 = vaddq_s16(v5, v3); + int16x8_t v7 = vaddq_s16(v4, v6); + int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); + int16x8_t v9 = vaddq_s16(v2, v8); + int16x8_t v10 = vld1q_s16(in + in_stride * 1 + i); + int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); + int16x8_t v11 = vaddq_s16(v11_tmp, v10); + int16x8_t v12 = vld1q_s16(in + in_stride * 5 + i); + int16x8_t v13 = vld1q_s16(in + in_stride * 3 + i); + int16x8_t v14 = vaddq_s16(v12, v13); + int16x8_t v15 = vaddq_s16(v11, v14); + int16x8_t v16 = vaddq_s16(v13, v10); + int16x8_t v17 = vqrdmulhq_n_s16(v16, 25080); + int16x8_t v18 = vld1q_s16(in + in_stride * 7 + i); + int16x8_t v19 = vaddq_s16(v18, v12); + int16x8_t v20 = vaddq_s16(v16, v19); + int16x8_t v21 = vqrdmulhq_n_s16(v20, 17734); + int16x8_t v22 = vaddq_s16(v17, v21); + int16x8_t v23 = vaddq_s16(v15, v22); + int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); + int16x8_t v25 = vaddq_s16(v9, v24); + int16x8_t v26 = vsubq_s16(v0, v1); + int16x8_t v27 = vsubq_s16(v4, v6); + int16x8_t v28_tmp = vqrdmulhq_n_s16(v27, 10045); + int16x8_t v28 = vaddq_s16(v28_tmp, v27); + int16x8_t v29 = vaddq_s16(v26, v28); + int16x8_t v30 = vsubq_s16(v11, v14); + int16x8_t v31 = vqrdmulhq_n_s16(v16, 17734); + int16x8_t v32_tmp = vqrdmulhq_n_s16(v19, 10045); + int16x8_t v32 = vaddq_s16(v32_tmp, v19); + int16x8_t v33 = vsubq_s16(v31, v32); + int16x8_t v34 = vaddq_s16(v30, v33); + int16x8_t v35 = vqrdmulhq_n_s16(v34, 19705); + int16x8_t v36 = vaddq_s16(v29, v35); + int16x8_t v37 = vsubq_s16(v26, v28); + int16x8_t v38 = vsubq_s16(v30, v33); + int16x8_t v39 = vqrdmulhq_n_s16(v38, 29490); + int16x8_t v40 = vaddq_s16(v37, v39); + int16x8_t v41 = vsubq_s16(v2, v8); + int16x8_t v42 = vsubq_s16(v15, v22); + int16x8_t v43_tmp = vqrdmulhq_n_s16(v42, 18446); + int16x8_t v43 = vmlaq_n_s16(v43_tmp, v42, 2); + int16x8_t v44 = vaddq_s16(v41, v43); + int16x8_t v45 = vsubq_s16(v41, v43); + int16x8_t v46 = vsubq_s16(v37, v39); + int16x8_t v47 = vsubq_s16(v29, v35); + int16x8_t v48 = vsubq_s16(v9, v24); + vst1q_s16(out + out_stride * 0 + i, v25); + vst1q_s16(out + out_stride * 1 + i, v36); + vst1q_s16(out + out_stride * 2 + i, v40); + vst1q_s16(out + out_stride * 3 + i, v44); + vst1q_s16(out + out_stride * 4 + i, v45); + vst1q_s16(out + out_stride * 5 + i, v46); + vst1q_s16(out + out_stride * 6 + i, v47); + vst1q_s16(out + out_stride * 7 + i, v48); + } +} diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct_test.cc b/third_party/jpeg-xl/lib/jxl/fast_dct_test.cc new file mode 100644 index 0000000000..5bb1a79cc5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct_test.cc @@ -0,0 +1,378 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/fast_dct_test.cc" +#include + +#include "lib/jxl/base/random.h" +#include "lib/jxl/dct-inl.h" +#include "lib/jxl/fast_dct-inl.h" +#include "lib/jxl/fast_dct.h" +#include "lib/jxl/transpose-inl.h" + +// Test utils +#include +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +template +HWY_NOINLINE void TestFastTranspose() { +#if HWY_TARGET == HWY_NEON + auto array_mem = hwy::AllocateAligned(N * M); + int16_t* array = array_mem.get(); + auto transposed_mem = hwy::AllocateAligned(N * M); + int16_t* transposed = transposed_mem.get(); + std::iota(array, array + N * M, 0); + for (size_t j = 0; j < 100000000 / (N * M); j++) { + FastTransposeBlock(array, M, N, M, transposed, N); + } + for (size_t i = 0; i < M; i++) { + for (size_t j = 0; j < N; j++) { + EXPECT_EQ(array[j * M + i], transposed[i * N + j]); + } + } +#endif +} + +template +HWY_NOINLINE void TestFloatTranspose() { + auto array_mem = hwy::AllocateAligned(N * M); + float* array = array_mem.get(); + auto transposed_mem = hwy::AllocateAligned(N * M); + float* transposed = transposed_mem.get(); + std::iota(array, array + N * M, 0); + for (size_t j = 0; j < 100000000 / (N * M); j++) { + Transpose::Run(DCTFrom(array, M), DCTTo(transposed, N)); + } + for (size_t i = 0; i < M; i++) { + for (size_t j = 0; j < N; j++) { + EXPECT_EQ(array[j * M + i], transposed[i * N + j]); + } + } +} + +// TODO(sboukortt): re-enable the FloatIDCT tests once we find out why they fail +// in ASAN mode in the CI runners and seemingly not locally. + +HWY_NOINLINE void TestFastTranspose8x8() { TestFastTranspose<8, 8>(); } +HWY_NOINLINE void TestFloatTranspose8x8() { TestFloatTranspose<8, 8>(); } +HWY_NOINLINE void TestFastIDCT8x8() { TestFastIDCT<8, 8>(); } +HWY_NOINLINE void TestFloatIDCT8x8() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<8, 8>(); +#endif +} +HWY_NOINLINE void TestFastTranspose8x16() { TestFastTranspose<8, 16>(); } +HWY_NOINLINE void TestFloatTranspose8x16() { TestFloatTranspose<8, 16>(); } +HWY_NOINLINE void TestFastIDCT8x16() { TestFastIDCT<8, 16>(); } +HWY_NOINLINE void TestFloatIDCT8x16() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<8, 16>(); +#endif +} +HWY_NOINLINE void TestFastTranspose8x32() { TestFastTranspose<8, 32>(); } +HWY_NOINLINE void TestFloatTranspose8x32() { TestFloatTranspose<8, 32>(); } +HWY_NOINLINE void TestFastIDCT8x32() { TestFastIDCT<8, 32>(); } +HWY_NOINLINE void TestFloatIDCT8x32() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<8, 32>(); +#endif +} +HWY_NOINLINE void TestFastTranspose16x8() { TestFastTranspose<16, 8>(); } +HWY_NOINLINE void TestFloatTranspose16x8() { TestFloatTranspose<16, 8>(); } +HWY_NOINLINE void TestFastIDCT16x8() { TestFastIDCT<16, 8>(); } +HWY_NOINLINE void TestFloatIDCT16x8() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<16, 8>(); +#endif +} +HWY_NOINLINE void TestFastTranspose16x16() { TestFastTranspose<16, 16>(); } +HWY_NOINLINE void TestFloatTranspose16x16() { TestFloatTranspose<16, 16>(); } +HWY_NOINLINE void TestFastIDCT16x16() { TestFastIDCT<16, 16>(); } +HWY_NOINLINE void TestFloatIDCT16x16() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<16, 16>(); +#endif +} +HWY_NOINLINE void TestFastTranspose16x32() { TestFastTranspose<16, 32>(); } +HWY_NOINLINE void TestFloatTranspose16x32() { TestFloatTranspose<16, 32>(); } +HWY_NOINLINE void TestFastIDCT16x32() { TestFastIDCT<16, 32>(); } +HWY_NOINLINE void TestFloatIDCT16x32() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<16, 32>(); +#endif +} +HWY_NOINLINE void TestFastTranspose32x8() { TestFastTranspose<32, 8>(); } +HWY_NOINLINE void TestFloatTranspose32x8() { TestFloatTranspose<32, 8>(); } +HWY_NOINLINE void TestFastIDCT32x8() { TestFastIDCT<32, 8>(); } +HWY_NOINLINE void TestFloatIDCT32x8() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<32, 8>(); +#endif +} +HWY_NOINLINE void TestFastTranspose32x16() { TestFastTranspose<32, 16>(); } +HWY_NOINLINE void TestFloatTranspose32x16() { TestFloatTranspose<32, 16>(); } +HWY_NOINLINE void TestFastIDCT32x16() { TestFastIDCT<32, 16>(); } +HWY_NOINLINE void TestFloatIDCT32x16() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<32, 16>(); +#endif +} +HWY_NOINLINE void TestFastTranspose32x32() { TestFastTranspose<32, 32>(); } +HWY_NOINLINE void TestFloatTranspose32x32() { TestFloatTranspose<32, 32>(); } +HWY_NOINLINE void TestFastIDCT32x32() { TestFastIDCT<32, 32>(); } +HWY_NOINLINE void TestFloatIDCT32x32() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<32, 32>(); +#endif +} +HWY_NOINLINE void TestFastTranspose32x64() { TestFastTranspose<32, 64>(); } +HWY_NOINLINE void TestFloatTranspose32x64() { TestFloatTranspose<32, 64>(); } +HWY_NOINLINE void TestFastIDCT32x64() { TestFastIDCT<32, 64>(); } +HWY_NOINLINE void TestFloatIDCT32x64() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<32, 64>(); +#endif +} +HWY_NOINLINE void TestFastTranspose64x32() { TestFastTranspose<64, 32>(); } +HWY_NOINLINE void TestFloatTranspose64x32() { TestFloatTranspose<64, 32>(); } +HWY_NOINLINE void TestFastIDCT64x32() { TestFastIDCT<64, 32>(); } +HWY_NOINLINE void TestFloatIDCT64x32() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<64, 32>(); +#endif +} +HWY_NOINLINE void TestFastTranspose64x64() { TestFastTranspose<64, 64>(); } +HWY_NOINLINE void TestFloatTranspose64x64() { TestFloatTranspose<64, 64>(); } +HWY_NOINLINE void TestFastIDCT64x64() { TestFastIDCT<64, 64>(); } +HWY_NOINLINE void TestFloatIDCT64x64() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<64, 64>(); +#endif +} +HWY_NOINLINE void TestFastTranspose64x128() { TestFastTranspose<64, 128>(); } +HWY_NOINLINE void TestFloatTranspose64x128() { TestFloatTranspose<64, 128>(); } +/* +HWY_NOINLINE void TestFastIDCT64x128() { TestFastIDCT<64, 128>(); } +HWY_NOINLINE void TestFloatIDCT64x128() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<64, 128>(); +#endif +} +*/ +HWY_NOINLINE void TestFastTranspose128x64() { TestFastTranspose<128, 64>(); } +HWY_NOINLINE void TestFloatTranspose128x64() { TestFloatTranspose<128, 64>(); } +/* +HWY_NOINLINE void TestFastIDCT128x64() { TestFastIDCT<128, 64>(); } +HWY_NOINLINE void TestFloatIDCT128x64() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<128, 64>(); +#endif +} +*/ +HWY_NOINLINE void TestFastTranspose128x128() { TestFastTranspose<128, 128>(); } +HWY_NOINLINE void TestFloatTranspose128x128() { + TestFloatTranspose<128, 128>(); +} +/* +HWY_NOINLINE void TestFastIDCT128x128() { TestFastIDCT<128, 128>(); } +HWY_NOINLINE void TestFloatIDCT128x128() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<128, 128>(); +#endif +} +*/ +HWY_NOINLINE void TestFastTranspose128x256() { TestFastTranspose<128, 256>(); } +HWY_NOINLINE void TestFloatTranspose128x256() { + TestFloatTranspose<128, 256>(); +} +/* +HWY_NOINLINE void TestFastIDCT128x256() { TestFastIDCT<128, 256>(); } +HWY_NOINLINE void TestFloatIDCT128x256() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<128, 256>(); +#endif +} +*/ +HWY_NOINLINE void TestFastTranspose256x128() { TestFastTranspose<256, 128>(); } +HWY_NOINLINE void TestFloatTranspose256x128() { + TestFloatTranspose<256, 128>(); +} +/* +HWY_NOINLINE void TestFastIDCT256x128() { TestFastIDCT<256, 128>(); } +HWY_NOINLINE void TestFloatIDCT256x128() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<256, 128>(); +#endif +} +*/ +HWY_NOINLINE void TestFastTranspose256x256() { TestFastTranspose<256, 256>(); } +HWY_NOINLINE void TestFloatTranspose256x256() { + TestFloatTranspose<256, 256>(); +} +/* +HWY_NOINLINE void TestFastIDCT256x256() { TestFastIDCT<256, 256>(); } +HWY_NOINLINE void TestFloatIDCT256x256() { +#if HWY_TARGET == HWY_SCALAR && \ + (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER)) + GTEST_SKIP(); +#else + TestFloatIDCT<256, 256>(); +#endif +} +*/ + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class FastDCTTargetTest : public hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(FastDCTTargetTest); + +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose8x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose8x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose8x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose8x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose8x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose8x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose16x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose16x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose16x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose16x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose16x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose16x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose64x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose64x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose64x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose64x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose64x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose64x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose128x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose128x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose128x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose128x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose128x256); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose128x256); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose256x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose256x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose256x256); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose256x256); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT8x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT8x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT8x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT8x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT8x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT8x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT16x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT16x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT16x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT16x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT16x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT16x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x8); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x16); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT64x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT64x32); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT64x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT64x64); +/* + * DCT-128 and above have very large errors just by rounding inputs. +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT64x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT64x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT128x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT128x64); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT128x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT128x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT128x256); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT128x256); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT256x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT256x128); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT256x256); +HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT256x256); +*/ + +TEST(FastDCTTest, TestWrapperFloat) { BenchmarkFloatIDCT32x32(); } +TEST(FastDCTTest, TestWrapperFast) { BenchmarkFastIDCT32x32(); } + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/fast_math-inl.h b/third_party/jpeg-xl/lib/jxl/fast_math-inl.h new file mode 100644 index 0000000000..5c48034290 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_math-inl.h @@ -0,0 +1,236 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fast SIMD math ops (log2, encoder only, cos, erf for splines) + +#if defined(LIB_JXL_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_FAST_MATH_INL_H_ +#undef LIB_JXL_FAST_MATH_INL_H_ +#else +#define LIB_JXL_FAST_MATH_INL_H_ +#endif + +#include + +#include "lib/jxl/common.h" +#include "lib/jxl/rational_polynomial-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Abs; +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Eq; +using hwy::HWY_NAMESPACE::Floor; +using hwy::HWY_NAMESPACE::Ge; +using hwy::HWY_NAMESPACE::GetLane; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::IfThenZeroElse; +using hwy::HWY_NAMESPACE::Le; +using hwy::HWY_NAMESPACE::Min; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::NegMulAdd; +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::ShiftLeft; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::Sub; +using hwy::HWY_NAMESPACE::Xor; + +// Computes base-2 logarithm like std::log2. Undefined if negative / NaN. +// L1 error ~3.9E-6 +template +V FastLog2f(const DF df, V x) { + // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2). + HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f), + HWY_REP4(1.4287160470083755E+00f), + HWY_REP4(7.4245873327820566E-01f)}; + HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f), + HWY_REP4(1.0096718572241148E+00f), + HWY_REP4(1.7409343003366853E-01f)}; + + const Rebind di; + const auto x_bits = BitCast(di, x); + + // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops + const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); // = 2/3 + // Shifted exponent = log2; also used to clear mantissa. + const auto exp_shifted = ShiftRight<23>(exp_bits); + const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted))); + const auto exp_val = ConvertTo(df, exp_shifted); + return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q), + exp_val); +} + +// max relative error ~3e-7 +template +V FastPow2f(const DF df, V x) { + const Rebind di; + auto floorx = Floor(x); + auto exp = + BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127)))); + auto frac = Sub(x, floorx); + auto num = Add(frac, Set(df, 1.01749063e+01)); + num = MulAdd(num, frac, Set(df, 4.88687798e+01)); + num = MulAdd(num, frac, Set(df, 9.85506591e+01)); + num = Mul(num, exp); + auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02)); + den = MulAdd(den, frac, Set(df, -1.94414990e+01)); + den = MulAdd(den, frac, Set(df, 9.85506633e+01)); + return Div(num, den); +} + +// max relative error ~3e-5 +template +V FastPowf(const DF df, V base, V exponent) { + return FastPow2f(df, Mul(FastLog2f(df, base), exponent)); +} + +// Computes cosine like std::cos. +// L1 error 7e-5. +template +V FastCosf(const DF df, V x) { + // Step 1: range reduction to [0, 2pi) + const auto pi2 = Set(df, kPi * 2.0f); + const auto pi2_inv = Set(df, 0.5f / kPi); + const auto npi2 = Mul(Floor(Mul(x, pi2_inv)), pi2); + const auto xmodpi2 = Sub(x, npi2); + // Step 2: range reduction to [0, pi] + const auto x_pi = Min(xmodpi2, Sub(pi2, xmodpi2)); + // Step 3: range reduction to [0, pi/2] + const auto above_pihalf = Ge(x_pi, Set(df, kPi / 2.0f)); + const auto x_pihalf = IfThenElse(above_pihalf, Sub(Set(df, kPi), x_pi), x_pi); + // Step 4: Taylor-like approximation, scaled by 2**0.75 to make angle + // duplication steps faster, on x/4. + const auto xs = Mul(x_pihalf, Set(df, 0.25f)); + const auto x2 = Mul(xs, xs); + const auto x4 = Mul(x2, x2); + const auto cosx_prescaling = + MulAdd(x4, Set(df, 0.06960438), + MulAdd(x2, Set(df, -0.84087373), Set(df, 1.68179268))); + // Step 5: angle duplication. + const auto cosx_scale1 = + MulAdd(cosx_prescaling, cosx_prescaling, Set(df, -1.414213562)); + const auto cosx_scale2 = MulAdd(cosx_scale1, cosx_scale1, Set(df, -1)); + // Step 6: change sign if needed. + const Rebind du; + auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, above_pihalf))); + return BitCast(df, Xor(signbit, BitCast(du, cosx_scale2))); +} + +// Computes the error function like std::erf. +// L1 error 7e-4. +template +V FastErff(const DF df, V x) { + // Formula from + // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations + // but constants have been recomputed. + const auto xle0 = Le(x, Zero(df)); + const auto absx = Abs(x); + // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4 + const auto denom1 = + MulAdd(absx, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04)); + const auto denom2 = MulAdd(denom1, absx, Set(df, 2.32120216e-01)); + const auto denom3 = MulAdd(denom2, absx, Set(df, 2.77820801e-01)); + const auto denom4 = MulAdd(denom3, absx, Set(df, 1.0f)); + const auto denom5 = Mul(denom4, denom4); + const auto inv_denom5 = Div(Set(df, 1.0f), denom5); + const auto result = NegMulAdd(inv_denom5, inv_denom5, Set(df, 1.0f)); + // Change sign if needed. + const Rebind du; + auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, xle0))); + return BitCast(df, Xor(signbit, BitCast(du, result))); +} + +inline float FastLog2f(float f) { + HWY_CAPPED(float, 1) D; + return GetLane(FastLog2f(D, Set(D, f))); +} + +inline float FastPow2f(float f) { + HWY_CAPPED(float, 1) D; + return GetLane(FastPow2f(D, Set(D, f))); +} + +inline float FastPowf(float b, float e) { + HWY_CAPPED(float, 1) D; + return GetLane(FastPowf(D, Set(D, b), Set(D, e))); +} + +inline float FastCosf(float f) { + HWY_CAPPED(float, 1) D; + return GetLane(FastCosf(D, Set(D, f))); +} + +inline float FastErff(float f) { + HWY_CAPPED(float, 1) D; + return GetLane(FastErff(D, Set(D, f))); +} + +// Returns cbrt(x) + add with 6 ulp max error. +// Modified from vectormath_exp.h, Apache 2 license. +// https://www.agner.org/optimize/vectorclass.zip +template +V CubeRootAndAdd(const V x, const V add) { + const HWY_FULL(float) df; + const HWY_FULL(int32_t) di; + + const auto kExpBias = Set(di, 0x54800000); // cast(1.) + cast(1.) / 3 + const auto kExpMul = Set(di, 0x002AAAAA); // shifted 1/3 + const auto k1_3 = Set(df, 1.0f / 3); + const auto k4_3 = Set(df, 4.0f / 3); + + const auto xa = x; // assume inputs never negative + const auto xa_3 = Mul(k1_3, xa); + + // Multiply exponent by -1/3 + const auto m1 = BitCast(di, xa); + // Special case for 0. 0 is represented with an exponent of 0, so the + // "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse() + // sets those values as 0, which prevents having NaNs in the computations + // below. + // TODO(eustas): use fused op + const auto m2 = IfThenZeroElse( + Eq(m1, Zero(di)), Sub(kExpBias, Mul((ShiftRight<23>(m1)), kExpMul))); + auto r = BitCast(df, m2); + + // Newton-Raphson iterations + for (int i = 0; i < 3; i++) { + const auto r2 = Mul(r, r); + r = NegMulAdd(xa_3, Mul(r2, r2), Mul(k4_3, r)); + } + // Final iteration + auto r2 = Mul(r, r); + r = MulAdd(k1_3, NegMulAdd(xa, Mul(r2, r2), r), r); + r2 = Mul(r, r); + r = MulAdd(r2, x, add); + + return r; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_FAST_MATH_INL_H_ + +#if HWY_ONCE +#ifndef FAST_MATH_ONCE +#define FAST_MATH_ONCE + +namespace jxl { +inline float FastLog2f(float f) { return HWY_STATIC_DISPATCH(FastLog2f)(f); } +inline float FastPow2f(float f) { return HWY_STATIC_DISPATCH(FastPow2f)(f); } +inline float FastPowf(float b, float e) { + return HWY_STATIC_DISPATCH(FastPowf)(b, e); +} +inline float FastCosf(float f) { return HWY_STATIC_DISPATCH(FastCosf)(f); } +inline float FastErff(float f) { return HWY_STATIC_DISPATCH(FastErff)(f); } +} // namespace jxl + +#endif // FAST_MATH_ONCE +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/fast_math_test.cc b/third_party/jpeg-xl/lib/jxl/fast_math_test.cc new file mode 100644 index 0000000000..897aadc120 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_math_test.cc @@ -0,0 +1,288 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/fast_math_test.cc" +#include + +#include "lib/jxl/base/random.h" +#include "lib/jxl/dec_xyb-inl.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/transfer_functions-inl.h" + +// Test utils +#include +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +HWY_NOINLINE void TestFastLog2() { + constexpr size_t kNumTrials = 1 << 23; + Rng rng(1); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = rng.UniformF(1e-7f, 1e3f); + const auto actual_v = FastLog2f(d, Set(d, f)); + const float actual = GetLane(actual_v); + const float abs_err = std::abs(std::log2(f) - actual); + EXPECT_LT(abs_err, 3.1E-6) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastPow2() { + constexpr size_t kNumTrials = 1 << 23; + Rng rng(1); + float max_rel_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = rng.UniformF(-100, 100); + const auto actual_v = FastPow2f(d, Set(d, f)); + const float actual = GetLane(actual_v); + const float expected = std::pow(2, f); + const float rel_err = std::abs(expected - actual) / expected; + EXPECT_LT(rel_err, 3.1E-6) << "f = " << f; + max_rel_err = std::max(max_rel_err, rel_err); + } + printf("max rel err %e\n", static_cast(max_rel_err)); +} + +HWY_NOINLINE void TestFastPow() { + constexpr size_t kNumTrials = 1 << 23; + Rng rng(1); + float max_rel_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float b = rng.UniformF(1e-3f, 1e3f); + const float e = rng.UniformF(-10, 10); + const auto actual_v = FastPowf(d, Set(d, b), Set(d, e)); + const float actual = GetLane(actual_v); + const float expected = std::pow(b, e); + const float rel_err = std::abs(expected - actual) / expected; + EXPECT_LT(rel_err, 3E-5) << "b = " << b << " e = " << e; + max_rel_err = std::max(max_rel_err, rel_err); + } + printf("max rel err %e\n", static_cast(max_rel_err)); +} + +HWY_NOINLINE void TestFastCos() { + constexpr size_t kNumTrials = 1 << 23; + Rng rng(1); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = rng.UniformF(-1e3f, 1e3f); + const auto actual_v = FastCosf(d, Set(d, f)); + const float actual = GetLane(actual_v); + const float abs_err = std::abs(std::cos(f) - actual); + EXPECT_LT(abs_err, 7E-5) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastErf() { + constexpr size_t kNumTrials = 1 << 23; + Rng rng(1); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = rng.UniformF(-5.f, 5.f); + const auto actual_v = FastErff(d, Set(d, f)); + const float actual = GetLane(actual_v); + const float abs_err = std::abs(std::erf(f) - actual); + EXPECT_LT(abs_err, 7E-4) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestCubeRoot() { + const HWY_FULL(float) d; + for (uint64_t x5 = 0; x5 < 2000000; x5++) { + const float x = x5 * 1E-5f; + const float expected = cbrtf(x); + HWY_ALIGN float approx[MaxLanes(d)]; + Store(CubeRootAndAdd(Set(d, x), Zero(d)), d, approx); + + // All lanes are same + for (size_t i = 1; i < Lanes(d); ++i) { + EXPECT_NEAR(approx[0], approx[i], 5E-7f); + } + EXPECT_NEAR(approx[0], expected, 8E-7f); + } +} + +HWY_NOINLINE void TestFastSRGB() { + constexpr size_t kNumTrials = 1 << 23; + Rng rng(1); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = rng.UniformF(0.0f, 1.0f); + const auto actual_v = FastLinearToSRGB(d, Set(d, f)); + const float actual = GetLane(actual_v); + const float expected = GetLane(TF_SRGB().EncodedFromDisplay(d, Set(d, f))); + const float abs_err = std::abs(expected - actual); + EXPECT_LT(abs_err, 1.2E-4) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastPQEFD() { + constexpr size_t kNumTrials = 1 << 23; + Rng rng(1); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = rng.UniformF(0.0f, 1.0f); + const float actual = GetLane(TF_PQ().EncodedFromDisplay(d, Set(d, f))); + const float expected = TF_PQ().EncodedFromDisplay(f); + const float abs_err = std::abs(expected - actual); + EXPECT_LT(abs_err, 7e-7) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastHLGEFD() { + constexpr size_t kNumTrials = 1 << 23; + Rng rng(1); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = rng.UniformF(0.0f, 1.0f); + const float actual = GetLane(TF_HLG().EncodedFromDisplay(d, Set(d, f))); + const float expected = TF_HLG().EncodedFromDisplay(f); + const float abs_err = std::abs(expected - actual); + EXPECT_LT(abs_err, 5e-7) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFast709EFD() { + constexpr size_t kNumTrials = 1 << 23; + Rng rng(1); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = rng.UniformF(0.0f, 1.0f); + const float actual = GetLane(TF_709().EncodedFromDisplay(d, Set(d, f))); + const float expected = TF_709().EncodedFromDisplay(f); + const float abs_err = std::abs(expected - actual); + EXPECT_LT(abs_err, 2e-6) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastPQDFE() { + constexpr size_t kNumTrials = 1 << 23; + Rng rng(1); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = rng.UniformF(0.0f, 1.0f); + const float actual = GetLane(TF_PQ().DisplayFromEncoded(d, Set(d, f))); + const float expected = TF_PQ().DisplayFromEncoded(f); + const float abs_err = std::abs(expected - actual); + EXPECT_LT(abs_err, 3E-6) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastXYB() { + if (!HasFastXYBTosRGB8()) return; + ImageMetadata metadata; + ImageBundle ib(&metadata); + int scaling = 1; + int n = 256 * scaling; + float inv_scaling = 1.0f / scaling; + int kChunk = 32; + // The image is divided in chunks to reduce total memory usage. + for (int cr = 0; cr < n; cr += kChunk) { + for (int cg = 0; cg < n; cg += kChunk) { + for (int cb = 0; cb < n; cb += kChunk) { + Image3F chunk(kChunk * kChunk, kChunk); + for (int ir = 0; ir < kChunk; ir++) { + for (int ig = 0; ig < kChunk; ig++) { + for (int ib = 0; ib < kChunk; ib++) { + float r = (cr + ir) * inv_scaling; + float g = (cg + ig) * inv_scaling; + float b = (cb + ib) * inv_scaling; + chunk.PlaneRow(0, ir)[ig * kChunk + ib] = r * (1.0f / 255); + chunk.PlaneRow(1, ir)[ig * kChunk + ib] = g * (1.0f / 255); + chunk.PlaneRow(2, ir)[ig * kChunk + ib] = b * (1.0f / 255); + } + } + } + ib.SetFromImage(std::move(chunk), ColorEncoding::SRGB()); + Image3F xyb(kChunk * kChunk, kChunk); + std::vector roundtrip(kChunk * kChunk * kChunk * 3); + ToXYB(ib, nullptr, &xyb, GetJxlCms()); + for (int y = 0; y < kChunk; y++) { + const float* xyba[4] = {xyb.PlaneRow(0, y), xyb.PlaneRow(1, y), + xyb.PlaneRow(2, y), nullptr}; + jxl::HWY_NAMESPACE::FastXYBTosRGB8( + xyba, roundtrip.data() + 3 * xyb.xsize() * y, false, xyb.xsize()); + } + for (int ir = 0; ir < kChunk; ir++) { + for (int ig = 0; ig < kChunk; ig++) { + for (int ib = 0; ib < kChunk; ib++) { + float r = (cr + ir) * inv_scaling; + float g = (cg + ig) * inv_scaling; + float b = (cb + ib) * inv_scaling; + size_t idx = ir * kChunk * kChunk + ig * kChunk + ib; + int rr = roundtrip[3 * idx]; + int rg = roundtrip[3 * idx + 1]; + int rb = roundtrip[3 * idx + 2]; + EXPECT_LT(abs(r - rr), 2) << "expected " << r << " got " << rr; + EXPECT_LT(abs(g - rg), 2) << "expected " << g << " got " << rg; + EXPECT_LT(abs(b - rb), 2) << "expected " << b << " got " << rb; + } + } + } + } + } + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class FastMathTargetTest : public hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(FastMathTargetTest); + +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastLog2); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPow2); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPow); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastCos); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastErf); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestCubeRoot); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastSRGB); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPQDFE); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPQEFD); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastHLGEFD); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFast709EFD); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastXYB); + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/field_encodings.h b/third_party/jpeg-xl/lib/jxl/field_encodings.h new file mode 100644 index 0000000000..613e8fad33 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/field_encodings.h @@ -0,0 +1,134 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_FIELD_ENCODINGS_H_ +#define LIB_JXL_FIELD_ENCODINGS_H_ + +// Constants needed to encode/decode fields; avoids including the full fields.h. + +#include +#include + +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Macro to define the Fields' derived class Name when compiling with debug +// names. +#if JXL_IS_DEBUG_BUILD +#define JXL_FIELDS_NAME(X) \ + const char* Name() const override { return #X; } +#else +#define JXL_FIELDS_NAME(X) +#endif // JXL_IS_DEBUG_BUILD + +class Visitor; +class Fields { + public: + virtual ~Fields() = default; +#if JXL_IS_DEBUG_BUILD + virtual const char* Name() const = 0; +#endif // JXL_IS_DEBUG_BUILD + virtual Status VisitFields(Visitor* JXL_RESTRICT visitor) = 0; +}; + +// Distribution of U32 values for one particular selector. Represents either a +// power of two-sized range, or a single value. A separate type ensures this is +// only passed to the U32Enc ctor. +struct U32Distr { + // No need to validate - all `d` are legitimate. + constexpr explicit U32Distr(uint32_t d) : d(d) {} + + static constexpr uint32_t kDirect = 0x80000000u; + + constexpr bool IsDirect() const { return (d & kDirect) != 0; } + + // Only call if IsDirect(). + constexpr uint32_t Direct() const { return d & (kDirect - 1); } + + // Only call if !IsDirect(). + constexpr size_t ExtraBits() const { return (d & 0x1F) + 1; } + uint32_t Offset() const { return (d >> 5) & 0x3FFFFFF; } + + uint32_t d; +}; + +// A direct-coded 31-bit value occupying 2 bits in the bitstream. +constexpr U32Distr Val(uint32_t value) { + return U32Distr(value | U32Distr::kDirect); +} + +// Value - `offset` will be signaled in `bits` extra bits. +constexpr U32Distr BitsOffset(uint32_t bits, uint32_t offset) { + return U32Distr(((bits - 1) & 0x1F) + ((offset & 0x3FFFFFF) << 5)); +} + +// Value will be signaled in `bits` extra bits. +constexpr U32Distr Bits(uint32_t bits) { return BitsOffset(bits, 0); } + +// See U32Coder documentation in fields.h. +class U32Enc { + public: + constexpr U32Enc(const U32Distr d0, const U32Distr d1, const U32Distr d2, + const U32Distr d3) + : d_{d0, d1, d2, d3} {} + + // Returns the U32Distr at `selector` = 0..3, least-significant first. + U32Distr GetDistr(const uint32_t selector) const { + JXL_ASSERT(selector < 4); + return d_[selector]; + } + + private: + U32Distr d_[4]; +}; + +// Returns bit with the given `index` (0 = least significant). +template +static inline constexpr uint64_t MakeBit(T index) { + return 1ULL << static_cast(index); +} + +// Returns vector of all possible values of an Enum type. Relies on each Enum +// providing an overload of EnumBits() that returns a bit array of its values, +// which implies values must be in [0, 64). +template +std::vector Values() { + uint64_t bits = EnumBits(Enum()); + + std::vector values; + values.reserve(hwy::PopCount(bits)); + + // For each 1-bit in bits: add its index as value + while (bits != 0) { + const int index = Num0BitsBelowLS1Bit_Nonzero(bits); + values.push_back(static_cast(index)); + bits &= bits - 1; // clear least-significant bit + } + return values; +} + +// Returns true if value is one of Values(). +template +Status EnumValid(const Enum value) { + if (static_cast(value) >= 64) { + return JXL_FAILURE("Value %u too large for %s\n", + static_cast(value), EnumName(Enum())); + } + const uint64_t bit = MakeBit(value); + if ((EnumBits(Enum()) & bit) == 0) { + return JXL_FAILURE("Invalid value %u for %s\n", + static_cast(value), EnumName(Enum())); + } + return true; +} + +} // namespace jxl + +#endif // LIB_JXL_FIELD_ENCODINGS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/fields.cc b/third_party/jpeg-xl/lib/jxl/fields.cc new file mode 100644 index 0000000000..cd1e72bd94 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fields.cc @@ -0,0 +1,642 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/fields.h" + +#include + +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/printf_macros.h" + +namespace jxl { + +namespace { + +using ::jxl::fields_internal::VisitorBase; + +struct InitVisitor : public VisitorBase { + Status Bits(const size_t /*unused*/, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status U32(const U32Enc /*unused*/, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status U64(const uint64_t default_value, + uint64_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status Bool(bool default_value, bool* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status F16(const float default_value, float* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + // Always visit conditional fields to ensure they are initialized. + Status Conditional(bool /*condition*/) override { return true; } + + Status AllDefault(const Fields& /*fields*/, + bool* JXL_RESTRICT all_default) override { + // Just initialize this field and don't skip initializing others. + JXL_RETURN_IF_ERROR(Bool(true, all_default)); + return false; + } + + Status VisitNested(Fields* /*fields*/) override { + // Avoid re-initializing nested bundles (their ctors already called + // Bundle::Init for their fields). + return true; + } +}; + +// Similar to InitVisitor, but also initializes nested fields. +struct SetDefaultVisitor : public VisitorBase { + Status Bits(const size_t /*unused*/, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status U32(const U32Enc /*unused*/, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status U64(const uint64_t default_value, + uint64_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status Bool(bool default_value, bool* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status F16(const float default_value, float* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + // Always visit conditional fields to ensure they are initialized. + Status Conditional(bool /*condition*/) override { return true; } + + Status AllDefault(const Fields& /*fields*/, + bool* JXL_RESTRICT all_default) override { + // Just initialize this field and don't skip initializing others. + JXL_RETURN_IF_ERROR(Bool(true, all_default)); + return false; + } +}; + +class AllDefaultVisitor : public VisitorBase { + public: + explicit AllDefaultVisitor() : VisitorBase() {} + + Status Bits(const size_t bits, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + all_default_ &= *value == default_value; + return true; + } + + Status U32(const U32Enc /*unused*/, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + all_default_ &= *value == default_value; + return true; + } + + Status U64(const uint64_t default_value, + uint64_t* JXL_RESTRICT value) override { + all_default_ &= *value == default_value; + return true; + } + + Status F16(const float default_value, float* JXL_RESTRICT value) override { + all_default_ &= std::abs(*value - default_value) < 1E-6f; + return true; + } + + Status AllDefault(const Fields& /*fields*/, + bool* JXL_RESTRICT /*all_default*/) override { + // Visit all fields so we can compute the actual all_default_ value. + return false; + } + + bool AllDefault() const { return all_default_; } + + private: + bool all_default_ = true; +}; + +class ReadVisitor : public VisitorBase { + public: + explicit ReadVisitor(BitReader* reader) : VisitorBase(), reader_(reader) {} + + Status Bits(const size_t bits, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + *value = BitsCoder::Read(bits, reader_); + if (!reader_->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for header"); + } + return true; + } + + Status U32(const U32Enc dist, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + *value = U32Coder::Read(dist, reader_); + if (!reader_->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for header"); + } + return true; + } + + Status U64(const uint64_t /*default_value*/, + uint64_t* JXL_RESTRICT value) override { + *value = U64Coder::Read(reader_); + if (!reader_->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for header"); + } + return true; + } + + Status F16(const float /*default_value*/, + float* JXL_RESTRICT value) override { + ok_ &= F16Coder::Read(reader_, value); + if (!reader_->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for header"); + } + return true; + } + + void SetDefault(Fields* fields) override { Bundle::SetDefault(fields); } + + bool IsReading() const override { return true; } + + // This never fails because visitors are expected to keep reading until + // EndExtensions, see comment there. + Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override { + JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions)); + if (*extensions == 0) return true; + + // For each nonzero bit, i.e. extension that is present: + for (uint64_t remaining_extensions = *extensions; remaining_extensions != 0; + remaining_extensions &= remaining_extensions - 1) { + const size_t idx_extension = + Num0BitsBelowLS1Bit_Nonzero(remaining_extensions); + // Read additional U64 (one per extension) indicating the number of bits + // (allows skipping individual extensions). + JXL_RETURN_IF_ERROR(U64(0, &extension_bits_[idx_extension])); + if (!SafeAdd(total_extension_bits_, extension_bits_[idx_extension], + total_extension_bits_)) { + return JXL_FAILURE("Extension bits overflowed, invalid codestream"); + } + } + // Used by EndExtensions to skip past any _remaining_ extensions. + pos_after_ext_size_ = reader_->TotalBitsConsumed(); + JXL_ASSERT(pos_after_ext_size_ != 0); + return true; + } + + Status EndExtensions() override { + JXL_QUIET_RETURN_IF_ERROR(VisitorBase::EndExtensions()); + // Happens if extensions == 0: don't read size, done. + if (pos_after_ext_size_ == 0) return true; + + // Not enough bytes as set by BeginExtensions or earlier. Do not return + // this as a JXL_FAILURE or false (which can also propagate to error + // through e.g. JXL_RETURN_IF_ERROR), since this may be used while + // silently checking whether there are enough bytes. If this case must be + // treated as an error, reader_>Close() will do this, just like is already + // done for non-extension fields. + if (!enough_bytes_) return true; + + // Skip new fields this (old?) decoder didn't know about, if any. + const size_t bits_read = reader_->TotalBitsConsumed(); + uint64_t end; + if (!SafeAdd(pos_after_ext_size_, total_extension_bits_, end)) { + return JXL_FAILURE("Invalid extension size, caused overflow"); + } + if (bits_read > end) { + return JXL_FAILURE("Read more extension bits than budgeted"); + } + const size_t remaining_bits = end - bits_read; + if (remaining_bits != 0) { + JXL_WARNING("Skipping %" PRIuS "-bit extension(s)", remaining_bits); + reader_->SkipBits(remaining_bits); + if (!reader_->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for header"); + } + } + return true; + } + + Status OK() const { return ok_; } + + private: + // Whether any error other than not enough bytes occurred. + bool ok_ = true; + + // Whether there are enough input bytes to read from. + bool enough_bytes_ = true; + BitReader* const reader_; + // May be 0 even if the corresponding extension is present. + uint64_t extension_bits_[Bundle::kMaxExtensions] = {0}; + uint64_t total_extension_bits_ = 0; + size_t pos_after_ext_size_ = 0; // 0 iff extensions == 0. +}; + +class MaxBitsVisitor : public VisitorBase { + public: + Status Bits(const size_t bits, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT /*value*/) override { + max_bits_ += BitsCoder::MaxEncodedBits(bits); + return true; + } + + Status U32(const U32Enc enc, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT /*value*/) override { + max_bits_ += U32Coder::MaxEncodedBits(enc); + return true; + } + + Status U64(const uint64_t /*default_value*/, + uint64_t* JXL_RESTRICT /*value*/) override { + max_bits_ += U64Coder::MaxEncodedBits(); + return true; + } + + Status F16(const float /*default_value*/, + float* JXL_RESTRICT /*value*/) override { + max_bits_ += F16Coder::MaxEncodedBits(); + return true; + } + + Status AllDefault(const Fields& /*fields*/, + bool* JXL_RESTRICT all_default) override { + JXL_RETURN_IF_ERROR(Bool(true, all_default)); + return false; // For max bits, assume nothing is default + } + + // Always visit conditional fields to get a (loose) upper bound. + Status Conditional(bool /*condition*/) override { return true; } + + Status BeginExtensions(uint64_t* JXL_RESTRICT /*extensions*/) override { + // Skip - extensions are not included in "MaxBits" because their length + // is potentially unbounded. + return true; + } + + Status EndExtensions() override { return true; } + + size_t MaxBits() const { return max_bits_; } + + private: + size_t max_bits_ = 0; +}; + +class CanEncodeVisitor : public VisitorBase { + public: + explicit CanEncodeVisitor() : VisitorBase() {} + + Status Bits(const size_t bits, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + size_t encoded_bits = 0; + ok_ &= BitsCoder::CanEncode(bits, *value, &encoded_bits); + encoded_bits_ += encoded_bits; + return true; + } + + Status U32(const U32Enc enc, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + size_t encoded_bits = 0; + ok_ &= U32Coder::CanEncode(enc, *value, &encoded_bits); + encoded_bits_ += encoded_bits; + return true; + } + + Status U64(const uint64_t /*default_value*/, + uint64_t* JXL_RESTRICT value) override { + size_t encoded_bits = 0; + ok_ &= U64Coder::CanEncode(*value, &encoded_bits); + encoded_bits_ += encoded_bits; + return true; + } + + Status F16(const float /*default_value*/, + float* JXL_RESTRICT value) override { + size_t encoded_bits = 0; + ok_ &= F16Coder::CanEncode(*value, &encoded_bits); + encoded_bits_ += encoded_bits; + return true; + } + + Status AllDefault(const Fields& fields, + bool* JXL_RESTRICT all_default) override { + *all_default = Bundle::AllDefault(fields); + JXL_RETURN_IF_ERROR(Bool(true, all_default)); + return *all_default; + } + + Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override { + JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions)); + extensions_ = *extensions; + if (*extensions != 0) { + JXL_ASSERT(pos_after_ext_ == 0); + pos_after_ext_ = encoded_bits_; + JXL_ASSERT(pos_after_ext_ != 0); // visited "extensions" + } + return true; + } + // EndExtensions = default. + + Status GetSizes(size_t* JXL_RESTRICT extension_bits, + size_t* JXL_RESTRICT total_bits) { + JXL_RETURN_IF_ERROR(ok_); + *extension_bits = 0; + *total_bits = encoded_bits_; + // Only if extension field was nonzero will we encode their sizes. + if (pos_after_ext_ != 0) { + JXL_ASSERT(encoded_bits_ >= pos_after_ext_); + *extension_bits = encoded_bits_ - pos_after_ext_; + // Also need to encode *extension_bits and bill it to *total_bits. + size_t encoded_bits = 0; + ok_ &= U64Coder::CanEncode(*extension_bits, &encoded_bits); + *total_bits += encoded_bits; + + // TODO(janwas): support encoding individual extension sizes. We + // currently ascribe all bits to the first and send zeros for the + // others. + for (size_t i = 1; i < hwy::PopCount(extensions_); ++i) { + encoded_bits = 0; + ok_ &= U64Coder::CanEncode(0, &encoded_bits); + *total_bits += encoded_bits; + } + } + return true; + } + + private: + bool ok_ = true; + size_t encoded_bits_ = 0; + uint64_t extensions_ = 0; + // Snapshot of encoded_bits_ after visiting the extension field, but NOT + // including the hidden extension sizes. + uint64_t pos_after_ext_ = 0; +}; +} // namespace + +void Bundle::Init(Fields* fields) { + InitVisitor visitor; + if (!visitor.Visit(fields)) { + JXL_ABORT("Init should never fail"); + } +} +void Bundle::SetDefault(Fields* fields) { + SetDefaultVisitor visitor; + if (!visitor.Visit(fields)) { + JXL_ABORT("SetDefault should never fail"); + } +} +bool Bundle::AllDefault(const Fields& fields) { + AllDefaultVisitor visitor; + if (!visitor.VisitConst(fields)) { + JXL_ABORT("AllDefault should never fail"); + } + return visitor.AllDefault(); +} +size_t Bundle::MaxBits(const Fields& fields) { + MaxBitsVisitor visitor; +#if JXL_ENABLE_ASSERT + Status ret = +#else + (void) +#endif // JXL_ENABLE_ASSERT + visitor.VisitConst(fields); + JXL_ASSERT(ret); + return visitor.MaxBits(); +} +Status Bundle::CanEncode(const Fields& fields, size_t* extension_bits, + size_t* total_bits) { + CanEncodeVisitor visitor; + JXL_QUIET_RETURN_IF_ERROR(visitor.VisitConst(fields)); + JXL_QUIET_RETURN_IF_ERROR(visitor.GetSizes(extension_bits, total_bits)); + return true; +} +Status Bundle::Read(BitReader* reader, Fields* fields) { + ReadVisitor visitor(reader); + JXL_RETURN_IF_ERROR(visitor.Visit(fields)); + return visitor.OK(); +} +bool Bundle::CanRead(BitReader* reader, Fields* fields) { + ReadVisitor visitor(reader); + Status status = visitor.Visit(fields); + // We are only checking here whether there are enough bytes. We still return + // true for other errors because it means there are enough bytes to determine + // there's an error. Use Read() to determine which error it is. + return status.code() != StatusCode::kNotEnoughBytes; +} + +size_t BitsCoder::MaxEncodedBits(const size_t bits) { return bits; } + +Status BitsCoder::CanEncode(const size_t bits, const uint32_t value, + size_t* JXL_RESTRICT encoded_bits) { + *encoded_bits = bits; + if (value >= (1ULL << bits)) { + return JXL_FAILURE("Value %u too large for %" PRIu64 " bits", value, + static_cast(bits)); + } + return true; +} + +uint32_t BitsCoder::Read(const size_t bits, BitReader* JXL_RESTRICT reader) { + return reader->ReadBits(bits); +} + +size_t U32Coder::MaxEncodedBits(const U32Enc enc) { + size_t extra_bits = 0; + for (uint32_t selector = 0; selector < 4; ++selector) { + const U32Distr d = enc.GetDistr(selector); + if (d.IsDirect()) { + continue; + } else { + extra_bits = std::max(extra_bits, d.ExtraBits()); + } + } + return 2 + extra_bits; +} + +Status U32Coder::CanEncode(const U32Enc enc, const uint32_t value, + size_t* JXL_RESTRICT encoded_bits) { + uint32_t selector; + size_t total_bits; + const Status ok = ChooseSelector(enc, value, &selector, &total_bits); + *encoded_bits = ok ? total_bits : 0; + return ok; +} + +uint32_t U32Coder::Read(const U32Enc enc, BitReader* JXL_RESTRICT reader) { + const uint32_t selector = reader->ReadFixedBits<2>(); + const U32Distr d = enc.GetDistr(selector); + if (d.IsDirect()) { + return d.Direct(); + } else { + return reader->ReadBits(d.ExtraBits()) + d.Offset(); + } +} + +Status U32Coder::ChooseSelector(const U32Enc enc, const uint32_t value, + uint32_t* JXL_RESTRICT selector, + size_t* JXL_RESTRICT total_bits) { +#if JXL_ENABLE_ASSERT + const size_t bits_required = 32 - Num0BitsAboveMS1Bit(value); +#endif // JXL_ENABLE_ASSERT + JXL_ASSERT(bits_required <= 32); + + *selector = 0; + *total_bits = 0; + + // It is difficult to verify whether Dist32Byte are sorted, so check all + // selectors and keep the one with the fewest total_bits. + *total_bits = 64; // more than any valid encoding + for (uint32_t s = 0; s < 4; ++s) { + const U32Distr d = enc.GetDistr(s); + if (d.IsDirect()) { + if (d.Direct() == value) { + *selector = s; + *total_bits = 2; + return true; // Done, direct is always the best possible. + } + continue; + } + const size_t extra_bits = d.ExtraBits(); + const uint32_t offset = d.Offset(); + if (value < offset || value >= offset + (1ULL << extra_bits)) continue; + + // Better than prior encoding, remember it: + if (2 + extra_bits < *total_bits) { + *selector = s; + *total_bits = 2 + extra_bits; + } + } + + if (*total_bits == 64) { + return JXL_FAILURE("No feasible selector for %u", value); + } + + return true; +} + +uint64_t U64Coder::Read(BitReader* JXL_RESTRICT reader) { + uint64_t selector = reader->ReadFixedBits<2>(); + if (selector == 0) { + return 0; + } + if (selector == 1) { + return 1 + reader->ReadFixedBits<4>(); + } + if (selector == 2) { + return 17 + reader->ReadFixedBits<8>(); + } + + // selector 3, varint, groups have first 12, then 8, and last 4 bits. + uint64_t result = reader->ReadFixedBits<12>(); + + uint64_t shift = 12; + while (reader->ReadFixedBits<1>()) { + if (shift == 60) { + result |= static_cast(reader->ReadFixedBits<4>()) << shift; + break; + } + result |= static_cast(reader->ReadFixedBits<8>()) << shift; + shift += 8; + } + + return result; +} + +// Can always encode, but useful because it also returns bit size. +Status U64Coder::CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits) { + if (value == 0) { + *encoded_bits = 2; // 2 selector bits + } else if (value <= 16) { + *encoded_bits = 2 + 4; // 2 selector bits + 4 payload bits + } else if (value <= 272) { + *encoded_bits = 2 + 8; // 2 selector bits + 8 payload bits + } else { + *encoded_bits = 2 + 12; // 2 selector bits + 12 payload bits + value >>= 12; + int shift = 12; + while (value > 0 && shift < 60) { + *encoded_bits += 1 + 8; // 1 continuation bit + 8 payload bits + value >>= 8; + shift += 8; + } + if (value > 0) { + // This only could happen if shift == N - 4. + *encoded_bits += 1 + 4; // 1 continuation bit + 4 payload bits + } else { + *encoded_bits += 1; // 1 stop bit + } + } + + return true; +} + +Status F16Coder::Read(BitReader* JXL_RESTRICT reader, + float* JXL_RESTRICT value) { + const uint32_t bits16 = reader->ReadFixedBits<16>(); + const uint32_t sign = bits16 >> 15; + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + if (JXL_UNLIKELY(biased_exp == 31)) { + return JXL_FAILURE("F16 infinity or NaN are not supported"); + } + + // Subnormal or zero + if (JXL_UNLIKELY(biased_exp == 0)) { + *value = (1.0f / 16384) * (mantissa * (1.0f / 1024)); + if (sign) *value = -*value; + return true; + } + + // Normalized: convert the representation directly (faster than ldexp/tables). + const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t mantissa32 = mantissa << (23 - 10); + const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; + memcpy(value, &bits32, sizeof(bits32)); + return true; +} + +Status F16Coder::CanEncode(float value, size_t* JXL_RESTRICT encoded_bits) { + *encoded_bits = MaxEncodedBits(); + if (std::isnan(value) || std::isinf(value)) { + return JXL_FAILURE("Should not attempt to store NaN and infinity"); + } + return std::abs(value) <= 65504.0f; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/fields.h b/third_party/jpeg-xl/lib/jxl/fields.h new file mode 100644 index 0000000000..10d0b7aa30 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fields.h @@ -0,0 +1,377 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_FIELDS_H_ +#define LIB_JXL_FIELDS_H_ + +// Forward/backward-compatible 'bundles' with auto-serialized 'fields'. + +#include +#include +#include +#include +#include +#include + +#include +#include // abs +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +struct AuxOut; +struct BitWriter; + +// Integer coders: BitsCoder (raw), U32Coder (table), U64Coder (varint). + +// Reads/writes a given (fixed) number of bits <= 32. +namespace BitsCoder { +size_t MaxEncodedBits(size_t bits); + +Status CanEncode(size_t bits, uint32_t value, + size_t* JXL_RESTRICT encoded_bits); + +uint32_t Read(size_t bits, BitReader* JXL_RESTRICT reader); + +// Returns false if the value is too large to encode. +Status Write(size_t bits, uint32_t value, BitWriter* JXL_RESTRICT writer); +} // namespace BitsCoder + +// Encodes u32 using a lookup table and/or extra bits, governed by a per-field +// encoding `enc` which consists of four distributions `d` chosen via a 2-bit +// selector (least significant = 0). Each d may have two modes: +// - direct: if d.IsDirect(), the value is d.Direct(); +// - offset: the value is derived from d.ExtraBits() extra bits plus d.Offset(); +// This encoding is denser than Exp-Golomb or Gamma codes when both small and +// large values occur. +// +// Examples: +// Direct: U32Enc(Val(8), Val(16), Val(32), Bits(6)), value 32 => 10b. +// Offset: U32Enc(Val(0), BitsOffset(1, 1), BitsOffset(2, 3), BitsOffset(8, 8)) +// defines the following prefix code: +// 00 -> 0 +// 01x -> 1..2 +// 10xx -> 3..7 +// 11xxxxxxxx -> 8..263 +namespace U32Coder { +size_t MaxEncodedBits(U32Enc enc); +Status CanEncode(U32Enc enc, uint32_t value, size_t* JXL_RESTRICT encoded_bits); +uint32_t Read(U32Enc enc, BitReader* JXL_RESTRICT reader); + +// Returns false if the value is too large to encode. +Status Write(U32Enc enc, uint32_t value, BitWriter* JXL_RESTRICT writer); + +// "private" +Status ChooseSelector(U32Enc enc, uint32_t value, + uint32_t* JXL_RESTRICT selector, + size_t* JXL_RESTRICT total_bits); +} // namespace U32Coder + +// Encodes 64-bit unsigned integers with a fixed distribution, taking 2 bits +// to encode 0, 6 bits to encode 1 to 16, 10 bits to encode 17 to 272, 15 bits +// to encode up to 4095, and on the order of log2(value) * 1.125 bits for +// larger values. +namespace U64Coder { +constexpr size_t MaxEncodedBits() { return 2 + 12 + 6 * (8 + 1) + (4 + 1); } + +uint64_t Read(BitReader* JXL_RESTRICT reader); + +// Returns false if the value is too large to encode. +Status Write(uint64_t value, BitWriter* JXL_RESTRICT writer); + +// Can always encode, but useful because it also returns bit size. +Status CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits); +} // namespace U64Coder + +// IEEE 754 half-precision (binary16). Refuses to read/write NaN/Inf. +namespace F16Coder { +constexpr size_t MaxEncodedBits() { return 16; } + +// Returns false if the bit representation is NaN or infinity +Status Read(BitReader* JXL_RESTRICT reader, float* JXL_RESTRICT value); + +// Returns false if the value is too large to encode. +Status Write(float value, BitWriter* JXL_RESTRICT writer); +Status CanEncode(float value, size_t* JXL_RESTRICT encoded_bits); +} // namespace F16Coder + +// A "bundle" is a forward- and backward compatible collection of fields. +// They are used for SizeHeader/FrameHeader/GroupHeader. Bundles can be +// extended by appending(!) fields. Optional fields may be omitted from the +// bitstream by conditionally visiting them. When reading new bitstreams with +// old code, we skip unknown fields at the end of the bundle. This requires +// storing the amount of extra appended bits, and that fields are visited in +// chronological order of being added to the format, because old decoders +// cannot skip some future fields and resume reading old fields. Similarly, +// new readers query bits in an "extensions" field to skip (groups of) fields +// not present in old bitstreams. Note that each bundle must include an +// "extensions" field prior to freezing the format, otherwise it cannot be +// extended. +// +// To ensure interoperability, there will be no opaque fields. +// +// HOWTO: +// - basic usage: define a struct with member variables ("fields") and a +// VisitFields(v) member function that calls v->U32/Bool etc. for each +// field, specifying their default values. The ctor must call +// Bundle::Init(this). +// +// - print a trace of visitors: ensure each bundle has a static Name() member +// function, and change Bundle::Print* to return true. +// +// - optional fields: in VisitFields, add if (v->Conditional(your_condition)) +// { v->Bool(default, &field); }. This prevents reading/writing field +// if !your_condition, which is typically computed from a prior field. +// WARNING: to ensure all fields are initialized, do not add an else branch; +// instead add another if (v->Conditional(!your_condition)). +// +// - repeated fields: for dynamic sizes, use e.g. std::vector and in +// VisitFields, if (v->IsReading()) field.resize(size) before accessing field. +// For static or bounded sizes, use an array or std::array. In all cases, +// simply visit each array element as if it were a normal field. +// +// - nested bundles: add a bundle as a normal field and in VisitFields call +// JXL_RETURN_IF_ERROR(v->VisitNested(&nested)); +// +// - allow future extensions: define a "uint64_t extensions" field and call +// v->BeginExtensions(&extensions) after visiting all non-extension fields, +// and `return v->EndExtensions();` after the last extension field. +// +// - encode an entire bundle in one bit if ALL its fields equal their default +// values: add a "mutable bool all_default" field and as the first visitor: +// if (v->AllDefault(*this, &all_default)) { +// // Overwrite all serialized fields, but not any nonserialized_*. +// v->SetDefault(this); +// return true; +// } +// Note: if extensions are present, AllDefault() == false. + +namespace Bundle { +constexpr size_t kMaxExtensions = 64; // bits in u64 + +// Initializes fields to the default values. It is not recursive to nested +// fields, this function is intended to be called in the constructors so +// each nested field will already Init itself. +void Init(Fields* JXL_RESTRICT fields); + +// Similar to Init, but recursive to nested fields. +void SetDefault(Fields* JXL_RESTRICT fields); + +// Returns whether ALL fields (including `extensions`, if present) are equal +// to their default value. +bool AllDefault(const Fields& fields); + +// Returns max number of bits required to encode a T. +size_t MaxBits(const Fields& fields); + +// Returns whether a header's fields can all be encoded, i.e. they have a +// valid representation. If so, "*total_bits" is the exact number of bits +// required. Called by Write. +Status CanEncode(const Fields& fields, size_t* JXL_RESTRICT extension_bits, + size_t* JXL_RESTRICT total_bits); + +Status Read(BitReader* reader, Fields* JXL_RESTRICT fields); + +// Returns whether enough bits are available to fully read this bundle using +// Read. Also returns true in case of a codestream error (other than not being +// large enough): that means enough bits are available to determine there's an +// error, use Read to get such error status. +// NOTE: this advances the BitReader, a different one pointing back at the +// original bit position in the codestream must be created to use Read after +// this. +bool CanRead(BitReader* reader, Fields* JXL_RESTRICT fields); + +Status Write(const Fields& fields, BitWriter* JXL_RESTRICT writer, size_t layer, + AuxOut* aux_out); +} // namespace Bundle + +// Different subclasses of Visitor are passed to implementations of Fields +// throughout their lifetime. Templates used to be used for this but dynamic +// polymorphism produces more compact executables than template reification did. +class Visitor { + public: + virtual ~Visitor() = default; + virtual Status Visit(Fields* fields) = 0; + + virtual Status Bool(bool default_value, bool* JXL_RESTRICT value) = 0; + virtual Status U32(U32Enc, uint32_t, uint32_t*) = 0; + + // Helper to construct U32Enc from U32Distr. + Status U32(const U32Distr d0, const U32Distr d1, const U32Distr d2, + const U32Distr d3, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) { + return U32(U32Enc(d0, d1, d2, d3), default_value, value); + } + + template + Status Enum(const EnumT default_value, EnumT* JXL_RESTRICT value) { + uint32_t u32 = static_cast(*value); + // 00 -> 0 + // 01 -> 1 + // 10xxxx -> 2..17 + // 11yyyyyy -> 18..81 + JXL_RETURN_IF_ERROR(U32(Val(0), Val(1), BitsOffset(4, 2), BitsOffset(6, 18), + static_cast(default_value), &u32)); + *value = static_cast(u32); + return EnumValid(*value); + } + + virtual Status Bits(size_t bits, uint32_t default_value, + uint32_t* JXL_RESTRICT value) = 0; + virtual Status U64(uint64_t default_value, uint64_t* JXL_RESTRICT value) = 0; + virtual Status F16(float default_value, float* JXL_RESTRICT value) = 0; + + // Returns whether VisitFields should visit some subsequent fields. + // "condition" is typically from prior fields, e.g. flags. + // Overridden by InitVisitor and MaxBitsVisitor. + virtual Status Conditional(bool condition) { return condition; } + + // Overridden by InitVisitor, AllDefaultVisitor and CanEncodeVisitor. + virtual Status AllDefault(const Fields& /*fields*/, + bool* JXL_RESTRICT all_default) { + JXL_RETURN_IF_ERROR(Bool(true, all_default)); + return *all_default; + } + + virtual void SetDefault(Fields* /*fields*/) { + // Do nothing by default, this is overridden by ReadVisitor. + } + + // Returns the result of visiting a nested Bundle. + // Overridden by InitVisitor. + virtual Status VisitNested(Fields* fields) { return Visit(fields); } + + // Overridden by ReadVisitor. Enables dynamically-sized fields. + virtual bool IsReading() const { return false; } + + virtual Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) = 0; + virtual Status EndExtensions() = 0; +}; + +namespace fields_internal { +// A bundle can be in one of three states concerning extensions: not-begun, +// active, ended. Bundles may be nested, so we need a stack of states. +class ExtensionStates { + public: + void Push() { + // Initial state = not-begun. + begun_ <<= 1; + ended_ <<= 1; + } + + // Clears current state; caller must check IsEnded beforehand. + void Pop() { + begun_ >>= 1; + ended_ >>= 1; + } + + // Returns true if state == active || state == ended. + Status IsBegun() const { return (begun_ & 1) != 0; } + // Returns true if state != not-begun && state != active. + Status IsEnded() const { return (ended_ & 1) != 0; } + + void Begin() { + JXL_ASSERT(!IsBegun()); + JXL_ASSERT(!IsEnded()); + begun_ += 1; + } + + void End() { + JXL_ASSERT(IsBegun()); + JXL_ASSERT(!IsEnded()); + ended_ += 1; + } + + private: + // Current state := least-significant bit of begun_ and ended_. + uint64_t begun_ = 0; + uint64_t ended_ = 0; +}; + +// Visitors generate Init/AllDefault/Read/Write logic for all fields. Each +// bundle's VisitFields member function calls visitor->U32 etc. We do not +// overload operator() because a function name is easier to search for. + +class VisitorBase : public Visitor { + public: + explicit VisitorBase() {} + ~VisitorBase() override { JXL_ASSERT(depth_ == 0); } + + // This is the only call site of Fields::VisitFields. + // Ensures EndExtensions was called. + Status Visit(Fields* fields) override { + depth_ += 1; + JXL_ASSERT(depth_ <= Bundle::kMaxExtensions); + extension_states_.Push(); + + const Status ok = fields->VisitFields(this); + + if (ok) { + // If VisitFields called BeginExtensions, must also call + // EndExtensions. + JXL_ASSERT(!extension_states_.IsBegun() || extension_states_.IsEnded()); + } else { + // Failed, undefined state: don't care whether EndExtensions was + // called. + } + + extension_states_.Pop(); + JXL_ASSERT(depth_ != 0); + depth_ -= 1; + + return ok; + } + + // For visitors accepting a const Visitor, need to const-cast so we can call + // the non-const Visitor::VisitFields. NOTE: C is not modified except the + // `all_default` field by CanEncodeVisitor. + Status VisitConst(const Fields& t) { return Visit(const_cast(&t)); } + + // Derived types (overridden by InitVisitor because it is unsafe to read + // from *value there) + + Status Bool(bool default_value, bool* JXL_RESTRICT value) override { + uint32_t bits = *value ? 1 : 0; + JXL_RETURN_IF_ERROR(Bits(1, static_cast(default_value), &bits)); + JXL_DASSERT(bits <= 1); + *value = bits == 1; + return true; + } + + // Overridden by ReadVisitor and WriteVisitor. + // Called before any conditional visit based on "extensions". + // Overridden by ReadVisitor, CanEncodeVisitor and WriteVisitor. + Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override { + JXL_RETURN_IF_ERROR(U64(0, extensions)); + + extension_states_.Begin(); + return true; + } + + // Called after all extension fields (if any). Although non-extension + // fields could be visited afterward, we prefer the convention that + // extension fields are always the last to be visited. Overridden by + // ReadVisitor. + Status EndExtensions() override { + extension_states_.End(); + return true; + } + + private: + size_t depth_ = 0; // to check nesting + ExtensionStates extension_states_; +}; +} // namespace fields_internal + +} // namespace jxl + +#endif // LIB_JXL_FIELDS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/fields_test.cc b/third_party/jpeg-xl/lib/jxl/fields_test.cc new file mode 100644 index 0000000000..cf54c780ea --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fields_test.cc @@ -0,0 +1,429 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/fields.h" + +#include +#include + +#include +#include + +#include "lib/jxl/base/span.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +// Ensures `value` round-trips and in exactly `expected_bits_written`. +void TestU32Coder(const uint32_t value, const size_t expected_bits_written) { + const U32Enc enc(Val(0), Bits(4), Val(0x7FFFFFFF), Bits(32)); + + BitWriter writer; + BitWriter::Allotment allotment( + &writer, RoundUpBitsToByteMultiple(U32Coder::MaxEncodedBits(enc))); + + size_t precheck_pos; + EXPECT_TRUE(U32Coder::CanEncode(enc, value, &precheck_pos)); + EXPECT_EQ(expected_bits_written, precheck_pos); + + EXPECT_TRUE(U32Coder::Write(enc, value, &writer)); + EXPECT_EQ(expected_bits_written, writer.BitsWritten()); + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, 0, nullptr); + + BitReader reader(writer.GetSpan()); + const uint32_t decoded_value = U32Coder::Read(enc, &reader); + EXPECT_EQ(value, decoded_value); + EXPECT_TRUE(reader.Close()); +} + +TEST(FieldsTest, U32CoderTest) { + TestU32Coder(0, 2); + TestU32Coder(1, 6); + TestU32Coder(15, 6); + TestU32Coder(0x7FFFFFFF, 2); + TestU32Coder(128, 34); + TestU32Coder(0x7FFFFFFEu, 34); + TestU32Coder(0x80000000u, 34); + TestU32Coder(0xFFFFFFFFu, 34); +} + +void TestU64Coder(const uint64_t value, const size_t expected_bits_written) { + BitWriter writer; + BitWriter::Allotment allotment( + &writer, RoundUpBitsToByteMultiple(U64Coder::MaxEncodedBits())); + + size_t precheck_pos; + EXPECT_TRUE(U64Coder::CanEncode(value, &precheck_pos)); + EXPECT_EQ(expected_bits_written, precheck_pos); + + EXPECT_TRUE(U64Coder::Write(value, &writer)); + EXPECT_EQ(expected_bits_written, writer.BitsWritten()); + + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, 0, nullptr); + + BitReader reader(writer.GetSpan()); + const uint64_t decoded_value = U64Coder::Read(&reader); + EXPECT_EQ(value, decoded_value); + EXPECT_TRUE(reader.Close()); +} + +TEST(FieldsTest, U64CoderTest) { + // Values that should take 2 bits (selector 00): 0 + TestU64Coder(0, 2); + + // Values that should take 6 bits (2 for selector, 4 for value): 1..16 + TestU64Coder(1, 6); + TestU64Coder(2, 6); + TestU64Coder(8, 6); + TestU64Coder(15, 6); + TestU64Coder(16, 6); + + // Values that should take 10 bits (2 for selector, 8 for value): 17..272 + TestU64Coder(17, 10); + TestU64Coder(18, 10); + TestU64Coder(100, 10); + TestU64Coder(271, 10); + TestU64Coder(272, 10); + + // Values that should take 15 bits (2 for selector, 12 for value, 1 for varint + // end): (0)..273..4095 + TestU64Coder(273, 15); + TestU64Coder(274, 15); + TestU64Coder(1000, 15); + TestU64Coder(4094, 15); + TestU64Coder(4095, 15); + + // Take 24 bits (of which 20 actual value): (0)..4096..1048575 + TestU64Coder(4096, 24); + TestU64Coder(4097, 24); + TestU64Coder(10000, 24); + TestU64Coder(1048574, 24); + TestU64Coder(1048575, 24); + + // Take 33 bits (of which 28 actual value): (0)..1048576..268435455 + TestU64Coder(1048576, 33); + TestU64Coder(1048577, 33); + TestU64Coder(10000000, 33); + TestU64Coder(268435454, 33); + TestU64Coder(268435455, 33); + + // Take 42 bits (of which 36 actual value): (0)..268435456..68719476735 + TestU64Coder(268435456ull, 42); + TestU64Coder(268435457ull, 42); + TestU64Coder(1000000000ull, 42); + TestU64Coder(68719476734ull, 42); + TestU64Coder(68719476735ull, 42); + + // Take 51 bits (of which 44 actual value): (0)..68719476736..17592186044415 + TestU64Coder(68719476736ull, 51); + TestU64Coder(68719476737ull, 51); + TestU64Coder(1000000000000ull, 51); + TestU64Coder(17592186044414ull, 51); + TestU64Coder(17592186044415ull, 51); + + // Take 60 bits (of which 52 actual value): + // (0)..17592186044416..4503599627370495 + TestU64Coder(17592186044416ull, 60); + TestU64Coder(17592186044417ull, 60); + TestU64Coder(100000000000000ull, 60); + TestU64Coder(4503599627370494ull, 60); + TestU64Coder(4503599627370495ull, 60); + + // Take 69 bits (of which 60 actual value): + // (0)..4503599627370496..1152921504606846975 + TestU64Coder(4503599627370496ull, 69); + TestU64Coder(4503599627370497ull, 69); + TestU64Coder(10000000000000000ull, 69); + TestU64Coder(1152921504606846974ull, 69); + TestU64Coder(1152921504606846975ull, 69); + + // Take 73 bits (of which 64 actual value): + // (0)..1152921504606846976..18446744073709551615 + TestU64Coder(1152921504606846976ull, 73); + TestU64Coder(1152921504606846977ull, 73); + TestU64Coder(10000000000000000000ull, 73); + TestU64Coder(18446744073709551614ull, 73); + TestU64Coder(18446744073709551615ull, 73); +} + +Status TestF16Coder(const float value) { + size_t max_encoded_bits; + // It is not a fatal error if it can't be encoded. + if (!F16Coder::CanEncode(value, &max_encoded_bits)) return false; + EXPECT_EQ(F16Coder::MaxEncodedBits(), max_encoded_bits); + + BitWriter writer; + BitWriter::Allotment allotment(&writer, + RoundUpBitsToByteMultiple(max_encoded_bits)); + + EXPECT_TRUE(F16Coder::Write(value, &writer)); + EXPECT_EQ(F16Coder::MaxEncodedBits(), writer.BitsWritten()); + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, 0, nullptr); + + BitReader reader(writer.GetSpan()); + float decoded_value; + EXPECT_TRUE(F16Coder::Read(&reader, &decoded_value)); + // All values we test can be represented exactly. + EXPECT_EQ(value, decoded_value); + EXPECT_TRUE(reader.Close()); + return true; +} + +TEST(FieldsTest, F16CoderTest) { + for (float sign : {-1.0f, 1.0f}) { + // (anything less than 1E-3 are subnormals) + for (float mag : {0.0f, 0.5f, 1.0f, 2.0f, 2.5f, 16.015625f, 1.0f / 4096, + 1.0f / 16384, 65504.0f}) { + EXPECT_TRUE(TestF16Coder(sign * mag)); + } + } + + // Out of range + EXPECT_FALSE(TestF16Coder(65504.01f)); + EXPECT_FALSE(TestF16Coder(-65505.0f)); +} + +// Ensures Read(Write()) returns the same fields. +TEST(FieldsTest, TestRoundtripSize) { + for (int i = 0; i < 8; i++) { + SizeHeader size; + ASSERT_TRUE(size.Set(123 + 77 * i, 7 + i)); + + size_t extension_bits = 999, total_bits = 999; // Initialize as garbage. + ASSERT_TRUE(Bundle::CanEncode(size, &extension_bits, &total_bits)); + EXPECT_EQ(0u, extension_bits); + + BitWriter writer; + ASSERT_TRUE(WriteSizeHeader(size, &writer, 0, nullptr)); + EXPECT_EQ(total_bits, writer.BitsWritten()); + writer.ZeroPadToByte(); + + SizeHeader size2; + BitReader reader(writer.GetSpan()); + ASSERT_TRUE(ReadSizeHeader(&reader, &size2)); + EXPECT_EQ(total_bits, reader.TotalBitsConsumed()); + EXPECT_TRUE(reader.Close()); + + EXPECT_EQ(size.xsize(), size2.xsize()); + EXPECT_EQ(size.ysize(), size2.ysize()); + } +} + +// Ensure all values can be reached by the encoding. +TEST(FieldsTest, TestCropRect) { + CodecMetadata metadata; + for (int32_t i = -999; i < 19000; ++i) { + FrameHeader f(&metadata); + f.custom_size_or_origin = true; + f.frame_origin.x0 = i; + f.frame_origin.y0 = i; + f.frame_size.xsize = 1000 + i; + f.frame_size.ysize = 1000 + i; + size_t extension_bits = 0, total_bits = 0; + ASSERT_TRUE(Bundle::CanEncode(f, &extension_bits, &total_bits)); + EXPECT_EQ(0u, extension_bits); + EXPECT_GE(total_bits, 9u); + } +} +TEST(FieldsTest, TestPreview) { + // (div8 cannot represent 4360, but !div8 can go a little higher) + for (uint32_t i = 1; i < 4360; ++i) { + PreviewHeader p; + ASSERT_TRUE(p.Set(i, i)); + size_t extension_bits = 0, total_bits = 0; + ASSERT_TRUE(Bundle::CanEncode(p, &extension_bits, &total_bits)); + EXPECT_EQ(0u, extension_bits); + EXPECT_GE(total_bits, 6u); + } +} + +// Ensures Read(Write()) returns the same fields. +TEST(FieldsTest, TestRoundtripFrame) { + CodecMetadata metadata; + FrameHeader h(&metadata); + h.extensions = 0x800; + + size_t extension_bits = 999, total_bits = 999; // Initialize as garbage. + ASSERT_TRUE(Bundle::CanEncode(h, &extension_bits, &total_bits)); + EXPECT_EQ(0u, extension_bits); + BitWriter writer; + ASSERT_TRUE(WriteFrameHeader(h, &writer, nullptr)); + EXPECT_EQ(total_bits, writer.BitsWritten()); + writer.ZeroPadToByte(); + + FrameHeader h2(&metadata); + BitReader reader(writer.GetSpan()); + ASSERT_TRUE(ReadFrameHeader(&reader, &h2)); + EXPECT_EQ(total_bits, reader.TotalBitsConsumed()); + EXPECT_TRUE(reader.Close()); + + EXPECT_EQ(h.extensions, h2.extensions); + EXPECT_EQ(h.flags, h2.flags); +} + +#ifndef JXL_CRASH_ON_ERROR +// Ensure out-of-bounds values cause an error. +TEST(FieldsTest, TestOutOfRange) { + SizeHeader h; + ASSERT_TRUE(h.Set(0xFFFFFFFFull, 0xFFFFFFFFull)); + size_t extension_bits = 999, total_bits = 999; // Initialize as garbage. + ASSERT_FALSE(Bundle::CanEncode(h, &extension_bits, &total_bits)); +} +#endif + +struct OldBundle : public Fields { + OldBundle() { Bundle::Init(this); } + JXL_FIELDS_NAME(OldBundle) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Bits(2), Bits(3), Bits(4), 1, &old_small)); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.125f, &old_f)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Bits(7), Bits(12), Bits(16), Bits(32), 0, &old_large)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions)); + return visitor->EndExtensions(); + } + + uint32_t old_small; + float old_f; + uint32_t old_large; + uint64_t extensions; +}; + +struct NewBundle : public Fields { + NewBundle() { Bundle::Init(this); } + JXL_FIELDS_NAME(NewBundle) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Bits(2), Bits(3), Bits(4), 1, &old_small)); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.125f, &old_f)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Bits(7), Bits(12), Bits(16), Bits(32), 0, &old_large)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions)); + if (visitor->Conditional(extensions & 1)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(2), Bits(2), Bits(3), Bits(4), 2, &new_small)); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(-2.0f, &new_f)); + } + if (visitor->Conditional(extensions & 2)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Bits(9), Bits(12), Bits(16), Bits(32), 0, &new_large)); + } + return visitor->EndExtensions(); + } + + uint32_t old_small; + float old_f; + uint32_t old_large; + uint64_t extensions; + + // If extensions & 1 + uint32_t new_small = 2; + float new_f = -2.0f; + // If extensions & 2 + uint32_t new_large = 0; +}; + +TEST(FieldsTest, TestNewDecoderOldData) { + OldBundle old_bundle; + old_bundle.old_large = 123; + old_bundle.old_f = 3.75f; + old_bundle.extensions = 0; + + // Write to bit stream + const size_t kMaxOutBytes = 999; + BitWriter writer; + // Make sure values are initialized by code under test. + size_t extension_bits = 12345, total_bits = 12345; + ASSERT_TRUE(Bundle::CanEncode(old_bundle, &extension_bits, &total_bits)); + ASSERT_LE(total_bits, kMaxOutBytes * kBitsPerByte); + EXPECT_EQ(0u, extension_bits); + AuxOut aux_out; + ASSERT_TRUE(Bundle::Write(old_bundle, &writer, kLayerHeader, &aux_out)); + + BitWriter::Allotment allotment(&writer, + kMaxOutBytes * kBitsPerByte - total_bits); + writer.Write(20, 0xA55A); // sentinel + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, kLayerHeader, nullptr); + + ASSERT_LE(writer.GetSpan().size(), kMaxOutBytes); + BitReader reader(writer.GetSpan()); + NewBundle new_bundle; + ASSERT_TRUE(Bundle::Read(&reader, &new_bundle)); + EXPECT_EQ(reader.TotalBitsConsumed(), + aux_out.layers[kLayerHeader].total_bits); + EXPECT_EQ(reader.ReadBits(20), 0xA55Au); + EXPECT_TRUE(reader.Close()); + + // Old fields are the same in both + EXPECT_EQ(old_bundle.extensions, new_bundle.extensions); + EXPECT_EQ(old_bundle.old_small, new_bundle.old_small); + EXPECT_EQ(old_bundle.old_f, new_bundle.old_f); + EXPECT_EQ(old_bundle.old_large, new_bundle.old_large); + // New fields match their defaults + EXPECT_EQ(2u, new_bundle.new_small); + EXPECT_EQ(-2.0f, new_bundle.new_f); + EXPECT_EQ(0u, new_bundle.new_large); +} + +TEST(FieldsTest, TestOldDecoderNewData) { + NewBundle new_bundle; + new_bundle.old_large = 123; + new_bundle.extensions = 3; + new_bundle.new_f = 999.0f; + new_bundle.new_large = 456; + + // Write to bit stream + constexpr size_t kMaxOutBytes = 999; + BitWriter writer; + // Make sure values are initialized by code under test. + size_t extension_bits = 12345, total_bits = 12345; + ASSERT_TRUE(Bundle::CanEncode(new_bundle, &extension_bits, &total_bits)); + EXPECT_NE(0u, extension_bits); + AuxOut aux_out; + ASSERT_TRUE(Bundle::Write(new_bundle, &writer, kLayerHeader, &aux_out)); + ASSERT_LE(aux_out.layers[kLayerHeader].total_bits, + kMaxOutBytes * kBitsPerByte); + + BitWriter::Allotment allotment( + &writer, + kMaxOutBytes * kBitsPerByte - aux_out.layers[kLayerHeader].total_bits); + // Ensure Read skips the additional fields + writer.Write(20, 0xA55A); // sentinel + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, kLayerHeader, nullptr); + + BitReader reader(writer.GetSpan()); + OldBundle old_bundle; + ASSERT_TRUE(Bundle::Read(&reader, &old_bundle)); + EXPECT_EQ(reader.TotalBitsConsumed(), + aux_out.layers[kLayerHeader].total_bits); + EXPECT_EQ(reader.ReadBits(20), 0xA55Au); + EXPECT_TRUE(reader.Close()); + + // Old fields are the same in both + EXPECT_EQ(new_bundle.extensions, old_bundle.extensions); + EXPECT_EQ(new_bundle.old_small, old_bundle.old_small); + EXPECT_EQ(new_bundle.old_f, old_bundle.old_f); + EXPECT_EQ(new_bundle.old_large, old_bundle.old_large); + // (Can't check new fields because old decoder doesn't know about them) +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/frame_header.cc b/third_party/jpeg-xl/lib/jxl/frame_header.cc new file mode 100644 index 0000000000..475ce8e05e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/frame_header.cc @@ -0,0 +1,494 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/frame_header.h" + +#include + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/fields.h" + +namespace jxl { + +constexpr uint8_t YCbCrChromaSubsampling::kHShift[] = {0, 1, 1, 0}; +constexpr uint8_t YCbCrChromaSubsampling::kVShift[] = {0, 1, 0, 1}; + +static Status VisitBlendMode(Visitor* JXL_RESTRICT visitor, + BlendMode default_value, BlendMode* blend_mode) { + uint32_t encoded = static_cast(*blend_mode); + + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val(static_cast(BlendMode::kReplace)), + Val(static_cast(BlendMode::kAdd)), + Val(static_cast(BlendMode::kBlend)), BitsOffset(2, 3), + static_cast(default_value), &encoded)); + if (encoded > 4) { + return JXL_FAILURE("Invalid blend_mode"); + } + *blend_mode = static_cast(encoded); + return true; +} + +static Status VisitFrameType(Visitor* JXL_RESTRICT visitor, + FrameType default_value, FrameType* frame_type) { + uint32_t encoded = static_cast(*frame_type); + + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(static_cast(FrameType::kRegularFrame)), + Val(static_cast(FrameType::kDCFrame)), + Val(static_cast(FrameType::kReferenceOnly)), + Val(static_cast(FrameType::kSkipProgressive)), + static_cast(default_value), &encoded)); + *frame_type = static_cast(encoded); + return true; +} + +BlendingInfo::BlendingInfo() { Bundle::Init(this); } + +Status BlendingInfo::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR( + VisitBlendMode(visitor, BlendMode::kReplace, &mode)); + if (visitor->Conditional(nonserialized_num_extra_channels > 0 && + (mode == BlendMode::kBlend || + mode == BlendMode::kAlphaWeightedAdd))) { + // Up to 11 alpha channels for blending. + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val(0), Val(1), Val(2), BitsOffset(3, 3), 0, &alpha_channel)); + if (visitor->IsReading() && + alpha_channel >= nonserialized_num_extra_channels) { + return JXL_FAILURE("Invalid alpha channel for blending"); + } + } + if (visitor->Conditional((nonserialized_num_extra_channels > 0 && + (mode == BlendMode::kBlend || + mode == BlendMode::kAlphaWeightedAdd)) || + mode == BlendMode::kMul)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &clamp)); + } + // 'old' frame for blending. Only necessary if this is not a full frame, or + // blending is not kReplace. + if (visitor->Conditional(mode != BlendMode::kReplace || + nonserialized_is_partial_frame)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(1), Val(2), Val(3), 0, &source)); + } + return true; +} + +std::string BlendingInfo::DebugString() const { + std::ostringstream os; + os << (mode == BlendMode::kReplace ? "Replace" + : mode == BlendMode::kAdd ? "Add" + : mode == BlendMode::kBlend ? "Blend" + : mode == BlendMode::kAlphaWeightedAdd ? "AlphaWeightedAdd" + : "Mul"); + if (nonserialized_num_extra_channels > 0 && + (mode == BlendMode::kBlend || mode == BlendMode::kAlphaWeightedAdd)) { + os << ",alpha=" << alpha_channel << ",clamp=" << clamp; + } else if (mode == BlendMode::kMul) { + os << ",clamp=" << clamp; + } + if (mode != BlendMode::kReplace || nonserialized_is_partial_frame) { + os << ",source=" << source; + } + return os.str(); +} + +AnimationFrame::AnimationFrame(const CodecMetadata* metadata) + : nonserialized_metadata(metadata) { + Bundle::Init(this); +} +Status AnimationFrame::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->Conditional(nonserialized_metadata != nullptr && + nonserialized_metadata->m.have_animation)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(1), Bits(8), Bits(32), 0, &duration)); + } + + if (visitor->Conditional( + nonserialized_metadata != nullptr && + nonserialized_metadata->m.animation.have_timecodes)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(32, 0, &timecode)); + } + return true; +} + +YCbCrChromaSubsampling::YCbCrChromaSubsampling() { Bundle::Init(this); } +Passes::Passes() { Bundle::Init(this); } +Status Passes::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), BitsOffset(3, 4), 1, &num_passes)); + JXL_ASSERT(num_passes <= kMaxNumPasses); // Cannot happen when reading + + if (visitor->Conditional(num_passes != 1)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val(0), Val(1), Val(2), BitsOffset(1, 3), 0, &num_downsample)); + JXL_ASSERT(num_downsample <= 4); // 1,2,4,8 + if (num_downsample > num_passes) { + return JXL_FAILURE("num_downsample %u > num_passes %u", num_downsample, + num_passes); + } + + for (uint32_t i = 0; i < num_passes - 1; i++) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 0, &shift[i])); + } + shift[num_passes - 1] = 0; + + for (uint32_t i = 0; i < num_downsample; ++i) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &downsample[i])); + if (i > 0 && downsample[i] >= downsample[i - 1]) { + return JXL_FAILURE("downsample sequence should be decreasing"); + } + } + for (uint32_t i = 0; i < num_downsample; ++i) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(1), Val(2), Bits(3), 0, &last_pass[i])); + if (i > 0 && last_pass[i] <= last_pass[i - 1]) { + return JXL_FAILURE("last_pass sequence should be increasing"); + } + if (last_pass[i] >= num_passes) { + return JXL_FAILURE("last_pass %u >= num_passes %u", last_pass[i], + num_passes); + } + } + } + + return true; +} + +std::string Passes::DebugString() const { + std::ostringstream os; + os << "p=" << num_passes; + if (num_downsample) { + os << ",ds="; + for (uint32_t i = 0; i < num_downsample; ++i) { + os << last_pass[i] << ":" << downsample[i]; + if (i + 1 < num_downsample) os << ";"; + } + } + bool have_shifts = false; + for (uint32_t i = 0; i < num_passes; ++i) { + if (shift[i]) have_shifts = true; + } + if (have_shifts) { + os << ",shifts="; + for (uint32_t i = 0; i < num_passes; ++i) { + os << shift[i]; + if (i + 1 < num_passes) os << ";"; + } + } + return os.str(); +} + +FrameHeader::FrameHeader(const CodecMetadata* metadata) + : animation_frame(metadata), nonserialized_metadata(metadata) { + Bundle::Init(this); +} + +Status ReadFrameHeader(BitReader* JXL_RESTRICT reader, + FrameHeader* JXL_RESTRICT frame) { + return Bundle::Read(reader, frame); +} + +Status FrameHeader::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + + JXL_QUIET_RETURN_IF_ERROR( + VisitFrameType(visitor, FrameType::kRegularFrame, &frame_type)); + if (visitor->IsReading() && nonserialized_is_preview && + frame_type != kRegularFrame) { + return JXL_FAILURE("Only regular frame could be a preview"); + } + + // FrameEncoding. + bool is_modular = (encoding == FrameEncoding::kModular); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &is_modular)); + encoding = (is_modular ? FrameEncoding::kModular : FrameEncoding::kVarDCT); + + // Flags + JXL_QUIET_RETURN_IF_ERROR(visitor->U64(0, &flags)); + + // Color transform + bool xyb_encoded = nonserialized_metadata == nullptr || + nonserialized_metadata->m.xyb_encoded; + + if (xyb_encoded) { + color_transform = ColorTransform::kXYB; + } else { + // Alternate if kYCbCr. + bool alternate = color_transform == ColorTransform::kYCbCr; + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &alternate)); + color_transform = + (alternate ? ColorTransform::kYCbCr : ColorTransform::kNone); + } + + // Chroma subsampling for YCbCr, if no DC frame is used. + if (visitor->Conditional(color_transform == ColorTransform::kYCbCr && + ((flags & kUseDcFrame) == 0))) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&chroma_subsampling)); + } + + size_t num_extra_channels = + nonserialized_metadata != nullptr + ? nonserialized_metadata->m.extra_channel_info.size() + : 0; + + // Upsampling + if (visitor->Conditional((flags & kUseDcFrame) == 0)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &upsampling)); + if (nonserialized_metadata != nullptr && + visitor->Conditional(num_extra_channels != 0)) { + const std::vector& extra_channels = + nonserialized_metadata->m.extra_channel_info; + extra_channel_upsampling.resize(extra_channels.size(), 1); + for (size_t i = 0; i < extra_channels.size(); ++i) { + uint32_t dim_shift = + nonserialized_metadata->m.extra_channel_info[i].dim_shift; + uint32_t& ec_upsampling = extra_channel_upsampling[i]; + ec_upsampling >>= dim_shift; + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &ec_upsampling)); + ec_upsampling <<= dim_shift; + if (ec_upsampling < upsampling) { + return JXL_FAILURE( + "EC upsampling (%u) < color upsampling (%u), which is invalid.", + ec_upsampling, upsampling); + } + if (ec_upsampling > 8) { + return JXL_FAILURE("EC upsampling too large (%u)", ec_upsampling); + } + } + } else { + extra_channel_upsampling.clear(); + } + } + + // Modular- or VarDCT-specific data. + if (visitor->Conditional(encoding == FrameEncoding::kModular)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 1, &group_size_shift)); + } + if (visitor->Conditional(encoding == FrameEncoding::kVarDCT && + color_transform == ColorTransform::kXYB)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 3, &x_qm_scale)); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 2, &b_qm_scale)); + } else { + x_qm_scale = b_qm_scale = 2; // noop + } + + // Not useful for kPatchSource + if (visitor->Conditional(frame_type != FrameType::kReferenceOnly)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&passes)); + } + + if (visitor->Conditional(frame_type == FrameType::kDCFrame)) { + // Up to 4 pyramid levels - for up to 16384x downsampling. + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), Val(4), 1, &dc_level)); + } + if (frame_type != FrameType::kDCFrame) { + dc_level = 0; + } + + bool is_partial_frame = false; + if (visitor->Conditional(frame_type != FrameType::kDCFrame)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &custom_size_or_origin)); + if (visitor->Conditional(custom_size_or_origin)) { + const U32Enc enc(Bits(8), BitsOffset(11, 256), BitsOffset(14, 2304), + BitsOffset(30, 18688)); + // Frame offset, only if kRegularFrame or kSkipProgressive. + if (visitor->Conditional(frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive)) { + uint32_t ux0 = PackSigned(frame_origin.x0); + uint32_t uy0 = PackSigned(frame_origin.y0); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &ux0)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &uy0)); + frame_origin.x0 = UnpackSigned(ux0); + frame_origin.y0 = UnpackSigned(uy0); + } + // Frame size + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &frame_size.xsize)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &frame_size.ysize)); + if (custom_size_or_origin && + (frame_size.xsize == 0 || frame_size.ysize == 0)) { + return JXL_FAILURE( + "Invalid crop dimensions for frame: zero width or height"); + } + int32_t image_xsize = default_xsize(); + int32_t image_ysize = default_ysize(); + if (frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive) { + is_partial_frame |= frame_origin.x0 > 0; + is_partial_frame |= frame_origin.y0 > 0; + is_partial_frame |= (static_cast(frame_size.xsize) + + frame_origin.x0) < image_xsize; + is_partial_frame |= (static_cast(frame_size.ysize) + + frame_origin.y0) < image_ysize; + } + } + } + + // Blending info, animation info and whether this is the last frame or not. + if (visitor->Conditional(frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive)) { + blending_info.nonserialized_num_extra_channels = num_extra_channels; + blending_info.nonserialized_is_partial_frame = is_partial_frame; + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&blending_info)); + bool replace_all = (blending_info.mode == BlendMode::kReplace); + extra_channel_blending_info.resize(num_extra_channels); + for (size_t i = 0; i < num_extra_channels; i++) { + auto& ec_blending_info = extra_channel_blending_info[i]; + ec_blending_info.nonserialized_is_partial_frame = is_partial_frame; + ec_blending_info.nonserialized_num_extra_channels = num_extra_channels; + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&ec_blending_info)); + replace_all &= (ec_blending_info.mode == BlendMode::kReplace); + } + if (visitor->IsReading() && nonserialized_is_preview) { + if (!replace_all || custom_size_or_origin) { + return JXL_FAILURE("Preview is not compatible with blending"); + } + } + if (visitor->Conditional(nonserialized_metadata != nullptr && + nonserialized_metadata->m.have_animation)) { + animation_frame.nonserialized_metadata = nonserialized_metadata; + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&animation_frame)); + } + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &is_last)); + } + if (frame_type != FrameType::kRegularFrame) { + is_last = false; + } + + // ID of that can be used to refer to this frame. 0 for a non-zero-duration + // frame means that it will not be referenced. Not necessary for the last + // frame. + if (visitor->Conditional(frame_type != kDCFrame && !is_last)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(1), Val(2), Val(3), 0, &save_as_reference)); + } + + // If this frame is not blended on another frame post-color-transform, it may + // be stored for being referenced either before or after the color transform. + // If it is blended post-color-transform, it must be blended after. It must + // also be blended after if this is a kRegular frame that does not cover the + // full frame, as samples outside the partial region are from a + // post-color-transform frame. + if (frame_type != FrameType::kDCFrame) { + if (visitor->Conditional(CanBeReferenced() && + blending_info.mode == BlendMode::kReplace && + !is_partial_frame && + (frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive))) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->Bool(false, &save_before_color_transform)); + } else if (visitor->Conditional(frame_type == FrameType::kReferenceOnly)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->Bool(true, &save_before_color_transform)); + if (!save_before_color_transform && + (frame_size.xsize < nonserialized_metadata->xsize() || + frame_size.ysize < nonserialized_metadata->ysize() || + frame_origin.x0 != 0 || frame_origin.y0 != 0)) { + return JXL_FAILURE( + "non-patch reference frame with invalid crop: %" PRIuS "x%" PRIuS + "%+d%+d", + static_cast(frame_size.xsize), + static_cast(frame_size.ysize), + static_cast(frame_origin.x0), + static_cast(frame_origin.y0)); + } + } + } else { + save_before_color_transform = true; + } + + JXL_QUIET_RETURN_IF_ERROR(VisitNameString(visitor, &name)); + + loop_filter.nonserialized_is_modular = is_modular; + JXL_RETURN_IF_ERROR(visitor->VisitNested(&loop_filter)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions)); + // Extensions: in chronological order of being added to the format. + return visitor->EndExtensions(); +} + +std::string FrameHeader::DebugString() const { + std::ostringstream os; + os << (encoding == FrameEncoding::kVarDCT ? "VarDCT" : "Modular"); + os << ","; + os << (frame_type == FrameType::kRegularFrame ? "Regular" + : frame_type == FrameType::kDCFrame ? "DC" + : frame_type == FrameType::kReferenceOnly ? "Reference" + : "SkipProgressive"); + if (frame_type == FrameType::kDCFrame) { + os << "(lv" << dc_level << ")"; + } + + if (flags) { + os << ","; + uint32_t remaining = flags; + +#define TEST_FLAG(name) \ + if (flags & Flags::k##name) { \ + remaining &= ~Flags::k##name; \ + os << #name; \ + if (remaining) os << "|"; \ + } + TEST_FLAG(Noise); + TEST_FLAG(Patches); + TEST_FLAG(Splines); + TEST_FLAG(UseDcFrame); + TEST_FLAG(SkipAdaptiveDCSmoothing); +#undef TEST_FLAG + } + + os << ","; + os << (color_transform == ColorTransform::kXYB ? "XYB" + : color_transform == ColorTransform::kYCbCr ? "YCbCr" + : "None"); + + if (encoding == FrameEncoding::kModular) { + os << ",shift=" << group_size_shift; + } else if (color_transform == ColorTransform::kXYB) { + os << ",qm=" << x_qm_scale << ";" << b_qm_scale; + } + if (frame_type != FrameType::kReferenceOnly) { + os << "," << passes.DebugString(); + } + if (custom_size_or_origin) { + os << ",xs=" << frame_size.xsize; + os << ",ys=" << frame_size.ysize; + if (frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive) { + os << ",x0=" << frame_origin.x0; + os << ",y0=" << frame_origin.y0; + } + } + if (upsampling > 1) os << ",up=" << upsampling; + if (loop_filter.gab) os << ",Gaborish"; + if (loop_filter.epf_iters > 0) os << ",epf=" << loop_filter.epf_iters; + if (animation_frame.duration > 0) os << ",dur=" << animation_frame.duration; + if (frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive) { + os << ","; + os << blending_info.DebugString(); + for (size_t i = 0; i < extra_channel_blending_info.size(); ++i) { + os << (i == 0 ? "[" : ";"); + os << extra_channel_blending_info[i].DebugString(); + if (i + 1 == extra_channel_blending_info.size()) os << "]"; + } + } + if (save_as_reference > 0) os << ",ref=" << save_as_reference; + os << "," << (save_before_color_transform ? "before" : "after") << "_ct"; + if (is_last) os << ",last"; + return os.str(); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/frame_header.h b/third_party/jpeg-xl/lib/jxl/frame_header.h new file mode 100644 index 0000000000..5580bcd6fe --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/frame_header.h @@ -0,0 +1,503 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_FRAME_HEADER_H_ +#define LIB_JXL_FRAME_HEADER_H_ + +// Frame header with backward and forward-compatible extension capability and +// compressed integer fields. + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/loop_filter.h" + +namespace jxl { + +// TODO(eustas): move to proper place? +// Also used by extra channel names. +static inline Status VisitNameString(Visitor* JXL_RESTRICT visitor, + std::string* name) { + uint32_t name_length = static_cast(name->length()); + // Allows layer name lengths up to 1071 bytes + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Bits(4), BitsOffset(5, 16), + BitsOffset(10, 48), 0, &name_length)); + if (visitor->IsReading()) { + name->resize(name_length); + } + for (size_t i = 0; i < name_length; i++) { + uint32_t c = static_cast((*name)[i]); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(8, 0, &c)); + (*name)[i] = static_cast(c); + } + return true; +} + +enum class FrameEncoding : uint32_t { + kVarDCT, + kModular, +}; + +enum class ColorTransform : uint32_t { + kXYB, // Values are encoded with XYB. May only be used if + // ImageBundle::xyb_encoded. + kNone, // Values are encoded according to the attached color profile. May + // only be used if !ImageBundle::xyb_encoded. + kYCbCr, // Values are encoded according to the attached color profile, but + // transformed to YCbCr. May only be used if + // !ImageBundle::xyb_encoded. +}; + +inline std::array JpegOrder(ColorTransform ct, bool is_gray) { + if (is_gray) { + return {{0, 0, 0}}; + } + JXL_ASSERT(ct != ColorTransform::kXYB); + if (ct == ColorTransform::kYCbCr) { + return {{1, 0, 2}}; + } else { + return {{0, 1, 2}}; + } +} + +struct YCbCrChromaSubsampling : public Fields { + YCbCrChromaSubsampling(); + JXL_FIELDS_NAME(YCbCrChromaSubsampling) + size_t HShift(size_t c) const { return maxhs_ - kHShift[channel_mode_[c]]; } + size_t VShift(size_t c) const { return maxvs_ - kVShift[channel_mode_[c]]; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override { + // TODO(veluca): consider allowing 4x downsamples + for (size_t i = 0; i < 3; i++) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 0, &channel_mode_[i])); + } + Recompute(); + return true; + } + + uint8_t MaxHShift() const { return maxhs_; } + uint8_t MaxVShift() const { return maxvs_; } + + uint8_t RawHShift(size_t c) const { return kHShift[channel_mode_[c]]; } + uint8_t RawVShift(size_t c) const { return kVShift[channel_mode_[c]]; } + + // Uses JPEG channel order (Y, Cb, Cr). + Status Set(const uint8_t* hsample, const uint8_t* vsample) { + for (size_t c = 0; c < 3; c++) { + size_t cjpeg = c < 2 ? c ^ 1 : c; + size_t i = 0; + for (; i < 4; i++) { + if (1 << kHShift[i] == hsample[cjpeg] && + 1 << kVShift[i] == vsample[cjpeg]) { + channel_mode_[c] = i; + break; + } + } + if (i == 4) { + return JXL_FAILURE("Invalid subsample mode"); + } + } + Recompute(); + return true; + } + + bool Is444() const { + return HShift(0) == 0 && VShift(0) == 0 && // Cb + HShift(2) == 0 && VShift(2) == 0 && // Cr + HShift(1) == 0 && VShift(1) == 0; // Y + } + + bool Is420() const { + return HShift(0) == 1 && VShift(0) == 1 && // Cb + HShift(2) == 1 && VShift(2) == 1 && // Cr + HShift(1) == 0 && VShift(1) == 0; // Y + } + + bool Is422() const { + return HShift(0) == 1 && VShift(0) == 0 && // Cb + HShift(2) == 1 && VShift(2) == 0 && // Cr + HShift(1) == 0 && VShift(1) == 0; // Y + } + + bool Is440() const { + return HShift(0) == 0 && VShift(0) == 1 && // Cb + HShift(2) == 0 && VShift(2) == 1 && // Cr + HShift(1) == 0 && VShift(1) == 0; // Y + } + + std::string DebugString() const { + if (Is444()) return "444"; + if (Is420()) return "420"; + if (Is422()) return "422"; + if (Is440()) return "440"; + return "cs" + std::to_string(channel_mode_[0]) + + std::to_string(channel_mode_[1]) + std::to_string(channel_mode_[2]); + } + + private: + void Recompute() { + maxhs_ = 0; + maxvs_ = 0; + for (size_t i = 0; i < 3; i++) { + maxhs_ = std::max(maxhs_, kHShift[channel_mode_[i]]); + maxvs_ = std::max(maxvs_, kVShift[channel_mode_[i]]); + } + } + static const uint8_t kHShift[4]; + static const uint8_t kVShift[4]; + uint32_t channel_mode_[3]; + uint8_t maxhs_; + uint8_t maxvs_; +}; + +// Indicates how to combine the current frame with a previously-saved one. Can +// be independently controlled for color and extra channels. Formulas are +// indicative and treat alpha as if it is in range 0.0-1.0. In descriptions +// below, alpha channel is the extra channel of type alpha used for blending +// according to the blend_channel, or fully opaque if there is no alpha channel. +// The blending specified here is used for performing blending *after* color +// transforms - in linear sRGB if blending a XYB-encoded frame on another +// XYB-encoded frame, in sRGB if blending a frame with kColorSpace == kSRGB, or +// in the original colorspace otherwise. Blending in XYB or YCbCr is done by +// using patches. +enum class BlendMode { + // The new values (in the crop) replace the old ones: sample = new + kReplace = 0, + // The new values (in the crop) get added to the old ones: sample = old + new + kAdd = 1, + // The new values (in the crop) replace the old ones if alpha>0: + // For the alpha channel that is used as source: + // alpha = old + new * (1 - old) + // For other channels if !alpha_associated: + // sample = ((1 - new_alpha) * old * old_alpha + new_alpha * new) / alpha + // For other channels if alpha_associated: + // sample = (1 - new_alpha) * old + new + // The alpha formula applies to the alpha used for the division in the other + // channels formula, and applies to the alpha channel itself if its + // blend_channel value matches itself. + kBlend = 2, + // The new values (in the crop) are added to the old ones if alpha>0: + // For the alpha channel that is used as source: + // sample = sample = old + new * (1 - old) + // For other channels: sample = old + alpha * new + kAlphaWeightedAdd = 3, + // The new values (in the crop) get multiplied by the old ones: + // sample = old * new + // The range of the new value matters for multiplication purposes, and its + // nominal range of 0..1 is computed the same way as this is done for the + // alpha values in kBlend and kAlphaWeightedAdd. + // If using kMul as a blend mode for color channels, no color transform is + // performed on the current frame. + kMul = 4, +}; + +struct BlendingInfo : public Fields { + BlendingInfo(); + JXL_FIELDS_NAME(BlendingInfo) + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + BlendMode mode; + // Which extra channel to use as alpha channel for blending, only encoded + // for blend modes that involve alpha and if there are more than 1 extra + // channels. + uint32_t alpha_channel; + // Clamp alpha or channel values to 0-1 range. + bool clamp; + // Frame ID to copy from (0-3). Only encoded if blend_mode is not kReplace. + uint32_t source; + + std::string DebugString() const; + + size_t nonserialized_num_extra_channels = 0; + bool nonserialized_is_partial_frame = false; +}; + +// Origin of the current frame. Not present for frames of type +// kOnlyPatches. +struct FrameOrigin { + int32_t x0, y0; // can be negative. +}; + +// Size of the current frame. +struct FrameSize { + uint32_t xsize, ysize; +}; + +// AnimationFrame defines duration of animation frames. +struct AnimationFrame : public Fields { + explicit AnimationFrame(const CodecMetadata* metadata); + JXL_FIELDS_NAME(AnimationFrame) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // How long to wait [in ticks, see Animation{}] after rendering. + // May be 0 if the current frame serves as a foundation for another frame. + uint32_t duration; + + uint32_t timecode; // 0xHHMMSSFF + + // Must be set to the one ImageMetadata acting as the full codestream header, + // with correct xyb_encoded, list of extra channels, etc... + const CodecMetadata* nonserialized_metadata = nullptr; +}; + +// For decoding to lower resolutions. Only used for kRegular frames. +struct Passes : public Fields { + Passes(); + JXL_FIELDS_NAME(Passes) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + void GetDownsamplingBracket(size_t pass, int& minShift, int& maxShift) const { + maxShift = 2; + minShift = 3; + for (size_t i = 0;; i++) { + for (uint32_t j = 0; j < num_downsample; ++j) { + if (i == last_pass[j]) { + if (downsample[j] == 8) minShift = 3; + if (downsample[j] == 4) minShift = 2; + if (downsample[j] == 2) minShift = 1; + if (downsample[j] == 1) minShift = 0; + } + } + if (i == num_passes - 1) minShift = 0; + if (i == pass) return; + maxShift = minShift - 1; + } + } + + uint32_t GetDownsamplingTargetForCompletedPasses(uint32_t num_p) const { + if (num_p >= num_passes) return 1; + uint32_t retval = 8; + for (uint32_t i = 0; i < num_downsample; ++i) { + if (num_p > last_pass[i]) { + retval = std::min(retval, downsample[i]); + } + } + return retval; + } + + std::string DebugString() const; + + uint32_t num_passes; // <= kMaxNumPasses + uint32_t num_downsample; // <= num_passes + + // Array of num_downsample pairs. downsample=1/last_pass=num_passes-1 and + // downsample=8/last_pass=0 need not be specified; they are implicit. + uint32_t downsample[kMaxNumPasses]; + uint32_t last_pass[kMaxNumPasses]; + // Array of shift values for each pass. It is implicitly assumed to be 0 for + // the last pass. + uint32_t shift[kMaxNumPasses]; +}; + +enum FrameType { + // A "regular" frame: might be a crop, and will be blended on a previous + // frame, if any, and displayed or blended in future frames. + kRegularFrame = 0, + // A DC frame: this frame is downsampled and will be *only* used as the DC of + // a future frame and, possibly, for previews. Cannot be cropped, blended, or + // referenced by patches or blending modes. Frames that *use* a DC frame + // cannot have non-default sizes either. + kDCFrame = 1, + // A PatchesSource frame: this frame will be only used as a source frame for + // taking patches. Can be cropped, but cannot have non-(0, 0) x0 and y0. + kReferenceOnly = 2, + // Same as kRegularFrame, but not used for progressive rendering. This also + // implies no early display of DC. + kSkipProgressive = 3, +}; + +// Image/frame := one of more of these, where the last has is_last = true. +// Starts at a byte-aligned address "a"; the next pass starts at "a + size". +struct FrameHeader : public Fields { + // Optional postprocessing steps. These flags are the source of truth; + // Override must set/clear them rather than change their meaning. Values + // chosen such that typical flags == 0 (encoded in only two bits). + enum Flags { + // Often but not always off => low bit value: + + // Inject noise into decoded output. + kNoise = 1, + + // Overlay patches. + kPatches = 2, + + // 4, 8 = reserved for future sometimes-off + + // Overlay splines. + kSplines = 16, + + kUseDcFrame = 32, // Implies kSkipAdaptiveDCSmoothing. + + // 64 = reserved for future often-off + + // Almost always on => negated: + + kSkipAdaptiveDCSmoothing = 128, + }; + + explicit FrameHeader(const CodecMetadata* metadata); + JXL_FIELDS_NAME(FrameHeader) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Sets/clears `flag` based upon `condition`. + void UpdateFlag(const bool condition, const uint64_t flag) { + if (condition) { + flags |= flag; + } else { + flags &= ~flag; + } + } + + // Returns true if this frame is supposed to be saved for future usage by + // other frames. + bool CanBeReferenced() const { + // DC frames cannot be referenced. The last frame cannot be referenced. A + // duration 0 frame makes little sense if it is not referenced. A + // non-duration 0 frame may or may not be referenced. + return !is_last && frame_type != FrameType::kDCFrame && + (animation_frame.duration == 0 || save_as_reference != 0); + } + + mutable bool all_default; + + // Always present + FrameEncoding encoding; + // Some versions of UBSAN complain in VisitFrameType if not initialized. + FrameType frame_type = FrameType::kRegularFrame; + + uint64_t flags; + + ColorTransform color_transform; + YCbCrChromaSubsampling chroma_subsampling; + + uint32_t group_size_shift; // only if encoding == kModular; + + uint32_t x_qm_scale; // only if VarDCT and color_transform == kXYB + uint32_t b_qm_scale; // only if VarDCT and color_transform == kXYB + + std::string name; + + // Skipped for kReferenceOnly. + Passes passes; + + // Skipped for kDCFrame + bool custom_size_or_origin; + FrameSize frame_size; + + // upsampling factors for color and extra channels. + // Upsampling is always performed before applying any inverse color transform. + // Skipped (1) if kUseDCFrame + uint32_t upsampling; + std::vector extra_channel_upsampling; + + // Only for kRegular frames. + FrameOrigin frame_origin; + + BlendingInfo blending_info; + std::vector extra_channel_blending_info; + + // Animation info for this frame. + AnimationFrame animation_frame; + + // This is the last frame. + bool is_last; + + // ID to refer to this frame with. 0-3, not present if kDCFrame. + // 0 has a special meaning for kRegular frames of nonzero duration: it defines + // a frame that will not be referenced in the future. + uint32_t save_as_reference; + + // Whether to save this frame before or after the color transform. A frame + // that is saved before the color tansform can only be used for blending + // through patches. On the contrary, a frame that is saved after the color + // transform can only be used for blending through blending modes. + // Irrelevant for extra channel blending. Can only be true if + // blending_info.mode == kReplace and this is not a partial kRegularFrame; if + // this is a DC frame, it is always true. + bool save_before_color_transform; + + uint32_t dc_level; // 1-4 if kDCFrame (0 otherwise). + + // Must be set to the one ImageMetadata acting as the full codestream header, + // with correct xyb_encoded, list of extra channels, etc... + const CodecMetadata* nonserialized_metadata = nullptr; + + // NOTE: This is ignored by AllDefault. + LoopFilter loop_filter; + + bool nonserialized_is_preview = false; + + size_t default_xsize() const { + if (!nonserialized_metadata) return 0; + if (nonserialized_is_preview) { + return nonserialized_metadata->m.preview_size.xsize(); + } + return nonserialized_metadata->xsize(); + } + + size_t default_ysize() const { + if (!nonserialized_metadata) return 0; + if (nonserialized_is_preview) { + return nonserialized_metadata->m.preview_size.ysize(); + } + return nonserialized_metadata->ysize(); + } + + FrameDimensions ToFrameDimensions() const { + size_t xsize = default_xsize(); + size_t ysize = default_ysize(); + + xsize = frame_size.xsize ? frame_size.xsize : xsize; + ysize = frame_size.ysize ? frame_size.ysize : ysize; + + if (dc_level != 0) { + xsize = DivCeil(xsize, 1 << (3 * dc_level)); + ysize = DivCeil(ysize, 1 << (3 * dc_level)); + } + + FrameDimensions frame_dim; + frame_dim.Set(xsize, ysize, group_size_shift, + chroma_subsampling.MaxHShift(), + chroma_subsampling.MaxVShift(), + encoding == FrameEncoding::kModular, upsampling); + return frame_dim; + } + + // True if a color transform should be applied to this frame. + bool needs_color_transform() const { + return !save_before_color_transform || + frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive; + } + + std::string DebugString() const; + + uint64_t extensions; +}; + +Status ReadFrameHeader(BitReader* JXL_RESTRICT reader, + FrameHeader* JXL_RESTRICT frame); + +// Shared by enc/dec. 5F and 13 are by far the most common for d1/2/4/8, 0 +// ensures low overhead for small images. +static constexpr U32Enc kOrderEnc = + U32Enc(Val(0x5F), Val(0x13), Val(0), Bits(kNumOrders)); + +} // namespace jxl + +#endif // LIB_JXL_FRAME_HEADER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc b/third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc new file mode 100644 index 0000000000..131ec4fa83 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include + +#include "lib/jxl/enc_gamma_correct.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +TEST(GammaCorrectTest, TestLinearToSrgbEdgeCases) { + EXPECT_EQ(0, LinearToSrgb8Direct(0.0)); + EXPECT_NEAR(0, LinearToSrgb8Direct(1E-6f), 2E-5); + EXPECT_EQ(0, LinearToSrgb8Direct(-1E-6f)); + EXPECT_EQ(0, LinearToSrgb8Direct(-1E6)); + EXPECT_NEAR(1, LinearToSrgb8Direct(1 - 1E-6f), 1E-5); + EXPECT_EQ(1, LinearToSrgb8Direct(1 + 1E-6f)); + EXPECT_EQ(1, LinearToSrgb8Direct(1E6)); +} + +TEST(GammaCorrectTest, TestRoundTrip) { + // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter) + for (double linear = 0.0; linear <= 1.0; linear += 1E-7) { + const double srgb = LinearToSrgb8Direct(linear); + const double linear2 = Srgb8ToLinearDirect(srgb); + ASSERT_LT(std::abs(linear - linear2), 2E-13) + << "linear = " << linear << ", linear2 = " << linear2; + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur.cc b/third_party/jpeg-xl/lib/jxl/gauss_blur.cc new file mode 100644 index 0000000000..82384e4c64 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/gauss_blur.cc @@ -0,0 +1,623 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/gauss_blur.h" + +#include + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/gauss_blur.cc" +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/matrix_ops.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Broadcast; +using hwy::HWY_NAMESPACE::GetLane; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::NegMulSub; +#if HWY_TARGET != HWY_SCALAR +using hwy::HWY_NAMESPACE::ShiftLeftLanes; +#endif +using hwy::HWY_NAMESPACE::Vec; + +void FastGaussian1D(const hwy::AlignedUniquePtr& rg, + const float* JXL_RESTRICT in, intptr_t width, + float* JXL_RESTRICT out) { + // Although the current output depends on the previous output, we can unroll + // up to 4x by precomputing up to fourth powers of the constants. Beyond that, + // numerical precision might become a problem. Macro because this is tested + // in #if alongside HWY_TARGET. +#define JXL_GAUSS_MAX_LANES 4 + using D = HWY_CAPPED(float, JXL_GAUSS_MAX_LANES); + using V = Vec; + const D d; + const V mul_in_1 = Load(d, rg->mul_in + 0 * 4); + const V mul_in_3 = Load(d, rg->mul_in + 1 * 4); + const V mul_in_5 = Load(d, rg->mul_in + 2 * 4); + const V mul_prev_1 = Load(d, rg->mul_prev + 0 * 4); + const V mul_prev_3 = Load(d, rg->mul_prev + 1 * 4); + const V mul_prev_5 = Load(d, rg->mul_prev + 2 * 4); + const V mul_prev2_1 = Load(d, rg->mul_prev2 + 0 * 4); + const V mul_prev2_3 = Load(d, rg->mul_prev2 + 1 * 4); + const V mul_prev2_5 = Load(d, rg->mul_prev2 + 2 * 4); + V prev_1 = Zero(d); + V prev_3 = Zero(d); + V prev_5 = Zero(d); + V prev2_1 = Zero(d); + V prev2_3 = Zero(d); + V prev2_5 = Zero(d); + + const intptr_t N = rg->radius; + + intptr_t n = -N + 1; + // Left side with bounds checks and only write output after n >= 0. + const intptr_t first_aligned = RoundUpTo(N + 1, Lanes(d)); + for (; n < std::min(first_aligned, width); ++n) { + const intptr_t left = n - N - 1; + const intptr_t right = n + N - 1; + const float left_val = left >= 0 ? in[left] : 0.0f; + const float right_val = right < width ? in[right] : 0.0f; + const V sum = Set(d, left_val + right_val); + + // (Only processing a single lane here, no need to broadcast) + V out_1 = Mul(sum, mul_in_1); + V out_3 = Mul(sum, mul_in_3); + V out_5 = Mul(sum, mul_in_5); + + out_1 = MulAdd(mul_prev2_1, prev2_1, out_1); + out_3 = MulAdd(mul_prev2_3, prev2_3, out_3); + out_5 = MulAdd(mul_prev2_5, prev2_5, out_5); + prev2_1 = prev_1; + prev2_3 = prev_3; + prev2_5 = prev_5; + + out_1 = MulAdd(mul_prev_1, prev_1, out_1); + out_3 = MulAdd(mul_prev_3, prev_3, out_3); + out_5 = MulAdd(mul_prev_5, prev_5, out_5); + prev_1 = out_1; + prev_3 = out_3; + prev_5 = out_5; + + if (n >= 0) { + out[n] = GetLane(Add(out_1, Add(out_3, out_5))); + } + } + + // The above loop is effectively scalar but it is convenient to use the same + // prev/prev2 variables, so broadcast to each lane before the unrolled loop. +#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES > 1 + prev2_1 = Broadcast<0>(prev2_1); + prev2_3 = Broadcast<0>(prev2_3); + prev2_5 = Broadcast<0>(prev2_5); + prev_1 = Broadcast<0>(prev_1); + prev_3 = Broadcast<0>(prev_3); + prev_5 = Broadcast<0>(prev_5); +#endif + + // Unrolled, no bounds checking needed. + for (; n < width - N + 1 - (JXL_GAUSS_MAX_LANES - 1); n += Lanes(d)) { + const V sum = Add(LoadU(d, in + n - N - 1), LoadU(d, in + n + N - 1)); + + // To get a vector of output(s), we multiply broadcasted vectors (of each + // input plus the two previous outputs) and add them all together. + // Incremental broadcasting and shifting is expected to be cheaper than + // horizontal adds or transposing 4x4 values because they run on a different + // port, concurrently with the FMA. + const V in0 = Broadcast<0>(sum); + V out_1 = Mul(in0, mul_in_1); + V out_3 = Mul(in0, mul_in_3); + V out_5 = Mul(in0, mul_in_5); + +#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES >= 2 + const V in1 = Broadcast<1>(sum); + out_1 = MulAdd(ShiftLeftLanes<1>(mul_in_1), in1, out_1); + out_3 = MulAdd(ShiftLeftLanes<1>(mul_in_3), in1, out_3); + out_5 = MulAdd(ShiftLeftLanes<1>(mul_in_5), in1, out_5); + +#if JXL_GAUSS_MAX_LANES >= 4 + const V in2 = Broadcast<2>(sum); + out_1 = MulAdd(ShiftLeftLanes<2>(mul_in_1), in2, out_1); + out_3 = MulAdd(ShiftLeftLanes<2>(mul_in_3), in2, out_3); + out_5 = MulAdd(ShiftLeftLanes<2>(mul_in_5), in2, out_5); + + const V in3 = Broadcast<3>(sum); + out_1 = MulAdd(ShiftLeftLanes<3>(mul_in_1), in3, out_1); + out_3 = MulAdd(ShiftLeftLanes<3>(mul_in_3), in3, out_3); + out_5 = MulAdd(ShiftLeftLanes<3>(mul_in_5), in3, out_5); +#endif +#endif + + out_1 = MulAdd(mul_prev2_1, prev2_1, out_1); + out_3 = MulAdd(mul_prev2_3, prev2_3, out_3); + out_5 = MulAdd(mul_prev2_5, prev2_5, out_5); + + out_1 = MulAdd(mul_prev_1, prev_1, out_1); + out_3 = MulAdd(mul_prev_3, prev_3, out_3); + out_5 = MulAdd(mul_prev_5, prev_5, out_5); +#if HWY_TARGET == HWY_SCALAR || JXL_GAUSS_MAX_LANES == 1 + prev2_1 = prev_1; + prev2_3 = prev_3; + prev2_5 = prev_5; + prev_1 = out_1; + prev_3 = out_3; + prev_5 = out_5; +#else + prev2_1 = Broadcast(out_1); + prev2_3 = Broadcast(out_3); + prev2_5 = Broadcast(out_5); + prev_1 = Broadcast(out_1); + prev_3 = Broadcast(out_3); + prev_5 = Broadcast(out_5); +#endif + + Store(Add(out_1, Add(out_3, out_5)), d, out + n); + } + + // Remainder handling with bounds checks + for (; n < width; ++n) { + const intptr_t left = n - N - 1; + const intptr_t right = n + N - 1; + const float left_val = left >= 0 ? in[left] : 0.0f; + const float right_val = right < width ? in[right] : 0.0f; + const V sum = Set(d, left_val + right_val); + + // (Only processing a single lane here, no need to broadcast) + V out_1 = Mul(sum, mul_in_1); + V out_3 = Mul(sum, mul_in_3); + V out_5 = Mul(sum, mul_in_5); + + out_1 = MulAdd(mul_prev2_1, prev2_1, out_1); + out_3 = MulAdd(mul_prev2_3, prev2_3, out_3); + out_5 = MulAdd(mul_prev2_5, prev2_5, out_5); + prev2_1 = prev_1; + prev2_3 = prev_3; + prev2_5 = prev_5; + + out_1 = MulAdd(mul_prev_1, prev_1, out_1); + out_3 = MulAdd(mul_prev_3, prev_3, out_3); + out_5 = MulAdd(mul_prev_5, prev_5, out_5); + prev_1 = out_1; + prev_3 = out_3; + prev_5 = out_5; + + out[n] = GetLane(Add(out_1, Add(out_3, out_5))); + } +} + +// Ring buffer is for n, n-1, n-2; round up to 4 for faster modulo. +constexpr size_t kMod = 4; + +// Avoids an unnecessary store during warmup. +struct OutputNone { + template + void operator()(const V& /*unused*/, float* JXL_RESTRICT /*pos*/, + ptrdiff_t /*offset*/) const {} +}; + +// Common case: write output vectors in all VerticalBlock except warmup. +struct OutputStore { + template + void operator()(const V& out, float* JXL_RESTRICT pos, + ptrdiff_t offset) const { + // Stream helps for large images but is slower for images that fit in cache. + Store(out, HWY_FULL(float)(), pos + offset); + } +}; + +// At top/bottom borders, we don't have two inputs to load, so avoid addition. +// pos may even point to all zeros if the row is outside the input image. +class SingleInput { + public: + explicit SingleInput(const float* pos) : pos_(pos) {} + Vec operator()(const size_t offset) const { + return Load(HWY_FULL(float)(), pos_ + offset); + } + const float* pos_; +}; + +// In the middle of the image, we need to load from a row above and below, and +// return the sum. +class TwoInputs { + public: + TwoInputs(const float* pos1, const float* pos2) : pos1_(pos1), pos2_(pos2) {} + Vec operator()(const size_t offset) const { + const auto in1 = Load(HWY_FULL(float)(), pos1_ + offset); + const auto in2 = Load(HWY_FULL(float)(), pos2_ + offset); + return Add(in1, in2); + } + + private: + const float* pos1_; + const float* pos2_; +}; + +// Block := kVectors consecutive full vectors (one cache line except on the +// right boundary, where we can only rely on having one vector). Unrolling to +// the cache line size improves cache utilization. +template +void VerticalBlock(const V& d1_1, const V& d1_3, const V& d1_5, const V& n2_1, + const V& n2_3, const V& n2_5, const Input& input, + size_t& ctr, float* ring_buffer, const Output output, + float* JXL_RESTRICT out_pos) { + const HWY_FULL(float) d; + constexpr size_t kVN = MaxLanes(d); + // More cache-friendly to process an entirely cache line at a time + constexpr size_t kLanes = kVectors * kVN; + + float* JXL_RESTRICT y_1 = ring_buffer + 0 * kLanes * kMod; + float* JXL_RESTRICT y_3 = ring_buffer + 1 * kLanes * kMod; + float* JXL_RESTRICT y_5 = ring_buffer + 2 * kLanes * kMod; + + const size_t n_0 = (++ctr) % kMod; + const size_t n_1 = (ctr - 1) % kMod; + const size_t n_2 = (ctr - 2) % kMod; + + for (size_t idx_vec = 0; idx_vec < kVectors; ++idx_vec) { + const V sum = input(idx_vec * kVN); + + const V y_n1_1 = Load(d, y_1 + kLanes * n_1 + idx_vec * kVN); + const V y_n1_3 = Load(d, y_3 + kLanes * n_1 + idx_vec * kVN); + const V y_n1_5 = Load(d, y_5 + kLanes * n_1 + idx_vec * kVN); + const V y_n2_1 = Load(d, y_1 + kLanes * n_2 + idx_vec * kVN); + const V y_n2_3 = Load(d, y_3 + kLanes * n_2 + idx_vec * kVN); + const V y_n2_5 = Load(d, y_5 + kLanes * n_2 + idx_vec * kVN); + // (35) + const V y1 = MulAdd(n2_1, sum, NegMulSub(d1_1, y_n1_1, y_n2_1)); + const V y3 = MulAdd(n2_3, sum, NegMulSub(d1_3, y_n1_3, y_n2_3)); + const V y5 = MulAdd(n2_5, sum, NegMulSub(d1_5, y_n1_5, y_n2_5)); + Store(y1, d, y_1 + kLanes * n_0 + idx_vec * kVN); + Store(y3, d, y_3 + kLanes * n_0 + idx_vec * kVN); + Store(y5, d, y_5 + kLanes * n_0 + idx_vec * kVN); + output(Add(y1, Add(y3, y5)), out_pos, idx_vec * kVN); + } + // NOTE: flushing cache line out_pos hurts performance - less so with + // clflushopt than clflush but still a significant slowdown. +} + +// Reads/writes one block (kVectors full vectors) in each row. +template +void VerticalStrip(const hwy::AlignedUniquePtr& rg, + const ImageF& in, const size_t x, ImageF* JXL_RESTRICT out) { + // We're iterating vertically, so use multiple full-length vectors (each lane + // is one column of row n). + using D = HWY_FULL(float); + using V = Vec; + const D d; + constexpr size_t kVN = MaxLanes(d); + // More cache-friendly to process an entirely cache line at a time + constexpr size_t kLanes = kVectors * kVN; +#if HWY_TARGET == HWY_SCALAR + const V d1_1 = Set(d, rg->d1[0 * 4]); + const V d1_3 = Set(d, rg->d1[1 * 4]); + const V d1_5 = Set(d, rg->d1[2 * 4]); + const V n2_1 = Set(d, rg->n2[0 * 4]); + const V n2_3 = Set(d, rg->n2[1 * 4]); + const V n2_5 = Set(d, rg->n2[2 * 4]); +#else + const V d1_1 = LoadDup128(d, rg->d1 + 0 * 4); + const V d1_3 = LoadDup128(d, rg->d1 + 1 * 4); + const V d1_5 = LoadDup128(d, rg->d1 + 2 * 4); + const V n2_1 = LoadDup128(d, rg->n2 + 0 * 4); + const V n2_3 = LoadDup128(d, rg->n2 + 1 * 4); + const V n2_5 = LoadDup128(d, rg->n2 + 2 * 4); +#endif + + const size_t N = rg->radius; + const size_t ysize = in.ysize(); + + size_t ctr = 0; + HWY_ALIGN float ring_buffer[3 * kLanes * kMod] = {0}; + HWY_ALIGN static constexpr float zero[kLanes] = {0}; + + // Warmup: top is out of bounds (zero padded), bottom is usually in-bounds. + ssize_t n = -static_cast(N) + 1; + for (; n < 0; ++n) { + // bottom is always non-negative since n is initialized in -N + 1. + const size_t bottom = n + N - 1; + VerticalBlock( + d1_1, d1_3, d1_5, n2_1, n2_3, n2_5, + SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr, + ring_buffer, OutputNone(), nullptr); + } + JXL_DASSERT(n >= 0); + + // Start producing output; top is still out of bounds. + for (; static_cast(n) < std::min(N + 1, ysize); ++n) { + const size_t bottom = n + N - 1; + VerticalBlock( + d1_1, d1_3, d1_5, n2_1, n2_3, n2_5, + SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr, + ring_buffer, OutputStore(), out->Row(n) + x); + } + + // Interior outputs with prefetching and without bounds checks. + constexpr size_t kPrefetchRows = 8; + for (; n < static_cast(ysize - N + 1 - kPrefetchRows); ++n) { + const size_t top = n - N - 1; + const size_t bottom = n + N - 1; + VerticalBlock( + d1_1, d1_3, d1_5, n2_1, n2_3, n2_5, + TwoInputs(in.ConstRow(top) + x, in.ConstRow(bottom) + x), ctr, + ring_buffer, OutputStore(), out->Row(n) + x); + hwy::Prefetch(in.ConstRow(top + kPrefetchRows) + x); + hwy::Prefetch(in.ConstRow(bottom + kPrefetchRows) + x); + } + + // Bottom border without prefetching and with bounds checks. + for (; static_cast(n) < ysize; ++n) { + const size_t top = n - N - 1; + const size_t bottom = n + N - 1; + VerticalBlock( + d1_1, d1_3, d1_5, n2_1, n2_3, n2_5, + TwoInputs(in.ConstRow(top) + x, + bottom < ysize ? in.ConstRow(bottom) + x : zero), + ctr, ring_buffer, OutputStore(), out->Row(n) + x); + } +} + +// Apply 1D vertical scan to multiple columns (one per vector lane). +// Not yet parallelized. +void FastGaussianVertical(const hwy::AlignedUniquePtr& rg, + const ImageF& in, ThreadPool* /*pool*/, + ImageF* JXL_RESTRICT out) { + PROFILER_FUNC; + JXL_CHECK(SameSize(in, *out)); + + constexpr size_t kCacheLineLanes = 64 / sizeof(float); + constexpr size_t kVN = MaxLanes(HWY_FULL(float)()); + constexpr size_t kCacheLineVectors = + (kVN < kCacheLineLanes) ? (kCacheLineLanes / kVN) : 4; + constexpr size_t kFastPace = kCacheLineVectors * kVN; + + size_t x = 0; + for (; x + kFastPace <= in.xsize(); x += kFastPace) { + VerticalStrip(rg, in, x, out); + } + for (; x < in.xsize(); x += kVN) { + VerticalStrip<1>(rg, in, x, out); + } +} + +// TODO(veluca): consider replacing with FastGaussian. +ImageF ConvolveXSampleAndTranspose(const ImageF& in, + const std::vector& kernel, + const size_t res) { + JXL_ASSERT(kernel.size() % 2 == 1); + JXL_ASSERT(in.xsize() % res == 0); + const size_t offset = res / 2; + const size_t out_xsize = in.xsize() / res; + ImageF out(in.ysize(), out_xsize); + const int r = kernel.size() / 2; + HWY_FULL(float) df; + std::vector row_tmp(in.xsize() + 2 * r + Lanes(df)); + float* const JXL_RESTRICT rowp = &row_tmp[r]; + std::vector padded_k = kernel; + padded_k.resize(padded_k.size() + Lanes(df)); + const float* const kernelp = &padded_k[r]; + for (size_t y = 0; y < in.ysize(); ++y) { + ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r); + size_t x = offset, ox = 0; + for (; x < static_cast(r) && x < in.xsize(); x += res, ++ox) { + float sum = 0.0f; + for (int i = -r; i <= r; ++i) { + sum += rowp[std::max( + 0, std::min(static_cast(x) + i, in.xsize()))] * + kernelp[i]; + } + out.Row(ox)[y] = sum; + } + for (; x + r < in.xsize(); x += res, ++ox) { + auto sum = Zero(df); + for (int i = -r; i <= r; i += Lanes(df)) { + sum = MulAdd(LoadU(df, rowp + x + i), LoadU(df, kernelp + i), sum); + } + out.Row(ox)[y] = GetLane(SumOfLanes(df, sum)); + } + for (; x < in.xsize(); x += res, ++ox) { + float sum = 0.0f; + for (int i = -r; i <= r; ++i) { + sum += rowp[std::max( + 0, std::min(static_cast(x) + i, in.xsize()))] * + kernelp[i]; + } + out.Row(ox)[y] = sum; + } + } + return out; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(FastGaussian1D); +HWY_EXPORT(ConvolveXSampleAndTranspose); +void FastGaussian1D(const hwy::AlignedUniquePtr& rg, + const float* JXL_RESTRICT in, intptr_t width, + float* JXL_RESTRICT out) { + return HWY_DYNAMIC_DISPATCH(FastGaussian1D)(rg, in, width, out); +} + +HWY_EXPORT(FastGaussianVertical); // Local function. + +void ExtrapolateBorders(const float* const JXL_RESTRICT row_in, + float* const JXL_RESTRICT row_out, const int xsize, + const int radius) { + const int lastcol = xsize - 1; + for (int x = 1; x <= radius; ++x) { + row_out[-x] = row_in[std::min(x, xsize - 1)]; + } + memcpy(row_out, row_in, xsize * sizeof(row_out[0])); + for (int x = 1; x <= radius; ++x) { + row_out[lastcol + x] = row_in[std::max(0, lastcol - x)]; + } +} + +ImageF ConvolveXSampleAndTranspose(const ImageF& in, + const std::vector& kernel, + const size_t res) { + return HWY_DYNAMIC_DISPATCH(ConvolveXSampleAndTranspose)(in, kernel, res); +} + +Image3F ConvolveXSampleAndTranspose(const Image3F& in, + const std::vector& kernel, + const size_t res) { + return Image3F(ConvolveXSampleAndTranspose(in.Plane(0), kernel, res), + ConvolveXSampleAndTranspose(in.Plane(1), kernel, res), + ConvolveXSampleAndTranspose(in.Plane(2), kernel, res)); +} + +ImageF ConvolveAndSample(const ImageF& in, const std::vector& kernel, + const size_t res) { + ImageF tmp = ConvolveXSampleAndTranspose(in, kernel, res); + return ConvolveXSampleAndTranspose(tmp, kernel, res); +} + +// Implements "Recursive Implementation of the Gaussian Filter Using Truncated +// Cosine Functions" by Charalampidis [2016]. +hwy::AlignedUniquePtr CreateRecursiveGaussian(double sigma) { + PROFILER_FUNC; + auto rg = hwy::MakeUniqueAligned(); + constexpr double kPi = 3.141592653589793238; + + const double radius = roundf(3.2795 * sigma + 0.2546); // (57), "N" + + // Table I, first row + const double pi_div_2r = kPi / (2.0 * radius); + const double omega[3] = {pi_div_2r, 3.0 * pi_div_2r, 5.0 * pi_div_2r}; + + // (37), k={1,3,5} + const double p_1 = +1.0 / std::tan(0.5 * omega[0]); + const double p_3 = -1.0 / std::tan(0.5 * omega[1]); + const double p_5 = +1.0 / std::tan(0.5 * omega[2]); + + // (44), k={1,3,5} + const double r_1 = +p_1 * p_1 / std::sin(omega[0]); + const double r_3 = -p_3 * p_3 / std::sin(omega[1]); + const double r_5 = +p_5 * p_5 / std::sin(omega[2]); + + // (50), k={1,3,5} + const double neg_half_sigma2 = -0.5 * sigma * sigma; + const double recip_radius = 1.0 / radius; + double rho[3]; + for (size_t i = 0; i < 3; ++i) { + rho[i] = std::exp(neg_half_sigma2 * omega[i] * omega[i]) * recip_radius; + } + + // second part of (52), k1,k2 = 1,3; 3,5; 5,1 + const double D_13 = p_1 * r_3 - r_1 * p_3; + const double D_35 = p_3 * r_5 - r_3 * p_5; + const double D_51 = p_5 * r_1 - r_5 * p_1; + + // (52), k=5 + const double recip_d13 = 1.0 / D_13; + const double zeta_15 = D_35 * recip_d13; + const double zeta_35 = D_51 * recip_d13; + + double A[9] = {p_1, p_3, p_5, // + r_1, r_3, r_5, // (56) + zeta_15, zeta_35, 1}; + JXL_CHECK(Inv3x3Matrix(A)); + const double gamma[3] = {1, radius * radius - sigma * sigma, // (55) + zeta_15 * rho[0] + zeta_35 * rho[1] + rho[2]}; + double beta[3]; + Mul3x3Vector(A, gamma, beta); // (53) + + // Sanity check: correctly solved for beta (IIR filter weights are normalized) + const double sum = beta[0] * p_1 + beta[1] * p_3 + beta[2] * p_5; // (39) + JXL_ASSERT(std::abs(sum - 1) < 1E-12); + (void)sum; + + rg->radius = static_cast(radius); + + double n2[3]; + double d1[3]; + for (size_t i = 0; i < 3; ++i) { + n2[i] = -beta[i] * std::cos(omega[i] * (radius + 1.0)); // (33) + d1[i] = -2.0 * std::cos(omega[i]); // (33) + + for (size_t lane = 0; lane < 4; ++lane) { + rg->n2[4 * i + lane] = static_cast(n2[i]); + rg->d1[4 * i + lane] = static_cast(d1[i]); + } + + const double d_2 = d1[i] * d1[i]; + + // Obtained by expanding (35) for four consecutive outputs via sympy: + // n, d, p, pp = symbols('n d p pp') + // i0, i1, i2, i3 = symbols('i0 i1 i2 i3') + // o0, o1, o2, o3 = symbols('o0 o1 o2 o3') + // o0 = n*i0 - d*p - pp + // o1 = n*i1 - d*o0 - p + // o2 = n*i2 - d*o1 - o0 + // o3 = n*i3 - d*o2 - o1 + // Then expand(o3) and gather terms for p(prev), pp(prev2) etc. + rg->mul_prev[4 * i + 0] = -d1[i]; + rg->mul_prev[4 * i + 1] = d_2 - 1.0; + rg->mul_prev[4 * i + 2] = -d_2 * d1[i] + 2.0 * d1[i]; + rg->mul_prev[4 * i + 3] = d_2 * d_2 - 3.0 * d_2 + 1.0; + rg->mul_prev2[4 * i + 0] = -1.0; + rg->mul_prev2[4 * i + 1] = d1[i]; + rg->mul_prev2[4 * i + 2] = -d_2 + 1.0; + rg->mul_prev2[4 * i + 3] = d_2 * d1[i] - 2.0 * d1[i]; + rg->mul_in[4 * i + 0] = n2[i]; + rg->mul_in[4 * i + 1] = -d1[i] * n2[i]; + rg->mul_in[4 * i + 2] = d_2 * n2[i] - n2[i]; + rg->mul_in[4 * i + 3] = -d_2 * d1[i] * n2[i] + 2.0 * d1[i] * n2[i]; + } + return rg; +} + +namespace { + +// Apply 1D horizontal scan to each row. +void FastGaussianHorizontal(const hwy::AlignedUniquePtr& rg, + const ImageF& in, ThreadPool* pool, + ImageF* JXL_RESTRICT out) { + PROFILER_FUNC; + JXL_CHECK(SameSize(in, *out)); + + const intptr_t xsize = in.xsize(); + JXL_CHECK(RunOnPool( + pool, 0, in.ysize(), ThreadPool::NoInit, + [&](const uint32_t task, size_t /*thread*/) { + const size_t y = task; + const float* row_in = in.ConstRow(y); + float* JXL_RESTRICT row_out = out->Row(y); + FastGaussian1D(rg, row_in, xsize, row_out); + }, + "FastGaussianHorizontal")); +} + +} // namespace + +void FastGaussian(const hwy::AlignedUniquePtr& rg, + const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp, + ImageF* JXL_RESTRICT out) { + FastGaussianHorizontal(rg, in, pool, temp); + HWY_DYNAMIC_DISPATCH(FastGaussianVertical)(rg, *temp, pool, out); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur.h b/third_party/jpeg-xl/lib/jxl/gauss_blur.h new file mode 100644 index 0000000000..fb4741f03a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/gauss_blur.h @@ -0,0 +1,94 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_GAUSS_BLUR_H_ +#define LIB_JXL_GAUSS_BLUR_H_ + +#include + +#include +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image.h" + +namespace jxl { + +template +std::vector GaussianKernel(int radius, T sigma) { + JXL_ASSERT(sigma > 0.0); + std::vector kernel(2 * radius + 1); + const T scaler = -1.0 / (2 * sigma * sigma); + double sum = 0.0; + for (int i = -radius; i <= radius; ++i) { + const T val = std::exp(scaler * i * i); + kernel[i + radius] = val; + sum += val; + } + for (size_t i = 0; i < kernel.size(); ++i) { + kernel[i] /= sum; + } + return kernel; +} + +// All convolution functions below apply mirroring of the input on the borders +// in the following way: +// +// input: [a0 a1 a2 ... aN] +// mirrored input: [aR ... a1 | a0 a1 a2 .... aN | aN-1 ... aN-R] +// +// where R is the radius of the kernel (i.e. kernel size is 2*R+1). + +// REQUIRES: in.xsize() and in.ysize() are integer multiples of res. +ImageF ConvolveAndSample(const ImageF& in, const std::vector& kernel, + const size_t res); + +// Private, used by test. +void ExtrapolateBorders(const float* const JXL_RESTRICT row_in, + float* const JXL_RESTRICT row_out, const int xsize, + const int radius); + +// Only for use by CreateRecursiveGaussian and FastGaussian*. +#pragma pack(push, 1) +struct RecursiveGaussian { + // For k={1,3,5} in that order, each broadcasted 4x for LoadDup128. Used only + // for vertical passes. + float n2[3 * 4]; + float d1[3 * 4]; + + // We unroll horizontal passes 4x - one output per lane. These are each lane's + // multiplier for the previous output (relative to the first of the four + // outputs). Indexing: 4 * 0..2 (for {1,3,5}) + 0..3 for the lane index. + float mul_prev[3 * 4]; + // Ditto for the second to last output. + float mul_prev2[3 * 4]; + + // We multiply a vector of inputs 0..3 by a vector shifted from this array. + // in=0 uses all 4 (nonzero) terms; for in=3, the lower three lanes are 0. + float mul_in[3 * 4]; + + size_t radius; +}; +#pragma pack(pop) + +// Precomputation for FastGaussian*; users may use the same pointer/storage in +// subsequent calls to FastGaussian* with the same sigma. +hwy::AlignedUniquePtr CreateRecursiveGaussian(double sigma); + +// 1D Gaussian with zero-pad boundary handling and runtime independent of sigma. +void FastGaussian1D(const hwy::AlignedUniquePtr& rg, + const float* JXL_RESTRICT in, intptr_t width, + float* JXL_RESTRICT out); + +// 2D Gaussian with zero-pad boundary handling and runtime independent of sigma. +void FastGaussian(const hwy::AlignedUniquePtr& rg, + const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp, + ImageF* JXL_RESTRICT out); + +} // namespace jxl + +#endif // LIB_JXL_GAUSS_BLUR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc b/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc new file mode 100644 index 0000000000..b1bb64abc5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc @@ -0,0 +1,126 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include "benchmark/benchmark.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/gauss_blur.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { +namespace { + +JXL_MAYBE_UNUSED ImageF Convolve(const ImageF& in, + const std::vector& kernel) { + return ConvolveAndSample(in, kernel, 1); +} + +void BM_GaussBlur1d(benchmark::State& state) { + // Uncomment to disable SIMD and force and scalar implementation + // hwy::DisableTargets(~HWY_SCALAR); + // Uncomment to run AVX2 + // hwy::DisableTargets(HWY_AVX3); + + const size_t length = state.range(); + const double sigma = 7.0; // (from Butteraugli application) + ImageF in(length, 1); + const float expected = length; + FillImage(expected, &in); + + ImageF temp(length, 1); + ImageF out(length, 1); + const auto rg = CreateRecursiveGaussian(sigma); + for (auto _ : state) { + FastGaussian1D(rg, in.Row(0), length, out.Row(0)); + // Prevent optimizing out + JXL_ASSERT(std::abs(out.ConstRow(0)[length / 2] - expected) / expected < + 9E-5); + } + state.SetItemsProcessed(length * state.iterations()); +} + +void BM_GaussBlur2d(benchmark::State& state) { + // See GaussBlur1d for SIMD changes. + + const size_t xsize = state.range(); + const size_t ysize = xsize; + const double sigma = 7.0; // (from Butteraugli application) + ImageF in(xsize, ysize); + const float expected = xsize + ysize; + FillImage(expected, &in); + + ImageF temp(xsize, ysize); + ImageF out(xsize, ysize); + ThreadPool* null_pool = nullptr; + const auto rg = CreateRecursiveGaussian(sigma); + for (auto _ : state) { + FastGaussian(rg, in, null_pool, &temp, &out); + // Prevent optimizing out + JXL_ASSERT(std::abs(out.ConstRow(ysize / 2)[xsize / 2] - expected) / + expected < + 9E-5); + } + state.SetItemsProcessed(xsize * ysize * state.iterations()); +} + +void BM_GaussBlurFir(benchmark::State& state) { + // See GaussBlur1d for SIMD changes. + + const size_t xsize = state.range(); + const size_t ysize = xsize; + const double sigma = 7.0; // (from Butteraugli application) + ImageF in(xsize, ysize); + const float expected = xsize + ysize; + FillImage(expected, &in); + + ImageF temp(xsize, ysize); + ImageF out(xsize, ysize); + const std::vector kernel = + GaussianKernel(static_cast(4 * sigma), static_cast(sigma)); + for (auto _ : state) { + // Prevent optimizing out + JXL_ASSERT(std::abs(Convolve(in, kernel).ConstRow(ysize / 2)[xsize / 2] - + expected) / + expected < + 9E-5); + } + state.SetItemsProcessed(xsize * ysize * state.iterations()); +} + +void BM_GaussBlurSep7(benchmark::State& state) { + // See GaussBlur1d for SIMD changes. + + const size_t xsize = state.range(); + const size_t ysize = xsize; + ImageF in(xsize, ysize); + const float expected = xsize + ysize; + FillImage(expected, &in); + + ImageF temp(xsize, ysize); + ImageF out(xsize, ysize); + ThreadPool* null_pool = nullptr; + // Gaussian with sigma 1 + const WeightsSeparable7 weights = {{HWY_REP4(0.383103f), HWY_REP4(0.241843f), + HWY_REP4(0.060626f), HWY_REP4(0.00598f)}, + {HWY_REP4(0.383103f), HWY_REP4(0.241843f), + HWY_REP4(0.060626f), HWY_REP4(0.00598f)}}; + for (auto _ : state) { + Separable7(in, Rect(in), weights, null_pool, &out); + // Prevent optimizing out + JXL_ASSERT(std::abs(out.ConstRow(ysize / 2)[xsize / 2] - expected) / + expected < + 9E-5); + } + state.SetItemsProcessed(xsize * ysize * state.iterations()); +} + +BENCHMARK(BM_GaussBlur1d)->Range(1 << 8, 1 << 14); +BENCHMARK(BM_GaussBlur2d)->Range(1 << 7, 1 << 10); +BENCHMARK(BM_GaussBlurFir)->Range(1 << 7, 1 << 10); +BENCHMARK(BM_GaussBlurSep7)->Range(1 << 7, 1 << 10); + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc b/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc new file mode 100644 index 0000000000..097c1aa8df --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc @@ -0,0 +1,453 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/gauss_blur.h" + +#include +#include +#include + +#include "lib/extras/time.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { + +bool NearEdge(const int64_t width, const int64_t peak) { + // When around 3*sigma from the edge, there is negligible truncation. + return peak < 10 || peak > width - 10; +} + +// Follow the curve downwards by scanning right from `peak` and verifying +// identical values at the same offset to the left. +void VerifySymmetric(const int64_t width, const int64_t peak, + const float* out) { + const double tolerance = NearEdge(width, peak) ? 0.015 : 6E-7; + for (int64_t i = 1;; ++i) { + // Stop if we passed either end of the array + if (peak - i < 0 || peak + i >= width) break; + EXPECT_GT(out[peak + i - 1] + tolerance, out[peak + i]); // descending + EXPECT_NEAR(out[peak - i], out[peak + i], tolerance); // symmetric + } +} + +void TestImpulseResponse(size_t width, size_t peak) { + const auto rg3 = CreateRecursiveGaussian(3.0); + const auto rg4 = CreateRecursiveGaussian(4.0); + const auto rg5 = CreateRecursiveGaussian(5.0); + + // Extra padding for 4x unrolling + auto in = hwy::AllocateAligned(width + 3); + memset(in.get(), 0, sizeof(float) * (width + 3)); + in[peak] = 1.0f; + + auto out3 = hwy::AllocateAligned(width + 3); + auto out4 = hwy::AllocateAligned(width + 3); + auto out5 = hwy::AllocateAligned(width + 3); + FastGaussian1D(rg3, in.get(), width, out3.get()); + FastGaussian1D(rg4, out3.get(), width, out4.get()); + FastGaussian1D(rg5, in.get(), width, out5.get()); + + VerifySymmetric(width, peak, out3.get()); + VerifySymmetric(width, peak, out4.get()); + VerifySymmetric(width, peak, out5.get()); + + // Wider kernel has flatter peak + EXPECT_LT(out5[peak] + 0.05, out3[peak]); + + // Gauss3 o Gauss4 ~= Gauss5 + const double tolerance = NearEdge(width, peak) ? 0.04 : 0.01; + for (size_t i = 0; i < width; ++i) { + EXPECT_NEAR(out4[i], out5[i], tolerance); + } +} + +void TestImpulseResponseForWidth(size_t width) { + for (size_t i = 0; i < width; ++i) { + TestImpulseResponse(width, i); + } +} + +TEST(GaussBlurTest, ImpulseResponse) { + TestImpulseResponseForWidth(10); // tiny even + TestImpulseResponseForWidth(15); // small odd + TestImpulseResponseForWidth(32); // power of two + TestImpulseResponseForWidth(31); // power of two - 1 + TestImpulseResponseForWidth(33); // power of two + 1 +} + +ImageF Convolve(const ImageF& in, const std::vector& kernel) { + return ConvolveAndSample(in, kernel, 1); +} + +// Higher-precision version for accuracy test. +ImageF ConvolveAndTransposeF64(const ImageF& in, + const std::vector& kernel) { + JXL_ASSERT(kernel.size() % 2 == 1); + ImageF out(in.ysize(), in.xsize()); + const int r = kernel.size() / 2; + std::vector row_tmp(in.xsize() + 2 * r); + float* const JXL_RESTRICT rowp = &row_tmp[r]; + const double* const kernelp = &kernel[r]; + for (size_t y = 0; y < in.ysize(); ++y) { + ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r); + for (size_t x = 0, ox = 0; x < in.xsize(); ++x, ++ox) { + double sum = 0.0; + for (int i = -r; i <= r; ++i) { + sum += rowp[std::max( + 0, std::min(static_cast(x) + i, in.xsize()))] * + kernelp[i]; + } + out.Row(ox)[y] = static_cast(sum); + } + } + return out; +} + +ImageF ConvolveF64(const ImageF& in, const std::vector& kernel) { + ImageF tmp = ConvolveAndTransposeF64(in, kernel); + return ConvolveAndTransposeF64(tmp, kernel); +} + +void TestDirac2D(size_t xsize, size_t ysize, double sigma) { + ImageF in(xsize, ysize); + ZeroFillImage(&in); + // We anyway ignore the border below, so might as well choose the middle. + in.Row(ysize / 2)[xsize / 2] = 1.0f; + + ImageF temp(xsize, ysize); + ImageF out(xsize, ysize); + const auto rg = CreateRecursiveGaussian(sigma); + ThreadPool* null_pool = nullptr; + FastGaussian(rg, in, null_pool, &temp, &out); + + const std::vector kernel = + GaussianKernel(static_cast(4 * sigma), static_cast(sigma)); + const ImageF expected = Convolve(in, kernel); + + const double max_l1 = sigma < 1.5 ? 5E-3 : 6E-4; + const size_t border = 2 * sigma; + + JXL_ASSERT_OK(VerifyRelativeError(expected, out, max_l1, 1E-8, _, border)); +} + +TEST(GaussBlurTest, Test2D) { + const std::vector dimensions{6, 15, 17, 64, 50, 49}; + for (int xsize : dimensions) { + for (int ysize : dimensions) { + for (double sigma : {1.0, 2.5, 3.6, 7.0}) { + TestDirac2D(static_cast(xsize), static_cast(ysize), + sigma); + } + } + } +} + +// Slow (44 sec). To run, remove the disabled prefix. +TEST(GaussBlurTest, DISABLED_SlowTestDirac1D) { + const double sigma = 7.0; + const auto rg = CreateRecursiveGaussian(sigma); + + // IPOL accuracy test uses 10^-15 tolerance, this is 2*10^-11. + const size_t radius = static_cast(7 * sigma); + const std::vector kernel = GaussianKernel(radius, sigma); + + const size_t length = 16384; + ImageF inputs(length, 1); + ZeroFillImage(&inputs); + + auto outputs = hwy::AllocateAligned(length); + + // One per center position + auto sum_abs_err = hwy::AllocateAligned(length); + std::fill(sum_abs_err.get(), sum_abs_err.get() + length, 0.0); + + for (size_t center = radius; center < length - radius; ++center) { + inputs.Row(0)[center - 1] = 0.0f; // reset last peak, entire array now 0 + inputs.Row(0)[center] = 1.0f; + FastGaussian1D(rg, inputs.Row(0), length, outputs.get()); + + const ImageF outputs_fir = ConvolveF64(inputs, kernel); + + for (size_t i = 0; i < length; ++i) { + const float abs_err = std::abs(outputs[i] - outputs_fir.Row(0)[i]); + sum_abs_err[i] += static_cast(abs_err); + } + } + + const double max_abs_err = + *std::max_element(sum_abs_err.get(), sum_abs_err.get() + length); + printf("Max abs err: %.8e\n", max_abs_err); +} + +void TestRandom(size_t xsize, size_t ysize, float min, float max, double sigma, + double max_l1, double max_rel) { + printf("%4" PRIuS " x %4" PRIuS " %4.1f %4.1f sigma %.1f\n", xsize, ysize, + min, max, sigma); + ImageF in(xsize, ysize); + RandomFillImage(&in, min, max, 65537 + xsize * 129 + ysize); + // FastGaussian/Convolve handle borders differently, so keep those pixels 0. + const size_t border = 4 * sigma; + SetBorder(border, 0.0f, &in); + + ImageF temp(xsize, ysize); + ImageF out(xsize, ysize); + const auto rg = CreateRecursiveGaussian(sigma); + ThreadPool* null_pool = nullptr; + FastGaussian(rg, in, null_pool, &temp, &out); + + const std::vector kernel = + GaussianKernel(static_cast(4 * sigma), static_cast(sigma)); + const ImageF expected = Convolve(in, kernel); + + JXL_ASSERT_OK(VerifyRelativeError(expected, out, max_l1, max_rel, _, border)); +} + +void TestRandomForSizes(float min, float max, double sigma) { + double max_l1 = 6E-3; + double max_rel = 3E-3; + TestRandom(128, 1, min, max, sigma, max_l1, max_rel); + TestRandom(1, 128, min, max, sigma, max_l1, max_rel); + TestRandom(30, 201, min, max, sigma, max_l1 * 1.6, max_rel * 1.2); + TestRandom(201, 30, min, max, sigma, max_l1 * 1.6, max_rel * 1.2); + TestRandom(201, 201, min, max, sigma, max_l1 * 2.0, max_rel * 1.2); +} + +TEST(GaussBlurTest, TestRandom) { + // small non-negative + TestRandomForSizes(0.0f, 10.0f, 3.0f); + TestRandomForSizes(0.0f, 10.0f, 7.0f); + + // small negative + TestRandomForSizes(-4.0f, -1.0f, 3.0f); + TestRandomForSizes(-4.0f, -1.0f, 7.0f); + + // mixed positive/negative + TestRandomForSizes(-6.0f, 6.0f, 3.0f); + TestRandomForSizes(-6.0f, 6.0f, 7.0f); +} + +TEST(GaussBlurTest, TestSign) { + const size_t xsize = 500; + const size_t ysize = 606; + ImageF in(xsize, ysize); + + ZeroFillImage(&in); + const float center[33 * 33] = { + -0.128445f, -0.098473f, -0.121883f, -0.093601f, 0.095665f, -0.271332f, + -0.705475f, -1.324005f, -2.020741f, -1.329464f, 1.834064f, 4.787300f, + 5.834560f, 5.272720f, 3.967960f, 3.547935f, 3.432732f, 3.383015f, + 3.239326f, 3.290806f, 3.298954f, 3.397808f, 3.359730f, 3.533844f, + 3.511856f, 3.436787f, 3.428310f, 3.460209f, 3.550011f, 3.590942f, + 3.593109f, 3.560005f, 3.443165f, 0.089741f, 0.179230f, -0.032997f, + -0.182610f, 0.005669f, -0.244759f, -0.395123f, -0.514961f, -1.003529f, + -1.798656f, -2.377975f, 0.222191f, 3.957664f, 5.946804f, 5.543129f, + 4.290096f, 3.621010f, 3.407257f, 3.392494f, 3.345367f, 3.391903f, + 3.441605f, 3.429260f, 3.444969f, 3.507130f, 3.518612f, 3.443111f, + 3.475948f, 3.536148f, 3.470333f, 3.628311f, 3.600243f, 3.292892f, + -0.226730f, -0.573616f, -0.762165f, -0.398739f, -0.189842f, -0.275921f, + -0.446739f, -0.550037f, -0.461033f, -0.724792f, -1.448349f, -1.814064f, + -0.491032f, 2.817703f, 5.213242f, 5.675629f, 4.864548f, 3.876324f, + 3.535587f, 3.530312f, 3.413765f, 3.386261f, 3.404854f, 3.383472f, + 3.420830f, 3.326496f, 3.257877f, 3.362152f, 3.489609f, 3.619587f, + 3.555805f, 3.423164f, 3.309708f, -0.483940f, -0.502926f, -0.592983f, + -0.492527f, -0.413616f, -0.482555f, -0.475506f, -0.447990f, -0.338120f, + -0.189072f, -0.376427f, -0.910828f, -1.878044f, -1.937927f, 1.423218f, + 4.871609f, 5.767548f, 5.103741f, 3.983868f, 3.633003f, 3.458263f, + 3.507309f, 3.247021f, 3.220612f, 3.326061f, 3.352814f, 3.291061f, + 3.322739f, 3.444302f, 3.506207f, 3.556839f, 3.529575f, 3.457024f, + -0.408161f, -0.431343f, -0.454369f, -0.356419f, -0.380924f, -0.399452f, + -0.439476f, -0.412189f, -0.306816f, -0.008213f, -0.325813f, -0.537842f, + -0.984100f, -1.805332f, -2.028198f, 0.773205f, 4.423046f, 5.604839f, + 5.231617f, 4.080299f, 3.603008f, 3.498741f, 3.517010f, 3.333897f, + 3.381336f, 3.342617f, 3.369686f, 3.434155f, 3.490452f, 3.607029f, + 3.555298f, 3.702297f, 3.618679f, -0.503609f, -0.578564f, -0.419014f, + -0.239883f, 0.269836f, 0.022984f, -0.455067f, -0.621777f, -0.304176f, + -0.163792f, -0.490250f, -0.466637f, -0.391792f, -0.657940f, -1.498035f, + -1.895836f, 0.036537f, 3.462456f, 5.586445f, 5.658791f, 4.434784f, + 3.423435f, 3.318848f, 3.202328f, 3.532764f, 3.436687f, 3.354881f, + 3.356941f, 3.382645f, 3.503902f, 3.512867f, 3.632366f, 3.537312f, + -0.274734f, -0.658829f, -0.726532f, -0.281254f, 0.053196f, -0.064991f, + -0.608517f, -0.720966f, -0.070602f, -0.111320f, -0.440956f, -0.492180f, + -0.488762f, -0.569283f, -1.012741f, -1.582779f, -2.101479f, -1.392380f, + 2.451153f, 5.555855f, 6.096313f, 5.230045f, 4.068172f, 3.404274f, + 3.392586f, 3.326065f, 3.156670f, 3.284828f, 3.347012f, 3.319252f, + 3.352310f, 3.610790f, 3.499847f, -0.150600f, -0.314445f, -0.093575f, + -0.057384f, 0.053688f, -0.189255f, -0.263515f, -0.318653f, 0.053246f, + 0.080627f, -0.119553f, -0.152454f, -0.305420f, -0.404869f, -0.385944f, + -0.689949f, -1.204914f, -1.985748f, -1.711361f, 1.260658f, 4.626896f, + 5.888351f, 5.450989f, 4.070587f, 3.539200f, 3.383492f, 3.296318f, + 3.267334f, 3.436028f, 3.463005f, 3.502625f, 3.522282f, 3.403763f, + -0.348049f, -0.302303f, -0.137016f, -0.041737f, -0.164001f, -0.358849f, + -0.469627f, -0.428291f, -0.375797f, -0.246346f, -0.118950f, -0.084229f, + -0.205681f, -0.241199f, -0.391796f, -0.323151f, -0.241211f, -0.834137f, + -1.684219f, -1.972137f, 0.448399f, 4.019985f, 5.648144f, 5.647846f, + 4.295094f, 3.641884f, 3.374790f, 3.197342f, 3.425545f, 3.507481f, + 3.478065f, 3.430889f, 3.341900f, -1.016304f, -0.959221f, -0.909466f, + -0.810715f, -0.590729f, -0.594467f, -0.646721f, -0.629364f, -0.528561f, + -0.551819f, -0.301086f, -0.149101f, -0.060146f, -0.162220f, -0.326210f, + -0.156548f, -0.036293f, -0.426098f, -1.145470f, -1.628998f, -2.003052f, + -1.142891f, 2.885162f, 5.652863f, 5.718426f, 4.911140f, 3.234222f, + 3.473373f, 3.577183f, 3.271603f, 3.410435f, 3.505489f, 3.434032f, + -0.508911f, -0.438797f, -0.437450f, -0.627426f, -0.511745f, -0.304874f, + -0.274246f, -0.261841f, -0.228466f, -0.342491f, -0.528206f, -0.490082f, + -0.516350f, -0.361694f, -0.398514f, -0.276020f, -0.210369f, -0.355938f, + -0.402622f, -0.538864f, -1.249573f, -2.100105f, -0.996178f, 1.886410f, + 4.929745f, 5.630871f, 5.444199f, 4.042740f, 3.739189f, 3.691399f, + 3.391956f, 3.469696f, 3.431232f, 0.204849f, 0.205433f, -0.131927f, + -0.367908f, -0.374378f, -0.126820f, -0.186951f, -0.228565f, -0.081776f, + -0.143143f, -0.379230f, -0.598701f, -0.458019f, -0.295586f, -0.407730f, + -0.245853f, -0.043140f, 0.024242f, -0.038998f, -0.044151f, -0.425991f, + -1.240753f, -1.943146f, -2.174755f, 0.523415f, 4.376751f, 5.956558f, + 5.850082f, 4.403152f, 3.517399f, 3.560753f, 3.554836f, 3.471985f, + -0.508503f, -0.109783f, 0.057747f, 0.190079f, -0.257153f, -0.591980f, + -0.666771f, -0.525391f, -0.293060f, -0.489731f, -0.304855f, -0.259644f, + -0.367825f, -0.346977f, -0.292889f, -0.215652f, -0.120705f, -0.176010f, + -0.422905f, -0.114647f, -0.289749f, -0.374203f, -0.606754f, -1.127949f, + -1.994583f, -0.588058f, 3.415840f, 5.603470f, 5.811581f, 4.959423f, + 3.721760f, 3.710499f, 3.785461f, -0.554588f, -0.565517f, -0.434578f, + -0.012482f, -0.284660f, -0.699795f, -0.957535f, -0.755135f, -0.382034f, + -0.321552f, -0.287571f, -0.279537f, -0.314972f, -0.256287f, -0.372818f, + -0.316017f, -0.287975f, -0.365639f, -0.512589f, -0.420692f, -0.436485f, + -0.295353f, -0.451958f, -0.755459f, -1.272358f, -2.301353f, -1.776161f, + 1.572483f, 4.826286f, 5.741898f, 5.162853f, 4.028049f, 3.686325f, + -0.495590f, -0.664413f, -0.760044f, -0.152634f, -0.286480f, -0.340462f, + 0.076477f, 0.187706f, -0.068787f, -0.293491f, -0.361145f, -0.292515f, + -0.140671f, -0.190723f, -0.333302f, -0.368168f, -0.192581f, -0.154499f, + -0.236544f, -0.124405f, -0.208321f, -0.465607f, -0.883080f, -1.104813f, + -1.210567f, -1.415665f, -1.924683f, -1.634758f, 0.601017f, 4.276672f, + 5.501350f, 5.331257f, 3.809288f, -0.727722f, -0.533619f, -0.511524f, + -0.470688f, -0.610710f, -0.575130f, -0.311115f, -0.090420f, -0.297676f, + -0.646118f, -0.742805f, -0.485050f, -0.330910f, -0.275417f, -0.357037f, + -0.425598f, -0.481876f, -0.488941f, -0.393551f, -0.051105f, -0.090755f, + -0.328674f, -0.536369f, -0.533684f, -0.336960f, -0.689194f, -1.187195f, + -1.860954f, -2.290253f, -0.424774f, 3.050060f, 5.083332f, 5.291920f, + -0.343605f, -0.190975f, -0.303692f, -0.456512f, -0.681820f, -0.690693f, + -0.416729f, -0.286446f, -0.442055f, -0.709148f, -0.569160f, -0.382423f, + -0.402321f, -0.383362f, -0.366413f, -0.290718f, -0.110069f, -0.220280f, + -0.279018f, -0.255424f, -0.262081f, -0.487556f, -0.444492f, -0.250500f, + -0.119583f, -0.291557f, -0.537781f, -1.104073f, -1.737091f, -1.697441f, + -0.323456f, 2.042049f, 4.605103f, -0.310631f, -0.279568f, -0.012695f, + -0.160130f, -0.358746f, -0.421101f, -0.559677f, -0.474136f, -0.416565f, + -0.561817f, -0.534672f, -0.519157f, -0.767197f, -0.605831f, -0.186523f, + 0.219872f, 0.264984f, -0.193432f, -0.363182f, -0.467472f, -0.462009f, + -0.571053f, -0.522476f, -0.315903f, -0.237427f, -0.147320f, -0.100201f, + -0.237568f, -0.763435f, -1.242043f, -2.135159f, -1.409485f, 1.236370f, + -0.474247f, -0.517906f, -0.410217f, -0.542244f, -0.795986f, -0.590004f, + -0.388863f, -0.462921f, -0.810627f, -0.778637f, -0.512486f, -0.718025f, + -0.710854f, -0.482513f, -0.318233f, -0.194962f, -0.220116f, -0.421673f, + -0.534233f, -0.403339f, -0.389332f, -0.407303f, -0.437355f, -0.469730f, + -0.359600f, -0.352745f, -0.466755f, -0.414585f, -0.430756f, -0.656822f, + -1.237038f, -2.046097f, -1.574898f, -0.593815f, -0.582165f, -0.336098f, + -0.372612f, -0.554386f, -0.410603f, -0.428276f, -0.647644f, -0.640720f, + -0.582207f, -0.414112f, -0.435547f, -0.435505f, -0.332561f, -0.248116f, + -0.340221f, -0.277855f, -0.352699f, -0.377319f, -0.230850f, -0.313267f, + -0.446270f, -0.346237f, -0.420422f, -0.530781f, -0.400341f, -0.463661f, + -0.209091f, -0.056705f, -0.011772f, -0.169388f, -0.736275f, -1.463017f, + -0.752701f, -0.668865f, -0.329765f, -0.299347f, -0.245667f, -0.286999f, + -0.520420f, -0.675438f, -0.255753f, 0.141357f, -0.079639f, -0.419476f, + -0.374069f, -0.046253f, 0.116116f, -0.145847f, -0.380371f, -0.563412f, + -0.638634f, -0.310116f, -0.260914f, -0.508404f, -0.465508f, -0.527824f, + -0.370979f, -0.305595f, -0.244694f, -0.254490f, 0.009968f, -0.050201f, + -0.331219f, -0.614960f, -0.788208f, -0.483242f, -0.367516f, -0.186951f, + -0.180031f, 0.129711f, -0.127811f, -0.384750f, -0.499542f, -0.418613f, + -0.121635f, 0.203197f, -0.167290f, -0.397270f, -0.355461f, -0.218746f, + -0.376785f, -0.521698f, -0.721581f, -0.845741f, -0.535439f, -0.220882f, + -0.309067f, -0.555248f, -0.690342f, -0.664948f, -0.390102f, 0.020355f, + -0.130447f, -0.173252f, -0.170059f, -0.633663f, -0.956001f, -0.621696f, + -0.388302f, -0.342262f, -0.244370f, -0.386948f, -0.401421f, -0.172979f, + -0.206163f, -0.450058f, -0.525789f, -0.549274f, -0.349251f, -0.474613f, + -0.667976f, -0.435600f, -0.175369f, -0.196877f, -0.202976f, -0.242481f, + -0.258369f, -0.189133f, -0.395397f, -0.765499f, -0.944016f, -0.850967f, + -0.631561f, -0.152493f, -0.046432f, -0.262066f, -0.195919f, 0.048218f, + 0.084972f, 0.039902f, 0.000618f, -0.404430f, -0.447456f, -0.418076f, + -0.631935f, -0.717415f, -0.502888f, -0.530514f, -0.747826f, -0.704041f, + -0.674969f, -0.516853f, -0.418446f, -0.327740f, -0.308815f, -0.481636f, + -0.440083f, -0.481720f, -0.341053f, -0.283897f, -0.324368f, -0.352829f, + -0.434349f, -0.545589f, -0.533104f, -0.472755f, -0.570496f, -0.557735f, + -0.708176f, -0.493332f, -0.194416f, -0.186249f, -0.256710f, -0.271835f, + -0.304752f, -0.431267f, -0.422398f, -0.646725f, -0.680801f, -0.249031f, + -0.058567f, -0.213890f, -0.383949f, -0.540291f, -0.549877f, -0.225567f, + -0.037174f, -0.499874f, -0.641010f, -0.628044f, -0.390549f, -0.311497f, + -0.542313f, -0.569565f, -0.473408f, -0.331245f, -0.357197f, -0.285599f, + -0.200157f, -0.201866f, -0.124428f, -0.346016f, -0.392311f, -0.264496f, + -0.285370f, -0.436974f, -0.523483f, -0.410461f, -0.267925f, -0.055016f, + -0.382458f, -0.319771f, -0.049927f, 0.124329f, 0.266102f, -0.106606f, + -0.773647f, -0.973053f, -0.708206f, -0.486137f, -0.319923f, -0.493900f, + -0.490860f, -0.324986f, -0.147346f, -0.146088f, -0.161758f, -0.084396f, + -0.379494f, 0.041626f, -0.113361f, -0.277767f, 0.083366f, 0.126476f, + 0.139057f, 0.038040f, 0.038162f, -0.242126f, -0.411736f, -0.370049f, + -0.455357f, -0.039257f, 0.264442f, -0.271492f, -0.425346f, -0.514847f, + -0.448650f, -0.580399f, -0.652603f, -0.774803f, -0.692524f, -0.579578f, + -0.465206f, -0.386265f, -0.458012f, -0.446594f, -0.284893f, -0.345448f, + -0.350876f, -0.440350f, -0.360378f, -0.270428f, 0.237213f, -0.063602f, + -0.364529f, -0.179867f, 0.078197f, 0.117947f, -0.093410f, -0.359119f, + -0.480961f, -0.540638f, -0.436287f, -0.598576f, -0.253735f, -0.060093f, + -0.549145f, -0.808327f, -0.698593f, -0.595764f, -0.582508f, -0.497353f, + -0.480892f, -0.584240f, -0.665791f, -0.690903f, -0.743446f, -0.796677f, + -0.782391f, -0.649010f, -0.628139f, -0.880848f, -0.829361f, -0.373272f, + -0.223667f, 0.174572f, -0.348743f, -0.798901f, -0.692307f, -0.607609f, + -0.401455f, -0.480919f, -0.450798f, -0.435413f, -0.322338f, -0.228382f, + -0.450466f, -0.504440f, -0.477402f, -0.662224f, -0.583397f, -0.217445f, + -0.157459f, -0.079584f, -0.226168f, -0.488720f, -0.669624f, -0.666878f, + -0.565311f, -0.549625f, -0.364601f, -0.497627f, -0.736897f, -0.763023f, + -0.741020f, -0.404503f, 0.184814f, -0.075315f, -0.281513f, -0.532906f, + -0.405800f, -0.313438f, -0.536652f, -0.403381f, 0.011967f, 0.103310f, + -0.269848f, -0.508656f, -0.445923f, -0.644859f, -0.617870f, -0.500927f, + -0.371559f, -0.125580f, 0.028625f, -0.154713f, -0.442024f, -0.492764f, + -0.199371f, 0.236305f, 0.225925f, 0.075577f, -0.285812f, -0.437145f, + -0.374260f, -0.156693f, -0.129635f, -0.243206f, -0.123058f, 0.162148f, + -0.313152f, -0.337982f, -0.358421f, 0.040070f, 0.038925f, -0.333313f, + -0.351662f, 0.023014f, 0.091362f, -0.282890f, -0.373253f, -0.389050f, + -0.532707f, -0.423347f, -0.349968f, -0.287045f, -0.202442f, -0.308430f, + -0.222801f, -0.106323f, -0.056358f, 0.027222f, 0.390732f, 0.033558f, + -0.160088f, -0.382217f, -0.535282f, -0.515900f, -0.022736f, 0.165665f, + -0.111408f, -0.233784f, -0.312357f, -0.541885f, -0.480022f, -0.482513f, + -0.246254f, 0.132244f, 0.090134f, 0.234634f, -0.089249f, -0.460854f, + -0.515457f, -0.450874f, -0.311031f, -0.387680f, -0.360554f, -0.179241f, + -0.283817f, -0.475815f, -0.246399f, -0.388958f, -0.551140f, -0.496239f, + -0.559879f, -0.379761f, -0.254288f, -0.395111f, -0.613018f, -0.459427f, + -0.263580f, -0.268929f, 0.080826f, 0.115616f, -0.097324f, -0.325310f, + -0.480450f, -0.313286f, -0.310371f, -0.517361f, -0.288288f, -0.112679f, + -0.173241f, -0.221664f, -0.039452f, -0.107578f, -0.089630f, -0.483768f, + -0.571087f, -0.497108f, -0.321533f, -0.375492f, -0.540363f, -0.406815f, + -0.388512f, -0.514561f, -0.540192f, -0.402412f, -0.232246f, -0.304749f, + -0.383724f, -0.679596f, -0.685463f, -0.694538f, -0.642937f, -0.425789f, + 0.103271f, -0.194862f, -0.487999f, -0.717281f, -0.681850f, -0.709286f, + -0.615398f, -0.554245f, -0.254681f, -0.049950f, -0.002914f, -0.095383f, + -0.370911f, -0.564224f, -0.242714f}; + const size_t xtest = xsize / 2; + const size_t ytest = ysize / 2; + + for (intptr_t dy = -16; dy <= 16; ++dy) { + float* row = in.Row(ytest + dy); + for (intptr_t dx = -16; dx <= 16; ++dx) + row[xtest + dx] = center[(dy + 16) * 33 + (dx + 16)]; + } + + const double sigma = 7.155933; + + ImageF temp(xsize, ysize); + ImageF out_rg(xsize, ysize); + const auto rg = CreateRecursiveGaussian(sigma); + ThreadPool* null_pool = nullptr; + FastGaussian(rg, in, null_pool, &temp, &out_rg); + + ImageF out_old; + { + const std::vector kernel = + GaussianKernel(static_cast(4 * sigma), static_cast(sigma)); + printf("old kernel size %" PRIuS "\n", kernel.size()); + out_old = Convolve(in, kernel); + } + + printf("rg %.4f old %.4f\n", out_rg.Row(ytest)[xtest], + out_old.Row(ytest)[xtest]); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/gradient_test.cc b/third_party/jpeg-xl/lib/jxl/gradient_test.cc new file mode 100644 index 0000000000..282fe89f0a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/gradient_test.cc @@ -0,0 +1,207 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { + +struct AuxOut; + +namespace { + +// Returns distance of point p to line p0..p1, the result is signed and is not +// normalized. +double PointLineDist(double x0, double y0, double x1, double y1, double x, + double y) { + return (y1 - y0) * x - (x1 - x0) * y + x1 * y0 - y1 * x0; +} + +// Generates a test image with a gradient from one color to another. +// Angle in degrees, colors can be given in hex as 0xRRGGBB. The angle is the +// angle in which the change direction happens. +Image3F GenerateTestGradient(uint32_t color0, uint32_t color1, double angle, + size_t xsize, size_t ysize) { + Image3F image(xsize, ysize); + + double x0 = xsize / 2; + double y0 = ysize / 2; + double x1 = x0 + std::sin(angle / 360.0 * 2.0 * kPi); + double y1 = y0 + std::cos(angle / 360.0 * 2.0 * kPi); + + double maxdist = + std::max(fabs(PointLineDist(x0, y0, x1, y1, 0, 0)), + fabs(PointLineDist(x0, y0, x1, y1, xsize, 0))); + + for (size_t c = 0; c < 3; ++c) { + float c0 = ((color0 >> (8 * (2 - c))) & 255); + float c1 = ((color1 >> (8 * (2 - c))) & 255); + for (size_t y = 0; y < ysize; ++y) { + float* row = image.PlaneRow(c, y); + for (size_t x = 0; x < xsize; ++x) { + double dist = PointLineDist(x0, y0, x1, y1, x, y); + double v = ((dist / maxdist) + 1.0) / 2.0; + float color = c0 * (1.0 - v) + c1 * v; + row[x] = color; + } + } + } + + return image; +} + +// Computes the max of the horizontal and vertical second derivative for each +// pixel, where second derivative means absolute value of difference of left +// delta and right delta (top/bottom for vertical direction). +// The radius over which the derivative is computed is only 1 pixel and it only +// checks two angles (hor and ver), but this approximation works well enough. +static ImageF Gradient2(const ImageF& image) { + size_t xsize = image.xsize(); + size_t ysize = image.ysize(); + ImageF image2(image.xsize(), image.ysize()); + for (size_t y = 1; y + 1 < ysize; y++) { + const auto* JXL_RESTRICT row0 = image.Row(y - 1); + const auto* JXL_RESTRICT row1 = image.Row(y); + const auto* JXL_RESTRICT row2 = image.Row(y + 1); + auto* row_out = image2.Row(y); + for (size_t x = 1; x + 1 < xsize; x++) { + float ddx = (row1[x] - row1[x - 1]) - (row1[x + 1] - row1[x]); + float ddy = (row1[x] - row0[x]) - (row2[x] - row1[x]); + row_out[x] = std::max(fabsf(ddx), fabsf(ddy)); + } + } + // Copy to the borders + if (ysize > 2) { + auto* JXL_RESTRICT row0 = image2.Row(0); + const auto* JXL_RESTRICT row1 = image2.Row(1); + const auto* JXL_RESTRICT row2 = image2.Row(ysize - 2); + auto* JXL_RESTRICT row3 = image2.Row(ysize - 1); + for (size_t x = 1; x + 1 < xsize; x++) { + row0[x] = row1[x]; + row3[x] = row2[x]; + } + } else { + const auto* row0_in = image.Row(0); + const auto* row1_in = image.Row(ysize - 1); + auto* row0_out = image2.Row(0); + auto* row1_out = image2.Row(ysize - 1); + for (size_t x = 1; x + 1 < xsize; x++) { + // Image too narrow, take first derivative instead + row0_out[x] = row1_out[x] = fabsf(row0_in[x] - row1_in[x]); + } + } + if (xsize > 2) { + for (size_t y = 0; y < ysize; y++) { + auto* row = image2.Row(y); + row[0] = row[1]; + row[xsize - 1] = row[xsize - 2]; + } + } else { + for (size_t y = 0; y < ysize; y++) { + const auto* JXL_RESTRICT row_in = image.Row(y); + auto* row_out = image2.Row(y); + // Image too narrow, take first derivative instead + row_out[0] = row_out[xsize - 1] = fabsf(row_in[0] - row_in[xsize - 1]); + } + } + return image2; +} + +static Image3F Gradient2(const Image3F& image) { + return Image3F(Gradient2(image.Plane(0)), Gradient2(image.Plane(1)), + Gradient2(image.Plane(2))); +} + +/* +Tests if roundtrip with jxl on a gradient image doesn't cause banding. +Only tests if use_gradient is true. Set to false for debugging to see the +distance values. +Angle in degrees, colors can be given in hex as 0xRRGGBB. +*/ +void TestGradient(ThreadPool* pool, uint32_t color0, uint32_t color1, + size_t xsize, size_t ysize, float angle, bool fast_mode, + float butteraugli_distance, bool use_gradient = true) { + CompressParams cparams; + cparams.butteraugli_distance = butteraugli_distance; + if (fast_mode) { + cparams.speed_tier = SpeedTier::kSquirrel; + } + Image3F gradient = GenerateTestGradient(color0, color1, angle, xsize, ysize); + + CodecInOut io; + io.metadata.m.SetUintSamples(8); + io.metadata.m.color_encoding = ColorEncoding::SRGB(); + io.SetFromImage(std::move(gradient), io.metadata.m.color_encoding); + + CodecInOut io2; + + PaddedBytes compressed; + AuxOut* aux_out = nullptr; + PassesEncoderState enc_state; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), + aux_out, pool)); + EXPECT_TRUE( + test::DecodeFile({}, Span(compressed), &io2, pool)); + EXPECT_TRUE( + io2.Main().TransformTo(io2.metadata.m.color_encoding, GetJxlCms(), pool)); + + if (use_gradient) { + // Test that the gradient map worked. For that, we take a second derivative + // of the image with Gradient2 to measure how linear the change is in x and + // y direction. For a well handled gradient, we expect max values around + // 0.1, while if there is noticeable banding, which means the gradient map + // failed, the values are around 0.5-1.0 (regardless of + // butteraugli_distance). + Image3F gradient2 = Gradient2(*io2.Main().color()); + + std::array image_max; + Image3Max(gradient2, &image_max); + + // TODO(jyrki): These values used to work with 0.2, 0.2, 0.2. + EXPECT_LE(image_max[0], 3.15); + EXPECT_LE(image_max[1], 1.72); + EXPECT_LE(image_max[2], 5.05); + } +} + +static constexpr bool fast_mode = true; + +TEST(GradientTest, SteepGradient) { + test::ThreadPoolForTests pool(8); + // Relatively steep gradients, colors from the sky of stp.png + TestGradient(&pool, 0xd99d58, 0x889ab1, 512, 512, 90, fast_mode, 3.0); +} + +TEST(GradientTest, SubtleGradient) { + test::ThreadPoolForTests pool(8); + // Very subtle gradient + TestGradient(&pool, 0xb89b7b, 0xa89b8d, 512, 512, 90, fast_mode, 4.0); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/headers.cc b/third_party/jpeg-xl/lib/jxl/headers.cc new file mode 100644 index 0000000000..dc53726385 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/headers.cc @@ -0,0 +1,194 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/headers.h" + +#include "lib/jxl/common.h" +#include "lib/jxl/fields.h" + +namespace jxl { +namespace { + +struct Rational { + constexpr explicit Rational(uint32_t num, uint32_t den) + : num(num), den(den) {} + + // Returns floor(multiplicand * rational). + constexpr uint32_t MulTruncate(uint32_t multiplicand) const { + return uint64_t(multiplicand) * num / den; + } + + uint32_t num; + uint32_t den; +}; + +Rational FixedAspectRatios(uint32_t ratio) { + JXL_ASSERT(0 != ratio && ratio < 8); + // Other candidates: 5/4, 7/5, 14/9, 16/10, 5/3, 21/9, 12/5 + constexpr Rational kRatios[7] = {Rational(1, 1), // square + Rational(12, 10), // + Rational(4, 3), // camera + Rational(3, 2), // mobile camera + Rational(16, 9), // camera/display + Rational(5, 4), // + Rational(2, 1)}; // + return kRatios[ratio - 1]; +} + +uint32_t FindAspectRatio(uint32_t xsize, uint32_t ysize) { + for (uint32_t r = 1; r < 8; ++r) { + if (xsize == FixedAspectRatios(r).MulTruncate(ysize)) { + return r; + } + } + return 0; // Must send xsize instead +} + +} // namespace + +size_t SizeHeader::xsize() const { + if (ratio_ != 0) { + return FixedAspectRatios(ratio_).MulTruncate( + static_cast(ysize())); + } + return small_ ? ((xsize_div8_minus_1_ + 1) * 8) : xsize_; +} + +Status SizeHeader::Set(size_t xsize64, size_t ysize64) { + if (xsize64 > 0xFFFFFFFFull || ysize64 > 0xFFFFFFFFull) { + return JXL_FAILURE("Image too large"); + } + const uint32_t xsize32 = static_cast(xsize64); + const uint32_t ysize32 = static_cast(ysize64); + if (xsize64 == 0 || ysize64 == 0) return JXL_FAILURE("Empty image"); + ratio_ = FindAspectRatio(xsize32, ysize32); + small_ = ysize64 <= 256 && (ysize64 % kBlockDim) == 0 && + (ratio_ != 0 || (xsize64 <= 256 && (xsize64 % kBlockDim) == 0)); + if (small_) { + ysize_div8_minus_1_ = ysize32 / 8 - 1; + } else { + ysize_ = ysize32; + } + + if (ratio_ == 0) { + if (small_) { + xsize_div8_minus_1_ = xsize32 / 8 - 1; + } else { + xsize_ = xsize32; + } + } + JXL_ASSERT(xsize() == xsize64); + JXL_ASSERT(ysize() == ysize64); + return true; +} + +Status PreviewHeader::Set(size_t xsize64, size_t ysize64) { + const uint32_t xsize32 = static_cast(xsize64); + const uint32_t ysize32 = static_cast(ysize64); + if (xsize64 == 0 || ysize64 == 0) return JXL_FAILURE("Empty preview"); + div8_ = (xsize64 % kBlockDim) == 0 && (ysize64 % kBlockDim) == 0; + if (div8_) { + ysize_div8_ = ysize32 / 8; + } else { + ysize_ = ysize32; + } + + ratio_ = FindAspectRatio(xsize32, ysize32); + if (ratio_ == 0) { + if (div8_) { + xsize_div8_ = xsize32 / 8; + } else { + xsize_ = xsize32; + } + } + JXL_ASSERT(xsize() == xsize64); + JXL_ASSERT(ysize() == ysize64); + return true; +} + +size_t PreviewHeader::xsize() const { + if (ratio_ != 0) { + return FixedAspectRatios(ratio_).MulTruncate( + static_cast(ysize())); + } + return div8_ ? (xsize_div8_ * 8) : xsize_; +} + +SizeHeader::SizeHeader() { Bundle::Init(this); } +Status SizeHeader::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &small_)); + + if (visitor->Conditional(small_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, 0, &ysize_div8_minus_1_)); + } + if (visitor->Conditional(!small_)) { + // (Could still be small, but non-multiple of 8.) + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(9, 1), BitsOffset(13, 1), + BitsOffset(18, 1), BitsOffset(30, 1), + 1, &ysize_)); + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &ratio_)); + if (visitor->Conditional(ratio_ == 0 && small_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, 0, &xsize_div8_minus_1_)); + } + if (visitor->Conditional(ratio_ == 0 && !small_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(9, 1), BitsOffset(13, 1), + BitsOffset(18, 1), BitsOffset(30, 1), + 1, &xsize_)); + } + + return true; +} + +PreviewHeader::PreviewHeader() { Bundle::Init(this); } +Status PreviewHeader::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &div8_)); + + if (visitor->Conditional(div8_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), Val(32), BitsOffset(5, 1), + BitsOffset(9, 33), 1, &ysize_div8_)); + } + if (visitor->Conditional(!div8_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(6, 1), BitsOffset(8, 65), + BitsOffset(10, 321), + BitsOffset(12, 1345), 1, &ysize_)); + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &ratio_)); + if (visitor->Conditional(ratio_ == 0 && div8_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), Val(32), BitsOffset(5, 1), + BitsOffset(9, 33), 1, &xsize_div8_)); + } + if (visitor->Conditional(ratio_ == 0 && !div8_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(6, 1), BitsOffset(8, 65), + BitsOffset(10, 321), + BitsOffset(12, 1345), 1, &xsize_)); + } + + return true; +} + +AnimationHeader::AnimationHeader() { Bundle::Init(this); } +Status AnimationHeader::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(100), Val(1000), BitsOffset(10, 1), + BitsOffset(30, 1), 1, &tps_numerator)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(1), Val(1001), BitsOffset(8, 1), + BitsOffset(10, 1), 1, + &tps_denominator)); + + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Bits(3), Bits(16), Bits(32), 0, &num_loops)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_timecodes)); + return true; +} + +Status ReadSizeHeader(BitReader* JXL_RESTRICT reader, + SizeHeader* JXL_RESTRICT size) { + return Bundle::Read(reader, size); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/headers.h b/third_party/jpeg-xl/lib/jxl/headers.h new file mode 100644 index 0000000000..3cce84dabc --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/headers.h @@ -0,0 +1,97 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_HEADERS_H_ +#define LIB_JXL_HEADERS_H_ + +// Codestream headers. + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +// Reserved by ISO/IEC 10918-1. LF causes files opened in text mode to be +// rejected because the marker changes to 0x0D instead. The 0xFF prefix also +// ensures there were no 7-bit transmission limitations. +static constexpr uint8_t kCodestreamMarker = 0x0A; + +// Compact representation of image dimensions (best case: 9 bits) so decoders +// can preallocate early. +class SizeHeader : public Fields { + public: + SizeHeader(); + JXL_FIELDS_NAME(SizeHeader) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + Status Set(size_t xsize, size_t ysize); + + size_t xsize() const; + size_t ysize() const { + return small_ ? ((ysize_div8_minus_1_ + 1) * 8) : ysize_; + } + + private: + bool small_; // xsize and ysize <= 256 and divisible by 8. + + uint32_t ysize_div8_minus_1_; + uint32_t ysize_; + + uint32_t ratio_; + uint32_t xsize_div8_minus_1_; + uint32_t xsize_; +}; + +// (Similar to SizeHeader but different encoding because previews are smaller) +class PreviewHeader : public Fields { + public: + PreviewHeader(); + JXL_FIELDS_NAME(PreviewHeader) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + Status Set(size_t xsize, size_t ysize); + + size_t xsize() const; + size_t ysize() const { return div8_ ? (ysize_div8_ * 8) : ysize_; } + + private: + bool div8_; // xsize and ysize divisible by 8. + + uint32_t ysize_div8_; + uint32_t ysize_; + + uint32_t ratio_; + uint32_t xsize_div8_; + uint32_t xsize_; +}; + +struct AnimationHeader : public Fields { + AnimationHeader(); + JXL_FIELDS_NAME(AnimationHeader) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Ticks per second (expressed as rational number to support NTSC) + uint32_t tps_numerator; + uint32_t tps_denominator; + + uint32_t num_loops; // 0 means to repeat infinitely. + + bool have_timecodes; +}; + +Status ReadSizeHeader(BitReader* JXL_RESTRICT reader, + SizeHeader* JXL_RESTRICT size); + +} // namespace jxl + +#endif // LIB_JXL_HEADERS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/huffman_table.cc b/third_party/jpeg-xl/lib/jxl/huffman_table.cc new file mode 100644 index 0000000000..9ae7865af6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/huffman_table.cc @@ -0,0 +1,161 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/huffman_table.h" + +#include /* for memcpy */ +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/dec_huffman.h" + +namespace jxl { + +/* Returns reverse(reverse(key, len) + 1, len), where reverse(key, len) is the + bit-wise reversal of the len least significant bits of key. */ +static inline int GetNextKey(int key, int len) { + int step = 1u << (len - 1); + while (key & step) { + step >>= 1; + } + return (key & (step - 1)) + step; +} + +/* Stores code in table[0], table[step], table[2*step], ..., table[end] */ +/* Assumes that end is an integer multiple of step */ +static inline void ReplicateValue(HuffmanCode* table, int step, int end, + HuffmanCode code) { + do { + end -= step; + table[end] = code; + } while (end > 0); +} + +/* Returns the table width of the next 2nd level table. count is the histogram + of bit lengths for the remaining symbols, len is the code length of the next + processed symbol */ +static inline size_t NextTableBitSize(const uint16_t* const count, size_t len, + int root_bits) { + size_t left = 1u << (len - root_bits); + while (len < PREFIX_MAX_BITS) { + if (left <= count[len]) break; + left -= count[len]; + ++len; + left <<= 1; + } + return len - root_bits; +} + +uint32_t BuildHuffmanTable(HuffmanCode* root_table, int root_bits, + const uint8_t* const code_lengths, + size_t code_lengths_size, uint16_t* count) { + HuffmanCode code; /* current table entry */ + HuffmanCode* table; /* next available space in table */ + size_t len; /* current code length */ + size_t symbol; /* symbol index in original or sorted table */ + int key; /* reversed prefix code */ + int step; /* step size to replicate values in current table */ + int low; /* low bits for current root entry */ + int mask; /* mask for low bits */ + size_t table_bits; /* key length of current table */ + int table_size; /* size of current table */ + int total_size; /* sum of root table size and 2nd level table sizes */ + /* offsets in sorted table for each length */ + uint16_t offset[PREFIX_MAX_BITS + 1]; + size_t max_length = 1; + + if (code_lengths_size > 1u << PREFIX_MAX_BITS) return 0; + + /* symbols sorted by code length */ + std::vector sorted_storage(code_lengths_size); + uint16_t* sorted = sorted_storage.data(); + + /* generate offsets into sorted symbol table by code length */ + { + uint16_t sum = 0; + for (len = 1; len <= PREFIX_MAX_BITS; len++) { + offset[len] = sum; + if (count[len]) { + sum = static_cast(sum + count[len]); + max_length = len; + } + } + } + + /* sort symbols by length, by symbol order within each length */ + for (symbol = 0; symbol < code_lengths_size; symbol++) { + if (code_lengths[symbol] != 0) { + sorted[offset[code_lengths[symbol]]++] = symbol; + } + } + + table = root_table; + table_bits = root_bits; + table_size = 1u << table_bits; + total_size = table_size; + + /* special case code with only one value */ + if (offset[PREFIX_MAX_BITS] == 1) { + code.bits = 0; + code.value = static_cast(sorted[0]); + for (key = 0; key < total_size; ++key) { + table[key] = code; + } + return total_size; + } + + /* fill in root table */ + /* let's reduce the table size to a smaller size if possible, and */ + /* create the repetitions by memcpy if possible in the coming loop */ + if (table_bits > max_length) { + table_bits = max_length; + table_size = 1u << table_bits; + } + key = 0; + symbol = 0; + code.bits = 1; + step = 2; + do { + for (; count[code.bits] != 0; --count[code.bits]) { + code.value = static_cast(sorted[symbol++]); + ReplicateValue(&table[key], step, table_size, code); + key = GetNextKey(key, code.bits); + } + step <<= 1; + } while (++code.bits <= table_bits); + + /* if root_bits != table_bits we only created one fraction of the */ + /* table, and we need to replicate it now. */ + while (total_size != table_size) { + memcpy(&table[table_size], &table[0], table_size * sizeof(table[0])); + table_size <<= 1; + } + + /* fill in 2nd level tables and add pointers to root table */ + mask = total_size - 1; + low = -1; + for (len = root_bits + 1, step = 2; len <= max_length; ++len, step <<= 1) { + for (; count[len] != 0; --count[len]) { + if ((key & mask) != low) { + table += table_size; + table_bits = NextTableBitSize(count, len, root_bits); + table_size = 1u << table_bits; + total_size += table_size; + low = key & mask; + root_table[low].bits = static_cast(table_bits + root_bits); + root_table[low].value = + static_cast((table - root_table) - low); + } + code.bits = static_cast(len - root_bits); + code.value = static_cast(sorted[symbol++]); + ReplicateValue(&table[key >> root_bits], step, table_size, code); + key = GetNextKey(key, len); + } + } + + return total_size; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/huffman_table.h b/third_party/jpeg-xl/lib/jxl/huffman_table.h new file mode 100644 index 0000000000..11cdb2fc45 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/huffman_table.h @@ -0,0 +1,28 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_HUFFMAN_TABLE_H_ +#define LIB_JXL_HUFFMAN_TABLE_H_ + +#include +#include + +namespace jxl { + +struct HuffmanCode { + uint8_t bits; /* number of bits used for this symbol */ + uint16_t value; /* symbol value or table offset */ +}; + +/* Builds Huffman lookup table assuming code lengths are in symbol order. */ +/* Returns 0 in case of error (invalid tree or memory error), otherwise + populated size of table. */ +uint32_t BuildHuffmanTable(HuffmanCode* root_table, int root_bits, + const uint8_t* code_lengths, + size_t code_lengths_size, uint16_t* count); + +} // namespace jxl + +#endif // LIB_JXL_HUFFMAN_TABLE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/iaca_test.cc b/third_party/jpeg-xl/lib/jxl/iaca_test.cc new file mode 100644 index 0000000000..e25d9316d5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/iaca_test.cc @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/iaca.h" + +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +TEST(IacaTest, MarkersDefaultToDisabledAndDoNotCrash) { + BeginIACA(); + EndIACA(); +} + +TEST(IacaTest, ScopeDefaultToDisabledAndDoNotCrash) { ScopeIACA iaca; } + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec.cc b/third_party/jpeg-xl/lib/jxl/icc_codec.cc new file mode 100644 index 0000000000..f367461c0f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/icc_codec.cc @@ -0,0 +1,389 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/icc_codec.h" + +#include + +#include +#include +#include + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/icc_codec_common.h" + +namespace jxl { +namespace { + +// Shuffles or interleaves bytes, for example with width 2, turns "ABCDabcd" +// into "AaBbCcDc". Transposes a matrix of ceil(size / width) columns and +// width rows. There are size elements, size may be < width * height, if so the +// last elements of the rightmost column are missing, the missing spots are +// transposed along with the filled spots, and the result has the missing +// elements at the end of the bottom row. The input is the input matrix in +// scanline order but with missing elements skipped (which may occur in multiple +// locations), the output is the result matrix in scanline order (with +// no need to skip missing elements as they are past the end of the data). +void Shuffle(uint8_t* data, size_t size, size_t width) { + size_t height = (size + width - 1) / width; // amount of rows of output + PaddedBytes result(size); + // i = output index, j input index + size_t s = 0, j = 0; + for (size_t i = 0; i < size; i++) { + result[i] = data[j]; + j += height; + if (j >= size) j = ++s; + } + + for (size_t i = 0; i < size; i++) { + data[i] = result[i]; + } +} + +// TODO(eustas): should be 20, or even 18, once DecodeVarInt is improved; +// currently DecodeVarInt does not signal the errors, and marks +// 11 bytes as used even if only 10 are used (and 9 is enough for +// 63-bit values). +constexpr const size_t kPreambleSize = 22; // enough for reading 2 VarInts + +} // namespace + +// Mimics the beginning of UnpredictICC for quick validity check. +// At least kPreambleSize bytes of data should be valid at invocation time. +Status CheckPreamble(const PaddedBytes& data, size_t enc_size, + size_t output_limit) { + const uint8_t* enc = data.data(); + size_t size = data.size(); + size_t pos = 0; + uint64_t osize = DecodeVarInt(enc, size, &pos); + JXL_RETURN_IF_ERROR(CheckIs32Bit(osize)); + if (pos >= size) return JXL_FAILURE("Out of bounds"); + uint64_t csize = DecodeVarInt(enc, size, &pos); + JXL_RETURN_IF_ERROR(CheckIs32Bit(csize)); + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, csize, size)); + // We expect that UnpredictICC inflates input, not the other way round. + if (osize + 65536 < enc_size) return JXL_FAILURE("Malformed ICC"); + if (output_limit && osize > output_limit) { + return JXL_FAILURE("Decoded ICC is too large"); + } + return true; +} + +// Decodes the result of PredictICC back to a valid ICC profile. +Status UnpredictICC(const uint8_t* enc, size_t size, PaddedBytes* result) { + if (!result->empty()) return JXL_FAILURE("result must be empty initially"); + size_t pos = 0; + // TODO(lode): technically speaking we need to check that the entire varint + // decoding never goes out of bounds, not just the first byte. This requires + // a DecodeVarInt function that returns an error code. It is safe to use + // DecodeVarInt with out of bounds values, it silently returns, but the + // specification requires an error. Idem for all DecodeVarInt below. + if (pos >= size) return JXL_FAILURE("Out of bounds"); + uint64_t osize = DecodeVarInt(enc, size, &pos); // Output size + JXL_RETURN_IF_ERROR(CheckIs32Bit(osize)); + if (pos >= size) return JXL_FAILURE("Out of bounds"); + uint64_t csize = DecodeVarInt(enc, size, &pos); // Commands size + // Every command is translated to at least on byte. + JXL_RETURN_IF_ERROR(CheckIs32Bit(csize)); + size_t cpos = pos; // pos in commands stream + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, csize, size)); + size_t commands_end = cpos + csize; + pos = commands_end; // pos in data stream + + // Header + PaddedBytes header = ICCInitialHeaderPrediction(); + EncodeUint32(0, osize, &header); + for (size_t i = 0; i <= kICCHeaderSize; i++) { + if (result->size() == osize) { + if (cpos != commands_end) return JXL_FAILURE("Not all commands used"); + if (pos != size) return JXL_FAILURE("Not all data used"); + return true; // Valid end + } + if (i == kICCHeaderSize) break; // Done + ICCPredictHeader(result->data(), result->size(), header.data(), i); + if (pos >= size) return JXL_FAILURE("Out of bounds"); + result->push_back(enc[pos++] + header[i]); + } + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + + // Tag list + uint64_t numtags = DecodeVarInt(enc, size, &cpos); + + if (numtags != 0) { + numtags--; + JXL_RETURN_IF_ERROR(CheckIs32Bit(numtags)); + AppendUint32(numtags, result); + uint64_t prevtagstart = kICCHeaderSize + numtags * 12; + uint64_t prevtagsize = 0; + for (;;) { + if (result->size() > osize) return JXL_FAILURE("Invalid result size"); + if (cpos > commands_end) return JXL_FAILURE("Out of bounds"); + if (cpos == commands_end) break; // Valid end + uint8_t command = enc[cpos++]; + uint8_t tagcode = command & 63; + Tag tag; + if (tagcode == 0) { + break; + } else if (tagcode == kCommandTagUnknown) { + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, 4, size)); + tag = DecodeKeyword(enc, size, pos); + pos += 4; + } else if (tagcode == kCommandTagTRC) { + tag = kRtrcTag; + } else if (tagcode == kCommandTagXYZ) { + tag = kRxyzTag; + } else { + if (tagcode - kCommandTagStringFirst >= kNumTagStrings) { + return JXL_FAILURE("Unknown tagcode"); + } + tag = *kTagStrings[tagcode - kCommandTagStringFirst]; + } + AppendKeyword(tag, result); + + uint64_t tagstart; + uint64_t tagsize = prevtagsize; + if (tag == kRxyzTag || tag == kGxyzTag || tag == kBxyzTag || + tag == kKxyzTag || tag == kWtptTag || tag == kBkptTag || + tag == kLumiTag) { + tagsize = 20; + } + + if (command & kFlagBitOffset) { + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + tagstart = DecodeVarInt(enc, size, &cpos); + } else { + JXL_RETURN_IF_ERROR(CheckIs32Bit(prevtagstart)); + tagstart = prevtagstart + prevtagsize; + } + JXL_RETURN_IF_ERROR(CheckIs32Bit(tagstart)); + AppendUint32(tagstart, result); + if (command & kFlagBitSize) { + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + tagsize = DecodeVarInt(enc, size, &cpos); + } + JXL_RETURN_IF_ERROR(CheckIs32Bit(tagsize)); + AppendUint32(tagsize, result); + prevtagstart = tagstart; + prevtagsize = tagsize; + + if (tagcode == kCommandTagTRC) { + AppendKeyword(kGtrcTag, result); + AppendUint32(tagstart, result); + AppendUint32(tagsize, result); + AppendKeyword(kBtrcTag, result); + AppendUint32(tagstart, result); + AppendUint32(tagsize, result); + } + + if (tagcode == kCommandTagXYZ) { + JXL_RETURN_IF_ERROR(CheckIs32Bit(tagstart + tagsize * 2)); + AppendKeyword(kGxyzTag, result); + AppendUint32(tagstart + tagsize, result); + AppendUint32(tagsize, result); + AppendKeyword(kBxyzTag, result); + AppendUint32(tagstart + tagsize * 2, result); + AppendUint32(tagsize, result); + } + } + } + + // Main Content + for (;;) { + if (result->size() > osize) return JXL_FAILURE("Invalid result size"); + if (cpos > commands_end) return JXL_FAILURE("Out of bounds"); + if (cpos == commands_end) break; // Valid end + uint8_t command = enc[cpos++]; + if (command == kCommandInsert) { + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + uint64_t num = DecodeVarInt(enc, size, &cpos); + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size)); + for (size_t i = 0; i < num; i++) { + result->push_back(enc[pos++]); + } + } else if (command == kCommandShuffle2 || command == kCommandShuffle4) { + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + uint64_t num = DecodeVarInt(enc, size, &cpos); + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size)); + PaddedBytes shuffled(num); + for (size_t i = 0; i < num; i++) { + shuffled[i] = enc[pos + i]; + } + if (command == kCommandShuffle2) { + Shuffle(shuffled.data(), num, 2); + } else if (command == kCommandShuffle4) { + Shuffle(shuffled.data(), num, 4); + } + for (size_t i = 0; i < num; i++) { + result->push_back(shuffled[i]); + pos++; + } + } else if (command == kCommandPredict) { + JXL_RETURN_IF_ERROR(CheckOutOfBounds(cpos, 2, commands_end)); + uint8_t flags = enc[cpos++]; + + size_t width = (flags & 3) + 1; + if (width == 3) return JXL_FAILURE("Invalid width"); + + int order = (flags & 12) >> 2; + if (order == 3) return JXL_FAILURE("Invalid order"); + + uint64_t stride = width; + if (flags & 16) { + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + stride = DecodeVarInt(enc, size, &cpos); + if (stride < width) { + return JXL_FAILURE("Invalid stride"); + } + } + // If stride * 4 >= result->size(), return failure. The check + // "size == 0 || ((size - 1) >> 2) < stride" corresponds to + // "stride * 4 >= size", but does not suffer from integer overflow. + // This check is more strict than necessary but follows the specification + // and the encoder should ensure this is followed. + if (result->empty() || ((result->size() - 1u) >> 2u) < stride) { + return JXL_FAILURE("Invalid stride"); + } + + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + uint64_t num = DecodeVarInt(enc, size, &cpos); // in bytes + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size)); + + PaddedBytes shuffled(num); + for (size_t i = 0; i < num; i++) { + shuffled[i] = enc[pos + i]; + } + if (width > 1) Shuffle(shuffled.data(), num, width); + + size_t start = result->size(); + for (size_t i = 0; i < num; i++) { + uint8_t predicted = LinearPredictICCValue(result->data(), start, i, + stride, width, order); + result->push_back(predicted + shuffled[i]); + } + pos += num; + } else if (command == kCommandXYZ) { + AppendKeyword(kXyz_Tag, result); + for (int i = 0; i < 4; i++) result->push_back(0); + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, 12, size)); + for (size_t i = 0; i < 12; i++) { + result->push_back(enc[pos++]); + } + } else if (command >= kCommandTypeStartFirst && + command < kCommandTypeStartFirst + kNumTypeStrings) { + AppendKeyword(*kTypeStrings[command - kCommandTypeStartFirst], result); + for (size_t i = 0; i < 4; i++) { + result->push_back(0); + } + } else { + return JXL_FAILURE("Unknown command"); + } + } + + if (pos != size) return JXL_FAILURE("Not all data used"); + if (result->size() != osize) return JXL_FAILURE("Invalid result size"); + + return true; +} + +Status ICCReader::Init(BitReader* reader, size_t output_limit) { + JXL_RETURN_IF_ERROR(CheckEOI(reader)); + used_bits_base_ = reader->TotalBitsConsumed(); + if (bits_to_skip_ == 0) { + enc_size_ = U64Coder::Read(reader); + if (enc_size_ > 268435456) { + // Avoid too large memory allocation for invalid file. + return JXL_FAILURE("Too large encoded profile"); + } + JXL_RETURN_IF_ERROR( + DecodeHistograms(reader, kNumICCContexts, &code_, &context_map_)); + ans_reader_ = ANSSymbolReader(&code_, reader); + i_ = 0; + decompressed_.resize(std::min(i_ + 0x400, enc_size_)); + for (; i_ < std::min(2, enc_size_); i_++) { + decompressed_[i_] = ans_reader_.ReadHybridUint( + ICCANSContext(i_, i_ > 0 ? decompressed_[i_ - 1] : 0, + i_ > 1 ? decompressed_[i_ - 2] : 0), + reader, context_map_); + } + if (enc_size_ > kPreambleSize) { + for (; i_ < kPreambleSize; i_++) { + decompressed_[i_] = ans_reader_.ReadHybridUint( + ICCANSContext(i_, decompressed_[i_ - 1], decompressed_[i_ - 2]), + reader, context_map_); + } + JXL_RETURN_IF_ERROR(CheckEOI(reader)); + JXL_RETURN_IF_ERROR( + CheckPreamble(decompressed_, enc_size_, output_limit)); + } + bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_; + } else { + reader->SkipBits(bits_to_skip_); + } + return true; +} + +Status ICCReader::Process(BitReader* reader, PaddedBytes* icc) { + ANSSymbolReader::Checkpoint checkpoint; + size_t saved_i = 0; + auto save = [&]() { + ans_reader_.Save(&checkpoint); + bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_; + saved_i = i_; + }; + save(); + auto check_and_restore = [&]() { + Status status = CheckEOI(reader); + if (!status) { + // not enough bytes. + ans_reader_.Restore(checkpoint); + i_ = saved_i; + return status; + } + return Status(true); + }; + for (; i_ < enc_size_; i_++) { + if (i_ % ANSSymbolReader::kMaxCheckpointInterval == 0 && i_ > 0) { + JXL_RETURN_IF_ERROR(check_and_restore()); + save(); + if ((i_ > 0) && (((i_ & 0xFFFF) == 0))) { + float used_bytes = + (reader->TotalBitsConsumed() - used_bits_base_) / 8.0f; + if (i_ > used_bytes * 256) return JXL_FAILURE("Corrupted stream"); + } + decompressed_.resize(std::min(i_ + 0x400, enc_size_)); + } + JXL_DASSERT(i_ >= 2); + decompressed_[i_] = ans_reader_.ReadHybridUint( + ICCANSContext(i_, decompressed_[i_ - 1], decompressed_[i_ - 2]), reader, + context_map_); + } + JXL_RETURN_IF_ERROR(check_and_restore()); + bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_; + if (!ans_reader_.CheckANSFinalState()) { + return JXL_FAILURE("Corrupted ICC profile"); + } + + icc->clear(); + return UnpredictICC(decompressed_.data(), decompressed_.size(), icc); +} + +Status ICCReader::CheckEOI(BitReader* reader) { + if (reader->AllReadsWithinBounds()) return true; + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for reading ICC profile"); +} + +Status ReadICC(BitReader* JXL_RESTRICT reader, PaddedBytes* JXL_RESTRICT icc, + size_t output_limit) { + ICCReader icc_reader; + JXL_RETURN_IF_ERROR(icc_reader.Init(reader, output_limit)); + JXL_RETURN_IF_ERROR(icc_reader.Process(reader, icc)); + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec.h b/third_party/jpeg-xl/lib/jxl/icc_codec.h new file mode 100644 index 0000000000..a6c7477c60 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/icc_codec.h @@ -0,0 +1,57 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ICC_CODEC_H_ +#define LIB_JXL_ICC_CODEC_H_ + +// Compressed representation of ICC profiles. + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" + +namespace jxl { + +struct ICCReader { + Status Init(BitReader* reader, size_t output_limit); + Status Process(BitReader* reader, PaddedBytes* icc); + void Reset() { + bits_to_skip_ = 0; + decompressed_.clear(); + } + + private: + Status CheckEOI(BitReader* reader); + size_t i_ = 0; + size_t bits_to_skip_ = 0; + size_t used_bits_base_ = 0; + uint64_t enc_size_ = 0; + std::vector context_map_; + ANSCode code_; + ANSSymbolReader ans_reader_; + PaddedBytes decompressed_; +}; + +// `icc` may be empty afterwards - if so, call CreateProfile. Does not append, +// clears any original data that was in icc. +// If `output_limit` is not 0, then returns error if resulting profile would be +// longer than `output_limit` +Status ReadICC(BitReader* JXL_RESTRICT reader, PaddedBytes* JXL_RESTRICT icc, + size_t output_limit = 0); + +// Exposed only for testing +Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result); + +// Exposed only for testing +Status UnpredictICC(const uint8_t* enc, size_t size, PaddedBytes* result); + +} // namespace jxl + +#endif // LIB_JXL_ICC_CODEC_H_ diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc b/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc new file mode 100644 index 0000000000..212387e78f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc @@ -0,0 +1,190 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/icc_codec_common.h" + +#include + +#include +#include +#include + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/fields.h" + +namespace jxl { +namespace { +static uint8_t ByteKind1(uint8_t b) { + if ('a' <= b && b <= 'z') return 0; + if ('A' <= b && b <= 'Z') return 0; + if ('0' <= b && b <= '9') return 1; + if (b == '.' || b == ',') return 1; + if (b == 0) return 2; + if (b == 1) return 3; + if (b < 16) return 4; + if (b == 255) return 6; + if (b > 240) return 5; + return 7; +} + +static uint8_t ByteKind2(uint8_t b) { + if ('a' <= b && b <= 'z') return 0; + if ('A' <= b && b <= 'Z') return 0; + if ('0' <= b && b <= '9') return 1; + if (b == '.' || b == ',') return 1; + if (b < 16) return 2; + if (b > 240) return 3; + return 4; +} + +template +T PredictValue(T p1, T p2, T p3, int order) { + if (order == 0) return p1; + if (order == 1) return 2 * p1 - p2; + if (order == 2) return 3 * p1 - 3 * p2 + p3; + return 0; +} +} // namespace + +uint32_t DecodeUint32(const uint8_t* data, size_t size, size_t pos) { + return pos + 4 > size ? 0 : LoadBE32(data + pos); +} + +void EncodeUint32(size_t pos, uint32_t value, PaddedBytes* data) { + if (pos + 4 > data->size()) return; + StoreBE32(value, data->data() + pos); +} + +void AppendUint32(uint32_t value, PaddedBytes* data) { + data->resize(data->size() + 4); + EncodeUint32(data->size() - 4, value, data); +} + +typedef std::array Tag; + +Tag DecodeKeyword(const uint8_t* data, size_t size, size_t pos) { + if (pos + 4 > size) return {{' ', ' ', ' ', ' '}}; + return {{data[pos], data[pos + 1], data[pos + 2], data[pos + 3]}}; +} + +void EncodeKeyword(const Tag& keyword, uint8_t* data, size_t size, size_t pos) { + if (keyword.size() != 4 || pos + 3 >= size) return; + for (size_t i = 0; i < 4; ++i) data[pos + i] = keyword[i]; +} + +void AppendKeyword(const Tag& keyword, PaddedBytes* data) { + JXL_ASSERT(keyword.size() == 4); + data->append(keyword); +} + +// Checks if a + b > size, taking possible integer overflow into account. +Status CheckOutOfBounds(size_t a, size_t b, size_t size) { + size_t pos = a + b; + if (pos > size) return JXL_FAILURE("Out of bounds"); + if (pos < a) return JXL_FAILURE("Out of bounds"); // overflow happened + return true; +} + +Status CheckIs32Bit(uint64_t v) { + static constexpr const uint64_t kUpper32 = ~static_cast(0xFFFFFFFF); + if ((v & kUpper32) != 0) return JXL_FAILURE("32-bit value expected"); + return true; +} + +PaddedBytes ICCInitialHeaderPrediction() { + PaddedBytes result(kICCHeaderSize); + for (size_t i = 0; i < kICCHeaderSize; i++) { + result[i] = 0; + } + result[8] = 4; + EncodeKeyword(kMntrTag, result.data(), result.size(), 12); + EncodeKeyword(kRgb_Tag, result.data(), result.size(), 16); + EncodeKeyword(kXyz_Tag, result.data(), result.size(), 20); + EncodeKeyword(kAcspTag, result.data(), result.size(), 36); + result[68] = 0; + result[69] = 0; + result[70] = 246; + result[71] = 214; + result[72] = 0; + result[73] = 1; + result[74] = 0; + result[75] = 0; + result[76] = 0; + result[77] = 0; + result[78] = 211; + result[79] = 45; + return result; +} + +void ICCPredictHeader(const uint8_t* icc, size_t size, uint8_t* header, + size_t pos) { + if (pos == 8 && size >= 8) { + header[80] = icc[4]; + header[81] = icc[5]; + header[82] = icc[6]; + header[83] = icc[7]; + } + if (pos == 41 && size >= 41) { + if (icc[40] == 'A') { + header[41] = 'P'; + header[42] = 'P'; + header[43] = 'L'; + } + if (icc[40] == 'M') { + header[41] = 'S'; + header[42] = 'F'; + header[43] = 'T'; + } + } + if (pos == 42 && size >= 42) { + if (icc[40] == 'S' && icc[41] == 'G') { + header[42] = 'I'; + header[43] = ' '; + } + if (icc[40] == 'S' && icc[41] == 'U') { + header[42] = 'N'; + header[43] = 'W'; + } + } +} + +// Predicts a value with linear prediction of given order (0-2), for integers +// with width bytes and given stride in bytes between values. +// The start position is at start + i, and the relevant modulus of i describes +// which byte of the multi-byte integer is being handled. +// The value start + i must be at least stride * 4. +uint8_t LinearPredictICCValue(const uint8_t* data, size_t start, size_t i, + size_t stride, size_t width, int order) { + size_t pos = start + i; + if (width == 1) { + uint8_t p1 = data[pos - stride]; + uint8_t p2 = data[pos - stride * 2]; + uint8_t p3 = data[pos - stride * 3]; + return PredictValue(p1, p2, p3, order); + } else if (width == 2) { + size_t p = start + (i & ~1); + uint16_t p1 = (data[p - stride * 1] << 8) + data[p - stride * 1 + 1]; + uint16_t p2 = (data[p - stride * 2] << 8) + data[p - stride * 2 + 1]; + uint16_t p3 = (data[p - stride * 3] << 8) + data[p - stride * 3 + 1]; + uint16_t pred = PredictValue(p1, p2, p3, order); + return (i & 1) ? (pred & 255) : ((pred >> 8) & 255); + } else { + size_t p = start + (i & ~3); + uint32_t p1 = DecodeUint32(data, pos, p - stride); + uint32_t p2 = DecodeUint32(data, pos, p - stride * 2); + uint32_t p3 = DecodeUint32(data, pos, p - stride * 3); + uint32_t pred = PredictValue(p1, p2, p3, order); + unsigned shiftbytes = 3 - (i & 3); + return (pred >> (shiftbytes * 8)) & 255; + } +} + +size_t ICCANSContext(size_t i, size_t b1, size_t b2) { + if (i <= 128) return 0; + return 1 + ByteKind1(b1) + ByteKind2(b2) * 8; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec_common.h b/third_party/jpeg-xl/lib/jxl/icc_codec_common.h new file mode 100644 index 0000000000..e91e908669 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/icc_codec_common.h @@ -0,0 +1,106 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ICC_CODEC_COMMON_H_ +#define LIB_JXL_ICC_CODEC_COMMON_H_ + +// Compressed representation of ICC profiles. + +#include +#include + +#include + +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +static constexpr size_t kICCHeaderSize = 128; + +typedef std::array Tag; + +static const Tag kAcspTag = {{'a', 'c', 's', 'p'}}; +static const Tag kBkptTag = {{'b', 'k', 'p', 't'}}; +static const Tag kBtrcTag = {{'b', 'T', 'R', 'C'}}; +static const Tag kBxyzTag = {{'b', 'X', 'Y', 'Z'}}; +static const Tag kChadTag = {{'c', 'h', 'a', 'd'}}; +static const Tag kChrmTag = {{'c', 'h', 'r', 'm'}}; +static const Tag kCprtTag = {{'c', 'p', 'r', 't'}}; +static const Tag kCurvTag = {{'c', 'u', 'r', 'v'}}; +static const Tag kDescTag = {{'d', 'e', 's', 'c'}}; +static const Tag kDmddTag = {{'d', 'm', 'd', 'd'}}; +static const Tag kDmndTag = {{'d', 'm', 'n', 'd'}}; +static const Tag kGbd_Tag = {{'g', 'b', 'd', ' '}}; +static const Tag kGtrcTag = {{'g', 'T', 'R', 'C'}}; +static const Tag kGxyzTag = {{'g', 'X', 'Y', 'Z'}}; +static const Tag kKtrcTag = {{'k', 'T', 'R', 'C'}}; +static const Tag kKxyzTag = {{'k', 'X', 'Y', 'Z'}}; +static const Tag kLumiTag = {{'l', 'u', 'm', 'i'}}; +static const Tag kMab_Tag = {{'m', 'A', 'B', ' '}}; +static const Tag kMba_Tag = {{'m', 'B', 'A', ' '}}; +static const Tag kMlucTag = {{'m', 'l', 'u', 'c'}}; +static const Tag kMntrTag = {{'m', 'n', 't', 'r'}}; +static const Tag kParaTag = {{'p', 'a', 'r', 'a'}}; +static const Tag kRgb_Tag = {{'R', 'G', 'B', ' '}}; +static const Tag kRtrcTag = {{'r', 'T', 'R', 'C'}}; +static const Tag kRxyzTag = {{'r', 'X', 'Y', 'Z'}}; +static const Tag kSf32Tag = {{'s', 'f', '3', '2'}}; +static const Tag kTextTag = {{'t', 'e', 'x', 't'}}; +static const Tag kVcgtTag = {{'v', 'c', 'g', 't'}}; +static const Tag kWtptTag = {{'w', 't', 'p', 't'}}; +static const Tag kXyz_Tag = {{'X', 'Y', 'Z', ' '}}; + +// Tag names focused on RGB and GRAY monitor profiles +static constexpr size_t kNumTagStrings = 17; +static constexpr const Tag* kTagStrings[kNumTagStrings] = { + &kCprtTag, &kWtptTag, &kBkptTag, &kRxyzTag, &kGxyzTag, &kBxyzTag, + &kKxyzTag, &kRtrcTag, &kGtrcTag, &kBtrcTag, &kKtrcTag, &kChadTag, + &kDescTag, &kChrmTag, &kDmndTag, &kDmddTag, &kLumiTag}; + +static constexpr size_t kCommandTagUnknown = 1; +static constexpr size_t kCommandTagTRC = 2; +static constexpr size_t kCommandTagXYZ = 3; +static constexpr size_t kCommandTagStringFirst = 4; + +// Tag types focused on RGB and GRAY monitor profiles +static constexpr size_t kNumTypeStrings = 8; +static constexpr const Tag* kTypeStrings[kNumTypeStrings] = { + &kXyz_Tag, &kDescTag, &kTextTag, &kMlucTag, + &kParaTag, &kCurvTag, &kSf32Tag, &kGbd_Tag}; + +static constexpr size_t kCommandInsert = 1; +static constexpr size_t kCommandShuffle2 = 2; +static constexpr size_t kCommandShuffle4 = 3; +static constexpr size_t kCommandPredict = 4; +static constexpr size_t kCommandXYZ = 10; +static constexpr size_t kCommandTypeStartFirst = 16; + +static constexpr size_t kFlagBitOffset = 64; +static constexpr size_t kFlagBitSize = 128; + +static constexpr size_t kNumICCContexts = 41; + +uint32_t DecodeUint32(const uint8_t* data, size_t size, size_t pos); +void EncodeUint32(size_t pos, uint32_t value, PaddedBytes* data); +void AppendUint32(uint32_t value, PaddedBytes* data); +Tag DecodeKeyword(const uint8_t* data, size_t size, size_t pos); +void EncodeKeyword(const Tag& keyword, uint8_t* data, size_t size, size_t pos); +void AppendKeyword(const Tag& keyword, PaddedBytes* data); + +// Checks if a + b > size, taking possible integer overflow into account. +Status CheckOutOfBounds(size_t a, size_t b, size_t size); +Status CheckIs32Bit(uint64_t v); + +PaddedBytes ICCInitialHeaderPrediction(); +void ICCPredictHeader(const uint8_t* icc, size_t size, uint8_t* header, + size_t pos); +uint8_t LinearPredictICCValue(const uint8_t* data, size_t start, size_t i, + size_t stride, size_t width, int order); +size_t ICCANSContext(size_t i, size_t b1, size_t b2); + +} // namespace jxl + +#endif // LIB_JXL_ICC_CODEC_COMMON_H_ diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec_test.cc b/third_party/jpeg-xl/lib/jxl/icc_codec_test.cc new file mode 100644 index 0000000000..af02094e99 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/icc_codec_test.cc @@ -0,0 +1,207 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/icc_codec.h" + +#include + +#include "lib/jxl/base/span.h" +#include "lib/jxl/enc_icc_codec.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +void TestProfile(const PaddedBytes& icc) { + BitWriter writer; + ASSERT_TRUE(WriteICC(icc, &writer, 0, nullptr)); + writer.ZeroPadToByte(); + PaddedBytes dec; + BitReader reader(writer.GetSpan()); + ASSERT_TRUE(ReadICC(&reader, &dec)); + ASSERT_TRUE(reader.Close()); + EXPECT_EQ(icc.size(), dec.size()); + if (icc.size() == dec.size()) { + for (size_t i = 0; i < icc.size(); i++) { + EXPECT_EQ(icc[i], dec[i]); + if (icc[i] != dec[i]) break; // One output is enough + } + } +} + +void TestProfile(const std::string& icc) { + PaddedBytes bytes(icc.size()); + for (size_t i = 0; i < icc.size(); i++) { + bytes[i] = icc[i]; + } + TestProfile(bytes); +} + +// Valid profile from one of the images output by the decoder. +static const unsigned char kTestProfile[] = { + 0x00, 0x00, 0x03, 0x80, 0x6c, 0x63, 0x6d, 0x73, 0x04, 0x30, 0x00, 0x00, + 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20, + 0x07, 0xe3, 0x00, 0x04, 0x00, 0x1d, 0x00, 0x0f, 0x00, 0x32, 0x00, 0x2e, + 0x61, 0x63, 0x73, 0x70, 0x41, 0x50, 0x50, 0x4c, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xf6, 0xd6, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x6c, 0x63, 0x6d, 0x73, + 0x5f, 0x07, 0x0d, 0x3e, 0x4d, 0x32, 0xf2, 0x6e, 0x5d, 0x77, 0x26, 0xcc, + 0x23, 0xb0, 0x6a, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x01, 0x20, 0x00, 0x00, 0x00, 0x42, + 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x64, 0x00, 0x00, 0x01, 0x00, + 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x02, 0x64, 0x00, 0x00, 0x00, 0x14, + 0x63, 0x68, 0x61, 0x64, 0x00, 0x00, 0x02, 0x78, 0x00, 0x00, 0x00, 0x2c, + 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xa4, 0x00, 0x00, 0x00, 0x14, + 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xb8, 0x00, 0x00, 0x00, 0x14, + 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xcc, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20, + 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20, + 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20, + 0x63, 0x68, 0x72, 0x6d, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x24, + 0x64, 0x6d, 0x6e, 0x64, 0x00, 0x00, 0x03, 0x24, 0x00, 0x00, 0x00, 0x28, + 0x64, 0x6d, 0x64, 0x64, 0x00, 0x00, 0x03, 0x4c, 0x00, 0x00, 0x00, 0x32, + 0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0x26, + 0x00, 0x00, 0x00, 0x1c, 0x00, 0x52, 0x00, 0x47, 0x00, 0x42, 0x00, 0x5f, + 0x00, 0x44, 0x00, 0x36, 0x00, 0x35, 0x00, 0x5f, 0x00, 0x53, 0x00, 0x52, + 0x00, 0x47, 0x00, 0x5f, 0x00, 0x52, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x5f, + 0x00, 0x37, 0x00, 0x30, 0x00, 0x39, 0x00, 0x00, 0x6d, 0x6c, 0x75, 0x63, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, + 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, 0x1c, + 0x00, 0x43, 0x00, 0x6f, 0x00, 0x70, 0x00, 0x79, 0x00, 0x72, 0x00, 0x69, + 0x00, 0x67, 0x00, 0x68, 0x00, 0x74, 0x00, 0x20, 0x00, 0x32, 0x00, 0x30, + 0x00, 0x31, 0x00, 0x38, 0x00, 0x20, 0x00, 0x47, 0x00, 0x6f, 0x00, 0x6f, + 0x00, 0x67, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x20, 0x00, 0x4c, 0x00, 0x4c, + 0x00, 0x43, 0x00, 0x2c, 0x00, 0x20, 0x00, 0x43, 0x00, 0x43, 0x00, 0x2d, + 0x00, 0x42, 0x00, 0x59, 0x00, 0x2d, 0x00, 0x53, 0x00, 0x41, 0x00, 0x20, + 0x00, 0x33, 0x00, 0x2e, 0x00, 0x30, 0x00, 0x20, 0x00, 0x55, 0x00, 0x6e, + 0x00, 0x70, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x74, 0x00, 0x65, 0x00, 0x64, + 0x00, 0x20, 0x00, 0x6c, 0x00, 0x69, 0x00, 0x63, 0x00, 0x65, 0x00, 0x6e, + 0x00, 0x73, 0x00, 0x65, 0x00, 0x28, 0x00, 0x68, 0x00, 0x74, 0x00, 0x74, + 0x00, 0x70, 0x00, 0x73, 0x00, 0x3a, 0x00, 0x2f, 0x00, 0x2f, 0x00, 0x63, + 0x00, 0x72, 0x00, 0x65, 0x00, 0x61, 0x00, 0x74, 0x00, 0x69, 0x00, 0x76, + 0x00, 0x65, 0x00, 0x63, 0x00, 0x6f, 0x00, 0x6d, 0x00, 0x6d, 0x00, 0x6f, + 0x00, 0x6e, 0x00, 0x73, 0x00, 0x2e, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x67, + 0x00, 0x2f, 0x00, 0x6c, 0x00, 0x69, 0x00, 0x63, 0x00, 0x65, 0x00, 0x6e, + 0x00, 0x73, 0x00, 0x65, 0x00, 0x73, 0x00, 0x2f, 0x00, 0x62, 0x00, 0x79, + 0x00, 0x2d, 0x00, 0x73, 0x00, 0x61, 0x00, 0x2f, 0x00, 0x33, 0x00, 0x2e, + 0x00, 0x30, 0x00, 0x2f, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x67, 0x00, 0x61, + 0x00, 0x6c, 0x00, 0x63, 0x00, 0x6f, 0x00, 0x64, 0x00, 0x65, 0x00, 0x29, + 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x73, 0x66, 0x33, 0x32, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x0c, 0x42, 0x00, 0x00, 0x05, 0xde, + 0xff, 0xff, 0xf3, 0x25, 0x00, 0x00, 0x07, 0x93, 0x00, 0x00, 0xfd, 0x90, + 0xff, 0xff, 0xfb, 0xa1, 0xff, 0xff, 0xfd, 0xa2, 0x00, 0x00, 0x03, 0xdc, + 0x00, 0x00, 0xc0, 0x6e, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x6f, 0xa0, 0x00, 0x00, 0x38, 0xf5, 0x00, 0x00, 0x03, 0x90, + 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x24, 0x9f, + 0x00, 0x00, 0x0f, 0x84, 0x00, 0x00, 0xb6, 0xc4, 0x58, 0x59, 0x5a, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x97, 0x00, 0x00, 0xb7, 0x87, + 0x00, 0x00, 0x18, 0xd9, 0x70, 0x61, 0x72, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x38, 0xe4, 0x00, 0x00, 0xe8, 0xf0, + 0x00, 0x00, 0x17, 0x10, 0x00, 0x00, 0x38, 0xe4, 0x00, 0x00, 0x14, 0xbc, + 0x63, 0x68, 0x72, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0xa3, 0xd7, 0x00, 0x00, 0x54, 0x7c, 0x00, 0x00, 0x4c, 0xcd, + 0x00, 0x00, 0x99, 0x9a, 0x00, 0x00, 0x26, 0x67, 0x00, 0x00, 0x0f, 0x5c, + 0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0x0c, + 0x00, 0x00, 0x00, 0x1c, 0x00, 0x47, 0x00, 0x6f, 0x00, 0x6f, 0x00, 0x67, + 0x00, 0x6c, 0x00, 0x65, 0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, + 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x49, 0x00, 0x6d, + 0x00, 0x61, 0x00, 0x67, 0x00, 0x65, 0x00, 0x20, 0x00, 0x63, 0x00, 0x6f, + 0x00, 0x64, 0x00, 0x65, 0x00, 0x63, 0x00, 0x00, +}; + +} // namespace + +TEST(IccCodecTest, Icc) { + // Empty string cannot be tested, encoder checks against writing it. + TestProfile("a"); + TestProfile("ab"); + TestProfile("aaaa"); + + { + // Exactly the ICC header size + PaddedBytes profile(128); + for (size_t i = 0; i < 128; i++) { + profile[i] = 0; + } + TestProfile(profile); + } + + { + PaddedBytes profile; + profile.append(kTestProfile, kTestProfile + sizeof(kTestProfile)); + TestProfile(profile); + } + + // Test substrings of full profile + { + PaddedBytes profile; + for (size_t i = 0; i <= 256; i++) { + profile.push_back(kTestProfile[i]); + TestProfile(profile); + } + } +} + +// kTestProfile after encoding with the ICC codec +static const unsigned char kEncodedTestProfile[] = { + 0x1f, 0x8b, 0x1, 0x13, 0x10, 0x0, 0x0, 0x0, 0x20, 0x4c, 0xcc, 0x3, + 0xe7, 0xa0, 0xa5, 0xa2, 0x90, 0xa4, 0x27, 0xe8, 0x79, 0x1d, 0xe3, 0x26, + 0x57, 0x54, 0xef, 0x0, 0xe8, 0x97, 0x2, 0xce, 0xa1, 0xd7, 0x85, 0x16, + 0xb4, 0x29, 0x94, 0x58, 0xf2, 0x56, 0xc0, 0x76, 0xea, 0x23, 0xec, 0x7c, + 0x73, 0x51, 0x41, 0x40, 0x23, 0x21, 0x95, 0x4, 0x75, 0x12, 0xc9, 0xcc, + 0x16, 0xbd, 0xb6, 0x99, 0xad, 0xf8, 0x75, 0x35, 0xb6, 0x42, 0xae, 0xae, + 0xae, 0x86, 0x56, 0xf8, 0xcc, 0x16, 0x30, 0xb3, 0x45, 0xad, 0xd, 0x40, + 0xd6, 0xd1, 0xd6, 0x99, 0x40, 0xbe, 0xe2, 0xdc, 0x31, 0x7, 0xa6, 0xb9, + 0x27, 0x92, 0x38, 0x0, 0x3, 0x5e, 0x2c, 0xbe, 0xe6, 0xfb, 0x19, 0xbf, + 0xf3, 0x6d, 0xbc, 0x4d, 0x64, 0xe5, 0xba, 0x76, 0xde, 0x31, 0x65, 0x66, + 0x14, 0xa6, 0x3a, 0xc5, 0x8f, 0xb1, 0xb4, 0xba, 0x1f, 0xb1, 0xb8, 0xd4, + 0x75, 0xba, 0x18, 0x86, 0x95, 0x3c, 0x26, 0xf6, 0x25, 0x62, 0x53, 0xfd, + 0x9c, 0x94, 0x76, 0xf6, 0x95, 0x2c, 0xb1, 0xfd, 0xdc, 0xc0, 0xe4, 0x3f, + 0xb3, 0xff, 0x67, 0xde, 0xd5, 0x94, 0xcc, 0xb0, 0x83, 0x2f, 0x28, 0x93, + 0x92, 0x3, 0xa1, 0x41, 0x64, 0x60, 0x62, 0x70, 0x80, 0x87, 0xaf, 0xe7, + 0x60, 0x4a, 0x20, 0x23, 0xb3, 0x11, 0x7, 0x38, 0x38, 0xd4, 0xa, 0x66, + 0xb5, 0x93, 0x41, 0x90, 0x19, 0x17, 0x18, 0x60, 0xa5, 0xb, 0x7a, 0x24, + 0xaa, 0x20, 0x81, 0xac, 0xa9, 0xa1, 0x70, 0xa6, 0x12, 0x8a, 0x4a, 0xa3, + 0xa0, 0xf9, 0x9a, 0x97, 0xe7, 0xa8, 0xac, 0x8, 0xa8, 0xc4, 0x2a, 0x86, + 0xa7, 0x69, 0x1e, 0x67, 0xe6, 0xbe, 0xa4, 0xd3, 0xff, 0x91, 0x61, 0xf6, + 0x8a, 0xe6, 0xb5, 0xb3, 0x61, 0x9f, 0x19, 0x17, 0x98, 0x27, 0x6b, 0xe9, + 0x8, 0x98, 0xe1, 0x21, 0x4a, 0x9, 0xb5, 0xd7, 0xca, 0xfa, 0x94, 0xd0, + 0x69, 0x1a, 0xeb, 0x52, 0x1, 0x4e, 0xf5, 0xf6, 0xdf, 0x7f, 0xe7, 0x29, + 0x70, 0xee, 0x4, 0xda, 0x2f, 0xa4, 0xff, 0xfe, 0xbb, 0x6f, 0xa8, 0xff, + 0xfe, 0xdb, 0xaf, 0x8, 0xf6, 0x72, 0xa1, 0x40, 0x5d, 0xf0, 0x2d, 0x8, + 0x82, 0x5b, 0x87, 0xbd, 0x10, 0x8, 0xe9, 0x7, 0xee, 0x4b, 0x80, 0xda, + 0x4a, 0x4, 0xc5, 0x5e, 0xa0, 0xb7, 0x1e, 0x60, 0xb0, 0x59, 0x76, 0x60, + 0xb, 0x2e, 0x19, 0x8a, 0x2e, 0x1c, 0xe6, 0x6, 0x20, 0xb8, 0x64, 0x18, + 0x2a, 0xcf, 0x51, 0x94, 0xd4, 0xee, 0xc3, 0xfe, 0x39, 0x74, 0xd4, 0x2b, + 0x48, 0xc9, 0x83, 0x4c, 0x9b, 0xd0, 0x4c, 0x35, 0x10, 0xe3, 0x9, 0xf7, + 0x72, 0xf0, 0x7a, 0xe, 0xbf, 0x7d, 0x36, 0x2e, 0x19, 0x7e, 0x3f, 0xc, + 0xf7, 0x93, 0xe7, 0xf4, 0x1d, 0x32, 0xc6, 0xb0, 0x89, 0xad, 0xe0, 0x28, + 0xc1, 0xa7, 0x59, 0xe3, 0x0, +}; + +// Tests that the decoded kEncodedTestProfile matches kTestProfile. +TEST(IccCodecTest, EncodedIccProfile) { + jxl::BitReader reader(jxl::Span(kEncodedTestProfile, + sizeof(kEncodedTestProfile))); + jxl::PaddedBytes dec; + ASSERT_TRUE(ReadICC(&reader, &dec)); + ASSERT_TRUE(reader.Close()); + EXPECT_EQ(sizeof(kTestProfile), dec.size()); + if (sizeof(kTestProfile) == dec.size()) { + for (size_t i = 0; i < dec.size(); i++) { + EXPECT_EQ(kTestProfile[i], dec[i]); + if (kTestProfile[i] != dec[i]) break; // One output is enough + } + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/image.cc b/third_party/jpeg-xl/lib/jxl/image.cc new file mode 100644 index 0000000000..3faff6aefb --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/image.cc @@ -0,0 +1,251 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/image.h" + +#include // swap + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/image.cc" +#include +#include + +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/sanitizers.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { + +namespace HWY_NAMESPACE { +size_t GetVectorSize() { return HWY_LANES(uint8_t); } +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE + +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +namespace { + +HWY_EXPORT(GetVectorSize); // Local function. + +// Returns distance [bytes] between the start of two consecutive rows, a +// multiple of vector/cache line size but NOT CacheAligned::kAlias - see below. +size_t BytesPerRow(const size_t xsize, const size_t sizeof_t) { + const size_t vec_size = VectorSize(); + size_t valid_bytes = xsize * sizeof_t; + + // Allow unaligned accesses starting at the last valid value - this may raise + // msan errors unless the user calls InitializePaddingForUnalignedAccesses. + // Skip for the scalar case because no extra lanes will be loaded. + if (vec_size != 0) { + valid_bytes += vec_size - sizeof_t; + } + + // Round up to vector and cache line size. + const size_t align = std::max(vec_size, CacheAligned::kAlignment); + size_t bytes_per_row = RoundUpTo(valid_bytes, align); + + // During the lengthy window before writes are committed to memory, CPUs + // guard against read after write hazards by checking the address, but + // only the lower 11 bits. We avoid a false dependency between writes to + // consecutive rows by ensuring their sizes are not multiples of 2 KiB. + // Avoid2K prevents the same problem for the planes of an Image3. + if (bytes_per_row % CacheAligned::kAlias == 0) { + bytes_per_row += align; + } + + JXL_ASSERT(bytes_per_row % align == 0); + return bytes_per_row; +} + +} // namespace + +size_t VectorSize() { + static size_t bytes = HWY_DYNAMIC_DISPATCH(GetVectorSize)(); + return bytes; +} + +PlaneBase::PlaneBase(const size_t xsize, const size_t ysize, + const size_t sizeof_t) + : xsize_(static_cast(xsize)), + ysize_(static_cast(ysize)), + orig_xsize_(static_cast(xsize)), + orig_ysize_(static_cast(ysize)) { + // (Can't profile CacheAligned itself because it is used by profiler.h) + PROFILER_FUNC; + + JXL_CHECK(xsize == xsize_); + JXL_CHECK(ysize == ysize_); + + JXL_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8); + + bytes_per_row_ = 0; + // Dimensions can be zero, e.g. for lazily-allocated images. Only allocate + // if nonzero, because "zero" bytes still have padding/bookkeeping overhead. + if (xsize != 0 && ysize != 0) { + bytes_per_row_ = BytesPerRow(xsize, sizeof_t); + bytes_ = AllocateArray(bytes_per_row_ * ysize); + JXL_CHECK(bytes_.get()); + InitializePadding(sizeof_t, Padding::kRoundUp); + } +} + +void PlaneBase::InitializePadding(const size_t sizeof_t, Padding padding) { +#if defined(MEMORY_SANITIZER) || HWY_IDE + if (xsize_ == 0 || ysize_ == 0) return; + + const size_t vec_size = VectorSize(); + if (vec_size == 0) return; // Scalar mode: no padding needed + + const size_t valid_size = xsize_ * sizeof_t; + const size_t initialize_size = padding == Padding::kRoundUp + ? RoundUpTo(valid_size, vec_size) + : valid_size + vec_size - sizeof_t; + if (valid_size == initialize_size) return; + + for (size_t y = 0; y < ysize_; ++y) { + uint8_t* JXL_RESTRICT row = static_cast(VoidRow(y)); +#if defined(__clang__) && \ + ((!defined(__apple_build_version__) && __clang_major__ <= 6) || \ + (defined(__apple_build_version__) && \ + __apple_build_version__ <= 10001145)) + // There's a bug in msan in clang-6 when handling AVX2 operations. This + // workaround allows tests to pass on msan, although it is slower and + // prevents msan warnings from uninitialized images. + std::fill(row, msan::kSanitizerSentinelByte, initialize_size); +#else + memset(row + valid_size, msan::kSanitizerSentinelByte, + initialize_size - valid_size); +#endif // clang6 + } +#endif // MEMORY_SANITIZER +} + +void PlaneBase::Swap(PlaneBase& other) { + std::swap(xsize_, other.xsize_); + std::swap(ysize_, other.ysize_); + std::swap(orig_xsize_, other.orig_xsize_); + std::swap(orig_ysize_, other.orig_ysize_); + std::swap(bytes_per_row_, other.bytes_per_row_); + std::swap(bytes_, other.bytes_); +} + +Image3F PadImageMirror(const Image3F& in, const size_t xborder, + const size_t yborder) { + size_t xsize = in.xsize(); + size_t ysize = in.ysize(); + Image3F out(xsize + 2 * xborder, ysize + 2 * yborder); + if (xborder > xsize || yborder > ysize) { + for (size_t c = 0; c < 3; c++) { + for (int32_t y = 0; y < static_cast(out.ysize()); y++) { + float* row_out = out.PlaneRow(c, y); + const float* row_in = in.PlaneRow( + c, Mirror(y - static_cast(yborder), in.ysize())); + for (int32_t x = 0; x < static_cast(out.xsize()); x++) { + int32_t xin = Mirror(x - static_cast(xborder), in.xsize()); + row_out[x] = row_in[xin]; + } + } + } + return out; + } + CopyImageTo(in, Rect(xborder, yborder, xsize, ysize), &out); + for (size_t c = 0; c < 3; c++) { + // Horizontal pad. + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xborder; x++) { + out.PlaneRow(c, y + yborder)[x] = + in.ConstPlaneRow(c, y)[xborder - x - 1]; + out.PlaneRow(c, y + yborder)[x + xsize + xborder] = + in.ConstPlaneRow(c, y)[xsize - 1 - x]; + } + } + // Vertical pad. + for (size_t y = 0; y < yborder; y++) { + memcpy(out.PlaneRow(c, y), out.ConstPlaneRow(c, 2 * yborder - 1 - y), + out.xsize() * sizeof(float)); + memcpy(out.PlaneRow(c, y + ysize + yborder), + out.ConstPlaneRow(c, ysize + yborder - 1 - y), + out.xsize() * sizeof(float)); + } + } + return out; +} + +void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in, + size_t block_dim) { + PROFILER_FUNC; + const size_t xsize_orig = in->xsize(); + const size_t ysize_orig = in->ysize(); + const size_t xsize = RoundUpTo(xsize_orig, block_dim); + const size_t ysize = RoundUpTo(ysize_orig, block_dim); + // Expands image size to the originally-allocated size. + in->ShrinkTo(xsize, ysize); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < ysize_orig; y++) { + float* JXL_RESTRICT row = in->PlaneRow(c, y); + for (size_t x = xsize_orig; x < xsize; x++) { + row[x] = row[xsize_orig - 1]; + } + } + const float* JXL_RESTRICT row_src = in->ConstPlaneRow(c, ysize_orig - 1); + for (size_t y = ysize_orig; y < ysize; y++) { + memcpy(in->PlaneRow(c, y), row_src, xsize * sizeof(float)); + } + } +} + +static void DownsampleImage(const ImageF& input, size_t factor, + ImageF* output) { + JXL_ASSERT(factor != 1); + output->ShrinkTo(DivCeil(input.xsize(), factor), + DivCeil(input.ysize(), factor)); + size_t in_stride = input.PixelsPerRow(); + for (size_t y = 0; y < output->ysize(); y++) { + float* row_out = output->Row(y); + const float* row_in = input.Row(factor * y); + for (size_t x = 0; x < output->xsize(); x++) { + size_t cnt = 0; + float sum = 0; + for (size_t iy = 0; iy < factor && iy + factor * y < input.ysize(); + iy++) { + for (size_t ix = 0; ix < factor && ix + factor * x < input.xsize(); + ix++) { + sum += row_in[iy * in_stride + x * factor + ix]; + cnt++; + } + } + row_out[x] = sum / cnt; + } + } +} + +void DownsampleImage(ImageF* image, size_t factor) { + // Allocate extra space to avoid a reallocation when padding. + ImageF downsampled(DivCeil(image->xsize(), factor) + kBlockDim, + DivCeil(image->ysize(), factor) + kBlockDim); + DownsampleImage(*image, factor, &downsampled); + *image = std::move(downsampled); +} + +void DownsampleImage(Image3F* opsin, size_t factor) { + JXL_ASSERT(factor != 1); + // Allocate extra space to avoid a reallocation when padding. + Image3F downsampled(DivCeil(opsin->xsize(), factor) + kBlockDim, + DivCeil(opsin->ysize(), factor) + kBlockDim); + downsampled.ShrinkTo(downsampled.xsize() - kBlockDim, + downsampled.ysize() - kBlockDim); + for (size_t c = 0; c < 3; c++) { + DownsampleImage(opsin->Plane(c), factor, &downsampled.Plane(c)); + } + *opsin = std::move(downsampled); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/image.h b/third_party/jpeg-xl/lib/jxl/image.h new file mode 100644 index 0000000000..e66534220c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/image.h @@ -0,0 +1,497 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_IMAGE_H_ +#define LIB_JXL_IMAGE_H_ + +// SIMD/multicore-friendly planar image representation with row accessors. + +#include +#include +#include +#include + +#include +#include +#include // std::move + +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" + +namespace jxl { + +// Helper function to create rows that are multiples of SIMD vector size. +size_t VectorSize(); + +// Type-independent parts of Plane<> - reduces code duplication and facilitates +// moving member function implementations to cc file. +struct PlaneBase { + PlaneBase() + : xsize_(0), + ysize_(0), + orig_xsize_(0), + orig_ysize_(0), + bytes_per_row_(0), + bytes_(nullptr) {} + PlaneBase(size_t xsize, size_t ysize, size_t sizeof_t); + + // Copy construction/assignment is forbidden to avoid inadvertent copies, + // which can be very expensive. Use CopyImageTo() instead. + PlaneBase(const PlaneBase& other) = delete; + PlaneBase& operator=(const PlaneBase& other) = delete; + + // Move constructor (required for returning Image from function) + PlaneBase(PlaneBase&& other) noexcept = default; + + // Move assignment (required for std::vector) + PlaneBase& operator=(PlaneBase&& other) noexcept = default; + + void Swap(PlaneBase& other); + + // Useful for pre-allocating image with some padding for alignment purposes + // and later reporting the actual valid dimensions. May also be used to + // un-shrink the image. Caller is responsible for ensuring xsize/ysize are <= + // the original dimensions. + void ShrinkTo(const size_t xsize, const size_t ysize) { + JXL_CHECK(xsize <= orig_xsize_); + JXL_CHECK(ysize <= orig_ysize_); + xsize_ = static_cast(xsize); + ysize_ = static_cast(ysize); + // NOTE: we can't recompute bytes_per_row for more compact storage and + // better locality because that would invalidate the image contents. + } + + // How many pixels. + JXL_INLINE size_t xsize() const { return xsize_; } + JXL_INLINE size_t ysize() const { return ysize_; } + + // NOTE: do not use this for copying rows - the valid xsize may be much less. + JXL_INLINE size_t bytes_per_row() const { return bytes_per_row_; } + + // Raw access to byte contents, for interfacing with other libraries. + // Unsigned char instead of char to avoid surprises (sign extension). + JXL_INLINE uint8_t* bytes() { + void* p = bytes_.get(); + return static_cast(JXL_ASSUME_ALIGNED(p, 64)); + } + JXL_INLINE const uint8_t* bytes() const { + const void* p = bytes_.get(); + return static_cast(JXL_ASSUME_ALIGNED(p, 64)); + } + + protected: + // Returns pointer to the start of a row. + JXL_INLINE void* VoidRow(const size_t y) const { +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) + if (y >= ysize_) { + JXL_ABORT("Row(%" PRIu64 ") in (%u x %u) image\n", (uint64_t)y, xsize_, + ysize_); + } +#endif + + void* row = bytes_.get() + y * bytes_per_row_; + return JXL_ASSUME_ALIGNED(row, 64); + } + + enum class Padding { + // Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default. + kRoundUp, + // Allow LoadU(d, row + x) for x = xsize() - 1. This requires an extra + // vector to be initialized. If done by default, this would suppress + // legitimate msan warnings. We therefore require users to explicitly call + // InitializePadding before using unaligned loads (e.g. convolution). + kUnaligned + }; + + // Initializes the minimum bytes required to suppress msan warnings from + // legitimate (according to Padding mode) vector loads/stores on the right + // border, where some lanes are uninitialized and assumed to be unused. + void InitializePadding(size_t sizeof_t, Padding padding); + + // (Members are non-const to enable assignment during move-assignment.) + uint32_t xsize_; // In valid pixels, not including any padding. + uint32_t ysize_; + uint32_t orig_xsize_; + uint32_t orig_ysize_; + size_t bytes_per_row_; // Includes padding. + CacheAlignedUniquePtr bytes_; +}; + +// Single channel, aligned rows separated by padding. T must be POD. +// +// 'Single channel' (one 2D array per channel) simplifies vectorization +// (repeating the same operation on multiple adjacent components) without the +// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients +// can easily iterate over all components in a row and Image requires no +// knowledge of the pixel format beyond the component type "T". +// +// 'Aligned' means each row is aligned to the L1 cache line size. This prevents +// false sharing between two threads operating on adjacent rows. +// +// 'Padding' is still relevant because vectors could potentially be larger than +// a cache line. By rounding up row sizes to the vector size, we allow +// reading/writing ALIGNED vectors whose first lane is a valid sample. This +// avoids needing a separate loop to handle remaining unaligned lanes. +// +// This image layout could also be achieved with a vector and a row accessor +// function, but a class wrapper with support for "deleter" allows wrapping +// existing memory allocated by clients without copying the pixels. It also +// provides convenient accessors for xsize/ysize, which shortens function +// argument lists. Supports move-construction so it can be stored in containers. +template +class Plane : public PlaneBase { + public: + using T = ComponentType; + static constexpr size_t kNumPlanes = 1; + + Plane() = default; + Plane(const size_t xsize, const size_t ysize) + : PlaneBase(xsize, ysize, sizeof(T)) {} + + void InitializePaddingForUnalignedAccesses() { + InitializePadding(sizeof(T), Padding::kUnaligned); + } + + JXL_INLINE T* Row(const size_t y) { return static_cast(VoidRow(y)); } + + // Returns pointer to const (see above). + JXL_INLINE const T* Row(const size_t y) const { + return static_cast(VoidRow(y)); + } + + // Documents that the access is const. + JXL_INLINE const T* ConstRow(const size_t y) const { + return static_cast(VoidRow(y)); + } + + // Returns number of pixels (some of which are padding) per row. Useful for + // computing other rows via pointer arithmetic. WARNING: this must + // NOT be used to determine xsize. + JXL_INLINE intptr_t PixelsPerRow() const { + return static_cast(bytes_per_row_ / sizeof(T)); + } +}; + +using ImageSB = Plane; +using ImageB = Plane; +using ImageS = Plane; // signed integer or half-float +using ImageU = Plane; +using ImageI = Plane; +using ImageF = Plane; +using ImageD = Plane; + +// Also works for Image3 and mixed argument types. +template +bool SameSize(const Image1& image1, const Image2& image2) { + return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize(); +} + +template +class Image3; + +// Rectangular region in image(s). Factoring this out of Image instead of +// shifting the pointer by x0/y0 allows this to apply to multiple images with +// different resolutions (e.g. color transform and quantization field). +// Can compare using SameSize(rect1, rect2). +template +class RectT { + public: + // Most windows are xsize_max * ysize_max, except those on the borders where + // begin + size_max > end. + constexpr RectT(T xbegin, T ybegin, size_t xsize_max, size_t ysize_max, + T xend, T yend) + : x0_(xbegin), + y0_(ybegin), + xsize_(ClampedSize(xbegin, xsize_max, xend)), + ysize_(ClampedSize(ybegin, ysize_max, yend)) {} + + // Construct with origin and known size (typically from another Rect). + constexpr RectT(T xbegin, T ybegin, size_t xsize, size_t ysize) + : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {} + + // Construct a rect that covers a whole image/plane/ImageBundle etc. + template + explicit RectT(const ImageT& image) + : RectT(0, 0, image.xsize(), image.ysize()) {} + + RectT() : RectT(0, 0, 0, 0) {} + + RectT(const RectT&) = default; + RectT& operator=(const RectT&) = default; + + // Construct a subrect that resides in an image/plane/ImageBundle etc. + template + RectT Crop(const ImageT& image) const { + return Intersection(RectT(image)); + } + + // Construct a subrect that resides in the [0, ysize) x [0, xsize) region of + // the current rect. + RectT Crop(size_t area_xsize, size_t area_ysize) const { + return Intersection(RectT(0, 0, area_xsize, area_ysize)); + } + + // Returns a rect that only contains `num` lines with offset `y` from `y0()`. + RectT Lines(size_t y, size_t num) const { + JXL_DASSERT(y + num <= ysize_); + return RectT(x0_, y0_ + y, xsize_, num); + } + + RectT Line(size_t y) const { return Lines(y, 1); } + + JXL_MUST_USE_RESULT RectT Intersection(const RectT& other) const { + return RectT(std::max(x0_, other.x0_), std::max(y0_, other.y0_), xsize_, + ysize_, std::min(x1(), other.x1()), + std::min(y1(), other.y1())); + } + + JXL_MUST_USE_RESULT RectT Translate(int64_t x_offset, + int64_t y_offset) const { + return RectT(x0_ + x_offset, y0_ + y_offset, xsize_, ysize_); + } + + template + V* Row(Plane* image, size_t y) const { + JXL_DASSERT(y + y0_ >= 0); + return image->Row(y + y0_) + x0_; + } + + template + const V* Row(const Plane* image, size_t y) const { + JXL_DASSERT(y + y0_ >= 0); + return image->Row(y + y0_) + x0_; + } + + template + V* PlaneRow(Image3* image, const size_t c, size_t y) const { + JXL_DASSERT(y + y0_ >= 0); + return image->PlaneRow(c, y + y0_) + x0_; + } + + template + const V* ConstRow(const Plane& image, size_t y) const { + JXL_DASSERT(y + y0_ >= 0); + return image.ConstRow(y + y0_) + x0_; + } + + template + const V* ConstPlaneRow(const Image3& image, size_t c, size_t y) const { + JXL_DASSERT(y + y0_ >= 0); + return image.ConstPlaneRow(c, y + y0_) + x0_; + } + + bool IsInside(const RectT& other) const { + return x0_ >= other.x0() && x1() <= other.x1() && y0_ >= other.y0() && + y1() <= other.y1(); + } + + // Returns true if this Rect fully resides in the given image. ImageT could be + // Plane or Image3; however if ImageT is Rect, results are nonsensical. + template + bool IsInside(const ImageT& image) const { + return IsInside(RectT(image)); + } + + T x0() const { return x0_; } + T y0() const { return y0_; } + size_t xsize() const { return xsize_; } + size_t ysize() const { return ysize_; } + T x1() const { return x0_ + xsize_; } + T y1() const { return y0_ + ysize_; } + + RectT ShiftLeft(size_t shiftx, size_t shifty) const { + return RectT(x0_ * (1 << shiftx), y0_ * (1 << shifty), xsize_ << shiftx, + ysize_ << shifty); + } + RectT ShiftLeft(size_t shift) const { return ShiftLeft(shift, shift); } + + // Requires x0(), y0() to be multiples of 1< CeilShiftRight(size_t shiftx, size_t shifty) const { + JXL_ASSERT(x0_ % (1 << shiftx) == 0); + JXL_ASSERT(y0_ % (1 << shifty) == 0); + return RectT(x0_ / (1 << shiftx), y0_ / (1 << shifty), + DivCeil(xsize_, T{1} << shiftx), + DivCeil(ysize_, T{1} << shifty)); + } + RectT CeilShiftRight(std::pair shift) const { + return CeilShiftRight(shift.first, shift.second); + } + RectT CeilShiftRight(size_t shift) const { + return CeilShiftRight(shift, shift); + } + + template + RectT As() const { + return RectT(U(x0_), U(y0_), U(xsize_), U(ysize_)); + } + + private: + // Returns size_max, or whatever is left in [begin, end). + static constexpr size_t ClampedSize(T begin, size_t size_max, T end) { + return (static_cast(begin + size_max) <= end) + ? size_max + : (end > begin ? end - begin : 0); + } + + T x0_; + T y0_; + + size_t xsize_; + size_t ysize_; +}; + +template +std::string Description(RectT r) { + std::ostringstream os; + os << "[" << r.x0() << ".." << r.x1() << ")x" + << "[" << r.y0() << ".." << r.y1() << ")"; + return os.str(); +} + +using Rect = RectT; + +// Currently, we abuse Image to either refer to an image that owns its storage +// or one that doesn't. In similar vein, we abuse Image* function parameters to +// either mean "assign to me" or "fill the provided image with data". +// Hopefully, the "assign to me" meaning will go away and most images in the +// codebase will not be backed by own storage. When this happens we can redesign +// Image to be a non-storage-holding view class and introduce BackedImage in +// those places that actually need it. + +// NOTE: we can't use Image as a view because invariants are violated +// (alignment and the presence of padding before/after each "row"). + +// A bundle of 3 same-sized images. Typically constructed by moving from three +// rvalue references to Image. To overwrite an existing Image3 using +// single-channel producers, we also need access to Image*. Constructing +// temporary non-owning Image pointing to one plane of an existing Image3 risks +// dangling references, especially if the wrapper is moved. Therefore, we +// store an array of Image (which are compact enough that size is not a concern) +// and provide Plane+Row accessors. +template +class Image3 { + public: + using T = ComponentType; + using PlaneT = jxl::Plane; + static constexpr size_t kNumPlanes = 3; + + Image3() : planes_{PlaneT(), PlaneT(), PlaneT()} {} + + Image3(const size_t xsize, const size_t ysize) + : planes_{PlaneT(xsize, ysize), PlaneT(xsize, ysize), + PlaneT(xsize, ysize)} {} + + Image3(Image3&& other) noexcept { + for (size_t i = 0; i < kNumPlanes; i++) { + planes_[i] = std::move(other.planes_[i]); + } + } + + Image3(PlaneT&& plane0, PlaneT&& plane1, PlaneT&& plane2) { + JXL_CHECK(SameSize(plane0, plane1)); + JXL_CHECK(SameSize(plane0, plane2)); + planes_[0] = std::move(plane0); + planes_[1] = std::move(plane1); + planes_[2] = std::move(plane2); + } + + // Copy construction/assignment is forbidden to avoid inadvertent copies, + // which can be very expensive. Use CopyImageTo instead. + Image3(const Image3& other) = delete; + Image3& operator=(const Image3& other) = delete; + + Image3& operator=(Image3&& other) noexcept { + for (size_t i = 0; i < kNumPlanes; i++) { + planes_[i] = std::move(other.planes_[i]); + } + return *this; + } + + // Returns row pointer; usage: PlaneRow(idx_plane, y)[x] = val. + JXL_INLINE T* PlaneRow(const size_t c, const size_t y) { + // Custom implementation instead of calling planes_[c].Row ensures only a + // single multiplication is needed for PlaneRow(0..2, y). + PlaneRowBoundsCheck(c, y); + const size_t row_offset = y * planes_[0].bytes_per_row(); + void* row = planes_[c].bytes() + row_offset; + return static_cast(JXL_ASSUME_ALIGNED(row, 64)); + } + + // Returns const row pointer; usage: val = PlaneRow(idx_plane, y)[x]. + JXL_INLINE const T* PlaneRow(const size_t c, const size_t y) const { + PlaneRowBoundsCheck(c, y); + const size_t row_offset = y * planes_[0].bytes_per_row(); + const void* row = planes_[c].bytes() + row_offset; + return static_cast(JXL_ASSUME_ALIGNED(row, 64)); + } + + // Returns const row pointer, even if called from a non-const Image3. + JXL_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const { + PlaneRowBoundsCheck(c, y); + return PlaneRow(c, y); + } + + JXL_INLINE const PlaneT& Plane(size_t idx) const { return planes_[idx]; } + + JXL_INLINE PlaneT& Plane(size_t idx) { return planes_[idx]; } + + void Swap(Image3& other) { + for (size_t c = 0; c < 3; ++c) { + other.planes_[c].Swap(planes_[c]); + } + } + + // Useful for pre-allocating image with some padding for alignment purposes + // and later reporting the actual valid dimensions. May also be used to + // un-shrink the image. Caller is responsible for ensuring xsize/ysize are <= + // the original dimensions. + void ShrinkTo(const size_t xsize, const size_t ysize) { + for (PlaneT& plane : planes_) { + plane.ShrinkTo(xsize, ysize); + } + } + + // Sizes of all three images are guaranteed to be equal. + JXL_INLINE size_t xsize() const { return planes_[0].xsize(); } + JXL_INLINE size_t ysize() const { return planes_[0].ysize(); } + // Returns offset [bytes] from one row to the next row of the same plane. + // WARNING: this must NOT be used to determine xsize, nor for copying rows - + // the valid xsize may be much less. + JXL_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); } + // Returns number of pixels (some of which are padding) per row. Useful for + // computing other rows via pointer arithmetic. WARNING: this must NOT be used + // to determine xsize. + JXL_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); } + + private: + void PlaneRowBoundsCheck(const size_t c, const size_t y) const { +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) + if (c >= kNumPlanes || y >= ysize()) { + JXL_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") in (%" PRIu64 " x %" PRIu64 + ") image\n", + static_cast(c), static_cast(y), + static_cast(xsize()), static_cast(ysize())); + } +#endif + } + + private: + PlaneT planes_[kNumPlanes]; +}; + +using Image3B = Image3; +using Image3S = Image3; +using Image3U = Image3; +using Image3I = Image3; +using Image3F = Image3; +using Image3D = Image3; + +} // namespace jxl + +#endif // LIB_JXL_IMAGE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/image_bundle.cc b/third_party/jpeg-xl/lib/jxl/image_bundle.cc new file mode 100644 index 0000000000..7e7051b608 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/image_bundle.cc @@ -0,0 +1,125 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/image_bundle.h" + +#include +#include + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/fields.h" + +namespace jxl { + +void ImageBundle::ShrinkTo(size_t xsize, size_t ysize) { + if (HasColor()) color_.ShrinkTo(xsize, ysize); + for (ImageF& ec : extra_channels_) { + ec.ShrinkTo(xsize, ysize); + } +} + +// Called by all other SetFrom*. +void ImageBundle::SetFromImage(Image3F&& color, + const ColorEncoding& c_current) { + JXL_CHECK(color.xsize() != 0 && color.ysize() != 0); + JXL_CHECK(metadata_->color_encoding.IsGray() == c_current.IsGray()); + color_ = std::move(color); + c_current_ = c_current; + VerifySizes(); +} + +void ImageBundle::VerifyMetadata() const { + JXL_CHECK(!c_current_.ICC().empty()); + JXL_CHECK(metadata_->color_encoding.IsGray() == IsGray()); + + if (metadata_->HasAlpha() && alpha().xsize() == 0) { + JXL_ABORT("MD alpha_bits %u IB alpha %" PRIuS " x %" PRIuS "\n", + metadata_->GetAlphaBits(), alpha().xsize(), alpha().ysize()); + } + const uint32_t alpha_bits = metadata_->GetAlphaBits(); + JXL_CHECK(alpha_bits <= 32); + + // metadata_->num_extra_channels may temporarily differ from + // extra_channels_.size(), e.g. after SetAlpha. They are synced by the next + // call to VisitFields. +} + +void ImageBundle::VerifySizes() const { + const size_t xs = xsize(); + const size_t ys = ysize(); + + if (HasExtraChannels()) { + JXL_CHECK(xs != 0 && ys != 0); + for (const ImageF& ec : extra_channels_) { + JXL_CHECK(ec.xsize() == xs); + JXL_CHECK(ec.ysize() == ys); + } + } +} + +size_t ImageBundle::DetectRealBitdepth() const { + return metadata_->bit_depth.bits_per_sample; + + // TODO(lode): let this function return lower bit depth if possible, e.g. + // return 8 bits in case the original image came from a 16-bit PNG that + // was in fact representable as 8-bit PNG. Ensure that the implementation + // returns 16 if e.g. two consecutive 16-bit values appeared in the original + // image (such as 32768 and 32769), take into account that e.g. the values + // 3-bit can represent is not a superset of the values 2-bit can represent, + // and there may be slight imprecisions in the floating point image. +} + +const ImageF& ImageBundle::black() const { + JXL_ASSERT(HasBlack()); + const size_t ec = metadata_->Find(ExtraChannel::kBlack) - + metadata_->extra_channel_info.data(); + JXL_ASSERT(ec < extra_channels_.size()); + return extra_channels_[ec]; +} +const ImageF& ImageBundle::alpha() const { + JXL_ASSERT(HasAlpha()); + const size_t ec = metadata_->Find(ExtraChannel::kAlpha) - + metadata_->extra_channel_info.data(); + JXL_ASSERT(ec < extra_channels_.size()); + return extra_channels_[ec]; +} +ImageF* ImageBundle::alpha() { + JXL_ASSERT(HasAlpha()); + const size_t ec = metadata_->Find(ExtraChannel::kAlpha) - + metadata_->extra_channel_info.data(); + JXL_ASSERT(ec < extra_channels_.size()); + return &extra_channels_[ec]; +} + +void ImageBundle::SetAlpha(ImageF&& alpha) { + const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha); + // Must call SetAlphaBits first, otherwise we don't know which channel index + JXL_CHECK(eci != nullptr); + JXL_CHECK(alpha.xsize() != 0 && alpha.ysize() != 0); + if (extra_channels_.size() < metadata_->extra_channel_info.size()) { + // TODO(jon): get rid of this case + extra_channels_.insert( + extra_channels_.begin() + (eci - metadata_->extra_channel_info.data()), + std::move(alpha)); + } else { + extra_channels_[eci - metadata_->extra_channel_info.data()] = + std::move(alpha); + } + // num_extra_channels is automatically set in visitor + VerifySizes(); +} + +void ImageBundle::SetExtraChannels(std::vector&& extra_channels) { + for (const ImageF& plane : extra_channels) { + JXL_CHECK(plane.xsize() != 0 && plane.ysize() != 0); + } + extra_channels_ = std::move(extra_channels); + VerifySizes(); +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/image_bundle.h b/third_party/jpeg-xl/lib/jxl/image_bundle.h new file mode 100644 index 0000000000..c7b812b59a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/image_bundle.h @@ -0,0 +1,254 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_IMAGE_BUNDLE_H_ +#define LIB_JXL_IMAGE_BUNDLE_H_ + +// The main image or frame consists of a bundle of associated images. + +#include +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/jpeg/jpeg_data.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +// A bundle of color/alpha/depth/plane images. +class ImageBundle { + public: + // Uninitialized state for use as output parameter. + ImageBundle() : metadata_(nullptr) {} + // Caller is responsible for setting metadata before calling Set*. + explicit ImageBundle(const ImageMetadata* metadata) : metadata_(metadata) {} + + // Move-only (allows storing in std::vector). + ImageBundle(ImageBundle&&) = default; + ImageBundle& operator=(ImageBundle&&) = default; + + ImageBundle Copy() const { + ImageBundle copy(metadata_); + copy.color_ = CopyImage(color_); + copy.c_current_ = c_current_; + copy.extra_channels_.reserve(extra_channels_.size()); + for (const ImageF& plane : extra_channels_) { + copy.extra_channels_.emplace_back(CopyImage(plane)); + } + + copy.jpeg_data = + jpeg_data ? make_unique(*jpeg_data) : nullptr; + copy.color_transform = color_transform; + copy.chroma_subsampling = chroma_subsampling; + + return copy; + } + + // -- SIZE + + size_t xsize() const { + if (IsJPEG()) return jpeg_data->width; + if (color_.xsize() != 0) return color_.xsize(); + return extra_channels_.empty() ? 0 : extra_channels_[0].xsize(); + } + size_t ysize() const { + if (IsJPEG()) return jpeg_data->height; + if (color_.ysize() != 0) return color_.ysize(); + return extra_channels_.empty() ? 0 : extra_channels_[0].ysize(); + } + void ShrinkTo(size_t xsize, size_t ysize); + + // sizes taking orientation into account + size_t oriented_xsize() const { + if (static_cast(metadata_->GetOrientation()) > 4) { + return ysize(); + } else { + return xsize(); + } + } + size_t oriented_ysize() const { + if (static_cast(metadata_->GetOrientation()) > 4) { + return xsize(); + } else { + return ysize(); + } + } + + // -- COLOR + + // Whether color() is valid/usable. Returns true in most cases. Even images + // with spot colors (one example of when !planes().empty()) typically have a + // part that can be converted to RGB. + bool HasColor() const { return color_.xsize() != 0; } + + // For resetting the size when switching from a reference to main frame. + void RemoveColor() { color_ = Image3F(); } + + // Do not use if !HasColor(). + const Image3F& color() const { + // If this fails, Set* was not called - perhaps because decoding failed? + JXL_DASSERT(HasColor()); + return color_; + } + + // Do not use if !HasColor(). + Image3F* color() { + JXL_DASSERT(HasColor()); + return &color_; + } + + // If c_current.IsGray(), all planes must be identical. NOTE: c_current is + // independent of metadata()->color_encoding, which is the original, whereas + // a decoder might return pixels in a different c_current. + // This only sets the color channels, you must also make extra channels + // match the amount that is in the metadata. + void SetFromImage(Image3F&& color, const ColorEncoding& c_current); + + // -- COLOR ENCODING + + const ColorEncoding& c_current() const { return c_current_; } + + // Returns whether the color image has identical planes. Once established by + // Set*, remains unchanged until a subsequent Set* or TransformTo. + bool IsGray() const { return c_current_.IsGray(); } + + bool IsSRGB() const { return c_current_.IsSRGB(); } + bool IsLinearSRGB() const { + return c_current_.white_point == WhitePoint::kD65 && + c_current_.primaries == Primaries::kSRGB && c_current_.tf.IsLinear(); + } + + // Set the c_current profile without doing any transformation, e.g. if the + // transformation was already applied. + void OverrideProfile(const ColorEncoding& new_c_current) { + c_current_ = new_c_current; + } + + // TODO(lode): TransformTo and CopyTo are implemented in enc_image_bundle.cc, + // move these functions out of this header file and class, to + // enc_image_bundle.h. + + // Transforms color to c_desired and sets c_current to c_desired. Alpha and + // metadata remains unchanged. + Status TransformTo(const ColorEncoding& c_desired, const JxlCmsInterface& cms, + ThreadPool* pool = nullptr); + // Copies this:rect, converts to c_desired, and allocates+fills out. + Status CopyTo(const Rect& rect, const ColorEncoding& c_desired, + const JxlCmsInterface& cms, Image3F* out, + ThreadPool* pool = nullptr) const; + + // Detect 'real' bit depth, which can be lower than nominal bit depth + // (this is common in PNG), returns 'real' bit depth + size_t DetectRealBitdepth() const; + + // -- ALPHA + + void SetAlpha(ImageF&& alpha); + bool HasAlpha() const { + return metadata_->Find(ExtraChannel::kAlpha) != nullptr; + } + bool AlphaIsPremultiplied() const { + const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha); + return (eci == nullptr) ? false : eci->alpha_associated; + } + const ImageF& alpha() const; + ImageF* alpha(); + + // -- EXTRA CHANNELS + bool HasBlack() const { + return metadata_->Find(ExtraChannel::kBlack) != nullptr; + } + const ImageF& black() const; + + // Extra channels of unknown interpretation (e.g. spot colors). + void SetExtraChannels(std::vector&& extra_channels); + void ClearExtraChannels() { extra_channels_.clear(); } + bool HasExtraChannels() const { return !extra_channels_.empty(); } + const std::vector& extra_channels() const { return extra_channels_; } + std::vector& extra_channels() { return extra_channels_; } + + const ImageMetadata* metadata() const { return metadata_; } + + void VerifyMetadata() const; + + void SetDecodedBytes(size_t decoded_bytes) { decoded_bytes_ = decoded_bytes; } + size_t decoded_bytes() const { return decoded_bytes_; } + + // -- JPEG transcoding: + + // Returns true if image does or will represent quantized DCT-8 coefficients, + // stored in 8x8 pixel regions. + bool IsJPEG() const { +#if JPEGXL_ENABLE_TRANSCODE_JPEG + return jpeg_data != nullptr; +#else // JPEGXL_ENABLE_TRANSCODE_JPEG + return false; +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + } + + std::unique_ptr jpeg_data; + // these fields are used to signal the input JPEG color space + // NOTE: JPEG doesn't actually provide a way to determine whether YCbCr was + // applied or not. + ColorTransform color_transform = ColorTransform::kNone; + YCbCrChromaSubsampling chroma_subsampling; + + FrameOrigin origin{0, 0}; + + // Animation-related information, corresponding to the timecode and duration + // fields of the jxl::AnimationFrame of the jxl::FrameHeader. + // TODO(lode): ImageBundle is used here to carry the information from + // jxl::FrameHeader, consider instead passing a jxl::FrameHeader directly to + // EncodeFrame or having a field of that type here. + uint32_t duration = 0; + uint32_t timecode = 0; + + // TODO(lode): these fields do not match the JXL frame header, it should be + // possible to specify up to 4 (3 if nonzero duration) slots to save this + // frame as reference (see save_as_reference). + bool use_for_next_frame = false; + bool blend = false; + BlendMode blendmode = BlendMode::kBlend; + + std::string name; + + private: + // Called after any Set* to ensure their sizes are compatible. + void VerifySizes() const; + + // Required for TransformTo so that an ImageBundle is self-sufficient. Always + // points to the same thing, but cannot be const-pointer because that prevents + // the compiler from generating a move ctor. + const ImageMetadata* metadata_; + + // Initialized by Set*: + Image3F color_; // If empty, planes_ is not; all planes equal if IsGray(). + ColorEncoding c_current_; // of color_ + + // Initialized by SetPlanes; size = ImageMetadata.num_extra_channels + std::vector extra_channels_; + + // How many bytes of the input were actually read. + size_t decoded_bytes_ = 0; +}; + +} // namespace jxl + +#endif // LIB_JXL_IMAGE_BUNDLE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/image_bundle_test.cc b/third_party/jpeg-xl/lib/jxl/image_bundle_test.cc new file mode 100644 index 0000000000..1a10598fe2 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/image_bundle_test.cc @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/image_bundle.h" + +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +TEST(ImageBundleTest, ExtraChannelName) { + AuxOut aux_out; + BitWriter writer; + BitWriter::Allotment allotment(&writer, 99); + + ImageMetadata metadata; + ExtraChannelInfo eci; + eci.type = ExtraChannel::kBlack; + eci.name = "testK"; + metadata.extra_channel_info.push_back(std::move(eci)); + ASSERT_TRUE(WriteImageMetadata(metadata, &writer, /*layer=*/0, &aux_out)); + writer.ZeroPadToByte(); + allotment.ReclaimAndCharge(&writer, /*layer=*/0, &aux_out); + + BitReader reader(writer.GetSpan()); + ImageMetadata metadata_out; + ASSERT_TRUE(ReadImageMetadata(&reader, &metadata_out)); + EXPECT_TRUE(reader.Close()); + EXPECT_EQ("testK", metadata_out.Find(ExtraChannel::kBlack)->name); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/image_metadata.cc b/third_party/jpeg-xl/lib/jxl/image_metadata.cc new file mode 100644 index 0000000000..20b0d6f95a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/image_metadata.cc @@ -0,0 +1,472 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/image_metadata.h" + +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { +BitDepth::BitDepth() { Bundle::Init(this); } +Status BitDepth::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &floating_point_sample)); + // The same fields (bits_per_sample and exponent_bits_per_sample) are read + // in a different way depending on floating_point_sample's value. It's still + // default-initialized correctly so using visitor->Conditional is not + // required. + if (!floating_point_sample) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val(8), Val(10), Val(12), BitsOffset(6, 1), 8, &bits_per_sample)); + exponent_bits_per_sample = 0; + } else { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val(32), Val(16), Val(24), BitsOffset(6, 1), 32, &bits_per_sample)); + // The encoded value is exponent_bits_per_sample - 1, encoded in 3 bits + // so the value can be in range [1, 8]. + const uint32_t offset = 1; + exponent_bits_per_sample -= offset; + JXL_QUIET_RETURN_IF_ERROR( + visitor->Bits(4, 8 - offset, &exponent_bits_per_sample)); + exponent_bits_per_sample += offset; + } + + // Error-checking for floating point ranges. + if (floating_point_sample) { + if (exponent_bits_per_sample < 2 || exponent_bits_per_sample > 8) { + return JXL_FAILURE("Invalid exponent_bits_per_sample: %u", + exponent_bits_per_sample); + } + int mantissa_bits = + static_cast(bits_per_sample) - exponent_bits_per_sample - 1; + if (mantissa_bits < 2 || mantissa_bits > 23) { + return JXL_FAILURE("Invalid bits_per_sample: %u", bits_per_sample); + } + } else { + if (bits_per_sample > 31) { + return JXL_FAILURE("Invalid bits_per_sample: %u", bits_per_sample); + } + } + return true; +} + +std::string BitDepth::DebugString() const { + std::ostringstream os; + os << (floating_point_sample ? "F" : "U"); + os << bits_per_sample; + if (floating_point_sample) os << "." << exponent_bits_per_sample; + return os.str(); +} + +CustomTransformData::CustomTransformData() { Bundle::Init(this); } +Status CustomTransformData::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + if (visitor->Conditional(nonserialized_xyb_encoded)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&opsin_inverse_matrix)); + } + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &custom_weights_mask)); + if (visitor->Conditional((custom_weights_mask & 0x1) != 0)) { + // 4 5x5 kernels, but all of them can be obtained by symmetry from one, + // which is symmetric along its main diagonal. The top-left kernel is + // defined by + // + // 0 1 2 3 4 + // 1 5 6 7 8 + // 2 6 9 10 11 + // 3 7 10 12 13 + // 4 8 11 13 14 + float constexpr kWeights2[15] = { + -0.01716200f, -0.03452303f, -0.04022174f, -0.02921014f, -0.00624645f, + 0.14111091f, 0.28896755f, 0.00278718f, -0.01610267f, 0.56661550f, + 0.03777607f, -0.01986694f, -0.03144731f, -0.01185068f, -0.00213539f}; + for (size_t i = 0; i < 15; i++) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kWeights2[i], &upsampling2_weights[i])); + } + } + if (visitor->Conditional((custom_weights_mask & 0x2) != 0)) { + // 16 5x5 kernels, but all of them can be obtained by symmetry from + // three, two of which are symmetric along their main diagonals. The top + // left 4 kernels are defined by + // + // 0 1 2 3 4 5 6 7 8 9 + // 1 10 11 12 13 14 15 16 17 18 + // 2 11 19 20 21 22 23 24 25 26 + // 3 12 20 27 28 29 30 31 32 33 + // 4 13 21 28 34 35 36 37 38 39 + // + // 5 14 22 29 35 40 41 42 43 44 + // 6 15 23 30 36 41 45 46 47 48 + // 7 16 24 31 37 42 46 49 50 51 + // 8 17 25 32 38 43 47 50 52 53 + // 9 18 26 33 39 44 48 51 53 54 + constexpr float kWeights4[55] = { + -0.02419067f, -0.03491987f, -0.03693351f, -0.03094285f, -0.00529785f, + -0.01663432f, -0.03556863f, -0.03888905f, -0.03516850f, -0.00989469f, + 0.23651958f, 0.33392945f, -0.01073543f, -0.01313181f, -0.03556694f, + 0.13048175f, 0.40103025f, 0.03951150f, -0.02077584f, 0.46914198f, + -0.00209270f, -0.01484589f, -0.04064806f, 0.18942530f, 0.56279892f, + 0.06674400f, -0.02335494f, -0.03551682f, -0.00754830f, -0.02267919f, + -0.02363578f, 0.00315804f, -0.03399098f, -0.01359519f, -0.00091653f, + -0.00335467f, -0.01163294f, -0.01610294f, -0.00974088f, -0.00191622f, + -0.01095446f, -0.03198464f, -0.04455121f, -0.02799790f, -0.00645912f, + 0.06390599f, 0.22963888f, 0.00630981f, -0.01897349f, 0.67537268f, + 0.08483369f, -0.02534994f, -0.02205197f, -0.01667999f, -0.00384443f}; + for (size_t i = 0; i < 55; i++) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kWeights4[i], &upsampling4_weights[i])); + } + } + if (visitor->Conditional((custom_weights_mask & 0x4) != 0)) { + // 64 5x5 kernels, all of them can be obtained by symmetry from + // 10, 4 of which are symmetric along their main diagonals. The top + // left 16 kernels are defined by + // 0 1 2 3 4 5 6 7 8 9 a b c d e f 10 11 12 13 + // 1 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 + // 2 15 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 38 + // 3 16 28 39 3a 3b 3c 3d 3e 3f 40 41 42 43 44 45 46 47 48 49 + // 4 17 29 3a 4a 4b 4c 4d 4e 4f 50 51 52 53 54 55 56 57 58 59 + + // 5 18 2a 3b 4b 5a 5b 5c 5d 5e 5f 60 61 62 63 64 65 66 67 68 + // 6 19 2b 3c 4c 5b 69 6a 6b 6c 6d 6e 6f 70 71 72 73 74 75 76 + // 7 1a 2c 3d 4d 5c 6a 77 78 79 7a 7b 7c 7d 7e 7f 80 81 82 83 + // 8 1b 2d 3e 4e 5d 6b 78 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f + // 9 1c 2e 3f 4f 5e 6c 79 85 90 91 92 93 94 95 96 97 98 99 9a + + // a 1d 2f 40 50 5f 6d 7a 86 91 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 + // b 1e 30 41 51 60 6e 7b 87 92 9c a5 a6 a7 a8 a9 aa ab ac ad + // c 1f 31 42 52 61 6f 7c 88 93 9d a6 ae af b0 b1 b2 b3 b4 b5 + // d 20 32 43 53 62 70 7d 89 94 9e a7 af b6 b7 b8 b9 ba bb bc + // e 21 33 44 54 63 71 7e 8a 95 9f a8 b0 b7 bd be bf c0 c1 c2 + + // f 22 34 45 55 64 72 7f 8b 96 a0 a9 b1 b8 be c3 c4 c5 c6 c7 + // 10 23 35 46 56 65 73 80 8c 97 a1 aa b2 b9 bf c4 c8 c9 ca cb + // 11 24 36 47 57 66 74 81 8d 98 a2 ab b3 ba c0 c5 c9 cc cd ce + // 12 25 37 48 58 67 75 82 8e 99 a3 ac b4 bb c1 c6 ca cd cf d0 + // 13 26 38 49 59 68 76 83 8f 9a a4 ad b5 bc c2 c7 cb ce d0 d1 + constexpr float kWeights8[210] = { + -0.02928613f, -0.03706353f, -0.03783812f, -0.03324558f, -0.00447632f, + -0.02519406f, -0.03752601f, -0.03901508f, -0.03663285f, -0.00646649f, + -0.02066407f, -0.03838633f, -0.04002101f, -0.03900035f, -0.00901973f, + -0.01626393f, -0.03954148f, -0.04046620f, -0.03979621f, -0.01224485f, + 0.29895328f, 0.35757708f, -0.02447552f, -0.01081748f, -0.04314594f, + 0.23903219f, 0.41119301f, -0.00573046f, -0.01450239f, -0.04246845f, + 0.17567618f, 0.45220643f, 0.02287757f, -0.01936783f, -0.03583255f, + 0.11572472f, 0.47416733f, 0.06284440f, -0.02685066f, 0.42720050f, + -0.02248939f, -0.01155273f, -0.04562755f, 0.28689496f, 0.49093869f, + -0.00007891f, -0.01545926f, -0.04562659f, 0.21238920f, 0.53980934f, + 0.03369474f, -0.02070211f, -0.03866988f, 0.14229550f, 0.56593398f, + 0.08045181f, -0.02888298f, -0.03680918f, -0.00542229f, -0.02920477f, + -0.02788574f, -0.02118180f, -0.03942402f, -0.00775547f, -0.02433614f, + -0.03193943f, -0.02030828f, -0.04044014f, -0.01074016f, -0.01930822f, + -0.03620399f, -0.01974125f, -0.03919545f, -0.01456093f, -0.00045072f, + -0.00360110f, -0.01020207f, -0.01231907f, -0.00638988f, -0.00071592f, + -0.00279122f, -0.00957115f, -0.01288327f, -0.00730937f, -0.00107783f, + -0.00210156f, -0.00890705f, -0.01317668f, -0.00813895f, -0.00153491f, + -0.02128481f, -0.04173044f, -0.04831487f, -0.03293190f, -0.00525260f, + -0.01720322f, -0.04052736f, -0.05045706f, -0.03607317f, -0.00738030f, + -0.01341764f, -0.03965629f, -0.05151616f, -0.03814886f, -0.01005819f, + 0.18968273f, 0.33063684f, -0.01300105f, -0.01372950f, -0.04017465f, + 0.13727832f, 0.36402234f, 0.01027890f, -0.01832107f, -0.03365072f, + 0.08734506f, 0.38194295f, 0.04338228f, -0.02525993f, 0.56408126f, + 0.00458352f, -0.01648227f, -0.04887868f, 0.24585519f, 0.62026135f, + 0.04314807f, -0.02213737f, -0.04158014f, 0.16637289f, 0.65027023f, + 0.09621636f, -0.03101388f, -0.04082742f, -0.00904519f, -0.02790922f, + -0.02117818f, 0.00798662f, -0.03995711f, -0.01243427f, -0.02231705f, + -0.02946266f, 0.00992055f, -0.03600283f, -0.01684920f, -0.00111684f, + -0.00411204f, -0.01297130f, -0.01723725f, -0.01022545f, -0.00165306f, + -0.00313110f, -0.01218016f, -0.01763266f, -0.01125620f, -0.00231663f, + -0.01374149f, -0.03797620f, -0.05142937f, -0.03117307f, -0.00581914f, + -0.01064003f, -0.03608089f, -0.05272168f, -0.03375670f, -0.00795586f, + 0.09628104f, 0.27129991f, -0.00353779f, -0.01734151f, -0.03153981f, + 0.05686230f, 0.28500998f, 0.02230594f, -0.02374955f, 0.68214326f, + 0.05018048f, -0.02320852f, -0.04383616f, 0.18459474f, 0.71517975f, + 0.10805613f, -0.03263677f, -0.03637639f, -0.01394373f, -0.02511203f, + -0.01728636f, 0.05407331f, -0.02867568f, -0.01893131f, -0.00240854f, + -0.00446511f, -0.01636187f, -0.02377053f, -0.01522848f, -0.00333334f, + -0.00819975f, -0.02964169f, -0.04499287f, -0.02745350f, -0.00612408f, + 0.02727416f, 0.19446600f, 0.00159832f, -0.02232473f, 0.74982506f, + 0.11452620f, -0.03348048f, -0.01605681f, -0.02070339f, -0.00458223f}; + for (size_t i = 0; i < 210; i++) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kWeights8[i], &upsampling8_weights[i])); + } + } + return true; +} + +ExtraChannelInfo::ExtraChannelInfo() { Bundle::Init(this); } +Status ExtraChannelInfo::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + + // General + JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(ExtraChannel::kAlpha, &type)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&bit_depth)); + + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(3), Val(4), BitsOffset(3, 1), 0, &dim_shift)); + if ((1U << dim_shift) > 8) { + return JXL_FAILURE("dim_shift %u too large", dim_shift); + } + + JXL_QUIET_RETURN_IF_ERROR(VisitNameString(visitor, &name)); + + // Conditional + if (visitor->Conditional(type == ExtraChannel::kAlpha)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &alpha_associated)); + } + if (visitor->Conditional(type == ExtraChannel::kSpotColor)) { + for (float& c : spot_color) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0, &c)); + } + } + if (visitor->Conditional(type == ExtraChannel::kCFA)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(1), Bits(2), BitsOffset(4, 3), + BitsOffset(8, 19), 1, &cfa_channel)); + } + + if (type == ExtraChannel::kUnknown || + (int(ExtraChannel::kReserved0) <= int(type) && + int(type) <= int(ExtraChannel::kReserved7))) { + return JXL_FAILURE("Unknown extra channel (bits %u, shift %u, name '%s')\n", + bit_depth.bits_per_sample, dim_shift, name.c_str()); + } + return true; +} + +std::string ExtraChannelInfo::DebugString() const { + std::ostringstream os; + os << (type == ExtraChannel::kAlpha ? "Alpha" + : type == ExtraChannel::kDepth ? "Depth" + : type == ExtraChannel::kSpotColor ? "Spot" + : type == ExtraChannel::kSelectionMask ? "Mask" + : type == ExtraChannel::kBlack ? "Black" + : type == ExtraChannel::kCFA ? "CFA" + : type == ExtraChannel::kThermal ? "Thermal" + : "Unknown"); + if (type == ExtraChannel::kAlpha && alpha_associated) os << "(premul)"; + os << " " << bit_depth.DebugString(); + os << " shift: " << dim_shift; + return os.str(); +} + +ImageMetadata::ImageMetadata() { Bundle::Init(this); } +Status ImageMetadata::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + + // Bundle::AllDefault does not allow usage when reading (it may abort the + // program when a codestream has invalid values), but when reading we + // overwrite the extra_fields value, so do not need to call AllDefault. + bool tone_mapping_default = + visitor->IsReading() ? false : Bundle::AllDefault(tone_mapping); + + bool extra_fields = (orientation != 1 || have_preview || have_animation || + have_intrinsic_size || !tone_mapping_default); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &extra_fields)); + if (visitor->Conditional(extra_fields)) { + orientation--; + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &orientation)); + orientation++; + // (No need for bounds checking because we read exactly 3 bits) + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_intrinsic_size)); + if (visitor->Conditional(have_intrinsic_size)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&intrinsic_size)); + } + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_preview)); + if (visitor->Conditional(have_preview)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&preview_size)); + } + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_animation)); + if (visitor->Conditional(have_animation)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&animation)); + } + } else { + orientation = 1; // identity + have_intrinsic_size = false; + have_preview = false; + have_animation = false; + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&bit_depth)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->Bool(true, &modular_16_bit_buffer_sufficient)); + + num_extra_channels = extra_channel_info.size(); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(4, 2), + BitsOffset(12, 1), 0, + &num_extra_channels)); + + if (visitor->Conditional(num_extra_channels != 0)) { + if (visitor->IsReading()) { + extra_channel_info.resize(num_extra_channels); + } + for (ExtraChannelInfo& eci : extra_channel_info) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&eci)); + } + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &xyb_encoded)); + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&color_encoding)); + if (visitor->Conditional(extra_fields)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&tone_mapping)); + } + + // Treat as if only the fields up to extra channels exist. + if (visitor->IsReading() && nonserialized_only_parse_basic_info) { + return true; + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions)); + // Extensions: in chronological order of being added to the format. + return visitor->EndExtensions(); +} + +OpsinInverseMatrix::OpsinInverseMatrix() { Bundle::Init(this); } +Status OpsinInverseMatrix::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + for (int i = 0; i < 9; ++i) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16( + DefaultInverseOpsinAbsorbanceMatrix()[i], &inverse_matrix[i])); + } + for (int i = 0; i < 3; ++i) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kNegOpsinAbsorbanceBiasRGB[i], &opsin_biases[i])); + } + for (int i = 0; i < 4; ++i) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kDefaultQuantBias[i], &quant_biases[i])); + } + return true; +} + +ToneMapping::ToneMapping() { Bundle::Init(this); } +Status ToneMapping::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kDefaultIntensityTarget, &intensity_target)); + if (intensity_target <= 0.f) { + return JXL_FAILURE("invalid intensity target"); + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.0f, &min_nits)); + if (min_nits < 0.f || min_nits > intensity_target) { + return JXL_FAILURE("invalid min %f vs max %f", min_nits, intensity_target); + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &relative_to_max_display)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.0f, &linear_below)); + if (linear_below < 0 || (relative_to_max_display && linear_below > 1.0f)) { + return JXL_FAILURE("invalid linear_below %f (%s)", linear_below, + relative_to_max_display ? "relative" : "absolute"); + } + + return true; +} + +Status ReadImageMetadata(BitReader* JXL_RESTRICT reader, + ImageMetadata* JXL_RESTRICT metadata) { + return Bundle::Read(reader, metadata); +} + +void ImageMetadata::SetAlphaBits(uint32_t bits, bool alpha_is_premultiplied) { + std::vector& eciv = extra_channel_info; + ExtraChannelInfo* alpha = Find(ExtraChannel::kAlpha); + if (bits == 0) { + if (alpha != nullptr) { + // Remove the alpha channel from the extra channel info. It's + // theoretically possible that there are multiple, remove all in that + // case. This ensure a next HasAlpha() will return false. + const auto is_alpha = [](const ExtraChannelInfo& eci) { + return eci.type == ExtraChannel::kAlpha; + }; + eciv.erase(std::remove_if(eciv.begin(), eciv.end(), is_alpha), + eciv.end()); + } + } else { + if (alpha == nullptr) { + ExtraChannelInfo info; + info.type = ExtraChannel::kAlpha; + info.bit_depth.bits_per_sample = bits; + info.dim_shift = 0; + info.alpha_associated = alpha_is_premultiplied; + // Prepend rather than append: in case there already are other extra + // channels, prefer alpha channel to be listed first. + eciv.insert(eciv.begin(), info); + } else { + // Ignores potential extra alpha channels, only sets to first one. + alpha->bit_depth.bits_per_sample = bits; + alpha->bit_depth.floating_point_sample = false; + alpha->bit_depth.exponent_bits_per_sample = 0; + alpha->alpha_associated = alpha_is_premultiplied; + } + } + num_extra_channels = extra_channel_info.size(); + if (bits > 12) modular_16_bit_buffer_sufficient = false; +} + +std::string ImageMetadata::DebugString() const { + std::ostringstream os; + os << bit_depth.DebugString(); + if (modular_16_bit_buffer_sufficient) { + os << " (modular 16)"; + } + os << (xyb_encoded ? " xyb encoded" : " orig profile"); + os << " " << Description(color_encoding); + if (num_extra_channels > 0) { + os << " extra channels:"; + for (size_t i = 0; i < num_extra_channels; ++i) { + os << " (" << extra_channel_info[i].DebugString() << ")"; + if (i + 1 < num_extra_channels) os << ","; + } + } + if (have_preview) { + os << " preview: " << preview_size.xsize() << "x" << preview_size.ysize(); + } + if (orientation != 1) { + os << " orientation: " << orientation; + } + return os.str(); +} + +std::string CodecMetadata::DebugString() const { + std::ostringstream os; + os << size.xsize() << "x" << size.ysize(); + os << " " << m.DebugString(); + return os.str(); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/image_metadata.h b/third_party/jpeg-xl/lib/jxl/image_metadata.h new file mode 100644 index 0000000000..ca69eb3a3d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/image_metadata.h @@ -0,0 +1,425 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Main codestream header bundles, the metadata that applies to all frames. +// Enums must align with the C API definitions in codestream_header.h. + +#ifndef LIB_JXL_IMAGE_METADATA_H_ +#define LIB_JXL_IMAGE_METADATA_H_ + +#include +#include +#include + +#include +#include + +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/jpeg/jpeg_data.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { + +struct AuxOut; + +// EXIF orientation of the image. This field overrides any field present in +// actual EXIF metadata. The value tells which transformation the decoder must +// apply after decoding to display the image with the correct orientation. +enum class Orientation : uint32_t { + // Values 1..8 match the EXIF definitions. + kIdentity = JXL_ORIENT_IDENTITY, + kFlipHorizontal = JXL_ORIENT_FLIP_HORIZONTAL, + kRotate180 = JXL_ORIENT_ROTATE_180, + kFlipVertical = JXL_ORIENT_FLIP_VERTICAL, + kTranspose = JXL_ORIENT_TRANSPOSE, + kRotate90 = JXL_ORIENT_ROTATE_90_CW, + kAntiTranspose = JXL_ORIENT_ANTI_TRANSPOSE, + kRotate270 = JXL_ORIENT_ROTATE_90_CCW, +}; +// Don't need an EnumBits because Orientation is not read via Enum(). + +enum class ExtraChannel : uint32_t { + // First two enumerators (most common) are cheaper to encode + kAlpha = JXL_CHANNEL_ALPHA, + kDepth = JXL_CHANNEL_DEPTH, + + kSpotColor = JXL_CHANNEL_SPOT_COLOR, + kSelectionMask = JXL_CHANNEL_SELECTION_MASK, + kBlack = JXL_CHANNEL_BLACK, // for CMYK + kCFA = JXL_CHANNEL_CFA, // Bayer channel + kThermal = JXL_CHANNEL_THERMAL, + kReserved0 = JXL_CHANNEL_RESERVED0, + kReserved1 = JXL_CHANNEL_RESERVED1, + kReserved2 = JXL_CHANNEL_RESERVED2, + kReserved3 = JXL_CHANNEL_RESERVED3, + kReserved4 = JXL_CHANNEL_RESERVED4, + kReserved5 = JXL_CHANNEL_RESERVED5, + kReserved6 = JXL_CHANNEL_RESERVED6, + kReserved7 = JXL_CHANNEL_RESERVED7, + // disambiguated via name string, raise warning if unsupported + kUnknown = JXL_CHANNEL_UNKNOWN, + // like kUnknown but can silently be ignored + kOptional = JXL_CHANNEL_OPTIONAL +}; +static inline const char* EnumName(ExtraChannel /*unused*/) { + return "ExtraChannel"; +} +static inline constexpr uint64_t EnumBits(ExtraChannel /*unused*/) { + using EC = ExtraChannel; + return MakeBit(EC::kAlpha) | MakeBit(EC::kDepth) | MakeBit(EC::kSpotColor) | + MakeBit(EC::kSelectionMask) | MakeBit(EC::kBlack) | MakeBit(EC::kCFA) | + MakeBit(EC::kThermal) | MakeBit(EC::kUnknown) | MakeBit(EC::kOptional); +} + +// Used in ImageMetadata and ExtraChannelInfo. +struct BitDepth : public Fields { + BitDepth(); + JXL_FIELDS_NAME(BitDepth) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + std::string DebugString() const; + + // Whether the original (uncompressed) samples are floating point or + // unsigned integer. + bool floating_point_sample; + + // Bit depth of the original (uncompressed) image samples. Must be in the + // range [1, 32]. + uint32_t bits_per_sample; + + // Floating point exponent bits of the original (uncompressed) image samples, + // only used if floating_point_sample is true. + // If used, the samples are floating point with: + // - 1 sign bit + // - exponent_bits_per_sample exponent bits + // - (bits_per_sample - exponent_bits_per_sample - 1) mantissa bits + // If used, exponent_bits_per_sample must be in the range + // [2, 8] and amount of mantissa bits must be in the range [2, 23]. + // NOTE: exponent_bits_per_sample is 8 for single precision binary32 + // point, 5 for half precision binary16, 7 for fp24. + uint32_t exponent_bits_per_sample; +}; + +// Describes one extra channel. +struct ExtraChannelInfo : public Fields { + ExtraChannelInfo(); + JXL_FIELDS_NAME(ExtraChannelInfo) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + std::string DebugString() const; + + mutable bool all_default; + + ExtraChannel type; + BitDepth bit_depth; + uint32_t dim_shift; // downsampled by 2^dim_shift on each axis + + std::string name; // UTF-8 + + // Conditional: + bool alpha_associated; // i.e. premultiplied + float spot_color[4]; // spot color in linear RGBA + uint32_t cfa_channel; +}; + +struct OpsinInverseMatrix : public Fields { + OpsinInverseMatrix(); + JXL_FIELDS_NAME(OpsinInverseMatrix) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + mutable bool all_default; + + float inverse_matrix[9]; + float opsin_biases[3]; + float quant_biases[4]; +}; + +// Information useful for mapping HDR images to lower dynamic range displays. +struct ToneMapping : public Fields { + ToneMapping(); + JXL_FIELDS_NAME(ToneMapping) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + mutable bool all_default; + + // Upper bound on the intensity level present in the image. For unsigned + // integer pixel encodings, this is the brightness of the largest + // representable value. The image does not necessarily contain a pixel + // actually this bright. An encoder is allowed to set 255 for SDR images + // without computing a histogram. + float intensity_target; // [nits] + + // Lower bound on the intensity level present in the image. This may be + // loose, i.e. lower than the actual darkest pixel. When tone mapping, a + // decoder will map [min_nits, intensity_target] to the display range. + float min_nits; + + bool relative_to_max_display; // see below + // The tone mapping will leave unchanged (linear mapping) any pixels whose + // brightness is strictly below this. The interpretation depends on + // relative_to_max_display. If true, this is a ratio [0, 1] of the maximum + // display brightness [nits], otherwise an absolute brightness [nits]. + float linear_below; +}; + +// Contains weights to customize some trasnforms - in particular, XYB and +// upsampling. +struct CustomTransformData : public Fields { + CustomTransformData(); + JXL_FIELDS_NAME(CustomTransformData) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Must be set before calling VisitFields. Must equal xyb_encoded of + // ImageMetadata, should be set by ImageMetadata during VisitFields. + bool nonserialized_xyb_encoded = false; + + mutable bool all_default; + + OpsinInverseMatrix opsin_inverse_matrix; + + uint32_t custom_weights_mask; + float upsampling2_weights[15]; + float upsampling4_weights[55]; + float upsampling8_weights[210]; +}; + +// Properties of the original image bundle. This enables Encode(Decode()) to +// re-create an equivalent image without user input. +struct ImageMetadata : public Fields { + ImageMetadata(); + JXL_FIELDS_NAME(ImageMetadata) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Returns bit depth of the JPEG XL compressed alpha channel, or 0 if no alpha + // channel present. In the theoretical case that there are multiple alpha + // channels, returns the bit depht of the first. + uint32_t GetAlphaBits() const { + const ExtraChannelInfo* alpha = Find(ExtraChannel::kAlpha); + if (alpha == nullptr) return 0; + JXL_ASSERT(alpha->bit_depth.bits_per_sample != 0); + return alpha->bit_depth.bits_per_sample; + } + + // Sets bit depth of alpha channel, adding extra channel if needed, or + // removing all alpha channels if bits is 0. + // Assumes integer alpha channel and not designed to support multiple + // alpha channels (it's possible to use those features by manipulating + // extra_channel_info directly). + // + // Callers must insert the actual channel image at the same index before any + // further modifications to extra_channel_info. + void SetAlphaBits(uint32_t bits, bool alpha_is_premultiplied = false); + + bool HasAlpha() const { return GetAlphaBits() != 0; } + + // Sets the original bit depth fields to indicate unsigned integer of the + // given bit depth. + // TODO(lode): move function to BitDepth + void SetUintSamples(uint32_t bits) { + bit_depth.bits_per_sample = bits; + bit_depth.exponent_bits_per_sample = 0; + bit_depth.floating_point_sample = false; + // RCT / Squeeze may add one bit each, and this is about int16_t, + // so uint13 should still be OK but limiting it to 12 seems safer. + // TODO(jon): figure out a better way to set this header field. + // (in particular, if modular mode is not used it doesn't matter, + // and if transforms are restricted, up to 15-bit could be done) + if (bits > 12) modular_16_bit_buffer_sufficient = false; + } + // Sets the original bit depth fields to indicate single precision floating + // point. + // TODO(lode): move function to BitDepth + void SetFloat32Samples() { + bit_depth.bits_per_sample = 32; + bit_depth.exponent_bits_per_sample = 8; + bit_depth.floating_point_sample = true; + modular_16_bit_buffer_sufficient = false; + } + + void SetFloat16Samples() { + bit_depth.bits_per_sample = 16; + bit_depth.exponent_bits_per_sample = 5; + bit_depth.floating_point_sample = true; + modular_16_bit_buffer_sufficient = false; + } + + void SetIntensityTarget(float intensity_target) { + tone_mapping.intensity_target = intensity_target; + } + float IntensityTarget() const { + JXL_ASSERT(tone_mapping.intensity_target != 0); + return tone_mapping.intensity_target; + } + + // Returns first ExtraChannelInfo of the given type, or nullptr if none. + const ExtraChannelInfo* Find(ExtraChannel type) const { + for (const ExtraChannelInfo& eci : extra_channel_info) { + if (eci.type == type) return &eci; + } + return nullptr; + } + + // Returns first ExtraChannelInfo of the given type, or nullptr if none. + ExtraChannelInfo* Find(ExtraChannel type) { + for (ExtraChannelInfo& eci : extra_channel_info) { + if (eci.type == type) return &eci; + } + return nullptr; + } + + Orientation GetOrientation() const { + return static_cast(orientation); + } + + bool ExtraFieldsDefault() const; + + std::string DebugString() const; + + mutable bool all_default; + + BitDepth bit_depth; + bool modular_16_bit_buffer_sufficient; // otherwise 32 is. + + // Whether the colors values of the pixels of frames are encoded in the + // codestream using the absolute XYB color space, or the using values that + // follow the color space defined by the ColorEncoding or ICC profile. This + // determines when or whether a CMS (Color Management System) is needed to get + // the pixels in a desired color space. In one case, the pixels have one known + // color space and a CMS is needed to convert them to the original image's + // color space, in the other case the pixels have the color space of the + // original image and a CMS is required if a different display space, or a + // single known consistent color space for multiple decoded images, is + // desired. In all cases, the color space of all frames from a single image is + // the same, both VarDCT and modular frames. + // + // If true: then frames can be decoded to XYB (which can also be converted to + // linear and non-linear sRGB with the built in conversion without CMS). The + // attached ColorEncoding or ICC profile has no effect on the meaning of the + // pixel's color values, but instead indicates what the color profile of the + // original image was, and what color profile one should convert to when + // decoding to integers to prevent clipping and precision loss. To do that + // conversion requires a CMS. + // + // If false: then the color values of decoded frames are in the space defined + // by the attached ColorEncoding or ICC profile. To instead get the pixels in + // a chosen known color space, such as sRGB, requires a CMS, since the + // attached ColorEncoding or ICC profile could be any arbitrary color space. + // This mode is typically used for lossless images encoded as integers. + // Frames can also use YCbCr encoding, some frames may and some may not, but + // this is not a different color space but a certain encoding of the RGB + // values. + // + // Note: if !xyb_encoded, but the attached color profile indicates XYB (which + // can happen either if it's a ColorEncoding with color_space_ == + // ColorSpace::kXYB, or if it's an ICC Profile that has been crafted to + // represent XYB), then the frames still may not use ColorEncoding kXYB, they + // must still use kNone (or kYCbCr, which would mean applying the YCbCr + // transform to the 3-channel XYB data), since with !xyb_encoded, the 3 + // channels are stored as-is, no matter what meaning the color profile assigns + // to them. To use ColorEncoding::kXYB, xyb_encoded must be true. + // + // This value is defined in image metadata because this is the global + // codestream header. This value does not affect the image itself, so is not + // image metadata per se, it only affects the encoding, and what color space + // the decoder can receive the pixels in without needing a CMS. + bool xyb_encoded; + + ColorEncoding color_encoding; + + // These values are initialized to defaults such that the 'extra_fields' + // condition in VisitFields uses correctly initialized values. + uint32_t orientation = 1; + bool have_preview = false; + bool have_animation = false; + bool have_intrinsic_size = false; + + // If present, the stored image has the dimensions of the first SizeHeader, + // but decoders are advised to resample or display per `intrinsic_size`. + SizeHeader intrinsic_size; // only if have_intrinsic_size + + ToneMapping tone_mapping; + + // When reading: deserialized. When writing: automatically set from vector. + uint32_t num_extra_channels; + std::vector extra_channel_info; + + // Only present if m.have_preview. + PreviewHeader preview_size; + // Only present if m.have_animation. + AnimationHeader animation; + + uint64_t extensions; + + // Option to stop parsing after basic info, and treat as if the later + // fields do not participate. Use to parse only basic image information + // excluding the final larger or variable sized data. + bool nonserialized_only_parse_basic_info = false; +}; + +Status ReadImageMetadata(BitReader* JXL_RESTRICT reader, + ImageMetadata* JXL_RESTRICT metadata); + +Status WriteImageMetadata(const ImageMetadata& metadata, + BitWriter* JXL_RESTRICT writer, size_t layer, + AuxOut* aux_out); + +// All metadata applicable to the entire codestream (dimensions, extra channels, +// ...) +struct CodecMetadata { + // TODO(lode): use the preview and animation fields too, in place of the + // nonserialized_ ones in ImageMetadata. + ImageMetadata m; + // The size of the codestream: this is the nominal size applicable to all + // frames, although some frames can have a different effective size through + // crop, dc_level or representing a the preview. + SizeHeader size; + // Often default. + CustomTransformData transform_data; + + size_t xsize() const { return size.xsize(); } + size_t ysize() const { return size.ysize(); } + size_t oriented_xsize(bool keep_orientation) const { + if (static_cast(m.GetOrientation()) > 4 && !keep_orientation) { + return ysize(); + } else { + return xsize(); + } + } + size_t oriented_preview_xsize(bool keep_orientation) const { + if (static_cast(m.GetOrientation()) > 4 && !keep_orientation) { + return m.preview_size.ysize(); + } else { + return m.preview_size.xsize(); + } + } + size_t oriented_ysize(bool keep_orientation) const { + if (static_cast(m.GetOrientation()) > 4 && !keep_orientation) { + return xsize(); + } else { + return ysize(); + } + } + size_t oriented_preview_ysize(bool keep_orientation) const { + if (static_cast(m.GetOrientation()) > 4 && !keep_orientation) { + return m.preview_size.xsize(); + } else { + return m.preview_size.ysize(); + } + } + + std::string DebugString() const; +}; + +} // namespace jxl + +#endif // LIB_JXL_IMAGE_METADATA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/image_ops.h b/third_party/jpeg-xl/lib/jxl/image_ops.h new file mode 100644 index 0000000000..c025007e95 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/image_ops.h @@ -0,0 +1,805 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_IMAGE_OPS_H_ +#define LIB_JXL_IMAGE_OPS_H_ + +// Operations on images. + +#include +#include +#include +#include + +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" + +namespace jxl { + +template +void CopyImageTo(const Plane& from, Plane* JXL_RESTRICT to) { + PROFILER_ZONE("CopyImage1"); + JXL_ASSERT(SameSize(from, *to)); + if (from.ysize() == 0 || from.xsize() == 0) return; + for (size_t y = 0; y < from.ysize(); ++y) { + const T* JXL_RESTRICT row_from = from.ConstRow(y); + T* JXL_RESTRICT row_to = to->Row(y); + memcpy(row_to, row_from, from.xsize() * sizeof(T)); + } +} + +// DEPRECATED - prefer to preallocate result. +template +Plane CopyImage(const Plane& from) { + Plane to(from.xsize(), from.ysize()); + CopyImageTo(from, &to); + return to; +} + +// Copies `from:rect_from` to `to:rect_to`. +template +void CopyImageTo(const Rect& rect_from, const Plane& from, + const Rect& rect_to, Plane* JXL_RESTRICT to) { + PROFILER_ZONE("CopyImageR"); + JXL_DASSERT(SameSize(rect_from, rect_to)); + JXL_DASSERT(rect_from.IsInside(from)); + JXL_DASSERT(rect_to.IsInside(*to)); + if (rect_from.xsize() == 0) return; + for (size_t y = 0; y < rect_from.ysize(); ++y) { + const T* JXL_RESTRICT row_from = rect_from.ConstRow(from, y); + T* JXL_RESTRICT row_to = rect_to.Row(to, y); + memcpy(row_to, row_from, rect_from.xsize() * sizeof(T)); + } +} + +// DEPRECATED - Returns a copy of the "image" pixels that lie in "rect". +template +Plane CopyImage(const Rect& rect, const Plane& image) { + Plane copy(rect.xsize(), rect.ysize()); + CopyImageTo(rect, image, ©); + return copy; +} + +// Copies `from:rect_from` to `to:rect_to`. +template +void CopyImageTo(const Rect& rect_from, const Image3& from, + const Rect& rect_to, Image3* JXL_RESTRICT to) { + PROFILER_ZONE("CopyImageR"); + JXL_ASSERT(SameSize(rect_from, rect_to)); + for (size_t c = 0; c < 3; c++) { + CopyImageTo(rect_from, from.Plane(c), rect_to, &to->Plane(c)); + } +} + +template +void ConvertPlaneAndClamp(const Rect& rect_from, const Plane& from, + const Rect& rect_to, Plane* JXL_RESTRICT to) { + PROFILER_ZONE("ConvertPlane"); + JXL_ASSERT(SameSize(rect_from, rect_to)); + using M = decltype(T() + U()); + for (size_t y = 0; y < rect_to.ysize(); ++y) { + const T* JXL_RESTRICT row_from = rect_from.ConstRow(from, y); + U* JXL_RESTRICT row_to = rect_to.Row(to, y); + for (size_t x = 0; x < rect_to.xsize(); ++x) { + row_to[x] = + std::min(std::max(row_from[x], std::numeric_limits::min()), + std::numeric_limits::max()); + } + } +} + +// Copies `from` to `to`. +template +void CopyImageTo(const T& from, T* JXL_RESTRICT to) { + return CopyImageTo(Rect(from), from, Rect(*to), to); +} + +// Copies `from:rect_from` to `to`. +template +void CopyImageTo(const Rect& rect_from, const T& from, T* JXL_RESTRICT to) { + return CopyImageTo(rect_from, from, Rect(*to), to); +} + +// Copies `from` to `to:rect_to`. +template +void CopyImageTo(const T& from, const Rect& rect_to, T* JXL_RESTRICT to) { + return CopyImageTo(Rect(from), from, rect_to, to); +} + +// Copies `from:rect_from` to `to:rect_to`; also copies `padding` pixels of +// border around `from:rect_from`, in all directions, whenever they are inside +// the first image. +template +void CopyImageToWithPadding(const Rect& from_rect, const T& from, + size_t padding, const Rect& to_rect, T* to) { + size_t xextra0 = std::min(padding, from_rect.x0()); + size_t xextra1 = + std::min(padding, from.xsize() - from_rect.x0() - from_rect.xsize()); + size_t yextra0 = std::min(padding, from_rect.y0()); + size_t yextra1 = + std::min(padding, from.ysize() - from_rect.y0() - from_rect.ysize()); + JXL_DASSERT(to_rect.x0() >= xextra0); + JXL_DASSERT(to_rect.y0() >= yextra0); + + return CopyImageTo(Rect(from_rect.x0() - xextra0, from_rect.y0() - yextra0, + from_rect.xsize() + xextra0 + xextra1, + from_rect.ysize() + yextra0 + yextra1), + from, + Rect(to_rect.x0() - xextra0, to_rect.y0() - yextra0, + to_rect.xsize() + xextra0 + xextra1, + to_rect.ysize() + yextra0 + yextra1), + to); +} + +// DEPRECATED - prefer to preallocate result. +template +Image3 CopyImage(const Image3& from) { + Image3 copy(from.xsize(), from.ysize()); + CopyImageTo(from, ©); + return copy; +} + +// DEPRECATED - prefer to preallocate result. +template +Image3 CopyImage(const Rect& rect, const Image3& from) { + Image3 to(rect.xsize(), rect.ysize()); + CopyImageTo(rect, from.Plane(0), to.Plane(0)); + CopyImageTo(rect, from.Plane(1), to.Plane(1)); + CopyImageTo(rect, from.Plane(2), to.Plane(2)); + return to; +} + +// Sets "thickness" pixels on each border to "value". This is faster than +// initializing the entire image and overwriting valid/interior pixels. +template +void SetBorder(const size_t thickness, const T value, Image3* image) { + const size_t xsize = image->xsize(); + const size_t ysize = image->ysize(); + // Top: fill entire row + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < std::min(thickness, ysize); ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + std::fill(row, row + xsize, value); + } + + // Bottom: fill entire row + for (size_t y = ysize - thickness; y < ysize; ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + std::fill(row, row + xsize, value); + } + + // Left/right: fill the 'columns' on either side, but only if the image is + // big enough that they don't already belong to the top/bottom rows. + if (ysize >= 2 * thickness) { + for (size_t y = thickness; y < ysize - thickness; ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + std::fill(row, row + thickness, value); + std::fill(row + xsize - thickness, row + xsize, value); + } + } + } +} + +template +void Subtract(const ImageIn& image1, const ImageIn& image2, ImageOut* out) { + using T = typename ImageIn::T; + const size_t xsize = image1.xsize(); + const size_t ysize = image1.ysize(); + JXL_CHECK(xsize == image2.xsize()); + JXL_CHECK(ysize == image2.ysize()); + + for (size_t y = 0; y < ysize; ++y) { + const T* const JXL_RESTRICT row1 = image1.Row(y); + const T* const JXL_RESTRICT row2 = image2.Row(y); + T* const JXL_RESTRICT row_out = out->Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row1[x] - row2[x]; + } + } +} + +// In-place. +template +void SubtractFrom(const Plane& what, Plane* to) { + const size_t xsize = what.xsize(); + const size_t ysize = what.ysize(); + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = what.ConstRow(y); + Tout* JXL_RESTRICT row_to = to->Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] -= row_what[x]; + } + } +} + +// In-place. +template +void AddTo(const Plane& what, Plane* to) { + const size_t xsize = what.xsize(); + const size_t ysize = what.ysize(); + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = what.ConstRow(y); + Tout* JXL_RESTRICT row_to = to->Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] += row_what[x]; + } + } +} + +template +void AddTo(Rect rectFrom, const Plane& what, Rect rectTo, + Plane* to) { + JXL_ASSERT(SameSize(rectFrom, rectTo)); + const size_t xsize = rectTo.xsize(); + const size_t ysize = rectTo.ysize(); + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = rectFrom.ConstRow(what, y); + Tout* JXL_RESTRICT row_to = rectTo.Row(to, y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] += row_what[x]; + } + } +} + +// Returns linear combination of two grayscale images. +template +Plane LinComb(const T lambda1, const Plane& image1, const T lambda2, + const Plane& image2) { + const size_t xsize = image1.xsize(); + const size_t ysize = image1.ysize(); + JXL_CHECK(xsize == image2.xsize()); + JXL_CHECK(ysize == image2.ysize()); + Plane out(xsize, ysize); + for (size_t y = 0; y < ysize; ++y) { + const T* const JXL_RESTRICT row1 = image1.Row(y); + const T* const JXL_RESTRICT row2 = image2.Row(y); + T* const JXL_RESTRICT row_out = out.Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = lambda1 * row1[x] + lambda2 * row2[x]; + } + } + return out; +} + +// Returns a pixel-by-pixel multiplication of image by lambda. +template +Plane ScaleImage(const T lambda, const Plane& image) { + Plane out(image.xsize(), image.ysize()); + for (size_t y = 0; y < image.ysize(); ++y) { + const T* const JXL_RESTRICT row = image.Row(y); + T* const JXL_RESTRICT row_out = out.Row(y); + for (size_t x = 0; x < image.xsize(); ++x) { + row_out[x] = lambda * row[x]; + } + } + return out; +} + +// Multiplies image by lambda in-place +template +void ScaleImage(const T lambda, Plane* image) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* const JXL_RESTRICT row = image->Row(y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = lambda * row[x]; + } + } +} + +template +Plane Product(const Plane& a, const Plane& b) { + Plane c(a.xsize(), a.ysize()); + for (size_t y = 0; y < a.ysize(); ++y) { + const T* const JXL_RESTRICT row_a = a.Row(y); + const T* const JXL_RESTRICT row_b = b.Row(y); + T* const JXL_RESTRICT row_c = c.Row(y); + for (size_t x = 0; x < a.xsize(); ++x) { + row_c[x] = row_a[x] * row_b[x]; + } + } + return c; +} + +template +void FillImage(const T value, Plane* image) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* const JXL_RESTRICT row = image->Row(y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = value; + } + } +} + +template +void ZeroFillImage(Plane* image) { + if (image->xsize() == 0) return; + for (size_t y = 0; y < image->ysize(); ++y) { + T* const JXL_RESTRICT row = image->Row(y); + memset(row, 0, image->xsize() * sizeof(T)); + } +} + +// Mirrors out of bounds coordinates and returns valid coordinates unchanged. +// We assume the radius (distance outside the image) is small compared to the +// image size, otherwise this might not terminate. +// The mirror is outside the last column (border pixel is also replicated). +static inline int64_t Mirror(int64_t x, const int64_t xsize) { + JXL_DASSERT(xsize != 0); + + // TODO(janwas): replace with branchless version + while (x < 0 || x >= xsize) { + if (x < 0) { + x = -x - 1; + } else { + x = 2 * xsize - 1 - x; + } + } + return x; +} + +// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size): + +// Mirrors (repeating the edge pixel once). Useful for convolutions. +struct WrapMirror { + JXL_INLINE int64_t operator()(const int64_t coord, const int64_t size) const { + return Mirror(coord, size); + } +}; + +// Returns the same coordinate: required for TFNode with Border(), or useful +// when we know "coord" is already valid (e.g. interior of an image). +struct WrapUnchanged { + JXL_INLINE int64_t operator()(const int64_t coord, int64_t /*size*/) const { + return coord; + } +}; + +// Similar to Wrap* but for row pointers (reduces Row() multiplications). + +class WrapRowMirror { + public: + template + WrapRowMirror(const ImageOrView& image, size_t ysize) + : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {} + + const float* operator()(const float* const JXL_RESTRICT row, + const int64_t stride) const { + if (row < first_row_) { + const int64_t num_before = first_row_ - row; + // Mirrored; one row before => row 0, two before = row 1, ... + return first_row_ + num_before - stride; + } + if (row > last_row_) { + const int64_t num_after = row - last_row_; + // Mirrored; one row after => last row, two after = last - 1, ... + return last_row_ - num_after + stride; + } + return row; + } + + private: + const float* const JXL_RESTRICT first_row_; + const float* const JXL_RESTRICT last_row_; +}; + +struct WrapRowUnchanged { + JXL_INLINE const float* operator()(const float* const JXL_RESTRICT row, + int64_t /*stride*/) const { + return row; + } +}; + +// Sets "thickness" pixels on each border to "value". This is faster than +// initializing the entire image and overwriting valid/interior pixels. +template +void SetBorder(const size_t thickness, const T value, Plane* image) { + const size_t xsize = image->xsize(); + const size_t ysize = image->ysize(); + // Top: fill entire row + for (size_t y = 0; y < std::min(thickness, ysize); ++y) { + T* const JXL_RESTRICT row = image->Row(y); + std::fill(row, row + xsize, value); + } + + // Bottom: fill entire row + for (size_t y = ysize - thickness; y < ysize; ++y) { + T* const JXL_RESTRICT row = image->Row(y); + std::fill(row, row + xsize, value); + } + + // Left/right: fill the 'columns' on either side, but only if the image is + // big enough that they don't already belong to the top/bottom rows. + if (ysize >= 2 * thickness) { + for (size_t y = thickness; y < ysize - thickness; ++y) { + T* const JXL_RESTRICT row = image->Row(y); + std::fill(row, row + thickness, value); + std::fill(row + xsize - thickness, row + xsize, value); + } + } +} + +// Computes the minimum and maximum pixel value. +template +void ImageMinMax(const Plane& image, T* const JXL_RESTRICT min, + T* const JXL_RESTRICT max) { + *min = std::numeric_limits::max(); + *max = std::numeric_limits::lowest(); + for (size_t y = 0; y < image.ysize(); ++y) { + const T* const JXL_RESTRICT row = image.Row(y); + for (size_t x = 0; x < image.xsize(); ++x) { + *min = std::min(*min, row[x]); + *max = std::max(*max, row[x]); + } + } +} + +// Copies pixels, scaling their value relative to the "from" min/max by +// "to_range". Example: U8 [0, 255] := [0.0, 1.0], to_range = 1.0 => +// outputs [0.0, 1.0]. +template +void ImageConvert(const Plane& from, const float to_range, + Plane* const JXL_RESTRICT to) { + JXL_ASSERT(SameSize(from, *to)); + FromType min_from, max_from; + ImageMinMax(from, &min_from, &max_from); + const float scale = to_range / (max_from - min_from); + for (size_t y = 0; y < from.ysize(); ++y) { + const FromType* const JXL_RESTRICT row_from = from.Row(y); + ToType* const JXL_RESTRICT row_to = to->Row(y); + for (size_t x = 0; x < from.xsize(); ++x) { + row_to[x] = static_cast((row_from[x] - min_from) * scale); + } + } +} + +template +Plane ConvertToFloat(const Plane& from) { + float factor = 1.0f / std::numeric_limits::max(); + if (std::is_same::value || std::is_same::value) { + factor = 1.0f; + } + Plane to(from.xsize(), from.ysize()); + for (size_t y = 0; y < from.ysize(); ++y) { + const From* const JXL_RESTRICT row_from = from.Row(y); + float* const JXL_RESTRICT row_to = to.Row(y); + for (size_t x = 0; x < from.xsize(); ++x) { + row_to[x] = row_from[x] * factor; + } + } + return to; +} + +template +Plane ImageFromPacked(const std::vector& packed, const size_t xsize, + const size_t ysize) { + Plane out(xsize, ysize); + for (size_t y = 0; y < ysize; ++y) { + T* const JXL_RESTRICT row = out.Row(y); + const T* const JXL_RESTRICT packed_row = &packed[y * xsize]; + memcpy(row, packed_row, xsize * sizeof(T)); + } + return out; +} + +// Computes independent minimum and maximum values for each plane. +template +void Image3MinMax(const Image3& image, const Rect& rect, + std::array* out_min, std::array* out_max) { + for (size_t c = 0; c < 3; ++c) { + T min = std::numeric_limits::max(); + T max = std::numeric_limits::min(); + for (size_t y = 0; y < rect.ysize(); ++y) { + const T* JXL_RESTRICT row = rect.ConstPlaneRow(image, c, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + min = std::min(min, row[x]); + max = std::max(max, row[x]); + } + } + (*out_min)[c] = min; + (*out_max)[c] = max; + } +} + +// Computes independent minimum and maximum values for each plane. +template +void Image3MinMax(const Image3& image, std::array* out_min, + std::array* out_max) { + Image3MinMax(image, Rect(image), out_min, out_max); +} + +template +void Image3Max(const Image3& image, std::array* out_max) { + for (size_t c = 0; c < 3; ++c) { + T max = std::numeric_limits::min(); + for (size_t y = 0; y < image.ysize(); ++y) { + const T* JXL_RESTRICT row = image.ConstPlaneRow(c, y); + for (size_t x = 0; x < image.xsize(); ++x) { + max = std::max(max, row[x]); + } + } + (*out_max)[c] = max; + } +} + +// Computes the sum of the pixels in `rect`. +template +T ImageSum(const Plane& image, const Rect& rect) { + T result = 0; + for (size_t y = 0; y < rect.ysize(); ++y) { + const T* JXL_RESTRICT row = rect.ConstRow(image, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + result += row[x]; + } + } + return result; +} + +template +T ImageSum(const Plane& image) { + return ImageSum(image, Rect(image)); +} + +template +std::array Image3Sum(const Image3& image, const Rect& rect) { + std::array out_sum = 0; + for (size_t c = 0; c < 3; ++c) { + (out_sum)[c] = ImageSum(image.Plane(c), rect); + } + return out_sum; +} + +template +std::array Image3Sum(const Image3& image) { + return Image3Sum(image, Rect(image)); +} + +template +std::vector PackedFromImage(const Plane& image, const Rect& rect) { + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + std::vector packed(xsize * ysize); + for (size_t y = 0; y < rect.ysize(); ++y) { + memcpy(&packed[y * xsize], rect.ConstRow(image, y), xsize * sizeof(T)); + } + return packed; +} + +template +std::vector PackedFromImage(const Plane& image) { + return PackedFromImage(image, Rect(image)); +} + +// Computes the median pixel value. +template +T ImageMedian(const Plane& image, const Rect& rect) { + std::vector pixels = PackedFromImage(image, rect); + return Median(&pixels); +} + +template +T ImageMedian(const Plane& image) { + return ImageMedian(image, Rect(image)); +} + +template +std::array Image3Median(const Image3& image, const Rect& rect) { + std::array out_median; + for (size_t c = 0; c < 3; ++c) { + (out_median)[c] = ImageMedian(image.Plane(c), rect); + } + return out_median; +} + +template +std::array Image3Median(const Image3& image) { + return Image3Median(image, Rect(image)); +} + +template +void Image3Convert(const Image3& from, const float to_range, + Image3* const JXL_RESTRICT to) { + JXL_ASSERT(SameSize(from, *to)); + std::array min_from, max_from; + Image3MinMax(from, &min_from, &max_from); + float scales[3]; + for (size_t c = 0; c < 3; ++c) { + scales[c] = to_range / (max_from[c] - min_from[c]); + } + float scale = std::min(scales[0], std::min(scales[1], scales[2])); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < from.ysize(); ++y) { + const FromType* JXL_RESTRICT row_from = from.ConstPlaneRow(c, y); + ToType* JXL_RESTRICT row_to = to->PlaneRow(c, y); + for (size_t x = 0; x < from.xsize(); ++x) { + const float to = (row_from[x] - min_from[c]) * scale; + row_to[x] = static_cast(to); + } + } + } +} + +template +Image3F ConvertToFloat(const Image3& from) { + return Image3F(ConvertToFloat(from.Plane(0)), ConvertToFloat(from.Plane(1)), + ConvertToFloat(from.Plane(2))); +} + +template +void Subtract(const Image3& image1, const Image3& image2, + Image3* out) { + const size_t xsize = image1.xsize(); + const size_t ysize = image1.ysize(); + JXL_CHECK(xsize == image2.xsize()); + JXL_CHECK(ysize == image2.ysize()); + + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ysize; ++y) { + const Tin* const JXL_RESTRICT row1 = image1.ConstPlaneRow(c, y); + const Tin* const JXL_RESTRICT row2 = image2.ConstPlaneRow(c, y); + Tout* const JXL_RESTRICT row_out = out->PlaneRow(c, y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row1[x] - row2[x]; + } + } + } +} + +template +void SubtractFrom(const Image3& what, Image3* to) { + const size_t xsize = what.xsize(); + const size_t ysize = what.ysize(); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y); + Tout* JXL_RESTRICT row_to = to->PlaneRow(c, y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] -= row_what[x]; + } + } + } +} + +template +void AddTo(const Image3& what, Image3* to) { + const size_t xsize = what.xsize(); + const size_t ysize = what.ysize(); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y); + Tout* JXL_RESTRICT row_to = to->PlaneRow(c, y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] += row_what[x]; + } + } + } +} + +// Adds `what` of the size of `rect` to `to` in the position of `rect`. +template +void AddTo(const Rect& rect, const Image3& what, Image3* to) { + const size_t xsize = what.xsize(); + const size_t ysize = what.ysize(); + JXL_ASSERT(xsize == rect.xsize()); + JXL_ASSERT(ysize == rect.ysize()); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y); + Tout* JXL_RESTRICT row_to = rect.PlaneRow(to, c, y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] += row_what[x]; + } + } + } +} + +template +Image3 ScaleImage(const T lambda, const Image3& image) { + Image3 out(image.xsize(), image.ysize()); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image.ysize(); ++y) { + const T* JXL_RESTRICT row = image.ConstPlaneRow(c, y); + T* JXL_RESTRICT row_out = out.PlaneRow(c, y); + for (size_t x = 0; x < image.xsize(); ++x) { + row_out[x] = lambda * row[x]; + } + } + } + return out; +} + +// Multiplies image by lambda in-place +template +void ScaleImage(const T lambda, Image3* image) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* const JXL_RESTRICT row = image->PlaneRow(c, y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = lambda * row[x]; + } + } + } +} + +// Initializes all planes to the same "value". +template +void FillImage(const T value, Image3* image) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = value; + } + } + } +} + +template +void FillPlane(const T value, Plane* image) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* JXL_RESTRICT row = image->Row(y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = value; + } + } +} + +template +void FillImage(const T value, Image3* image, Rect rect) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < rect.ysize(); ++y) { + T* JXL_RESTRICT row = rect.PlaneRow(image, c, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + row[x] = value; + } + } + } +} + +template +void FillPlane(const T value, Plane* image, Rect rect) { + for (size_t y = 0; y < rect.ysize(); ++y) { + T* JXL_RESTRICT row = rect.Row(image, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + row[x] = value; + } + } +} + +template +void ZeroFillImage(Image3* image) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + if (image->xsize() != 0) memset(row, 0, image->xsize() * sizeof(T)); + } + } +} + +template +void ZeroFillPlane(Plane* image, Rect rect) { + for (size_t y = 0; y < rect.ysize(); ++y) { + T* JXL_RESTRICT row = rect.Row(image, y); + memset(row, 0, rect.xsize() * sizeof(T)); + } +} + +// Pad an image with xborder columns on each vertical side and yboder rows +// above and below, mirroring the image. +Image3F PadImageMirror(const Image3F& in, size_t xborder, size_t yborder); + +// Same as above, but operates in-place. Assumes that the `in` image was +// allocated large enough. +void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in, + size_t block_dim = kBlockDim); + +// Downsamples an image by a given factor. +void DownsampleImage(Image3F* opsin, size_t factor); +void DownsampleImage(ImageF* image, size_t factor); + +} // namespace jxl + +#endif // LIB_JXL_IMAGE_OPS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/image_ops_test.cc b/third_party/jpeg-xl/lib/jxl/image_ops_test.cc new file mode 100644 index 0000000000..44c021513d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/image_ops_test.cc @@ -0,0 +1,164 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/image_ops.h" + +#include +#include +#include + +#include + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +template +void TestPacked(const size_t xsize, const size_t ysize) { + Plane image1(xsize, ysize); + RandomFillImage(&image1); + const std::vector& packed = PackedFromImage(image1); + const Plane& image2 = ImageFromPacked(packed, xsize, ysize); + JXL_EXPECT_OK(SamePixels(image1, image2, _)); +} + +TEST(ImageTest, TestPacked) { + TestPacked(1, 1); + TestPacked(7, 1); + TestPacked(1, 7); + + TestPacked(1, 1); + TestPacked(7, 1); + TestPacked(1, 7); + + TestPacked(1, 1); + TestPacked(7, 1); + TestPacked(1, 7); + + TestPacked(1, 1); + TestPacked(7, 1); + TestPacked(1, 7); +} + +// Ensure entire payload is readable/writable for various size/offset combos. +TEST(ImageTest, TestAllocator) { + Rng rng(0); + const size_t k32 = 32; + const size_t kAlign = CacheAligned::kAlignment; + for (size_t size : {k32 * 1, k32 * 2, k32 * 3, k32 * 4, k32 * 5, + CacheAligned::kAlias, 2 * CacheAligned::kAlias + 4}) { + for (size_t offset = 0; offset <= CacheAligned::kAlias; offset += kAlign) { + uint8_t* bytes = + static_cast(CacheAligned::Allocate(size, offset)); + JXL_CHECK(reinterpret_cast(bytes) % kAlign == 0); + // Ensure we can write/read the last byte. Use RNG to fool the compiler + // into thinking the write is necessary. + memset(bytes, 0, size); + bytes[size - 1] = 1; // greatest element + uint32_t pos = rng.UniformU(0, size - 1); // random but != greatest + JXL_CHECK(bytes[pos] < bytes[size - 1]); + + CacheAligned::Free(bytes); + } + } +} + +template +void TestFillImpl(Image3* img, const char* layout) { + FillImage(T(1), img); + for (size_t y = 0; y < img->ysize(); ++y) { + for (size_t c = 0; c < 3; ++c) { + T* JXL_RESTRICT row = img->PlaneRow(c, y); + for (size_t x = 0; x < img->xsize(); ++x) { + if (row[x] != T(1)) { + printf("Not 1 at c=%" PRIuS " %" PRIuS ", %" PRIuS " (%" PRIuS + " x %" PRIuS ") (%s)\n", + c, x, y, img->xsize(), img->ysize(), layout); + abort(); + } + row[x] = T(2); + } + } + } + + // Same for ZeroFillImage and swapped c/y loop ordering. + ZeroFillImage(img); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < img->ysize(); ++y) { + T* JXL_RESTRICT row = img->PlaneRow(c, y); + for (size_t x = 0; x < img->xsize(); ++x) { + if (row[x] != T(0)) { + printf("Not 0 at c=%" PRIuS " %" PRIuS ", %" PRIuS " (%" PRIuS + " x %" PRIuS ") (%s)\n", + c, x, y, img->xsize(), img->ysize(), layout); + abort(); + } + row[x] = T(3); + } + } + } +} + +template +void TestFillT() { + for (uint32_t xsize : {0, 1, 15, 16, 31, 32}) { + for (uint32_t ysize : {0, 1, 15, 16, 31, 32}) { + Image3 image(xsize, ysize); + TestFillImpl(&image, "size ctor"); + + Image3 planar(Plane(xsize, ysize), Plane(xsize, ysize), + Plane(xsize, ysize)); + TestFillImpl(&planar, "planar"); + } + } +} + +// Ensure y/c/x and c/y/x loops visit pixels no more than once. +TEST(ImageTest, TestFill) { + TestFillT(); + TestFillT(); + TestFillT(); + TestFillT(); +} + +TEST(ImageTest, CopyImageToWithPaddingTest) { + Plane src(100, 61); + for (size_t y = 0; y < src.ysize(); y++) { + for (size_t x = 0; x < src.xsize(); x++) { + src.Row(y)[x] = x * 1000 + y; + } + } + Rect src_rect(10, 20, 30, 40); + EXPECT_TRUE(src_rect.IsInside(src)); + + Plane dst(60, 50); + FillImage(0u, &dst); + Rect dst_rect(20, 5, 30, 40); + EXPECT_TRUE(dst_rect.IsInside(dst)); + + CopyImageToWithPadding(src_rect, src, /*padding=*/2, dst_rect, &dst); + + // ysize is + 3 instead of + 4 because we are at the y image boundary on the + // source image. + Rect padded_dst_rect(20 - 2, 5 - 2, 30 + 4, 40 + 3); + for (size_t y = 0; y < dst.ysize(); y++) { + for (size_t x = 0; x < dst.xsize(); x++) { + if (Rect(x, y, 1, 1).IsInside(padded_dst_rect)) { + EXPECT_EQ((x - dst_rect.x0() + src_rect.x0()) * 1000 + + (y - dst_rect.y0() + src_rect.y0()), + dst.Row(y)[x]); + } else { + EXPECT_EQ(0u, dst.Row(y)[x]); + } + } + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/image_test_utils.h b/third_party/jpeg-xl/lib/jxl/image_test_utils.h new file mode 100644 index 0000000000..e7d72285e6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/image_test_utils.h @@ -0,0 +1,257 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_IMAGE_TEST_UTILS_H_ +#define LIB_JXL_IMAGE_TEST_UTILS_H_ + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/random.h" +#include "lib/jxl/image.h" + +namespace jxl { + +template +bool SamePixels(const Plane& image1, const Plane& image2, + std::stringstream& failures) { + const Rect rect(image1); + JXL_CHECK(SameSize(image1, image2)); + size_t mismatches = 0; + for (size_t y = rect.y0(); y < rect.ysize(); ++y) { + const T* const JXL_RESTRICT row1 = image1.Row(y); + const T* const JXL_RESTRICT row2 = image2.Row(y); + for (size_t x = rect.x0(); x < rect.xsize(); ++x) { + if (row1[x] != row2[x]) { + failures << "pixel mismatch" << x << ", " << y << ": " + << double(row1[x]) << " != " << double(row2[x]) << "\n"; + if (++mismatches > 4) { + return false; + } + } + } + } + return mismatches == 0; +} + +template +bool SamePixels(const Image3& image1, const Image3& image2, + std::stringstream& failures) { + JXL_CHECK(SameSize(image1, image2)); + for (size_t c = 0; c < 3; ++c) { + if (!SamePixels(image1.Plane(c), image2.Plane(c), failures)) { + return false; + } + } + return true; +} + +// Use for floating-point images with fairly large numbers; tolerates small +// absolute errors and/or small relative errors. +template +bool VerifyRelativeError(const Plane& expected, const Plane& actual, + const double threshold_l1, + const double threshold_relative, + std::stringstream& failures, const intptr_t border = 0, + const size_t c = 0) { + JXL_CHECK(SameSize(expected, actual)); + const intptr_t xsize = expected.xsize(); + const intptr_t ysize = expected.ysize(); + + // Max over current scanline to give a better idea whether there are + // systematic errors or just one outlier. Invalid if negative. + double max_l1 = -1; + double max_relative = -1; + bool any_bad = false; + for (intptr_t y = border; y < ysize - border; ++y) { + const T* const JXL_RESTRICT row_expected = expected.Row(y); + const T* const JXL_RESTRICT row_actual = actual.Row(y); + for (intptr_t x = border; x < xsize - border; ++x) { + const double l1 = std::abs(row_expected[x] - row_actual[x]); + + // Cannot compute relative, only check/update L1. + if (std::abs(row_expected[x]) < 1E-10) { + if (l1 > threshold_l1) { + any_bad = true; + max_l1 = std::max(max_l1, l1); + } + } else { + const double relative = l1 / std::abs(double(row_expected[x])); + if (l1 > threshold_l1 && relative > threshold_relative) { + // Fails both tolerances => will exit below, update max_*. + any_bad = true; + max_l1 = std::max(max_l1, l1); + max_relative = std::max(max_relative, relative); + } + } + } + } + if (!any_bad) { + return true; + } + // Never had a valid relative value, don't print it. + if (max_relative < 0) { + fprintf(stderr, "c=%" PRIu64 ": max +/- %E exceeds +/- %.2E\n", + static_cast(c), max_l1, threshold_l1); + } else { + fprintf(stderr, + "c=%" PRIu64 ": max +/- %E, x %E exceeds +/- %.2E, x %.2E\n", + static_cast(c), max_l1, max_relative, threshold_l1, + threshold_relative); + } + // Dump the expected image and actual image if the region is small enough. + const intptr_t kMaxTestDumpSize = 16; + if (xsize <= kMaxTestDumpSize + 2 * border && + ysize <= kMaxTestDumpSize + 2 * border) { + fprintf(stderr, "Expected image:\n"); + for (intptr_t y = border; y < ysize - border; ++y) { + const T* const JXL_RESTRICT row_expected = expected.Row(y); + for (intptr_t x = border; x < xsize - border; ++x) { + fprintf(stderr, "%10lf ", static_cast(row_expected[x])); + } + fprintf(stderr, "\n"); + } + + fprintf(stderr, "Actual image:\n"); + for (intptr_t y = border; y < ysize - border; ++y) { + const T* const JXL_RESTRICT row_expected = expected.Row(y); + const T* const JXL_RESTRICT row_actual = actual.Row(y); + for (intptr_t x = border; x < xsize - border; ++x) { + const double l1 = std::abs(row_expected[x] - row_actual[x]); + + bool bad = l1 > threshold_l1; + if (row_expected[x] > 1E-10) { + const double relative = l1 / std::abs(double(row_expected[x])); + bad &= relative > threshold_relative; + } + if (bad) { + fprintf(stderr, "%10lf ", static_cast(row_actual[x])); + } else { + fprintf(stderr, "%10s ", "=="); + } + } + fprintf(stderr, "\n"); + } + } + + // Find first failing x for further debugging. + for (intptr_t y = border; y < ysize - border; ++y) { + const T* const JXL_RESTRICT row_expected = expected.Row(y); + const T* const JXL_RESTRICT row_actual = actual.Row(y); + + for (intptr_t x = border; x < xsize - border; ++x) { + const double l1 = std::abs(row_expected[x] - row_actual[x]); + + bool bad = l1 > threshold_l1; + if (row_expected[x] > 1E-10) { + const double relative = l1 / std::abs(double(row_expected[x])); + bad &= relative > threshold_relative; + } + if (bad) { + failures << x << ", " << y << " (" << expected.xsize() << " x " + << expected.ysize() << ") expected " + << static_cast(row_expected[x]) << " actual " + << static_cast(row_actual[x]); + return false; + } + } + } + return false; +} + +template +bool VerifyRelativeError(const Image3& expected, const Image3& actual, + const float threshold_l1, + const float threshold_relative, + std::stringstream& failures, + const intptr_t border = 0) { + for (size_t c = 0; c < 3; ++c) { + bool ok = + VerifyRelativeError(expected.Plane(c), actual.Plane(c), threshold_l1, + threshold_relative, failures, border, c); + if (!ok) { + return false; + } + } + return true; +} + +template +void GenerateImage(Rng& rng, Plane* image, U begin, U end) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* const JXL_RESTRICT row = image->Row(y); + for (size_t x = 0; x < image->xsize(); ++x) { + if (std::is_same::value || std::is_same::value) { + row[x] = rng.UniformF(begin, end); + } else if (std::is_signed::value) { + row[x] = rng.UniformI(begin, end); + } else { + row[x] = rng.UniformU(begin, end); + } + } + } +} + +template +void RandomFillImage(Plane* image, const T begin, const T end, + const int seed = 129) { + Rng rng(seed); + GenerateImage(rng, image, begin, end); +} + +template +typename std::enable_if::value>::type RandomFillImage( + Plane* image) { + Rng rng(129); + GenerateImage(rng, image, int64_t(0), + int64_t(std::numeric_limits::max()) + 1); +} + +JXL_INLINE void RandomFillImage(Plane* image) { + Rng rng(129); + GenerateImage(rng, image, 0.0f, std::numeric_limits::max()); +} + +template +void GenerateImage(Rng& rng, Image3* image, U begin, U end) { + for (size_t c = 0; c < 3; ++c) { + GenerateImage(rng, &image->Plane(c), begin, end); + } +} + +template +typename std::enable_if::value>::type RandomFillImage( + Image3* image) { + Rng rng(129); + GenerateImage(rng, image, int64_t(0), + int64_t(std::numeric_limits::max()) + 1); +} + +JXL_INLINE void RandomFillImage(Image3F* image) { + Rng rng(129); + GenerateImage(rng, image, 0.0f, std::numeric_limits::max()); +} + +template +void RandomFillImage(Image3* image, const U begin, const U end, + const int seed = 129) { + Rng rng(seed); + GenerateImage(rng, image, begin, end); +} + +} // namespace jxl + +#endif // LIB_JXL_IMAGE_TEST_UTILS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h b/third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h new file mode 100644 index 0000000000..fcb01d7396 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h @@ -0,0 +1,90 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// SIMDified inverse-move-to-front transform. + +#if defined(LIB_JXL_INVERSE_MTF_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_INVERSE_MTF_INL_H_ +#undef LIB_JXL_INVERSE_MTF_INL_H_ +#else +#define LIB_JXL_INVERSE_MTF_INL_H_ +#endif + +#include + +#include "lib/jxl/sanitizers.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::FirstN; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::Load; +using hwy::HWY_NAMESPACE::LoadU; +using hwy::HWY_NAMESPACE::StoreU; + +inline void MoveToFront(uint8_t* v, uint8_t index) { + uint8_t value = v[index]; + uint8_t i = index; + if (i < 4) { + for (; i; --i) v[i] = v[i - 1]; + } else { + const HWY_CAPPED(uint8_t, 64) d; + int tail = i & (Lanes(d) - 1); + if (tail) { + i -= tail; + const auto vec = Load(d, v + i); + const auto prev = LoadU(d, v + i + 1); + StoreU(IfThenElse(FirstN(d, tail), vec, prev), d, v + i + 1); + } + while (i) { + i -= Lanes(d); + const auto vec = Load(d, v + i); + StoreU(vec, d, v + i + 1); + } + } + v[0] = value; +} + +inline void InverseMoveToFrontTransform(uint8_t* v, int v_len) { + HWY_ALIGN uint8_t mtf[256 + 64]; + int i; + for (i = 0; i < 256; ++i) { + mtf[i] = static_cast(i); + } +#if JXL_MEMORY_SANITIZER + const HWY_CAPPED(uint8_t, 64) d; + for (size_t j = 0; j < Lanes(d); ++j) { + mtf[256 + j] = 0; + } +#endif // JXL_MEMORY_SANITIZER + for (i = 0; i < v_len; ++i) { + uint8_t index = v[i]; + v[i] = mtf[index]; + if (index) MoveToFront(mtf, index); + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_INVERSE_MTF_INL_H_ + +#if HWY_ONCE +#ifndef INVERSE_MTF_ONCE +#define INVERSE_MTF_ONCE + +namespace jxl { +inline void InverseMoveToFrontTransform(uint8_t* v, int v_len) { + return HWY_STATIC_DISPATCH(InverseMoveToFrontTransform)(v, v_len); +} +} // namespace jxl + +#endif // INVERSE_MTF_ONCE +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc new file mode 100644 index 0000000000..db49a1c215 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc @@ -0,0 +1,145 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/dec_jpeg_data.h" + +#include + +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/sanitizers.h" + +namespace jxl { +namespace jpeg { +Status DecodeJPEGData(Span encoded, JPEGData* jpeg_data) { + Status ret = true; + const uint8_t* in = encoded.data(); + size_t available_in = encoded.size(); + { + BitReader br(encoded); + BitReaderScopedCloser br_closer(&br, &ret); + JXL_RETURN_IF_ERROR(Bundle::Read(&br, jpeg_data)); + JXL_RETURN_IF_ERROR(br.JumpToByteBoundary()); + in += br.TotalBitsConsumed() / 8; + available_in -= br.TotalBitsConsumed() / 8; + } + JXL_RETURN_IF_ERROR(ret); + + BrotliDecoderState* brotli_dec = + BrotliDecoderCreateInstance(nullptr, nullptr, nullptr); + + struct BrotliDecDeleter { + BrotliDecoderState* brotli_dec; + ~BrotliDecDeleter() { BrotliDecoderDestroyInstance(brotli_dec); } + } brotli_dec_deleter{brotli_dec}; + + BrotliDecoderResult result = + BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS; + + auto br_read = [&](std::vector& data) -> Status { + size_t available_out = data.size(); + uint8_t* out = data.data(); + while (available_out != 0) { + if (BrotliDecoderIsFinished(brotli_dec)) { + return JXL_FAILURE("Not enough decompressed output"); + } + uint8_t* next_out_before = out; + size_t avail_out_before = available_out; + msan::MemoryIsInitialized(in, available_in); + result = BrotliDecoderDecompressStream(brotli_dec, &available_in, &in, + &available_out, &out, nullptr); + if (result != + BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT && + result != BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS) { + return JXL_FAILURE( + "Brotli decoding error: %s\n", + BrotliDecoderErrorString(BrotliDecoderGetErrorCode(brotli_dec))); + } + msan::UnpoisonMemory(next_out_before, avail_out_before - available_out); + } + return true; + }; + size_t num_icc = 0; + for (size_t i = 0; i < jpeg_data->app_data.size(); i++) { + auto& marker = jpeg_data->app_data[i]; + if (jpeg_data->app_marker_type[i] != AppMarkerType::kUnknown) { + // Set the size of the marker. + size_t size_minus_1 = marker.size() - 1; + marker[1] = size_minus_1 >> 8; + marker[2] = size_minus_1 & 0xFF; + if (jpeg_data->app_marker_type[i] == AppMarkerType::kICC) { + if (marker.size() < 17) { + return JXL_FAILURE("ICC markers must be at least 17 bytes"); + } + marker[0] = 0xE2; + memcpy(&marker[3], kIccProfileTag, sizeof kIccProfileTag); + marker[15] = ++num_icc; + } + } else { + JXL_RETURN_IF_ERROR(br_read(marker)); + if (marker[1] * 256u + marker[2] + 1u != marker.size()) { + return JXL_FAILURE("Incorrect marker size"); + } + } + } + for (size_t i = 0; i < jpeg_data->app_data.size(); i++) { + auto& marker = jpeg_data->app_data[i]; + if (jpeg_data->app_marker_type[i] == AppMarkerType::kICC) { + marker[16] = num_icc; + } + if (jpeg_data->app_marker_type[i] == AppMarkerType::kExif) { + marker[0] = 0xE1; + if (marker.size() < 3 + sizeof kExifTag) { + return JXL_FAILURE("Incorrect Exif marker size"); + } + memcpy(&marker[3], kExifTag, sizeof kExifTag); + } + if (jpeg_data->app_marker_type[i] == AppMarkerType::kXMP) { + marker[0] = 0xE1; + if (marker.size() < 3 + sizeof kXMPTag) { + return JXL_FAILURE("Incorrect XMP marker size"); + } + memcpy(&marker[3], kXMPTag, sizeof kXMPTag); + } + } + // TODO(eustas): actually inject ICC profile and check it fits perfectly. + for (size_t i = 0; i < jpeg_data->com_data.size(); i++) { + auto& marker = jpeg_data->com_data[i]; + JXL_RETURN_IF_ERROR(br_read(marker)); + if (marker[1] * 256u + marker[2] + 1u != marker.size()) { + return JXL_FAILURE("Incorrect marker size"); + } + } + for (size_t i = 0; i < jpeg_data->inter_marker_data.size(); i++) { + JXL_RETURN_IF_ERROR(br_read(jpeg_data->inter_marker_data[i])); + } + JXL_RETURN_IF_ERROR(br_read(jpeg_data->tail_data)); + + // Check if there is more decompressed output. + size_t available_out = 1; + uint64_t dummy; + uint8_t* next_out = reinterpret_cast(&dummy); + result = BrotliDecoderDecompressStream(brotli_dec, &available_in, &in, + &available_out, &next_out, nullptr); + if (available_out == 0 || + result == BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) { + return JXL_FAILURE("Excess data in compressed stream"); + } + if (result == BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT) { + return JXL_FAILURE("Incomplete brotli-stream"); + } + if (!BrotliDecoderIsFinished(brotli_dec) || + result != BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS) { + return JXL_FAILURE("Corrupted brotli-stream"); + } + if (available_in != 0) { + return JXL_FAILURE("Unused data after brotli stream"); + } + + return true; +} +} // namespace jpeg +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h new file mode 100644 index 0000000000..b9d50bf9f8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h @@ -0,0 +1,19 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_JPEG_DEC_JPEG_DATA_H_ +#define LIB_JXL_JPEG_DEC_JPEG_DATA_H_ + +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { +Status DecodeJPEGData(Span encoded, JPEGData* jpeg_data); +} +} // namespace jxl + +#endif // LIB_JXL_JPEG_DEC_JPEG_DATA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc new file mode 100644 index 0000000000..f9ae755789 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc @@ -0,0 +1,1050 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/dec_jpeg_data_writer.h" + +#include +#include /* for memset, memcpy */ + +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/jpeg/dec_jpeg_serialization_state.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +namespace { + +enum struct SerializationStatus { + NEEDS_MORE_INPUT, + NEEDS_MORE_OUTPUT, + ERROR, + DONE +}; + +const int kJpegPrecision = 8; + +// JpegBitWriter: buffer size +const size_t kJpegBitWriterChunkSize = 16384; + +// DCTCodingState: maximum number of correction bits to buffer +const int kJPEGMaxCorrectionBits = 1u << 16; + +// Returns non-zero if and only if x has a zero byte, i.e. one of +// x & 0xff, x & 0xff00, ..., x & 0xff00000000000000 is zero. +static JXL_INLINE uint64_t HasZeroByte(uint64_t x) { + return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL; +} + +void JpegBitWriterInit(JpegBitWriter* bw, + std::deque* output_queue) { + bw->output = output_queue; + bw->chunk = OutputChunk(kJpegBitWriterChunkSize); + bw->pos = 0; + bw->put_buffer = 0; + bw->put_bits = 64; + bw->healthy = true; + bw->data = bw->chunk.buffer->data(); +} + +static JXL_NOINLINE void SwapBuffer(JpegBitWriter* bw) { + bw->chunk.len = bw->pos; + bw->output->emplace_back(std::move(bw->chunk)); + bw->chunk = OutputChunk(kJpegBitWriterChunkSize); + bw->data = bw->chunk.buffer->data(); + bw->pos = 0; +} + +static JXL_INLINE void Reserve(JpegBitWriter* bw, size_t n_bytes) { + if (JXL_UNLIKELY((bw->pos + n_bytes) > kJpegBitWriterChunkSize)) { + SwapBuffer(bw); + } +} + +/** + * Writes the given byte to the output, writes an extra zero if byte is 0xFF. + * + * This method is "careless" - caller must make sure that there is enough + * space in the output buffer. Emits up to 2 bytes to buffer. + */ +static JXL_INLINE void EmitByte(JpegBitWriter* bw, int byte) { + bw->data[bw->pos++] = byte; + if (byte == 0xFF) bw->data[bw->pos++] = 0; +} + +static JXL_INLINE void DischargeBitBuffer(JpegBitWriter* bw) { + // At this point we are ready to emit the most significant 6 bytes of + // put_buffer_ to the output. + // The JPEG format requires that after every 0xff byte in the entropy + // coded section, there is a zero byte, therefore we first check if any of + // the 6 most significant bytes of put_buffer_ is 0xFF. + Reserve(bw, 12); + if (HasZeroByte(~bw->put_buffer | 0xFFFF)) { + // We have a 0xFF byte somewhere, examine each byte and append a zero + // byte if necessary. + EmitByte(bw, (bw->put_buffer >> 56) & 0xFF); + EmitByte(bw, (bw->put_buffer >> 48) & 0xFF); + EmitByte(bw, (bw->put_buffer >> 40) & 0xFF); + EmitByte(bw, (bw->put_buffer >> 32) & 0xFF); + EmitByte(bw, (bw->put_buffer >> 24) & 0xFF); + EmitByte(bw, (bw->put_buffer >> 16) & 0xFF); + } else { + // We don't have any 0xFF bytes, output all 6 bytes without checking. + bw->data[bw->pos] = (bw->put_buffer >> 56) & 0xFF; + bw->data[bw->pos + 1] = (bw->put_buffer >> 48) & 0xFF; + bw->data[bw->pos + 2] = (bw->put_buffer >> 40) & 0xFF; + bw->data[bw->pos + 3] = (bw->put_buffer >> 32) & 0xFF; + bw->data[bw->pos + 4] = (bw->put_buffer >> 24) & 0xFF; + bw->data[bw->pos + 5] = (bw->put_buffer >> 16) & 0xFF; + bw->pos += 6; + } + bw->put_buffer <<= 48; + bw->put_bits += 48; +} + +static JXL_INLINE void WriteBits(JpegBitWriter* bw, int nbits, uint64_t bits) { + // This is an optimization; if everything goes well, + // then |nbits| is positive; if non-existing Huffman symbol is going to be + // encoded, its length should be zero; later encoder could check the + // "health" of JpegBitWriter. + if (nbits == 0) { + bw->healthy = false; + return; + } + bw->put_bits -= nbits; + bw->put_buffer |= (bits << bw->put_bits); + if (bw->put_bits <= 16) DischargeBitBuffer(bw); +} + +void EmitMarker(JpegBitWriter* bw, int marker) { + Reserve(bw, 2); + JXL_DASSERT(marker != 0xFF); + bw->data[bw->pos++] = 0xFF; + bw->data[bw->pos++] = marker; +} + +bool JumpToByteBoundary(JpegBitWriter* bw, const uint8_t** pad_bits, + const uint8_t* pad_bits_end) { + size_t n_bits = bw->put_bits & 7u; + uint8_t pad_pattern; + if (*pad_bits == nullptr) { + pad_pattern = (1u << n_bits) - 1; + } else { + pad_pattern = 0; + const uint8_t* src = *pad_bits; + // TODO(eustas): bitwise reading looks insanely ineffective... + while (n_bits--) { + pad_pattern <<= 1; + if (src >= pad_bits_end) return false; + // TODO(eustas): DCHECK *src == {0, 1} + pad_pattern |= !!*(src++); + } + *pad_bits = src; + } + + Reserve(bw, 16); + + while (bw->put_bits <= 56) { + int c = (bw->put_buffer >> 56) & 0xFF; + EmitByte(bw, c); + bw->put_buffer <<= 8; + bw->put_bits += 8; + } + if (bw->put_bits < 64) { + int pad_mask = 0xFFu >> (64 - bw->put_bits); + int c = ((bw->put_buffer >> 56) & ~pad_mask) | pad_pattern; + EmitByte(bw, c); + } + bw->put_buffer = 0; + bw->put_bits = 64; + + return true; +} + +void JpegBitWriterFinish(JpegBitWriter* bw) { + if (bw->pos == 0) return; + bw->chunk.len = bw->pos; + bw->output->emplace_back(std::move(bw->chunk)); + bw->chunk = OutputChunk(nullptr, 0); + bw->data = nullptr; + bw->pos = 0; +} + +void DCTCodingStateInit(DCTCodingState* s) { + s->eob_run_ = 0; + s->cur_ac_huff_ = nullptr; + s->refinement_bits_.clear(); + s->refinement_bits_.reserve(kJPEGMaxCorrectionBits); +} + +enum OutputModes { + kModeHistogram, + kModeWrite, +}; + +template +static JXL_INLINE void WriteSymbol(int symbol, HuffmanCodeTable* table, + JpegBitWriter* bw) { + if (kOutputMode == OutputModes::kModeHistogram) { + ++table->depth[symbol]; + } else { + WriteBits(bw, table->depth[symbol], table->code[symbol]); + } +} + +// Emit all buffered data to the bit stream using the given Huffman code and +// bit writer. +template +static JXL_INLINE void Flush(DCTCodingState* s, JpegBitWriter* bw) { + if (s->eob_run_ > 0) { + int nbits = FloorLog2Nonzero(s->eob_run_); + int symbol = nbits << 4u; + WriteSymbol(symbol, s->cur_ac_huff_, bw); + if (nbits > 0) { + WriteBits(bw, nbits, s->eob_run_ & ((1 << nbits) - 1)); + } + s->eob_run_ = 0; + } + for (size_t i = 0; i < s->refinement_bits_.size(); ++i) { + WriteBits(bw, 1, s->refinement_bits_[i]); + } + s->refinement_bits_.clear(); +} + +// Buffer some more data at the end-of-band (the last non-zero or newly +// non-zero coefficient within the [Ss, Se] spectral band). +template +static JXL_INLINE void BufferEndOfBand(DCTCodingState* s, + HuffmanCodeTable* ac_huff, + const std::vector* new_bits, + JpegBitWriter* bw) { + if (s->eob_run_ == 0) { + s->cur_ac_huff_ = ac_huff; + } + ++s->eob_run_; + if (new_bits) { + s->refinement_bits_.insert(s->refinement_bits_.end(), new_bits->begin(), + new_bits->end()); + } + if (s->eob_run_ == 0x7FFF || + s->refinement_bits_.size() > kJPEGMaxCorrectionBits - kDCTBlockSize + 1) { + Flush(s, bw); + } +} + +bool BuildHuffmanCodeTable(const JPEGHuffmanCode& huff, + HuffmanCodeTable* table) { + int huff_code[kJpegHuffmanAlphabetSize]; + // +1 for a sentinel element. + uint32_t huff_size[kJpegHuffmanAlphabetSize + 1]; + int p = 0; + for (size_t l = 1; l <= kJpegHuffmanMaxBitLength; ++l) { + int i = huff.counts[l]; + if (p + i > kJpegHuffmanAlphabetSize + 1) { + return false; + } + while (i--) huff_size[p++] = l; + } + + if (p == 0) { + return true; + } + + // Reuse sentinel element. + int last_p = p - 1; + huff_size[last_p] = 0; + + int code = 0; + uint32_t si = huff_size[0]; + p = 0; + while (huff_size[p]) { + while ((huff_size[p]) == si) { + huff_code[p++] = code; + code++; + } + code <<= 1; + si++; + } + for (p = 0; p < last_p; p++) { + int i = huff.values[p]; + table->depth[i] = huff_size[p]; + table->code[i] = huff_code[p]; + } + return true; +} + +bool EncodeSOI(SerializationState* state) { + state->output_queue.push_back(OutputChunk({0xFF, 0xD8})); + return true; +} + +bool EncodeEOI(const JPEGData& jpg, SerializationState* state) { + state->output_queue.push_back(OutputChunk({0xFF, 0xD9})); + state->output_queue.emplace_back(jpg.tail_data); + return true; +} + +bool EncodeSOF(const JPEGData& jpg, uint8_t marker, SerializationState* state) { + if (marker <= 0xC2) state->is_progressive = (marker == 0xC2); + + const size_t n_comps = jpg.components.size(); + const size_t marker_len = 8 + 3 * n_comps; + state->output_queue.emplace_back(marker_len + 2); + uint8_t* data = state->output_queue.back().buffer->data(); + size_t pos = 0; + data[pos++] = 0xFF; + data[pos++] = marker; + data[pos++] = marker_len >> 8u; + data[pos++] = marker_len & 0xFFu; + data[pos++] = kJpegPrecision; + data[pos++] = jpg.height >> 8u; + data[pos++] = jpg.height & 0xFFu; + data[pos++] = jpg.width >> 8u; + data[pos++] = jpg.width & 0xFFu; + data[pos++] = n_comps; + for (size_t i = 0; i < n_comps; ++i) { + data[pos++] = jpg.components[i].id; + data[pos++] = ((jpg.components[i].h_samp_factor << 4u) | + (jpg.components[i].v_samp_factor)); + const size_t quant_idx = jpg.components[i].quant_idx; + if (quant_idx >= jpg.quant.size()) return false; + data[pos++] = jpg.quant[quant_idx].index; + } + return true; +} + +bool EncodeSOS(const JPEGData& jpg, const JPEGScanInfo& scan_info, + SerializationState* state) { + const size_t n_scans = scan_info.num_components; + const size_t marker_len = 6 + 2 * n_scans; + state->output_queue.emplace_back(marker_len + 2); + uint8_t* data = state->output_queue.back().buffer->data(); + size_t pos = 0; + data[pos++] = 0xFF; + data[pos++] = 0xDA; + data[pos++] = marker_len >> 8u; + data[pos++] = marker_len & 0xFFu; + data[pos++] = n_scans; + for (size_t i = 0; i < n_scans; ++i) { + const JPEGComponentScanInfo& si = scan_info.components[i]; + if (si.comp_idx >= jpg.components.size()) return false; + data[pos++] = jpg.components[si.comp_idx].id; + data[pos++] = (si.dc_tbl_idx << 4u) + si.ac_tbl_idx; + } + data[pos++] = scan_info.Ss; + data[pos++] = scan_info.Se; + data[pos++] = ((scan_info.Ah << 4u) | (scan_info.Al)); + return true; +} + +bool EncodeDHT(const JPEGData& jpg, SerializationState* state) { + const std::vector& huffman_code = jpg.huffman_code; + + size_t marker_len = 2; + for (size_t i = state->dht_index; i < huffman_code.size(); ++i) { + const JPEGHuffmanCode& huff = huffman_code[i]; + marker_len += kJpegHuffmanMaxBitLength; + for (size_t j = 0; j < huff.counts.size(); ++j) { + marker_len += huff.counts[j]; + } + if (huff.is_last) break; + } + state->output_queue.emplace_back(marker_len + 2); + uint8_t* data = state->output_queue.back().buffer->data(); + size_t pos = 0; + data[pos++] = 0xFF; + data[pos++] = 0xC4; + data[pos++] = marker_len >> 8u; + data[pos++] = marker_len & 0xFFu; + while (true) { + const size_t huffman_code_index = state->dht_index++; + if (huffman_code_index >= huffman_code.size()) { + return false; + } + const JPEGHuffmanCode& huff = huffman_code[huffman_code_index]; + size_t index = huff.slot_id; + HuffmanCodeTable* huff_table; + if (index & 0x10) { + index -= 0x10; + huff_table = &state->ac_huff_table[index]; + } else { + huff_table = &state->dc_huff_table[index]; + } + // TODO(eustas): cache + // TODO(eustas): set up non-existing symbols + if (!BuildHuffmanCodeTable(huff, huff_table)) { + return false; + } + size_t total_count = 0; + size_t max_length = 0; + for (size_t i = 0; i < huff.counts.size(); ++i) { + if (huff.counts[i] != 0) { + max_length = i; + } + total_count += huff.counts[i]; + } + --total_count; + data[pos++] = huff.slot_id; + for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) { + data[pos++] = (i == max_length ? huff.counts[i] - 1 : huff.counts[i]); + } + for (size_t i = 0; i < total_count; ++i) { + data[pos++] = huff.values[i]; + } + if (huff.is_last) break; + } + return true; +} + +bool EncodeDQT(const JPEGData& jpg, SerializationState* state) { + int marker_len = 2; + for (size_t i = state->dqt_index; i < jpg.quant.size(); ++i) { + const JPEGQuantTable& table = jpg.quant[i]; + marker_len += 1 + (table.precision ? 2 : 1) * kDCTBlockSize; + if (table.is_last) break; + } + state->output_queue.emplace_back(marker_len + 2); + uint8_t* data = state->output_queue.back().buffer->data(); + size_t pos = 0; + data[pos++] = 0xFF; + data[pos++] = 0xDB; + data[pos++] = marker_len >> 8u; + data[pos++] = marker_len & 0xFFu; + while (true) { + const size_t idx = state->dqt_index++; + if (idx >= jpg.quant.size()) { + return false; // corrupt input + } + const JPEGQuantTable& table = jpg.quant[idx]; + data[pos++] = (table.precision << 4u) + table.index; + for (size_t i = 0; i < kDCTBlockSize; ++i) { + int val_idx = kJPEGNaturalOrder[i]; + int val = table.values[val_idx]; + if (table.precision) { + data[pos++] = val >> 8u; + } + data[pos++] = val & 0xFFu; + } + if (table.is_last) break; + } + return true; +} + +bool EncodeDRI(const JPEGData& jpg, SerializationState* state) { + state->seen_dri_marker = true; + OutputChunk dri_marker = {0xFF, + 0xDD, + 0, + 4, + static_cast(jpg.restart_interval >> 8), + static_cast(jpg.restart_interval & 0xFF)}; + state->output_queue.push_back(std::move(dri_marker)); + return true; +} + +bool EncodeRestart(uint8_t marker, SerializationState* state) { + state->output_queue.push_back(OutputChunk({0xFF, marker})); + return true; +} + +bool EncodeAPP(const JPEGData& jpg, uint8_t marker, SerializationState* state) { + // TODO(eustas): check that marker corresponds to payload? + (void)marker; + + size_t app_index = state->app_index++; + if (app_index >= jpg.app_data.size()) return false; + state->output_queue.push_back(OutputChunk({0xFF})); + state->output_queue.emplace_back(jpg.app_data[app_index]); + return true; +} + +bool EncodeCOM(const JPEGData& jpg, SerializationState* state) { + size_t com_index = state->com_index++; + if (com_index >= jpg.com_data.size()) return false; + state->output_queue.push_back(OutputChunk({0xFF})); + state->output_queue.emplace_back(jpg.com_data[com_index]); + return true; +} + +bool EncodeInterMarkerData(const JPEGData& jpg, SerializationState* state) { + size_t index = state->data_index++; + if (index >= jpg.inter_marker_data.size()) return false; + state->output_queue.emplace_back(jpg.inter_marker_data[index]); + return true; +} + +template +bool EncodeDCTBlockSequential(const coeff_t* coeffs, HuffmanCodeTable* dc_huff, + HuffmanCodeTable* ac_huff, int num_zero_runs, + coeff_t* last_dc_coeff, JpegBitWriter* bw) { + coeff_t temp2; + coeff_t temp; + temp2 = coeffs[0]; + temp = temp2 - *last_dc_coeff; + *last_dc_coeff = temp2; + temp2 = temp; + if (temp < 0) { + temp = -temp; + if (temp < 0) return false; + temp2--; + } + int dc_nbits = (temp == 0) ? 0 : (FloorLog2Nonzero(temp) + 1); + WriteSymbol(dc_nbits, dc_huff, bw); + if (dc_nbits >= 12) return false; + if (dc_nbits > 0) { + WriteBits(bw, dc_nbits, temp2 & ((1u << dc_nbits) - 1)); + } + int r = 0; + for (int k = 1; k < 64; ++k) { + if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) { + r++; + continue; + } + if (temp < 0) { + temp = -temp; + if (temp < 0) return false; + temp2 = ~temp; + } else { + temp2 = temp; + } + while (r > 15) { + WriteSymbol(0xf0, ac_huff, bw); + r -= 16; + } + int ac_nbits = FloorLog2Nonzero(temp) + 1; + if (ac_nbits >= 16) return false; + int symbol = (r << 4u) + ac_nbits; + WriteSymbol(symbol, ac_huff, bw); + WriteBits(bw, ac_nbits, temp2 & ((1 << ac_nbits) - 1)); + r = 0; + } + for (int i = 0; i < num_zero_runs; ++i) { + WriteSymbol(0xf0, ac_huff, bw); + r -= 16; + } + if (r > 0) { + WriteSymbol(0, ac_huff, bw); + } + return true; +} + +template +bool EncodeDCTBlockProgressive(const coeff_t* coeffs, HuffmanCodeTable* dc_huff, + HuffmanCodeTable* ac_huff, int Ss, int Se, + int Al, int num_zero_runs, + DCTCodingState* coding_state, + coeff_t* last_dc_coeff, JpegBitWriter* bw) { + bool eob_run_allowed = Ss > 0; + coeff_t temp2; + coeff_t temp; + if (Ss == 0) { + temp2 = coeffs[0] >> Al; + temp = temp2 - *last_dc_coeff; + *last_dc_coeff = temp2; + temp2 = temp; + if (temp < 0) { + temp = -temp; + if (temp < 0) return false; + temp2--; + } + int nbits = (temp == 0) ? 0 : (FloorLog2Nonzero(temp) + 1); + WriteSymbol(nbits, dc_huff, bw); + if (nbits > 0) { + WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1)); + } + ++Ss; + } + if (Ss > Se) { + return true; + } + int r = 0; + for (int k = Ss; k <= Se; ++k) { + if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) { + r++; + continue; + } + if (temp < 0) { + temp = -temp; + if (temp < 0) return false; + temp >>= Al; + temp2 = ~temp; + } else { + temp >>= Al; + temp2 = temp; + } + if (temp == 0) { + r++; + continue; + } + Flush(coding_state, bw); + while (r > 15) { + WriteSymbol(0xf0, ac_huff, bw); + r -= 16; + } + int nbits = FloorLog2Nonzero(temp) + 1; + int symbol = (r << 4u) + nbits; + WriteSymbol(symbol, ac_huff, bw); + WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1)); + r = 0; + } + if (num_zero_runs > 0) { + Flush(coding_state, bw); + for (int i = 0; i < num_zero_runs; ++i) { + WriteSymbol(0xf0, ac_huff, bw); + r -= 16; + } + } + if (r > 0) { + BufferEndOfBand(coding_state, ac_huff, nullptr, bw); + if (!eob_run_allowed) { + Flush(coding_state, bw); + } + } + return true; +} + +template +bool EncodeRefinementBits(const coeff_t* coeffs, HuffmanCodeTable* ac_huff, + int Ss, int Se, int Al, DCTCodingState* coding_state, + JpegBitWriter* bw) { + bool eob_run_allowed = Ss > 0; + if (Ss == 0) { + // Emit next bit of DC component. + WriteBits(bw, 1, (coeffs[0] >> Al) & 1); + ++Ss; + } + if (Ss > Se) { + return true; + } + int abs_values[kDCTBlockSize]; + int eob = 0; + for (int k = Ss; k <= Se; k++) { + const coeff_t abs_val = std::abs(coeffs[kJPEGNaturalOrder[k]]); + abs_values[k] = abs_val >> Al; + if (abs_values[k] == 1) { + eob = k; + } + } + int r = 0; + std::vector refinement_bits; + refinement_bits.reserve(kDCTBlockSize); + for (int k = Ss; k <= Se; k++) { + if (abs_values[k] == 0) { + r++; + continue; + } + while (r > 15 && k <= eob) { + Flush(coding_state, bw); + WriteSymbol(0xf0, ac_huff, bw); + r -= 16; + for (int bit : refinement_bits) { + WriteBits(bw, 1, bit); + } + refinement_bits.clear(); + } + if (abs_values[k] > 1) { + refinement_bits.push_back(abs_values[k] & 1u); + continue; + } + Flush(coding_state, bw); + int symbol = (r << 4u) + 1; + int new_non_zero_bit = (coeffs[kJPEGNaturalOrder[k]] < 0) ? 0 : 1; + WriteSymbol(symbol, ac_huff, bw); + WriteBits(bw, 1, new_non_zero_bit); + for (int bit : refinement_bits) { + WriteBits(bw, 1, bit); + } + refinement_bits.clear(); + r = 0; + } + if (r > 0 || !refinement_bits.empty()) { + BufferEndOfBand(coding_state, ac_huff, &refinement_bits, bw); + if (!eob_run_allowed) { + Flush(coding_state, bw); + } + } + return true; +} + +size_t NumHistograms(const JPEGData& jpg) { + size_t num = 0; + for (const auto& si : jpg.scan_info) { + num += si.num_components; + } + return num; +} + +size_t HistogramIndex(const JPEGData& jpg, size_t scan_index, + size_t component_index) { + size_t idx = 0; + for (size_t i = 0; i < scan_index; ++i) { + idx += jpg.scan_info[i].num_components; + } + return idx + component_index; +} + +template +SerializationStatus JXL_NOINLINE DoEncodeScan(const JPEGData& jpg, + SerializationState* state) { + const JPEGScanInfo& scan_info = jpg.scan_info[state->scan_index]; + EncodeScanState& ss = state->scan_state; + + const int restart_interval = + state->seen_dri_marker ? jpg.restart_interval : 0; + + const auto get_next_extra_zero_run_index = [&ss, &scan_info]() -> int { + if (ss.extra_zero_runs_pos < scan_info.extra_zero_runs.size()) { + return scan_info.extra_zero_runs[ss.extra_zero_runs_pos].block_idx; + } else { + return -1; + } + }; + + const auto get_next_reset_point = [&ss, &scan_info]() -> int { + if (ss.next_reset_point_pos < scan_info.reset_points.size()) { + return scan_info.reset_points[ss.next_reset_point_pos++]; + } else { + return -1; + } + }; + + if (ss.stage == EncodeScanState::HEAD) { + if (!EncodeSOS(jpg, scan_info, state)) return SerializationStatus::ERROR; + JpegBitWriterInit(&ss.bw, &state->output_queue); + DCTCodingStateInit(&ss.coding_state); + ss.restarts_to_go = restart_interval; + ss.next_restart_marker = 0; + ss.block_scan_index = 0; + ss.extra_zero_runs_pos = 0; + ss.next_extra_zero_run_index = get_next_extra_zero_run_index(); + ss.next_reset_point_pos = 0; + ss.next_reset_point = get_next_reset_point(); + ss.mcu_y = 0; + memset(ss.last_dc_coeff, 0, sizeof(ss.last_dc_coeff)); + ss.stage = EncodeScanState::BODY; + } + JpegBitWriter* bw = &ss.bw; + DCTCodingState* coding_state = &ss.coding_state; + + JXL_DASSERT(ss.stage == EncodeScanState::BODY); + + // "Non-interleaved" means color data comes in separate scans, in other words + // each scan can contain only one color component. + const bool is_interleaved = (scan_info.num_components > 1); + int MCUs_per_row = 0; + int MCU_rows = 0; + jpg.CalculateMcuSize(scan_info, &MCUs_per_row, &MCU_rows); + const bool is_progressive = state->is_progressive; + const int Al = is_progressive ? scan_info.Al : 0; + const int Ss = is_progressive ? scan_info.Ss : 0; + const int Se = is_progressive ? scan_info.Se : 63; + + // DC-only is defined by [0..0] spectral range. + const bool want_ac = ((Ss != 0) || (Se != 0)); + // TODO: support streaming decoding again. + const bool complete_ac = true; + const bool has_ac = true; + if (want_ac && !has_ac) return SerializationStatus::NEEDS_MORE_INPUT; + + // |has_ac| implies |complete_dc| but not vice versa; for the sake of + // simplicity we pretend they are equal, because they are separated by just a + // few bytes of input. + const bool complete_dc = has_ac; + const bool complete = want_ac ? complete_ac : complete_dc; + // When "incomplete" |ac_dc| tracks information about current ("incomplete") + // band parsing progress. + + // FIXME: Is this always complete? + // const int last_mcu_y = + // complete ? MCU_rows : parsing_state.internal->ac_dc.next_mcu_y * + // v_group; + (void)complete; + const int last_mcu_y = complete ? MCU_rows : 0; + + for (; ss.mcu_y < last_mcu_y; ++ss.mcu_y) { + for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) { + // Possibly emit a restart marker. + if (restart_interval > 0 && ss.restarts_to_go == 0) { + Flush(coding_state, bw); + if (!JumpToByteBoundary(bw, &state->pad_bits, state->pad_bits_end)) { + return SerializationStatus::ERROR; + } + EmitMarker(bw, 0xD0 + ss.next_restart_marker); + ss.next_restart_marker += 1; + ss.next_restart_marker &= 0x7; + ss.restarts_to_go = restart_interval; + memset(ss.last_dc_coeff, 0, sizeof(ss.last_dc_coeff)); + } + // Encode one MCU + for (size_t i = 0; i < scan_info.num_components; ++i) { + const JPEGComponentScanInfo& si = scan_info.components[i]; + const JPEGComponent& c = jpg.components[si.comp_idx]; + size_t dc_tbl_idx = (kOutputMode == OutputModes::kModeHistogram + ? HistogramIndex(jpg, state->scan_index, i) + : si.dc_tbl_idx); + size_t ac_tbl_idx = (kOutputMode == OutputModes::kModeHistogram + ? HistogramIndex(jpg, state->scan_index, i) + : si.ac_tbl_idx); + HuffmanCodeTable* dc_huff = &state->dc_huff_table[dc_tbl_idx]; + HuffmanCodeTable* ac_huff = &state->ac_huff_table[ac_tbl_idx]; + int n_blocks_y = is_interleaved ? c.v_samp_factor : 1; + int n_blocks_x = is_interleaved ? c.h_samp_factor : 1; + for (int iy = 0; iy < n_blocks_y; ++iy) { + for (int ix = 0; ix < n_blocks_x; ++ix) { + int block_y = ss.mcu_y * n_blocks_y + iy; + int block_x = mcu_x * n_blocks_x + ix; + int block_idx = block_y * c.width_in_blocks + block_x; + if (ss.block_scan_index == ss.next_reset_point) { + Flush(coding_state, bw); + ss.next_reset_point = get_next_reset_point(); + } + int num_zero_runs = 0; + if (ss.block_scan_index == ss.next_extra_zero_run_index) { + num_zero_runs = scan_info.extra_zero_runs[ss.extra_zero_runs_pos] + .num_extra_zero_runs; + ++ss.extra_zero_runs_pos; + ss.next_extra_zero_run_index = get_next_extra_zero_run_index(); + } + const coeff_t* coeffs = &c.coeffs[block_idx << 6]; + bool ok; + if (kMode == 0) { + ok = EncodeDCTBlockSequential( + coeffs, dc_huff, ac_huff, num_zero_runs, + ss.last_dc_coeff + si.comp_idx, bw); + } else if (kMode == 1) { + ok = EncodeDCTBlockProgressive( + coeffs, dc_huff, ac_huff, Ss, Se, Al, num_zero_runs, + coding_state, ss.last_dc_coeff + si.comp_idx, bw); + } else { + ok = EncodeRefinementBits(coeffs, ac_huff, Ss, Se, + Al, coding_state, bw); + } + if (!ok) return SerializationStatus::ERROR; + ++ss.block_scan_index; + } + } + } + --ss.restarts_to_go; + } + } + if (ss.mcu_y < MCU_rows) { + if (!bw->healthy) return SerializationStatus::ERROR; + return SerializationStatus::NEEDS_MORE_INPUT; + } + Flush(coding_state, bw); + if (!JumpToByteBoundary(bw, &state->pad_bits, state->pad_bits_end)) { + return SerializationStatus::ERROR; + } + JpegBitWriterFinish(bw); + ss.stage = EncodeScanState::HEAD; + state->scan_index++; + if (!bw->healthy) return SerializationStatus::ERROR; + + return SerializationStatus::DONE; +} + +template +static SerializationStatus JXL_INLINE EncodeScan(const JPEGData& jpg, + SerializationState* state) { + const JPEGScanInfo& scan_info = jpg.scan_info[state->scan_index]; + const bool is_progressive = state->is_progressive; + const int Al = is_progressive ? scan_info.Al : 0; + const int Ah = is_progressive ? scan_info.Ah : 0; + const int Ss = is_progressive ? scan_info.Ss : 0; + const int Se = is_progressive ? scan_info.Se : 63; + const bool need_sequential = + !is_progressive || (Ah == 0 && Al == 0 && Ss == 0 && Se == 63); + if (need_sequential) { + return DoEncodeScan<0, kOutputMode>(jpg, state); + } else if (Ah == 0) { + return DoEncodeScan<1, kOutputMode>(jpg, state); + } else { + return DoEncodeScan<2, kOutputMode>(jpg, state); + } +} + +template +SerializationStatus SerializeSection(uint8_t marker, SerializationState* state, + const JPEGData& jpg) { + const auto to_status = [](bool result) { + return result ? SerializationStatus::DONE : SerializationStatus::ERROR; + }; + // TODO(eustas): add and use marker enum + switch (marker) { + case 0xC0: + case 0xC1: + case 0xC2: + case 0xC9: + case 0xCA: + return to_status(EncodeSOF(jpg, marker, state)); + + case 0xC4: + return to_status((kOutputMode == OutputModes::kModeHistogram) || + EncodeDHT(jpg, state)); + + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD5: + case 0xD6: + case 0xD7: + return to_status(EncodeRestart(marker, state)); + + case 0xD9: + return to_status(EncodeEOI(jpg, state)); + + case 0xDA: + return EncodeScan(jpg, state); + + case 0xDB: + return to_status(EncodeDQT(jpg, state)); + + case 0xDD: + return to_status(EncodeDRI(jpg, state)); + + case 0xE0: + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + case 0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xED: + case 0xEE: + case 0xEF: + return to_status(EncodeAPP(jpg, marker, state)); + + case 0xFE: + return to_status(EncodeCOM(jpg, state)); + + case 0xFF: + return to_status(EncodeInterMarkerData(jpg, state)); + + default: + return SerializationStatus::ERROR; + } +} + +// TODO(veluca): add streaming support again. +template +Status WriteJpegInternal(const JPEGData& jpg, const JPEGOutput& out, + SerializationState* ss) { + const auto maybe_push_output = [&]() -> Status { + if (ss->stage != SerializationState::STAGE_ERROR) { + while (!ss->output_queue.empty()) { + auto& chunk = ss->output_queue.front(); + size_t num_written = out(chunk.next, chunk.len); + if (num_written == 0 && chunk.len > 0) { + return StatusMessage(Status(StatusCode::kNotEnoughBytes), + "Failed to write output"); + } + chunk.len -= num_written; + if (chunk.len == 0) { + ss->output_queue.pop_front(); + } + } + } + return true; + }; + + while (true) { + switch (ss->stage) { + case SerializationState::STAGE_INIT: { + // Valid Brunsli requires, at least, 0xD9 marker. + // This might happen on corrupted stream, or on unconditioned JPEGData. + // TODO(eustas): check D9 in the only one and is the last one. + if (jpg.marker_order.empty()) { + ss->stage = SerializationState::STAGE_ERROR; + break; + } + if (kOutputMode == OutputModes::kModeHistogram) { + size_t num_histo = NumHistograms(jpg); + ss->dc_huff_table.resize(num_histo); + ss->ac_huff_table.resize(num_histo); + for (size_t i = 0; i < num_histo; ++i) { + ss->dc_huff_table[i].InitDepths(); + ss->ac_huff_table[i].InitDepths(); + } + } else { + ss->dc_huff_table.resize(kMaxHuffmanTables); + ss->ac_huff_table.resize(kMaxHuffmanTables); + } + if (jpg.has_zero_padding_bit) { + ss->pad_bits = jpg.padding_bits.data(); + ss->pad_bits_end = ss->pad_bits + jpg.padding_bits.size(); + } + + EncodeSOI(ss); + JXL_QUIET_RETURN_IF_ERROR(maybe_push_output()); + ss->stage = SerializationState::STAGE_SERIALIZE_SECTION; + break; + } + + case SerializationState::STAGE_SERIALIZE_SECTION: { + if (ss->section_index >= jpg.marker_order.size()) { + ss->stage = SerializationState::STAGE_DONE; + break; + } + uint8_t marker = jpg.marker_order[ss->section_index]; + SerializationStatus status = + SerializeSection(marker, ss, jpg); + if (status == SerializationStatus::ERROR) { + JXL_WARNING("Failed to encode marker 0x%.2x", marker); + ss->stage = SerializationState::STAGE_ERROR; + break; + } + JXL_QUIET_RETURN_IF_ERROR(maybe_push_output()); + if (status == SerializationStatus::NEEDS_MORE_INPUT) { + return JXL_FAILURE("Incomplete serialization data"); + } else if (status != SerializationStatus::DONE) { + JXL_DASSERT(false); + ss->stage = SerializationState::STAGE_ERROR; + break; + } + ++ss->section_index; + break; + } + + case SerializationState::STAGE_DONE: + JXL_ASSERT(ss->output_queue.empty()); + if (ss->pad_bits != nullptr && ss->pad_bits != ss->pad_bits_end) { + return JXL_FAILURE("Invalid number of padding bits."); + } + return true; + + case SerializationState::STAGE_ERROR: + return JXL_FAILURE("JPEG serialization error"); + } + } +} + +} // namespace + +Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out) { + SerializationState ss; + return WriteJpegInternal(jpg, out, &ss); +} + +Status ProcessJpeg(const JPEGData& jpg, SerializationState* ss) { + auto nullout = [](const uint8_t* buf, size_t len) { return len; }; + return WriteJpegInternal(jpg, nullout, ss); +} + +} // namespace jpeg +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h new file mode 100644 index 0000000000..9ccfb749a8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h @@ -0,0 +1,35 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Functions for writing a JPEGData object into a jpeg byte stream. + +#ifndef LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_ +#define LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_ + +#include +#include + +#include + +#include "lib/jxl/jpeg/dec_jpeg_serialization_state.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +// Function type used to write len bytes into buf. Returns the number of bytes +// written. +using JPEGOutput = std::function; + +Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out); + +// Same as WriteJpeg, but instead of writing to the output, collects statistics +// about the bit-stream into `ss`. +Status ProcessJpeg(const JPEGData& jpg, SerializationState* ss); + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h new file mode 100644 index 0000000000..e003c04952 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h @@ -0,0 +1,72 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_ +#define LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_ + +#include +#include + +#include +#include +#include + +namespace jxl { +namespace jpeg { + +/** + * A chunk of output data. + * + * Data producer creates OutputChunks and adds them to the end output queue. + * Once control flow leaves the producer code, it is considered that chunk of + * data is final and can not be changed; to underline this fact |next| is a + * const-pointer. + * + * Data consumer removes OutputChunks from the beginning of the output queue. + * It is possible to consume OutputChunks partially, by updating |next| and + * |len|. + * + * There are 2 types of output chunks: + * - owning: actual data is stored in |buffer| field; producer fills data after + * the instance it created; it is legal to reduce |len| to show that not all + * the capacity of |buffer| is used + * - non-owning: represents the data stored (owned) somewhere else + */ +struct OutputChunk { + // Non-owning + template + explicit OutputChunk(Bytes& bytes) : len(bytes.size()) { + // Deal both with const qualifier and data type. + const void* src = bytes.data(); + next = reinterpret_cast(src); + } + + // Non-owning + OutputChunk(const uint8_t* data, size_t size) : next(data), len(size) {} + + // Owning + explicit OutputChunk(size_t size = 0) { + buffer.reset(new std::vector(size)); + next = buffer->data(); + len = size; + } + + // Owning + OutputChunk(std::initializer_list bytes) { + buffer.reset(new std::vector(bytes)); + next = buffer->data(); + len = bytes.size(); + } + + const uint8_t* next; + size_t len; + // TODO(veluca): consider removing the unique_ptr. + std::unique_ptr> buffer; +}; + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_ diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h new file mode 100644 index 0000000000..40ce450a76 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h @@ -0,0 +1,96 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_ +#define LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_ + +#include +#include + +#include "lib/jxl/jpeg/dec_jpeg_output_chunk.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +struct HuffmanCodeTable { + int depth[256]; + int code[256]; + void InitDepths() { std::fill(std::begin(depth), std::end(depth), 0); } +}; + +// Handles the packing of bits into output bytes. +struct JpegBitWriter { + bool healthy; + std::deque* output; + OutputChunk chunk; + uint8_t* data; + size_t pos; + uint64_t put_buffer; + int put_bits; +}; + +// Holds data that is buffered between 8x8 blocks in progressive mode. +struct DCTCodingState { + // The run length of end-of-band symbols in a progressive scan. + int eob_run_; + // The huffman table to be used when flushing the state. + HuffmanCodeTable* cur_ac_huff_; + // The sequence of currently buffered refinement bits for a successive + // approximation scan (one where Ah > 0). + std::vector refinement_bits_; +}; + +struct EncodeScanState { + enum Stage { HEAD, BODY }; + + Stage stage = HEAD; + + int mcu_y; + JpegBitWriter bw; + coeff_t last_dc_coeff[kMaxComponents] = {0}; + int restarts_to_go; + int next_restart_marker; + int block_scan_index; + DCTCodingState coding_state; + size_t extra_zero_runs_pos; + int next_extra_zero_run_index; + size_t next_reset_point_pos; + int next_reset_point; +}; + +struct SerializationState { + enum Stage { + STAGE_INIT, + STAGE_SERIALIZE_SECTION, + STAGE_DONE, + STAGE_ERROR, + }; + + Stage stage = STAGE_INIT; + + std::deque output_queue; + + size_t section_index = 0; + int dht_index = 0; + int dqt_index = 0; + int app_index = 0; + int com_index = 0; + int data_index = 0; + int scan_index = 0; + std::vector dc_huff_table; + std::vector ac_huff_table; + const uint8_t* pad_bits = nullptr; + const uint8_t* pad_bits_end = nullptr; + bool seen_dri_marker = false; + bool is_progressive = false; + + EncodeScanState scan_state; +}; + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc new file mode 100644 index 0000000000..842612f4ab --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc @@ -0,0 +1,384 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/enc_jpeg_data.h" + +#include +#include + +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/jpeg/enc_jpeg_data_reader.h" +#include "lib/jxl/luminance.h" +#include "lib/jxl/sanitizers.h" + +namespace jxl { +namespace jpeg { + +namespace { + +constexpr int BITS_IN_JSAMPLE = 8; +using ByteSpan = Span; + +// TODO(eustas): move to jpeg_data, to use from codec_jpg as well. +// See if there is a canonically chunked ICC profile and mark corresponding +// app-tags with AppMarkerType::kICC. +Status DetectIccProfile(JPEGData& jpeg_data) { + JXL_DASSERT(jpeg_data.app_data.size() == jpeg_data.app_marker_type.size()); + size_t num_icc = 0; + size_t num_icc_jpeg = 0; + for (size_t i = 0; i < jpeg_data.app_data.size(); i++) { + const auto& app = jpeg_data.app_data[i]; + size_t pos = 0; + if (app[pos++] != 0xE2) continue; + // At least APPn + size; otherwise it should be intermarker-data. + JXL_DASSERT(app.size() >= 3); + size_t tag_length = (app[pos] << 8) + app[pos + 1]; + pos += 2; + JXL_DASSERT(app.size() == tag_length + 1); + // Empty payload is 2 bytes for tag length itself + signature + if (tag_length < 2 + sizeof kIccProfileTag) continue; + + if (memcmp(&app[pos], kIccProfileTag, sizeof kIccProfileTag) != 0) continue; + pos += sizeof kIccProfileTag; + uint8_t chunk_id = app[pos++]; + uint8_t num_chunks = app[pos++]; + if (chunk_id != num_icc + 1) continue; + if (num_icc_jpeg == 0) num_icc_jpeg = num_chunks; + if (num_icc_jpeg != num_chunks) continue; + num_icc++; + jpeg_data.app_marker_type[i] = AppMarkerType::kICC; + } + if (num_icc != num_icc_jpeg) { + return JXL_FAILURE("Invalid ICC chunks"); + } + return true; +} + +bool GetMarkerPayload(const uint8_t* data, size_t size, ByteSpan* payload) { + if (size < 3) { + return false; + } + size_t hi = data[1]; + size_t lo = data[2]; + size_t internal_size = (hi << 8u) | lo; + // Second byte of marker is not counted towards size. + if (internal_size != size - 1) { + return false; + } + // cut second marker byte and "length" from payload. + *payload = ByteSpan(data, size); + payload->remove_prefix(3); + return true; +} + +Status DetectBlobs(jpeg::JPEGData& jpeg_data) { + JXL_DASSERT(jpeg_data.app_data.size() == jpeg_data.app_marker_type.size()); + bool have_exif = false, have_xmp = false; + for (size_t i = 0; i < jpeg_data.app_data.size(); i++) { + auto& marker = jpeg_data.app_data[i]; + if (marker.empty() || marker[0] != kApp1) { + continue; + } + ByteSpan payload; + if (!GetMarkerPayload(marker.data(), marker.size(), &payload)) { + // Something is wrong with this marker; does not care. + continue; + } + if (!have_exif && payload.size() >= sizeof kExifTag && + !memcmp(payload.data(), kExifTag, sizeof kExifTag)) { + jpeg_data.app_marker_type[i] = AppMarkerType::kExif; + have_exif = true; + } + if (!have_xmp && payload.size() >= sizeof kXMPTag && + !memcmp(payload.data(), kXMPTag, sizeof kXMPTag)) { + jpeg_data.app_marker_type[i] = AppMarkerType::kXMP; + have_xmp = true; + } + } + return true; +} + +Status ParseChunkedMarker(const jpeg::JPEGData& src, uint8_t marker_type, + const ByteSpan& tag, PaddedBytes* output, + bool allow_permutations = false) { + output->clear(); + + std::vector chunks; + std::vector presence; + size_t expected_number_of_parts = 0; + bool is_first_chunk = true; + size_t ordinal = 0; + for (const auto& marker : src.app_data) { + if (marker.empty() || marker[0] != marker_type) { + continue; + } + ByteSpan payload; + if (!GetMarkerPayload(marker.data(), marker.size(), &payload)) { + // Something is wrong with this marker; does not care. + continue; + } + if ((payload.size() < tag.size()) || + memcmp(payload.data(), tag.data(), tag.size()) != 0) { + continue; + } + payload.remove_prefix(tag.size()); + if (payload.size() < 2) { + return JXL_FAILURE("Chunk is too small."); + } + uint8_t index = payload[0]; + uint8_t total = payload[1]; + ordinal++; + if (!allow_permutations) { + if (index != ordinal) return JXL_FAILURE("Invalid chunk order."); + } + + payload.remove_prefix(2); + + JXL_RETURN_IF_ERROR(total != 0); + if (is_first_chunk) { + is_first_chunk = false; + expected_number_of_parts = total; + // 1-based indices; 0-th element is added for convenience. + chunks.resize(total + 1); + presence.resize(total + 1); + } else { + JXL_RETURN_IF_ERROR(expected_number_of_parts == total); + } + + if (index == 0 || index > total) { + return JXL_FAILURE("Invalid chunk index."); + } + + if (presence[index]) { + return JXL_FAILURE("Duplicate chunk."); + } + presence[index] = true; + chunks[index] = payload; + } + + for (size_t i = 0; i < expected_number_of_parts; ++i) { + // 0-th element is not used. + size_t index = i + 1; + if (!presence[index]) { + return JXL_FAILURE("Missing chunk."); + } + output->append(chunks[index]); + } + + return true; +} + +Status SetBlobsFromJpegData(const jpeg::JPEGData& jpeg_data, Blobs* blobs) { + for (size_t i = 0; i < jpeg_data.app_data.size(); i++) { + auto& marker = jpeg_data.app_data[i]; + if (marker.empty() || marker[0] != kApp1) { + continue; + } + ByteSpan payload; + if (!GetMarkerPayload(marker.data(), marker.size(), &payload)) { + // Something is wrong with this marker; does not care. + continue; + } + if (payload.size() >= sizeof kExifTag && + !memcmp(payload.data(), kExifTag, sizeof kExifTag)) { + if (blobs->exif.empty()) { + blobs->exif.resize(payload.size() - sizeof kExifTag); + memcpy(blobs->exif.data(), payload.data() + sizeof kExifTag, + payload.size() - sizeof kExifTag); + } else { + JXL_WARNING( + "ReJPEG: multiple Exif blobs, storing only first one in the JPEG " + "XL container\n"); + } + } + if (payload.size() >= sizeof kXMPTag && + !memcmp(payload.data(), kXMPTag, sizeof kXMPTag)) { + if (blobs->xmp.empty()) { + blobs->xmp.resize(payload.size() - sizeof kXMPTag); + memcpy(blobs->xmp.data(), payload.data() + sizeof kXMPTag, + payload.size() - sizeof kXMPTag); + } else { + JXL_WARNING( + "ReJPEG: multiple XMP blobs, storing only first one in the JPEG " + "XL container\n"); + } + } + } + return true; +} + +static inline bool IsJPG(const Span bytes) { + return bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xD8; +} + +} // namespace + +Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg, + ColorEncoding* color_encoding) { + PaddedBytes icc_profile; + if (!ParseChunkedMarker(jpg, kApp2, ByteSpan(kIccProfileTag), &icc_profile)) { + JXL_WARNING("ReJPEG: corrupted ICC profile\n"); + icc_profile.clear(); + } + + if (icc_profile.empty()) { + bool is_gray = (jpg.components.size() == 1); + *color_encoding = ColorEncoding::SRGB(is_gray); + return true; + } + + return color_encoding->SetICC(std::move(icc_profile)); +} + +Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes, + const CompressParams& cparams) { + jpeg_data.app_marker_type.resize(jpeg_data.app_data.size(), + AppMarkerType::kUnknown); + JXL_RETURN_IF_ERROR(DetectIccProfile(jpeg_data)); + JXL_RETURN_IF_ERROR(DetectBlobs(jpeg_data)); + BitWriter writer; + JXL_RETURN_IF_ERROR(Bundle::Write(jpeg_data, &writer, 0, nullptr)); + writer.ZeroPadToByte(); + *bytes = std::move(writer).TakeBytes(); + BrotliEncoderState* brotli_enc = + BrotliEncoderCreateInstance(nullptr, nullptr, nullptr); + int effort = cparams.brotli_effort; + if (effort < 0) effort = 11 - static_cast(cparams.speed_tier); + BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_QUALITY, effort); + size_t total_data = 0; + for (size_t i = 0; i < jpeg_data.app_data.size(); i++) { + if (jpeg_data.app_marker_type[i] != AppMarkerType::kUnknown) { + continue; + } + total_data += jpeg_data.app_data[i].size(); + } + for (size_t i = 0; i < jpeg_data.com_data.size(); i++) { + total_data += jpeg_data.com_data[i].size(); + } + for (size_t i = 0; i < jpeg_data.inter_marker_data.size(); i++) { + total_data += jpeg_data.inter_marker_data[i].size(); + } + total_data += jpeg_data.tail_data.size(); + size_t initial_size = bytes->size(); + size_t brotli_capacity = BrotliEncoderMaxCompressedSize(total_data); + BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_SIZE_HINT, total_data); + bytes->resize(bytes->size() + brotli_capacity); + size_t enc_size = 0; + auto br_append = [&](const std::vector& data, bool last) { + size_t available_in = data.size(); + const uint8_t* in = data.data(); + uint8_t* out = &(*bytes)[initial_size + enc_size]; + do { + uint8_t* out_before = out; + msan::MemoryIsInitialized(in, available_in); + JXL_CHECK(BrotliEncoderCompressStream( + brotli_enc, last ? BROTLI_OPERATION_FINISH : BROTLI_OPERATION_PROCESS, + &available_in, &in, &brotli_capacity, &out, &enc_size)); + msan::UnpoisonMemory(out_before, out - out_before); + } while (BrotliEncoderHasMoreOutput(brotli_enc) || available_in > 0); + }; + + for (size_t i = 0; i < jpeg_data.app_data.size(); i++) { + if (jpeg_data.app_marker_type[i] != AppMarkerType::kUnknown) { + continue; + } + br_append(jpeg_data.app_data[i], /*last=*/false); + } + for (size_t i = 0; i < jpeg_data.com_data.size(); i++) { + br_append(jpeg_data.com_data[i], /*last=*/false); + } + for (size_t i = 0; i < jpeg_data.inter_marker_data.size(); i++) { + br_append(jpeg_data.inter_marker_data[i], /*last=*/false); + } + br_append(jpeg_data.tail_data, /*last=*/true); + BrotliEncoderDestroyInstance(brotli_enc); + bytes->resize(initial_size + enc_size); + return true; +} + +Status DecodeImageJPG(const Span bytes, CodecInOut* io) { + if (!IsJPG(bytes)) return false; + io->frames.clear(); + io->frames.reserve(1); + io->frames.emplace_back(&io->metadata.m); + io->Main().jpeg_data = make_unique(); + jpeg::JPEGData* jpeg_data = io->Main().jpeg_data.get(); + if (!jpeg::ReadJpeg(bytes.data(), bytes.size(), jpeg::JpegReadMode::kReadAll, + jpeg_data)) { + return JXL_FAILURE("Error reading JPEG"); + } + JXL_RETURN_IF_ERROR( + SetColorEncodingFromJpegData(*jpeg_data, &io->metadata.m.color_encoding)); + JXL_RETURN_IF_ERROR(SetBlobsFromJpegData(*jpeg_data, &io->blobs)); + size_t nbcomp = jpeg_data->components.size(); + if (nbcomp != 1 && nbcomp != 3) { + return JXL_FAILURE("Cannot recompress JPEGs with neither 1 nor 3 channels"); + } + YCbCrChromaSubsampling cs; + if (nbcomp == 3) { + uint8_t hsample[3], vsample[3]; + for (size_t i = 0; i < nbcomp; i++) { + hsample[i] = jpeg_data->components[i].h_samp_factor; + vsample[i] = jpeg_data->components[i].v_samp_factor; + } + JXL_RETURN_IF_ERROR(cs.Set(hsample, vsample)); + } else if (nbcomp == 1) { + uint8_t hsample[3], vsample[3]; + for (size_t i = 0; i < 3; i++) { + hsample[i] = jpeg_data->components[0].h_samp_factor; + vsample[i] = jpeg_data->components[0].v_samp_factor; + } + JXL_RETURN_IF_ERROR(cs.Set(hsample, vsample)); + } + bool is_rgb = false; + { + const auto& markers = jpeg_data->marker_order; + // If there is a JFIF marker, this is YCbCr. Otherwise... + if (std::find(markers.begin(), markers.end(), 0xE0) == markers.end()) { + // Try to find an 'Adobe' marker. + size_t app_markers = 0; + size_t i = 0; + for (; i < markers.size(); i++) { + // This is an APP marker. + if ((markers[i] & 0xF0) == 0xE0) { + JXL_CHECK(app_markers < jpeg_data->app_data.size()); + // APP14 marker + if (markers[i] == 0xEE) { + const auto& data = jpeg_data->app_data[app_markers]; + if (data.size() == 15 && data[3] == 'A' && data[4] == 'd' && + data[5] == 'o' && data[6] == 'b' && data[7] == 'e') { + // 'Adobe' marker. + is_rgb = data[14] == 0; + break; + } + } + app_markers++; + } + } + + if (i == markers.size()) { + // No 'Adobe' marker, guess from component IDs. + is_rgb = nbcomp == 3 && jpeg_data->components[0].id == 'R' && + jpeg_data->components[1].id == 'G' && + jpeg_data->components[2].id == 'B'; + } + } + } + + io->Main().chroma_subsampling = cs; + io->Main().color_transform = + (!is_rgb || nbcomp == 1) ? ColorTransform::kYCbCr : ColorTransform::kNone; + + io->metadata.m.SetIntensityTarget(kDefaultIntensityTarget); + io->metadata.m.SetUintSamples(BITS_IN_JSAMPLE); + io->SetFromImage(Image3F(jpeg_data->width, jpeg_data->height), + io->metadata.m.color_encoding); + SetIntensityTarget(&io->metadata.m); + return true; +} + +} // namespace jpeg +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h new file mode 100644 index 0000000000..806128c465 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_JPEG_ENC_JPEG_DATA_H_ +#define LIB_JXL_JPEG_ENC_JPEG_DATA_H_ + +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { +Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes, + const CompressParams& cparams); + +Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg, + ColorEncoding* color_encoding); + +/** + * Decodes bytes containing JPEG codestream into a CodecInOut as coefficients + * only, for lossless JPEG transcoding. + */ +Status DecodeImageJPG(Span bytes, CodecInOut* io); + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_ENC_JPEG_DATA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc new file mode 100644 index 0000000000..f569b73363 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc @@ -0,0 +1,1053 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/enc_jpeg_data_reader.h" + +#include +#include + +#include +#include +#include + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/jpeg/enc_jpeg_huffman_decode.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +namespace { +static const int kBrunsliMaxSampling = 15; + +// Macros for commonly used error conditions. + +#define JXL_JPEG_VERIFY_LEN(n) \ + if (*pos + (n) > len) { \ + return JXL_FAILURE("Unexpected end of input: pos=%" PRIuS \ + " need=%d len=%" PRIuS, \ + *pos, static_cast(n), len); \ + } + +#define JXL_JPEG_VERIFY_INPUT(var, low, high, code) \ + if ((var) < (low) || (var) > (high)) { \ + return JXL_FAILURE("Invalid " #var ": %d", static_cast(var)); \ + } + +#define JXL_JPEG_VERIFY_MARKER_END() \ + if (start_pos + marker_len != *pos) { \ + return JXL_FAILURE("Invalid marker length: declared=%" PRIuS \ + " actual=%" PRIuS, \ + marker_len, (*pos - start_pos)); \ + } + +#define JXL_JPEG_EXPECT_MARKER() \ + if (pos + 2 > len || data[pos] != 0xff) { \ + return JXL_FAILURE( \ + "Marker byte (0xff) expected, found: 0x%.2x pos=%" PRIuS \ + " len=%" PRIuS, \ + (pos < len ? data[pos] : 0), pos, len); \ + } + +inline int ReadUint8(const uint8_t* data, size_t* pos) { + return data[(*pos)++]; +} + +inline int ReadUint16(const uint8_t* data, size_t* pos) { + int v = (data[*pos] << 8) + data[*pos + 1]; + *pos += 2; + return v; +} + +// Reads the Start of Frame (SOF) marker segment and fills in *jpg with the +// parsed data. +bool ProcessSOF(const uint8_t* data, const size_t len, JpegReadMode mode, + size_t* pos, JPEGData* jpg) { + if (jpg->width != 0) { + return JXL_FAILURE("Duplicate SOF marker."); + } + const size_t start_pos = *pos; + JXL_JPEG_VERIFY_LEN(8); + size_t marker_len = ReadUint16(data, pos); + int precision = ReadUint8(data, pos); + int height = ReadUint16(data, pos); + int width = ReadUint16(data, pos); + int num_components = ReadUint8(data, pos); + // 'jbrd' is hardcoded for 8bits: + JXL_JPEG_VERIFY_INPUT(precision, 8, 8, PRECISION); + JXL_JPEG_VERIFY_INPUT(height, 1, kMaxDimPixels, HEIGHT); + JXL_JPEG_VERIFY_INPUT(width, 1, kMaxDimPixels, WIDTH); + JXL_JPEG_VERIFY_INPUT(num_components, 1, kMaxComponents, NUMCOMP); + JXL_JPEG_VERIFY_LEN(3 * num_components); + jpg->height = height; + jpg->width = width; + jpg->components.resize(num_components); + + // Read sampling factors and quant table index for each component. + std::vector ids_seen(256, false); + int max_h_samp_factor = 1; + int max_v_samp_factor = 1; + for (size_t i = 0; i < jpg->components.size(); ++i) { + const int id = ReadUint8(data, pos); + if (ids_seen[id]) { // (cf. section B.2.2, syntax of Ci) + return JXL_FAILURE("Duplicate ID %d in SOF.", id); + } + ids_seen[id] = true; + jpg->components[i].id = id; + int factor = ReadUint8(data, pos); + int h_samp_factor = factor >> 4; + int v_samp_factor = factor & 0xf; + JXL_JPEG_VERIFY_INPUT(h_samp_factor, 1, kBrunsliMaxSampling, SAMP_FACTOR); + JXL_JPEG_VERIFY_INPUT(v_samp_factor, 1, kBrunsliMaxSampling, SAMP_FACTOR); + jpg->components[i].h_samp_factor = h_samp_factor; + jpg->components[i].v_samp_factor = v_samp_factor; + jpg->components[i].quant_idx = ReadUint8(data, pos); + max_h_samp_factor = std::max(max_h_samp_factor, h_samp_factor); + max_v_samp_factor = std::max(max_v_samp_factor, v_samp_factor); + } + + // We have checked above that none of the sampling factors are 0, so the max + // sampling factors can not be 0. + int MCU_rows = DivCeil(jpg->height, max_v_samp_factor * 8); + int MCU_cols = DivCeil(jpg->width, max_h_samp_factor * 8); + // Compute the block dimensions for each component. + for (size_t i = 0; i < jpg->components.size(); ++i) { + JPEGComponent* c = &jpg->components[i]; + if (max_h_samp_factor % c->h_samp_factor != 0 || + max_v_samp_factor % c->v_samp_factor != 0) { + return JXL_FAILURE("Non-integral subsampling ratios."); + } + c->width_in_blocks = MCU_cols * c->h_samp_factor; + c->height_in_blocks = MCU_rows * c->v_samp_factor; + const uint64_t num_blocks = + static_cast(c->width_in_blocks) * c->height_in_blocks; + if (mode == JpegReadMode::kReadAll) { + c->coeffs.resize(num_blocks * kDCTBlockSize); + } + } + JXL_JPEG_VERIFY_MARKER_END(); + return true; +} + +// Reads the Start of Scan (SOS) marker segment and fills in *scan_info with the +// parsed data. +bool ProcessSOS(const uint8_t* data, const size_t len, size_t* pos, + JPEGData* jpg) { + const size_t start_pos = *pos; + JXL_JPEG_VERIFY_LEN(3); + size_t marker_len = ReadUint16(data, pos); + size_t comps_in_scan = ReadUint8(data, pos); + JXL_JPEG_VERIFY_INPUT(comps_in_scan, 1, jpg->components.size(), + COMPS_IN_SCAN); + + JPEGScanInfo scan_info; + scan_info.num_components = comps_in_scan; + JXL_JPEG_VERIFY_LEN(2 * comps_in_scan); + std::vector ids_seen(256, false); + for (size_t i = 0; i < comps_in_scan; ++i) { + uint32_t id = ReadUint8(data, pos); + if (ids_seen[id]) { // (cf. section B.2.3, regarding CSj) + return JXL_FAILURE("Duplicate ID %d in SOS.", id); + } + ids_seen[id] = true; + bool found_index = false; + for (size_t j = 0; j < jpg->components.size(); ++j) { + if (jpg->components[j].id == id) { + scan_info.components[i].comp_idx = j; + found_index = true; + } + } + if (!found_index) { + return JXL_FAILURE("SOS marker: Could not find component with id %d", id); + } + int c = ReadUint8(data, pos); + int dc_tbl_idx = c >> 4; + int ac_tbl_idx = c & 0xf; + JXL_JPEG_VERIFY_INPUT(dc_tbl_idx, 0, 3, HUFFMAN_INDEX); + JXL_JPEG_VERIFY_INPUT(ac_tbl_idx, 0, 3, HUFFMAN_INDEX); + scan_info.components[i].dc_tbl_idx = dc_tbl_idx; + scan_info.components[i].ac_tbl_idx = ac_tbl_idx; + } + JXL_JPEG_VERIFY_LEN(3); + scan_info.Ss = ReadUint8(data, pos); + scan_info.Se = ReadUint8(data, pos); + JXL_JPEG_VERIFY_INPUT(static_cast(scan_info.Ss), 0, 63, START_OF_SCAN); + JXL_JPEG_VERIFY_INPUT(scan_info.Se, scan_info.Ss, 63, END_OF_SCAN); + int c = ReadUint8(data, pos); + scan_info.Ah = c >> 4; + scan_info.Al = c & 0xf; + if (scan_info.Ah != 0 && scan_info.Al != scan_info.Ah - 1) { + // section G.1.1.1.2 : Successive approximation control only improves + // by one bit at a time. But it's not always respected, so we just issue + // a warning. + JXL_WARNING("Invalid progressive parameters: Al=%d Ah=%d", scan_info.Al, + scan_info.Ah); + } + // Check that all the Huffman tables needed for this scan are defined. + for (size_t i = 0; i < comps_in_scan; ++i) { + bool found_dc_table = false; + bool found_ac_table = false; + for (size_t j = 0; j < jpg->huffman_code.size(); ++j) { + uint32_t slot_id = jpg->huffman_code[j].slot_id; + if (slot_id == scan_info.components[i].dc_tbl_idx) { + found_dc_table = true; + } else if (slot_id == scan_info.components[i].ac_tbl_idx + 16) { + found_ac_table = true; + } + } + if (scan_info.Ss == 0 && !found_dc_table) { + return JXL_FAILURE( + "SOS marker: Could not find DC Huffman table with index %d", + scan_info.components[i].dc_tbl_idx); + } + if (scan_info.Se > 0 && !found_ac_table) { + return JXL_FAILURE( + "SOS marker: Could not find AC Huffman table with index %d", + scan_info.components[i].ac_tbl_idx); + } + } + jpg->scan_info.push_back(scan_info); + JXL_JPEG_VERIFY_MARKER_END(); + return true; +} + +// Reads the Define Huffman Table (DHT) marker segment and fills in *jpg with +// the parsed data. Builds the Huffman decoding table in either dc_huff_lut or +// ac_huff_lut, depending on the type and solt_id of Huffman code being read. +bool ProcessDHT(const uint8_t* data, const size_t len, JpegReadMode mode, + std::vector* dc_huff_lut, + std::vector* ac_huff_lut, size_t* pos, + JPEGData* jpg) { + const size_t start_pos = *pos; + JXL_JPEG_VERIFY_LEN(2); + size_t marker_len = ReadUint16(data, pos); + if (marker_len == 2) { + return JXL_FAILURE("DHT marker: no Huffman table found"); + } + while (*pos < start_pos + marker_len) { + JXL_JPEG_VERIFY_LEN(1 + kJpegHuffmanMaxBitLength); + JPEGHuffmanCode huff; + huff.slot_id = ReadUint8(data, pos); + int huffman_index = huff.slot_id; + int is_ac_table = (huff.slot_id & 0x10) != 0; + HuffmanTableEntry* huff_lut; + if (is_ac_table) { + huffman_index -= 0x10; + JXL_JPEG_VERIFY_INPUT(huffman_index, 0, 3, HUFFMAN_INDEX); + huff_lut = &(*ac_huff_lut)[huffman_index * kJpegHuffmanLutSize]; + } else { + JXL_JPEG_VERIFY_INPUT(huffman_index, 0, 3, HUFFMAN_INDEX); + huff_lut = &(*dc_huff_lut)[huffman_index * kJpegHuffmanLutSize]; + } + huff.counts[0] = 0; + int total_count = 0; + int space = 1 << kJpegHuffmanMaxBitLength; + int max_depth = 1; + for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) { + int count = ReadUint8(data, pos); + if (count != 0) { + max_depth = i; + } + huff.counts[i] = count; + total_count += count; + space -= count * (1 << (kJpegHuffmanMaxBitLength - i)); + } + if (is_ac_table) { + JXL_JPEG_VERIFY_INPUT(total_count, 0, kJpegHuffmanAlphabetSize, + HUFFMAN_CODE); + } else { + JXL_JPEG_VERIFY_INPUT(total_count, 0, kJpegDCAlphabetSize, HUFFMAN_CODE); + } + JXL_JPEG_VERIFY_LEN(total_count); + std::vector values_seen(256, false); + for (int i = 0; i < total_count; ++i) { + int value = ReadUint8(data, pos); + if (!is_ac_table) { + JXL_JPEG_VERIFY_INPUT(value, 0, kJpegDCAlphabetSize - 1, HUFFMAN_CODE); + } + if (values_seen[value]) { + return JXL_FAILURE("Duplicate Huffman code value %d", value); + } + values_seen[value] = true; + huff.values[i] = value; + } + // Add an invalid symbol that will have the all 1 code. + ++huff.counts[max_depth]; + huff.values[total_count] = kJpegHuffmanAlphabetSize; + space -= (1 << (kJpegHuffmanMaxBitLength - max_depth)); + if (space < 0) { + return JXL_FAILURE("Invalid Huffman code lengths."); + } else if (space > 0 && huff_lut[0].value != 0xffff) { + // Re-initialize the values to an invalid symbol so that we can recognize + // it when reading the bit stream using a Huffman code with space > 0. + for (int i = 0; i < kJpegHuffmanLutSize; ++i) { + huff_lut[i].bits = 0; + huff_lut[i].value = 0xffff; + } + } + huff.is_last = (*pos == start_pos + marker_len); + if (mode == JpegReadMode::kReadAll) { + BuildJpegHuffmanTable(&huff.counts[0], &huff.values[0], huff_lut); + } + jpg->huffman_code.push_back(huff); + } + JXL_JPEG_VERIFY_MARKER_END(); + return true; +} + +// Reads the Define Quantization Table (DQT) marker segment and fills in *jpg +// with the parsed data. +bool ProcessDQT(const uint8_t* data, const size_t len, size_t* pos, + JPEGData* jpg) { + const size_t start_pos = *pos; + JXL_JPEG_VERIFY_LEN(2); + size_t marker_len = ReadUint16(data, pos); + if (marker_len == 2) { + return JXL_FAILURE("DQT marker: no quantization table found"); + } + while (*pos < start_pos + marker_len && jpg->quant.size() < kMaxQuantTables) { + JXL_JPEG_VERIFY_LEN(1); + int quant_table_index = ReadUint8(data, pos); + int quant_table_precision = quant_table_index >> 4; + JXL_JPEG_VERIFY_INPUT(quant_table_precision, 0, 1, QUANT_TBL_PRECISION); + quant_table_index &= 0xf; + JXL_JPEG_VERIFY_INPUT(quant_table_index, 0, 3, QUANT_TBL_INDEX); + JXL_JPEG_VERIFY_LEN((quant_table_precision + 1) * kDCTBlockSize); + JPEGQuantTable table; + table.index = quant_table_index; + table.precision = quant_table_precision; + for (size_t i = 0; i < kDCTBlockSize; ++i) { + int quant_val = + quant_table_precision ? ReadUint16(data, pos) : ReadUint8(data, pos); + JXL_JPEG_VERIFY_INPUT(quant_val, 1, 65535, QUANT_VAL); + table.values[kJPEGNaturalOrder[i]] = quant_val; + } + table.is_last = (*pos == start_pos + marker_len); + jpg->quant.push_back(table); + } + JXL_JPEG_VERIFY_MARKER_END(); + return true; +} + +// Reads the DRI marker and saves the restart interval into *jpg. +bool ProcessDRI(const uint8_t* data, const size_t len, size_t* pos, + bool* found_dri, JPEGData* jpg) { + if (*found_dri) { + return JXL_FAILURE("Duplicate DRI marker."); + } + *found_dri = true; + const size_t start_pos = *pos; + JXL_JPEG_VERIFY_LEN(4); + size_t marker_len = ReadUint16(data, pos); + int restart_interval = ReadUint16(data, pos); + jpg->restart_interval = restart_interval; + JXL_JPEG_VERIFY_MARKER_END(); + return true; +} + +// Saves the APP marker segment as a string to *jpg. +bool ProcessAPP(const uint8_t* data, const size_t len, size_t* pos, + JPEGData* jpg) { + JXL_JPEG_VERIFY_LEN(2); + size_t marker_len = ReadUint16(data, pos); + JXL_JPEG_VERIFY_INPUT(marker_len, 2, 65535, MARKER_LEN); + JXL_JPEG_VERIFY_LEN(marker_len - 2); + JXL_DASSERT(*pos >= 3); + // Save the marker type together with the app data. + const uint8_t* app_str_start = data + *pos - 3; + std::vector app_str(app_str_start, app_str_start + marker_len + 1); + *pos += marker_len - 2; + jpg->app_data.push_back(app_str); + return true; +} + +// Saves the COM marker segment as a string to *jpg. +bool ProcessCOM(const uint8_t* data, const size_t len, size_t* pos, + JPEGData* jpg) { + JXL_JPEG_VERIFY_LEN(2); + size_t marker_len = ReadUint16(data, pos); + JXL_JPEG_VERIFY_INPUT(marker_len, 2, 65535, MARKER_LEN); + JXL_JPEG_VERIFY_LEN(marker_len - 2); + const uint8_t* com_str_start = data + *pos - 3; + std::vector com_str(com_str_start, com_str_start + marker_len + 1); + *pos += marker_len - 2; + jpg->com_data.push_back(com_str); + return true; +} + +// Helper structure to read bits from the entropy coded data segment. +struct BitReaderState { + BitReaderState(const uint8_t* data, const size_t len, size_t pos) + : data_(data), len_(len) { + Reset(pos); + } + + void Reset(size_t pos) { + pos_ = pos; + val_ = 0; + bits_left_ = 0; + next_marker_pos_ = len_ - 2; + FillBitWindow(); + } + + // Returns the next byte and skips the 0xff/0x00 escape sequences. + uint8_t GetNextByte() { + if (pos_ >= next_marker_pos_) { + ++pos_; + return 0; + } + uint8_t c = data_[pos_++]; + if (c == 0xff) { + uint8_t escape = data_[pos_]; + if (escape == 0) { + ++pos_; + } else { + // 0xff was followed by a non-zero byte, which means that we found the + // start of the next marker segment. + next_marker_pos_ = pos_ - 1; + } + } + return c; + } + + void FillBitWindow() { + if (bits_left_ <= 16) { + while (bits_left_ <= 56) { + val_ <<= 8; + val_ |= (uint64_t)GetNextByte(); + bits_left_ += 8; + } + } + } + + int ReadBits(int nbits) { + FillBitWindow(); + uint64_t val = (val_ >> (bits_left_ - nbits)) & ((1ULL << nbits) - 1); + bits_left_ -= nbits; + return val; + } + + // Sets *pos to the next stream position where parsing should continue. + // Enqueue the padding bits seen (0 or 1). + // Returns false if there is inconsistent or invalid padding or the stream + // ended too early. + bool FinishStream(JPEGData* jpg, size_t* pos) { + int npadbits = bits_left_ & 7; + if (npadbits > 0) { + uint64_t padmask = (1ULL << npadbits) - 1; + uint64_t padbits = (val_ >> (bits_left_ - npadbits)) & padmask; + if (padbits != padmask) { + jpg->has_zero_padding_bit = true; + } + for (int i = npadbits - 1; i >= 0; --i) { + jpg->padding_bits.push_back((padbits >> i) & 1); + } + } + // Give back some bytes that we did not use. + int unused_bytes_left = bits_left_ >> 3; + while (unused_bytes_left-- > 0) { + --pos_; + // If we give back a 0 byte, we need to check if it was a 0xff/0x00 escape + // sequence, and if yes, we need to give back one more byte. + if (pos_ < next_marker_pos_ && data_[pos_] == 0 && + data_[pos_ - 1] == 0xff) { + --pos_; + } + } + if (pos_ > next_marker_pos_) { + // Data ran out before the scan was complete. + return JXL_FAILURE("Unexpected end of scan."); + } + *pos = pos_; + return true; + } + + const uint8_t* data_; + const size_t len_; + size_t pos_; + uint64_t val_; + int bits_left_; + size_t next_marker_pos_; +}; + +// Returns the next Huffman-coded symbol. +int ReadSymbol(const HuffmanTableEntry* table, BitReaderState* br) { + int nbits; + br->FillBitWindow(); + int val = (br->val_ >> (br->bits_left_ - 8)) & 0xff; + table += val; + nbits = table->bits - 8; + if (nbits > 0) { + br->bits_left_ -= 8; + table += table->value; + val = (br->val_ >> (br->bits_left_ - nbits)) & ((1 << nbits) - 1); + table += val; + } + br->bits_left_ -= table->bits; + return table->value; +} + +/** + * Returns the DC diff or AC value for extra bits value x and prefix code s. + * + * CCITT Rec. T.81 (1992 E) + * Table F.1 – Difference magnitude categories for DC coding + * SSSS | DIFF values + * ------+-------------------------- + * 0 | 0 + * 1 | –1, 1 + * 2 | –3, –2, 2, 3 + * 3 | –7..–4, 4..7 + * ......|.......................... + * 11 | –2047..–1024, 1024..2047 + * + * CCITT Rec. T.81 (1992 E) + * Table F.2 – Categories assigned to coefficient values + * [ Same as Table F.1, but does not include SSSS equal to 0 and 11] + * + * + * CCITT Rec. T.81 (1992 E) + * F.1.2.1.1 Structure of DC code table + * For each category,... additional bits... appended... to uniquely identify + * which difference... occurred... When DIFF is positive... SSSS... bits of DIFF + * are appended. When DIFF is negative... SSSS... bits of (DIFF – 1) are + * appended... Most significant bit... is 0 for negative differences and 1 for + * positive differences. + * + * In other words the upper half of extra bits range represents DIFF as is. + * The lower half represents the negative DIFFs with an offset. + */ +int HuffExtend(int x, int s) { + JXL_DASSERT(s >= 1); + int half = 1 << (s - 1); + if (x >= half) { + JXL_DASSERT(x < (1 << s)); + return x; + } else { + return x - (1 << s) + 1; + } +} + +// Decodes one 8x8 block of DCT coefficients from the bit stream. +bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff, + const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al, + int* eobrun, bool* reset_state, int* num_zero_runs, + BitReaderState* br, JPEGData* jpg, coeff_t* last_dc_coeff, + coeff_t* coeffs) { + // Nowadays multiplication is even faster than variable shift. + int Am = 1 << Al; + bool eobrun_allowed = Ss > 0; + if (Ss == 0) { + int s = ReadSymbol(dc_huff, br); + if (s >= kJpegDCAlphabetSize) { + return JXL_FAILURE("Invalid Huffman symbol %d for DC coefficient.", s); + } + int diff = 0; + if (s > 0) { + int bits = br->ReadBits(s); + diff = HuffExtend(bits, s); + } + int coeff = diff + *last_dc_coeff; + const int dc_coeff = coeff * Am; + coeffs[0] = dc_coeff; + // TODO(eustas): is there a more elegant / explicit way to check this? + if (dc_coeff != coeffs[0]) { + return JXL_FAILURE("Invalid DC coefficient %d", dc_coeff); + } + *last_dc_coeff = coeff; + ++Ss; + } + if (Ss > Se) { + return true; + } + if (*eobrun > 0) { + --(*eobrun); + return true; + } + *num_zero_runs = 0; + for (int k = Ss; k <= Se; k++) { + int sr = ReadSymbol(ac_huff, br); + if (sr >= kJpegHuffmanAlphabetSize) { + return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d", sr, + k); + } + int r = sr >> 4; + int s = sr & 15; + if (s > 0) { + k += r; + if (k > Se) { + return JXL_FAILURE("Out-of-band coefficient %d band was %d-%d", k, Ss, + Se); + } + if (s + Al >= kJpegDCAlphabetSize) { + return JXL_FAILURE( + "Out of range AC coefficient value: s = %d Al = %d k = %d", s, Al, + k); + } + int bits = br->ReadBits(s); + int coeff = HuffExtend(bits, s); + coeffs[kJPEGNaturalOrder[k]] = coeff * Am; + *num_zero_runs = 0; + } else if (r == 15) { + k += 15; + ++(*num_zero_runs); + } else { + if (eobrun_allowed && k == Ss && *eobrun == 0) { + // We have two end-of-block runs right after each other, so we signal + // the jpeg encoder to force a state reset at this point. + *reset_state = true; + } + *eobrun = 1 << r; + if (r > 0) { + if (!eobrun_allowed) { + return JXL_FAILURE("End-of-block run crossing DC coeff."); + } + *eobrun += br->ReadBits(r); + } + break; + } + } + --(*eobrun); + return true; +} + +bool RefineDCTBlock(const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al, + int* eobrun, bool* reset_state, BitReaderState* br, + JPEGData* jpg, coeff_t* coeffs) { + // Nowadays multiplication is even faster than variable shift. + int Am = 1 << Al; + bool eobrun_allowed = Ss > 0; + if (Ss == 0) { + int s = br->ReadBits(1); + coeff_t dc_coeff = coeffs[0]; + dc_coeff |= s * Am; + coeffs[0] = dc_coeff; + ++Ss; + } + if (Ss > Se) { + return true; + } + int p1 = Am; + int m1 = -Am; + int k = Ss; + int r; + int s; + bool in_zero_run = false; + if (*eobrun <= 0) { + for (; k <= Se; k++) { + s = ReadSymbol(ac_huff, br); + if (s >= kJpegHuffmanAlphabetSize) { + return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d", s, + k); + } + r = s >> 4; + s &= 15; + if (s) { + if (s != 1) { + return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d", + s, k); + } + s = br->ReadBits(1) ? p1 : m1; + in_zero_run = false; + } else { + if (r != 15) { + if (eobrun_allowed && k == Ss && *eobrun == 0) { + // We have two end-of-block runs right after each other, so we + // signal the jpeg encoder to force a state reset at this point. + *reset_state = true; + } + *eobrun = 1 << r; + if (r > 0) { + if (!eobrun_allowed) { + return JXL_FAILURE("End-of-block run crossing DC coeff."); + } + *eobrun += br->ReadBits(r); + } + break; + } + in_zero_run = true; + } + do { + coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]]; + if (thiscoef != 0) { + if (br->ReadBits(1)) { + if ((thiscoef & p1) == 0) { + if (thiscoef >= 0) { + thiscoef += p1; + } else { + thiscoef += m1; + } + } + } + coeffs[kJPEGNaturalOrder[k]] = thiscoef; + } else { + if (--r < 0) { + break; + } + } + k++; + } while (k <= Se); + if (s) { + if (k > Se) { + return JXL_FAILURE("Out-of-band coefficient %d band was %d-%d", k, Ss, + Se); + } + coeffs[kJPEGNaturalOrder[k]] = s; + } + } + } + if (in_zero_run) { + return JXL_FAILURE("Extra zero run before end-of-block."); + } + if (*eobrun > 0) { + for (; k <= Se; k++) { + coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]]; + if (thiscoef != 0) { + if (br->ReadBits(1)) { + if ((thiscoef & p1) == 0) { + if (thiscoef >= 0) { + thiscoef += p1; + } else { + thiscoef += m1; + } + } + } + coeffs[kJPEGNaturalOrder[k]] = thiscoef; + } + } + } + --(*eobrun); + return true; +} + +bool ProcessRestart(const uint8_t* data, const size_t len, + int* next_restart_marker, BitReaderState* br, + JPEGData* jpg) { + size_t pos = 0; + if (!br->FinishStream(jpg, &pos)) { + return JXL_FAILURE("Invalid scan"); + } + int expected_marker = 0xd0 + *next_restart_marker; + JXL_JPEG_EXPECT_MARKER(); + int marker = data[pos + 1]; + if (marker != expected_marker) { + return JXL_FAILURE("Did not find expected restart marker %d actual %d", + expected_marker, marker); + } + br->Reset(pos + 2); + *next_restart_marker += 1; + *next_restart_marker &= 0x7; + return true; +} + +bool ProcessScan(const uint8_t* data, const size_t len, + const std::vector& dc_huff_lut, + const std::vector& ac_huff_lut, + uint16_t scan_progression[kMaxComponents][kDCTBlockSize], + bool is_progressive, size_t* pos, JPEGData* jpg) { + if (!ProcessSOS(data, len, pos, jpg)) { + return false; + } + JPEGScanInfo* scan_info = &jpg->scan_info.back(); + bool is_interleaved = (scan_info->num_components > 1); + int max_h_samp_factor = 1; + int max_v_samp_factor = 1; + for (size_t i = 0; i < jpg->components.size(); ++i) { + max_h_samp_factor = + std::max(max_h_samp_factor, jpg->components[i].h_samp_factor); + max_v_samp_factor = + std::max(max_v_samp_factor, jpg->components[i].v_samp_factor); + } + + int MCU_rows = DivCeil(jpg->height, max_v_samp_factor * 8); + int MCUs_per_row = DivCeil(jpg->width, max_h_samp_factor * 8); + if (!is_interleaved) { + const JPEGComponent& c = jpg->components[scan_info->components[0].comp_idx]; + MCUs_per_row = DivCeil(jpg->width * c.h_samp_factor, 8 * max_h_samp_factor); + MCU_rows = DivCeil(jpg->height * c.v_samp_factor, 8 * max_v_samp_factor); + } + coeff_t last_dc_coeff[kMaxComponents] = {0}; + BitReaderState br(data, len, *pos); + int restarts_to_go = jpg->restart_interval; + int next_restart_marker = 0; + int eobrun = -1; + int block_scan_index = 0; + const int Al = is_progressive ? scan_info->Al : 0; + const int Ah = is_progressive ? scan_info->Ah : 0; + const int Ss = is_progressive ? scan_info->Ss : 0; + const int Se = is_progressive ? scan_info->Se : 63; + const uint16_t scan_bitmask = Ah == 0 ? (0xffff << Al) : (1u << Al); + const uint16_t refinement_bitmask = (1 << Al) - 1; + for (size_t i = 0; i < scan_info->num_components; ++i) { + int comp_idx = scan_info->components[i].comp_idx; + for (int k = Ss; k <= Se; ++k) { + if (scan_progression[comp_idx][k] & scan_bitmask) { + return JXL_FAILURE( + "Overlapping scans: component=%d k=%d prev_mask: %u cur_mask %u", + comp_idx, k, scan_progression[i][k], scan_bitmask); + } + if (scan_progression[comp_idx][k] & refinement_bitmask) { + return JXL_FAILURE( + "Invalid scan order, a more refined scan was already done: " + "component=%d k=%d prev_mask=%u cur_mask=%u", + comp_idx, k, scan_progression[i][k], scan_bitmask); + } + scan_progression[comp_idx][k] |= scan_bitmask; + } + } + if (Al > 10) { + return JXL_FAILURE("Scan parameter Al=%d is not supported.", Al); + } + for (int mcu_y = 0; mcu_y < MCU_rows; ++mcu_y) { + for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) { + // Handle the restart intervals. + if (jpg->restart_interval > 0) { + if (restarts_to_go == 0) { + if (ProcessRestart(data, len, &next_restart_marker, &br, jpg)) { + restarts_to_go = jpg->restart_interval; + memset(static_cast(last_dc_coeff), 0, sizeof(last_dc_coeff)); + if (eobrun > 0) { + return JXL_FAILURE("End-of-block run too long."); + } + eobrun = -1; // fresh start + } else { + return JXL_FAILURE("Could not process restart."); + } + } + --restarts_to_go; + } + // Decode one MCU. + for (size_t i = 0; i < scan_info->num_components; ++i) { + JPEGComponentScanInfo* si = &scan_info->components[i]; + JPEGComponent* c = &jpg->components[si->comp_idx]; + const HuffmanTableEntry* dc_lut = + &dc_huff_lut[si->dc_tbl_idx * kJpegHuffmanLutSize]; + const HuffmanTableEntry* ac_lut = + &ac_huff_lut[si->ac_tbl_idx * kJpegHuffmanLutSize]; + int nblocks_y = is_interleaved ? c->v_samp_factor : 1; + int nblocks_x = is_interleaved ? c->h_samp_factor : 1; + for (int iy = 0; iy < nblocks_y; ++iy) { + for (int ix = 0; ix < nblocks_x; ++ix) { + int block_y = mcu_y * nblocks_y + iy; + int block_x = mcu_x * nblocks_x + ix; + int block_idx = block_y * c->width_in_blocks + block_x; + bool reset_state = false; + int num_zero_runs = 0; + coeff_t* coeffs = &c->coeffs[block_idx * kDCTBlockSize]; + if (Ah == 0) { + if (!DecodeDCTBlock(dc_lut, ac_lut, Ss, Se, Al, &eobrun, + &reset_state, &num_zero_runs, &br, jpg, + &last_dc_coeff[si->comp_idx], coeffs)) { + return false; + } + } else { + if (!RefineDCTBlock(ac_lut, Ss, Se, Al, &eobrun, &reset_state, + &br, jpg, coeffs)) { + return false; + } + } + if (reset_state) { + scan_info->reset_points.emplace_back(block_scan_index); + } + if (num_zero_runs > 0) { + JPEGScanInfo::ExtraZeroRunInfo info; + info.block_idx = block_scan_index; + info.num_extra_zero_runs = num_zero_runs; + scan_info->extra_zero_runs.push_back(info); + } + ++block_scan_index; + } + } + } + } + } + if (eobrun > 0) { + return JXL_FAILURE("End-of-block run too long."); + } + if (!br.FinishStream(jpg, pos)) { + return JXL_FAILURE("Invalid scan."); + } + if (*pos > len) { + return JXL_FAILURE("Unexpected end of file during scan. pos=%" PRIuS + " len=%" PRIuS, + *pos, len); + } + return true; +} + +// Changes the quant_idx field of the components to refer to the index of the +// quant table in the jpg->quant array. +bool FixupIndexes(JPEGData* jpg) { + for (size_t i = 0; i < jpg->components.size(); ++i) { + JPEGComponent* c = &jpg->components[i]; + bool found_index = false; + for (size_t j = 0; j < jpg->quant.size(); ++j) { + if (jpg->quant[j].index == c->quant_idx) { + c->quant_idx = j; + found_index = true; + break; + } + } + if (!found_index) { + return JXL_FAILURE("Quantization table with index %u not found", + c->quant_idx); + } + } + return true; +} + +size_t FindNextMarker(const uint8_t* data, const size_t len, size_t pos) { + // kIsValidMarker[i] == 1 means (0xc0 + i) is a valid marker. + static const uint8_t kIsValidMarker[] = { + 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + }; + size_t num_skipped = 0; + while (pos + 1 < len && (data[pos] != 0xff || data[pos + 1] < 0xc0 || + !kIsValidMarker[data[pos + 1] - 0xc0])) { + ++pos; + ++num_skipped; + } + return num_skipped; +} + +} // namespace + +bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode, + JPEGData* jpg) { + size_t pos = 0; + // Check SOI marker. + JXL_JPEG_EXPECT_MARKER(); + int marker = data[pos + 1]; + pos += 2; + if (marker != 0xd8) { + return JXL_FAILURE("Did not find expected SOI marker, actual=%d", marker); + } + int lut_size = kMaxHuffmanTables * kJpegHuffmanLutSize; + std::vector dc_huff_lut(lut_size); + std::vector ac_huff_lut(lut_size); + bool found_sof = false; + bool found_dri = false; + uint16_t scan_progression[kMaxComponents][kDCTBlockSize] = {{0}}; + + jpg->padding_bits.resize(0); + bool is_progressive = false; // default + do { + // Read next marker. + size_t num_skipped = FindNextMarker(data, len, pos); + if (num_skipped > 0) { + // Add a fake marker to indicate arbitrary in-between-markers data. + jpg->marker_order.push_back(0xff); + jpg->inter_marker_data.emplace_back(data + pos, data + pos + num_skipped); + pos += num_skipped; + } + JXL_JPEG_EXPECT_MARKER(); + marker = data[pos + 1]; + pos += 2; + bool ok = true; + switch (marker) { + case 0xc0: + case 0xc1: + case 0xc2: + is_progressive = (marker == 0xc2); + ok = ProcessSOF(data, len, mode, &pos, jpg); + found_sof = true; + break; + case 0xc4: + ok = ProcessDHT(data, len, mode, &dc_huff_lut, &ac_huff_lut, &pos, jpg); + break; + case 0xd0: + case 0xd1: + case 0xd2: + case 0xd3: + case 0xd4: + case 0xd5: + case 0xd6: + case 0xd7: + // RST markers do not have any data. + break; + case 0xd9: + // Found end marker. + break; + case 0xda: + if (mode == JpegReadMode::kReadAll) { + ok = ProcessScan(data, len, dc_huff_lut, ac_huff_lut, + scan_progression, is_progressive, &pos, jpg); + } + break; + case 0xdb: + ok = ProcessDQT(data, len, &pos, jpg); + break; + case 0xdd: + ok = ProcessDRI(data, len, &pos, &found_dri, jpg); + break; + case 0xe0: + case 0xe1: + case 0xe2: + case 0xe3: + case 0xe4: + case 0xe5: + case 0xe6: + case 0xe7: + case 0xe8: + case 0xe9: + case 0xea: + case 0xeb: + case 0xec: + case 0xed: + case 0xee: + case 0xef: + if (mode != JpegReadMode::kReadTables) { + ok = ProcessAPP(data, len, &pos, jpg); + } + break; + case 0xfe: + if (mode != JpegReadMode::kReadTables) { + ok = ProcessCOM(data, len, &pos, jpg); + } + break; + default: + return JXL_FAILURE("Unsupported marker: %d pos=%" PRIuS " len=%" PRIuS, + marker, pos, len); + } + if (!ok) { + return false; + } + jpg->marker_order.push_back(marker); + if (mode == JpegReadMode::kReadHeader && found_sof) { + break; + } + } while (marker != 0xd9); + + if (!found_sof) { + return JXL_FAILURE("Missing SOF marker."); + } + + // Supplemental checks. + if (mode == JpegReadMode::kReadAll) { + if (pos < len) { + jpg->tail_data = std::vector(data + pos, data + len); + } + if (!FixupIndexes(jpg)) { + return false; + } + if (jpg->huffman_code.empty()) { + // Section B.2.4.2: "If a table has never been defined for a particular + // destination, then when this destination is specified in a scan header, + // the results are unpredictable." + return JXL_FAILURE("Need at least one Huffman code table."); + } + if (jpg->huffman_code.size() >= kMaxDHTMarkers) { + return JXL_FAILURE("Too many Huffman tables."); + } + } + return true; +} + +} // namespace jpeg +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h new file mode 100644 index 0000000000..3fad820e9d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h @@ -0,0 +1,36 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Functions for reading a jpeg byte stream into a JPEGData object. + +#ifndef LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_ +#define LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_ + +#include +#include + +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +enum class JpegReadMode { + kReadHeader, // only basic headers + kReadTables, // headers and tables (quant, Huffman, ...) + kReadAll, // everything +}; + +// Parses the JPEG stream contained in data[*pos ... len) and fills in *jpg with +// the parsed information. +// If mode is kReadHeader, it fills in only the image dimensions in *jpg. +// Returns false if the data is not valid JPEG, or if it contains an unsupported +// JPEG feature. +bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode, + JPEGData* jpg); + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc new file mode 100644 index 0000000000..38282e640a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc @@ -0,0 +1,103 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/enc_jpeg_huffman_decode.h" + +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +// Returns the table width of the next 2nd level table, count is the histogram +// of bit lengths for the remaining symbols, len is the code length of the next +// processed symbol. +static inline int NextTableBitSize(const int* count, int len) { + int left = 1 << (len - kJpegHuffmanRootTableBits); + while (len < static_cast(kJpegHuffmanMaxBitLength)) { + left -= count[len]; + if (left <= 0) break; + ++len; + left <<= 1; + } + return len - kJpegHuffmanRootTableBits; +} + +void BuildJpegHuffmanTable(const uint32_t* count, const uint32_t* symbols, + HuffmanTableEntry* lut) { + HuffmanTableEntry code; // current table entry + HuffmanTableEntry* table; // next available space in table + int len; // current code length + int idx; // symbol index + int key; // prefix code + int reps; // number of replicate key values in current table + int low; // low bits for current root entry + int table_bits; // key length of current table + int table_size; // size of current table + + // Make a local copy of the input bit length histogram. + int tmp_count[kJpegHuffmanMaxBitLength + 1] = {0}; + int total_count = 0; + for (len = 1; len <= static_cast(kJpegHuffmanMaxBitLength); ++len) { + tmp_count[len] = count[len]; + total_count += tmp_count[len]; + } + + table = lut; + table_bits = kJpegHuffmanRootTableBits; + table_size = 1 << table_bits; + + // Special case code with only one value. + if (total_count == 1) { + code.bits = 0; + code.value = symbols[0]; + for (key = 0; key < table_size; ++key) { + table[key] = code; + } + return; + } + + // Fill in root table. + key = 0; + idx = 0; + for (len = 1; len <= kJpegHuffmanRootTableBits; ++len) { + for (; tmp_count[len] > 0; --tmp_count[len]) { + code.bits = len; + code.value = symbols[idx++]; + reps = 1 << (kJpegHuffmanRootTableBits - len); + while (reps--) { + table[key++] = code; + } + } + } + + // Fill in 2nd level tables and add pointers to root table. + table += table_size; + table_size = 0; + low = 0; + for (len = kJpegHuffmanRootTableBits + 1; + len <= static_cast(kJpegHuffmanMaxBitLength); ++len) { + for (; tmp_count[len] > 0; --tmp_count[len]) { + // Start a new sub-table if the previous one is full. + if (low >= table_size) { + table += table_size; + table_bits = NextTableBitSize(tmp_count, len); + table_size = 1 << table_bits; + low = 0; + lut[key].bits = table_bits + kJpegHuffmanRootTableBits; + lut[key].value = (table - lut) - key; + ++key; + } + code.bits = len - kJpegHuffmanRootTableBits; + code.value = symbols[idx++]; + reps = 1 << (table_bits - code.bits); + while (reps--) { + table[low++] = code; + } + } + } +} + +} // namespace jpeg +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h new file mode 100644 index 0000000000..b8a60e4107 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h @@ -0,0 +1,41 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Utility function for building a Huffman lookup table for the jpeg decoder. + +#ifndef LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_ +#define LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_ + +#include + +namespace jxl { +namespace jpeg { + +constexpr int kJpegHuffmanRootTableBits = 8; +// Maximum huffman lookup table size. +// According to zlib/examples/enough.c, 758 entries are always enough for +// an alphabet of 257 symbols (256 + 1 special symbol for the all 1s code) and +// max bit length 16 if the root table has 8 bits. +constexpr int kJpegHuffmanLutSize = 758; + +struct HuffmanTableEntry { + // Initialize the value to an invalid symbol so that we can recognize it + // when reading the bit stream using a Huffman code with space > 0. + HuffmanTableEntry() : bits(0), value(0xffff) {} + + uint8_t bits; // number of bits used for this symbol + uint16_t value; // symbol value or table offset +}; + +// Builds jpeg-style Huffman lookup table from the given symbols. +// The symbols are in order of increasing bit lengths. The number of symbols +// with bit length n is given in counts[n] for each n >= 1. +void BuildJpegHuffmanTable(const uint32_t* counts, const uint32_t* symbols, + HuffmanTableEntry* lut); + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc new file mode 100644 index 0000000000..430707b9ed --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc @@ -0,0 +1,451 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/jpeg_data.h" + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" + +namespace jxl { +namespace jpeg { + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + +namespace { +enum JPEGComponentType : uint32_t { + kGray = 0, + kYCbCr = 1, + kRGB = 2, + kCustom = 3, +}; + +struct JPEGInfo { + size_t num_app_markers = 0; + size_t num_com_markers = 0; + size_t num_scans = 0; + size_t num_intermarker = 0; + bool has_dri = false; +}; + +Status VisitMarker(uint8_t* marker, Visitor* visitor, JPEGInfo* info) { + uint32_t marker32 = *marker - 0xc0; + JXL_RETURN_IF_ERROR(visitor->Bits(6, 0x00, &marker32)); + *marker = marker32 + 0xc0; + if ((*marker & 0xf0) == 0xe0) { + info->num_app_markers++; + } + if (*marker == 0xfe) { + info->num_com_markers++; + } + if (*marker == 0xda) { + info->num_scans++; + } + // We use a fake 0xff marker to signal intermarker data. + if (*marker == 0xff) { + info->num_intermarker++; + } + if (*marker == 0xdd) { + info->has_dri = true; + } + return true; +} + +} // namespace + +Status JPEGData::VisitFields(Visitor* visitor) { + bool is_gray = components.size() == 1; + JXL_RETURN_IF_ERROR(visitor->Bool(false, &is_gray)); + if (visitor->IsReading()) { + components.resize(is_gray ? 1 : 3); + } + JPEGInfo info; + if (visitor->IsReading()) { + uint8_t marker = 0xc0; + do { + JXL_RETURN_IF_ERROR(VisitMarker(&marker, visitor, &info)); + marker_order.push_back(marker); + if (marker_order.size() > 16384) { + return JXL_FAILURE("Too many markers: %" PRIuS "\n", + marker_order.size()); + } + } while (marker != 0xd9); + } else { + if (marker_order.size() > 16384) { + return JXL_FAILURE("Too many markers: %" PRIuS "\n", marker_order.size()); + } + for (size_t i = 0; i < marker_order.size(); i++) { + JXL_RETURN_IF_ERROR(VisitMarker(&marker_order[i], visitor, &info)); + } + if (!marker_order.empty()) { + // Last marker should always be EOI marker. + JXL_CHECK(marker_order.back() == 0xd9); + } + } + + // Size of the APP and COM markers. + if (visitor->IsReading()) { + app_data.resize(info.num_app_markers); + app_marker_type.resize(info.num_app_markers); + com_data.resize(info.num_com_markers); + scan_info.resize(info.num_scans); + } + JXL_ASSERT(app_data.size() == info.num_app_markers); + JXL_ASSERT(app_marker_type.size() == info.num_app_markers); + JXL_ASSERT(com_data.size() == info.num_com_markers); + JXL_ASSERT(scan_info.size() == info.num_scans); + for (size_t i = 0; i < app_data.size(); i++) { + auto& app = app_data[i]; + // Encodes up to 8 different values. + JXL_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(1), BitsOffset(1, 2), BitsOffset(2, 4), 0, + reinterpret_cast(&app_marker_type[i]))); + if (app_marker_type[i] != AppMarkerType::kUnknown && + app_marker_type[i] != AppMarkerType::kICC && + app_marker_type[i] != AppMarkerType::kExif && + app_marker_type[i] != AppMarkerType::kXMP) { + return JXL_FAILURE("Unknown app marker type %u", + static_cast(app_marker_type[i])); + } + uint32_t len = app.size() - 1; + JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len)); + if (visitor->IsReading()) app.resize(len + 1); + if (app.size() < 3) { + return JXL_FAILURE("Invalid marker size: %" PRIuS "\n", app.size()); + } + } + for (auto& com : com_data) { + uint32_t len = com.size() - 1; + JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len)); + if (visitor->IsReading()) com.resize(len + 1); + if (com.size() < 3) { + return JXL_FAILURE("Invalid marker size: %" PRIuS "\n", com.size()); + } + } + + uint32_t num_quant_tables = quant.size(); + JXL_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), Val(4), 2, &num_quant_tables)); + if (num_quant_tables == 4) { + return JXL_FAILURE("Invalid number of quant tables"); + } + if (visitor->IsReading()) { + quant.resize(num_quant_tables); + } + for (size_t i = 0; i < num_quant_tables; i++) { + if (quant[i].precision > 1) { + return JXL_FAILURE( + "Quant tables with more than 16 bits are not supported"); + } + JXL_RETURN_IF_ERROR(visitor->Bits(1, 0, &quant[i].precision)); + JXL_RETURN_IF_ERROR(visitor->Bits(2, i, &quant[i].index)); + JXL_RETURN_IF_ERROR(visitor->Bool(true, &quant[i].is_last)); + } + + JPEGComponentType component_type = + components.size() == 1 && components[0].id == 1 ? JPEGComponentType::kGray + : components.size() == 3 && components[0].id == 1 && + components[1].id == 2 && components[2].id == 3 + ? JPEGComponentType::kYCbCr + : components.size() == 3 && components[0].id == 'R' && + components[1].id == 'G' && components[2].id == 'B' + ? JPEGComponentType::kRGB + : JPEGComponentType::kCustom; + JXL_RETURN_IF_ERROR( + visitor->Bits(2, JPEGComponentType::kYCbCr, + reinterpret_cast(&component_type))); + uint32_t num_components; + if (component_type == JPEGComponentType::kGray) { + num_components = 1; + } else if (component_type != JPEGComponentType::kCustom) { + num_components = 3; + } else { + num_components = components.size(); + JXL_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), Val(4), 3, &num_components)); + if (num_components != 1 && num_components != 3) { + return JXL_FAILURE("Invalid number of components: %u", num_components); + } + } + if (visitor->IsReading()) { + components.resize(num_components); + } + if (component_type == JPEGComponentType::kCustom) { + for (size_t i = 0; i < components.size(); i++) { + JXL_RETURN_IF_ERROR(visitor->Bits(8, 0, &components[i].id)); + } + } else if (component_type == JPEGComponentType::kGray) { + components[0].id = 1; + } else if (component_type == JPEGComponentType::kRGB) { + components[0].id = 'R'; + components[1].id = 'G'; + components[2].id = 'B'; + } else { + components[0].id = 1; + components[1].id = 2; + components[2].id = 3; + } + size_t used_tables = 0; + for (size_t i = 0; i < components.size(); i++) { + JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &components[i].quant_idx)); + if (components[i].quant_idx >= quant.size()) { + return JXL_FAILURE("Invalid quant table for component %" PRIuS ": %u\n", + i, components[i].quant_idx); + } + used_tables |= 1U << components[i].quant_idx; + } + for (size_t i = 0; i < quant.size(); i++) { + if (used_tables & (1 << i)) continue; + if (i == 0) return JXL_FAILURE("First quant table unused."); + // Unused quant table has to be set to copy of previous quant table + for (size_t j = 0; j < 64; j++) { + if (quant[i].values[j] != quant[i - 1].values[j]) { + return JXL_FAILURE("Non-trivial unused quant table"); + } + } + } + + uint32_t num_huff = huffman_code.size(); + JXL_RETURN_IF_ERROR(visitor->U32(Val(4), BitsOffset(3, 2), BitsOffset(4, 10), + BitsOffset(6, 26), 4, &num_huff)); + if (visitor->IsReading()) { + huffman_code.resize(num_huff); + } + for (JPEGHuffmanCode& hc : huffman_code) { + bool is_ac = hc.slot_id >> 4; + uint32_t id = hc.slot_id & 0xF; + JXL_RETURN_IF_ERROR(visitor->Bool(false, &is_ac)); + JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &id)); + hc.slot_id = (static_cast(is_ac) << 4) | id; + JXL_RETURN_IF_ERROR(visitor->Bool(true, &hc.is_last)); + size_t num_symbols = 0; + for (size_t i = 0; i <= 16; i++) { + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(3, 2), + Bits(8), 0, &hc.counts[i])); + num_symbols += hc.counts[i]; + } + if (num_symbols < 1) { + // Actually, at least 2 symbols are required, since one of them is EOI. + return JXL_FAILURE("Empty Huffman table"); + } + if (num_symbols > hc.values.size()) { + return JXL_FAILURE("Huffman code too large (%" PRIuS ")", num_symbols); + } + // Presence flags for 4 * 64 + 1 values. + uint64_t value_slots[5] = {}; + for (size_t i = 0; i < num_symbols; i++) { + // Goes up to 256, included. Might have the same symbol appear twice... + JXL_RETURN_IF_ERROR(visitor->U32(Bits(2), BitsOffset(2, 4), + BitsOffset(4, 8), BitsOffset(8, 1), 0, + &hc.values[i])); + value_slots[hc.values[i] >> 6] |= (uint64_t)1 << (hc.values[i] & 0x3F); + } + if (hc.values[num_symbols - 1] != kJpegHuffmanAlphabetSize) { + return JXL_FAILURE("Missing EOI symbol"); + } + // Last element, denoting EOI, have to be 1 after the loop. + JXL_ASSERT(value_slots[4] == 1); + size_t num_values = 1; + for (size_t i = 0; i < 4; ++i) num_values += hwy::PopCount(value_slots[i]); + if (num_values != num_symbols) { + return JXL_FAILURE("Duplicate Huffman symbols"); + } + if (!is_ac) { + bool only_dc = ((value_slots[0] >> kJpegDCAlphabetSize) | value_slots[1] | + value_slots[2] | value_slots[3]) == 0; + if (!only_dc) return JXL_FAILURE("Huffman symbols out of DC range"); + } + } + + for (auto& scan : scan_info) { + JXL_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), Val(4), 1, &scan.num_components)); + if (scan.num_components >= 4) { + return JXL_FAILURE("Invalid number of components in SOS marker"); + } + JXL_RETURN_IF_ERROR(visitor->Bits(6, 0, &scan.Ss)); + JXL_RETURN_IF_ERROR(visitor->Bits(6, 63, &scan.Se)); + JXL_RETURN_IF_ERROR(visitor->Bits(4, 0, &scan.Al)); + JXL_RETURN_IF_ERROR(visitor->Bits(4, 0, &scan.Ah)); + for (size_t i = 0; i < scan.num_components; i++) { + JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].comp_idx)); + if (scan.components[i].comp_idx >= components.size()) { + return JXL_FAILURE("Invalid component idx in SOS marker"); + } + JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].ac_tbl_idx)); + JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].dc_tbl_idx)); + } + // TODO(veluca): actually set and use this value. + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), Val(2), BitsOffset(3, 3), + kMaxNumPasses - 1, + &scan.last_needed_pass)); + } + + // From here on, this is data that is not strictly necessary to get a valid + // JPEG, but necessary for bit-exact JPEG reconstruction. + if (info.has_dri) { + JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &restart_interval)); + } + + for (auto& scan : scan_info) { + uint32_t num_reset_points = scan.reset_points.size(); + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(2, 1), BitsOffset(4, 4), + BitsOffset(16, 20), 0, &num_reset_points)); + if (visitor->IsReading()) { + scan.reset_points.resize(num_reset_points); + } + int last_block_idx = -1; + for (auto& block_idx : scan.reset_points) { + block_idx -= last_block_idx + 1; + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(3, 1), + BitsOffset(5, 9), BitsOffset(28, 41), 0, + &block_idx)); + block_idx += last_block_idx + 1; + if (static_cast(block_idx) < last_block_idx + 1) { + return JXL_FAILURE("Invalid block ID: %u, last block was %d", block_idx, + last_block_idx); + } + // TODO(eustas): better upper boundary could be given at this point; also + // it could be applied during reset_points reading. + if (block_idx > (1u << 30)) { + // At most 8K x 8K x num_channels blocks are expected. That is, + // typically, 1.5 * 2^27. 2^30 should be sufficient for any sane + // image. + return JXL_FAILURE("Invalid block ID: %u", block_idx); + } + last_block_idx = block_idx; + } + + uint32_t num_extra_zero_runs = scan.extra_zero_runs.size(); + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(2, 1), BitsOffset(4, 4), + BitsOffset(16, 20), 0, + &num_extra_zero_runs)); + if (visitor->IsReading()) { + scan.extra_zero_runs.resize(num_extra_zero_runs); + } + last_block_idx = -1; + for (size_t i = 0; i < scan.extra_zero_runs.size(); ++i) { + uint32_t& block_idx = scan.extra_zero_runs[i].block_idx; + JXL_RETURN_IF_ERROR(visitor->U32( + Val(1), BitsOffset(2, 2), BitsOffset(4, 5), BitsOffset(8, 20), 1, + &scan.extra_zero_runs[i].num_extra_zero_runs)); + block_idx -= last_block_idx + 1; + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(3, 1), + BitsOffset(5, 9), BitsOffset(28, 41), 0, + &block_idx)); + block_idx += last_block_idx + 1; + if (static_cast(block_idx) < last_block_idx + 1) { + return JXL_FAILURE("Invalid block ID: %u, last block was %d", block_idx, + last_block_idx); + } + if (block_idx > (1u << 30)) { + // At most 8K x 8K x num_channels blocks are expected. That is, + // typically, 1.5 * 2^27. 2^30 should be sufficient for any sane + // image. + return JXL_FAILURE("Invalid block ID: %u", block_idx); + } + last_block_idx = block_idx; + } + } + std::vector inter_marker_data_sizes; + inter_marker_data_sizes.reserve(info.num_intermarker); + for (size_t i = 0; i < info.num_intermarker; ++i) { + uint32_t len = visitor->IsReading() ? 0 : inter_marker_data[i].size(); + JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len)); + if (visitor->IsReading()) inter_marker_data_sizes.emplace_back(len); + } + uint32_t tail_data_len = tail_data.size(); + if (!visitor->IsReading() && tail_data_len > 4260096) { + return JXL_FAILURE("Tail data too large (max size = 4260096, size = %u).", + tail_data_len); + } + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(8, 1), + BitsOffset(16, 257), BitsOffset(22, 65793), + 0, &tail_data_len)); + + JXL_RETURN_IF_ERROR(visitor->Bool(false, &has_zero_padding_bit)); + if (has_zero_padding_bit) { + uint32_t nbit = padding_bits.size(); + JXL_RETURN_IF_ERROR(visitor->Bits(24, 0, &nbit)); + if (visitor->IsReading()) { + padding_bits.reserve(std::min(1024u, nbit)); + for (uint32_t i = 0; i < nbit; i++) { + bool bbit = false; + JXL_RETURN_IF_ERROR(visitor->Bool(false, &bbit)); + padding_bits.push_back(bbit); + } + } else { + for (uint8_t& bit : padding_bits) { + bool bbit = bit; + JXL_RETURN_IF_ERROR(visitor->Bool(false, &bbit)); + bit = bbit; + } + } + } + + // Apply postponed actions. + if (visitor->IsReading()) { + tail_data.resize(tail_data_len); + JXL_ASSERT(inter_marker_data_sizes.size() == info.num_intermarker); + inter_marker_data.reserve(info.num_intermarker); + for (size_t i = 0; i < info.num_intermarker; ++i) { + inter_marker_data.emplace_back(inter_marker_data_sizes[i]); + } + } + + return true; +} + +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +void JPEGData::CalculateMcuSize(const JPEGScanInfo& scan, int* MCUs_per_row, + int* MCU_rows) const { + const bool is_interleaved = (scan.num_components > 1); + const JPEGComponent& base_component = components[scan.components[0].comp_idx]; + // h_group / v_group act as numerators for converting number of blocks to + // number of MCU. In interleaved mode it is 1, so MCU is represented with + // max_*_samp_factor blocks. In non-interleaved mode we choose numerator to + // be the samping factor, consequently MCU is always represented with single + // block. + const int h_group = is_interleaved ? 1 : base_component.h_samp_factor; + const int v_group = is_interleaved ? 1 : base_component.v_samp_factor; + int max_h_samp_factor = 1; + int max_v_samp_factor = 1; + for (const auto& c : components) { + max_h_samp_factor = std::max(c.h_samp_factor, max_h_samp_factor); + max_v_samp_factor = std::max(c.v_samp_factor, max_v_samp_factor); + } + *MCUs_per_row = DivCeil(width * h_group, 8 * max_h_samp_factor); + *MCU_rows = DivCeil(height * v_group, 8 * max_v_samp_factor); +} + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + +Status SetJPEGDataFromICC(const PaddedBytes& icc, jpeg::JPEGData* jpeg_data) { + size_t icc_pos = 0; + for (size_t i = 0; i < jpeg_data->app_data.size(); i++) { + if (jpeg_data->app_marker_type[i] != jpeg::AppMarkerType::kICC) { + continue; + } + size_t len = jpeg_data->app_data[i].size() - 17; + if (icc_pos + len > icc.size()) { + return JXL_FAILURE( + "ICC length is less than APP markers: requested %" PRIuS + " more bytes, " + "%" PRIuS " available", + len, icc.size() - icc_pos); + } + memcpy(&jpeg_data->app_data[i][17], icc.data() + icc_pos, len); + icc_pos += len; + } + if (icc_pos != icc.size() && icc_pos != 0) { + return JXL_FAILURE("ICC length is more than APP markers"); + } + return true; +} + +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +} // namespace jpeg +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h new file mode 100644 index 0000000000..70ff4f8e05 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h @@ -0,0 +1,216 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Data structures that represent the non-pixel contents of a jpeg file. + +#ifndef LIB_JXL_JPEG_JPEG_DATA_H_ +#define LIB_JXL_JPEG_JPEG_DATA_H_ + +#include +#include + +#include +#include + +#include "lib/jxl/common.h" // JPEGXL_ENABLE_TRANSCODE_JPEG +#include "lib/jxl/fields.h" + +namespace jxl { +namespace jpeg { + +constexpr int kMaxComponents = 4; +constexpr int kMaxQuantTables = 4; +constexpr int kMaxHuffmanTables = 4; +constexpr size_t kJpegHuffmanMaxBitLength = 16; +constexpr int kJpegHuffmanAlphabetSize = 256; +constexpr int kJpegDCAlphabetSize = 12; +constexpr int kMaxDHTMarkers = 512; +constexpr int kMaxDimPixels = 65535; +constexpr uint8_t kApp1 = 0xE1; +constexpr uint8_t kApp2 = 0xE2; +const uint8_t kIccProfileTag[12] = "ICC_PROFILE"; +const uint8_t kExifTag[6] = "Exif\0"; +const uint8_t kXMPTag[29] = "http://ns.adobe.com/xap/1.0/"; + +/* clang-format off */ +constexpr uint32_t kJPEGNaturalOrder[80] = { + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63, + // extra entries for safety in decoder + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63 +}; + +constexpr uint32_t kJPEGZigZagOrder[64] = { + 0, 1, 5, 6, 14, 15, 27, 28, + 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, + 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, + 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, + 35, 36, 48, 49, 57, 58, 62, 63 +}; +/* clang-format on */ + +// Quantization values for an 8x8 pixel block. +struct JPEGQuantTable { + std::array values; + uint32_t precision = 0; + // The index of this quantization table as it was parsed from the input JPEG. + // Each DQT marker segment contains an 'index' field, and we save this index + // here. Valid values are 0 to 3. + uint32_t index = 0; + // Set to true if this table is the last one within its marker segment. + bool is_last = true; +}; + +// Huffman code and decoding lookup table used for DC and AC coefficients. +struct JPEGHuffmanCode { + // Bit length histogram. + std::array counts = {}; + // Symbol values sorted by increasing bit lengths. + std::array values = {}; + // The index of the Huffman code in the current set of Huffman codes. For AC + // component Huffman codes, 0x10 is added to the index. + int slot_id = 0; + // Set to true if this Huffman code is the last one within its marker segment. + bool is_last = true; +}; + +// Huffman table indexes used for one component of one scan. +struct JPEGComponentScanInfo { + uint32_t comp_idx; + uint32_t dc_tbl_idx; + uint32_t ac_tbl_idx; +}; + +// Contains information that is used in one scan. +struct JPEGScanInfo { + // Parameters used for progressive scans (named the same way as in the spec): + // Ss : Start of spectral band in zig-zag sequence. + // Se : End of spectral band in zig-zag sequence. + // Ah : Successive approximation bit position, high. + // Al : Successive approximation bit position, low. + uint32_t Ss; + uint32_t Se; + uint32_t Ah; + uint32_t Al; + uint32_t num_components = 0; + std::array components; + // Last codestream pass that is needed to write this scan. + uint32_t last_needed_pass = 0; + + // Extra information required for bit-precise JPEG file reconstruction. + + // Set of block indexes where the JPEG encoder has to flush the end-of-block + // runs and refinement bits. + std::vector reset_points; + // The number of extra zero runs (Huffman symbol 0xf0) before the end of + // block (if nonzero), indexed by block index. + // All of these symbols can be omitted without changing the pixel values, but + // some jpeg encoders put these at the end of blocks. + typedef struct { + uint32_t block_idx; + uint32_t num_extra_zero_runs; + } ExtraZeroRunInfo; + std::vector extra_zero_runs; +}; + +typedef int16_t coeff_t; + +// Represents one component of a jpeg file. +struct JPEGComponent { + JPEGComponent() + : id(0), + h_samp_factor(1), + v_samp_factor(1), + quant_idx(0), + width_in_blocks(0), + height_in_blocks(0) {} + + // One-byte id of the component. + uint32_t id; + // Horizontal and vertical sampling factors. + // In interleaved mode, each minimal coded unit (MCU) has + // h_samp_factor x v_samp_factor DCT blocks from this component. + int h_samp_factor; + int v_samp_factor; + // The index of the quantization table used for this component. + uint32_t quant_idx; + // The dimensions of the component measured in 8x8 blocks. + uint32_t width_in_blocks; + uint32_t height_in_blocks; + // The DCT coefficients of this component, laid out block-by-block, divided + // through the quantization matrix values. + std::vector coeffs; +}; + +enum class AppMarkerType : uint32_t { + kUnknown = 0, + kICC = 1, + kExif = 2, + kXMP = 3, +}; + +// Represents a parsed jpeg file. +struct JPEGData : public Fields { + JPEGData() + : width(0), height(0), restart_interval(0), has_zero_padding_bit(false) {} + + JXL_FIELDS_NAME(JPEGData) +#if JPEGXL_ENABLE_TRANSCODE_JPEG + // Doesn't serialize everything - skips brotli-encoded data and what is + // already encoded in the codestream. + Status VisitFields(Visitor* visitor) override; +#else + Status VisitFields(Visitor* /* visitor */) override { + JXL_ABORT("JPEG transcoding support not enabled"); + } +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + + void CalculateMcuSize(const JPEGScanInfo& scan, int* MCUs_per_row, + int* MCU_rows) const; + + int width; + int height; + uint32_t restart_interval; + std::vector> app_data; + std::vector app_marker_type; + std::vector> com_data; + std::vector quant; + std::vector huffman_code; + std::vector components; + std::vector scan_info; + std::vector marker_order; + std::vector> inter_marker_data; + std::vector tail_data; + + // Extra information required for bit-precise JPEG file reconstruction. + + bool has_zero_padding_bit; + std::vector padding_bits; +}; + +#if JPEGXL_ENABLE_TRANSCODE_JPEG +// Set ICC profile in jpeg_data. +Status SetJPEGDataFromICC(const PaddedBytes& icc, jpeg::JPEGData* jpeg_data); +#else +static JXL_INLINE Status SetJPEGDataFromICC(const PaddedBytes& /* icc */, + jpeg::JPEGData* /* jpeg_data */) { + JXL_ABORT("JPEG transcoding support not enabled"); +} +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_JPEG_DATA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/jxl.syms b/third_party/jpeg-xl/lib/jxl/jxl.syms new file mode 100644 index 0000000000..0f398d7151 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jxl.syms @@ -0,0 +1,5 @@ +{ + extern "C" { + jpegxl_*; + }; +}; diff --git a/third_party/jpeg-xl/lib/jxl/jxl.version b/third_party/jpeg-xl/lib/jxl/jxl.version new file mode 100644 index 0000000000..26b0e9e54d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jxl.version @@ -0,0 +1,17 @@ +JXL_0 { + global: + Jxl*; + + local: + # Hide all the std namespace symbols. std namespace is explicitly marked + # as visibility(default) and header-only functions or methods (such as those + # from templates) should be exposed in shared libraries as weak symbols but + # this is only needed when we expose those types in the shared library API + # in any way. We don't use C++ std types in the API and we also don't + # support exceptions in the library. + # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion + # about this. + extern "C++" { + *std::*; + }; +}; diff --git a/third_party/jpeg-xl/lib/jxl/jxl_inspection.h b/third_party/jpeg-xl/lib/jxl/jxl_inspection.h new file mode 100644 index 0000000000..0b70a58523 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jxl_inspection.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_JXL_INSPECTION_H_ +#define LIB_JXL_JXL_INSPECTION_H_ + +#include + +#include "lib/jxl/image.h" + +namespace jxl { +// Type of the inspection-callback which, if enabled, will be called on various +// intermediate data during image processing, allowing inspection access. +// +// Returns false if processing can be stopped at that point, true otherwise. +// This is only advisory - it is always OK to just continue processing. +using InspectorImage3F = std::function; +} // namespace jxl + +#endif // LIB_JXL_JXL_INSPECTION_H_ diff --git a/third_party/jpeg-xl/lib/jxl/jxl_osx.syms b/third_party/jpeg-xl/lib/jxl/jxl_osx.syms new file mode 100644 index 0000000000..96bc568025 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jxl_osx.syms @@ -0,0 +1 @@ +_Jxl* diff --git a/third_party/jpeg-xl/lib/jxl/jxl_test.cc b/third_party/jpeg-xl/lib/jxl/jxl_test.cc new file mode 100644 index 0000000000..0a676802f6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/jxl_test.cc @@ -0,0 +1,1537 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/dec/jxl.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "lib/extras/codec.h" +#include "lib/extras/enc/encode.h" +#include "lib/extras/packed_image.h" +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_butteraugli_pnorm.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/fake_parallel_runner_testonly.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/jpeg/dec_jpeg_data.h" +#include "lib/jxl/jpeg/dec_jpeg_data_writer.h" +#include "lib/jxl/jpeg/enc_jpeg_data.h" +#include "lib/jxl/jpeg/jpeg_data.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/test_image.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" +#include "tools/box/box.h" + +namespace jxl { + +struct AuxOut; + +namespace { +using extras::JXLCompressParams; +using extras::JXLDecompressParams; +using extras::PackedPixelFile; +using test::ButteraugliDistance; +using test::ComputeDistance2; +using test::Roundtrip; +using test::TestImage; +using test::ThreadPoolForTests; + +#define JXL_TEST_NL 0 // Disabled in code + +TEST(JxlTest, RoundtripSinglePixel) { + TestImage t; + t.SetDimensions(1, 1).AddFrame().ZeroFill(); + PackedPixelFile ppf_out; + EXPECT_EQ(Roundtrip(t.ppf(), {}, {}, nullptr, &ppf_out), 55); +} + +TEST(JxlTest, RoundtripSinglePixelWithAlpha) { + TestImage t; + t.SetDimensions(1, 1).SetChannels(4).AddFrame().ZeroFill(); + PackedPixelFile ppf_out; + EXPECT_EQ(Roundtrip(t.ppf(), {}, {}, nullptr, &ppf_out), 59); +} + +// Changing serialized signature causes Decode to fail. +#ifndef JXL_CRASH_ON_ERROR +TEST(JxlTest, RoundtripMarker) { + TestImage t; + t.SetDimensions(1, 1).AddFrame().ZeroFill(); + for (size_t i = 0; i < 2; ++i) { + std::vector compressed; + EXPECT_TRUE(extras::EncodeImageJXL({}, t.ppf(), /*jpeg_bytes=*/nullptr, + &compressed)); + compressed[i] ^= 0xFF; + PackedPixelFile ppf_out; + EXPECT_FALSE(extras::DecodeImageJXL(compressed.data(), compressed.size(), + {}, /*decodec_bytes=*/nullptr, + &ppf_out)); + } +} +#endif + +TEST(JxlTest, RoundtripTinyFast) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(32, 32); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7); + cparams.distance = 4.0f; + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 192, 10); +} + +TEST(JxlTest, RoundtripSmallD1) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + size_t xsize = t.ppf().info.xsize / 8; + size_t ysize = t.ppf().info.ysize / 8; + t.SetDimensions(xsize, ysize); + + { + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 766, 40); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.2)); + } + + // With a lower intensity target than the default, the bitrate should be + // smaller. + t.ppf().info.intensity_target = 100.0f; + + { + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 659, 20); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.3)); + EXPECT_EQ(ppf_out.info.intensity_target, t.ppf().info.intensity_target); + } +} +TEST(JxlTest, RoundtripResample2) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2); + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3); // kFalcon + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 18772, 200); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(90)); +} + +TEST(JxlTest, RoundtripResample2Slow) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2); + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 9); // kTortoise + cparams.distance = 10.0; + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 4088, 200); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(250)); +} + +TEST(JxlTest, RoundtripResample2MT) { + ThreadPoolForTests pool(4); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + // image has to be large enough to have multiple groups after downsampling + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2); + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3); // kFalcon + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 228283, 1000); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(340)); +} + +// Roundtrip the image using a parallel runner that executes single-threaded but +// in random order. +TEST(JxlTest, RoundtripOutOfOrderProcessing) { + FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8); + ThreadPool pool(&JxlFakeParallelRunner, &fake_pool); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + // Image size is selected so that the block border needed is larger than the + // amount of pixels available on the next block. + t.SetDimensions(513, 515); + + JXLCompressParams cparams; + // Force epf so we end up needing a lot of border. + cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 3); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 22584, 400); + EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), 1.35); +} + +TEST(JxlTest, RoundtripOutOfOrderProcessingBorder) { + FakeParallelRunner fake_pool(/*order_seed=*/47, /*num_threads=*/8); + ThreadPool pool(&JxlFakeParallelRunner, &fake_pool); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + // Image size is selected so that the block border needed is larger than the + // amount of pixels available on the next block. + t.SetDimensions(513, 515); + + JXLCompressParams cparams; + // Force epf so we end up needing a lot of border. + cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 3); + cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 10907, 200); + EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), 2.9); +} + +TEST(JxlTest, RoundtripResample4) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 4); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 5824, 100); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(22)); +} + +TEST(JxlTest, RoundtripResample8) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 8); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 2036, 50); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(50)); +} + +TEST(JxlTest, RoundtripUnalignedD2) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + size_t xsize = t.ppf().info.xsize / 12; + size_t ysize = t.ppf().info.ysize / 7; + t.SetDimensions(xsize, ysize); + + JXLCompressParams cparams; + cparams.distance = 2.0; + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 506, 30); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.72)); +} + +TEST(JxlTest, RoundtripMultiGroup) { + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(600, 1024); + + auto test = [&](jxl::SpeedTier speed_tier, float target_distance, + size_t expected_size, float expected_distance) { + ThreadPoolForTests pool(4); + JXLCompressParams cparams; + int64_t effort = 10 - static_cast(speed_tier); + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, effort); + cparams.distance = target_distance; + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), expected_size, + 700); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), + IsSlightlyBelow(expected_distance)); + }; + + auto run_kitten = std::async(std::launch::async, test, SpeedTier::kKitten, + 1.0f, 54895u, 11.7); + auto run_wombat = std::async(std::launch::async, test, SpeedTier::kWombat, + 2.0f, 33507u, 20.0); +} + +TEST(JxlTest, RoundtripRGBToGrayscale) { + ThreadPoolForTests pool(4); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + io.ShrinkTo(600, 1024); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0f; + cparams.speed_tier = SpeedTier::kFalcon; + + JXLDecompressParams dparams; + dparams.color_space = "Gra_D65_Rel_SRG"; + + CodecInOut io2; + EXPECT_FALSE(io.Main().IsGray()); + size_t compressed_size; + JXL_EXPECT_OK( + Roundtrip(&io, cparams, dparams, &io2, _, &compressed_size, &pool)); + EXPECT_LE(compressed_size, 65000u); + EXPECT_TRUE(io2.Main().IsGray()); + + // Convert original to grayscale here, because TransformTo refuses to + // convert between grayscale and RGB. + ColorEncoding srgb_lin = ColorEncoding::LinearSRGB(/*is_gray=*/false); + ASSERT_TRUE(io.frames[0].TransformTo(srgb_lin, GetJxlCms())); + Image3F* color = io.Main().color(); + for (size_t y = 0; y < color->ysize(); ++y) { + float* row_r = color->PlaneRow(0, y); + float* row_g = color->PlaneRow(1, y); + float* row_b = color->PlaneRow(2, y); + for (size_t x = 0; x < color->xsize(); ++x) { + float luma = 0.2126 * row_r[x] + 0.7152 * row_g[x] + 0.0722 * row_b[x]; + row_r[x] = row_g[x] = row_b[x] = luma; + } + } + ColorEncoding srgb_gamma = ColorEncoding::SRGB(/*is_gray=*/false); + ASSERT_TRUE(io.frames[0].TransformTo(srgb_gamma, GetJxlCms())); + io.metadata.m.color_encoding = io2.Main().c_current(); + io.Main().OverrideProfile(io2.Main().c_current()); + EXPECT_THAT( + ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr, &pool), + IsSlightlyBelow(1.36)); +} + +TEST(JxlTest, RoundtripLargeFast) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7); // kSquirrel + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 445684, 5000); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(100)); +} + +TEST(JxlTest, RoundtripDotsForceEpf) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7); // kSquirrel + cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 2); + cparams.AddOption(JXL_ENC_FRAME_SETTING_DOTS, 1); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 41472, 300); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(18)); +} + +// Checks for differing size/distance in two consecutive runs of distance 2, +// which involves additional processing including adaptive reconstruction. +// Failing this may be a sign of race conditions or invalid memory accesses. +TEST(JxlTest, RoundtripD2Consistent) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7); // kSquirrel + cparams.distance = 2.0; + + // Try each xsize mod kBlockDim to verify right border handling. + for (size_t xsize = 48; xsize > 40; --xsize) { + t.SetDimensions(xsize, 15); + + PackedPixelFile ppf2; + const size_t size2 = Roundtrip(t.ppf(), cparams, {}, &pool, &ppf2); + + PackedPixelFile ppf3; + const size_t size3 = Roundtrip(t.ppf(), cparams, {}, &pool, &ppf3); + + // Exact same compressed size. + EXPECT_EQ(size2, size3); + + // Exact same distance. + const float dist2 = ComputeDistance2(t.ppf(), ppf2); + const float dist3 = ComputeDistance2(t.ppf(), ppf3); + EXPECT_EQ(dist2, dist3); + } +} + +// Same as above, but for full image, testing multiple groups. +TEST(JxlTest, RoundtripLargeConsistent) { + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7); // kSquirrel + cparams.distance = 2.0; + + auto roundtrip_and_compare = [&]() { + ThreadPoolForTests pool(8); + PackedPixelFile ppf2; + size_t size = Roundtrip(t.ppf(), cparams, {}, &pool, &ppf2); + double dist = ComputeDistance2(t.ppf(), ppf2); + return std::tuple(size, dist); + }; + + // Try each xsize mod kBlockDim to verify right border handling. + auto future2 = std::async(std::launch::async, roundtrip_and_compare); + auto future3 = std::async(std::launch::async, roundtrip_and_compare); + + const auto result2 = future2.get(); + const auto result3 = future3.get(); + + // Exact same compressed size. + EXPECT_EQ(std::get<0>(result2), std::get<0>(result3)); + + // Exact same distance. + EXPECT_EQ(std::get<1>(result2), std::get<1>(result3)); +} + +TEST(JxlTest, RoundtripSmallNL) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + size_t xsize = t.ppf().info.xsize / 8; + size_t ysize = t.ppf().info.ysize / 8; + t.SetDimensions(xsize, ysize); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 783, 25); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.1)); +} + +TEST(JxlTest, RoundtripNoGaborishNoAR) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 0); + cparams.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, 0); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 38561, 200); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.8)); +} + +TEST(JxlTest, RoundtripSmallNoGaborish) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + size_t xsize = t.ppf().info.xsize / 8; + size_t ysize = t.ppf().info.ysize / 8; + t.SetDimensions(xsize, ysize); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, 0); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 811, 20); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.1)); +} + +TEST(JxlTest, RoundtripSmallPatchesAlpha) { + ThreadPool* pool = nullptr; + TestImage t; + t.SetDimensions(256, 256).SetChannels(4); + t.SetColorEncoding("RGB_D65_SRG_Rel_Lin"); + TestImage::Frame frame = t.AddFrame(); + frame.ZeroFill(); + // This pattern should be picked up by the patch detection heuristics. + for (size_t y = 0; y < t.ppf().info.ysize; ++y) { + for (size_t x = 0; x < t.ppf().info.xsize; ++x) { + if (x % 4 == 0 && (y / 32) % 4 == 0) { + frame.SetValue(y, x, 1, 127.0f / 255.0f); + } + frame.SetValue(y, x, 3, 1.0f); + } + } + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7); // kSquirrel + cparams.distance = 0.1f; + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 597, 100); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.012f)); +} + +TEST(JxlTest, RoundtripSmallPatches) { + ThreadPool* pool = nullptr; + TestImage t; + t.SetDimensions(256, 256); + t.SetColorEncoding("RGB_D65_SRG_Rel_Lin"); + TestImage::Frame frame = t.AddFrame(); + frame.ZeroFill(); + // This pattern should be picked up by the patch detection heuristics. + for (size_t y = 0; y < t.ppf().info.ysize; ++y) { + for (size_t x = 0; x < t.ppf().info.xsize; ++x) { + if (x % 4 == 0 && (y / 32) % 4 == 0) { + frame.SetValue(y, x, 1, 127.0f / 255.0f); + } + } + } + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7); // kSquirrel + cparams.distance = 0.1f; + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 486, 100); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.012f)); +} + +// TODO(szabadka) Add encoder and decoder API functions that accept frame +// buffers in arbitrary unsigned and floating point formats, and then roundtrip +// test the lossless codepath to make sure the exact binary representations +// are preserved. +#if 0 +TEST(JxlTest, RoundtripImageBundleOriginalBits) { + // Image does not matter, only io.metadata.m and io2.metadata.m are tested. + Image3F image(1, 1); + ZeroFillImage(&image); + CodecInOut io; + io.metadata.m.color_encoding = ColorEncoding::LinearSRGB(); + io.SetFromImage(std::move(image), ColorEncoding::LinearSRGB()); + + CompressParams cparams; + + // Test unsigned integers from 1 to 32 bits + for (uint32_t bit_depth = 1; bit_depth <= 32; bit_depth++) { + if (bit_depth == 32) { + // TODO(lode): allow testing 32, however the code below ends up in + // enc_modular which does not support 32. We only want to test the header + // encoding though, so try without modular. + break; + } + + io.metadata.m.SetUintSamples(bit_depth); + CodecInOut io2; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _)); + + EXPECT_EQ(bit_depth, io2.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0u, io2.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_EQ(0u, io2.metadata.m.GetAlphaBits()); + } + + // Test various existing and non-existing floating point formats + for (uint32_t bit_depth = 8; bit_depth <= 32; bit_depth++) { + if (bit_depth != 32) { + // TODO: test other float types once they work + break; + } + + uint32_t exponent_bit_depth; + if (bit_depth < 10) { + exponent_bit_depth = 2; + } else if (bit_depth < 12) { + exponent_bit_depth = 3; + } else if (bit_depth < 16) { + exponent_bit_depth = 4; + } else if (bit_depth < 20) { + exponent_bit_depth = 5; + } else if (bit_depth < 24) { + exponent_bit_depth = 6; + } else if (bit_depth < 28) { + exponent_bit_depth = 7; + } else { + exponent_bit_depth = 8; + } + + io.metadata.m.bit_depth.bits_per_sample = bit_depth; + io.metadata.m.bit_depth.floating_point_sample = true; + io.metadata.m.bit_depth.exponent_bits_per_sample = exponent_bit_depth; + + CodecInOut io2; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2)); + + EXPECT_EQ(bit_depth, io2.metadata.m.bit_depth.bits_per_sample); + EXPECT_TRUE(io2.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(exponent_bit_depth, + io2.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_EQ(0u, io2.metadata.m.GetAlphaBits()); + } +} +#endif + +TEST(JxlTest, RoundtripGrayscale) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + ASSERT_NE(io.xsize(), 0u); + io.ShrinkTo(128, 128); + EXPECT_TRUE(io.Main().IsGray()); + EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB()); + + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + + { + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + + PaddedBytes compressed; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), + aux_out)); + CodecInOut io2; + EXPECT_TRUE(test::DecodeFile({}, Span(compressed), &io2)); + EXPECT_TRUE(io2.Main().IsGray()); + + EXPECT_LE(compressed.size(), 7000u); + EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(1.6)); + } + + // Test with larger butteraugli distance and other settings enabled so + // different jxl codepaths trigger. + { + CompressParams cparams; + cparams.butteraugli_distance = 8.0; + + PaddedBytes compressed; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), + aux_out)); + CodecInOut io2; + EXPECT_TRUE(test::DecodeFile({}, Span(compressed), &io2)); + EXPECT_TRUE(io2.Main().IsGray()); + + EXPECT_LE(compressed.size(), 1300u); + EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(6.0)); + } + + { + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + + PaddedBytes compressed; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), + aux_out)); + + CodecInOut io2; + JXLDecompressParams dparams; + dparams.color_space = "RGB_D65_SRG_Rel_SRG"; + EXPECT_TRUE( + test::DecodeFile(dparams, Span(compressed), &io2)); + EXPECT_FALSE(io2.Main().IsGray()); + + EXPECT_LE(compressed.size(), 7000u); + EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(1.6)); + } +} + +TEST(JxlTest, RoundtripAlpha) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_alpha.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + + ASSERT_NE(io.xsize(), 0u); + ASSERT_TRUE(io.metadata.m.HasAlpha()); + ASSERT_TRUE(io.Main().HasAlpha()); + io.ShrinkTo(300, 300); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + + EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB()); + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes compressed; + EXPECT_TRUE( + EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), aux_out)); + + EXPECT_LE(compressed.size(), 10077u); + + for (bool use_image_callback : {false, true}) { + for (bool unpremul_alpha : {false, true}) { + CodecInOut io2; + JXLDecompressParams dparams; + dparams.use_image_callback = use_image_callback; + dparams.unpremultiply_alpha = unpremul_alpha; + EXPECT_TRUE( + test::DecodeFile(dparams, Span(compressed), &io2)); + EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(1.15)); + } + } +} + +namespace { +// Performs "PremultiplyAlpha" for each ImageBundle (preview/frames). +bool PremultiplyAlpha(CodecInOut& io) { + const auto doPremultiplyAlpha = [](ImageBundle& bundle) { + if (!bundle.HasAlpha()) return; + if (!bundle.HasColor()) return; + auto* color = bundle.color(); + const auto* alpha = bundle.alpha(); + JXL_CHECK(color->ysize() == alpha->ysize()); + JXL_CHECK(color->xsize() == alpha->xsize()); + for (size_t y = 0; y < color->ysize(); y++) { + ::jxl::PremultiplyAlpha(color->PlaneRow(0, y), color->PlaneRow(1, y), + color->PlaneRow(2, y), alpha->Row(y), + color->xsize()); + } + }; + ExtraChannelInfo* eci = io.metadata.m.Find(ExtraChannel::kAlpha); + if (eci == nullptr || eci->alpha_associated) return false; + if (io.metadata.m.have_preview) { + doPremultiplyAlpha(io.preview_frame); + } + for (ImageBundle& ib : io.frames) { + doPremultiplyAlpha(ib); + } + eci->alpha_associated = true; + return true; +} + +bool UnpremultiplyAlpha(CodecInOut& io) { + const auto doUnpremultiplyAlpha = [](ImageBundle& bundle) { + if (!bundle.HasAlpha()) return; + if (!bundle.HasColor()) return; + auto* color = bundle.color(); + const auto* alpha = bundle.alpha(); + JXL_CHECK(color->ysize() == alpha->ysize()); + JXL_CHECK(color->xsize() == alpha->xsize()); + for (size_t y = 0; y < color->ysize(); y++) { + ::jxl::UnpremultiplyAlpha(color->PlaneRow(0, y), color->PlaneRow(1, y), + color->PlaneRow(2, y), alpha->Row(y), + color->xsize()); + } + }; + ExtraChannelInfo* eci = io.metadata.m.Find(ExtraChannel::kAlpha); + if (eci == nullptr || !eci->alpha_associated) return false; + if (io.metadata.m.have_preview) { + doUnpremultiplyAlpha(io.preview_frame); + } + for (ImageBundle& ib : io.frames) { + doUnpremultiplyAlpha(ib); + } + eci->alpha_associated = false; + return true; +} +} // namespace + +TEST(JxlTest, RoundtripAlphaPremultiplied) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_alpha.png"); + CodecInOut io, io_nopremul; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + ASSERT_TRUE(SetFromBytes(Span(orig), &io_nopremul)); + + ASSERT_NE(io.xsize(), 0u); + ASSERT_TRUE(io.metadata.m.HasAlpha()); + ASSERT_TRUE(io.Main().HasAlpha()); + io.ShrinkTo(300, 300); + io_nopremul.ShrinkTo(300, 300); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + + EXPECT_FALSE(io.Main().AlphaIsPremultiplied()); + EXPECT_TRUE(PremultiplyAlpha(io)); + EXPECT_TRUE(io.Main().AlphaIsPremultiplied()); + + EXPECT_FALSE(io_nopremul.Main().AlphaIsPremultiplied()); + + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes compressed; + EXPECT_TRUE( + EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), aux_out)); + EXPECT_LE(compressed.size(), 10000u); + + for (bool use_image_callback : {false, true}) { + for (bool unpremul_alpha : {false, true}) { + for (bool use_uint8 : {false, true}) { + printf( + "Testing premultiplied alpha using %s %s requesting " + "%spremultiplied output.\n", + use_uint8 ? "uint8" : "float", + use_image_callback ? "image callback" : "image_buffer", + unpremul_alpha ? "un" : ""); + CodecInOut io2; + JXLDecompressParams dparams; + dparams.use_image_callback = use_image_callback; + dparams.unpremultiply_alpha = unpremul_alpha; + if (use_uint8) { + dparams.accepted_formats = { + {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}}; + } + EXPECT_TRUE( + test::DecodeFile(dparams, Span(compressed), &io2)); + + EXPECT_EQ(unpremul_alpha, !io2.Main().AlphaIsPremultiplied()); + if (!unpremul_alpha) { + EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, + cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(1.2)); + EXPECT_TRUE(UnpremultiplyAlpha(io2)); + EXPECT_FALSE(io2.Main().AlphaIsPremultiplied()); + } + EXPECT_THAT(ButteraugliDistance(io_nopremul.frames, io2.frames, + cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(1.47)); + } + } + } +} + +TEST(JxlTest, RoundtripAlphaResampling) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_alpha.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + ASSERT_NE(t.ppf().info.xsize, 0); + ASSERT_TRUE(t.ppf().info.alpha_bits > 0); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 5); // kHare + cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2); + cparams.AddOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, 2); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 12803, 130); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(5.2)); +} + +TEST(JxlTest, RoundtripAlphaResamplingOnlyAlpha) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_alpha.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + ASSERT_NE(t.ppf().info.xsize, 0); + ASSERT_TRUE(t.ppf().info.alpha_bits > 0); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3); // kFalcon + cparams.AddOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, 2); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 33571, 400); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.49)); +} + +TEST(JxlTest, RoundtripAlphaNonMultipleOf8) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_alpha.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(12, 12); + ASSERT_NE(t.ppf().info.xsize, 0); + ASSERT_TRUE(t.ppf().info.alpha_bits > 0); + EXPECT_EQ(t.ppf().frames[0].color.format.data_type, JXL_TYPE_UINT8); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 107, 10); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.95)); +} + +TEST(JxlTest, RoundtripAlpha16) { + ThreadPoolForTests pool(4); + // The image is wider than 512 pixels to ensure multiple groups are tested. + size_t xsize = 1200, ysize = 160; + TestImage t; + t.SetDimensions(xsize, ysize).SetChannels(4).SetAllBitDepths(16); + TestImage::Frame frame = t.AddFrame(); + // Generate 16-bit pattern that uses various colors and alpha values. + const float mul = 1.0f / 65535; + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + uint16_t r = y * 65535 / ysize; + uint16_t g = x * 65535 / xsize; + uint16_t b = (y + x) * 65535 / (xsize + ysize); + frame.SetValue(y, x, 0, r * mul); + frame.SetValue(y, x, 1, g * mul); + frame.SetValue(y, x, 2, b * mul); + frame.SetValue(y, x, 3, g * mul); + } + } + + ASSERT_NE(t.ppf().info.xsize, 0); + ASSERT_EQ(t.ppf().info.alpha_bits, 16); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 6); // kWombat + cparams.distance = 0.5; + + PackedPixelFile ppf_out; + // TODO(szabadka) Investigate big size difference on i686 + // This still keeps happening (2023-04-18). + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 3466, 120); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.65)); +} + +namespace { +JXLCompressParams CompressParamsForLossless() { + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR, 1); + cparams.AddOption(JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM, 1); + cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 6); // Weighted + cparams.distance = 0; + return cparams; +} +} // namespace + +TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams = CompressParamsForLossless(); + + PackedPixelFile ppf_out; + EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 222167); + EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0); +} + +TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8ThunderGradient)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams = CompressParamsForLossless(); + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 2); // kThunder + cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 5); // Gradient + + PackedPixelFile ppf_out; + EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 261684); + EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0); +} + +TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8LightningGradient)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams = CompressParamsForLossless(); + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1); // kLightning + + PackedPixelFile ppf_out; + // Lax comparison because different SIMD will cause different compression. + EXPECT_THAT(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), + IsSlightlyBelow(286848u)); + EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0); +} + +TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8Falcon)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + + JXLCompressParams cparams = CompressParamsForLossless(); + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3); // kFalcon + + PackedPixelFile ppf_out; + EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 230766); + EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0); +} + +TEST(JxlTest, RoundtripLossless8Alpha) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_alpha.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + ASSERT_EQ(t.ppf().info.alpha_bits, 8); + EXPECT_EQ(t.ppf().frames[0].color.format.data_type, JXL_TYPE_UINT8); + + JXLCompressParams cparams = CompressParamsForLossless(); + + JXLDecompressParams dparams; + dparams.accepted_formats.push_back(t.ppf().frames[0].color.format); + + PackedPixelFile ppf_out; + EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 248817); + EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0); + EXPECT_EQ(ppf_out.info.alpha_bits, 8); + EXPECT_TRUE(test::SameAlpha(t.ppf(), ppf_out)); +} + +TEST(JxlTest, RoundtripLossless16Alpha) { + ThreadPool* pool = nullptr; + size_t xsize = 1200, ysize = 160; + TestImage t; + t.SetDimensions(xsize, ysize).SetChannels(4).SetAllBitDepths(16); + TestImage::Frame frame = t.AddFrame(); + // Generate 16-bit pattern that uses various colors and alpha values. + const float mul = 1.0f / 65535; + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + uint16_t r = y * 65535 / ysize; + uint16_t g = x * 65535 / xsize + 37; + uint16_t b = (y + x) * 65535 / (xsize + ysize); + frame.SetValue(y, x, 0, r * mul); + frame.SetValue(y, x, 1, g * mul); + frame.SetValue(y, x, 2, b * mul); + frame.SetValue(y, x, 3, g * mul); + } + } + ASSERT_EQ(t.ppf().info.bits_per_sample, 16); + ASSERT_EQ(t.ppf().info.alpha_bits, 16); + + JXLCompressParams cparams = CompressParamsForLossless(); + + JXLDecompressParams dparams; + dparams.accepted_formats.push_back(t.ppf().frames[0].color.format); + + PackedPixelFile ppf_out; + // TODO(szabadka) Investigate big size difference on i686 + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 4849, 100); + EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0); + EXPECT_EQ(ppf_out.info.alpha_bits, 16); + EXPECT_TRUE(test::SameAlpha(t.ppf(), ppf_out)); +} + +TEST(JxlTest, RoundtripLossless16AlphaNotMisdetectedAs8Bit) { + ThreadPool* pool = nullptr; + size_t xsize = 128, ysize = 128; + TestImage t; + t.SetDimensions(xsize, ysize).SetChannels(4).SetAllBitDepths(16); + TestImage::Frame frame = t.AddFrame(); + // All 16-bit values, both color and alpha, of this image are below 64. + // This allows testing if a code path wrongly concludes it's an 8-bit instead + // of 16-bit image (or even 6-bit). + const float mul = 1.0f / 65535; + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + uint16_t r = y * 64 / ysize; + uint16_t g = x * 64 / xsize + 37; + uint16_t b = (y + x) * 64 / (xsize + ysize); + frame.SetValue(y, x, 0, r * mul); + frame.SetValue(y, x, 1, g * mul); + frame.SetValue(y, x, 2, b * mul); + frame.SetValue(y, x, 3, g * mul); + } + } + ASSERT_EQ(t.ppf().info.bits_per_sample, 16); + ASSERT_EQ(t.ppf().info.alpha_bits, 16); + + JXLCompressParams cparams = CompressParamsForLossless(); + + JXLDecompressParams dparams; + dparams.accepted_formats.push_back(t.ppf().frames[0].color.format); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 543, 75); + EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0); + EXPECT_EQ(ppf_out.info.bits_per_sample, 16); + EXPECT_EQ(ppf_out.info.alpha_bits, 16); + EXPECT_TRUE(test::SameAlpha(t.ppf(), ppf_out)); +} + +TEST(JxlTest, RoundtripDots) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + ASSERT_NE(t.ppf().info.xsize, 0); + EXPECT_EQ(t.ppf().info.bits_per_sample, 8); + EXPECT_EQ(t.ppf().color_encoding.transfer_function, + JXL_TRANSFER_FUNCTION_SRGB); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7); // kSkirrel + cparams.AddOption(JXL_ENC_FRAME_SETTING_DOTS, 1); + cparams.distance = 0.04; + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 284295, 3000); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.35)); +} + +TEST(JxlTest, RoundtripNoise) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + ASSERT_NE(t.ppf().info.xsize, 0); + EXPECT_EQ(t.ppf().info.bits_per_sample, 8); + EXPECT_EQ(t.ppf().color_encoding.transfer_function, + JXL_TRANSFER_FUNCTION_SRGB); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7); // kSkirrel + cparams.AddOption(JXL_ENC_FRAME_SETTING_NOISE, 1); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 41385, 750); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.48)); +} + +TEST(JxlTest, RoundtripLossless8Gray) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png"); + TestImage t; + t.SetColorEncoding("Gra_D65_Rel_SRG").DecodeFromBytes(orig).ClearMetadata(); + EXPECT_EQ(t.ppf().color_encoding.color_space, JXL_COLOR_SPACE_GRAY); + EXPECT_EQ(t.ppf().info.bits_per_sample, 8); + + JXLCompressParams cparams = CompressParamsForLossless(); + + JXLDecompressParams dparams; + dparams.accepted_formats.push_back(t.ppf().frames[0].color.format); + + PackedPixelFile ppf_out; + EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 92766); + EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0); + EXPECT_EQ(ppf_out.color_encoding.color_space, JXL_COLOR_SPACE_GRAY); + EXPECT_EQ(ppf_out.info.bits_per_sample, 8); +} + +#if JPEGXL_ENABLE_GIF + +TEST(JxlTest, RoundtripAnimation) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData("jxl/traffic_light.gif"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + EXPECT_EQ(4, t.ppf().frames.size()); + + JXLDecompressParams dparams; + dparams.accepted_formats.push_back(t.ppf().frames[0].color.format); + + PackedPixelFile ppf_out; + EXPECT_THAT(Roundtrip(t.ppf(), {}, dparams, pool, &ppf_out), + IsSlightlyBelow(2600)); + + t.CoalesceGIFAnimationWithAlpha(); + ASSERT_EQ(ppf_out.frames.size(), t.ppf().frames.size()); + EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), +#if JXL_HIGH_PRECISION + 1.55); +#else + 1.75); +#endif +} + +TEST(JxlTest, RoundtripLosslessAnimation) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData("jxl/traffic_light.gif"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + EXPECT_EQ(4, t.ppf().frames.size()); + + JXLCompressParams cparams = CompressParamsForLossless(); + + JXLDecompressParams dparams; + dparams.accepted_formats.push_back(t.ppf().frames[0].color.format); + + PackedPixelFile ppf_out; + EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 958); + + t.CoalesceGIFAnimationWithAlpha(); + ASSERT_EQ(ppf_out.frames.size(), t.ppf().frames.size()); + EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), 5e-4); +} + +TEST(JxlTest, RoundtripAnimationPatches) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = jxl::test::ReadTestData("jxl/animation_patches.gif"); + + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata(); + ASSERT_EQ(2u, t.ppf().frames.size()); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_PATCHES, 1); + + JXLDecompressParams dparams; + dparams.accepted_formats.push_back(t.ppf().frames[0].color.format); + + PackedPixelFile ppf_out; + // 40k with no patches, 27k with patch frames encoded multiple times. + EXPECT_THAT(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), + IsSlightlyBelow(16710)); + EXPECT_EQ(ppf_out.frames.size(), t.ppf().frames.size()); + // >10 with broken patches + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.05)); +} + +#endif // JPEGXL_ENABLE_GIF + +size_t RoundtripJpeg(const PaddedBytes& jpeg_in, ThreadPool* pool) { + std::vector jpeg_bytes(jpeg_in.data(), + jpeg_in.data() + jpeg_in.size()); + std::vector compressed; + EXPECT_TRUE(extras::EncodeImageJXL({}, extras::PackedPixelFile(), &jpeg_bytes, + &compressed)); + + jxl::JXLDecompressParams dparams; + test::SetThreadParallelRunner(dparams, pool); + std::vector out; + jxl::PackedPixelFile ppf; + EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams, + nullptr, &ppf, &out)); + EXPECT_EQ(out.size(), jpeg_in.size()); + size_t failures = 0; + for (size_t i = 0; i < std::min(out.size(), jpeg_in.size()); i++) { + if (out[i] != jpeg_in[i]) { + EXPECT_EQ(out[i], jpeg_in[i]) + << "byte mismatch " << i << " " << out[i] << " != " << jpeg_in[i]; + if (++failures > 4) { + return compressed.size(); + } + } + } + return compressed.size(); +} + +void RoundtripJpegToPixels(const PaddedBytes& jpeg_in, + JXLDecompressParams dparams, ThreadPool* pool, + PackedPixelFile* ppf_out) { + std::vector jpeg_bytes(jpeg_in.data(), + jpeg_in.data() + jpeg_in.size()); + std::vector compressed; + EXPECT_TRUE(extras::EncodeImageJXL({}, extras::PackedPixelFile(), &jpeg_bytes, + &compressed)); + + test::SetThreadParallelRunner(dparams, pool); + EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams, + nullptr, ppf_out, nullptr)); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression444)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_444.jpg"); + // JPEG size is 696,659 bytes. + EXPECT_NEAR(RoundtripJpeg(orig, &pool), 568940u, 10); +} + +#if JPEGXL_ENABLE_JPEG + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_444.jpg"); + TestImage t; + t.DecodeFromBytes(orig); + + PackedPixelFile ppf_out; + RoundtripJpegToPixels(orig, {}, &pool, &ppf_out); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(12)); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420.jpg"); + TestImage t; + t.DecodeFromBytes(orig); + + PackedPixelFile ppf_out; + RoundtripJpegToPixels(orig, {}, &pool, &ppf_out); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(11)); +} + +TEST(JxlTest, + JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420EarlyFlush)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420.jpg"); + TestImage t; + t.DecodeFromBytes(orig); + + JXLDecompressParams dparams; + dparams.max_downsampling = 8; + + PackedPixelFile ppf_out; + RoundtripJpegToPixels(orig, dparams, &pool, &ppf_out); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(4410)); +} + +TEST(JxlTest, + JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420Mul16)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower_cropped.jpg"); + TestImage t; + t.DecodeFromBytes(orig); + + PackedPixelFile ppf_out; + RoundtripJpegToPixels(orig, {}, &pool, &ppf_out); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(4)); +} + +TEST(JxlTest, + JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels_asymmetric)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_asymmetric.jpg"); + TestImage t; + t.DecodeFromBytes(orig); + + PackedPixelFile ppf_out; + RoundtripJpegToPixels(orig, {}, &pool, &ppf_out); + EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(10)); +} + +#endif + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionGray)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_gray.jpg"); + // JPEG size is 456,528 bytes. + EXPECT_NEAR(RoundtripJpeg(orig, &pool), 387496u, 200); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression420)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420.jpg"); + // JPEG size is 546,797 bytes. + EXPECT_NEAR(RoundtripJpeg(orig, &pool), 455560u, 10); +} + +TEST(JxlTest, + JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression_luma_subsample)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData( + "jxl/flower/flower.png.im_q85_luma_subsample.jpg"); + // JPEG size is 400,724 bytes. + EXPECT_NEAR(RoundtripJpeg(orig, &pool), 325354u, 10); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression444_12)) { + // 444 JPEG that has an interesting sampling-factor (1x2, 1x2, 1x2). + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_444_1x2.jpg"); + // JPEG size is 703,874 bytes. + EXPECT_NEAR(RoundtripJpeg(orig, &pool), 569679u, 10); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression422)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_422.jpg"); + // JPEG size is 522,057 bytes. + EXPECT_NEAR(RoundtripJpeg(orig, &pool), 499282u, 10); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression440)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_440.jpg"); + // JPEG size is 603,623 bytes. + EXPECT_NEAR(RoundtripJpeg(orig, &pool), 501151u, 10); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression_asymmetric)) { + // 2x vertical downsample of one chroma channel, 2x horizontal downsample of + // the other. + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_asymmetric.jpg"); + // JPEG size is 604,601 bytes. + EXPECT_NEAR(RoundtripJpeg(orig, &pool), 500602u, 10); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression420Progr)) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = + jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420_progr.jpg"); + // JPEG size is 522,057 bytes. + EXPECT_NEAR(RoundtripJpeg(orig, &pool), 455499u, 10); +} + +TEST(JxlTest, RoundtripProgressive) { + ThreadPoolForTests pool(4); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(600, 1024); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 1); + cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1); + cparams.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 1); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 61635, 750); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.4)); +} + +TEST(JxlTest, RoundtripProgressiveLevel2Slow) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + TestImage t; + t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(600, 1024); + + JXLCompressParams cparams; + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 9); // kTortoise + cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2); + cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1); + cparams.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 1); + + PackedPixelFile ppf_out; + EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 72841, 1000); + EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.17)); +} + +TEST(JxlTest, RoundtripUnsignedCustomBitdepthLossless) { + ThreadPool* pool = nullptr; + for (uint32_t num_channels = 1; num_channels < 6; ++num_channels) { + for (JxlEndianness endianness : {JXL_LITTLE_ENDIAN, JXL_BIG_ENDIAN}) { + for (uint32_t bitdepth = 3; bitdepth <= 16; ++bitdepth) { + if (bitdepth <= 8 && endianness == JXL_BIG_ENDIAN) continue; + printf("Testing %u channel unsigned %u bit %s endian lossless.\n", + num_channels, bitdepth, + endianness == JXL_LITTLE_ENDIAN ? "little" : "big"); + TestImage t; + t.SetDimensions(256, 256).SetChannels(num_channels); + t.SetAllBitDepths(bitdepth).SetEndianness(endianness); + TestImage::Frame frame = t.AddFrame(); + frame.RandomFill(); + + JXLCompressParams cparams = CompressParamsForLossless(); + cparams.input_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM; + + JXLDecompressParams dparams; + dparams.accepted_formats.push_back(t.ppf().frames[0].color.format); + dparams.output_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM; + + PackedPixelFile ppf_out; + Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out); + + ASSERT_TRUE(test::SamePixels(t.ppf(), ppf_out)); + } + } + } +} + +TEST(JxlTest, LosslessPNMRoundtrip) { + static const char* kChannels[] = {"", "g", "ga", "rgb", "rgba"}; + static const char* kExtension[] = {"", ".pgm", ".pam", ".ppm", ".pam"}; + for (size_t bit_depth = 1; bit_depth <= 16; ++bit_depth) { + for (size_t channels = 1; channels <= 4; ++channels) { + if (bit_depth == 1 && (channels == 2 || channels == 4)) continue; + std::string extension(kExtension[channels]); + std::string filename = "jxl/flower/flower_small." + + std::string(kChannels[channels]) + ".depth" + + std::to_string(bit_depth) + extension; + const PaddedBytes orig = jxl::test::ReadTestData(filename); + test::TestImage t; + if (channels < 3) t.SetColorEncoding("Gra_D65_Rel_SRG"); + t.DecodeFromBytes(orig); + + JXLCompressParams cparams = CompressParamsForLossless(); + cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1); // kLightning + cparams.input_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM; + + JXLDecompressParams dparams; + dparams.accepted_formats.push_back(t.ppf().frames[0].color.format); + dparams.output_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM; + + PackedPixelFile ppf_out; + Roundtrip(t.ppf(), cparams, dparams, nullptr, &ppf_out); + + extras::EncodedImage encoded; + auto encoder = extras::Encoder::FromExtension(extension); + ASSERT_TRUE(encoder.get()); + ASSERT_TRUE(encoder->Encode(ppf_out, &encoded, nullptr)); + ASSERT_EQ(encoded.bitstreams.size(), 1); + ASSERT_EQ(orig.size(), encoded.bitstreams[0].size()); + EXPECT_EQ(0, + memcmp(orig.data(), encoded.bitstreams[0].data(), orig.size())); + } + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/lehmer_code.h b/third_party/jpeg-xl/lib/jxl/lehmer_code.h new file mode 100644 index 0000000000..dd1d21c6f7 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/lehmer_code.h @@ -0,0 +1,102 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_LEHMER_CODE_H_ +#define LIB_JXL_LEHMER_CODE_H_ + +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Permutation <=> factorial base representation (Lehmer code). + +using LehmerT = uint32_t; + +template +constexpr T ValueOfLowest1Bit(T t) { + return t & -t; +} + +// Computes the Lehmer (factorial basis) code of permutation, an array of n +// unique indices in [0..n), and stores it in code[0..len). N*logN time. +// temp must have n + 1 elements but need not be initialized. +template +void ComputeLehmerCode(const PermutationT* JXL_RESTRICT permutation, + uint32_t* JXL_RESTRICT temp, const size_t n, + LehmerT* JXL_RESTRICT code) { + for (size_t idx = 0; idx < n + 1; ++idx) temp[idx] = 0; + + for (size_t idx = 0; idx < n; ++idx) { + const PermutationT s = permutation[idx]; + + // Compute sum in Fenwick tree + uint32_t penalty = 0; + uint32_t i = s + 1; + while (i != 0) { + penalty += temp[i]; + i &= i - 1; // clear lowest bit + } + JXL_DASSERT(s >= penalty); + code[idx] = s - penalty; + i = s + 1; + // Add operation in Fenwick tree + while (i < n + 1) { + temp[i] += 1; + i += ValueOfLowest1Bit(i); + } + } +} + +// Decodes the Lehmer code in code[0..n) into permutation[0..n). +// temp must have 1 << CeilLog2(n) elements but need not be initialized. +template +void DecodeLehmerCode(const LehmerT* JXL_RESTRICT code, + uint32_t* JXL_RESTRICT temp, size_t n, + PermutationT* JXL_RESTRICT permutation) { + JXL_DASSERT(n != 0); + const size_t log2n = CeilLog2Nonzero(n); + const size_t padded_n = 1ull << log2n; + + for (size_t i = 0; i < padded_n; i++) { + const int32_t i1 = static_cast(i + 1); + temp[i] = static_cast(ValueOfLowest1Bit(i1)); + } + + for (size_t i = 0; i < n; i++) { + JXL_DASSERT(code[i] + i < n); + uint32_t rank = code[i] + 1; + + // Extract i-th unused element via implicit order-statistics tree. + size_t bit = padded_n; + size_t next = 0; + for (size_t i = 0; i <= log2n; i++) { + const size_t cand = next + bit; + JXL_DASSERT(cand >= 1); + bit >>= 1; + if (temp[cand - 1] < rank) { + next = cand; + rank -= temp[cand - 1]; + } + } + + permutation[i] = next; + + // Mark as used + next += 1; + while (next <= padded_n) { + temp[next - 1] -= 1; + next += ValueOfLowest1Bit(next); + } + } +} + +} // namespace jxl + +#endif // LIB_JXL_LEHMER_CODE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc b/third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc new file mode 100644 index 0000000000..acda762545 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/lehmer_code.h" + +#include +#include + +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/random.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +template +struct WorkingSet { + explicit WorkingSet(size_t max_n) + : padded_n(1ull << CeilLog2Nonzero(max_n + 1)), + permutation(max_n), + temp(padded_n), + lehmer(max_n), + decoded(max_n) {} + + size_t padded_n; + std::vector permutation; + std::vector temp; + std::vector lehmer; + std::vector decoded; +}; + +template +void Roundtrip(size_t n, WorkingSet* ws) { + JXL_ASSERT(n != 0); + const size_t padded_n = 1ull << CeilLog2Nonzero(n); + + Rng rng(n * 65537 + 13); + + // Ensure indices fit into PermutationT + EXPECT_LE(n, 1ULL << (sizeof(PermutationT) * 8)); + + std::iota(ws->permutation.begin(), ws->permutation.begin() + n, 0); + + // For various random permutations: + for (size_t rep = 0; rep < 3; ++rep) { + rng.Shuffle(ws->permutation.data(), n); + + // Must decode to the same permutation + ComputeLehmerCode(ws->permutation.data(), ws->temp.data(), n, + ws->lehmer.data()); + memset(ws->temp.data(), 0, padded_n * 4); + DecodeLehmerCode(ws->lehmer.data(), ws->temp.data(), n, ws->decoded.data()); + + for (size_t i = 0; i < n; ++i) { + EXPECT_EQ(ws->permutation[i], ws->decoded[i]); + } + } +} + +// Preallocates arrays and tests n = [begin, end). +template +void RoundtripSizeRange(ThreadPool* pool, uint32_t begin, uint32_t end) { + ASSERT_NE(0u, begin); // n = 0 not allowed. + std::vector> working_sets; + + JXL_CHECK(RunOnPool( + pool, begin, end, + [&working_sets, end](const size_t num_threads) { + for (size_t i = 0; i < num_threads; i++) { + working_sets.emplace_back(end - 1); + } + return true; + }, + [&working_sets](const uint32_t n, const size_t thread) { + Roundtrip(n, &working_sets[thread]); + }, + "lehmer test")); +} + +TEST(LehmerCodeTest, TestRoundtrips) { + test::ThreadPoolForTests pool(8); + + RoundtripSizeRange(&pool, 1, 1026); + + // Ensures PermutationT can fit > 16 bit values. + RoundtripSizeRange(&pool, 65536, 65540); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/libjxl.pc.in b/third_party/jpeg-xl/lib/jxl/libjxl.pc.in new file mode 100644 index 0000000000..4a7af65b7c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/libjxl.pc.in @@ -0,0 +1,13 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +libdir=@PKGCONFIG_TARGET_LIBS@ +includedir=@PKGCONFIG_TARGET_INCLUDES@ + +Name: libjxl +Description: Loads and saves JPEG XL files +Version: @JPEGXL_LIBRARY_VERSION@ +Requires.private: @JPEGXL_LIBRARY_REQUIRES@ +Libs: -L${libdir} -ljxl +Libs.private: -lm +Cflags: -I${includedir} +Cflags.private: -DJXL_STATIC_DEFINE diff --git a/third_party/jpeg-xl/lib/jxl/loop_filter.cc b/third_party/jpeg-xl/lib/jxl/loop_filter.cc new file mode 100644 index 0000000000..5afe87617d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/loop_filter.cc @@ -0,0 +1,98 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/loop_filter.h" + +#include "lib/jxl/base/status.h" +#include "lib/jxl/fields.h" + +namespace jxl { + +LoopFilter::LoopFilter() { Bundle::Init(this); } +Status LoopFilter::VisitFields(Visitor* JXL_RESTRICT visitor) { + // Must come before AllDefault. + + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &gab)); + if (visitor->Conditional(gab)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &gab_custom)); + if (visitor->Conditional(gab_custom)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.104699568f, &gab_x_weight1)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.055680538f, &gab_x_weight2)); + if (std::abs(1.0f + (gab_x_weight1 + gab_x_weight2) * 4) < 1e-8) { + return JXL_FAILURE( + "Gaborish x weights lead to near 0 unnormalized kernel"); + } + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.104699568f, &gab_y_weight1)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.055680538f, &gab_y_weight2)); + if (std::abs(1.0f + (gab_y_weight1 + gab_y_weight2) * 4) < 1e-8) { + return JXL_FAILURE( + "Gaborish y weights lead to near 0 unnormalized kernel"); + } + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.104699568f, &gab_b_weight1)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.055680538f, &gab_b_weight2)); + if (std::abs(1.0f + (gab_b_weight1 + gab_b_weight2) * 4) < 1e-8) { + return JXL_FAILURE( + "Gaborish b weights lead to near 0 unnormalized kernel"); + } + } + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 2, &epf_iters)); + if (visitor->Conditional(epf_iters > 0)) { + if (visitor->Conditional(!nonserialized_is_modular)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_sharp_custom)); + if (visitor->Conditional(epf_sharp_custom)) { + for (size_t i = 0; i < kEpfSharpEntries; ++i) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16( + float(i) / float(kEpfSharpEntries - 1), &epf_sharp_lut[i])); + } + } + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_weight_custom)); + if (visitor->Conditional(epf_weight_custom)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(40.0f, &epf_channel_scale[0])); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(5.0f, &epf_channel_scale[1])); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(3.5f, &epf_channel_scale[2])); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.45f, &epf_pass1_zeroflush)); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.6f, &epf_pass2_zeroflush)); + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_sigma_custom)); + if (visitor->Conditional(epf_sigma_custom)) { + if (visitor->Conditional(!nonserialized_is_modular)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.46f, &epf_quant_mul)); + } + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.9f, &epf_pass0_sigma_scale)); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(6.5f, &epf_pass2_sigma_scale)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(0.6666666666666666f, &epf_border_sad_mul)); + } + if (visitor->Conditional(nonserialized_is_modular)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.0f, &epf_sigma_for_modular)); + if (epf_sigma_for_modular < 1e-8) { + return JXL_FAILURE("EPF: sigma for modular is too small"); + } + } + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions)); + // Extensions: in chronological order of being added to the format. + return visitor->EndExtensions(); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/loop_filter.h b/third_party/jpeg-xl/lib/jxl/loop_filter.h new file mode 100644 index 0000000000..e4b418ba2b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/loop_filter.h @@ -0,0 +1,76 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_LOOP_FILTER_H_ +#define LIB_JXL_LOOP_FILTER_H_ + +// Parameters for loop filter(s), stored in each frame. + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +struct LoopFilter : public Fields { + LoopFilter(); + JXL_FIELDS_NAME(LoopFilter) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + size_t Padding() const { + static const size_t padding_per_epf_iter[4] = {0, 2, 3, 6}; + return padding_per_epf_iter[epf_iters] + (gab ? 1 : 0); + } + + mutable bool all_default; + + // --- Gaborish convolution + bool gab; + + bool gab_custom; + float gab_x_weight1; + float gab_x_weight2; + float gab_y_weight1; + float gab_y_weight2; + float gab_b_weight1; + float gab_b_weight2; + + // --- Edge-preserving filter + + // Number of EPF stages to apply. 0 means EPF disabled. 1 applies only the + // first stage, 2 applies both stages and 3 applies the first stage twice and + // the second stage once. + uint32_t epf_iters; + + bool epf_sharp_custom; + enum { kEpfSharpEntries = 8 }; + float epf_sharp_lut[kEpfSharpEntries]; + + bool epf_weight_custom; // Custom weight params + float epf_channel_scale[3]; // Relative weight of each channel + float epf_pass1_zeroflush; // Minimum weight for first pass + float epf_pass2_zeroflush; // Minimum weight for second pass + + bool epf_sigma_custom; // Custom sigma parameters + float epf_quant_mul; // Sigma is ~ this * quant + float epf_pass0_sigma_scale; // Multiplier for sigma in pass 0 + float epf_pass2_sigma_scale; // Multiplier for sigma in the second pass + float epf_border_sad_mul; // (inverse) multiplier for sigma on borders + + float epf_sigma_for_modular; + + uint64_t extensions; + + bool nonserialized_is_modular = false; +}; + +} // namespace jxl + +#endif // LIB_JXL_LOOP_FILTER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/luminance.cc b/third_party/jpeg-xl/lib/jxl/luminance.cc new file mode 100644 index 0000000000..10151f4267 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/luminance.cc @@ -0,0 +1,26 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/luminance.h" + +#include "lib/jxl/image_metadata.h" + +namespace jxl { + +void SetIntensityTarget(ImageMetadata* m) { + if (m->color_encoding.tf.IsPQ()) { + // Peak luminance of PQ as defined by SMPTE ST 2084:2014. + m->SetIntensityTarget(10000); + } else if (m->color_encoding.tf.IsHLG()) { + // Nominal display peak luminance used as a reference by + // Rec. ITU-R BT.2100-2. + m->SetIntensityTarget(1000); + } else { + // SDR + m->SetIntensityTarget(kDefaultIntensityTarget); + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/luminance.h b/third_party/jpeg-xl/lib/jxl/luminance.h new file mode 100644 index 0000000000..3181576823 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/luminance.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_LUMINANCE_H_ +#define LIB_JXL_LUMINANCE_H_ + +namespace jxl { + +// Chooses a default intensity target based on the transfer function of the +// image, if known. For SDR images or images not known to be HDR, returns +// kDefaultIntensityTarget, for images known to have PQ or HLG transfer function +// returns a higher value. + +struct ImageMetadata; +// TODO(eustas): rename +void SetIntensityTarget(ImageMetadata* m); + +} // namespace jxl + +#endif // LIB_JXL_LUMINANCE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/matrix_ops.h b/third_party/jpeg-xl/lib/jxl/matrix_ops.h new file mode 100644 index 0000000000..1a969bd4f0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/matrix_ops.h @@ -0,0 +1,84 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MATRIX_OPS_H_ +#define LIB_JXL_MATRIX_OPS_H_ + +// 3x3 matrix operations. + +#include // abs +#include + +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Computes C = A * B, where A, B, C are 3x3 matrices. +template +void Mul3x3Matrix(const T* a, const T* b, T* c) { + alignas(16) T temp[3]; // For transposed column + for (size_t x = 0; x < 3; x++) { + for (size_t z = 0; z < 3; z++) { + temp[z] = b[z * 3 + x]; + } + for (size_t y = 0; y < 3; y++) { + double e = 0; + for (size_t z = 0; z < 3; z++) { + e += a[y * 3 + z] * temp[z]; + } + c[y * 3 + x] = e; + } + } +} + +// Computes C = A * B, where A is 3x3 matrix and B is vector. +template +void Mul3x3Vector(const T* a, const T* b, T* c) { + for (size_t y = 0; y < 3; y++) { + double e = 0; + for (size_t x = 0; x < 3; x++) { + e += a[y * 3 + x] * b[x]; + } + c[y] = e; + } +} + +// Inverts a 3x3 matrix in place. +template +Status Inv3x3Matrix(T* matrix) { + // Intermediate computation is done in double precision. + double temp[9]; + temp[0] = static_cast(matrix[4]) * matrix[8] - + static_cast(matrix[5]) * matrix[7]; + temp[1] = static_cast(matrix[2]) * matrix[7] - + static_cast(matrix[1]) * matrix[8]; + temp[2] = static_cast(matrix[1]) * matrix[5] - + static_cast(matrix[2]) * matrix[4]; + temp[3] = static_cast(matrix[5]) * matrix[6] - + static_cast(matrix[3]) * matrix[8]; + temp[4] = static_cast(matrix[0]) * matrix[8] - + static_cast(matrix[2]) * matrix[6]; + temp[5] = static_cast(matrix[2]) * matrix[3] - + static_cast(matrix[0]) * matrix[5]; + temp[6] = static_cast(matrix[3]) * matrix[7] - + static_cast(matrix[4]) * matrix[6]; + temp[7] = static_cast(matrix[1]) * matrix[6] - + static_cast(matrix[0]) * matrix[7]; + temp[8] = static_cast(matrix[0]) * matrix[4] - + static_cast(matrix[1]) * matrix[3]; + double det = matrix[0] * temp[0] + matrix[1] * temp[3] + matrix[2] * temp[6]; + if (std::abs(det) < 1e-10) { + return JXL_FAILURE("Matrix determinant is too close to 0"); + } + double idet = 1.0 / det; + for (size_t i = 0; i < 9; i++) { + matrix[i] = temp[i] * idet; + } + return true; +} + +} // namespace jxl + +#endif // LIB_JXL_MATRIX_OPS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc new file mode 100644 index 0000000000..87727e75cd --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc @@ -0,0 +1,18 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/memory_manager_internal.h" + +#include + +namespace jxl { + +void* MemoryManagerDefaultAlloc(void* opaque, size_t size) { + return malloc(size); +} + +void MemoryManagerDefaultFree(void* opaque, void* address) { free(address); } + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h new file mode 100644 index 0000000000..f8a5cd8d59 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h @@ -0,0 +1,101 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MEMORY_MANAGER_INTERNAL_H_ +#define LIB_JXL_MEMORY_MANAGER_INTERNAL_H_ + +// Memory allocator with support for alignment + misalignment. + +#include +#include +#include +#include +#include // memcpy + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Default alloc and free functions. +void* MemoryManagerDefaultAlloc(void* opaque, size_t size); +void MemoryManagerDefaultFree(void* opaque, void* address); + +// Initializes the memory manager instance with the passed one. The +// MemoryManager passed in |memory_manager| may be NULL or contain NULL +// functions which will be initialized with the default ones. If either alloc +// or free are NULL, then both must be NULL, otherwise this function returns an +// error. +static JXL_INLINE Status MemoryManagerInit( + JxlMemoryManager* self, const JxlMemoryManager* memory_manager) { + if (memory_manager) { + *self = *memory_manager; + } else { + memset(self, 0, sizeof(*self)); + } + if (!self->alloc != !self->free) { + return false; + } + if (!self->alloc) self->alloc = jxl::MemoryManagerDefaultAlloc; + if (!self->free) self->free = jxl::MemoryManagerDefaultFree; + + return true; +} + +static JXL_INLINE void* MemoryManagerAlloc( + const JxlMemoryManager* memory_manager, size_t size) { + return memory_manager->alloc(memory_manager->opaque, size); +} + +static JXL_INLINE void MemoryManagerFree(const JxlMemoryManager* memory_manager, + void* address) { + return memory_manager->free(memory_manager->opaque, address); +} + +// Helper class to be used as a deleter in a unique_ptr call. +class MemoryManagerDeleteHelper { + public: + explicit MemoryManagerDeleteHelper(const JxlMemoryManager* memory_manager) + : memory_manager_(memory_manager) {} + + // Delete and free the passed pointer using the memory_manager. + template + void operator()(T* address) const { + if (!address) { + return; + } + address->~T(); + return memory_manager_->free(memory_manager_->opaque, address); + } + + private: + const JxlMemoryManager* memory_manager_; +}; + +template +using MemoryManagerUniquePtr = std::unique_ptr; + +// Creates a new object T allocating it with the memory allocator into a +// unique_ptr. +template +JXL_INLINE MemoryManagerUniquePtr MemoryManagerMakeUnique( + const JxlMemoryManager* memory_manager, Args&&... args) { + T* mem = + static_cast(memory_manager->alloc(memory_manager->opaque, sizeof(T))); + if (!mem) { + // Allocation error case. + return MemoryManagerUniquePtr(nullptr, + MemoryManagerDeleteHelper(memory_manager)); + } + return MemoryManagerUniquePtr(new (mem) T(std::forward(args)...), + MemoryManagerDeleteHelper(memory_manager)); +} + +} // namespace jxl + +#endif // LIB_JXL_MEMORY_MANAGER_INTERNAL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h new file mode 100644 index 0000000000..914cd6a4e4 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h @@ -0,0 +1,626 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_ +#define LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_ + +#include +#include + +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +namespace weighted { +constexpr static size_t kNumPredictors = 4; +constexpr static int64_t kPredExtraBits = 3; +constexpr static int64_t kPredictionRound = ((1 << kPredExtraBits) >> 1) - 1; +constexpr static size_t kNumProperties = 1; + +struct Header : public Fields { + JXL_FIELDS_NAME(WeightedPredictorHeader) + // TODO(janwas): move to cc file, avoid including fields.h. + Header() { Bundle::Init(this); } + + Status VisitFields(Visitor *JXL_RESTRICT visitor) override { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + auto visit_p = [visitor](pixel_type val, pixel_type *p) { + uint32_t up = *p; + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, val, &up)); + *p = up; + return Status(true); + }; + JXL_QUIET_RETURN_IF_ERROR(visit_p(16, &p1C)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(10, &p2C)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Ca)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Cb)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Cc)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(0, &p3Cd)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(0, &p3Ce)); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xd, &w[0])); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[1])); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[2])); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[3])); + return true; + } + + bool all_default; + pixel_type p1C = 0, p2C = 0, p3Ca = 0, p3Cb = 0, p3Cc = 0, p3Cd = 0, p3Ce = 0; + uint32_t w[kNumPredictors] = {}; +}; + +struct State { + pixel_type_w prediction[kNumPredictors] = {}; + pixel_type_w pred = 0; // *before* removing the added bits. + std::vector pred_errors[kNumPredictors]; + std::vector error; + const Header header; + + // Allows to approximate division by a number from 1 to 64. + uint32_t divlookup[64]; + + constexpr static pixel_type_w AddBits(pixel_type_w x) { + return uint64_t(x) << kPredExtraBits; + } + + State(Header header, size_t xsize, size_t ysize) : header(header) { + // Extra margin to avoid out-of-bounds writes. + // All have space for two rows of data. + for (size_t i = 0; i < 4; i++) { + pred_errors[i].resize((xsize + 2) * 2); + } + error.resize((xsize + 2) * 2); + // Initialize division lookup table. + for (int i = 0; i < 64; i++) { + divlookup[i] = (1 << 24) / (i + 1); + } + } + + // Approximates 4+(maxweight<<24)/(x+1), avoiding division + JXL_INLINE uint32_t ErrorWeight(uint64_t x, uint32_t maxweight) const { + int shift = static_cast(FloorLog2Nonzero(x + 1)) - 5; + if (shift < 0) shift = 0; + return 4 + ((maxweight * divlookup[x >> shift]) >> shift); + } + + // Approximates the weighted average of the input values with the given + // weights, avoiding division. Weights must sum to at least 16. + JXL_INLINE pixel_type_w + WeightedAverage(const pixel_type_w *JXL_RESTRICT p, + std::array w) const { + uint32_t weight_sum = 0; + for (size_t i = 0; i < kNumPredictors; i++) { + weight_sum += w[i]; + } + JXL_DASSERT(weight_sum > 15); + uint32_t log_weight = FloorLog2Nonzero(weight_sum); // at least 4. + weight_sum = 0; + for (size_t i = 0; i < kNumPredictors; i++) { + w[i] >>= log_weight - 4; + weight_sum += w[i]; + } + // for rounding. + pixel_type_w sum = (weight_sum >> 1) - 1; + for (size_t i = 0; i < kNumPredictors; i++) { + sum += p[i] * w[i]; + } + return (sum * divlookup[weight_sum - 1]) >> 24; + } + + template + JXL_INLINE pixel_type_w Predict(size_t x, size_t y, size_t xsize, + pixel_type_w N, pixel_type_w W, + pixel_type_w NE, pixel_type_w NW, + pixel_type_w NN, Properties *properties, + size_t offset) { + size_t cur_row = y & 1 ? 0 : (xsize + 2); + size_t prev_row = y & 1 ? (xsize + 2) : 0; + size_t pos_N = prev_row + x; + size_t pos_NE = x < xsize - 1 ? pos_N + 1 : pos_N; + size_t pos_NW = x > 0 ? pos_N - 1 : pos_N; + std::array weights; + for (size_t i = 0; i < kNumPredictors; i++) { + // pred_errors[pos_N] also contains the error of pixel W. + // pred_errors[pos_NW] also contains the error of pixel WW. + weights[i] = pred_errors[i][pos_N] + pred_errors[i][pos_NE] + + pred_errors[i][pos_NW]; + weights[i] = ErrorWeight(weights[i], header.w[i]); + } + + N = AddBits(N); + W = AddBits(W); + NE = AddBits(NE); + NW = AddBits(NW); + NN = AddBits(NN); + + pixel_type_w teW = x == 0 ? 0 : error[cur_row + x - 1]; + pixel_type_w teN = error[pos_N]; + pixel_type_w teNW = error[pos_NW]; + pixel_type_w sumWN = teN + teW; + pixel_type_w teNE = error[pos_NE]; + + if (compute_properties) { + pixel_type_w p = teW; + if (std::abs(teN) > std::abs(p)) p = teN; + if (std::abs(teNW) > std::abs(p)) p = teNW; + if (std::abs(teNE) > std::abs(p)) p = teNE; + (*properties)[offset++] = p; + } + + prediction[0] = W + NE - N; + prediction[1] = N - (((sumWN + teNE) * header.p1C) >> 5); + prediction[2] = W - (((sumWN + teNW) * header.p2C) >> 5); + prediction[3] = + N - ((teNW * header.p3Ca + teN * header.p3Cb + teNE * header.p3Cc + + (NN - N) * header.p3Cd + (NW - W) * header.p3Ce) >> + 5); + + pred = WeightedAverage(prediction, weights); + + // If all three have the same sign, skip clamping. + if (((teN ^ teW) | (teN ^ teNW)) > 0) { + return (pred + kPredictionRound) >> kPredExtraBits; + } + + // Otherwise, clamp to min/max of neighbouring pixels (just W, NE, N). + pixel_type_w mx = std::max(W, std::max(NE, N)); + pixel_type_w mn = std::min(W, std::min(NE, N)); + pred = std::max(mn, std::min(mx, pred)); + return (pred + kPredictionRound) >> kPredExtraBits; + } + + JXL_INLINE void UpdateErrors(pixel_type_w val, size_t x, size_t y, + size_t xsize) { + size_t cur_row = y & 1 ? 0 : (xsize + 2); + size_t prev_row = y & 1 ? (xsize + 2) : 0; + val = AddBits(val); + error[cur_row + x] = pred - val; + for (size_t i = 0; i < kNumPredictors; i++) { + pixel_type_w err = + (std::abs(prediction[i] - val) + kPredictionRound) >> kPredExtraBits; + // For predicting in the next row. + pred_errors[i][cur_row + x] = err; + // Add the error on this pixel to the error on the NE pixel. This has the + // effect of adding the error on this pixel to the E and EE pixels. + pred_errors[i][prev_row + x + 1] += err; + } + } +}; + +// Encoder helper function to set the parameters to some presets. +inline void PredictorMode(int i, Header *header) { + switch (i) { + case 0: + // ~ lossless16 predictor + header->w[0] = 0xd; + header->w[1] = 0xc; + header->w[2] = 0xc; + header->w[3] = 0xc; + header->p1C = 16; + header->p2C = 10; + header->p3Ca = 7; + header->p3Cb = 7; + header->p3Cc = 7; + header->p3Cd = 0; + header->p3Ce = 0; + break; + case 1: + // ~ default lossless8 predictor + header->w[0] = 0xd; + header->w[1] = 0xc; + header->w[2] = 0xc; + header->w[3] = 0xb; + header->p1C = 8; + header->p2C = 8; + header->p3Ca = 4; + header->p3Cb = 0; + header->p3Cc = 3; + header->p3Cd = 23; + header->p3Ce = 2; + break; + case 2: + // ~ west lossless8 predictor + header->w[0] = 0xd; + header->w[1] = 0xc; + header->w[2] = 0xd; + header->w[3] = 0xc; + header->p1C = 10; + header->p2C = 9; + header->p3Ca = 7; + header->p3Cb = 0; + header->p3Cc = 0; + header->p3Cd = 16; + header->p3Ce = 9; + break; + case 3: + // ~ north lossless8 predictor + header->w[0] = 0xd; + header->w[1] = 0xd; + header->w[2] = 0xc; + header->w[3] = 0xc; + header->p1C = 16; + header->p2C = 8; + header->p3Ca = 0; + header->p3Cb = 16; + header->p3Cc = 0; + header->p3Cd = 23; + header->p3Ce = 0; + break; + case 4: + default: + // something else, because why not + header->w[0] = 0xd; + header->w[1] = 0xc; + header->w[2] = 0xc; + header->w[3] = 0xc; + header->p1C = 10; + header->p2C = 10; + header->p3Ca = 5; + header->p3Cb = 5; + header->p3Cc = 5; + header->p3Cd = 12; + header->p3Ce = 4; + break; + } +} +} // namespace weighted + +// Stores a node and its two children at the same time. This significantly +// reduces the number of branches needed during decoding. +struct FlatDecisionNode { + // Property + splitval of the top node. + int32_t property0; // -1 if leaf. + union { + PropertyVal splitval0; + Predictor predictor; + }; + uint32_t childID; // childID is ctx id if leaf. + // Property+splitval of the two child nodes. + union { + PropertyVal splitvals[2]; + int32_t multiplier; + }; + union { + int32_t properties[2]; + int64_t predictor_offset; + }; +}; +using FlatTree = std::vector; + +class MATreeLookup { + public: + explicit MATreeLookup(const FlatTree &tree) : nodes_(tree) {} + struct LookupResult { + uint32_t context; + Predictor predictor; + int64_t offset; + int32_t multiplier; + }; + JXL_INLINE LookupResult Lookup(const Properties &properties) const { + uint32_t pos = 0; + while (true) { + const FlatDecisionNode &node = nodes_[pos]; + if (node.property0 < 0) { + return {node.childID, node.predictor, node.predictor_offset, + node.multiplier}; + } + bool p0 = properties[node.property0] <= node.splitval0; + uint32_t off0 = properties[node.properties[0]] <= node.splitvals[0]; + uint32_t off1 = + 2 | (properties[node.properties[1]] <= node.splitvals[1] ? 1 : 0); + pos = node.childID + (p0 ? off1 : off0); + } + } + + private: + const FlatTree &nodes_; +}; + +static constexpr size_t kExtraPropsPerChannel = 4; +static constexpr size_t kNumNonrefProperties = + kNumStaticProperties + 13 + weighted::kNumProperties; + +constexpr size_t kWPProp = kNumNonrefProperties - weighted::kNumProperties; +constexpr size_t kGradientProp = 9; + +// Clamps gradient to the min/max of n, w (and l, implicitly). +static JXL_INLINE int32_t ClampedGradient(const int32_t n, const int32_t w, + const int32_t l) { + const int32_t m = std::min(n, w); + const int32_t M = std::max(n, w); + // The end result of this operation doesn't overflow or underflow if the + // result is between m and M, but the intermediate value may overflow, so we + // do the intermediate operations in uint32_t and check later if we had an + // overflow or underflow condition comparing m, M and l directly. + // grad = M + m - l = n + w - l + const int32_t grad = + static_cast(static_cast(n) + static_cast(w) - + static_cast(l)); + // We use two sets of ternary operators to force the evaluation of them in + // any case, allowing the compiler to avoid branches and use cmovl/cmovg in + // x86. + const int32_t grad_clamp_M = (l < m) ? M : grad; + return (l > M) ? m : grad_clamp_M; +} + +inline pixel_type_w Select(pixel_type_w a, pixel_type_w b, pixel_type_w c) { + pixel_type_w p = a + b - c; + pixel_type_w pa = std::abs(p - a); + pixel_type_w pb = std::abs(p - b); + return pa < pb ? a : b; +} + +inline void PrecomputeReferences(const Channel &ch, size_t y, + const Image &image, uint32_t i, + Channel *references) { + ZeroFillImage(&references->plane); + uint32_t offset = 0; + size_t num_extra_props = references->w; + intptr_t onerow = references->plane.PixelsPerRow(); + for (int32_t j = static_cast(i) - 1; + j >= 0 && offset < num_extra_props; j--) { + if (image.channel[j].w != image.channel[i].w || + image.channel[j].h != image.channel[i].h) { + continue; + } + if (image.channel[j].hshift != image.channel[i].hshift) continue; + if (image.channel[j].vshift != image.channel[i].vshift) continue; + pixel_type *JXL_RESTRICT rp = references->Row(0) + offset; + const pixel_type *JXL_RESTRICT rpp = image.channel[j].Row(y); + const pixel_type *JXL_RESTRICT rpprev = image.channel[j].Row(y ? y - 1 : 0); + for (size_t x = 0; x < ch.w; x++, rp += onerow) { + pixel_type_w v = rpp[x]; + rp[0] = std::abs(v); + rp[1] = v; + pixel_type_w vleft = (x ? rpp[x - 1] : 0); + pixel_type_w vtop = (y ? rpprev[x] : vleft); + pixel_type_w vtopleft = (x && y ? rpprev[x - 1] : vleft); + pixel_type_w vpredicted = ClampedGradient(vleft, vtop, vtopleft); + rp[2] = std::abs(v - vpredicted); + rp[3] = v - vpredicted; + } + + offset += kExtraPropsPerChannel; + } +} + +struct PredictionResult { + int context = 0; + pixel_type_w guess = 0; + Predictor predictor; + int32_t multiplier; +}; + +inline void InitPropsRow( + Properties *p, + const std::array &static_props, + const int y) { + for (size_t i = 0; i < kNumStaticProperties; i++) { + (*p)[i] = static_props[i]; + } + (*p)[2] = y; + (*p)[9] = 0; // local gradient. +} + +namespace detail { +enum PredictorMode { + kUseTree = 1, + kUseWP = 2, + kForceComputeProperties = 4, + kAllPredictions = 8, + kNoEdgeCases = 16 +}; + +JXL_INLINE pixel_type_w PredictOne(Predictor p, pixel_type_w left, + pixel_type_w top, pixel_type_w toptop, + pixel_type_w topleft, pixel_type_w topright, + pixel_type_w leftleft, + pixel_type_w toprightright, + pixel_type_w wp_pred) { + switch (p) { + case Predictor::Zero: + return pixel_type_w{0}; + case Predictor::Left: + return left; + case Predictor::Top: + return top; + case Predictor::Select: + return Select(left, top, topleft); + case Predictor::Weighted: + return wp_pred; + case Predictor::Gradient: + return pixel_type_w{ClampedGradient(left, top, topleft)}; + case Predictor::TopLeft: + return topleft; + case Predictor::TopRight: + return topright; + case Predictor::LeftLeft: + return leftleft; + case Predictor::Average0: + return (left + top) / 2; + case Predictor::Average1: + return (left + topleft) / 2; + case Predictor::Average2: + return (topleft + top) / 2; + case Predictor::Average3: + return (top + topright) / 2; + case Predictor::Average4: + return (6 * top - 2 * toptop + 7 * left + 1 * leftleft + + 1 * toprightright + 3 * topright + 8) / + 16; + default: + return pixel_type_w{0}; + } +} + +template +JXL_INLINE PredictionResult Predict( + Properties *p, size_t w, const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const size_t x, const size_t y, Predictor predictor, + const MATreeLookup *lookup, const Channel *references, + weighted::State *wp_state, pixel_type_w *predictions) { + // We start in position 3 because of 2 static properties + y. + size_t offset = 3; + constexpr bool compute_properties = + mode & kUseTree || mode & kForceComputeProperties; + constexpr bool nec = mode & kNoEdgeCases; + pixel_type_w left = (nec || x ? pp[-1] : (y ? pp[-onerow] : 0)); + pixel_type_w top = (nec || y ? pp[-onerow] : left); + pixel_type_w topleft = (nec || (x && y) ? pp[-1 - onerow] : left); + pixel_type_w topright = (nec || (x + 1 < w && y) ? pp[1 - onerow] : top); + pixel_type_w leftleft = (nec || x > 1 ? pp[-2] : left); + pixel_type_w toptop = (nec || y > 1 ? pp[-onerow - onerow] : top); + pixel_type_w toprightright = + (nec || (x + 2 < w && y) ? pp[2 - onerow] : topright); + + if (compute_properties) { + // location + (*p)[offset++] = x; + // neighbors + (*p)[offset++] = std::abs(top); + (*p)[offset++] = std::abs(left); + (*p)[offset++] = top; + (*p)[offset++] = left; + + // local gradient + (*p)[offset] = left - (*p)[offset + 1]; + offset++; + // local gradient + (*p)[offset++] = left + top - topleft; + + // FFV1 context properties + (*p)[offset++] = left - topleft; + (*p)[offset++] = topleft - top; + (*p)[offset++] = top - topright; + (*p)[offset++] = top - toptop; + (*p)[offset++] = left - leftleft; + } + + pixel_type_w wp_pred = 0; + if (mode & kUseWP) { + wp_pred = wp_state->Predict( + x, y, w, top, left, topright, topleft, toptop, p, offset); + } + if (!nec && compute_properties) { + offset += weighted::kNumProperties; + // Extra properties. + const pixel_type *JXL_RESTRICT rp = references->Row(x); + for (size_t i = 0; i < references->w; i++) { + (*p)[offset++] = rp[i]; + } + } + PredictionResult result; + if (mode & kUseTree) { + MATreeLookup::LookupResult lr = lookup->Lookup(*p); + result.context = lr.context; + result.guess = lr.offset; + result.multiplier = lr.multiplier; + predictor = lr.predictor; + } + if (mode & kAllPredictions) { + for (size_t i = 0; i < kNumModularPredictors; i++) { + predictions[i] = PredictOne((Predictor)i, left, top, toptop, topleft, + topright, leftleft, toprightright, wp_pred); + } + } + result.guess += PredictOne(predictor, left, top, toptop, topleft, topright, + leftleft, toprightright, wp_pred); + result.predictor = predictor; + + return result; +} +} // namespace detail + +inline PredictionResult PredictNoTreeNoWP(size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, + const int y, Predictor predictor) { + return detail::Predict( + /*p=*/nullptr, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr, + /*references=*/nullptr, /*wp_state=*/nullptr, /*predictions=*/nullptr); +} + +inline PredictionResult PredictNoTreeWP(size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, + const int y, Predictor predictor, + weighted::State *wp_state) { + return detail::Predict( + /*p=*/nullptr, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr, + /*references=*/nullptr, wp_state, /*predictions=*/nullptr); +} + +inline PredictionResult PredictTreeNoWP(Properties *p, size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, + const int y, + const MATreeLookup &tree_lookup, + const Channel &references) { + return detail::Predict( + p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references, + /*wp_state=*/nullptr, /*predictions=*/nullptr); +} +// Only use for y > 1, x > 1, x < w-2, and empty references +JXL_INLINE PredictionResult +PredictTreeNoWPNEC(Properties *p, size_t w, const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, const int y, + const MATreeLookup &tree_lookup, const Channel &references) { + return detail::Predict( + p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references, + /*wp_state=*/nullptr, /*predictions=*/nullptr); +} + +inline PredictionResult PredictTreeWP(Properties *p, size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, + const int y, + const MATreeLookup &tree_lookup, + const Channel &references, + weighted::State *wp_state) { + return detail::Predict( + p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references, + wp_state, /*predictions=*/nullptr); +} + +inline PredictionResult PredictLearn(Properties *p, size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, + const int y, Predictor predictor, + const Channel &references, + weighted::State *wp_state) { + return detail::Predict( + p, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr, &references, + wp_state, /*predictions=*/nullptr); +} + +inline void PredictLearnAll(Properties *p, size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, const int y, + const Channel &references, + weighted::State *wp_state, + pixel_type_w *predictions) { + detail::Predict( + p, w, pp, onerow, x, y, Predictor::Zero, + /*lookup=*/nullptr, &references, wp_state, predictions); +} + +inline void PredictAllNoWP(size_t w, const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, const int y, + pixel_type_w *predictions) { + detail::Predict( + /*p=*/nullptr, w, pp, onerow, x, y, Predictor::Zero, + /*lookup=*/nullptr, + /*references=*/nullptr, /*wp_state=*/nullptr, predictions); +} +} // namespace jxl + +#endif // LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc new file mode 100644 index 0000000000..66562f7dfd --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc @@ -0,0 +1,107 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/encoding/dec_ma.h" + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/modular/encoding/ma_common.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +namespace { + +Status ValidateTree( + const Tree &tree, + const std::vector> &prop_bounds, + size_t root) { + if (tree[root].property == -1) return true; + size_t p = tree[root].property; + int val = tree[root].splitval; + if (prop_bounds[p].first > val) return JXL_FAILURE("Invalid tree"); + // Splitting at max value makes no sense: left range will be exactly same + // as parent, right range will be invalid (min > max). + if (prop_bounds[p].second <= val) return JXL_FAILURE("Invalid tree"); + auto new_bounds = prop_bounds; + new_bounds[p].first = val + 1; + JXL_RETURN_IF_ERROR(ValidateTree(tree, new_bounds, tree[root].lchild)); + new_bounds[p] = prop_bounds[p]; + new_bounds[p].second = val; + return ValidateTree(tree, new_bounds, tree[root].rchild); +} + +Status DecodeTree(BitReader *br, ANSSymbolReader *reader, + const std::vector &context_map, Tree *tree, + size_t tree_size_limit) { + size_t leaf_id = 0; + size_t to_decode = 1; + tree->clear(); + while (to_decode > 0) { + JXL_RETURN_IF_ERROR(br->AllReadsWithinBounds()); + if (tree->size() > tree_size_limit) { + return JXL_FAILURE("Tree is too large: %" PRIuS " nodes vs %" PRIuS + " max nodes", + tree->size(), tree_size_limit); + } + to_decode--; + uint32_t prop1 = reader->ReadHybridUint(kPropertyContext, br, context_map); + if (prop1 > 256) return JXL_FAILURE("Invalid tree property value"); + int property = prop1 - 1; + if (property == -1) { + size_t predictor = + reader->ReadHybridUint(kPredictorContext, br, context_map); + if (predictor >= kNumModularPredictors) { + return JXL_FAILURE("Invalid predictor"); + } + int64_t predictor_offset = + UnpackSigned(reader->ReadHybridUint(kOffsetContext, br, context_map)); + uint32_t mul_log = + reader->ReadHybridUint(kMultiplierLogContext, br, context_map); + if (mul_log >= 31) { + return JXL_FAILURE("Invalid multiplier logarithm"); + } + uint32_t mul_bits = + reader->ReadHybridUint(kMultiplierBitsContext, br, context_map); + if (mul_bits + 1 >= 1u << (31u - mul_log)) { + return JXL_FAILURE("Invalid multiplier"); + } + uint32_t multiplier = (mul_bits + 1U) << mul_log; + tree->emplace_back(-1, 0, leaf_id++, 0, static_cast(predictor), + predictor_offset, multiplier); + continue; + } + int splitval = + UnpackSigned(reader->ReadHybridUint(kSplitValContext, br, context_map)); + tree->emplace_back(property, splitval, tree->size() + to_decode + 1, + tree->size() + to_decode + 2, Predictor::Zero, 0, 1); + to_decode += 2; + } + std::vector> prop_bounds; + prop_bounds.resize(256, {std::numeric_limits::min(), + std::numeric_limits::max()}); + return ValidateTree(*tree, prop_bounds, 0); +} +} // namespace + +Status DecodeTree(BitReader *br, Tree *tree, size_t tree_size_limit) { + std::vector tree_context_map; + ANSCode tree_code; + JXL_RETURN_IF_ERROR( + DecodeHistograms(br, kNumTreeContexts, &tree_code, &tree_context_map)); + // TODO(eustas): investigate more infinite tree cases. + if (tree_code.degenerate_symbols[tree_context_map[kPropertyContext]] > 0) { + return JXL_FAILURE("Infinite tree"); + } + ANSSymbolReader reader(&tree_code, br); + JXL_RETURN_IF_ERROR(DecodeTree(br, &reader, tree_context_map, tree, + std::min(tree_size_limit, kMaxTreeSize))); + if (!reader.CheckANSFinalState()) { + return JXL_FAILURE("ANS decode final state failed"); + } + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h new file mode 100644 index 0000000000..a910c4deb1 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h @@ -0,0 +1,66 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_ENCODING_DEC_MA_H_ +#define LIB_JXL_MODULAR_ENCODING_DEC_MA_H_ + +#include +#include + +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +// inner nodes +struct PropertyDecisionNode { + PropertyVal splitval; + int16_t property; // -1: leaf node, lchild points to leaf node + uint32_t lchild; + uint32_t rchild; + Predictor predictor; + int64_t predictor_offset; + uint32_t multiplier; + + PropertyDecisionNode(int p, int split_val, int lchild, int rchild, + Predictor predictor, int64_t predictor_offset, + uint32_t multiplier) + : splitval(split_val), + property(p), + lchild(lchild), + rchild(rchild), + predictor(predictor), + predictor_offset(predictor_offset), + multiplier(multiplier) {} + PropertyDecisionNode() + : splitval(0), + property(-1), + lchild(0), + rchild(0), + predictor(Predictor::Zero), + predictor_offset(0), + multiplier(1) {} + static PropertyDecisionNode Leaf(Predictor predictor, int64_t offset = 0, + uint32_t multiplier = 1) { + return PropertyDecisionNode(-1, 0, 0, 0, predictor, offset, multiplier); + } + static PropertyDecisionNode Split(int p, int split_val, int lchild, + int rchild = -1) { + if (rchild == -1) rchild = lchild + 1; + return PropertyDecisionNode(p, split_val, lchild, rchild, Predictor::Zero, + 0, 1); + } +}; + +using Tree = std::vector; + +Status DecodeTree(BitReader *br, Tree *tree, size_t tree_size_limit); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_ENCODING_DEC_MA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc new file mode 100644 index 0000000000..f2a1705e4b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc @@ -0,0 +1,124 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/encoding/enc_debug_tree.h" + +#include +#include + +#include "lib/jxl/base/os_macros.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/encoding/dec_ma.h" +#include "lib/jxl/modular/options.h" + +#if JXL_OS_IOS +#define JXL_ENABLE_DOT 0 +#else +#define JXL_ENABLE_DOT 1 // iOS lacks C89 system() +#endif + +namespace jxl { + +const char *PredictorName(Predictor p) { + switch (p) { + case Predictor::Zero: + return "Zero"; + case Predictor::Left: + return "Left"; + case Predictor::Top: + return "Top"; + case Predictor::Average0: + return "Avg0"; + case Predictor::Average1: + return "Avg1"; + case Predictor::Average2: + return "Avg2"; + case Predictor::Average3: + return "Avg3"; + case Predictor::Average4: + return "Avg4"; + case Predictor::Select: + return "Sel"; + case Predictor::Gradient: + return "Grd"; + case Predictor::Weighted: + return "Wgh"; + case Predictor::TopLeft: + return "TopL"; + case Predictor::TopRight: + return "TopR"; + case Predictor::LeftLeft: + return "LL"; + default: + return "INVALID"; + }; +} + +std::string PropertyName(size_t i) { + static_assert(kNumNonrefProperties == 16, "Update this function"); + switch (i) { + case 0: + return "c"; + case 1: + return "g"; + case 2: + return "y"; + case 3: + return "x"; + case 4: + return "|N|"; + case 5: + return "|W|"; + case 6: + return "N"; + case 7: + return "W"; + case 8: + return "W-WW-NW+NWW"; + case 9: + return "W+N-NW"; + case 10: + return "W-NW"; + case 11: + return "NW-N"; + case 12: + return "N-NE"; + case 13: + return "N-NN"; + case 14: + return "W-WW"; + case 15: + return "WGH"; + default: + return "ch[" + ToString(15 - (int)i) + "]"; + } +} + +void PrintTree(const Tree &tree, const std::string &path) { + FILE *f = fopen((path + ".dot").c_str(), "w"); + fprintf(f, "graph{\n"); + for (size_t cur = 0; cur < tree.size(); cur++) { + if (tree[cur].property < 0) { + fprintf(f, "n%05" PRIuS " [label=\"%s%+" PRId64 " (x%u)\"];\n", cur, + PredictorName(tree[cur].predictor), tree[cur].predictor_offset, + tree[cur].multiplier); + } else { + fprintf(f, "n%05" PRIuS " [label=\"%s>%d\"];\n", cur, + PropertyName(tree[cur].property).c_str(), tree[cur].splitval); + fprintf(f, "n%05" PRIuS " -- n%05d;\n", cur, tree[cur].lchild); + fprintf(f, "n%05" PRIuS " -- n%05d;\n", cur, tree[cur].rchild); + } + } + fprintf(f, "}\n"); + fclose(f); +#if JXL_ENABLE_DOT + JXL_ASSERT( + system(("dot " + path + ".dot -T svg -o " + path + ".svg").c_str()) == 0); +#endif +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h new file mode 100644 index 0000000000..78deaab1b8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h @@ -0,0 +1,27 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_ENCODING_ENC_DEBUG_TREE_H_ +#define LIB_JXL_MODULAR_ENCODING_ENC_DEBUG_TREE_H_ + +#include +#include + +#include +#include + +#include "lib/jxl/modular/encoding/dec_ma.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +const char *PredictorName(Predictor p); +std::string PropertyName(size_t i); + +void PrintTree(const Tree &tree, const std::string &path); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_ENCODING_ENC_DEBUG_TREE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc new file mode 100644 index 0000000000..c8c183335e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc @@ -0,0 +1,562 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/encoding/enc_debug_tree.h" +#include "lib/jxl/modular/encoding/enc_ma.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/encoding/ma_common.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/modular/transform/transform.h" +#include "lib/jxl/toc.h" + +namespace jxl { + +namespace { +// Plot tree (if enabled) and predictor usage map. +constexpr bool kWantDebug = false; +constexpr bool kPrintTree = false; + +inline std::array PredictorColor(Predictor p) { + switch (p) { + case Predictor::Zero: + return {{0, 0, 0}}; + case Predictor::Left: + return {{255, 0, 0}}; + case Predictor::Top: + return {{0, 255, 0}}; + case Predictor::Average0: + return {{0, 0, 255}}; + case Predictor::Average4: + return {{192, 128, 128}}; + case Predictor::Select: + return {{255, 255, 0}}; + case Predictor::Gradient: + return {{255, 0, 255}}; + case Predictor::Weighted: + return {{0, 255, 255}}; + // TODO + default: + return {{255, 255, 255}}; + }; +} + +} // namespace + +void GatherTreeData(const Image &image, pixel_type chan, size_t group_id, + const weighted::Header &wp_header, + const ModularOptions &options, TreeSamples &tree_samples, + size_t *total_pixels) { + const Channel &channel = image.channel[chan]; + + JXL_DEBUG_V(7, "Learning %" PRIuS "x%" PRIuS " channel %d", channel.w, + channel.h, chan); + + std::array static_props = { + {chan, (int)group_id}}; + Properties properties(kNumNonrefProperties + + kExtraPropsPerChannel * options.max_properties); + double pixel_fraction = std::min(1.0f, options.nb_repeats); + // a fraction of 0 is used to disable learning entirely. + if (pixel_fraction > 0) { + pixel_fraction = std::max(pixel_fraction, + std::min(1.0, 1024.0 / (channel.w * channel.h))); + } + uint64_t threshold = + (std::numeric_limits::max() >> 32) * pixel_fraction; + uint64_t s[2] = {static_cast(0x94D049BB133111EBull), + static_cast(0xBF58476D1CE4E5B9ull)}; + // Xorshift128+ adapted from xorshift128+-inl.h + auto use_sample = [&]() { + auto s1 = s[0]; + const auto s0 = s[1]; + const auto bits = s1 + s0; // b, c + s[0] = s0; + s1 ^= s1 << 23; + s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5); + s[1] = s1; + return (bits >> 32) <= threshold; + }; + + const intptr_t onerow = channel.plane.PixelsPerRow(); + Channel references(properties.size() - kNumNonrefProperties, channel.w); + weighted::State wp_state(wp_header, channel.w, channel.h); + tree_samples.PrepareForSamples(pixel_fraction * channel.h * channel.w + 64); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT p = channel.Row(y); + PrecomputeReferences(channel, y, image, chan, &references); + InitPropsRow(&properties, static_props, y); + // TODO(veluca): avoid computing WP if we don't use its property or + // predictions. + for (size_t x = 0; x < channel.w; x++) { + pixel_type_w pred[kNumModularPredictors]; + if (tree_samples.NumPredictors() != 1) { + PredictLearnAll(&properties, channel.w, p + x, onerow, x, y, references, + &wp_state, pred); + } else { + pred[static_cast(tree_samples.PredictorFromIndex(0))] = + PredictLearn(&properties, channel.w, p + x, onerow, x, y, + tree_samples.PredictorFromIndex(0), references, + &wp_state) + .guess; + } + (*total_pixels)++; + if (use_sample()) { + tree_samples.AddSample(p[x], properties, pred); + } + wp_state.UpdateErrors(p[x], x, y, channel.w); + } + } +} + +Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels, + const ModularOptions &options, + const std::vector &multiplier_info = {}, + StaticPropRange static_prop_range = {}) { + for (size_t i = 0; i < kNumStaticProperties; i++) { + if (static_prop_range[i][1] == 0) { + static_prop_range[i][1] = std::numeric_limits::max(); + } + } + if (!tree_samples.HasSamples()) { + Tree tree; + tree.emplace_back(); + tree.back().predictor = tree_samples.PredictorFromIndex(0); + tree.back().property = -1; + tree.back().predictor_offset = 0; + tree.back().multiplier = 1; + return tree; + } + float pixel_fraction = tree_samples.NumSamples() * 1.0f / total_pixels; + float required_cost = pixel_fraction * 0.9 + 0.1; + tree_samples.AllSamplesDone(); + Tree tree; + ComputeBestTree(tree_samples, + options.splitting_heuristics_node_threshold * required_cost, + multiplier_info, static_prop_range, + options.fast_decode_multiplier, &tree); + return tree; +} + +Status EncodeModularChannelMAANS(const Image &image, pixel_type chan, + const weighted::Header &wp_header, + const Tree &global_tree, Token **tokenpp, + AuxOut *aux_out, size_t group_id, + bool skip_encoder_fast_path) { + const Channel &channel = image.channel[chan]; + Token *tokenp = *tokenpp; + JXL_ASSERT(channel.w != 0 && channel.h != 0); + + Image3F predictor_img; + if (kWantDebug) predictor_img = Image3F(channel.w, channel.h); + + JXL_DEBUG_V(6, + "Encoding %" PRIuS "x%" PRIuS + " channel %d, " + "(shift=%i,%i)", + channel.w, channel.h, chan, channel.hshift, channel.vshift); + + std::array static_props = { + {chan, (int)group_id}}; + bool use_wp, is_wp_only; + bool is_gradient_only; + size_t num_props; + FlatTree tree = FilterTree(global_tree, static_props, &num_props, &use_wp, + &is_wp_only, &is_gradient_only); + Properties properties(num_props); + MATreeLookup tree_lookup(tree); + JXL_DEBUG_V(3, "Encoding using a MA tree with %" PRIuS " nodes", tree.size()); + + // Check if this tree is a WP-only tree with a small enough property value + // range. + // Initialized to avoid clang-tidy complaining. + uint16_t context_lookup[2 * kPropRangeFast] = {}; + int8_t offsets[2 * kPropRangeFast] = {}; + if (is_wp_only) { + is_wp_only = TreeToLookupTable(tree, context_lookup, offsets); + } + if (is_gradient_only) { + is_gradient_only = TreeToLookupTable(tree, context_lookup, offsets); + } + + if (is_wp_only && !skip_encoder_fast_path) { + for (size_t c = 0; c < 3; c++) { + FillImage(static_cast(PredictorColor(Predictor::Weighted)[c]), + &predictor_img.Plane(c)); + } + const intptr_t onerow = channel.plane.PixelsPerRow(); + weighted::State wp_state(wp_header, channel.w, channel.h); + Properties properties(1); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + size_t offset = 0; + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + pixel_type_w topright = + (x + 1 < channel.w && y ? *(r + x + 1 - onerow) : top); + pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top); + int32_t guess = wp_state.Predict( + x, y, channel.w, top, left, topright, topleft, toptop, &properties, + offset); + uint32_t pos = + kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]), + kPropRangeFast - 1); + uint32_t ctx_id = context_lookup[pos]; + int32_t residual = r[x] - guess - offsets[pos]; + *tokenp++ = Token(ctx_id, PackSigned(residual)); + wp_state.UpdateErrors(r[x], x, y, channel.w); + } + } + } else if (tree.size() == 1 && tree[0].predictor == Predictor::Gradient && + tree[0].multiplier == 1 && tree[0].predictor_offset == 0 && + !skip_encoder_fast_path) { + for (size_t c = 0; c < 3; c++) { + FillImage(static_cast(PredictorColor(Predictor::Gradient)[c]), + &predictor_img.Plane(c)); + } + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + int32_t guess = ClampedGradient(top, left, topleft); + int32_t residual = r[x] - guess; + *tokenp++ = Token(tree[0].childID, PackSigned(residual)); + } + } + } else if (is_gradient_only && !skip_encoder_fast_path) { + for (size_t c = 0; c < 3; c++) { + FillImage(static_cast(PredictorColor(Predictor::Gradient)[c]), + &predictor_img.Plane(c)); + } + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + int32_t guess = ClampedGradient(top, left, topleft); + uint32_t pos = + kPropRangeFast + + std::min( + std::max(-kPropRangeFast, top + left - topleft), + kPropRangeFast - 1); + uint32_t ctx_id = context_lookup[pos]; + int32_t residual = r[x] - guess - offsets[pos]; + *tokenp++ = Token(ctx_id, PackSigned(residual)); + } + } + } else if (tree.size() == 1 && tree[0].predictor == Predictor::Zero && + tree[0].multiplier == 1 && tree[0].predictor_offset == 0 && + !skip_encoder_fast_path) { + for (size_t c = 0; c < 3; c++) { + FillImage(static_cast(PredictorColor(Predictor::Zero)[c]), + &predictor_img.Plane(c)); + } + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT p = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + *tokenp++ = Token(tree[0].childID, PackSigned(p[x])); + } + } + } else if (tree.size() == 1 && tree[0].predictor != Predictor::Weighted && + (tree[0].multiplier & (tree[0].multiplier - 1)) == 0 && + tree[0].predictor_offset == 0 && !skip_encoder_fast_path) { + // multiplier is a power of 2. + for (size_t c = 0; c < 3; c++) { + FillImage(static_cast(PredictorColor(tree[0].predictor)[c]), + &predictor_img.Plane(c)); + } + uint32_t mul_shift = FloorLog2Nonzero((uint32_t)tree[0].multiplier); + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult pred = PredictNoTreeNoWP(channel.w, r + x, onerow, x, + y, tree[0].predictor); + pixel_type_w residual = r[x] - pred.guess; + JXL_DASSERT((residual >> mul_shift) * tree[0].multiplier == residual); + *tokenp++ = Token(tree[0].childID, PackSigned(residual >> mul_shift)); + } + } + + } else if (!use_wp && !skip_encoder_fast_path) { + const intptr_t onerow = channel.plane.PixelsPerRow(); + Channel references(properties.size() - kNumNonrefProperties, channel.w); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT p = channel.Row(y); + PrecomputeReferences(channel, y, image, chan, &references); + float *pred_img_row[3]; + if (kWantDebug) { + for (size_t c = 0; c < 3; c++) { + pred_img_row[c] = predictor_img.PlaneRow(c, y); + } + } + InitPropsRow(&properties, static_props, y); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult res = + PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references); + if (kWantDebug) { + for (size_t i = 0; i < 3; i++) { + pred_img_row[i][x] = PredictorColor(res.predictor)[i]; + } + } + pixel_type_w residual = p[x] - res.guess; + JXL_ASSERT(residual % res.multiplier == 0); + *tokenp++ = Token(res.context, PackSigned(residual / res.multiplier)); + } + } + } else { + const intptr_t onerow = channel.plane.PixelsPerRow(); + Channel references(properties.size() - kNumNonrefProperties, channel.w); + weighted::State wp_state(wp_header, channel.w, channel.h); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT p = channel.Row(y); + PrecomputeReferences(channel, y, image, chan, &references); + float *pred_img_row[3]; + if (kWantDebug) { + for (size_t c = 0; c < 3; c++) { + pred_img_row[c] = predictor_img.PlaneRow(c, y); + } + } + InitPropsRow(&properties, static_props, y); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult res = + PredictTreeWP(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references, &wp_state); + if (kWantDebug) { + for (size_t i = 0; i < 3; i++) { + pred_img_row[i][x] = PredictorColor(res.predictor)[i]; + } + } + pixel_type_w residual = p[x] - res.guess; + JXL_ASSERT(residual % res.multiplier == 0); + *tokenp++ = Token(res.context, PackSigned(residual / res.multiplier)); + wp_state.UpdateErrors(p[x], x, y, channel.w); + } + } + } + if (kWantDebug && WantDebugOutput(aux_out)) { + aux_out->DumpImage( + ("pred_" + ToString(group_id) + "_" + ToString(chan)).c_str(), + predictor_img); + } + *tokenpp = tokenp; + return true; +} + +Status ModularEncode(const Image &image, const ModularOptions &options, + BitWriter *writer, AuxOut *aux_out, size_t layer, + size_t group_id, TreeSamples *tree_samples, + size_t *total_pixels, const Tree *tree, + GroupHeader *header, std::vector *tokens, + size_t *width) { + if (image.error) return JXL_FAILURE("Invalid image"); + size_t nb_channels = image.channel.size(); + JXL_DEBUG_V( + 2, "Encoding %" PRIuS "-channel, %i-bit, %" PRIuS "x%" PRIuS " image.", + nb_channels, image.bitdepth, image.w, image.h); + + if (nb_channels < 1) { + return true; // is there any use for a zero-channel image? + } + + // encode transforms + GroupHeader header_storage; + if (header == nullptr) header = &header_storage; + Bundle::Init(header); + if (options.predictor == Predictor::Weighted) { + weighted::PredictorMode(options.wp_mode, &header->wp_header); + } + header->transforms = image.transform; + // This doesn't actually work + if (tree != nullptr) { + header->use_global_tree = true; + } + if (tree_samples == nullptr && tree == nullptr) { + JXL_RETURN_IF_ERROR(Bundle::Write(*header, writer, layer, aux_out)); + } + + TreeSamples tree_samples_storage; + size_t total_pixels_storage = 0; + if (!total_pixels) total_pixels = &total_pixels_storage; + // If there's no tree, compute one (or gather data to). + if (tree == nullptr) { + bool gather_data = tree_samples != nullptr; + if (tree_samples == nullptr) { + JXL_RETURN_IF_ERROR(tree_samples_storage.SetPredictor( + options.predictor, options.wp_tree_mode)); + JXL_RETURN_IF_ERROR(tree_samples_storage.SetProperties( + options.splitting_heuristics_properties, options.wp_tree_mode)); + std::vector pixel_samples; + std::vector diff_samples; + std::vector group_pixel_count; + std::vector channel_pixel_count; + CollectPixelSamples(image, options, 0, group_pixel_count, + channel_pixel_count, pixel_samples, diff_samples); + std::vector dummy_multiplier_info; + StaticPropRange range; + tree_samples_storage.PreQuantizeProperties( + range, dummy_multiplier_info, group_pixel_count, channel_pixel_count, + pixel_samples, diff_samples, options.max_property_values); + } + for (size_t i = 0; i < nb_channels; i++) { + if (!image.channel[i].w || !image.channel[i].h) { + continue; // skip empty channels + } + if (i >= image.nb_meta_channels && + (image.channel[i].w > options.max_chan_size || + image.channel[i].h > options.max_chan_size)) { + break; + } + GatherTreeData(image, i, group_id, header->wp_header, options, + gather_data ? *tree_samples : tree_samples_storage, + total_pixels); + } + if (gather_data) return true; + } + + JXL_ASSERT((tree == nullptr) == (tokens == nullptr)); + + Tree tree_storage; + std::vector> tokens_storage(1); + // Compute tree. + if (tree == nullptr) { + EntropyEncodingData code; + std::vector context_map; + + std::vector> tree_tokens(1); + tree_storage = + LearnTree(std::move(tree_samples_storage), *total_pixels, options); + tree = &tree_storage; + tokens = &tokens_storage[0]; + + Tree decoded_tree; + TokenizeTree(*tree, &tree_tokens[0], &decoded_tree); + JXL_ASSERT(tree->size() == decoded_tree.size()); + tree_storage = std::move(decoded_tree); + + if (kWantDebug && kPrintTree && WantDebugOutput(aux_out)) { + PrintTree(*tree, aux_out->debug_prefix + "/tree_" + ToString(group_id)); + } + // Write tree + BuildAndEncodeHistograms(HistogramParams(), kNumTreeContexts, tree_tokens, + &code, &context_map, writer, kLayerModularTree, + aux_out); + WriteTokens(tree_tokens[0], code, context_map, writer, kLayerModularTree, + aux_out); + } + + size_t image_width = 0; + size_t total_tokens = 0; + for (size_t i = 0; i < nb_channels; i++) { + if (i >= image.nb_meta_channels && + (image.channel[i].w > options.max_chan_size || + image.channel[i].h > options.max_chan_size)) { + break; + } + if (image.channel[i].w > image_width) image_width = image.channel[i].w; + total_tokens += image.channel[i].w * image.channel[i].h; + } + if (options.zero_tokens) { + tokens->resize(tokens->size() + total_tokens, {0, 0}); + } else { + // Do one big allocation for all the tokens we'll need, + // to avoid reallocs that might require copying. + size_t pos = tokens->size(); + tokens->resize(pos + total_tokens); + Token *tokenp = tokens->data() + pos; + for (size_t i = 0; i < nb_channels; i++) { + if (!image.channel[i].w || !image.channel[i].h) { + continue; // skip empty channels + } + if (i >= image.nb_meta_channels && + (image.channel[i].w > options.max_chan_size || + image.channel[i].h > options.max_chan_size)) { + break; + } + JXL_RETURN_IF_ERROR(EncodeModularChannelMAANS( + image, i, header->wp_header, *tree, &tokenp, aux_out, group_id, + options.skip_encoder_fast_path)); + } + // Make sure we actually wrote all tokens + JXL_CHECK(tokenp == tokens->data() + tokens->size()); + } + + // Write data if not using a global tree/ANS stream. + if (!header->use_global_tree) { + EntropyEncodingData code; + std::vector context_map; + HistogramParams histo_params; + histo_params.image_widths.push_back(image_width); + BuildAndEncodeHistograms(histo_params, (tree->size() + 1) / 2, + tokens_storage, &code, &context_map, writer, layer, + aux_out); + WriteTokens(tokens_storage[0], code, context_map, writer, layer, aux_out); + } else { + *width = image_width; + } + return true; +} + +Status ModularGenericCompress(Image &image, const ModularOptions &opts, + BitWriter *writer, AuxOut *aux_out, size_t layer, + size_t group_id, TreeSamples *tree_samples, + size_t *total_pixels, const Tree *tree, + GroupHeader *header, std::vector *tokens, + size_t *width) { + if (image.w == 0 || image.h == 0) return true; + ModularOptions options = opts; // Make a copy to modify it. + + if (options.predictor == static_cast(-1)) { + options.predictor = Predictor::Gradient; + } + + size_t bits = writer ? writer->BitsWritten() : 0; + JXL_RETURN_IF_ERROR(ModularEncode(image, options, writer, aux_out, layer, + group_id, tree_samples, total_pixels, tree, + header, tokens, width)); + bits = writer ? writer->BitsWritten() - bits : 0; + if (writer) { + JXL_DEBUG_V(4, + "Modular-encoded a %" PRIuS "x%" PRIuS + " bitdepth=%i nbchans=%" PRIuS " image in %" PRIuS " bytes", + image.w, image.h, image.bitdepth, image.channel.size(), + bits / 8); + } + (void)bits; + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h new file mode 100644 index 0000000000..04df504750 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h @@ -0,0 +1,47 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_ +#define LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_ + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/image.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/encoding/enc_ma.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels, + const ModularOptions &options, + const std::vector &multiplier_info = {}, + StaticPropRange static_prop_range = {}); + +// TODO(veluca): make cleaner interfaces. + +Status ModularGenericCompress( + Image &image, const ModularOptions &opts, BitWriter *writer, + AuxOut *aux_out = nullptr, size_t layer = 0, size_t group_id = 0, + // For gathering data for producing a global tree. + TreeSamples *tree_samples = nullptr, size_t *total_pixels = nullptr, + // For encoding with global tree. + const Tree *tree = nullptr, GroupHeader *header = nullptr, + std::vector *tokens = nullptr, size_t *widths = nullptr); +} // namespace jxl + +#endif // LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc new file mode 100644 index 0000000000..d0f6b47566 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc @@ -0,0 +1,1023 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/encoding/enc_ma.h" + +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/modular/encoding/ma_common.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/modular/encoding/enc_ma.cc" +#include +#include + +#include "lib/jxl/base/random.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/options.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Eq; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::Lt; + +const HWY_FULL(float) df; +const HWY_FULL(int32_t) di; +size_t Padded(size_t x) { return RoundUpTo(x, Lanes(df)); } + +float EstimateBits(const int32_t *counts, int32_t *rounded_counts, + size_t num_symbols) { + // Try to approximate the effect of rounding up nonzero probabilities. + int32_t total = std::accumulate(counts, counts + num_symbols, 0); + const auto min = Set(di, (total + ANS_TAB_SIZE - 1) >> ANS_LOG_TAB_SIZE); + const auto zero_i = Zero(di); + for (size_t i = 0; i < num_symbols; i += Lanes(df)) { + auto counts_v = LoadU(di, &counts[i]); + counts_v = IfThenElse(Eq(counts_v, zero_i), zero_i, + IfThenElse(Lt(counts_v, min), min, counts_v)); + StoreU(counts_v, di, &rounded_counts[i]); + } + // Compute entropy of the "rounded" probabilities. + const auto zero = Zero(df); + const size_t total_scalar = + std::accumulate(rounded_counts, rounded_counts + num_symbols, 0); + const auto inv_total = Set(df, 1.0f / total_scalar); + auto bits_lanes = Zero(df); + auto total_v = Set(di, total_scalar); + for (size_t i = 0; i < num_symbols; i += Lanes(df)) { + const auto counts_v = ConvertTo(df, LoadU(di, &counts[i])); + const auto round_counts_v = LoadU(di, &rounded_counts[i]); + const auto probs = Mul(ConvertTo(df, round_counts_v), inv_total); + const auto nbps = IfThenElse(Eq(round_counts_v, total_v), BitCast(di, zero), + BitCast(di, FastLog2f(df, probs))); + bits_lanes = Sub(bits_lanes, IfThenElse(Eq(counts_v, zero), zero, + Mul(counts_v, BitCast(df, nbps)))); + } + return GetLane(SumOfLanes(df, bits_lanes)); +} + +void MakeSplitNode(size_t pos, int property, int splitval, Predictor lpred, + int64_t loff, Predictor rpred, int64_t roff, Tree *tree) { + // Note that the tree splits on *strictly greater*. + (*tree)[pos].lchild = tree->size(); + (*tree)[pos].rchild = tree->size() + 1; + (*tree)[pos].splitval = splitval; + (*tree)[pos].property = property; + tree->emplace_back(); + tree->back().property = -1; + tree->back().predictor = rpred; + tree->back().predictor_offset = roff; + tree->back().multiplier = 1; + tree->emplace_back(); + tree->back().property = -1; + tree->back().predictor = lpred; + tree->back().predictor_offset = loff; + tree->back().multiplier = 1; +} + +enum class IntersectionType { kNone, kPartial, kInside }; +IntersectionType BoxIntersects(StaticPropRange needle, StaticPropRange haystack, + uint32_t &partial_axis, uint32_t &partial_val) { + bool partial = false; + for (size_t i = 0; i < kNumStaticProperties; i++) { + if (haystack[i][0] >= needle[i][1]) { + return IntersectionType::kNone; + } + if (haystack[i][1] <= needle[i][0]) { + return IntersectionType::kNone; + } + if (haystack[i][0] <= needle[i][0] && haystack[i][1] >= needle[i][1]) { + continue; + } + partial = true; + partial_axis = i; + if (haystack[i][0] > needle[i][0] && haystack[i][0] < needle[i][1]) { + partial_val = haystack[i][0] - 1; + } else { + JXL_DASSERT(haystack[i][1] > needle[i][0] && + haystack[i][1] < needle[i][1]); + partial_val = haystack[i][1] - 1; + } + } + return partial ? IntersectionType::kPartial : IntersectionType::kInside; +} + +void SplitTreeSamples(TreeSamples &tree_samples, size_t begin, size_t pos, + size_t end, size_t prop) { + auto cmp = [&](size_t a, size_t b) { + return int32_t(tree_samples.Property(prop, a)) - + int32_t(tree_samples.Property(prop, b)); + }; + Rng rng(0); + while (end > begin + 1) { + { + size_t pivot = rng.UniformU(begin, end); + tree_samples.Swap(begin, pivot); + } + size_t pivot_begin = begin; + size_t pivot_end = pivot_begin + 1; + for (size_t i = begin + 1; i < end; i++) { + JXL_DASSERT(i >= pivot_end); + JXL_DASSERT(pivot_end > pivot_begin); + int32_t cmp_result = cmp(i, pivot_begin); + if (cmp_result < 0) { // i < pivot, move pivot forward and put i before + // the pivot. + tree_samples.ThreeShuffle(pivot_begin, pivot_end, i); + pivot_begin++; + pivot_end++; + } else if (cmp_result == 0) { + tree_samples.Swap(pivot_end, i); + pivot_end++; + } + } + JXL_DASSERT(pivot_begin >= begin); + JXL_DASSERT(pivot_end > pivot_begin); + JXL_DASSERT(pivot_end <= end); + for (size_t i = begin; i < pivot_begin; i++) { + JXL_DASSERT(cmp(i, pivot_begin) < 0); + } + for (size_t i = pivot_end; i < end; i++) { + JXL_DASSERT(cmp(i, pivot_begin) > 0); + } + for (size_t i = pivot_begin; i < pivot_end; i++) { + JXL_DASSERT(cmp(i, pivot_begin) == 0); + } + // We now have that [begin, pivot_begin) is < pivot, [pivot_begin, + // pivot_end) is = pivot, and [pivot_end, end) is > pivot. + // If pos falls in the first or the last interval, we continue in that + // interval; otherwise, we are done. + if (pivot_begin > pos) { + end = pivot_begin; + } else if (pivot_end < pos) { + begin = pivot_end; + } else { + break; + } + } +} + +void FindBestSplit(TreeSamples &tree_samples, float threshold, + const std::vector &mul_info, + StaticPropRange initial_static_prop_range, + float fast_decode_multiplier, Tree *tree) { + struct NodeInfo { + size_t pos; + size_t begin; + size_t end; + uint64_t used_properties; + StaticPropRange static_prop_range; + }; + std::vector nodes; + nodes.push_back(NodeInfo{0, 0, tree_samples.NumDistinctSamples(), 0, + initial_static_prop_range}); + + size_t num_predictors = tree_samples.NumPredictors(); + size_t num_properties = tree_samples.NumProperties(); + + // TODO(veluca): consider parallelizing the search (processing multiple nodes + // at a time). + while (!nodes.empty()) { + size_t pos = nodes.back().pos; + size_t begin = nodes.back().begin; + size_t end = nodes.back().end; + uint64_t used_properties = nodes.back().used_properties; + StaticPropRange static_prop_range = nodes.back().static_prop_range; + nodes.pop_back(); + if (begin == end) continue; + + struct SplitInfo { + size_t prop = 0; + uint32_t val = 0; + size_t pos = 0; + float lcost = std::numeric_limits::max(); + float rcost = std::numeric_limits::max(); + Predictor lpred = Predictor::Zero; + Predictor rpred = Predictor::Zero; + float Cost() { return lcost + rcost; } + }; + + SplitInfo best_split_static_constant; + SplitInfo best_split_static; + SplitInfo best_split_nonstatic; + SplitInfo best_split_nowp; + + JXL_DASSERT(begin <= end); + JXL_DASSERT(end <= tree_samples.NumDistinctSamples()); + + // Compute the maximum token in the range. + size_t max_symbols = 0; + for (size_t pred = 0; pred < num_predictors; pred++) { + for (size_t i = begin; i < end; i++) { + uint32_t tok = tree_samples.Token(pred, i); + max_symbols = max_symbols > tok + 1 ? max_symbols : tok + 1; + } + } + max_symbols = Padded(max_symbols); + std::vector rounded_counts(max_symbols); + std::vector counts(max_symbols * num_predictors); + std::vector tot_extra_bits(num_predictors); + for (size_t pred = 0; pred < num_predictors; pred++) { + for (size_t i = begin; i < end; i++) { + counts[pred * max_symbols + tree_samples.Token(pred, i)] += + tree_samples.Count(i); + tot_extra_bits[pred] += + tree_samples.NBits(pred, i) * tree_samples.Count(i); + } + } + + float base_bits; + { + size_t pred = tree_samples.PredictorIndex((*tree)[pos].predictor); + base_bits = EstimateBits(counts.data() + pred * max_symbols, + rounded_counts.data(), max_symbols) + + tot_extra_bits[pred]; + } + + SplitInfo *best = &best_split_nonstatic; + + SplitInfo forced_split; + // The multiplier ranges cut halfway through the current ranges of static + // properties. We do this even if the current node is not a leaf, to + // minimize the number of nodes in the resulting tree. + for (size_t i = 0; i < mul_info.size(); i++) { + uint32_t axis, val; + IntersectionType t = + BoxIntersects(static_prop_range, mul_info[i].range, axis, val); + if (t == IntersectionType::kNone) continue; + if (t == IntersectionType::kInside) { + (*tree)[pos].multiplier = mul_info[i].multiplier; + break; + } + if (t == IntersectionType::kPartial) { + forced_split.val = tree_samples.QuantizeProperty(axis, val); + forced_split.prop = axis; + forced_split.lcost = forced_split.rcost = base_bits / 2 - threshold; + forced_split.lpred = forced_split.rpred = (*tree)[pos].predictor; + best = &forced_split; + best->pos = begin; + JXL_ASSERT(best->prop == tree_samples.PropertyFromIndex(best->prop)); + for (size_t x = begin; x < end; x++) { + if (tree_samples.Property(best->prop, x) <= best->val) { + best->pos++; + } + } + break; + } + } + + if (best != &forced_split) { + std::vector prop_value_used_count; + std::vector count_increase; + std::vector extra_bits_increase; + // For each property, compute which of its values are used, and what + // tokens correspond to those usages. Then, iterate through the values, + // and compute the entropy of each side of the split (of the form `prop > + // threshold`). Finally, find the split that minimizes the cost. + struct CostInfo { + float cost = std::numeric_limits::max(); + float extra_cost = 0; + float Cost() const { return cost + extra_cost; } + Predictor pred; // will be uninitialized in some cases, but never used. + }; + std::vector costs_l; + std::vector costs_r; + + std::vector counts_above(max_symbols); + std::vector counts_below(max_symbols); + + // The lower the threshold, the higher the expected noisiness of the + // estimate. Thus, discourage changing predictors. + float change_pred_penalty = 800.0f / (100.0f + threshold); + for (size_t prop = 0; prop < num_properties && base_bits > threshold; + prop++) { + costs_l.clear(); + costs_r.clear(); + size_t prop_size = tree_samples.NumPropertyValues(prop); + if (extra_bits_increase.size() < prop_size) { + count_increase.resize(prop_size * max_symbols); + extra_bits_increase.resize(prop_size); + } + // Clear prop_value_used_count (which cannot be cleared "on the go") + prop_value_used_count.clear(); + prop_value_used_count.resize(prop_size); + + size_t first_used = prop_size; + size_t last_used = 0; + + // TODO(veluca): consider finding multiple splits along a single + // property at the same time, possibly with a bottom-up approach. + for (size_t i = begin; i < end; i++) { + size_t p = tree_samples.Property(prop, i); + prop_value_used_count[p]++; + last_used = std::max(last_used, p); + first_used = std::min(first_used, p); + } + costs_l.resize(last_used - first_used); + costs_r.resize(last_used - first_used); + // For all predictors, compute the right and left costs of each split. + for (size_t pred = 0; pred < num_predictors; pred++) { + // Compute cost and histogram increments for each property value. + for (size_t i = begin; i < end; i++) { + size_t p = tree_samples.Property(prop, i); + size_t cnt = tree_samples.Count(i); + size_t sym = tree_samples.Token(pred, i); + count_increase[p * max_symbols + sym] += cnt; + extra_bits_increase[p] += tree_samples.NBits(pred, i) * cnt; + } + memcpy(counts_above.data(), counts.data() + pred * max_symbols, + max_symbols * sizeof counts_above[0]); + memset(counts_below.data(), 0, max_symbols * sizeof counts_below[0]); + size_t extra_bits_below = 0; + // Exclude last used: this ensures neither counts_above nor + // counts_below is empty. + for (size_t i = first_used; i < last_used; i++) { + if (!prop_value_used_count[i]) continue; + extra_bits_below += extra_bits_increase[i]; + // The increase for this property value has been used, and will not + // be used again: clear it. Also below. + extra_bits_increase[i] = 0; + for (size_t sym = 0; sym < max_symbols; sym++) { + counts_above[sym] -= count_increase[i * max_symbols + sym]; + counts_below[sym] += count_increase[i * max_symbols + sym]; + count_increase[i * max_symbols + sym] = 0; + } + float rcost = EstimateBits(counts_above.data(), + rounded_counts.data(), max_symbols) + + tot_extra_bits[pred] - extra_bits_below; + float lcost = EstimateBits(counts_below.data(), + rounded_counts.data(), max_symbols) + + extra_bits_below; + JXL_DASSERT(extra_bits_below <= tot_extra_bits[pred]); + float penalty = 0; + // Never discourage moving away from the Weighted predictor. + if (tree_samples.PredictorFromIndex(pred) != + (*tree)[pos].predictor && + (*tree)[pos].predictor != Predictor::Weighted) { + penalty = change_pred_penalty; + } + // If everything else is equal, disfavour Weighted (slower) and + // favour Zero (faster if it's the only predictor used in a + // group+channel combination) + if (tree_samples.PredictorFromIndex(pred) == Predictor::Weighted) { + penalty += 1e-8; + } + if (tree_samples.PredictorFromIndex(pred) == Predictor::Zero) { + penalty -= 1e-8; + } + if (rcost + penalty < costs_r[i - first_used].Cost()) { + costs_r[i - first_used].cost = rcost; + costs_r[i - first_used].extra_cost = penalty; + costs_r[i - first_used].pred = + tree_samples.PredictorFromIndex(pred); + } + if (lcost + penalty < costs_l[i - first_used].Cost()) { + costs_l[i - first_used].cost = lcost; + costs_l[i - first_used].extra_cost = penalty; + costs_l[i - first_used].pred = + tree_samples.PredictorFromIndex(pred); + } + } + } + // Iterate through the possible splits and find the one with minimum sum + // of costs of the two sides. + size_t split = begin; + for (size_t i = first_used; i < last_used; i++) { + if (!prop_value_used_count[i]) continue; + split += prop_value_used_count[i]; + float rcost = costs_r[i - first_used].cost; + float lcost = costs_l[i - first_used].cost; + // WP was not used + we would use the WP property or predictor + bool adds_wp = + (tree_samples.PropertyFromIndex(prop) == kWPProp && + (used_properties & (1LU << prop)) == 0) || + ((costs_l[i - first_used].pred == Predictor::Weighted || + costs_r[i - first_used].pred == Predictor::Weighted) && + (*tree)[pos].predictor != Predictor::Weighted); + bool zero_entropy_side = rcost == 0 || lcost == 0; + + SplitInfo &best = + prop < kNumStaticProperties + ? (zero_entropy_side ? best_split_static_constant + : best_split_static) + : (adds_wp ? best_split_nonstatic : best_split_nowp); + if (lcost + rcost < best.Cost()) { + best.prop = prop; + best.val = i; + best.pos = split; + best.lcost = lcost; + best.lpred = costs_l[i - first_used].pred; + best.rcost = rcost; + best.rpred = costs_r[i - first_used].pred; + } + } + // Clear extra_bits_increase and cost_increase for last_used. + extra_bits_increase[last_used] = 0; + for (size_t sym = 0; sym < max_symbols; sym++) { + count_increase[last_used * max_symbols + sym] = 0; + } + } + + // Try to avoid introducing WP. + if (best_split_nowp.Cost() + threshold < base_bits && + best_split_nowp.Cost() <= fast_decode_multiplier * best->Cost()) { + best = &best_split_nowp; + } + // Split along static props if possible and not significantly more + // expensive. + if (best_split_static.Cost() + threshold < base_bits && + best_split_static.Cost() <= fast_decode_multiplier * best->Cost()) { + best = &best_split_static; + } + // Split along static props to create constant nodes if possible. + if (best_split_static_constant.Cost() + threshold < base_bits) { + best = &best_split_static_constant; + } + } + + if (best->Cost() + threshold < base_bits) { + uint32_t p = tree_samples.PropertyFromIndex(best->prop); + pixel_type dequant = + tree_samples.UnquantizeProperty(best->prop, best->val); + // Split node and try to split children. + MakeSplitNode(pos, p, dequant, best->lpred, 0, best->rpred, 0, tree); + // "Sort" according to winning property + SplitTreeSamples(tree_samples, begin, best->pos, end, best->prop); + if (p >= kNumStaticProperties) { + used_properties |= 1 << best->prop; + } + auto new_sp_range = static_prop_range; + if (p < kNumStaticProperties) { + JXL_ASSERT(static_cast(dequant + 1) <= new_sp_range[p][1]); + new_sp_range[p][1] = dequant + 1; + JXL_ASSERT(new_sp_range[p][0] < new_sp_range[p][1]); + } + nodes.push_back(NodeInfo{(*tree)[pos].rchild, begin, best->pos, + used_properties, new_sp_range}); + new_sp_range = static_prop_range; + if (p < kNumStaticProperties) { + JXL_ASSERT(new_sp_range[p][0] <= static_cast(dequant + 1)); + new_sp_range[p][0] = dequant + 1; + JXL_ASSERT(new_sp_range[p][0] < new_sp_range[p][1]); + } + nodes.push_back(NodeInfo{(*tree)[pos].lchild, best->pos, end, + used_properties, new_sp_range}); + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(FindBestSplit); // Local function. + +void ComputeBestTree(TreeSamples &tree_samples, float threshold, + const std::vector &mul_info, + StaticPropRange static_prop_range, + float fast_decode_multiplier, Tree *tree) { + // TODO(veluca): take into account that different contexts can have different + // uint configs. + // + // Initialize tree. + tree->emplace_back(); + tree->back().property = -1; + tree->back().predictor = tree_samples.PredictorFromIndex(0); + tree->back().predictor_offset = 0; + tree->back().multiplier = 1; + JXL_ASSERT(tree_samples.NumProperties() < 64); + + JXL_ASSERT(tree_samples.NumDistinctSamples() <= + std::numeric_limits::max()); + HWY_DYNAMIC_DISPATCH(FindBestSplit) + (tree_samples, threshold, mul_info, static_prop_range, fast_decode_multiplier, + tree); +} + +constexpr int32_t TreeSamples::kPropertyRange; +constexpr uint32_t TreeSamples::kDedupEntryUnused; + +Status TreeSamples::SetPredictor(Predictor predictor, + ModularOptions::TreeMode wp_tree_mode) { + if (wp_tree_mode == ModularOptions::TreeMode::kWPOnly) { + predictors = {Predictor::Weighted}; + residuals.resize(1); + return true; + } + if (wp_tree_mode == ModularOptions::TreeMode::kNoWP && + predictor == Predictor::Weighted) { + return JXL_FAILURE("Invalid predictor settings"); + } + if (predictor == Predictor::Variable) { + for (size_t i = 0; i < kNumModularPredictors; i++) { + predictors.push_back(static_cast(i)); + } + std::swap(predictors[0], predictors[static_cast(Predictor::Weighted)]); + std::swap(predictors[1], predictors[static_cast(Predictor::Gradient)]); + } else if (predictor == Predictor::Best) { + predictors = {Predictor::Weighted, Predictor::Gradient}; + } else { + predictors = {predictor}; + } + if (wp_tree_mode == ModularOptions::TreeMode::kNoWP) { + auto wp_it = + std::find(predictors.begin(), predictors.end(), Predictor::Weighted); + if (wp_it != predictors.end()) { + predictors.erase(wp_it); + } + } + residuals.resize(predictors.size()); + return true; +} + +Status TreeSamples::SetProperties(const std::vector &properties, + ModularOptions::TreeMode wp_tree_mode) { + props_to_use = properties; + if (wp_tree_mode == ModularOptions::TreeMode::kWPOnly) { + props_to_use = {static_cast(kWPProp)}; + } + if (wp_tree_mode == ModularOptions::TreeMode::kGradientOnly) { + props_to_use = {static_cast(kGradientProp)}; + } + if (wp_tree_mode == ModularOptions::TreeMode::kNoWP) { + auto it = std::find(props_to_use.begin(), props_to_use.end(), kWPProp); + if (it != props_to_use.end()) { + props_to_use.erase(it); + } + } + if (props_to_use.empty()) { + return JXL_FAILURE("Invalid property set configuration"); + } + props.resize(props_to_use.size()); + return true; +} + +void TreeSamples::InitTable(size_t size) { + JXL_DASSERT((size & (size - 1)) == 0); + if (dedup_table_.size() == size) return; + dedup_table_.resize(size, kDedupEntryUnused); + for (size_t i = 0; i < NumDistinctSamples(); i++) { + if (sample_counts[i] != std::numeric_limits::max()) { + AddToTable(i); + } + } +} + +bool TreeSamples::AddToTableAndMerge(size_t a) { + size_t pos1 = Hash1(a); + size_t pos2 = Hash2(a); + if (dedup_table_[pos1] != kDedupEntryUnused && + IsSameSample(a, dedup_table_[pos1])) { + JXL_DASSERT(sample_counts[a] == 1); + sample_counts[dedup_table_[pos1]]++; + // Remove from hash table samples that are saturated. + if (sample_counts[dedup_table_[pos1]] == + std::numeric_limits::max()) { + dedup_table_[pos1] = kDedupEntryUnused; + } + return true; + } + if (dedup_table_[pos2] != kDedupEntryUnused && + IsSameSample(a, dedup_table_[pos2])) { + JXL_DASSERT(sample_counts[a] == 1); + sample_counts[dedup_table_[pos2]]++; + // Remove from hash table samples that are saturated. + if (sample_counts[dedup_table_[pos2]] == + std::numeric_limits::max()) { + dedup_table_[pos2] = kDedupEntryUnused; + } + return true; + } + AddToTable(a); + return false; +} + +void TreeSamples::AddToTable(size_t a) { + size_t pos1 = Hash1(a); + size_t pos2 = Hash2(a); + if (dedup_table_[pos1] == kDedupEntryUnused) { + dedup_table_[pos1] = a; + } else if (dedup_table_[pos2] == kDedupEntryUnused) { + dedup_table_[pos2] = a; + } +} + +void TreeSamples::PrepareForSamples(size_t num_samples) { + for (auto &res : residuals) { + res.reserve(res.size() + num_samples); + } + for (auto &p : props) { + p.reserve(p.size() + num_samples); + } + size_t total_num_samples = num_samples + sample_counts.size(); + size_t next_pow2 = 1LLU << CeilLog2Nonzero(total_num_samples * 3 / 2); + InitTable(next_pow2); +} + +size_t TreeSamples::Hash1(size_t a) const { + constexpr uint64_t constant = 0x1e35a7bd; + uint64_t h = constant; + for (const auto &r : residuals) { + h = h * constant + r[a].tok; + h = h * constant + r[a].nbits; + } + for (const auto &p : props) { + h = h * constant + p[a]; + } + return (h >> 16) & (dedup_table_.size() - 1); +} +size_t TreeSamples::Hash2(size_t a) const { + constexpr uint64_t constant = 0x1e35a7bd1e35a7bd; + uint64_t h = constant; + for (const auto &p : props) { + h = h * constant ^ p[a]; + } + for (const auto &r : residuals) { + h = h * constant ^ r[a].tok; + h = h * constant ^ r[a].nbits; + } + return (h >> 16) & (dedup_table_.size() - 1); +} + +bool TreeSamples::IsSameSample(size_t a, size_t b) const { + bool ret = true; + for (const auto &r : residuals) { + if (r[a].tok != r[b].tok) { + ret = false; + } + if (r[a].nbits != r[b].nbits) { + ret = false; + } + } + for (const auto &p : props) { + if (p[a] != p[b]) { + ret = false; + } + } + return ret; +} + +void TreeSamples::AddSample(pixel_type_w pixel, const Properties &properties, + const pixel_type_w *predictions) { + for (size_t i = 0; i < predictors.size(); i++) { + pixel_type v = pixel - predictions[static_cast(predictors[i])]; + uint32_t tok, nbits, bits; + HybridUintConfig(4, 1, 2).Encode(PackSigned(v), &tok, &nbits, &bits); + JXL_DASSERT(tok < 256); + JXL_DASSERT(nbits < 256); + residuals[i].emplace_back( + ResidualToken{static_cast(tok), static_cast(nbits)}); + } + for (size_t i = 0; i < props_to_use.size(); i++) { + props[i].push_back(QuantizeProperty(i, properties[props_to_use[i]])); + } + sample_counts.push_back(1); + num_samples++; + if (AddToTableAndMerge(sample_counts.size() - 1)) { + for (auto &r : residuals) r.pop_back(); + for (auto &p : props) p.pop_back(); + sample_counts.pop_back(); + } +} + +void TreeSamples::Swap(size_t a, size_t b) { + if (a == b) return; + for (auto &r : residuals) { + std::swap(r[a], r[b]); + } + for (auto &p : props) { + std::swap(p[a], p[b]); + } + std::swap(sample_counts[a], sample_counts[b]); +} + +void TreeSamples::ThreeShuffle(size_t a, size_t b, size_t c) { + if (b == c) return Swap(a, b); + for (auto &r : residuals) { + auto tmp = r[a]; + r[a] = r[c]; + r[c] = r[b]; + r[b] = tmp; + } + for (auto &p : props) { + auto tmp = p[a]; + p[a] = p[c]; + p[c] = p[b]; + p[b] = tmp; + } + auto tmp = sample_counts[a]; + sample_counts[a] = sample_counts[c]; + sample_counts[c] = sample_counts[b]; + sample_counts[b] = tmp; +} + +namespace { +std::vector QuantizeHistogram(const std::vector &histogram, + size_t num_chunks) { + if (histogram.empty()) return {}; + // TODO(veluca): selecting distinct quantiles is likely not the best + // way to go about this. + std::vector thresholds; + size_t sum = std::accumulate(histogram.begin(), histogram.end(), 0LU); + size_t cumsum = 0; + size_t threshold = 0; + for (size_t i = 0; i + 1 < histogram.size(); i++) { + cumsum += histogram[i]; + if (cumsum > (threshold + 1) * sum / num_chunks) { + thresholds.push_back(i); + while (cumsum >= (threshold + 1) * sum / num_chunks) threshold++; + } + } + return thresholds; +} + +std::vector QuantizeSamples(const std::vector &samples, + size_t num_chunks) { + if (samples.empty()) return {}; + int min = *std::min_element(samples.begin(), samples.end()); + constexpr int kRange = 512; + min = std::min(std::max(min, -kRange), kRange); + std::vector counts(2 * kRange + 1); + for (int s : samples) { + uint32_t sample_offset = std::min(std::max(s, -kRange), kRange) - min; + counts[sample_offset]++; + } + std::vector thresholds = QuantizeHistogram(counts, num_chunks); + for (auto &v : thresholds) v += min; + return thresholds; +} +} // namespace + +void TreeSamples::PreQuantizeProperties( + const StaticPropRange &range, + const std::vector &multiplier_info, + const std::vector &group_pixel_count, + const std::vector &channel_pixel_count, + std::vector &pixel_samples, + std::vector &diff_samples, size_t max_property_values) { + // If we have forced splits because of multipliers, choose channel and group + // thresholds accordingly. + std::vector group_multiplier_thresholds; + std::vector channel_multiplier_thresholds; + for (const auto &v : multiplier_info) { + if (v.range[0][0] != range[0][0]) { + channel_multiplier_thresholds.push_back(v.range[0][0] - 1); + } + if (v.range[0][1] != range[0][1]) { + channel_multiplier_thresholds.push_back(v.range[0][1] - 1); + } + if (v.range[1][0] != range[1][0]) { + group_multiplier_thresholds.push_back(v.range[1][0] - 1); + } + if (v.range[1][1] != range[1][1]) { + group_multiplier_thresholds.push_back(v.range[1][1] - 1); + } + } + std::sort(channel_multiplier_thresholds.begin(), + channel_multiplier_thresholds.end()); + channel_multiplier_thresholds.resize( + std::unique(channel_multiplier_thresholds.begin(), + channel_multiplier_thresholds.end()) - + channel_multiplier_thresholds.begin()); + std::sort(group_multiplier_thresholds.begin(), + group_multiplier_thresholds.end()); + group_multiplier_thresholds.resize( + std::unique(group_multiplier_thresholds.begin(), + group_multiplier_thresholds.end()) - + group_multiplier_thresholds.begin()); + + compact_properties.resize(props_to_use.size()); + auto quantize_channel = [&]() { + if (!channel_multiplier_thresholds.empty()) { + return channel_multiplier_thresholds; + } + return QuantizeHistogram(channel_pixel_count, max_property_values); + }; + auto quantize_group_id = [&]() { + if (!group_multiplier_thresholds.empty()) { + return group_multiplier_thresholds; + } + return QuantizeHistogram(group_pixel_count, max_property_values); + }; + auto quantize_coordinate = [&]() { + std::vector quantized; + quantized.reserve(max_property_values - 1); + for (size_t i = 0; i + 1 < max_property_values; i++) { + quantized.push_back((i + 1) * 256 / max_property_values - 1); + } + return quantized; + }; + std::vector abs_pixel_thr; + std::vector pixel_thr; + auto quantize_pixel_property = [&]() { + if (pixel_thr.empty()) { + pixel_thr = QuantizeSamples(pixel_samples, max_property_values); + } + return pixel_thr; + }; + auto quantize_abs_pixel_property = [&]() { + if (abs_pixel_thr.empty()) { + quantize_pixel_property(); // Compute the non-abs thresholds. + for (auto &v : pixel_samples) v = std::abs(v); + abs_pixel_thr = QuantizeSamples(pixel_samples, max_property_values); + } + return abs_pixel_thr; + }; + std::vector abs_diff_thr; + std::vector diff_thr; + auto quantize_diff_property = [&]() { + if (diff_thr.empty()) { + diff_thr = QuantizeSamples(diff_samples, max_property_values); + } + return diff_thr; + }; + auto quantize_abs_diff_property = [&]() { + if (abs_diff_thr.empty()) { + quantize_diff_property(); // Compute the non-abs thresholds. + for (auto &v : diff_samples) v = std::abs(v); + abs_diff_thr = QuantizeSamples(diff_samples, max_property_values); + } + return abs_diff_thr; + }; + auto quantize_wp = [&]() { + if (max_property_values < 32) { + return std::vector{-127, -63, -31, -15, -7, -3, -1, 0, + 1, 3, 7, 15, 31, 63, 127}; + } + if (max_property_values < 64) { + return std::vector{-255, -191, -127, -95, -63, -47, -31, -23, + -15, -11, -7, -5, -3, -1, 0, 1, + 3, 5, 7, 11, 15, 23, 31, 47, + 63, 95, 127, 191, 255}; + } + return std::vector{ + -255, -223, -191, -159, -127, -111, -95, -79, -63, -55, -47, + -39, -31, -27, -23, -19, -15, -13, -11, -9, -7, -6, + -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 9, 11, 13, 15, 19, 23, 27, 31, 39, + 47, 55, 63, 79, 95, 111, 127, 159, 191, 223, 255}; + }; + + property_mapping.resize(props_to_use.size()); + for (size_t i = 0; i < props_to_use.size(); i++) { + if (props_to_use[i] == 0) { + compact_properties[i] = quantize_channel(); + } else if (props_to_use[i] == 1) { + compact_properties[i] = quantize_group_id(); + } else if (props_to_use[i] == 2 || props_to_use[i] == 3) { + compact_properties[i] = quantize_coordinate(); + } else if (props_to_use[i] == 6 || props_to_use[i] == 7 || + props_to_use[i] == 8 || + (props_to_use[i] >= kNumNonrefProperties && + (props_to_use[i] - kNumNonrefProperties) % 4 == 1)) { + compact_properties[i] = quantize_pixel_property(); + } else if (props_to_use[i] == 4 || props_to_use[i] == 5 || + (props_to_use[i] >= kNumNonrefProperties && + (props_to_use[i] - kNumNonrefProperties) % 4 == 0)) { + compact_properties[i] = quantize_abs_pixel_property(); + } else if (props_to_use[i] >= kNumNonrefProperties && + (props_to_use[i] - kNumNonrefProperties) % 4 == 2) { + compact_properties[i] = quantize_abs_diff_property(); + } else if (props_to_use[i] == kWPProp) { + compact_properties[i] = quantize_wp(); + } else { + compact_properties[i] = quantize_diff_property(); + } + property_mapping[i].resize(kPropertyRange * 2 + 1); + size_t mapped = 0; + for (size_t j = 0; j < property_mapping[i].size(); j++) { + while (mapped < compact_properties[i].size() && + static_cast(j) - kPropertyRange > + compact_properties[i][mapped]) { + mapped++; + } + // property_mapping[i] of a value V is `mapped` if + // compact_properties[i][mapped] <= j and + // compact_properties[i][mapped-1] > j + // This is because the decision node in the tree splits on (property) > j, + // hence everything that is not > of a threshold should be clustered + // together. + property_mapping[i][j] = mapped; + } + } +} + +void CollectPixelSamples(const Image &image, const ModularOptions &options, + size_t group_id, + std::vector &group_pixel_count, + std::vector &channel_pixel_count, + std::vector &pixel_samples, + std::vector &diff_samples) { + if (options.nb_repeats == 0) return; + if (group_pixel_count.size() <= group_id) { + group_pixel_count.resize(group_id + 1); + } + if (channel_pixel_count.size() < image.channel.size()) { + channel_pixel_count.resize(image.channel.size()); + } + Rng rng(group_id); + // Sample 10% of the final number of samples for property quantization. + float fraction = std::min(options.nb_repeats * 0.1, 0.99); + Rng::GeometricDistribution dist(fraction); + size_t total_pixels = 0; + std::vector channel_ids; + for (size_t i = 0; i < image.channel.size(); i++) { + if (image.channel[i].w <= 1 || image.channel[i].h == 0) { + continue; // skip empty or width-1 channels. + } + if (i >= image.nb_meta_channels && + (image.channel[i].w > options.max_chan_size || + image.channel[i].h > options.max_chan_size)) { + break; + } + channel_ids.push_back(i); + group_pixel_count[group_id] += image.channel[i].w * image.channel[i].h; + channel_pixel_count[i] += image.channel[i].w * image.channel[i].h; + total_pixels += image.channel[i].w * image.channel[i].h; + } + if (channel_ids.empty()) return; + pixel_samples.reserve(pixel_samples.size() + fraction * total_pixels); + diff_samples.reserve(diff_samples.size() + fraction * total_pixels); + size_t i = 0; + size_t y = 0; + size_t x = 0; + auto advance = [&](size_t amount) { + x += amount; + // Detect row overflow (rare). + while (x >= image.channel[channel_ids[i]].w) { + x -= image.channel[channel_ids[i]].w; + y++; + // Detect end-of-channel (even rarer). + if (y == image.channel[channel_ids[i]].h) { + i++; + y = 0; + if (i >= channel_ids.size()) { + return; + } + } + } + }; + advance(rng.Geometric(dist)); + for (; i < channel_ids.size(); advance(rng.Geometric(dist) + 1)) { + const pixel_type *row = image.channel[channel_ids[i]].Row(y); + pixel_samples.push_back(row[x]); + size_t xp = x == 0 ? 1 : x - 1; + diff_samples.push_back((int64_t)row[x] - row[xp]); + } +} + +// TODO(veluca): very simple encoding scheme. This should be improved. +void TokenizeTree(const Tree &tree, std::vector *tokens, + Tree *decoder_tree) { + JXL_ASSERT(tree.size() <= kMaxTreeSize); + std::queue q; + q.push(0); + size_t leaf_id = 0; + decoder_tree->clear(); + while (!q.empty()) { + int cur = q.front(); + q.pop(); + JXL_ASSERT(tree[cur].property >= -1); + tokens->emplace_back(kPropertyContext, tree[cur].property + 1); + if (tree[cur].property == -1) { + tokens->emplace_back(kPredictorContext, + static_cast(tree[cur].predictor)); + tokens->emplace_back(kOffsetContext, + PackSigned(tree[cur].predictor_offset)); + uint32_t mul_log = Num0BitsBelowLS1Bit_Nonzero(tree[cur].multiplier); + uint32_t mul_bits = (tree[cur].multiplier >> mul_log) - 1; + tokens->emplace_back(kMultiplierLogContext, mul_log); + tokens->emplace_back(kMultiplierBitsContext, mul_bits); + JXL_ASSERT(tree[cur].predictor < Predictor::Best); + decoder_tree->emplace_back(-1, 0, leaf_id++, 0, tree[cur].predictor, + tree[cur].predictor_offset, + tree[cur].multiplier); + continue; + } + decoder_tree->emplace_back(tree[cur].property, tree[cur].splitval, + decoder_tree->size() + q.size() + 1, + decoder_tree->size() + q.size() + 2, + Predictor::Zero, 0, 1); + q.push(tree[cur].lchild); + q.push(tree[cur].rchild); + tokens->emplace_back(kSplitValContext, PackSigned(tree[cur].splitval)); + } +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h new file mode 100644 index 0000000000..ede37c8023 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h @@ -0,0 +1,157 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_ENCODING_ENC_MA_H_ +#define LIB_JXL_MODULAR_ENCODING_ENC_MA_H_ + +#include + +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/modular/encoding/dec_ma.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +// Struct to collect all the data needed to build a tree. +struct TreeSamples { + bool HasSamples() const { + return !residuals.empty() && !residuals[0].empty(); + } + size_t NumDistinctSamples() const { return sample_counts.size(); } + size_t NumSamples() const { return num_samples; } + // Set the predictor to use. Must be called before adding any samples. + Status SetPredictor(Predictor predictor, + ModularOptions::TreeMode wp_tree_mode); + // Set the properties to use. Must be called before adding any samples. + Status SetProperties(const std::vector &properties, + ModularOptions::TreeMode wp_tree_mode); + + size_t Token(size_t pred, size_t i) const { return residuals[pred][i].tok; } + size_t NBits(size_t pred, size_t i) const { return residuals[pred][i].nbits; } + size_t Count(size_t i) const { return sample_counts[i]; } + size_t PredictorIndex(Predictor predictor) const { + const auto predictor_elem = + std::find(predictors.begin(), predictors.end(), predictor); + JXL_DASSERT(predictor_elem != predictors.end()); + return predictor_elem - predictors.begin(); + } + size_t PropertyIndex(size_t property) const { + const auto property_elem = + std::find(props_to_use.begin(), props_to_use.end(), property); + JXL_DASSERT(property_elem != props_to_use.end()); + return property_elem - props_to_use.begin(); + } + size_t NumPropertyValues(size_t property_index) const { + return compact_properties[property_index].size() + 1; + } + // Returns the *quantized* property value. + size_t Property(size_t property_index, size_t i) const { + return props[property_index][i]; + } + int UnquantizeProperty(size_t property_index, uint32_t quant) const { + JXL_ASSERT(quant < compact_properties[property_index].size()); + return compact_properties[property_index][quant]; + } + + Predictor PredictorFromIndex(size_t index) const { + JXL_DASSERT(index < predictors.size()); + return predictors[index]; + } + size_t PropertyFromIndex(size_t index) const { + JXL_DASSERT(index < props_to_use.size()); + return props_to_use[index]; + } + size_t NumPredictors() const { return predictors.size(); } + size_t NumProperties() const { return props_to_use.size(); } + + // Preallocate data for a given number of samples. MUST be called before + // adding any sample. + void PrepareForSamples(size_t num_samples); + // Add a sample. + void AddSample(pixel_type_w pixel, const Properties &properties, + const pixel_type_w *predictions); + // Pre-cluster property values. + void PreQuantizeProperties( + const StaticPropRange &range, + const std::vector &multiplier_info, + const std::vector &group_pixel_count, + const std::vector &channel_pixel_count, + std::vector &pixel_samples, + std::vector &diff_samples, size_t max_property_values); + + void AllSamplesDone() { dedup_table_ = std::vector(); } + + uint32_t QuantizeProperty(uint32_t prop, pixel_type v) const { + v = std::min(std::max(v, -kPropertyRange), kPropertyRange) + kPropertyRange; + return property_mapping[prop][v]; + } + + // Swaps samples in position a and b. Does nothing if a == b. + void Swap(size_t a, size_t b); + + // Cycles samples: a -> b -> c -> a. We assume a <= b <= c, so that we can + // just call Swap(a, b) if b==c. + void ThreeShuffle(size_t a, size_t b, size_t c); + + private: + // TODO(veluca): as the total number of properties and predictors are known + // before adding any samples, it might be better to interleave predictors, + // properties and counts in a single vector to improve locality. + // A first attempt at doing this actually results in much slower encoding, + // possibly because of the more complex addressing. + struct ResidualToken { + uint8_t tok; + uint8_t nbits; + }; + // Residual information: token and number of extra bits, per predictor. + std::vector> residuals; + // Number of occurrences of each sample. + std::vector sample_counts; + // Property values, quantized to at most 256 distinct values. + std::vector> props; + // Decompactification info for `props`. + std::vector> compact_properties; + // List of properties to use. + std::vector props_to_use; + // List of predictors to use. + std::vector predictors; + // Mapping property value -> quantized property value. + static constexpr int32_t kPropertyRange = 511; + std::vector> property_mapping; + // Number of samples seen. + size_t num_samples = 0; + // Table for deduplication. + static constexpr uint32_t kDedupEntryUnused{static_cast(-1)}; + std::vector dedup_table_; + + // Functions for sample deduplication. + bool IsSameSample(size_t a, size_t b) const; + size_t Hash1(size_t a) const; + size_t Hash2(size_t a) const; + void InitTable(size_t size); + // Returns true if `a` was already present in the table. + bool AddToTableAndMerge(size_t a); + void AddToTable(size_t a); +}; + +void TokenizeTree(const Tree &tree, std::vector *tokens, + Tree *decoder_tree); + +void CollectPixelSamples(const Image &image, const ModularOptions &options, + size_t group_id, + std::vector &group_pixel_count, + std::vector &channel_pixel_count, + std::vector &pixel_samples, + std::vector &diff_samples); + +void ComputeBestTree(TreeSamples &tree_samples, float threshold, + const std::vector &mul_info, + StaticPropRange static_prop_range, + float fast_decode_multiplier, Tree *tree); + +} // namespace jxl +#endif // LIB_JXL_MODULAR_ENCODING_ENC_MA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc new file mode 100644 index 0000000000..9d2c3e5cf9 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc @@ -0,0 +1,622 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/encoding/encoding.h" + +#include +#include + +#include + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/scope_guard.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +// Removes all nodes that use a static property (i.e. channel or group ID) from +// the tree and collapses each node on even levels with its two children to +// produce a flatter tree. Also computes whether the resulting tree requires +// using the weighted predictor. +FlatTree FilterTree(const Tree &global_tree, + std::array &static_props, + size_t *num_props, bool *use_wp, bool *wp_only, + bool *gradient_only) { + *num_props = 0; + bool has_wp = false; + bool has_non_wp = false; + *gradient_only = true; + const auto mark_property = [&](int32_t p) { + if (p == kWPProp) { + has_wp = true; + } else if (p >= kNumStaticProperties) { + has_non_wp = true; + } + if (p >= kNumStaticProperties && p != kGradientProp) { + *gradient_only = false; + } + }; + FlatTree output; + std::queue nodes; + nodes.push(0); + // Produces a trimmed and flattened tree by doing a BFS visit of the original + // tree, ignoring branches that are known to be false and proceeding two + // levels at a time to collapse nodes in a flatter tree; if an inner parent + // node has a leaf as a child, the leaf is duplicated and an implicit fake + // node is added. This allows to reduce the number of branches when traversing + // the resulting flat tree. + while (!nodes.empty()) { + size_t cur = nodes.front(); + nodes.pop(); + // Skip nodes that we can decide now, by jumping directly to their children. + while (global_tree[cur].property < kNumStaticProperties && + global_tree[cur].property != -1) { + if (static_props[global_tree[cur].property] > global_tree[cur].splitval) { + cur = global_tree[cur].lchild; + } else { + cur = global_tree[cur].rchild; + } + } + FlatDecisionNode flat; + if (global_tree[cur].property == -1) { + flat.property0 = -1; + flat.childID = global_tree[cur].lchild; + flat.predictor = global_tree[cur].predictor; + flat.predictor_offset = global_tree[cur].predictor_offset; + flat.multiplier = global_tree[cur].multiplier; + *gradient_only &= flat.predictor == Predictor::Gradient; + has_wp |= flat.predictor == Predictor::Weighted; + has_non_wp |= flat.predictor != Predictor::Weighted; + output.push_back(flat); + continue; + } + flat.childID = output.size() + nodes.size() + 1; + + flat.property0 = global_tree[cur].property; + *num_props = std::max(flat.property0 + 1, *num_props); + flat.splitval0 = global_tree[cur].splitval; + + for (size_t i = 0; i < 2; i++) { + size_t cur_child = + i == 0 ? global_tree[cur].lchild : global_tree[cur].rchild; + // Skip nodes that we can decide now. + while (global_tree[cur_child].property < kNumStaticProperties && + global_tree[cur_child].property != -1) { + if (static_props[global_tree[cur_child].property] > + global_tree[cur_child].splitval) { + cur_child = global_tree[cur_child].lchild; + } else { + cur_child = global_tree[cur_child].rchild; + } + } + // We ended up in a leaf, add a dummy decision and two copies of the leaf. + if (global_tree[cur_child].property == -1) { + flat.properties[i] = 0; + flat.splitvals[i] = 0; + nodes.push(cur_child); + nodes.push(cur_child); + } else { + flat.properties[i] = global_tree[cur_child].property; + flat.splitvals[i] = global_tree[cur_child].splitval; + nodes.push(global_tree[cur_child].lchild); + nodes.push(global_tree[cur_child].rchild); + *num_props = std::max(flat.properties[i] + 1, *num_props); + } + } + + for (size_t j = 0; j < 2; j++) mark_property(flat.properties[j]); + mark_property(flat.property0); + output.push_back(flat); + } + if (*num_props > kNumNonrefProperties) { + *num_props = + DivCeil(*num_props - kNumNonrefProperties, kExtraPropsPerChannel) * + kExtraPropsPerChannel + + kNumNonrefProperties; + } else { + *num_props = kNumNonrefProperties; + } + *use_wp = has_wp; + *wp_only = has_wp && !has_non_wp; + + return output; +} + +Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader, + const std::vector &context_map, + const Tree &global_tree, + const weighted::Header &wp_header, + pixel_type chan, size_t group_id, + Image *image) { + Channel &channel = image->channel[chan]; + + std::array static_props = { + {chan, (int)group_id}}; + // TODO(veluca): filter the tree according to static_props. + + // zero pixel channel? could happen + if (channel.w == 0 || channel.h == 0) return true; + + bool tree_has_wp_prop_or_pred = false; + bool is_wp_only = false; + bool is_gradient_only = false; + size_t num_props; + FlatTree tree = + FilterTree(global_tree, static_props, &num_props, + &tree_has_wp_prop_or_pred, &is_wp_only, &is_gradient_only); + + // From here on, tree lookup returns a *clustered* context ID. + // This avoids an extra memory lookup after tree traversal. + for (size_t i = 0; i < tree.size(); i++) { + if (tree[i].property0 == -1) { + tree[i].childID = context_map[tree[i].childID]; + } + } + + JXL_DEBUG_V(3, "Decoded MA tree with %" PRIuS " nodes", tree.size()); + + // MAANS decode + const auto make_pixel = [](uint64_t v, pixel_type multiplier, + pixel_type_w offset) -> pixel_type { + JXL_DASSERT((v & 0xFFFFFFFF) == v); + pixel_type_w val = UnpackSigned(v); + // if it overflows, it overflows, and we have a problem anyway + return val * multiplier + offset; + }; + + if (tree.size() == 1) { + // special optimized case: no meta-adaptation, so no need + // to compute properties. + Predictor predictor = tree[0].predictor; + int64_t offset = tree[0].predictor_offset; + int32_t multiplier = tree[0].multiplier; + size_t ctx_id = tree[0].childID; + if (predictor == Predictor::Zero) { + uint32_t value; + if (reader->IsSingleValueAndAdvance(ctx_id, &value, + channel.w * channel.h)) { + // Special-case: histogram has a single symbol, with no extra bits, and + // we use ANS mode. + JXL_DEBUG_V(8, "Fastest track."); + pixel_type v = make_pixel(value, multiplier, offset); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + std::fill(r, r + channel.w, v); + } + } else { + JXL_DEBUG_V(8, "Fast track."); + if (multiplier == 1 && offset == 0) { + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + uint32_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = UnpackSigned(v); + } + } + } else { + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + uint32_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = make_pixel(v, multiplier, offset); + } + } + } + } + } else if (predictor == Predictor::Gradient && offset == 0 && + multiplier == 1 && reader->HuffRleOnly()) { + JXL_DEBUG_V(8, "Gradient RLE (fjxl) very fast track."); + uint32_t run = 0; + uint32_t v = 0; + pixel_type_w sv = 0; + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + const pixel_type *JXL_RESTRICT rtop = (y ? channel.Row(y - 1) : r - 1); + const pixel_type *JXL_RESTRICT rtopleft = + (y ? channel.Row(y - 1) - 1 : r - 1); + pixel_type_w guess = (y ? rtop[0] : 0); + if (run == 0) { + reader->ReadHybridUintClusteredHuffRleOnly(ctx_id, br, &v, &run); + sv = UnpackSigned(v); + } else { + run--; + } + r[0] = sv + guess; + for (size_t x = 1; x < channel.w; x++) { + pixel_type left = r[x - 1]; + pixel_type top = rtop[x]; + pixel_type topleft = rtopleft[x]; + pixel_type_w guess = ClampedGradient(top, left, topleft); + if (!run) { + reader->ReadHybridUintClusteredHuffRleOnly(ctx_id, br, &v, &run); + sv = UnpackSigned(v); + } else { + run--; + } + r[x] = sv + guess; + } + } + } else if (predictor == Predictor::Gradient && offset == 0 && + multiplier == 1) { + JXL_DEBUG_V(8, "Gradient very fast track."); + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + pixel_type left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type top = (y ? *(r + x - onerow) : left); + pixel_type topleft = (x && y ? *(r + x - 1 - onerow) : left); + pixel_type guess = ClampedGradient(top, left, topleft); + uint64_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = make_pixel(v, 1, guess); + } + } + } else if (predictor != Predictor::Weighted) { + // special optimized case: no wp + JXL_DEBUG_V(8, "Quite fast track."); + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult pred = + PredictNoTreeNoWP(channel.w, r + x, onerow, x, y, predictor); + pixel_type_w g = pred.guess + offset; + uint64_t v = reader->ReadHybridUintClustered(ctx_id, br); + // NOTE: pred.multiplier is unset. + r[x] = make_pixel(v, multiplier, g); + } + } + } else { + JXL_DEBUG_V(8, "Somewhat fast track."); + const intptr_t onerow = channel.plane.PixelsPerRow(); + weighted::State wp_state(wp_header, channel.w, channel.h); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + pixel_type_w g = PredictNoTreeWP(channel.w, r + x, onerow, x, y, + predictor, &wp_state) + .guess + + offset; + uint64_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = make_pixel(v, multiplier, g); + wp_state.UpdateErrors(r[x], x, y, channel.w); + } + } + } + return true; + } + + // Check if this tree is a WP-only tree with a small enough property value + // range. + // Initialized to avoid clang-tidy complaining. + uint8_t context_lookup[2 * kPropRangeFast] = {}; + int8_t multipliers[2 * kPropRangeFast] = {}; + int8_t offsets[2 * kPropRangeFast] = {}; + if (is_wp_only) { + is_wp_only = TreeToLookupTable(tree, context_lookup, offsets, multipliers); + } + if (is_gradient_only) { + is_gradient_only = + TreeToLookupTable(tree, context_lookup, offsets, multipliers); + } + + if (is_gradient_only) { + JXL_DEBUG_V(8, "Gradient fast track."); + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + int32_t guess = ClampedGradient(top, left, topleft); + uint32_t pos = + kPropRangeFast + + std::min( + std::max(-kPropRangeFast, top + left - topleft), + kPropRangeFast - 1); + uint32_t ctx_id = context_lookup[pos]; + uint64_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = make_pixel(v, multipliers[pos], + static_cast(offsets[pos]) + guess); + } + } + } else if (is_wp_only) { + JXL_DEBUG_V(8, "WP fast track."); + const intptr_t onerow = channel.plane.PixelsPerRow(); + weighted::State wp_state(wp_header, channel.w, channel.h); + Properties properties(1); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + size_t offset = 0; + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + pixel_type_w topright = + (x + 1 < channel.w && y ? *(r + x + 1 - onerow) : top); + pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top); + int32_t guess = wp_state.Predict( + x, y, channel.w, top, left, topright, topleft, toptop, &properties, + offset); + uint32_t pos = + kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]), + kPropRangeFast - 1); + uint32_t ctx_id = context_lookup[pos]; + uint64_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = make_pixel(v, multipliers[pos], + static_cast(offsets[pos]) + guess); + wp_state.UpdateErrors(r[x], x, y, channel.w); + } + } + } else if (!tree_has_wp_prop_or_pred) { + // special optimized case: the weighted predictor and its properties are not + // used, so no need to compute weights and properties. + JXL_DEBUG_V(8, "Slow track."); + MATreeLookup tree_lookup(tree); + Properties properties = Properties(num_props); + const intptr_t onerow = channel.plane.PixelsPerRow(); + Channel references(properties.size() - kNumNonrefProperties, channel.w); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT p = channel.Row(y); + PrecomputeReferences(channel, y, *image, chan, &references); + InitPropsRow(&properties, static_props, y); + if (y > 1 && channel.w > 8 && references.w == 0) { + for (size_t x = 0; x < 2; x++) { + PredictionResult res = + PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references); + uint64_t v = reader->ReadHybridUintClustered(res.context, br); + p[x] = make_pixel(v, res.multiplier, res.guess); + } + for (size_t x = 2; x < channel.w - 2; x++) { + PredictionResult res = + PredictTreeNoWPNEC(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references); + uint64_t v = reader->ReadHybridUintClustered(res.context, br); + p[x] = make_pixel(v, res.multiplier, res.guess); + } + for (size_t x = channel.w - 2; x < channel.w; x++) { + PredictionResult res = + PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references); + uint64_t v = reader->ReadHybridUintClustered(res.context, br); + p[x] = make_pixel(v, res.multiplier, res.guess); + } + } else { + for (size_t x = 0; x < channel.w; x++) { + PredictionResult res = + PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references); + uint64_t v = reader->ReadHybridUintClustered(res.context, br); + p[x] = make_pixel(v, res.multiplier, res.guess); + } + } + } + } else { + JXL_DEBUG_V(8, "Slowest track."); + MATreeLookup tree_lookup(tree); + Properties properties = Properties(num_props); + const intptr_t onerow = channel.plane.PixelsPerRow(); + Channel references(properties.size() - kNumNonrefProperties, channel.w); + weighted::State wp_state(wp_header, channel.w, channel.h); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT p = channel.Row(y); + InitPropsRow(&properties, static_props, y); + PrecomputeReferences(channel, y, *image, chan, &references); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult res = + PredictTreeWP(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references, &wp_state); + uint64_t v = reader->ReadHybridUintClustered(res.context, br); + p[x] = make_pixel(v, res.multiplier, res.guess); + wp_state.UpdateErrors(p[x], x, y, channel.w); + } + } + } + return true; +} + +GroupHeader::GroupHeader() { Bundle::Init(this); } + +Status ValidateChannelDimensions(const Image &image, + const ModularOptions &options) { + size_t nb_channels = image.channel.size(); + for (bool is_dc : {true, false}) { + size_t group_dim = options.group_dim * (is_dc ? kBlockDim : 1); + size_t c = image.nb_meta_channels; + for (; c < nb_channels; c++) { + const Channel &ch = image.channel[c]; + if (ch.w > options.group_dim || ch.h > options.group_dim) break; + } + for (; c < nb_channels; c++) { + const Channel &ch = image.channel[c]; + if (ch.w == 0 || ch.h == 0) continue; // skip empty + bool is_dc_channel = std::min(ch.hshift, ch.vshift) >= 3; + if (is_dc_channel != is_dc) continue; + size_t tile_dim = group_dim >> std::max(ch.hshift, ch.vshift); + if (tile_dim == 0) { + return JXL_FAILURE("Inconsistent transforms"); + } + } + } + return true; +} + +Status ModularDecode(BitReader *br, Image &image, GroupHeader &header, + size_t group_id, ModularOptions *options, + const Tree *global_tree, const ANSCode *global_code, + const std::vector *global_ctx_map, + bool allow_truncated_group) { + if (image.channel.empty()) return true; + + // decode transforms + Status status = Bundle::Read(br, &header); + if (!allow_truncated_group) JXL_RETURN_IF_ERROR(status); + if (status.IsFatalError()) return status; + if (!br->AllReadsWithinBounds()) { + // Don't do/undo transforms if header is incomplete. + header.transforms.clear(); + image.transform = header.transforms; + for (size_t c = 0; c < image.channel.size(); c++) { + ZeroFillImage(&image.channel[c].plane); + } + return Status(StatusCode::kNotEnoughBytes); + } + + JXL_DEBUG_V(3, "Image data underwent %" PRIuS " transformations: ", + header.transforms.size()); + image.transform = header.transforms; + for (Transform &transform : image.transform) { + JXL_RETURN_IF_ERROR(transform.MetaApply(image)); + } + if (image.error) { + return JXL_FAILURE("Corrupt file. Aborting."); + } + JXL_RETURN_IF_ERROR(ValidateChannelDimensions(image, *options)); + + size_t nb_channels = image.channel.size(); + + size_t num_chans = 0; + size_t distance_multiplier = 0; + for (size_t i = 0; i < nb_channels; i++) { + Channel &channel = image.channel[i]; + if (!channel.w || !channel.h) { + continue; // skip empty channels + } + if (i >= image.nb_meta_channels && (channel.w > options->max_chan_size || + channel.h > options->max_chan_size)) { + break; + } + if (channel.w > distance_multiplier) { + distance_multiplier = channel.w; + } + num_chans++; + } + if (num_chans == 0) return true; + + size_t next_channel = 0; + auto scope_guard = MakeScopeGuard([&]() { + // Do not do anything if truncated groups are not allowed. + if (!allow_truncated_group) return; + for (size_t c = next_channel; c < nb_channels; c++) { + ZeroFillImage(&image.channel[c].plane); + } + }); + + // Read tree. + Tree tree_storage; + std::vector context_map_storage; + ANSCode code_storage; + const Tree *tree = &tree_storage; + const ANSCode *code = &code_storage; + const std::vector *context_map = &context_map_storage; + if (!header.use_global_tree) { + size_t max_tree_size = 1024; + for (size_t i = 0; i < nb_channels; i++) { + Channel &channel = image.channel[i]; + if (!channel.w || !channel.h) { + continue; // skip empty channels + } + if (i >= image.nb_meta_channels && (channel.w > options->max_chan_size || + channel.h > options->max_chan_size)) { + break; + } + size_t pixels = channel.w * channel.h; + if (pixels / channel.w != channel.h) { + return JXL_FAILURE("Tree size overflow"); + } + max_tree_size += pixels; + if (max_tree_size < pixels) return JXL_FAILURE("Tree size overflow"); + } + max_tree_size = std::min(static_cast(1 << 20), max_tree_size); + JXL_RETURN_IF_ERROR(DecodeTree(br, &tree_storage, max_tree_size)); + JXL_RETURN_IF_ERROR(DecodeHistograms(br, (tree_storage.size() + 1) / 2, + &code_storage, &context_map_storage)); + } else { + if (!global_tree || !global_code || !global_ctx_map || + global_tree->empty()) { + return JXL_FAILURE("No global tree available but one was requested"); + } + tree = global_tree; + code = global_code; + context_map = global_ctx_map; + } + + // Read channels + ANSSymbolReader reader(code, br, distance_multiplier); + for (; next_channel < nb_channels; next_channel++) { + Channel &channel = image.channel[next_channel]; + if (!channel.w || !channel.h) { + continue; // skip empty channels + } + if (next_channel >= image.nb_meta_channels && + (channel.w > options->max_chan_size || + channel.h > options->max_chan_size)) { + break; + } + JXL_RETURN_IF_ERROR(DecodeModularChannelMAANS( + br, &reader, *context_map, *tree, header.wp_header, next_channel, + group_id, &image)); + // Truncated group. + if (!br->AllReadsWithinBounds()) { + if (!allow_truncated_group) return JXL_FAILURE("Truncated input"); + return Status(StatusCode::kNotEnoughBytes); + } + } + + // Make sure no zero-filling happens even if next_channel < nb_channels. + scope_guard.Disarm(); + + if (!reader.CheckANSFinalState()) { + return JXL_FAILURE("ANS decode final state failed"); + } + return true; +} + +Status ModularGenericDecompress(BitReader *br, Image &image, + GroupHeader *header, size_t group_id, + ModularOptions *options, bool undo_transforms, + const Tree *tree, const ANSCode *code, + const std::vector *ctx_map, + bool allow_truncated_group) { +#ifdef JXL_ENABLE_ASSERT + std::vector> req_sizes(image.channel.size()); + for (size_t c = 0; c < req_sizes.size(); c++) { + req_sizes[c] = {image.channel[c].w, image.channel[c].h}; + } +#endif + GroupHeader local_header; + if (header == nullptr) header = &local_header; + size_t bit_pos = br->TotalBitsConsumed(); + auto dec_status = ModularDecode(br, image, *header, group_id, options, tree, + code, ctx_map, allow_truncated_group); + if (!allow_truncated_group) JXL_RETURN_IF_ERROR(dec_status); + if (dec_status.IsFatalError()) return dec_status; + if (undo_transforms) image.undo_transforms(header->wp_header); + if (image.error) return JXL_FAILURE("Corrupt file. Aborting."); + JXL_DEBUG_V(4, + "Modular-decoded a %" PRIuS "x%" PRIuS " nbchans=%" PRIuS + " image from %" PRIuS " bytes", + image.w, image.h, image.channel.size(), + (br->TotalBitsConsumed() - bit_pos) / 8); + JXL_DEBUG_V(5, "Modular image: %s", image.DebugString().c_str()); + (void)bit_pos; +#ifdef JXL_ENABLE_ASSERT + // Check that after applying all transforms we are back to the requested image + // sizes, otherwise there's a programming error with the transformations. + if (undo_transforms) { + JXL_ASSERT(image.channel.size() == req_sizes.size()); + for (size_t c = 0; c < req_sizes.size(); c++) { + JXL_ASSERT(req_sizes[c].first == image.channel[c].w); + JXL_ASSERT(req_sizes[c].second == image.channel[c].h); + } + } +#endif + return dec_status; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h new file mode 100644 index 0000000000..89697bce87 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h @@ -0,0 +1,135 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_ENCODING_ENCODING_H_ +#define LIB_JXL_MODULAR_ENCODING_ENCODING_H_ + +#include +#include + +#include + +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/image.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/encoding/dec_ma.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +// Valid range of properties for using lookup tables instead of trees. +constexpr int32_t kPropRangeFast = 512; + +struct GroupHeader : public Fields { + GroupHeader(); + + JXL_FIELDS_NAME(GroupHeader) + + Status VisitFields(Visitor *JXL_RESTRICT visitor) override { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &use_global_tree)); + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&wp_header)); + uint32_t num_transforms = static_cast(transforms.size()); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(4, 2), + BitsOffset(8, 18), 0, + &num_transforms)); + if (visitor->IsReading()) transforms.resize(num_transforms); + for (size_t i = 0; i < num_transforms; i++) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&transforms[i])); + } + return true; + } + + bool use_global_tree; + weighted::Header wp_header; + + std::vector transforms; +}; + +FlatTree FilterTree(const Tree &global_tree, + std::array &static_props, + size_t *num_props, bool *use_wp, bool *wp_only, + bool *gradient_only); + +template +bool TreeToLookupTable(const FlatTree &tree, + T context_lookup[2 * kPropRangeFast], + int8_t offsets[2 * kPropRangeFast], + int8_t multipliers[2 * kPropRangeFast] = nullptr) { + struct TreeRange { + // Begin *excluded*, end *included*. This works best with > vs <= decision + // nodes. + int begin, end; + size_t pos; + }; + std::vector ranges; + ranges.push_back(TreeRange{-kPropRangeFast - 1, kPropRangeFast - 1, 0}); + while (!ranges.empty()) { + TreeRange cur = ranges.back(); + ranges.pop_back(); + if (cur.begin < -kPropRangeFast - 1 || cur.begin >= kPropRangeFast - 1 || + cur.end > kPropRangeFast - 1) { + // Tree is outside the allowed range, exit. + return false; + } + auto &node = tree[cur.pos]; + // Leaf. + if (node.property0 == -1) { + if (node.predictor_offset < std::numeric_limits::min() || + node.predictor_offset > std::numeric_limits::max()) { + return false; + } + if (node.multiplier < std::numeric_limits::min() || + node.multiplier > std::numeric_limits::max()) { + return false; + } + if (multipliers == nullptr && node.multiplier != 1) { + return false; + } + for (int i = cur.begin + 1; i < cur.end + 1; i++) { + context_lookup[i + kPropRangeFast] = node.childID; + if (multipliers) multipliers[i + kPropRangeFast] = node.multiplier; + offsets[i + kPropRangeFast] = node.predictor_offset; + } + continue; + } + // > side of top node. + if (node.properties[0] >= kNumStaticProperties) { + ranges.push_back(TreeRange({node.splitvals[0], cur.end, node.childID})); + ranges.push_back( + TreeRange({node.splitval0, node.splitvals[0], node.childID + 1})); + } else { + ranges.push_back(TreeRange({node.splitval0, cur.end, node.childID})); + } + // <= side + if (node.properties[1] >= kNumStaticProperties) { + ranges.push_back( + TreeRange({node.splitvals[1], node.splitval0, node.childID + 2})); + ranges.push_back( + TreeRange({cur.begin, node.splitvals[1], node.childID + 3})); + } else { + ranges.push_back( + TreeRange({cur.begin, node.splitval0, node.childID + 2})); + } + } + return true; +} +// TODO(veluca): make cleaner interfaces. + +Status ValidateChannelDimensions(const Image &image, + const ModularOptions &options); + +Status ModularGenericDecompress(BitReader *br, Image &image, + GroupHeader *header, size_t group_id, + ModularOptions *options, + bool undo_transforms = true, + const Tree *tree = nullptr, + const ANSCode *code = nullptr, + const std::vector *ctx_map = nullptr, + bool allow_truncated_group = false); +} // namespace jxl + +#endif // LIB_JXL_MODULAR_ENCODING_ENCODING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h new file mode 100644 index 0000000000..71b7847321 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h @@ -0,0 +1,28 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_ +#define LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_ + +#include + +namespace jxl { + +enum MATreeContext : size_t { + kSplitValContext = 0, + kPropertyContext = 1, + kPredictorContext = 2, + kOffsetContext = 3, + kMultiplierLogContext = 4, + kMultiplierBitsContext = 5, + + kNumTreeContexts = 6, +}; + +static constexpr size_t kMaxTreeSize = 1 << 22; + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc b/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc new file mode 100644 index 0000000000..785d0c5443 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc @@ -0,0 +1,77 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/modular_image.h" + +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +void Image::undo_transforms(const weighted::Header &wp_header, + jxl::ThreadPool *pool) { + while (!transform.empty()) { + Transform t = transform.back(); + JXL_DEBUG_V(4, "Undoing transform"); + Status result = t.Inverse(*this, wp_header, pool); + if (result == false) { + JXL_NOTIFY_ERROR("Error while undoing transform."); + error = true; + return; + } + JXL_DEBUG_V(8, "Undoing transform: done"); + transform.pop_back(); + } +} + +Image::Image(size_t iw, size_t ih, int bitdepth, int nb_chans) + : w(iw), h(ih), bitdepth(bitdepth), nb_meta_channels(0), error(false) { + for (int i = 0; i < nb_chans; i++) channel.emplace_back(Channel(iw, ih)); +} + +Image::Image() : w(0), h(0), bitdepth(8), nb_meta_channels(0), error(true) {} + +Image &Image::operator=(Image &&other) noexcept { + w = other.w; + h = other.h; + bitdepth = other.bitdepth; + nb_meta_channels = other.nb_meta_channels; + error = other.error; + channel = std::move(other.channel); + transform = std::move(other.transform); + return *this; +} + +Image Image::clone() { + Image c(w, h, bitdepth, 0); + c.nb_meta_channels = nb_meta_channels; + c.error = error; + c.transform = transform; + for (Channel &ch : channel) { + Channel a(ch.w, ch.h, ch.hshift, ch.vshift); + CopyImageTo(ch.plane, &a.plane); + c.channel.push_back(std::move(a)); + } + return c; +} + +std::string Image::DebugString() const { + std::ostringstream os; + os << w << "x" << h << ", depth: " << bitdepth; + if (!channel.empty()) { + os << ", channels:"; + for (size_t i = 0; i < channel.size(); ++i) { + os << " " << channel[i].w << "x" << channel[i].h + << "(shift: " << channel[i].hshift << "," << channel[i].vshift << ")"; + if (i < nb_meta_channels) os << "*"; + } + } + return os.str(); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/modular_image.h b/third_party/jpeg-xl/lib/jxl/modular/modular_image.h new file mode 100644 index 0000000000..3e9b5a8a08 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/modular_image.h @@ -0,0 +1,118 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_MODULAR_IMAGE_H_ +#define LIB_JXL_MODULAR_MODULAR_IMAGE_H_ + +#include +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +typedef int32_t pixel_type; // can use int16_t if it's only for 8-bit images. + // Need some wiggle room for YCoCg / Squeeze etc + +typedef int64_t pixel_type_w; + +namespace weighted { +struct Header; +} + +class Channel { + public: + jxl::Plane plane; + size_t w, h; + int hshift, vshift; // w ~= image.w >> hshift; h ~= image.h >> vshift + Channel(size_t iw, size_t ih, int hsh = 0, int vsh = 0) + : plane(iw, ih), w(iw), h(ih), hshift(hsh), vshift(vsh) {} + + Channel(const Channel& other) = delete; + Channel& operator=(const Channel& other) = delete; + + // Move assignment + Channel& operator=(Channel&& other) noexcept { + w = other.w; + h = other.h; + hshift = other.hshift; + vshift = other.vshift; + plane = std::move(other.plane); + return *this; + } + + // Move constructor + Channel(Channel&& other) noexcept = default; + + void shrink() { + if (plane.xsize() == w && plane.ysize() == h) return; + jxl::Plane resizedplane(w, h); + plane = std::move(resizedplane); + } + void shrink(int nw, int nh) { + w = nw; + h = nh; + shrink(); + } + + JXL_INLINE pixel_type* Row(const size_t y) { return plane.Row(y); } + JXL_INLINE const pixel_type* Row(const size_t y) const { + return plane.Row(y); + } +}; + +class Transform; + +class Image { + public: + // image data, transforms can dramatically change the number of channels and + // their semantics + std::vector channel; + // transforms that have been applied (and that have to be undone) + std::vector transform; + + // image dimensions (channels may have different dimensions due to transforms) + size_t w, h; + int bitdepth; + size_t nb_meta_channels; // first few channels might contain palette(s) + bool error; // true if a fatal error occurred, false otherwise + + Image(size_t iw, size_t ih, int bitdepth, int nb_chans); + Image(); + + Image(const Image& other) = delete; + Image& operator=(const Image& other) = delete; + + Image& operator=(Image&& other) noexcept; + Image(Image&& other) noexcept = default; + + bool empty() const { + for (const auto& ch : channel) { + if (ch.w && ch.h) return false; + } + return true; + } + + Image clone(); + + void undo_transforms(const weighted::Header& wp_header, + jxl::ThreadPool* pool = nullptr); + + std::string DebugString() const; +}; + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_MODULAR_IMAGE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/options.h b/third_party/jpeg-xl/lib/jxl/modular/options.h new file mode 100644 index 0000000000..ce6596b912 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/options.h @@ -0,0 +1,117 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_OPTIONS_H_ +#define LIB_JXL_MODULAR_OPTIONS_H_ + +#include + +#include +#include + +namespace jxl { + +using PropertyVal = int32_t; +using Properties = std::vector; + +enum class Predictor : uint32_t { + Zero = 0, + Left = 1, + Top = 2, + Average0 = 3, + Select = 4, + Gradient = 5, + Weighted = 6, + TopRight = 7, + TopLeft = 8, + LeftLeft = 9, + Average1 = 10, + Average2 = 11, + Average3 = 12, + Average4 = 13, + // The following predictors are encoder-only. + Best = 14, // Best of Gradient and Weighted + Variable = + 15, // Find the best decision tree for predictors/predictor per row +}; + +constexpr size_t kNumModularPredictors = + static_cast(Predictor::Average4) + 1; +constexpr size_t kNumModularEncoderPredictors = + static_cast(Predictor::Variable) + 1; + +static constexpr ssize_t kNumStaticProperties = 2; // channel, group_id. + +using StaticPropRange = + std::array, kNumStaticProperties>; + +struct ModularMultiplierInfo { + StaticPropRange range; + uint32_t multiplier; +}; + +struct ModularOptions { + /// Used in both encode and decode: + + // Stop encoding/decoding when reaching a (non-meta) channel that has a + // dimension bigger than max_chan_size. + size_t max_chan_size = 0xFFFFFF; + + // Used during decoding for validation of transforms (sqeeezing) scheme. + size_t group_dim = 0x1FFFFFFF; + + /// Encode options: + // Fraction of pixels to look at to learn a MA tree + // Number of iterations to do to learn a MA tree + // (if zero there is no MA context model) + float nb_repeats = .5f; + + // Maximum number of (previous channel) properties to use in the MA trees + int max_properties = 0; // no previous channels + + // Alternative heuristic tweaks. + // Properties default to channel, group, weighted, gradient residual, W-NW, + // NW-N, N-NE, N-NN + std::vector splitting_heuristics_properties = {0, 1, 15, 9, + 10, 11, 12, 13}; + float splitting_heuristics_node_threshold = 96; + size_t max_property_values = 32; + + // Predictor to use for each channel. + Predictor predictor = static_cast(-1); + + int wp_mode = 0; + + float fast_decode_multiplier = 1.01f; + + // Forces the encoder to produce a tree that is compatible with the WP-only + // decode path (or with the no-wp path, or the gradient-only path). + enum class TreeMode { kGradientOnly, kWPOnly, kNoWP, kDefault }; + TreeMode wp_tree_mode = TreeMode::kDefault; + + // Skip fast paths in the encoder. + bool skip_encoder_fast_path = false; + + // Kind of tree to use. + // TODO(veluca): add tree kinds for JPEG recompression with CfL enabled, + // general AC metadata, different DC qualities, and others. + enum class TreeKind { + kTrivialTreeNoPredictor, + kLearn, + kJpegTranscodeACMeta, + kFalconACMeta, + kACMeta, + kWPFixedDC, + kGradientFixedDC, + }; + TreeKind tree_kind = TreeKind::kLearn; + + // Ignore the image and just pretend all tokens are zeroes + bool zero_tokens = false; +}; + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_OPTIONS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc new file mode 100644 index 0000000000..bc31445bc5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc @@ -0,0 +1,606 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/enc_palette.h" + +#include +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/enc_transform.h" +#include "lib/jxl/modular/transform/palette.h" + +namespace jxl { + +namespace palette_internal { + +static constexpr bool kEncodeToHighQualityImplicitPalette = true; + +// Inclusive. +static constexpr int kMinImplicitPaletteIndex = -(2 * 72 - 1); + +float ColorDistance(const std::vector &JXL_RESTRICT a, + const std::vector &JXL_RESTRICT b) { + JXL_ASSERT(a.size() == b.size()); + float distance = 0; + float ave3 = 0; + if (a.size() >= 3) { + ave3 = (a[0] + b[0] + a[1] + b[1] + a[2] + b[2]) * (1.21f / 3.0f); + } + float sum_a = 0, sum_b = 0; + for (size_t c = 0; c < a.size(); ++c) { + const float difference = + static_cast(a[c]) - static_cast(b[c]); + float weight = c == 0 ? 3 : c == 1 ? 5 : 2; + if (c < 3 && (a[c] + b[c] >= ave3)) { + const float add_w[3] = { + 1.15, + 1.15, + 1.12, + }; + weight += add_w[c]; + if (c == 2 && ((a[2] + b[2]) < 1.22 * ave3)) { + weight -= 0.5; + } + } + distance += difference * difference * weight * weight; + const int sum_weight = c == 0 ? 3 : c == 1 ? 5 : 1; + sum_a += a[c] * sum_weight; + sum_b += b[c] * sum_weight; + } + distance *= 4; + float sum_difference = sum_a - sum_b; + distance += sum_difference * sum_difference; + return distance; +} + +static int QuantizeColorToImplicitPaletteIndex( + const std::vector &color, const int palette_size, + const int bit_depth, bool high_quality) { + int index = 0; + if (high_quality) { + int multiplier = 1; + for (size_t c = 0; c < color.size(); c++) { + int quantized = ((kLargeCube - 1) * color[c] + (1 << (bit_depth - 1))) / + ((1 << bit_depth) - 1); + JXL_ASSERT((quantized % kLargeCube) == quantized); + index += quantized * multiplier; + multiplier *= kLargeCube; + } + return index + palette_size + kLargeCubeOffset; + } else { + int multiplier = 1; + for (size_t c = 0; c < color.size(); c++) { + int value = color[c]; + value -= 1 << (std::max(0, bit_depth - 3)); + value = std::max(0, value); + int quantized = ((kLargeCube - 1) * value + (1 << (bit_depth - 1))) / + ((1 << bit_depth) - 1); + JXL_ASSERT((quantized % kLargeCube) == quantized); + if (quantized > kSmallCube - 1) { + quantized = kSmallCube - 1; + } + index += quantized * multiplier; + multiplier *= kSmallCube; + } + return index + palette_size; + } +} + +} // namespace palette_internal + +int RoundInt(int value, int div) { // symmetric rounding around 0 + if (value < 0) return -RoundInt(-value, div); + return (value + div / 2) / div; +} + +struct PaletteIterationData { + static constexpr int kMaxDeltas = 128; + bool final_run = false; + std::vector deltas[3]; + std::vector delta_distances; + std::vector frequent_deltas[3]; + + // Populates `frequent_deltas` with items from `deltas` based on frequencies + // and color distances. + void FindFrequentColorDeltas(int num_pixels, int bitdepth) { + using pixel_type_3d = std::array; + std::map delta_frequency_map; + pixel_type bucket_size = 3 << std::max(0, bitdepth - 8); + // Store frequency weighted by delta distance from quantized value. + for (size_t i = 0; i < deltas[0].size(); ++i) { + pixel_type_3d delta = { + {RoundInt(deltas[0][i], bucket_size), + RoundInt(deltas[1][i], bucket_size), + RoundInt(deltas[2][i], bucket_size)}}; // a basic form of clustering + if (delta[0] == 0 && delta[1] == 0 && delta[2] == 0) continue; + delta_frequency_map[delta] += sqrt(sqrt(delta_distances[i])); + } + + const float delta_distance_multiplier = 1.0f / num_pixels; + + // Weigh frequencies by magnitude and normalize. + for (auto &delta_frequency : delta_frequency_map) { + std::vector current_delta = {delta_frequency.first[0], + delta_frequency.first[1], + delta_frequency.first[2]}; + float delta_distance = + sqrt(palette_internal::ColorDistance({0, 0, 0}, current_delta)) + 1; + delta_frequency.second *= delta_distance * delta_distance_multiplier; + } + + // Sort by weighted frequency. + using pixel_type_3d_frequency = std::pair; + std::vector sorted_delta_frequency_map( + delta_frequency_map.begin(), delta_frequency_map.end()); + std::sort( + sorted_delta_frequency_map.begin(), sorted_delta_frequency_map.end(), + [](const pixel_type_3d_frequency &a, const pixel_type_3d_frequency &b) { + return a.second > b.second; + }); + + // Store the top deltas. + for (auto &delta_frequency : sorted_delta_frequency_map) { + if (frequent_deltas[0].size() >= kMaxDeltas) break; + // Number obtained by optimizing on jyrki31 corpus: + if (delta_frequency.second < 17) break; + for (int c = 0; c < 3; ++c) { + frequent_deltas[c].push_back(delta_frequency.first[c] * bucket_size); + } + } + } +}; + +Status FwdPaletteIteration(Image &input, uint32_t begin_c, uint32_t end_c, + uint32_t &nb_colors, uint32_t &nb_deltas, + bool ordered, bool lossy, Predictor &predictor, + const weighted::Header &wp_header, + PaletteIterationData &palette_iteration_data) { + JXL_QUIET_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, end_c)); + JXL_ASSERT(begin_c >= input.nb_meta_channels); + uint32_t nb = end_c - begin_c + 1; + + size_t w = input.channel[begin_c].w; + size_t h = input.channel[begin_c].h; + + if (!lossy && nb == 1) { + // Channel palette special case + if (nb_colors == 0) return false; + std::vector lookup; + pixel_type minval, maxval; + compute_minmax(input.channel[begin_c], &minval, &maxval); + size_t lookup_table_size = + static_cast(maxval) - static_cast(minval) + 1; + if (lookup_table_size > palette_internal::kMaxPaletteLookupTableSize) { + // a lookup table would use too much memory, instead use a slower approach + // with std::set + std::set chpalette; + pixel_type idx = 0; + for (size_t y = 0; y < h; y++) { + const pixel_type *p = input.channel[begin_c].Row(y); + for (size_t x = 0; x < w; x++) { + const bool new_color = chpalette.insert(p[x]).second; + if (new_color) { + idx++; + if (idx > (int)nb_colors) return false; + } + } + } + JXL_DEBUG_V(6, "Channel %i uses only %i colors.", begin_c, idx); + Channel pch(idx, 1); + pch.hshift = -1; + pch.vshift = -1; + nb_colors = idx; + idx = 0; + pixel_type *JXL_RESTRICT p_palette = pch.Row(0); + for (pixel_type p : chpalette) { + p_palette[idx++] = p; + } + for (size_t y = 0; y < h; y++) { + pixel_type *p = input.channel[begin_c].Row(y); + for (size_t x = 0; x < w; x++) { + for (idx = 0; p[x] != p_palette[idx] && idx < (int)nb_colors; idx++) { + } + JXL_DASSERT(idx < (int)nb_colors); + p[x] = idx; + } + } + predictor = Predictor::Zero; + input.nb_meta_channels++; + input.channel.insert(input.channel.begin(), std::move(pch)); + + return true; + } + lookup.resize(lookup_table_size, 0); + pixel_type idx = 0; + for (size_t y = 0; y < h; y++) { + const pixel_type *p = input.channel[begin_c].Row(y); + for (size_t x = 0; x < w; x++) { + if (lookup[p[x] - minval] == 0) { + lookup[p[x] - minval] = 1; + idx++; + if (idx > (int)nb_colors) return false; + } + } + } + JXL_DEBUG_V(6, "Channel %i uses only %i colors.", begin_c, idx); + Channel pch(idx, 1); + pch.hshift = -1; + pch.vshift = -1; + nb_colors = idx; + idx = 0; + pixel_type *JXL_RESTRICT p_palette = pch.Row(0); + for (size_t i = 0; i < lookup_table_size; i++) { + if (lookup[i]) { + p_palette[idx] = i + minval; + lookup[i] = idx; + idx++; + } + } + for (size_t y = 0; y < h; y++) { + pixel_type *p = input.channel[begin_c].Row(y); + for (size_t x = 0; x < w; x++) p[x] = lookup[p[x] - minval]; + } + predictor = Predictor::Zero; + input.nb_meta_channels++; + input.channel.insert(input.channel.begin(), std::move(pch)); + return true; + } + + Image quantized_input; + if (lossy) { + quantized_input = Image(w, h, input.bitdepth, nb); + for (size_t c = 0; c < nb; c++) { + CopyImageTo(input.channel[begin_c + c].plane, + &quantized_input.channel[c].plane); + } + } + + JXL_DEBUG_V( + 7, "Trying to represent channels %i-%i using at most a %i-color palette.", + begin_c, end_c, nb_colors); + nb_deltas = 0; + bool delta_used = false; + std::set> + candidate_palette; // ordered lexicographically + std::vector> candidate_palette_imageorder; + std::vector color(nb); + std::vector color_with_error(nb); + std::vector p_in(nb); + + if (lossy) { + palette_iteration_data.FindFrequentColorDeltas(w * h, input.bitdepth); + nb_deltas = palette_iteration_data.frequent_deltas[0].size(); + + // Count color frequency for colors that make a cross. + std::map, size_t> color_freq_map; + for (size_t y = 1; y + 1 < h; y++) { + for (uint32_t c = 0; c < nb; c++) { + p_in[c] = input.channel[begin_c + c].Row(y); + } + for (size_t x = 1; x + 1 < w; x++) { + for (uint32_t c = 0; c < nb; c++) { + color[c] = p_in[c][x]; + } + int offsets[4][2] = {{1, 0}, {-1, 0}, {0, 1}, {0, -1}}; + bool makes_cross = true; + for (int i = 0; i < 4 && makes_cross; ++i) { + int dx = offsets[i][0]; + int dy = offsets[i][1]; + for (uint32_t c = 0; c < nb && makes_cross; c++) { + if (input.channel[begin_c + c].Row(y + dy)[x + dx] != color[c]) { + makes_cross = false; + } + } + } + if (makes_cross) color_freq_map[color] += 1; + } + } + // Add colors satisfying frequency condition to the palette. + constexpr float kImageFraction = 0.01f; + size_t color_frequency_lower_bound = 5 + input.h * input.w * kImageFraction; + for (const auto &color_freq : color_freq_map) { + if (color_freq.second > color_frequency_lower_bound) { + candidate_palette.insert(color_freq.first); + candidate_palette_imageorder.push_back(color_freq.first); + } + } + } + + for (size_t y = 0; y < h; y++) { + for (uint32_t c = 0; c < nb; c++) { + p_in[c] = input.channel[begin_c + c].Row(y); + } + for (size_t x = 0; x < w; x++) { + if (lossy && candidate_palette.size() >= nb_colors) break; + for (uint32_t c = 0; c < nb; c++) { + color[c] = p_in[c][x]; + } + const bool new_color = candidate_palette.insert(color).second; + if (new_color) { + candidate_palette_imageorder.push_back(color); + } + if (candidate_palette.size() > nb_colors) { + return false; // too many colors + } + } + } + + nb_colors = nb_deltas + candidate_palette.size(); + JXL_DEBUG_V(6, "Channels %i-%i can be represented using a %i-color palette.", + begin_c, end_c, nb_colors); + + Channel pch(nb_colors, nb); + pch.hshift = -1; + pch.vshift = -1; + pixel_type *JXL_RESTRICT p_palette = pch.Row(0); + intptr_t onerow = pch.plane.PixelsPerRow(); + intptr_t onerow_image = input.channel[begin_c].plane.PixelsPerRow(); + const int bit_depth = std::min(input.bitdepth, 24); + + if (lossy) { + for (uint32_t i = 0; i < nb_deltas; i++) { + for (size_t c = 0; c < 3; c++) { + p_palette[c * onerow + i] = + palette_iteration_data.frequent_deltas[c][i]; + } + } + } + + int x = 0; + if (ordered) { + JXL_DEBUG_V(7, "Palette of %i colors, using lexicographic order", + nb_colors); + for (auto pcol : candidate_palette) { + JXL_DEBUG_V(9, " Color %i : ", x); + for (size_t i = 0; i < nb; i++) { + p_palette[nb_deltas + i * onerow + x] = pcol[i]; + } + for (size_t i = 0; i < nb; i++) { + JXL_DEBUG_V(9, "%i ", pcol[i]); + } + x++; + } + } else { + JXL_DEBUG_V(7, "Palette of %i colors, using image order", nb_colors); + for (auto pcol : candidate_palette_imageorder) { + JXL_DEBUG_V(9, " Color %i : ", x); + for (size_t i = 0; i < nb; i++) + p_palette[nb_deltas + i * onerow + x] = pcol[i]; + for (size_t i = 0; i < nb; i++) JXL_DEBUG_V(9, "%i ", pcol[i]); + x++; + } + } + std::vector wp_states; + for (size_t c = 0; c < nb; c++) { + wp_states.emplace_back(wp_header, w, h); + } + std::vector p_quant(nb); + // Three rows of error for dithering: y to y + 2. + // Each row has two pixels of padding in the ends, which is + // beneficial for both precision and encoding speed. + std::vector> error_row[3]; + if (lossy) { + for (int i = 0; i < 3; ++i) { + error_row[i].resize(nb); + for (size_t c = 0; c < nb; ++c) { + error_row[i][c].resize(w + 4); + } + } + } + for (size_t y = 0; y < h; y++) { + for (size_t c = 0; c < nb; c++) { + p_in[c] = input.channel[begin_c + c].Row(y); + if (lossy) p_quant[c] = quantized_input.channel[c].Row(y); + } + pixel_type *JXL_RESTRICT p = input.channel[begin_c].Row(y); + for (size_t x = 0; x < w; x++) { + int index; + if (!lossy) { + for (size_t c = 0; c < nb; c++) color[c] = p_in[c][x]; + // Exact search. + for (index = 0; static_cast(index) < nb_colors; index++) { + bool found = true; + for (size_t c = 0; c < nb; c++) { + if (color[c] != p_palette[c * onerow + index]) { + found = false; + break; + } + } + if (found) break; + } + if (index < static_cast(nb_deltas)) { + delta_used = true; + } + } else { + int best_index = 0; + bool best_is_delta = false; + float best_distance = std::numeric_limits::infinity(); + std::vector best_val(nb, 0); + std::vector ideal_residual(nb, 0); + std::vector quantized_val(nb); + std::vector predictions(nb); + static const double kDiffusionMultiplier[] = {0.55, 0.75}; + for (int diffusion_index = 0; diffusion_index < 2; ++diffusion_index) { + for (size_t c = 0; c < nb; c++) { + color_with_error[c] = + p_in[c][x] + palette_iteration_data.final_run * + kDiffusionMultiplier[diffusion_index] * + error_row[0][c][x + 2]; + color[c] = Clamp1(lroundf(color_with_error[c]), 0l, + (1l << input.bitdepth) - 1); + } + + for (size_t c = 0; c < nb; ++c) { + predictions[c] = PredictNoTreeWP(w, p_quant[c] + x, onerow_image, x, + y, predictor, &wp_states[c]) + .guess; + } + const auto TryIndex = [&](const int index) { + for (size_t c = 0; c < nb; c++) { + quantized_val[c] = palette_internal::GetPaletteValue( + p_palette, index, /*c=*/c, + /*palette_size=*/nb_colors, + /*onerow=*/onerow, /*bit_depth=*/bit_depth); + if (index < static_cast(nb_deltas)) { + quantized_val[c] += predictions[c]; + } + } + const float color_distance = + 32.0 / (1LL << std::max(0, 2 * (bit_depth - 8))) * + palette_internal::ColorDistance(color_with_error, + quantized_val); + float index_penalty = 0; + if (index == -1) { + index_penalty = -124; + } else if (index < 0) { + index_penalty = -2 * index; + } else if (index < static_cast(nb_deltas)) { + index_penalty = 250; + } else if (index < static_cast(nb_colors)) { + index_penalty = 150; + } else if (index < static_cast(nb_colors) + + palette_internal::kLargeCubeOffset) { + index_penalty = 70; + } else { + index_penalty = 256; + } + const float distance = color_distance + index_penalty; + if (distance < best_distance) { + best_distance = distance; + best_index = index; + best_is_delta = index < static_cast(nb_deltas); + best_val.swap(quantized_val); + for (size_t c = 0; c < nb; ++c) { + ideal_residual[c] = color_with_error[c] - predictions[c]; + } + } + }; + for (index = palette_internal::kMinImplicitPaletteIndex; + index < static_cast(nb_colors); index++) { + TryIndex(index); + } + TryIndex(palette_internal::QuantizeColorToImplicitPaletteIndex( + color, nb_colors, bit_depth, + /*high_quality=*/false)); + if (palette_internal::kEncodeToHighQualityImplicitPalette) { + TryIndex(palette_internal::QuantizeColorToImplicitPaletteIndex( + color, nb_colors, bit_depth, + /*high_quality=*/true)); + } + } + index = best_index; + delta_used |= best_is_delta; + if (!palette_iteration_data.final_run) { + for (size_t c = 0; c < 3; ++c) { + palette_iteration_data.deltas[c].push_back(ideal_residual[c]); + } + palette_iteration_data.delta_distances.push_back(best_distance); + } + + for (size_t c = 0; c < nb; ++c) { + wp_states[c].UpdateErrors(best_val[c], x, y, w); + p_quant[c][x] = best_val[c]; + } + float len_error = 0; + for (size_t c = 0; c < nb; ++c) { + float local_error = color_with_error[c] - best_val[c]; + len_error += local_error * local_error; + } + len_error = sqrt(len_error); + float modulate = 1.0; + int len_limit = 38 << std::max(0, bit_depth - 8); + if (len_error > len_limit) { + modulate *= len_limit / len_error; + } + for (size_t c = 0; c < nb; ++c) { + float total_error = (color_with_error[c] - best_val[c]); + + // If the neighboring pixels have some error in the opposite + // direction of total_error, cancel some or all of it out before + // spreading among them. + constexpr int offsets[12][2] = {{1, 2}, {0, 3}, {0, 4}, {1, 1}, + {1, 3}, {2, 2}, {1, 0}, {1, 4}, + {2, 1}, {2, 3}, {2, 0}, {2, 4}}; + float total_available = 0; + for (int i = 0; i < 11; ++i) { + const int row = offsets[i][0]; + const int col = offsets[i][1]; + if (std::signbit(error_row[row][c][x + col]) != + std::signbit(total_error)) { + total_available += error_row[row][c][x + col]; + } + } + float weight = + std::abs(total_error) / (std::abs(total_available) + 1e-3); + weight = std::min(weight, 1.0f); + for (int i = 0; i < 11; ++i) { + const int row = offsets[i][0]; + const int col = offsets[i][1]; + if (std::signbit(error_row[row][c][x + col]) != + std::signbit(total_error)) { + total_error += weight * error_row[row][c][x + col]; + error_row[row][c][x + col] *= (1 - weight); + } + } + total_error *= modulate; + const float remaining_error = (1.0f / 14.) * total_error; + error_row[0][c][x + 3] += 2 * remaining_error; + error_row[0][c][x + 4] += remaining_error; + error_row[1][c][x + 0] += remaining_error; + for (int i = 0; i < 5; ++i) { + error_row[1][c][x + i] += remaining_error; + error_row[2][c][x + i] += remaining_error; + } + } + } + if (palette_iteration_data.final_run) p[x] = index; + } + if (lossy) { + for (size_t c = 0; c < nb; ++c) { + error_row[0][c].swap(error_row[1][c]); + error_row[1][c].swap(error_row[2][c]); + std::fill(error_row[2][c].begin(), error_row[2][c].end(), 0.f); + } + } + } + if (!delta_used) { + predictor = Predictor::Zero; + } + if (palette_iteration_data.final_run) { + input.nb_meta_channels++; + input.channel.erase(input.channel.begin() + begin_c + 1, + input.channel.begin() + end_c + 1); + input.channel.insert(input.channel.begin(), std::move(pch)); + } + nb_colors -= nb_deltas; + return true; +} + +Status FwdPalette(Image &input, uint32_t begin_c, uint32_t end_c, + uint32_t &nb_colors, uint32_t &nb_deltas, bool ordered, + bool lossy, Predictor &predictor, + const weighted::Header &wp_header) { + PaletteIterationData palette_iteration_data; + uint32_t nb_colors_orig = nb_colors; + uint32_t nb_deltas_orig = nb_deltas; + // preprocessing pass in case of lossy palette + if (lossy && input.bitdepth >= 8) { + JXL_RETURN_IF_ERROR(FwdPaletteIteration( + input, begin_c, end_c, nb_colors_orig, nb_deltas_orig, ordered, lossy, + predictor, wp_header, palette_iteration_data)); + } + palette_iteration_data.final_run = true; + return FwdPaletteIteration(input, begin_c, end_c, nb_colors, nb_deltas, + ordered, lossy, predictor, wp_header, + palette_iteration_data); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h new file mode 100644 index 0000000000..0f3d66825b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_ +#define LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_ + +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +Status FwdPalette(Image &input, uint32_t begin_c, uint32_t end_c, + uint32_t &nb_colors, uint32_t &nb_deltas, bool ordered, + bool lossy, Predictor &predictor, + const weighted::Header &wp_header); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc new file mode 100644 index 0000000000..050563a3c2 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc @@ -0,0 +1,73 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/enc_rct.h" + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" // CheckEqualChannels + +namespace jxl { + +Status FwdRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) { + JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2)); + if (rct_type == 0) { // noop + return false; + } + // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR + int permutation = rct_type / 7; + // 0-5 values have the low bit corresponding to Third and the high bits + // corresponding to Second. 6 corresponds to YCoCg. + // + // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird + // + // Third: 0=nop, 1=SubtractFirst + int custom = rct_type % 7; + size_t m = begin_c; + size_t w = input.channel[m + 0].w; + size_t h = input.channel[m + 0].h; + int second = (custom % 7) >> 1; + int third = (custom % 7) & 1; + const auto do_rct = [&](const int y, const int thread) { + const pixel_type* in0 = input.channel[m + (permutation % 3)].Row(y); + const pixel_type* in1 = + input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y); + const pixel_type* in2 = + input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y); + pixel_type* out0 = input.channel[m].Row(y); + pixel_type* out1 = input.channel[m + 1].Row(y); + pixel_type* out2 = input.channel[m + 2].Row(y); + if (custom == 6) { + for (size_t x = 0; x < w; x++) { + pixel_type R = in0[x]; + pixel_type G = in1[x]; + pixel_type B = in2[x]; + out1[x] = R - B; + pixel_type tmp = B + (out1[x] >> 1); + out2[x] = G - tmp; + out0[x] = tmp + (out2[x] >> 1); + } + } else { + for (size_t x = 0; x < w; x++) { + pixel_type First = in0[x]; + pixel_type Second = in1[x]; + pixel_type Third = in2[x]; + if (second == 1) { + Second = Second - First; + } else if (second == 2) { + Second = Second - ((First + Third) >> 1); + } + if (third) Third = Third - First; + out0[x] = First; + out1[x] = Second; + out2[x] = Third; + } + } + }; + return RunOnPool(pool, 0, h, ThreadPool::NoInit, do_rct, "FwdRCT"); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h new file mode 100644 index 0000000000..cb5a193c8d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h @@ -0,0 +1,17 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_ +#define LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_ + +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +Status FwdRCT(Image &input, size_t begin_c, size_t rct_type, ThreadPool *pool); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc new file mode 100644 index 0000000000..dfd90cde68 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc @@ -0,0 +1,141 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/enc_squeeze.h" + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/squeeze.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +void FwdHSqueeze(Image &input, int c, int rc) { + const Channel &chin = input.channel[c]; + + JXL_DEBUG_V(4, "Doing horizontal squeeze of channel %i to new channel %i", c, + rc); + + Channel chout((chin.w + 1) / 2, chin.h, chin.hshift + 1, chin.vshift); + Channel chout_residual(chin.w - chout.w, chout.h, chin.hshift + 1, + chin.vshift); + + for (size_t y = 0; y < chout.h; y++) { + const pixel_type *JXL_RESTRICT p_in = chin.Row(y); + pixel_type *JXL_RESTRICT p_out = chout.Row(y); + pixel_type *JXL_RESTRICT p_res = chout_residual.Row(y); + for (size_t x = 0; x < chout_residual.w; x++) { + pixel_type A = p_in[x * 2]; + pixel_type B = p_in[x * 2 + 1]; + pixel_type avg = (A + B + (A > B)) >> 1; + p_out[x] = avg; + + pixel_type diff = A - B; + + pixel_type next_avg = avg; + if (x + 1 < chout_residual.w) { + next_avg = (p_in[x * 2 + 2] + p_in[x * 2 + 3] + + (p_in[x * 2 + 2] > p_in[x * 2 + 3])) >> + 1; // which will be chout.value(y,x+1) + } else if (chin.w & 1) + next_avg = p_in[x * 2 + 2]; + pixel_type left = (x > 0 ? p_in[x * 2 - 1] : avg); + pixel_type tendency = SmoothTendency(left, avg, next_avg); + + p_res[x] = diff - tendency; + } + if (chin.w & 1) { + int x = chout.w - 1; + p_out[x] = p_in[x * 2]; + } + } + input.channel[c] = std::move(chout); + input.channel.insert(input.channel.begin() + rc, std::move(chout_residual)); +} + +void FwdVSqueeze(Image &input, int c, int rc) { + const Channel &chin = input.channel[c]; + + JXL_DEBUG_V(4, "Doing vertical squeeze of channel %i to new channel %i", c, + rc); + + Channel chout(chin.w, (chin.h + 1) / 2, chin.hshift, chin.vshift + 1); + Channel chout_residual(chin.w, chin.h - chout.h, chin.hshift, + chin.vshift + 1); + intptr_t onerow_in = chin.plane.PixelsPerRow(); + for (size_t y = 0; y < chout_residual.h; y++) { + const pixel_type *JXL_RESTRICT p_in = chin.Row(y * 2); + pixel_type *JXL_RESTRICT p_out = chout.Row(y); + pixel_type *JXL_RESTRICT p_res = chout_residual.Row(y); + for (size_t x = 0; x < chout.w; x++) { + pixel_type A = p_in[x]; + pixel_type B = p_in[x + onerow_in]; + pixel_type avg = (A + B + (A > B)) >> 1; + p_out[x] = avg; + + pixel_type diff = A - B; + + pixel_type next_avg = avg; + if (y + 1 < chout_residual.h) { + next_avg = (p_in[x + 2 * onerow_in] + p_in[x + 3 * onerow_in] + + (p_in[x + 2 * onerow_in] > p_in[x + 3 * onerow_in])) >> + 1; // which will be chout.value(y+1,x) + } else if (chin.h & 1) { + next_avg = p_in[x + 2 * onerow_in]; + } + pixel_type top = + (y > 0 ? p_in[static_cast(x) - onerow_in] : avg); + pixel_type tendency = SmoothTendency(top, avg, next_avg); + + p_res[x] = diff - tendency; + } + } + if (chin.h & 1) { + size_t y = chout.h - 1; + const pixel_type *p_in = chin.Row(y * 2); + pixel_type *p_out = chout.Row(y); + for (size_t x = 0; x < chout.w; x++) { + p_out[x] = p_in[x]; + } + } + input.channel[c] = std::move(chout); + input.channel.insert(input.channel.begin() + rc, std::move(chout_residual)); +} + +Status FwdSqueeze(Image &input, std::vector parameters, + ThreadPool *pool) { + if (parameters.empty()) { + DefaultSqueezeParameters(¶meters, input); + } + // if nothing to do, don't do squeeze + if (parameters.empty()) return false; + for (size_t i = 0; i < parameters.size(); i++) { + JXL_RETURN_IF_ERROR( + CheckMetaSqueezeParams(parameters[i], input.channel.size())); + bool horizontal = parameters[i].horizontal; + bool in_place = parameters[i].in_place; + uint32_t beginc = parameters[i].begin_c; + uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1; + uint32_t offset; + if (in_place) { + offset = endc + 1; + } else { + offset = input.channel.size(); + } + for (uint32_t c = beginc; c <= endc; c++) { + if (horizontal) { + FwdHSqueeze(input, c, offset + c - beginc); + } else { + FwdVSqueeze(input, c, offset + c - beginc); + } + } + } + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h new file mode 100644 index 0000000000..39b001017b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h @@ -0,0 +1,20 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_ +#define LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_ + +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +Status FwdSqueeze(Image &input, std::vector parameters, + ThreadPool *pool); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc new file mode 100644 index 0000000000..bdaaf9f87e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc @@ -0,0 +1,46 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/enc_transform.h" + +#include "lib/jxl/modular/transform/enc_palette.h" +#include "lib/jxl/modular/transform/enc_rct.h" +#include "lib/jxl/modular/transform/enc_squeeze.h" + +namespace jxl { + +Status TransformForward(Transform &t, Image &input, + const weighted::Header &wp_header, ThreadPool *pool) { + switch (t.id) { + case TransformId::kRCT: + return FwdRCT(input, t.begin_c, t.rct_type, pool); + case TransformId::kSqueeze: + return FwdSqueeze(input, t.squeezes, pool); + case TransformId::kPalette: + return FwdPalette(input, t.begin_c, t.begin_c + t.num_c - 1, t.nb_colors, + t.nb_deltas, t.ordered_palette, t.lossy_palette, + t.predictor, wp_header); + default: + return JXL_FAILURE("Unknown transformation (ID=%u)", + static_cast(t.id)); + } +} + +void compute_minmax(const Channel &ch, pixel_type *min, pixel_type *max) { + pixel_type realmin = std::numeric_limits::max(); + pixel_type realmax = std::numeric_limits::min(); + for (size_t y = 0; y < ch.h; y++) { + const pixel_type *JXL_RESTRICT p = ch.Row(y); + for (size_t x = 0; x < ch.w; x++) { + if (p[x] < realmin) realmin = p[x]; + if (p[x] > realmax) realmax = p[x]; + } + } + + if (min) *min = realmin; + if (max) *max = realmax; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h new file mode 100644 index 0000000000..07659e1b0a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_ +#define LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_ + +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +Status TransformForward(Transform &t, Image &input, + const weighted::Header &wp_header, ThreadPool *pool); + +void compute_minmax(const Channel &ch, pixel_type *min, pixel_type *max); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc new file mode 100644 index 0000000000..46129f19f0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc @@ -0,0 +1,176 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/palette.h" + +namespace jxl { + +Status InvPalette(Image &input, uint32_t begin_c, uint32_t nb_colors, + uint32_t nb_deltas, Predictor predictor, + const weighted::Header &wp_header, ThreadPool *pool) { + if (input.nb_meta_channels < 1) { + return JXL_FAILURE("Error: Palette transform without palette."); + } + std::atomic num_errors{0}; + int nb = input.channel[0].h; + uint32_t c0 = begin_c + 1; + if (c0 >= input.channel.size()) { + return JXL_FAILURE("Channel is out of range."); + } + size_t w = input.channel[c0].w; + size_t h = input.channel[c0].h; + if (nb < 1) return JXL_FAILURE("Corrupted transforms"); + for (int i = 1; i < nb; i++) { + input.channel.insert( + input.channel.begin() + c0 + 1, + Channel(w, h, input.channel[c0].hshift, input.channel[c0].vshift)); + } + const Channel &palette = input.channel[0]; + const pixel_type *JXL_RESTRICT p_palette = input.channel[0].Row(0); + intptr_t onerow = input.channel[0].plane.PixelsPerRow(); + intptr_t onerow_image = input.channel[c0].plane.PixelsPerRow(); + const int bit_depth = std::min(input.bitdepth, 24); + + if (w == 0) { + // Nothing to do. + // Avoid touching "empty" channels with non-zero height. + } else if (nb_deltas == 0 && predictor == Predictor::Zero) { + if (nb == 1) { + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, h, ThreadPool::NoInit, + [&](const uint32_t task, size_t /* thread */) { + const size_t y = task; + pixel_type *p = input.channel[c0].Row(y); + for (size_t x = 0; x < w; x++) { + const int index = Clamp1(p[x], 0, (pixel_type)palette.w - 1); + p[x] = palette_internal::GetPaletteValue( + p_palette, index, /*c=*/0, + /*palette_size=*/palette.w, + /*onerow=*/onerow, /*bit_depth=*/bit_depth); + } + }, + "UndoChannelPalette")); + } else { + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, h, ThreadPool::NoInit, + [&](const uint32_t task, size_t /* thread */) { + const size_t y = task; + std::vector p_out(nb); + const pixel_type *p_index = input.channel[c0].Row(y); + for (int c = 0; c < nb; c++) + p_out[c] = input.channel[c0 + c].Row(y); + for (size_t x = 0; x < w; x++) { + const int index = p_index[x]; + for (int c = 0; c < nb; c++) { + p_out[c][x] = palette_internal::GetPaletteValue( + p_palette, index, /*c=*/c, + /*palette_size=*/palette.w, + /*onerow=*/onerow, /*bit_depth=*/bit_depth); + } + } + }, + "UndoPalette")); + } + } else { + // Parallelized per channel. + ImageI indices = CopyImage(input.channel[c0].plane); + if (predictor == Predictor::Weighted) { + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, nb, ThreadPool::NoInit, + [&](const uint32_t c, size_t /* thread */) { + Channel &channel = input.channel[c0 + c]; + weighted::State wp_state(wp_header, channel.w, channel.h); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT p = channel.Row(y); + const pixel_type *JXL_RESTRICT idx = indices.Row(y); + for (size_t x = 0; x < channel.w; x++) { + int index = idx[x]; + pixel_type_w val = 0; + const pixel_type palette_entry = + palette_internal::GetPaletteValue( + p_palette, index, /*c=*/c, + /*palette_size=*/palette.w, /*onerow=*/onerow, + /*bit_depth=*/bit_depth); + if (index < static_cast(nb_deltas)) { + PredictionResult pred = + PredictNoTreeWP(channel.w, p + x, onerow_image, x, y, + predictor, &wp_state); + val = pred.guess + palette_entry; + } else { + val = palette_entry; + } + p[x] = val; + wp_state.UpdateErrors(p[x], x, y, channel.w); + } + } + }, + "UndoDeltaPaletteWP")); + } else { + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, nb, ThreadPool::NoInit, + [&](const uint32_t c, size_t /* thread */) { + Channel &channel = input.channel[c0 + c]; + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT p = channel.Row(y); + const pixel_type *JXL_RESTRICT idx = indices.Row(y); + for (size_t x = 0; x < channel.w; x++) { + int index = idx[x]; + pixel_type_w val = 0; + const pixel_type palette_entry = + palette_internal::GetPaletteValue( + p_palette, index, /*c=*/c, + /*palette_size=*/palette.w, + /*onerow=*/onerow, /*bit_depth=*/bit_depth); + if (index < static_cast(nb_deltas)) { + PredictionResult pred = PredictNoTreeNoWP( + channel.w, p + x, onerow_image, x, y, predictor); + val = pred.guess + palette_entry; + } else { + val = palette_entry; + } + p[x] = val; + } + } + }, + "UndoDeltaPaletteNoWP")); + } + } + if (c0 >= input.nb_meta_channels) { + // Palette was done on normal channels + input.nb_meta_channels--; + } else { + // Palette was done on metachannels + JXL_ASSERT(static_cast(input.nb_meta_channels) >= 2 - nb); + input.nb_meta_channels -= 2 - nb; + JXL_ASSERT(begin_c + nb - 1 < input.nb_meta_channels); + } + input.channel.erase(input.channel.begin(), input.channel.begin() + 1); + return num_errors.load(std::memory_order_relaxed) == 0; +} + +Status MetaPalette(Image &input, uint32_t begin_c, uint32_t end_c, + uint32_t nb_colors, uint32_t nb_deltas, bool lossy) { + JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, end_c)); + + size_t nb = end_c - begin_c + 1; + if (begin_c >= input.nb_meta_channels) { + // Palette was done on normal channels + input.nb_meta_channels++; + } else { + // Palette was done on metachannels + JXL_ASSERT(end_c < input.nb_meta_channels); + // we remove nb-1 metachannels and add one + input.nb_meta_channels += 2 - nb; + } + input.channel.erase(input.channel.begin() + begin_c + 1, + input.channel.begin() + end_c + 1); + Channel pch(nb_colors + nb_deltas, nb); + pch.hshift = -1; + pch.vshift = -1; + input.channel.insert(input.channel.begin(), std::move(pch)); + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/palette.h b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.h new file mode 100644 index 0000000000..cc0f67960b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.h @@ -0,0 +1,129 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_ +#define LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_ + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" // CheckEqualChannels + +namespace jxl { + +namespace palette_internal { + +static constexpr int kMaxPaletteLookupTableSize = 1 << 16; + +static constexpr int kRgbChannels = 3; + +// 5x5x5 color cube for the larger cube. +static constexpr int kLargeCube = 5; + +// Smaller interleaved color cube to fill the holes of the larger cube. +static constexpr int kSmallCube = 4; +static constexpr int kSmallCubeBits = 2; +// kSmallCube ** 3 +static constexpr int kLargeCubeOffset = kSmallCube * kSmallCube * kSmallCube; + +static inline pixel_type Scale(uint64_t value, uint64_t bit_depth, + uint64_t denom) { + // return (value * ((static_cast(1) << bit_depth) - 1)) / denom; + // We only call this function with kSmallCube or kLargeCube - 1 as denom, + // allowing us to avoid a division here. + JXL_ASSERT(denom == 4); + return (value * ((static_cast(1) << bit_depth) - 1)) >> 2; +} + +// The purpose of this function is solely to extend the interpretation of +// palette indices to implicit values. If index < nb_deltas, indicating that the +// result is a delta palette entry, it is the responsibility of the caller to +// treat it as such. +static JXL_MAYBE_UNUSED pixel_type +GetPaletteValue(const pixel_type *const palette, int index, const size_t c, + const int palette_size, const int onerow, const int bit_depth) { + if (index < 0) { + static constexpr std::array, 72> kDeltaPalette = { + { + {{0, 0, 0}}, {{4, 4, 4}}, {{11, 0, 0}}, + {{0, 0, -13}}, {{0, -12, 0}}, {{-10, -10, -10}}, + {{-18, -18, -18}}, {{-27, -27, -27}}, {{-18, -18, 0}}, + {{0, 0, -32}}, {{-32, 0, 0}}, {{-37, -37, -37}}, + {{0, -32, -32}}, {{24, 24, 45}}, {{50, 50, 50}}, + {{-45, -24, -24}}, {{-24, -45, -45}}, {{0, -24, -24}}, + {{-34, -34, 0}}, {{-24, 0, -24}}, {{-45, -45, -24}}, + {{64, 64, 64}}, {{-32, 0, -32}}, {{0, -32, 0}}, + {{-32, 0, 32}}, {{-24, -45, -24}}, {{45, 24, 45}}, + {{24, -24, -45}}, {{-45, -24, 24}}, {{80, 80, 80}}, + {{64, 0, 0}}, {{0, 0, -64}}, {{0, -64, -64}}, + {{-24, -24, 45}}, {{96, 96, 96}}, {{64, 64, 0}}, + {{45, -24, -24}}, {{34, -34, 0}}, {{112, 112, 112}}, + {{24, -45, -45}}, {{45, 45, -24}}, {{0, -32, 32}}, + {{24, -24, 45}}, {{0, 96, 96}}, {{45, -24, 24}}, + {{24, -45, -24}}, {{-24, -45, 24}}, {{0, -64, 0}}, + {{96, 0, 0}}, {{128, 128, 128}}, {{64, 0, 64}}, + {{144, 144, 144}}, {{96, 96, 0}}, {{-36, -36, 36}}, + {{45, -24, -45}}, {{45, -45, -24}}, {{0, 0, -96}}, + {{0, 128, 128}}, {{0, 96, 0}}, {{45, 24, -45}}, + {{-128, 0, 0}}, {{24, -45, 24}}, {{-45, 24, -45}}, + {{64, 0, -64}}, {{64, -64, -64}}, {{96, 0, 96}}, + {{45, -45, 24}}, {{24, 45, -45}}, {{64, 64, -64}}, + {{128, 128, 0}}, {{0, 0, -128}}, {{-24, 45, -45}}, + }}; + if (c >= kRgbChannels) { + return 0; + } + // Do not open the brackets, otherwise INT32_MIN negation could overflow. + index = -(index + 1); + index %= 1 + 2 * (kDeltaPalette.size() - 1); + static constexpr int kMultiplier[] = {-1, 1}; + pixel_type result = + kDeltaPalette[((index + 1) >> 1)][c] * kMultiplier[index & 1]; + if (bit_depth > 8) { + result *= static_cast(1) << (bit_depth - 8); + } + return result; + } else if (palette_size <= index && index < palette_size + kLargeCubeOffset) { + if (c >= kRgbChannels) return 0; + index -= palette_size; + index >>= c * kSmallCubeBits; + return Scale(index % kSmallCube, bit_depth, kSmallCube) + + (1 << (std::max(0, bit_depth - 3))); + } else if (palette_size + kLargeCubeOffset <= index) { + if (c >= kRgbChannels) return 0; + index -= palette_size + kLargeCubeOffset; + // TODO(eustas): should we take care of ambiguity created by + // index >= kLargeCube ** 3 ? + switch (c) { + case 0: + break; + case 1: + index /= kLargeCube; + break; + case 2: + index /= kLargeCube * kLargeCube; + break; + } + return Scale(index % kLargeCube, bit_depth, kLargeCube - 1); + } + return palette[c * onerow + static_cast(index)]; +} + +} // namespace palette_internal + +Status InvPalette(Image &input, uint32_t begin_c, uint32_t nb_colors, + uint32_t nb_deltas, Predictor predictor, + const weighted::Header &wp_header, ThreadPool *pool); + +Status MetaPalette(Image &input, uint32_t begin_c, uint32_t end_c, + uint32_t nb_colors, uint32_t nb_deltas, bool lossy); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc new file mode 100644 index 0000000000..f3002a5ac3 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc @@ -0,0 +1,153 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/rct.h" +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/rct.cc" +#include +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::Sub; + +template +void InvRCTRow(const pixel_type* in0, const pixel_type* in1, + const pixel_type* in2, pixel_type* out0, pixel_type* out1, + pixel_type* out2, size_t w) { + static_assert(transform_type >= 0 && transform_type < 7, + "Invalid transform type"); + int second = transform_type >> 1; + int third = transform_type & 1; + + size_t x = 0; + const HWY_FULL(pixel_type) d; + const size_t N = Lanes(d); + for (; x + N - 1 < w; x += N) { + if (transform_type == 6) { + auto Y = Load(d, in0 + x); + auto Co = Load(d, in1 + x); + auto Cg = Load(d, in2 + x); + Y = Sub(Y, ShiftRight<1>(Cg)); + auto G = Add(Cg, Y); + Y = Sub(Y, ShiftRight<1>(Co)); + auto R = Add(Y, Co); + Store(R, d, out0 + x); + Store(G, d, out1 + x); + Store(Y, d, out2 + x); + } else { + auto First = Load(d, in0 + x); + auto Second = Load(d, in1 + x); + auto Third = Load(d, in2 + x); + if (third) Third = Add(Third, First); + if (second == 1) { + Second = Add(Second, First); + } else if (second == 2) { + Second = Add(Second, ShiftRight<1>(Add(First, Third))); + } + Store(First, d, out0 + x); + Store(Second, d, out1 + x); + Store(Third, d, out2 + x); + } + } + for (; x < w; x++) { + if (transform_type == 6) { + pixel_type Y = in0[x]; + pixel_type Co = in1[x]; + pixel_type Cg = in2[x]; + pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); + pixel_type G = PixelAdd(Cg, tmp); + pixel_type B = PixelAdd(tmp, -(Co >> 1)); + pixel_type R = PixelAdd(B, Co); + out0[x] = R; + out1[x] = G; + out2[x] = B; + } else { + pixel_type First = in0[x]; + pixel_type Second = in1[x]; + pixel_type Third = in2[x]; + if (third) Third = PixelAdd(Third, First); + if (second == 1) { + Second = PixelAdd(Second, First); + } else if (second == 2) { + Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1)); + } + out0[x] = First; + out1[x] = Second; + out2[x] = Third; + } + } +} + +Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) { + JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2)); + size_t m = begin_c; + Channel& c0 = input.channel[m + 0]; + size_t w = c0.w; + size_t h = c0.h; + if (rct_type == 0) { // noop + return true; + } + // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR + int permutation = rct_type / 7; + JXL_CHECK(permutation < 6); + // 0-5 values have the low bit corresponding to Third and the high bits + // corresponding to Second. 6 corresponds to YCoCg. + // + // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird + // + // Third: 0=nop, 1=SubtractFirst + int custom = rct_type % 7; + // Special case: permute-only. Swap channels around. + if (custom == 0) { + Channel ch0 = std::move(input.channel[m]); + Channel ch1 = std::move(input.channel[m + 1]); + Channel ch2 = std::move(input.channel[m + 2]); + input.channel[m + (permutation % 3)] = std::move(ch0); + input.channel[m + ((permutation + 1 + permutation / 3) % 3)] = + std::move(ch1); + input.channel[m + ((permutation + 2 - permutation / 3) % 3)] = + std::move(ch2); + return true; + } + constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = { + InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>, + InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>}; + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, h, ThreadPool::NoInit, + [&](const uint32_t task, size_t /* thread */) { + const size_t y = task; + const pixel_type* in0 = input.channel[m].Row(y); + const pixel_type* in1 = input.channel[m + 1].Row(y); + const pixel_type* in2 = input.channel[m + 2].Row(y); + pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y); + pixel_type* out1 = + input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y); + pixel_type* out2 = + input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y); + inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w); + }, + "InvRCT")); + return true; +} + +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(InvRCT); +Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) { + return HWY_DYNAMIC_DISPATCH(InvRCT)(input, begin_c, rct_type, pool); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/rct.h b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.h new file mode 100644 index 0000000000..aef65621d5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.h @@ -0,0 +1,20 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_RCT_H_ +#define LIB_JXL_MODULAR_TRANSFORM_RCT_H_ + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" // CheckEqualChannels + +namespace jxl { + +Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_RCT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc new file mode 100644 index 0000000000..8440d9e804 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc @@ -0,0 +1,478 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/squeeze.h" + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/squeeze.cc" +#include +#include + +#include "lib/jxl/simd_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Abs; +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::And; +using hwy::HWY_NAMESPACE::Gt; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::IfThenZeroElse; +using hwy::HWY_NAMESPACE::Lt; +using hwy::HWY_NAMESPACE::MulEven; +using hwy::HWY_NAMESPACE::Ne; +using hwy::HWY_NAMESPACE::Neg; +using hwy::HWY_NAMESPACE::OddEven; +using hwy::HWY_NAMESPACE::RebindToUnsigned; +using hwy::HWY_NAMESPACE::ShiftLeft; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::Sub; +using hwy::HWY_NAMESPACE::Xor; + +#if HWY_TARGET != HWY_SCALAR + +JXL_INLINE void FastUnsqueeze(const pixel_type *JXL_RESTRICT p_residual, + const pixel_type *JXL_RESTRICT p_avg, + const pixel_type *JXL_RESTRICT p_navg, + const pixel_type *p_pout, + pixel_type *JXL_RESTRICT p_out, + pixel_type *p_nout) { + const HWY_CAPPED(pixel_type, 8) d; + const RebindToUnsigned du; + const size_t N = Lanes(d); + auto onethird = Set(d, 0x55555556); + for (size_t x = 0; x < 8; x += N) { + auto avg = Load(d, p_avg + x); + auto next_avg = Load(d, p_navg + x); + auto top = Load(d, p_pout + x); + // Equivalent to SmoothTendency(top,avg,next_avg), but without branches + auto Ba = Sub(top, avg); + auto an = Sub(avg, next_avg); + auto nonmono = Xor(Ba, an); + auto absBa = Abs(Ba); + auto absan = Abs(an); + auto absBn = Abs(Sub(top, next_avg)); + // Compute a3 = absBa / 3 + auto a3e = BitCast(d, ShiftRight<32>(MulEven(absBa, onethird))); + auto a3oi = MulEven(Reverse(d, absBa), onethird); + auto a3o = BitCast( + d, Reverse(hwy::HWY_NAMESPACE::Repartition(), + a3oi)); + auto a3 = OddEven(a3o, a3e); + a3 = Add(a3, Add(absBn, Set(d, 2))); + auto absdiff = ShiftRight<2>(a3); + auto skipdiff = Ne(Ba, Zero(d)); + skipdiff = And(skipdiff, Ne(an, Zero(d))); + skipdiff = And(skipdiff, Lt(nonmono, Zero(d))); + auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1))); + absdiff = IfThenElse(Gt(absdiff, absBa2), + Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff); + auto absan2 = ShiftLeft<1>(absan); + absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2), + absan2, absdiff); + auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff); + auto tendency = IfThenZeroElse(skipdiff, diff1); + + auto diff_minus_tendency = Load(d, p_residual + x); + auto diff = Add(diff_minus_tendency, tendency); + auto out = + Add(avg, ShiftRight<1>( + Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff)))))); + Store(out, d, p_out + x); + Store(Sub(out, diff), d, p_nout + x); + } +} + +#endif + +Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { + JXL_ASSERT(c < input.channel.size()); + JXL_ASSERT(rc < input.channel.size()); + Channel &chin = input.channel[c]; + const Channel &chin_residual = input.channel[rc]; + // These must be valid since we ran MetaApply already. + JXL_ASSERT(chin.w == DivCeil(chin.w + chin_residual.w, 2)); + JXL_ASSERT(chin.h == chin_residual.h); + + if (chin_residual.w == 0) { + // Short-circuit: output channel has same dimensions as input. + input.channel[c].hshift--; + return true; + } + + // Note: chin.w >= chin_residual.w and at most 1 different. + Channel chout(chin.w + chin_residual.w, chin.h, chin.hshift - 1, chin.vshift); + JXL_DEBUG_V(4, + "Undoing horizontal squeeze of channel %i using residuals in " + "channel %i (going from width %" PRIuS " to %" PRIuS ")", + c, rc, chin.w, chout.w); + + if (chin_residual.h == 0) { + // Short-circuit: channel with no pixels. + input.channel[c] = std::move(chout); + return true; + } + auto unsqueeze_row = [&](size_t y, size_t x0) { + const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y); + const pixel_type *JXL_RESTRICT p_avg = chin.Row(y); + pixel_type *JXL_RESTRICT p_out = chout.Row(y); + for (size_t x = x0; x < chin_residual.w; x++) { + pixel_type_w diff_minus_tendency = p_residual[x]; + pixel_type_w avg = p_avg[x]; + pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg); + pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg); + pixel_type_w tendency = SmoothTendency(left, avg, next_avg); + pixel_type_w diff = diff_minus_tendency + tendency; + pixel_type_w A = avg + (diff / 2); + p_out[(x << 1)] = A; + pixel_type_w B = A - diff; + p_out[(x << 1) + 1] = B; + } + if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1]; + }; + + // somewhat complicated trickery just to be able to SIMD this. + // Horizontal unsqueeze has horizontal data dependencies, so we do + // 8 rows at a time and treat it as a vertical unsqueeze of a + // transposed 8x8 block (or 9x8 for one input). + static constexpr const size_t kRowsPerThread = 8; + const auto unsqueeze_span = [&](const uint32_t task, size_t /* thread */) { + const size_t y0 = task * kRowsPerThread; + const size_t rows = std::min(kRowsPerThread, chin.h - y0); + size_t x = 0; + +#if HWY_TARGET != HWY_SCALAR + intptr_t onerow_in = chin.plane.PixelsPerRow(); + intptr_t onerow_inr = chin_residual.plane.PixelsPerRow(); + intptr_t onerow_out = chout.plane.PixelsPerRow(); + const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0); + const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0); + pixel_type *JXL_RESTRICT p_out = chout.Row(y0); + HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread]; + HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread]; + HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread]; + HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread]; + HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread]; + HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread]; + const HWY_CAPPED(pixel_type, 8) d; + const size_t N = Lanes(d); + if (chin_residual.w > 16 && rows == kRowsPerThread) { + for (; x < chin_residual.w - 9; x += 8) { + Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr); + Transpose8x8Block(p_avg + x, b_p_avg, onerow_in); + for (size_t y = 0; y < kRowsPerThread; y++) { + b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y]; + } + for (size_t i = 0; i < 8; i++) { + FastUnsqueeze( + b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1), + (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i), + b_p_out_even + 8 * i, b_p_out_odd + 8 * i); + } + + Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8); + Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8); + for (size_t y = 0; y < kRowsPerThread; y++) { + for (size_t i = 0; i < kRowsPerThread; i += N) { + auto even = Load(d, b_p_out_evenT + 8 * y + i); + auto odd = Load(d, b_p_out_oddT + 8 * y + i); + StoreInterleaved(d, even, odd, + p_out + ((x + i) << 1) + onerow_out * y); + } + } + } + } +#endif + for (size_t y = 0; y < rows; y++) { + unsqueeze_row(y0 + y, x); + } + }; + JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread), + ThreadPool::NoInit, unsqueeze_span, + "InvHorizontalSqueeze")); + input.channel[c] = std::move(chout); + return true; +} + +Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { + JXL_ASSERT(c < input.channel.size()); + JXL_ASSERT(rc < input.channel.size()); + const Channel &chin = input.channel[c]; + const Channel &chin_residual = input.channel[rc]; + // These must be valid since we ran MetaApply already. + JXL_ASSERT(chin.h == DivCeil(chin.h + chin_residual.h, 2)); + JXL_ASSERT(chin.w == chin_residual.w); + + if (chin_residual.h == 0) { + // Short-circuit: output channel has same dimensions as input. + input.channel[c].vshift--; + return true; + } + + // Note: chin.h >= chin_residual.h and at most 1 different. + Channel chout(chin.w, chin.h + chin_residual.h, chin.hshift, chin.vshift - 1); + JXL_DEBUG_V( + 4, + "Undoing vertical squeeze of channel %i using residuals in channel " + "%i (going from height %" PRIuS " to %" PRIuS ")", + c, rc, chin.h, chout.h); + + if (chin_residual.w == 0) { + // Short-circuit: channel with no pixels. + input.channel[c] = std::move(chout); + return true; + } + + static constexpr const int kColsPerThread = 64; + const auto unsqueeze_slice = [&](const uint32_t task, size_t /* thread */) { + const size_t x0 = task * kColsPerThread; + const size_t x1 = std::min((size_t)(task + 1) * kColsPerThread, chin.w); + const size_t w = x1 - x0; + // We only iterate up to std::min(chin_residual.h, chin.h) which is + // always chin_residual.h. + for (size_t y = 0; y < chin_residual.h; y++) { + const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0; + const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0; + const pixel_type *JXL_RESTRICT p_navg = + chin.Row(y + 1 < chin.h ? y + 1 : y) + x0; + pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0; + pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0; + const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg; + size_t x = 0; +#if HWY_TARGET != HWY_SCALAR + for (; x + 7 < w; x += 8) { + FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x, + p_out + x, p_nout + x); + } +#endif + for (; x < w; x++) { + pixel_type_w avg = p_avg[x]; + pixel_type_w next_avg = p_navg[x]; + pixel_type_w top = p_pout[x]; + pixel_type_w tendency = SmoothTendency(top, avg, next_avg); + pixel_type_w diff_minus_tendency = p_residual[x]; + pixel_type_w diff = diff_minus_tendency + tendency; + pixel_type_w out = avg + (diff / 2); + p_out[x] = out; + // If the chin_residual.h == chin.h, the output has an even number + // of rows so the next line is fine. Otherwise, this loop won't + // write to the last output row which is handled separately. + p_nout[x] = out - diff; + } + } + }; + JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread), + ThreadPool::NoInit, unsqueeze_slice, + "InvVertSqueeze")); + + if (chout.h & 1) { + size_t y = chin.h - 1; + const pixel_type *p_avg = chin.Row(y); + pixel_type *p_out = chout.Row(y << 1); + for (size_t x = 0; x < chin.w; x++) { + p_out[x] = p_avg[x]; + } + } + input.channel[c] = std::move(chout); + return true; +} + +Status InvSqueeze(Image &input, std::vector parameters, + ThreadPool *pool) { + for (int i = parameters.size() - 1; i >= 0; i--) { + JXL_RETURN_IF_ERROR( + CheckMetaSqueezeParams(parameters[i], input.channel.size())); + bool horizontal = parameters[i].horizontal; + bool in_place = parameters[i].in_place; + uint32_t beginc = parameters[i].begin_c; + uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1; + uint32_t offset; + if (in_place) { + offset = endc + 1; + } else { + offset = input.channel.size() + beginc - endc - 1; + } + if (beginc < input.nb_meta_channels) { + // This is checked in MetaSqueeze. + JXL_ASSERT(input.nb_meta_channels > parameters[i].num_c); + input.nb_meta_channels -= parameters[i].num_c; + } + + for (uint32_t c = beginc; c <= endc; c++) { + uint32_t rc = offset + c - beginc; + // MetaApply should imply that `rc` is within range, otherwise there's a + // programming bug. + JXL_ASSERT(rc < input.channel.size()); + if ((input.channel[c].w < input.channel[rc].w) || + (input.channel[c].h < input.channel[rc].h)) { + return JXL_FAILURE("Corrupted squeeze transform"); + } + if (horizontal) { + JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool)); + } else { + JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool)); + } + } + input.channel.erase(input.channel.begin() + offset, + input.channel.begin() + offset + (endc - beginc + 1)); + } + return true; +} + +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace jxl { + +HWY_EXPORT(InvSqueeze); +Status InvSqueeze(Image &input, std::vector parameters, + ThreadPool *pool) { + return HWY_DYNAMIC_DISPATCH(InvSqueeze)(input, parameters, pool); +} + +void DefaultSqueezeParameters(std::vector *parameters, + const Image &image) { + int nb_channels = image.channel.size() - image.nb_meta_channels; + + parameters->clear(); + size_t w = image.channel[image.nb_meta_channels].w; + size_t h = image.channel[image.nb_meta_channels].h; + JXL_DEBUG_V( + 7, "Default squeeze parameters for %" PRIuS "x%" PRIuS " image: ", w, h); + + // do horizontal first on wide images; vertical first on tall images + bool wide = (w > h); + + if (nb_channels > 2 && image.channel[image.nb_meta_channels + 1].w == w && + image.channel[image.nb_meta_channels + 1].h == h) { + // assume channels 1 and 2 are chroma, and can be squeezed first for 4:2:0 + // previews + JXL_DEBUG_V(7, "(4:2:0 chroma), %" PRIuS "x%" PRIuS " image", w, h); + SqueezeParams params; + // horizontal chroma squeeze + params.horizontal = true; + params.in_place = false; + params.begin_c = image.nb_meta_channels + 1; + params.num_c = 2; + parameters->push_back(params); + params.horizontal = false; + // vertical chroma squeeze + parameters->push_back(params); + } + SqueezeParams params; + params.begin_c = image.nb_meta_channels; + params.num_c = nb_channels; + params.in_place = true; + + if (!wide) { + if (h > JXL_MAX_FIRST_PREVIEW_SIZE) { + params.horizontal = false; + parameters->push_back(params); + h = (h + 1) / 2; + JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h); + } + } + while (w > JXL_MAX_FIRST_PREVIEW_SIZE || h > JXL_MAX_FIRST_PREVIEW_SIZE) { + if (w > JXL_MAX_FIRST_PREVIEW_SIZE) { + params.horizontal = true; + parameters->push_back(params); + w = (w + 1) / 2; + JXL_DEBUG_V(7, "Horizontal (%" PRIuS "x%" PRIuS "), ", w, h); + } + if (h > JXL_MAX_FIRST_PREVIEW_SIZE) { + params.horizontal = false; + parameters->push_back(params); + h = (h + 1) / 2; + JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h); + } + } + JXL_DEBUG_V(7, "that's it"); +} + +Status CheckMetaSqueezeParams(const SqueezeParams ¶meter, + int num_channels) { + int c1 = parameter.begin_c; + int c2 = parameter.begin_c + parameter.num_c - 1; + if (c1 < 0 || c1 >= num_channels || c2 < 0 || c2 >= num_channels || c2 < c1) { + return JXL_FAILURE("Invalid channel range"); + } + return true; +} + +Status MetaSqueeze(Image &image, std::vector *parameters) { + if (parameters->empty()) { + DefaultSqueezeParameters(parameters, image); + } + + for (size_t i = 0; i < parameters->size(); i++) { + JXL_RETURN_IF_ERROR( + CheckMetaSqueezeParams((*parameters)[i], image.channel.size())); + bool horizontal = (*parameters)[i].horizontal; + bool in_place = (*parameters)[i].in_place; + uint32_t beginc = (*parameters)[i].begin_c; + uint32_t endc = (*parameters)[i].begin_c + (*parameters)[i].num_c - 1; + + uint32_t offset; + if (beginc < image.nb_meta_channels) { + if (endc >= image.nb_meta_channels) { + return JXL_FAILURE("Invalid squeeze: mix of meta and nonmeta channels"); + } + if (!in_place) { + return JXL_FAILURE( + "Invalid squeeze: meta channels require in-place residuals"); + } + image.nb_meta_channels += (*parameters)[i].num_c; + } + if (in_place) { + offset = endc + 1; + } else { + offset = image.channel.size(); + } + for (uint32_t c = beginc; c <= endc; c++) { + if (image.channel[c].hshift > 30 || image.channel[c].vshift > 30) { + return JXL_FAILURE("Too many squeezes: shift > 30"); + } + size_t w = image.channel[c].w; + size_t h = image.channel[c].h; + if (w == 0 || h == 0) return JXL_FAILURE("Squeezing empty channel"); + if (horizontal) { + image.channel[c].w = (w + 1) / 2; + if (image.channel[c].hshift >= 0) image.channel[c].hshift++; + w = w - (w + 1) / 2; + } else { + image.channel[c].h = (h + 1) / 2; + if (image.channel[c].vshift >= 0) image.channel[c].vshift++; + h = h - (h + 1) / 2; + } + image.channel[c].shrink(); + Channel dummy(w, h); + dummy.hshift = image.channel[c].hshift; + dummy.vshift = image.channel[c].vshift; + + image.channel.insert(image.channel.begin() + offset + (c - beginc), + std::move(dummy)); + JXL_DEBUG_V(8, "MetaSqueeze applied, current image: %s", + image.DebugString().c_str()); + } + } + return true; +} + +} // namespace jxl + +#endif diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h new file mode 100644 index 0000000000..fb18710a6f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h @@ -0,0 +1,90 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_ +#define LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_ + +// Haar-like transform: halves the resolution in one direction +// A B -> (A+B)>>1 in one channel (average) -> same range as +// original channel +// A-B - tendency in a new channel ('residual' needed to make +// the transform reversible) +// -> theoretically range could be 2.5 +// times larger (2 times without the +// 'tendency'), but there should be lots +// of zeroes +// Repeated application (alternating horizontal and vertical squeezes) results +// in downscaling +// +// The default coefficient ordering is low-frequency to high-frequency, as in +// M. Antonini, M. Barlaud, P. Mathieu and I. Daubechies, "Image coding using +// wavelet transform", IEEE Transactions on Image Processing, vol. 1, no. 2, pp. +// 205-220, April 1992, doi: 10.1109/83.136597. + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" + +#define JXL_MAX_FIRST_PREVIEW_SIZE 8 + +namespace jxl { + +/* + int avg=(A+B)>>1; + int diff=(A-B); + int rA=(diff+(avg<<1)+(diff&1))>>1; + int rB=rA-diff; + +*/ +// |A B|C D|E F| +// p a n p=avg(A,B), a=avg(C,D), n=avg(E,F) +// +// Goal: estimate C-D (avoiding ringing artifacts) +// (ensuring that in smooth areas, a zero residual corresponds to a smooth +// gradient) + +// best estimate for C: (B + 2*a)/3 +// best estimate for D: (n + 3*a)/4 +// best estimate for C-D: 4*B - 3*n - a /12 + +// avoid ringing by 1) only doing this if B <= a <= n or B >= a >= n +// (otherwise, this is not a smooth area and we cannot really estimate C-D) +// 2) making sure that B <= C <= D <= n or B >= C >= D >= n + +inline pixel_type_w SmoothTendency(pixel_type_w B, pixel_type_w a, + pixel_type_w n) { + pixel_type_w diff = 0; + if (B >= a && a >= n) { + diff = (4 * B - 3 * n - a + 6) / 12; + // 2C = a<<1 + diff - diff&1 <= 2B so diff - diff&1 <= 2B - 2a + // 2D = a<<1 - diff - diff&1 >= 2n so diff + diff&1 <= 2a - 2n + if (diff - (diff & 1) > 2 * (B - a)) diff = 2 * (B - a) + 1; + if (diff + (diff & 1) > 2 * (a - n)) diff = 2 * (a - n); + } else if (B <= a && a <= n) { + diff = (4 * B - 3 * n - a - 6) / 12; + // 2C = a<<1 + diff + diff&1 >= 2B so diff + diff&1 >= 2B - 2a + // 2D = a<<1 - diff + diff&1 <= 2n so diff - diff&1 >= 2a - 2n + if (diff + (diff & 1) < 2 * (B - a)) diff = 2 * (B - a) - 1; + if (diff - (diff & 1) < 2 * (a - n)) diff = 2 * (a - n); + } + return diff; +} + +void DefaultSqueezeParameters(std::vector *parameters, + const Image &image); + +Status CheckMetaSqueezeParams(const SqueezeParams ¶meter, int num_channels); + +Status MetaSqueeze(Image &image, std::vector *parameters); + +Status InvSqueeze(Image &input, std::vector parameters, + ThreadPool *pool); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc new file mode 100644 index 0000000000..d9f2b435bf --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc @@ -0,0 +1,98 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/transform.h" + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/palette.h" +#include "lib/jxl/modular/transform/rct.h" +#include "lib/jxl/modular/transform/squeeze.h" + +namespace jxl { + +SqueezeParams::SqueezeParams() { Bundle::Init(this); } +Transform::Transform(TransformId id) { + Bundle::Init(this); + this->id = id; +} + +Status Transform::Inverse(Image &input, const weighted::Header &wp_header, + ThreadPool *pool) { + JXL_DEBUG_V(6, "Input channels (%" PRIuS ", %" PRIuS " meta): ", + input.channel.size(), input.nb_meta_channels); + switch (id) { + case TransformId::kRCT: + return InvRCT(input, begin_c, rct_type, pool); + case TransformId::kSqueeze: + return InvSqueeze(input, squeezes, pool); + case TransformId::kPalette: + return InvPalette(input, begin_c, nb_colors, nb_deltas, predictor, + wp_header, pool); + default: + return JXL_FAILURE("Unknown transformation (ID=%u)", + static_cast(id)); + } +} + +Status Transform::MetaApply(Image &input) { + JXL_DEBUG_V(6, "MetaApply input: %s", input.DebugString().c_str()); + switch (id) { + case TransformId::kRCT: + JXL_DEBUG_V(2, "Transform: kRCT, rct_type=%" PRIu32, rct_type); + return CheckEqualChannels(input, begin_c, begin_c + 2); + case TransformId::kSqueeze: + JXL_DEBUG_V(2, "Transform: kSqueeze:"); +#if JXL_DEBUG_V_LEVEL >= 2 + { + auto squeezes_copy = squeezes; + if (squeezes_copy.empty()) { + DefaultSqueezeParameters(&squeezes_copy, input); + } + for (const auto ¶ms : squeezes_copy) { + JXL_DEBUG_V( + 2, + " squeeze params: horizontal=%d, in_place=%d, begin_c=%" PRIu32 + ", num_c=%" PRIu32, + params.horizontal, params.in_place, params.begin_c, params.num_c); + } + } +#endif + return MetaSqueeze(input, &squeezes); + case TransformId::kPalette: + JXL_DEBUG_V(2, + "Transform: kPalette, begin_c=%" PRIu32 ", num_c=%" PRIu32 + ", nb_colors=%" PRIu32 ", nb_deltas=%" PRIu32, + begin_c, num_c, nb_colors, nb_deltas); + return MetaPalette(input, begin_c, begin_c + num_c - 1, nb_colors, + nb_deltas, lossy_palette); + default: + return JXL_FAILURE("Unknown transformation (ID=%u)", + static_cast(id)); + } +} + +Status CheckEqualChannels(const Image &image, uint32_t c1, uint32_t c2) { + if (c1 > image.channel.size() || c2 >= image.channel.size() || c2 < c1) { + return JXL_FAILURE("Invalid channel range: %u..%u (there are only %" PRIuS + " channels)", + c1, c2, image.channel.size()); + } + if (c1 < image.nb_meta_channels && c2 >= image.nb_meta_channels) { + return JXL_FAILURE("Invalid: transforming mix of meta and nonmeta"); + } + const auto &ch1 = image.channel[c1]; + for (size_t c = c1 + 1; c <= c2; c++) { + const auto &ch2 = image.channel[c]; + if (ch1.w != ch2.w || ch1.h != ch2.h || ch1.hshift != ch2.hshift || + ch1.vshift != ch2.vshift) { + return false; + } + } + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/transform.h b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.h new file mode 100644 index 0000000000..d5d3259f7a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.h @@ -0,0 +1,148 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_ +#define LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_ + +#include +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +enum class TransformId : uint32_t { + // G, R-G, B-G and variants (including YCoCg). + kRCT = 0, + + // Color palette. Parameters are: [begin_c] [end_c] [nb_colors] + kPalette = 1, + + // Squeezing (Haar-style) + kSqueeze = 2, + + // Invalid for now. + kInvalid = 3, +}; + +struct SqueezeParams : public Fields { + JXL_FIELDS_NAME(SqueezeParams) + bool horizontal; + bool in_place; + uint32_t begin_c; + uint32_t num_c; + SqueezeParams(); + Status VisitFields(Visitor *JXL_RESTRICT visitor) override { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &horizontal)); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &in_place)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(3), BitsOffset(6, 8), + BitsOffset(10, 72), + BitsOffset(13, 1096), 0, &begin_c)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), BitsOffset(4, 4), 2, &num_c)); + return true; + } +}; + +class Transform : public Fields { + public: + TransformId id; + // for Palette and RCT. + uint32_t begin_c; + // for RCT. 42 possible values starting from 0. + uint32_t rct_type; + // Only for Palette and NearLossless. + uint32_t num_c; + // Only for Palette. + uint32_t nb_colors; + uint32_t nb_deltas; + // for Squeeze. Default squeeze if empty. + std::vector squeezes; + // for NearLossless, not serialized. + int max_delta_error; + // Serialized for Palette. + Predictor predictor; + // for Palette, not serialized. + bool ordered_palette = true; + bool lossy_palette = false; + + explicit Transform(TransformId id); + // default constructor for bundles. + Transform() : Transform(TransformId::kInvalid) {} + + Status VisitFields(Visitor *JXL_RESTRICT visitor) override { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val((uint32_t)TransformId::kRCT), Val((uint32_t)TransformId::kPalette), + Val((uint32_t)TransformId::kSqueeze), + Val((uint32_t)TransformId::kInvalid), (uint32_t)TransformId::kRCT, + reinterpret_cast(&id))); + if (id == TransformId::kInvalid) { + return JXL_FAILURE("Invalid transform ID"); + } + if (visitor->Conditional(id == TransformId::kRCT || + id == TransformId::kPalette)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Bits(3), BitsOffset(6, 8), BitsOffset(10, 72), + BitsOffset(13, 1096), 0, &begin_c)); + } + if (visitor->Conditional(id == TransformId::kRCT)) { + // 0-41, default YCoCg. + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(6), Bits(2), BitsOffset(4, 2), + BitsOffset(6, 10), 6, &rct_type)); + if (rct_type >= 42) { + return JXL_FAILURE("Invalid transform RCT type"); + } + } + if (visitor->Conditional(id == TransformId::kPalette)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(3), Val(4), BitsOffset(13, 1), 3, &num_c)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + BitsOffset(8, 0), BitsOffset(10, 256), BitsOffset(12, 1280), + BitsOffset(16, 5376), 256, &nb_colors)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), BitsOffset(8, 1), BitsOffset(10, 257), + BitsOffset(16, 1281), 0, &nb_deltas)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->Bits(4, (uint32_t)Predictor::Zero, + reinterpret_cast(&predictor))); + if (predictor >= Predictor::Best) { + return JXL_FAILURE("Invalid predictor"); + } + } + + if (visitor->Conditional(id == TransformId::kSqueeze)) { + uint32_t num_squeezes = static_cast(squeezes.size()); + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), BitsOffset(4, 1), BitsOffset(6, 9), + BitsOffset(8, 41), 0, &num_squeezes)); + if (visitor->IsReading()) squeezes.resize(num_squeezes); + for (size_t i = 0; i < num_squeezes; i++) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&squeezes[i])); + } + } + return true; + } + + JXL_FIELDS_NAME(Transform) + + Status Inverse(Image &input, const weighted::Header &wp_header, + ThreadPool *pool = nullptr); + Status MetaApply(Image &input); +}; + +Status CheckEqualChannels(const Image &image, uint32_t c1, uint32_t c2); + +static inline pixel_type PixelAdd(pixel_type a, pixel_type b) { + return static_cast(static_cast(a) + + static_cast(b)); +} + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_ diff --git a/third_party/jpeg-xl/lib/jxl/modular_test.cc b/third_party/jpeg-xl/lib/jxl/modular_test.cc new file mode 100644 index 0000000000..293f59ff87 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/modular_test.cc @@ -0,0 +1,541 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include +#include +#include +#include + +#include "lib/extras/codec.h" +#include "lib/extras/dec/jxl.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_butteraugli_pnorm.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/modular/encoding/enc_encoding.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/encoding/ma_common.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { +using test::Roundtrip; + +void TestLosslessGroups(size_t group_size_shift) { + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + CompressParams cparams; + cparams.SetLossless(); + cparams.modular_group_size_shift = group_size_shift; + + CodecInOut io_out; + + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + io.ShrinkTo(io.xsize() / 4, io.ysize() / 4); + + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size)); + EXPECT_LE(compressed_size, 280000u); + JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io_out.Main().color(), _)); +} + +TEST(ModularTest, RoundtripLosslessGroups128) { TestLosslessGroups(0); } + +TEST(ModularTest, JXL_TSAN_SLOW_TEST(RoundtripLosslessGroups512)) { + TestLosslessGroups(2); +} + +TEST(ModularTest, JXL_TSAN_SLOW_TEST(RoundtripLosslessGroups1024)) { + TestLosslessGroups(3); +} + +TEST(ModularTest, RoundtripLosslessCustomWP_PermuteRCT) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CompressParams cparams; + cparams.SetLossless(); + // 9 = permute to GBR, to test the special case of permutation-only + cparams.colorspace = 9; + // slowest speed so different WP modes are tried + cparams.speed_tier = SpeedTier::kTortoise; + cparams.options.predictor = {Predictor::Weighted}; + + CodecInOut io_out; + + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + io.ShrinkTo(100, 100); + + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size)); + EXPECT_LE(compressed_size, 10150u); + JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io_out.Main().color(), _)); +} + +TEST(ModularTest, RoundtripLossyDeltaPalette) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CompressParams cparams; + cparams.modular_mode = true; + cparams.color_transform = jxl::ColorTransform::kNone; + cparams.lossy_palette = true; + cparams.palette_colors = 0; + + CodecInOut io_out; + + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + io.ShrinkTo(300, 100); + + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size)); + EXPECT_LE(compressed_size, 6800u); + cparams.ba_params.intensity_target = 80.0f; + EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(1.5)); +} +TEST(ModularTest, RoundtripLossyDeltaPaletteWP) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CompressParams cparams; + cparams.SetLossless(); + cparams.lossy_palette = true; + cparams.palette_colors = 0; + cparams.options.predictor = jxl::Predictor::Weighted; + + CodecInOut io_out; + + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + io.ShrinkTo(300, 100); + + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size)); + EXPECT_LE(compressed_size, 7000u); + cparams.ba_params.intensity_target = 80.0f; + EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(10.1)); +} + +TEST(ModularTest, RoundtripLossy) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CompressParams cparams; + cparams.modular_mode = true; + cparams.butteraugli_distance = 2.f; + + CodecInOut io_out; + + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size)); + EXPECT_LE(compressed_size, 30000u); + cparams.ba_params.intensity_target = 80.0f; + EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(2.3)); +} + +TEST(ModularTest, RoundtripLossy16) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/raw.pixls/DJI-FC6310-16bit_709_v4_krita.png"); + CompressParams cparams; + cparams.modular_mode = true; + cparams.butteraugli_distance = 2.f; + + CodecInOut io_out; + + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + JXL_CHECK(!io.metadata.m.have_preview); + JXL_CHECK(io.frames.size() == 1); + JXL_CHECK(io.frames[0].TransformTo(ColorEncoding::SRGB(), GetJxlCms())); + io.metadata.m.color_encoding = ColorEncoding::SRGB(); + + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size)); + EXPECT_LE(compressed_size, 300u); + cparams.ba_params.intensity_target = 80.0f; + EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(1.6)); +} + +TEST(ModularTest, RoundtripExtraProperties) { + constexpr size_t kSize = 250; + Image image(kSize, kSize, /*bitdepth=*/8, 3); + ModularOptions options; + options.max_properties = 4; + options.predictor = Predictor::Zero; + Rng rng(0); + for (size_t y = 0; y < kSize; y++) { + for (size_t x = 0; x < kSize; x++) { + image.channel[0].plane.Row(y)[x] = image.channel[2].plane.Row(y)[x] = + rng.UniformU(0, 9); + } + } + ZeroFillImage(&image.channel[1].plane); + BitWriter writer; + ASSERT_TRUE(ModularGenericCompress(image, options, &writer)); + writer.ZeroPadToByte(); + Image decoded(kSize, kSize, /*bitdepth=*/8, image.channel.size()); + for (size_t i = 0; i < image.channel.size(); i++) { + const Channel& ch = image.channel[i]; + decoded.channel[i] = Channel(ch.w, ch.h, ch.hshift, ch.vshift); + } + Status status = true; + { + BitReader reader(writer.GetSpan()); + BitReaderScopedCloser closer(&reader, &status); + ASSERT_TRUE(ModularGenericDecompress(&reader, decoded, /*header=*/nullptr, + /*group_id=*/0, &options)); + } + ASSERT_TRUE(status); + ASSERT_EQ(image.channel.size(), decoded.channel.size()); + for (size_t c = 0; c < image.channel.size(); c++) { + for (size_t y = 0; y < image.channel[c].plane.ysize(); y++) { + for (size_t x = 0; x < image.channel[c].plane.xsize(); x++) { + EXPECT_EQ(image.channel[c].plane.Row(y)[x], + decoded.channel[c].plane.Row(y)[x]) + << "c = " << c << ", x = " << x << ", y = " << y; + } + } + } +} + +TEST(ModularTest, RoundtripLosslessCustomSqueeze) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + + CompressParams cparams; + cparams.modular_mode = true; + cparams.color_transform = jxl::ColorTransform::kNone; + cparams.butteraugli_distance = 0.f; + cparams.options.predictor = {Predictor::Zero}; + cparams.speed_tier = SpeedTier::kThunder; + cparams.responsive = 1; + // Custom squeeze params, atm just for testing + SqueezeParams p; + p.horizontal = true; + p.in_place = false; + p.begin_c = 0; + p.num_c = 3; + cparams.squeezes.push_back(p); + p.begin_c = 1; + p.in_place = true; + p.horizontal = false; + cparams.squeezes.push_back(p); + + CodecInOut io2; + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size)); + EXPECT_LE(compressed_size, 265000u); + JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io2.Main().color(), _)); +} + +struct RoundtripLosslessConfig { + int bitdepth; + int responsive; +}; +class ModularTestParam + : public ::testing::TestWithParam {}; + +std::vector GenerateLosslessTests() { + std::vector all; + for (int responsive = 0; responsive <= 1; responsive++) { + for (int bitdepth = 1; bitdepth < 32; bitdepth++) { + if (responsive && bitdepth > 30) continue; + all.push_back({bitdepth, responsive}); + } + } + return all; +} +std::string LosslessTestDescription( + const testing::TestParamInfo& info) { + std::stringstream name; + name << info.param.bitdepth << "bit"; + if (info.param.responsive) name << "Squeeze"; + return name.str(); +} + +JXL_GTEST_INSTANTIATE_TEST_SUITE_P(RoundtripLossless, ModularTestParam, + testing::ValuesIn(GenerateLosslessTests()), + LosslessTestDescription); + +TEST_P(ModularTestParam, RoundtripLossless) { + RoundtripLosslessConfig config = GetParam(); + int bitdepth = config.bitdepth; + int responsive = config.responsive; + + ThreadPool* pool = nullptr; + Rng generator(123); + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io1; + ASSERT_TRUE(SetFromBytes(Span(orig), &io1, pool)); + + // vary the dimensions a bit, in case of bugs related to + // even vs odd width or height. + size_t xsize = 423 + bitdepth; + size_t ysize = 467 + bitdepth; + + CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.SetUintSamples(bitdepth); + + double factor = ((1lu << bitdepth) - 1lu); + double ifactor = 1.0 / factor; + Image3F noise_added(xsize, ysize); + + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < ysize; y++) { + const float* in = io1.Main().color()->PlaneRow(c, y); + float* out = noise_added.PlaneRow(c, y); + for (size_t x = 0; x < xsize; x++) { + // make the least significant bits random + float f = in[x] + generator.UniformF(0.0f, 1.f / 255.f); + if (f > 1.f) f = 1.f; + // quantize to the bitdepth we're testing + unsigned int u = f * factor + 0.5; + out[x] = u * ifactor; + } + } + } + io.SetFromImage(std::move(noise_added), jxl::ColorEncoding::SRGB(false)); + + CompressParams cparams; + cparams.modular_mode = true; + cparams.color_transform = jxl::ColorTransform::kNone; + cparams.butteraugli_distance = 0.f; + cparams.options.predictor = {Predictor::Zero}; + cparams.speed_tier = SpeedTier::kThunder; + cparams.responsive = responsive; + CodecInOut io2; + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size)); + EXPECT_LE(compressed_size, bitdepth * xsize * ysize / 3); + EXPECT_LE(0, ComputeDistance2(io.Main(), io2.Main(), GetJxlCms())); + size_t different = 0; + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < ysize; y++) { + const float* in = io.Main().color()->PlaneRow(c, y); + const float* out = io2.Main().color()->PlaneRow(c, y); + for (size_t x = 0; x < xsize; x++) { + uint32_t uin = in[x] * factor + 0.5; + uint32_t uout = out[x] * factor + 0.5; + // check that the integer values are identical + if (uin != uout) different++; + } + } + } + EXPECT_EQ(different, 0); +} + +TEST(ModularTest, RoundtripLosslessCustomFloat) { + CodecInOut io; + size_t xsize = 100, ysize = 300; + io.SetSize(xsize, ysize); + io.metadata.m.bit_depth.bits_per_sample = 18; + io.metadata.m.bit_depth.exponent_bits_per_sample = 6; + io.metadata.m.bit_depth.floating_point_sample = true; + io.metadata.m.modular_16_bit_buffer_sufficient = false; + ColorEncoding color_encoding; + color_encoding.tf.SetTransferFunction(TransferFunction::kLinear); + color_encoding.SetColorSpace(ColorSpace::kRGB); + Image3F testimage(xsize, ysize); + float factor = 1.f / (1 << 14); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < ysize; y++) { + float* const JXL_RESTRICT row = testimage.PlaneRow(c, y); + for (size_t x = 0; x < xsize; x++) { + row[x] = factor * (x ^ y); + } + } + } + io.SetFromImage(std::move(testimage), color_encoding); + io.metadata.m.color_encoding = color_encoding; + io.metadata.m.SetIntensityTarget(255); + + CompressParams cparams; + cparams.modular_mode = true; + cparams.color_transform = jxl::ColorTransform::kNone; + cparams.butteraugli_distance = 0.f; + cparams.options.predictor = {Predictor::Zero}; + cparams.speed_tier = SpeedTier::kThunder; + cparams.decoding_speed_tier = 2; + + CodecInOut io2; + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size)); + EXPECT_LE(compressed_size, 23000u); + JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io2.Main().color(), _)); +} + +void WriteHeaders(BitWriter* writer, size_t xsize, size_t ysize) { + BitWriter::Allotment allotment(writer, 16); + writer->Write(8, 0xFF); + writer->Write(8, kCodestreamMarker); + allotment.ReclaimAndCharge(writer, 0, nullptr); + CodecMetadata metadata; + EXPECT_TRUE(metadata.size.Set(xsize, ysize)); + EXPECT_TRUE(WriteSizeHeader(metadata.size, writer, 0, nullptr)); + metadata.m.color_encoding = ColorEncoding::LinearSRGB(/*is_gray=*/true); + metadata.m.xyb_encoded = false; + metadata.m.SetUintSamples(31); + EXPECT_TRUE(WriteImageMetadata(metadata.m, writer, 0, nullptr)); + metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded; + EXPECT_TRUE(Bundle::Write(metadata.transform_data, writer, 0, nullptr)); + writer->ZeroPadToByte(); + FrameHeader frame_header(&metadata); + frame_header.encoding = FrameEncoding::kModular; + frame_header.loop_filter.gab = false; + frame_header.loop_filter.epf_iters = 0; + EXPECT_TRUE(WriteFrameHeader(frame_header, writer, nullptr)); +} + +// Tree with single node, zero predictor, offset is 1 and multiplier is 1, +// entropy code is prefix tree with alphabet size 256 and all bits lengths 8. +void WriteHistograms(BitWriter* writer) { + writer->Write(1, 1); // default DC quant + writer->Write(1, 1); // has_tree + // tree histograms + writer->Write(1, 0); // LZ77 disabled + writer->Write(3, 1); // simple context map + writer->Write(1, 1); // prefix code + writer->Write(7, 0x63); // UnintConfig(3, 2, 1) + writer->Write(12, 0xfef); // alphabet_size = 256 + writer->Write(32, 0x10003); // all bit lengths 8 + // tree tokens + writer->Write(8, 0); // tree leaf + writer->Write(8, 0); // zero predictor + writer->Write(8, 64); // offset = UnpackSigned(ReverseBits(64)) = 1 + writer->Write(16, 0); // multiplier = 1 + // histograms + writer->Write(1, 0); // LZ77 disabled + writer->Write(1, 1); // prefix code + writer->Write(7, 0x63); // UnintConfig(3, 2, 1) + writer->Write(12, 0xfef); // alphabet_size = 256 + writer->Write(32, 0x10003); // all bit lengths 8 +} + +TEST(ModularTest, PredictorIntegerOverflow) { + const size_t xsize = 1; + const size_t ysize = 1; + BitWriter writer; + WriteHeaders(&writer, xsize, ysize); + std::vector group_codes(1); + { + BitWriter* bw = &group_codes[0]; + BitWriter::Allotment allotment(bw, 1 << 20); + WriteHistograms(bw); + GroupHeader header; + header.use_global_tree = true; + EXPECT_TRUE(Bundle::Write(header, bw, 0, nullptr)); + // After UnpackSigned this becomes (1 << 31) - 1, the largest pixel_type, + // and after adding the offset we get -(1 << 31). + bw->Write(8, 119); + bw->Write(28, 0xfffffff); + bw->ZeroPadToByte(); + allotment.ReclaimAndCharge(bw, 0, nullptr); + } + EXPECT_TRUE(WriteGroupOffsets(group_codes, nullptr, &writer, nullptr)); + writer.AppendByteAligned(group_codes); + + PaddedBytes compressed = std::move(writer).TakeBytes(); + extras::PackedPixelFile ppf; + extras::JXLDecompressParams params; + params.accepted_formats.push_back({1, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0}); + EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), params, + nullptr, &ppf)); + ASSERT_EQ(1, ppf.frames.size()); + const auto& img = ppf.frames[0].color; + const auto pixels = reinterpret_cast(img.pixels()); + EXPECT_EQ(-1.0f, pixels[0]); +} + +TEST(ModularTest, UnsqueezeIntegerOverflow) { + // Image width is 9 so we can test both the SIMD and non-vector code paths. + const size_t xsize = 9; + const size_t ysize = 2; + BitWriter writer; + WriteHeaders(&writer, xsize, ysize); + std::vector group_codes(1); + { + BitWriter* bw = &group_codes[0]; + BitWriter::Allotment allotment(bw, 1 << 20); + WriteHistograms(bw); + GroupHeader header; + header.use_global_tree = true; + header.transforms.emplace_back(); + header.transforms[0].id = TransformId::kSqueeze; + SqueezeParams params; + params.horizontal = false; + params.in_place = true; + params.begin_c = 0; + params.num_c = 1; + header.transforms[0].squeezes.emplace_back(params); + EXPECT_TRUE(Bundle::Write(header, bw, 0, nullptr)); + for (size_t i = 0; i < xsize * ysize; ++i) { + // After UnpackSigned and adding offset, this becomes (1 << 31) - 1, both + // in the image and in the residual channels, and unsqueeze makes them + // ~(3 << 30) and (1 << 30) (in pixel_type_w) and the first wraps around + // to about -(1 << 30). + bw->Write(8, 119); + bw->Write(28, 0xffffffe); + } + bw->ZeroPadToByte(); + allotment.ReclaimAndCharge(bw, 0, nullptr); + } + EXPECT_TRUE(WriteGroupOffsets(group_codes, nullptr, &writer, nullptr)); + writer.AppendByteAligned(group_codes); + + PaddedBytes compressed = std::move(writer).TakeBytes(); + extras::PackedPixelFile ppf; + extras::JXLDecompressParams params; + params.accepted_formats.push_back({1, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0}); + EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), params, + nullptr, &ppf)); + ASSERT_EQ(1, ppf.frames.size()); + const auto& img = ppf.frames[0].color; + const auto pixels = reinterpret_cast(img.pixels()); + for (size_t x = 0; x < xsize; ++x) { + EXPECT_NEAR(-0.5f, pixels[x], 1e-10); + EXPECT_NEAR(0.5f, pixels[xsize + x], 1e-10); + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/noise.h b/third_party/jpeg-xl/lib/jxl/noise.h new file mode 100644 index 0000000000..d897ea3abe --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/noise.h @@ -0,0 +1,60 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_NOISE_H_ +#define LIB_JXL_NOISE_H_ + +// Noise parameters shared by encoder/decoder. + +#include + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +const float kNoisePrecision = 1 << 10; + +struct NoiseParams { + // LUT index is an intensity of pixel / mean intensity of patch + static constexpr size_t kNumNoisePoints = 8; + float lut[kNumNoisePoints]; + + void Clear() { + for (float& i : lut) i = 0.f; + } + bool HasAny() const { + for (float i : lut) { + if (std::abs(i) > 1e-3f) return true; + } + return false; + } +}; + +static inline std::pair IndexAndFrac(float x) { + constexpr size_t kScaleNumerator = NoiseParams::kNumNoisePoints - 2; + // TODO: instead of 1, this should be a proper Y range. + constexpr float kScale = kScaleNumerator / 1; + float scaled_x = std::max(0.f, x * kScale); + float floor_x; + float frac_x = std::modf(scaled_x, &floor_x); + if (JXL_UNLIKELY(scaled_x >= kScaleNumerator + 1)) { + floor_x = kScaleNumerator; + frac_x = 1.f; + } + return std::make_pair(static_cast(floor_x), frac_x); +} + +struct NoiseLevel { + float noise_level; + float intensity; +}; + +} // namespace jxl + +#endif // LIB_JXL_NOISE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc b/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc new file mode 100644 index 0000000000..07fd824f14 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc @@ -0,0 +1,123 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/image.h" +#include "lib/jxl/matrix_ops.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { +namespace { + +// Convert a single linear sRGB color to xyb, using the exact image conversion +// procedure that jpeg xl uses. +void LinearSrgbToOpsin(float rgb_r, float rgb_g, float rgb_b, + float* JXL_RESTRICT xyb_x, float* JXL_RESTRICT xyb_y, + float* JXL_RESTRICT xyb_b) { + Image3F linear(1, 1); + linear.PlaneRow(0, 0)[0] = rgb_r; + linear.PlaneRow(1, 0)[0] = rgb_g; + linear.PlaneRow(2, 0)[0] = rgb_b; + + ImageMetadata metadata; + metadata.SetFloat32Samples(); + metadata.color_encoding = ColorEncoding::LinearSRGB(); + ImageBundle ib(&metadata); + ib.SetFromImage(std::move(linear), metadata.color_encoding); + Image3F opsin(1, 1); + (void)ToXYB(ib, /*pool=*/nullptr, &opsin, GetJxlCms()); + + *xyb_x = opsin.PlaneRow(0, 0)[0]; + *xyb_y = opsin.PlaneRow(1, 0)[0]; + *xyb_b = opsin.PlaneRow(2, 0)[0]; +} + +// Convert a single XYB color to linear sRGB, using the exact image conversion +// procedure that jpeg xl uses. +void OpsinToLinearSrgb(float xyb_x, float xyb_y, float xyb_b, + float* JXL_RESTRICT rgb_r, float* JXL_RESTRICT rgb_g, + float* JXL_RESTRICT rgb_b) { + Image3F opsin(1, 1); + opsin.PlaneRow(0, 0)[0] = xyb_x; + opsin.PlaneRow(1, 0)[0] = xyb_y; + opsin.PlaneRow(2, 0)[0] = xyb_b; + Image3F linear(1, 1); + OpsinParams opsin_params; + opsin_params.Init(/*intensity_target=*/255.0f); + OpsinToLinear(opsin, Rect(opsin), nullptr, &linear, opsin_params); + *rgb_r = linear.PlaneRow(0, 0)[0]; + *rgb_g = linear.PlaneRow(1, 0)[0]; + *rgb_b = linear.PlaneRow(2, 0)[0]; +} + +void OpsinRoundtripTestRGB(float r, float g, float b) { + float xyb_x, xyb_y, xyb_b; + LinearSrgbToOpsin(r, g, b, &xyb_x, &xyb_y, &xyb_b); + float r2, g2, b2; + OpsinToLinearSrgb(xyb_x, xyb_y, xyb_b, &r2, &g2, &b2); + EXPECT_NEAR(r, r2, 1e-3); + EXPECT_NEAR(g, g2, 1e-3); + EXPECT_NEAR(b, b2, 1e-3); +} + +TEST(OpsinImageTest, VerifyOpsinAbsorbanceInverseMatrix) { + float matrix[9]; // writable copy + for (int i = 0; i < 9; i++) { + matrix[i] = GetOpsinAbsorbanceInverseMatrix()[i]; + } + EXPECT_TRUE(Inv3x3Matrix(matrix)); + for (int i = 0; i < 9; i++) { + EXPECT_NEAR(matrix[i], kOpsinAbsorbanceMatrix[i], 1e-6); + } +} + +TEST(OpsinImageTest, OpsinRoundtrip) { + OpsinRoundtripTestRGB(0, 0, 0); + OpsinRoundtripTestRGB(1. / 255, 1. / 255, 1. / 255); + OpsinRoundtripTestRGB(128. / 255, 128. / 255, 128. / 255); + OpsinRoundtripTestRGB(1, 1, 1); + + OpsinRoundtripTestRGB(0, 0, 1. / 255); + OpsinRoundtripTestRGB(0, 0, 128. / 255); + OpsinRoundtripTestRGB(0, 0, 1); + + OpsinRoundtripTestRGB(0, 1. / 255, 0); + OpsinRoundtripTestRGB(0, 128. / 255, 0); + OpsinRoundtripTestRGB(0, 1, 0); + + OpsinRoundtripTestRGB(1. / 255, 0, 0); + OpsinRoundtripTestRGB(128. / 255, 0, 0); + OpsinRoundtripTestRGB(1, 0, 0); +} + +TEST(OpsinImageTest, VerifyZero) { + // Test that black color (zero energy) is 0,0,0 in xyb. + float x, y, b; + LinearSrgbToOpsin(0, 0, 0, &x, &y, &b); + EXPECT_NEAR(0, x, 1e-9); + EXPECT_NEAR(0, y, 1e-7); + EXPECT_NEAR(0, b, 1e-7); +} + +TEST(OpsinImageTest, VerifyGray) { + // Test that grayscale colors have a fixed y/b ratio and x==0. + for (size_t i = 1; i < 255; i++) { + float x, y, b; + LinearSrgbToOpsin(i / 255., i / 255., i / 255., &x, &y, &b); + EXPECT_NEAR(0, x, 1e-6); + EXPECT_NEAR(kYToBRatio, b / y, 3e-5); + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc b/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc new file mode 100644 index 0000000000..088253c2ce --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc @@ -0,0 +1,57 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +TEST(OpsinInverseTest, LinearInverseInverts) { + Image3F linear(128, 128); + RandomFillImage(&linear, 0.0f, 1.0f); + + CodecInOut io; + io.metadata.m.SetFloat32Samples(); + io.metadata.m.color_encoding = ColorEncoding::LinearSRGB(); + io.SetFromImage(CopyImage(linear), io.metadata.m.color_encoding); + ThreadPool* null_pool = nullptr; + Image3F opsin(io.xsize(), io.ysize()); + (void)ToXYB(io.Main(), null_pool, &opsin, GetJxlCms()); + + OpsinParams opsin_params; + opsin_params.Init(/*intensity_target=*/255.0f); + OpsinToLinearInplace(&opsin, /*pool=*/nullptr, opsin_params); + + JXL_ASSERT_OK(VerifyRelativeError(linear, opsin, 3E-3, 2E-4, _)); +} + +TEST(OpsinInverseTest, YcbCrInverts) { + Image3F rgb(128, 128); + RandomFillImage(&rgb, 0.0f, 1.0f); + + ThreadPool* null_pool = nullptr; + Image3F ycbcr(rgb.xsize(), rgb.ysize()); + EXPECT_TRUE(RgbToYcbcr(rgb.Plane(0), rgb.Plane(1), rgb.Plane(2), + &ycbcr.Plane(1), &ycbcr.Plane(0), &ycbcr.Plane(2), + null_pool)); + + Image3F rgb2(rgb.xsize(), rgb.ysize()); + YcbcrToRgb(ycbcr, &rgb2, Rect(rgb)); + + JXL_ASSERT_OK(VerifyRelativeError(rgb, rgb2, 4E-5, 4E-7, _)); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/opsin_params.cc b/third_party/jpeg-xl/lib/jxl/opsin_params.cc new file mode 100644 index 0000000000..ec3db4ee76 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/opsin_params.cc @@ -0,0 +1,44 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/opsin_params.h" + +#include + +#include "lib/jxl/matrix_ops.h" + +namespace jxl { + +#define INVERSE_OPSIN_FROM_SPEC 1 + +const float* GetOpsinAbsorbanceInverseMatrix() { +#if INVERSE_OPSIN_FROM_SPEC + return DefaultInverseOpsinAbsorbanceMatrix(); +#else // INVERSE_OPSIN_FROM_SPEC + // Compute the inverse opsin matrix from the forward matrix. Less precise + // than taking the values from the specification, but must be used if the + // forward transform is changed and the spec will require updating. + static const float* const kInverse = [] { + static float inverse[9]; + for (int i = 0; i < 9; i++) { + inverse[i] = kOpsinAbsorbanceMatrix[i]; + } + Inv3x3Matrix(inverse); + return inverse; + }(); + return kInverse; +#endif // INVERSE_OPSIN_FROM_SPEC +} + +void InitSIMDInverseMatrix(const float* JXL_RESTRICT inverse, + float* JXL_RESTRICT simd_inverse, + float intensity_target) { + for (size_t i = 0; i < 9; ++i) { + simd_inverse[4 * i] = simd_inverse[4 * i + 1] = simd_inverse[4 * i + 2] = + simd_inverse[4 * i + 3] = inverse[i] * (255.0f / intensity_target); + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/opsin_params.h b/third_party/jpeg-xl/lib/jxl/opsin_params.h new file mode 100644 index 0000000000..3a7da97d8a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/opsin_params.h @@ -0,0 +1,86 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_OPSIN_PARAMS_H_ +#define LIB_JXL_OPSIN_PARAMS_H_ + +// Constants that define the XYB color space. + +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +// Parameters for opsin absorbance. +static const float kM02 = 0.078f; +static const float kM00 = 0.30f; +static const float kM01 = 1.0f - kM02 - kM00; + +static const float kM12 = 0.078f; +static const float kM10 = 0.23f; +static const float kM11 = 1.0f - kM12 - kM10; + +static const float kM20 = 0.24342268924547819f; +static const float kM21 = 0.20476744424496821f; +static const float kM22 = 1.0f - kM20 - kM21; + +static const float kBScale = 1.0f; +static const float kYToBRatio = 1.0f; // works better with 0.50017729543783418 +static const float kBToYRatio = 1.0f / kYToBRatio; + +static const float kB0 = 0.0037930732552754493f; +static const float kB1 = kB0; +static const float kB2 = kB0; + +// Opsin absorbance matrix is now frozen. +static const float kOpsinAbsorbanceMatrix[9] = { + kM00, kM01, kM02, kM10, kM11, kM12, kM20, kM21, kM22, +}; + +// Must be the inverse matrix of kOpsinAbsorbanceMatrix and match the spec. +static inline const float* DefaultInverseOpsinAbsorbanceMatrix() { + static float kDefaultInverseOpsinAbsorbanceMatrix[9] = { + 11.031566901960783f, -9.866943921568629f, -0.16462299647058826f, + -3.254147380392157f, 4.418770392156863f, -0.16462299647058826f, + -3.6588512862745097f, 2.7129230470588235f, 1.9459282392156863f}; + return kDefaultInverseOpsinAbsorbanceMatrix; +} + +// Returns 3x3 row-major matrix inverse of kOpsinAbsorbanceMatrix. +// opsin_image_test verifies this is actually the inverse. +const float* GetOpsinAbsorbanceInverseMatrix(); + +void InitSIMDInverseMatrix(const float* JXL_RESTRICT inverse, + float* JXL_RESTRICT simd_inverse, + float intensity_target); + +static const float kOpsinAbsorbanceBias[3] = { + kB0, + kB1, + kB2, +}; + +static const float kNegOpsinAbsorbanceBiasRGB[4] = { + -kOpsinAbsorbanceBias[0], -kOpsinAbsorbanceBias[1], + -kOpsinAbsorbanceBias[2], 1.0f}; + +static const float kScaledXYBOffset[3] = { + 0.015386134f, + 0.0f, + 0.27770459f, +}; + +static const float kScaledXYBScale[3] = { + 22.995788804f, + 1.183000077f, + 1.502141333f, +}; + +} // namespace jxl + +#endif // LIB_JXL_OPSIN_PARAMS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc b/third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc new file mode 100644 index 0000000000..9ca7a22423 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc @@ -0,0 +1,126 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/padded_bytes.h" + +#include // iota +#include + +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +TEST(PaddedBytesTest, TestNonEmptyFirstByteZero) { + PaddedBytes pb(1); + EXPECT_EQ(0, pb[0]); + // Even after resizing.. + pb.resize(20); + EXPECT_EQ(0, pb[0]); + // And reserving. + pb.reserve(200); + EXPECT_EQ(0, pb[0]); +} + +TEST(PaddedBytesTest, TestEmptyFirstByteZero) { + PaddedBytes pb(0); + // After resizing - new zero is written despite there being nothing to copy. + pb.resize(20); + EXPECT_EQ(0, pb[0]); +} + +TEST(PaddedBytesTest, TestFillWithoutReserve) { + PaddedBytes pb; + for (size_t i = 0; i < 170u; ++i) { + pb.push_back(i); + } + EXPECT_EQ(170u, pb.size()); + EXPECT_GE(pb.capacity(), 170u); +} + +TEST(PaddedBytesTest, TestFillWithExactReserve) { + PaddedBytes pb; + pb.reserve(170); + for (size_t i = 0; i < 170u; ++i) { + pb.push_back(i); + } + EXPECT_EQ(170u, pb.size()); + EXPECT_EQ(pb.capacity(), 170u); +} + +TEST(PaddedBytesTest, TestFillWithMoreReserve) { + PaddedBytes pb; + pb.reserve(171); + for (size_t i = 0; i < 170u; ++i) { + pb.push_back(i); + } + EXPECT_EQ(170u, pb.size()); + EXPECT_GT(pb.capacity(), 170u); +} + +// Can assign() a subset of the valid data. +TEST(PaddedBytesTest, TestAssignFromWithin) { + PaddedBytes pb; + pb.reserve(256); + for (size_t i = 0; i < 256; ++i) { + pb.push_back(i); + } + pb.assign(pb.data() + 64, pb.data() + 192); + EXPECT_EQ(128u, pb.size()); + for (size_t i = 0; i < 128; ++i) { + EXPECT_EQ(i + 64, pb[i]); + } +} + +// Can assign() a range with both valid and previously-allocated data. +TEST(PaddedBytesTest, TestAssignReclaim) { + PaddedBytes pb; + pb.reserve(256); + for (size_t i = 0; i < 256; ++i) { + pb.push_back(i); + } + + const uint8_t* mem = pb.data(); + pb.resize(200); + // Just shrank without reallocating + EXPECT_EQ(mem, pb.data()); + EXPECT_EQ(256u, pb.capacity()); + + // Reclaim part of initial allocation + pb.assign(pb.data() + 100, pb.data() + 240); + EXPECT_EQ(140u, pb.size()); + + for (size_t i = 0; i < 140; ++i) { + EXPECT_EQ(i + 100, pb[i]); + } +} + +// Can assign() smaller and larger ranges outside the current allocation. +TEST(PaddedBytesTest, TestAssignOutside) { + PaddedBytes pb; + pb.resize(400); + std::iota(pb.begin(), pb.end(), 1); + + std::vector small(64); + std::iota(small.begin(), small.end(), 500); + + pb.assign(small.data(), small.data() + small.size()); + EXPECT_EQ(64u, pb.size()); + for (size_t i = 0; i < 64; ++i) { + EXPECT_EQ((i + 500) & 0xFF, pb[i]); + } + + std::vector large(1000); + std::iota(large.begin(), large.end(), 600); + + pb.assign(large.data(), large.data() + large.size()); + EXPECT_EQ(1000u, pb.size()); + for (size_t i = 0; i < 1000; ++i) { + EXPECT_EQ((i + 600) & 0xFF, pb[i]); + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/passes_state.cc b/third_party/jpeg-xl/lib/jxl/passes_state.cc new file mode 100644 index 0000000000..2f287ec9b6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/passes_state.cc @@ -0,0 +1,70 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/passes_state.h" + +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/common.h" + +namespace jxl { + +Status InitializePassesSharedState(const FrameHeader& frame_header, + PassesSharedState* JXL_RESTRICT shared, + bool encoder) { + JXL_ASSERT(frame_header.nonserialized_metadata != nullptr); + shared->frame_header = frame_header; + shared->metadata = frame_header.nonserialized_metadata; + shared->frame_dim = frame_header.ToFrameDimensions(); + shared->image_features.patches.SetPassesSharedState(shared); + + const FrameDimensions& frame_dim = shared->frame_dim; + + shared->ac_strategy = + AcStrategyImage(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + shared->raw_quant_field = + ImageI(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + shared->epf_sharpness = + ImageB(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + shared->cmap = ColorCorrelationMap(frame_dim.xsize, frame_dim.ysize); + + // In the decoder, we allocate coeff orders afterwards, when we know how many + // we will actually need. + shared->coeff_order_size = kCoeffOrderMaxSize; + if (encoder && + shared->coeff_orders.size() < + frame_header.passes.num_passes * kCoeffOrderMaxSize && + frame_header.encoding == FrameEncoding::kVarDCT) { + shared->coeff_orders.resize(frame_header.passes.num_passes * + kCoeffOrderMaxSize); + } + + shared->quant_dc = ImageB(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + + bool use_dc_frame = !!(frame_header.flags & FrameHeader::kUseDcFrame); + if (!encoder && use_dc_frame) { + if (frame_header.dc_level == 4) { + return JXL_FAILURE("Invalid DC level for kUseDcFrame: %u", + frame_header.dc_level); + } + shared->dc_storage = Image3F(); + shared->dc = &shared->dc_frames[frame_header.dc_level]; + if (shared->dc->xsize() == 0) { + return JXL_FAILURE( + "kUseDcFrame specified for dc_level %u, but no frame was decoded " + "with level %u", + frame_header.dc_level, frame_header.dc_level + 1); + } + ZeroFillImage(&shared->quant_dc); + } else { + shared->dc_storage = + Image3F(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + shared->dc = &shared->dc_storage; + } + + return true; +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/passes_state.h b/third_party/jpeg-xl/lib/jxl/passes_state.h new file mode 100644 index 0000000000..8d648a8feb --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/passes_state.h @@ -0,0 +1,133 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_PASSES_STATE_H_ +#define LIB_JXL_PASSES_STATE_H_ + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/noise.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" + +// Structures that hold the (en/de)coder state for a JPEG XL kVarDCT +// (en/de)coder. + +namespace jxl { + +struct ImageFeatures { + NoiseParams noise_params; + PatchDictionary patches; + Splines splines; +}; + +// State common to both encoder and decoder. +// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) +struct PassesSharedState { + PassesSharedState() : frame_header(nullptr) {} + + // Headers and metadata. + const CodecMetadata* metadata; + FrameHeader frame_header; + + FrameDimensions frame_dim; + + // Control fields and parameters. + AcStrategyImage ac_strategy; + + // Dequant matrices + quantizer. + DequantMatrices matrices; + Quantizer quantizer{&matrices}; + ImageI raw_quant_field; + + // Per-block side information for EPF detail preservation. + ImageB epf_sharpness; + + ColorCorrelationMap cmap; + + ImageFeatures image_features; + + // Memory area for storing coefficient orders. + // `coeff_order_size` is the size used by *one* set of coefficient orders (at + // most kMaxCoeffOrderSize). A set of coefficient orders is present for each + // pass. + size_t coeff_order_size = 0; + std::vector coeff_orders; + + // Decoder-side DC and quantized DC. + ImageB quant_dc; + Image3F dc_storage; + const Image3F* JXL_RESTRICT dc = &dc_storage; + + BlockCtxMap block_ctx_map; + + Image3F dc_frames[4]; + + struct { + ImageBundle frame; + // ImageBundle doesn't yet have a simple way to state it is in XYB. + bool ib_is_in_xyb = false; + } reference_frames[4] = {}; + + // Number of pre-clustered set of histograms (with the same ctx map), per + // pass. Encoded as num_histograms_ - 1. + size_t num_histograms = 0; + + bool IsGrayscale() const { return metadata->m.color_encoding.IsGray(); } + + Rect GroupRect(size_t group_index) const { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim, + frame_dim.group_dim, frame_dim.group_dim, frame_dim.xsize, + frame_dim.ysize); + return rect; + } + + Rect PaddedGroupRect(size_t group_index) const { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim, + frame_dim.group_dim, frame_dim.group_dim, + frame_dim.xsize_padded, frame_dim.ysize_padded); + return rect; + } + + Rect BlockGroupRect(size_t group_index) const { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + const Rect rect(gx * (frame_dim.group_dim >> 3), + gy * (frame_dim.group_dim >> 3), frame_dim.group_dim >> 3, + frame_dim.group_dim >> 3, frame_dim.xsize_blocks, + frame_dim.ysize_blocks); + return rect; + } + + Rect DCGroupRect(size_t group_index) const { + const size_t gx = group_index % frame_dim.xsize_dc_groups; + const size_t gy = group_index / frame_dim.xsize_dc_groups; + const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim, + frame_dim.group_dim, frame_dim.group_dim, + frame_dim.xsize_blocks, frame_dim.ysize_blocks); + return rect; + } +}; + +// Initialized the state information that is shared between encoder and decoder. +Status InitializePassesSharedState(const FrameHeader& frame_header, + PassesSharedState* JXL_RESTRICT shared, + bool encoder = false); + +} // namespace jxl + +#endif // LIB_JXL_PASSES_STATE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/passes_test.cc b/third_party/jpeg-xl/lib/jxl/passes_test.cc new file mode 100644 index 0000000000..97d776f941 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/passes_test.cc @@ -0,0 +1,402 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include +#include +#include + +#include "lib/extras/codec.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { + +using test::Roundtrip; +using test::ThreadPoolForTests; + +namespace { + +TEST(PassesTest, RoundtripSmallPasses) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + cparams.progressive_mode = true; + + CodecInOut io2; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _)); + EXPECT_THAT( + ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(1.1)); +} + +TEST(PassesTest, RoundtripUnalignedPasses) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + io.ShrinkTo(io.xsize() / 12, io.ysize() / 7); + + CompressParams cparams; + cparams.butteraugli_distance = 2.0; + cparams.progressive_mode = true; + + CodecInOut io2; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _)); + EXPECT_THAT( + ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(1.72)); +} + +TEST(PassesTest, RoundtripMultiGroupPasses) { + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + CodecInOut io; + { + ThreadPoolForTests pool(4); + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + } + io.ShrinkTo(600, 1024); // partial X, full Y group + + auto test = [&](float target_distance, float threshold) { + ThreadPoolForTests pool(4); + CompressParams cparams; + cparams.butteraugli_distance = target_distance; + cparams.progressive_mode = true; + CodecInOut io2; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, + /* compressed_size */ nullptr, &pool)); + EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr, &pool), + IsSlightlyBelow(target_distance + threshold)); + }; + + auto run1 = std::async(std::launch::async, test, 1.0f, 0.5f); + auto run2 = std::async(std::launch::async, test, 2.0f, 0.5f); +} + +TEST(PassesTest, RoundtripLargeFastPasses) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_mode = true; + + CodecInOut io2; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, + /* comrpessed_size */ nullptr, &pool)); +} + +// Checks for differing size/distance in two consecutive runs of distance 2, +// which involves additional processing including adaptive reconstruction. +// Failing this may be a sign of race conditions or invalid memory accesses. +TEST(PassesTest, RoundtripProgressiveConsistent) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_mode = true; + cparams.butteraugli_distance = 2.0; + + // Try each xsize mod kBlockDim to verify right border handling. + for (size_t xsize = 48; xsize > 40; --xsize) { + io.ShrinkTo(xsize, 15); + + CodecInOut io2; + size_t size2; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &size2, &pool)); + + CodecInOut io3; + size_t size3; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io3, _, &size3, &pool)); + + // Exact same compressed size. + EXPECT_EQ(size2, size3); + + // Exact same distance. + const float dist2 = ButteraugliDistance(io.frames, io2.frames, + cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr, &pool); + const float dist3 = ButteraugliDistance(io.frames, io3.frames, + cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr, &pool); + EXPECT_EQ(dist2, dist3); + } +} + +TEST(PassesTest, AllDownsampleFeasible) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + PaddedBytes compressed; + AuxOut aux; + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_mode = true; + cparams.butteraugli_distance = 1.0; + PassesEncoderState enc_state; + ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), + &aux, &pool)); + + EXPECT_LE(compressed.size(), 240000u); + float target_butteraugli[9] = {}; + target_butteraugli[1] = 2.5f; + target_butteraugli[2] = 16.0f; + target_butteraugli[4] = 20.0f; + target_butteraugli[8] = 80.0f; + + // The default progressive encoding scheme should make all these downsampling + // factors achievable. + // TODO(veluca): re-enable downsampling 16. + std::vector downsamplings = {1, 2, 4, 8}; //, 16}; + + auto check = [&](const uint32_t task, size_t /* thread */) -> void { + const size_t downsampling = downsamplings[task]; + extras::JXLDecompressParams dparams; + dparams.max_downsampling = downsampling; + CodecInOut output; + ASSERT_TRUE( + test::DecodeFile(dparams, Span(compressed), &output)); + EXPECT_EQ(output.xsize(), io.xsize()) << "downsampling = " << downsampling; + EXPECT_EQ(output.ysize(), io.ysize()) << "downsampling = " << downsampling; + EXPECT_LE(ButteraugliDistance(io.frames, output.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr, nullptr), + target_butteraugli[downsampling]) + << "downsampling: " << downsampling; + }; + EXPECT_TRUE(RunOnPool(&pool, 0, downsamplings.size(), ThreadPool::NoInit, + check, "TestDownsampling")); +} + +TEST(PassesTest, AllDownsampleFeasibleQProgressive) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + PaddedBytes compressed; + AuxOut aux; + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.qprogressive_mode = true; + cparams.butteraugli_distance = 1.0; + PassesEncoderState enc_state; + ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), + &aux, &pool)); + + EXPECT_LE(compressed.size(), 220000u); + + float target_butteraugli[9] = {}; + target_butteraugli[1] = 3.0f; + target_butteraugli[2] = 6.0f; + target_butteraugli[4] = 10.0f; + target_butteraugli[8] = 80.0f; + + // The default progressive encoding scheme should make all these downsampling + // factors achievable. + std::vector downsamplings = {1, 2, 4, 8}; + + auto check = [&](const uint32_t task, size_t /* thread */) -> void { + const size_t downsampling = downsamplings[task]; + extras::JXLDecompressParams dparams; + dparams.max_downsampling = downsampling; + CodecInOut output; + ASSERT_TRUE( + test::DecodeFile(dparams, Span(compressed), &output)); + EXPECT_EQ(output.xsize(), io.xsize()) << "downsampling = " << downsampling; + EXPECT_EQ(output.ysize(), io.ysize()) << "downsampling = " << downsampling; + EXPECT_LE(ButteraugliDistance(io.frames, output.frames, cparams.ba_params, + GetJxlCms(), + /*distmap=*/nullptr), + target_butteraugli[downsampling]) + << "downsampling: " << downsampling; + }; + EXPECT_TRUE(RunOnPool(&pool, 0, downsamplings.size(), ThreadPool::NoInit, + check, "TestQProgressive")); +} + +TEST(PassesTest, ProgressiveDownsample2DegradesCorrectlyGrayscale) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png"); + CodecInOut io_orig; + ASSERT_TRUE(SetFromBytes(Span(orig), &io_orig, &pool)); + Rect rect(0, 0, io_orig.xsize(), 128); + // need 2 DC groups for the DC frame to actually be progressive. + Image3F large(4242, rect.ysize()); + ZeroFillImage(&large); + CopyImageTo(rect, *io_orig.Main().color(), rect, &large); + CodecInOut io; + io.metadata = io_orig.metadata; + io.SetFromImage(std::move(large), io_orig.Main().c_current()); + + PaddedBytes compressed; + AuxOut aux; + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_dc = 1; + cparams.responsive = true; + cparams.qprogressive_mode = true; + cparams.butteraugli_distance = 1.0; + PassesEncoderState enc_state; + ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), + &aux, &pool)); + + EXPECT_LE(compressed.size(), 10000u); + + extras::JXLDecompressParams dparams; + dparams.max_downsampling = 1; + CodecInOut output; + ASSERT_TRUE( + test::DecodeFile(dparams, Span(compressed), &output)); + + dparams.max_downsampling = 2; + CodecInOut output_d2; + ASSERT_TRUE( + test::DecodeFile(dparams, Span(compressed), &output_d2)); + + // 0 if reading all the passes, ~15 if skipping the 8x pass. + float butteraugli_distance_down2_full = ButteraugliDistance( + output.frames, output_d2.frames, cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr); + + EXPECT_LE(butteraugli_distance_down2_full, 3.2f); + EXPECT_GE(butteraugli_distance_down2_full, 1.0f); +} + +TEST(PassesTest, ProgressiveDownsample2DegradesCorrectly) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + CodecInOut io_orig; + ASSERT_TRUE(SetFromBytes(Span(orig), &io_orig, &pool)); + Rect rect(0, 0, io_orig.xsize(), 128); + // need 2 DC groups for the DC frame to actually be progressive. + Image3F large(4242, rect.ysize()); + ZeroFillImage(&large); + CopyImageTo(rect, *io_orig.Main().color(), rect, &large); + CodecInOut io; + io.SetFromImage(std::move(large), io_orig.Main().c_current()); + + PaddedBytes compressed; + AuxOut aux; + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_dc = 1; + cparams.responsive = true; + cparams.qprogressive_mode = true; + cparams.butteraugli_distance = 1.0; + PassesEncoderState enc_state; + ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), + &aux, &pool)); + + EXPECT_LE(compressed.size(), 220000u); + + extras::JXLDecompressParams dparams; + dparams.max_downsampling = 1; + CodecInOut output; + ASSERT_TRUE( + test::DecodeFile(dparams, Span(compressed), &output)); + + dparams.max_downsampling = 2; + CodecInOut output_d2; + ASSERT_TRUE( + test::DecodeFile(dparams, Span(compressed), &output_d2)); + + // 0 if reading all the passes, ~15 if skipping the 8x pass. + float butteraugli_distance_down2_full = ButteraugliDistance( + output.frames, output_d2.frames, cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr); + + EXPECT_LE(butteraugli_distance_down2_full, 3.0f); + EXPECT_GE(butteraugli_distance_down2_full, 1.0f); +} + +TEST(PassesTest, NonProgressiveDCImage) { + ThreadPoolForTests pool(8); + const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + PaddedBytes compressed; + AuxOut aux; + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_mode = false; + cparams.butteraugli_distance = 2.0; + PassesEncoderState enc_state; + ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), + &aux, &pool)); + + // Even in non-progressive mode, it should be possible to return a DC-only + // image. + extras::JXLDecompressParams dparams; + dparams.max_downsampling = 100; + CodecInOut output; + ASSERT_TRUE(test::DecodeFile(dparams, Span(compressed), + &output, &pool)); + EXPECT_EQ(output.xsize(), io.xsize()); + EXPECT_EQ(output.ysize(), io.ysize()); +} + +TEST(PassesTest, RoundtripSmallNoGaborishPasses) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + + CompressParams cparams; + cparams.gaborish = Override::kOff; + cparams.butteraugli_distance = 1.0; + cparams.progressive_mode = true; + + CodecInOut io2; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _)); + EXPECT_THAT( + ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr), + IsSlightlyBelow(1.2)); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h b/third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h new file mode 100644 index 0000000000..e4172f6db6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_ +#define LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_ + +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/passes_state.h" // for PassesSharedState + +namespace jxl { + +// Context numbers as specified in Section C.4.5, Listing C.2: +enum Contexts { + kNumRefPatchContext = 0, + kReferenceFrameContext = 1, + kPatchSizeContext = 2, + kPatchReferencePositionContext = 3, + kPatchPositionContext = 4, + kPatchBlendModeContext = 5, + kPatchOffsetContext = 6, + kPatchCountContext = 7, + kPatchAlphaChannelContext = 8, + kPatchClampContext = 9, + kNumPatchDictionaryContexts +}; + +} // namespace jxl + +#endif // LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc b/third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc new file mode 100644 index 0000000000..5cc6c13a9e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc @@ -0,0 +1,58 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/codec.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +using ::jxl::test::Roundtrip; + +TEST(PatchDictionaryTest, GrayscaleModular) { + const PaddedBytes orig = jxl::test::ReadTestData("jxl/grayscale_patches.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + + CompressParams cparams; + cparams.SetLossless(); + cparams.patches = jxl::Override::kOn; + + CodecInOut io2; + // Without patches: ~25k + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size)); + EXPECT_LE(compressed_size, 8000u); + JXL_ASSERT_OK(VerifyRelativeError(*io.Main().color(), *io2.Main().color(), + 1e-7f, 0, _)); +} + +TEST(PatchDictionaryTest, GrayscaleVarDCT) { + const PaddedBytes orig = jxl::test::ReadTestData("jxl/grayscale_patches.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + + CompressParams cparams; + cparams.patches = jxl::Override::kOn; + + CodecInOut io2; + // Without patches: ~47k + size_t compressed_size; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size)); + EXPECT_LE(compressed_size, 14000u); + // Without patches: ~1.2 + EXPECT_LE( + ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr), + 1.1); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/preview_test.cc b/third_party/jpeg-xl/lib/jxl/preview_test.cc new file mode 100644 index 0000000000..9d4603ca70 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/preview_test.cc @@ -0,0 +1,68 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include + +#include "lib/extras/codec.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { +using test::Roundtrip; + +TEST(PreviewTest, RoundtripGivenPreview) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io)); + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + // Same as main image + io.preview_frame = io.Main().Copy(); + const size_t preview_xsize = 15; + const size_t preview_ysize = 27; + io.preview_frame.ShrinkTo(preview_xsize, preview_ysize); + io.metadata.m.have_preview = true; + ASSERT_TRUE(io.metadata.m.preview_size.Set(io.preview_frame.xsize(), + io.preview_frame.ysize())); + + CompressParams cparams; + cparams.butteraugli_distance = 2.0; + cparams.speed_tier = SpeedTier::kSquirrel; + + CodecInOut io2; + JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _)); + EXPECT_EQ(preview_xsize, io2.metadata.m.preview_size.xsize()); + EXPECT_EQ(preview_ysize, io2.metadata.m.preview_size.ysize()); + EXPECT_EQ(preview_xsize, io2.preview_frame.xsize()); + EXPECT_EQ(preview_ysize, io2.preview_frame.ysize()); + + EXPECT_LE(ButteraugliDistance(io.preview_frame, io2.preview_frame, + cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr), + 2.5); + EXPECT_LE( + ButteraugliDistance(io.Main(), io2.Main(), cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr), + 2.5); +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/quant_weights.cc b/third_party/jpeg-xl/lib/jxl/quant_weights.cc new file mode 100644 index 0000000000..5e3f3424aa --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/quant_weights.cc @@ -0,0 +1,1239 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +#include "lib/jxl/quant_weights.h" + +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dec_modular.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/quant_weights.cc" +#include +#include + +#include "lib/jxl/fast_math-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Lt; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Sqrt; + +// kQuantWeights[N * N * c + N * y + x] is the relative weight of the (x, y) +// coefficient in component c. Higher weights correspond to finer quantization +// intervals and more bits spent in encoding. + +static constexpr const float kAlmostZero = 1e-8f; + +void GetQuantWeightsDCT2(const QuantEncoding::DCT2Weights& dct2weights, + float* weights) { + for (size_t c = 0; c < 3; c++) { + size_t start = c * 64; + weights[start] = 0xBAD; + weights[start + 1] = weights[start + 8] = dct2weights[c][0]; + weights[start + 9] = dct2weights[c][1]; + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + weights[start + y * 8 + x + 2] = dct2weights[c][2]; + weights[start + (y + 2) * 8 + x] = dct2weights[c][2]; + } + } + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + weights[start + (y + 2) * 8 + x + 2] = dct2weights[c][3]; + } + } + for (size_t y = 0; y < 4; y++) { + for (size_t x = 0; x < 4; x++) { + weights[start + y * 8 + x + 4] = dct2weights[c][4]; + weights[start + (y + 4) * 8 + x] = dct2weights[c][4]; + } + } + for (size_t y = 0; y < 4; y++) { + for (size_t x = 0; x < 4; x++) { + weights[start + (y + 4) * 8 + x + 4] = dct2weights[c][5]; + } + } + } +} + +void GetQuantWeightsIdentity(const QuantEncoding::IdWeights& idweights, + float* weights) { + for (size_t c = 0; c < 3; c++) { + for (int i = 0; i < 64; i++) { + weights[64 * c + i] = idweights[c][0]; + } + weights[64 * c + 1] = idweights[c][1]; + weights[64 * c + 8] = idweights[c][1]; + weights[64 * c + 9] = idweights[c][2]; + } +} + +float Interpolate(float pos, float max, const float* array, size_t len) { + float scaled_pos = pos * (len - 1) / max; + size_t idx = scaled_pos; + JXL_DASSERT(idx + 1 < len); + float a = array[idx]; + float b = array[idx + 1]; + return a * FastPowf(b / a, scaled_pos - idx); +} + +float Mult(float v) { + if (v > 0.0f) return 1.0f + v; + return 1.0f / (1.0f - v); +} + +using DF4 = HWY_CAPPED(float, 4); + +hwy::HWY_NAMESPACE::Vec InterpolateVec( + hwy::HWY_NAMESPACE::Vec scaled_pos, const float* array) { + HWY_CAPPED(int32_t, 4) di; + + auto idx = ConvertTo(di, scaled_pos); + + auto frac = Sub(scaled_pos, ConvertTo(DF4(), idx)); + + // TODO(veluca): in theory, this could be done with 8 TableLookupBytes, but + // it's probably slower. + auto a = GatherIndex(DF4(), array, idx); + auto b = GatherIndex(DF4(), array + 1, idx); + + return Mul(a, FastPowf(DF4(), Div(b, a), frac)); +} + +// Computes quant weights for a COLS*ROWS-sized transform, using num_bands +// eccentricity bands and num_ebands eccentricity bands. If print_mode is 1, +// prints the resulting matrix; if print_mode is 2, prints the matrix in a +// format suitable for a 3d plot with gnuplot. +Status GetQuantWeights( + size_t ROWS, size_t COLS, + const DctQuantWeightParams::DistanceBandsArray& distance_bands, + size_t num_bands, float* out) { + for (size_t c = 0; c < 3; c++) { + float bands[DctQuantWeightParams::kMaxDistanceBands] = { + distance_bands[c][0]}; + if (bands[0] < kAlmostZero) return JXL_FAILURE("Invalid distance bands"); + for (size_t i = 1; i < num_bands; i++) { + bands[i] = bands[i - 1] * Mult(distance_bands[c][i]); + if (bands[i] < kAlmostZero) return JXL_FAILURE("Invalid distance bands"); + } + float scale = (num_bands - 1) / (kSqrt2 + 1e-6f); + float rcpcol = scale / (COLS - 1); + float rcprow = scale / (ROWS - 1); + JXL_ASSERT(COLS >= Lanes(DF4())); + HWY_ALIGN float l0123[4] = {0, 1, 2, 3}; + for (uint32_t y = 0; y < ROWS; y++) { + float dy = y * rcprow; + float dy2 = dy * dy; + for (uint32_t x = 0; x < COLS; x += Lanes(DF4())) { + auto dx = + Mul(Add(Set(DF4(), x), Load(DF4(), l0123)), Set(DF4(), rcpcol)); + auto scaled_distance = Sqrt(MulAdd(dx, dx, Set(DF4(), dy2))); + auto weight = num_bands == 1 ? Set(DF4(), bands[0]) + : InterpolateVec(scaled_distance, bands); + StoreU(weight, DF4(), out + c * COLS * ROWS + y * COLS + x); + } + } + } + return true; +} + +// TODO(veluca): SIMD-fy. With 256x256, this is actually slow. +Status ComputeQuantTable(const QuantEncoding& encoding, + float* JXL_RESTRICT table, + float* JXL_RESTRICT inv_table, size_t table_num, + DequantMatrices::QuantTable kind, size_t* pos) { + constexpr size_t N = kBlockDim; + size_t wrows = 8 * DequantMatrices::required_size_x[kind], + wcols = 8 * DequantMatrices::required_size_y[kind]; + size_t num = wrows * wcols; + + std::vector weights(3 * num); + + switch (encoding.mode) { + case QuantEncoding::kQuantModeLibrary: { + // Library and copy quant encoding should get replaced by the actual + // parameters by the caller. + JXL_ASSERT(false); + break; + } + case QuantEncoding::kQuantModeID: { + JXL_ASSERT(num == kDCTBlockSize); + GetQuantWeightsIdentity(encoding.idweights, weights.data()); + break; + } + case QuantEncoding::kQuantModeDCT2: { + JXL_ASSERT(num == kDCTBlockSize); + GetQuantWeightsDCT2(encoding.dct2weights, weights.data()); + break; + } + case QuantEncoding::kQuantModeDCT4: { + JXL_ASSERT(num == kDCTBlockSize); + float weights4x4[3 * 4 * 4]; + // Always use 4x4 GetQuantWeights for DCT4 quantization tables. + JXL_RETURN_IF_ERROR( + GetQuantWeights(4, 4, encoding.dct_params.distance_bands, + encoding.dct_params.num_distance_bands, weights4x4)); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < kBlockDim; y++) { + for (size_t x = 0; x < kBlockDim; x++) { + weights[c * num + y * kBlockDim + x] = + weights4x4[c * 16 + (y / 2) * 4 + (x / 2)]; + } + } + weights[c * num + 1] /= encoding.dct4multipliers[c][0]; + weights[c * num + N] /= encoding.dct4multipliers[c][0]; + weights[c * num + N + 1] /= encoding.dct4multipliers[c][1]; + } + break; + } + case QuantEncoding::kQuantModeDCT4X8: { + JXL_ASSERT(num == kDCTBlockSize); + float weights4x8[3 * 4 * 8]; + // Always use 4x8 GetQuantWeights for DCT4X8 quantization tables. + JXL_RETURN_IF_ERROR( + GetQuantWeights(4, 8, encoding.dct_params.distance_bands, + encoding.dct_params.num_distance_bands, weights4x8)); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < kBlockDim; y++) { + for (size_t x = 0; x < kBlockDim; x++) { + weights[c * num + y * kBlockDim + x] = + weights4x8[c * 32 + (y / 2) * 8 + x]; + } + } + weights[c * num + N] /= encoding.dct4x8multipliers[c]; + } + break; + } + case QuantEncoding::kQuantModeDCT: { + JXL_RETURN_IF_ERROR(GetQuantWeights( + wrows, wcols, encoding.dct_params.distance_bands, + encoding.dct_params.num_distance_bands, weights.data())); + break; + } + case QuantEncoding::kQuantModeRAW: { + if (!encoding.qraw.qtable || encoding.qraw.qtable->size() != 3 * num) { + return JXL_FAILURE("Invalid table encoding"); + } + for (size_t i = 0; i < 3 * num; i++) { + weights[i] = + 1.f / (encoding.qraw.qtable_den * (*encoding.qraw.qtable)[i]); + } + break; + } + case QuantEncoding::kQuantModeAFV: { + constexpr float kFreqs[] = { + 0xBAD, + 0xBAD, + 0.8517778890324296, + 5.37778436506804, + 0xBAD, + 0xBAD, + 4.734747904497923, + 5.449245381693219, + 1.6598270267479331, + 4, + 7.275749096817861, + 10.423227632456525, + 2.662932286148962, + 7.630657783650829, + 8.962388608184032, + 12.97166202570235, + }; + + float weights4x8[3 * 4 * 8]; + JXL_RETURN_IF_ERROR(( + GetQuantWeights(4, 8, encoding.dct_params.distance_bands, + encoding.dct_params.num_distance_bands, weights4x8))); + float weights4x4[3 * 4 * 4]; + JXL_RETURN_IF_ERROR((GetQuantWeights( + 4, 4, encoding.dct_params_afv_4x4.distance_bands, + encoding.dct_params_afv_4x4.num_distance_bands, weights4x4))); + + constexpr float lo = 0.8517778890324296; + constexpr float hi = 12.97166202570235f - lo + 1e-6f; + for (size_t c = 0; c < 3; c++) { + float bands[4]; + bands[0] = encoding.afv_weights[c][5]; + if (bands[0] < kAlmostZero) return JXL_FAILURE("Invalid AFV bands"); + for (size_t i = 1; i < 4; i++) { + bands[i] = bands[i - 1] * Mult(encoding.afv_weights[c][i + 5]); + if (bands[i] < kAlmostZero) return JXL_FAILURE("Invalid AFV bands"); + } + size_t start = c * 64; + auto set_weight = [&start, &weights](size_t x, size_t y, float val) { + weights[start + y * 8 + x] = val; + }; + weights[start] = 1; // Not used, but causes MSAN error otherwise. + // Weights for (0, 1) and (1, 0). + set_weight(0, 1, encoding.afv_weights[c][0]); + set_weight(1, 0, encoding.afv_weights[c][1]); + // AFV special weights for 3-pixel corner. + set_weight(0, 2, encoding.afv_weights[c][2]); + set_weight(2, 0, encoding.afv_weights[c][3]); + set_weight(2, 2, encoding.afv_weights[c][4]); + + // All other AFV weights. + for (size_t y = 0; y < 4; y++) { + for (size_t x = 0; x < 4; x++) { + if (x < 2 && y < 2) continue; + float val = Interpolate(kFreqs[y * 4 + x] - lo, hi, bands, 4); + set_weight(2 * x, 2 * y, val); + } + } + + // Put 4x8 weights in odd rows, except (1, 0). + for (size_t y = 0; y < kBlockDim / 2; y++) { + for (size_t x = 0; x < kBlockDim; x++) { + if (x == 0 && y == 0) continue; + weights[c * num + (2 * y + 1) * kBlockDim + x] = + weights4x8[c * 32 + y * 8 + x]; + } + } + // Put 4x4 weights in even rows / odd columns, except (0, 1). + for (size_t y = 0; y < kBlockDim / 2; y++) { + for (size_t x = 0; x < kBlockDim / 2; x++) { + if (x == 0 && y == 0) continue; + weights[c * num + (2 * y) * kBlockDim + 2 * x + 1] = + weights4x4[c * 16 + y * 4 + x]; + } + } + } + break; + } + } + size_t prev_pos = *pos; + HWY_CAPPED(float, 64) d; + for (size_t i = 0; i < num * 3; i += Lanes(d)) { + auto inv_val = LoadU(d, weights.data() + i); + if (JXL_UNLIKELY(!AllFalse(d, Ge(inv_val, Set(d, 1.0f / kAlmostZero))) || + !AllFalse(d, Lt(inv_val, Set(d, kAlmostZero))))) { + return JXL_FAILURE("Invalid quantization table"); + } + auto val = Div(Set(d, 1.0f), inv_val); + StoreU(val, d, table + *pos + i); + StoreU(inv_val, d, inv_table + *pos + i); + } + (*pos) += 3 * num; + + // Ensure that the lowest frequencies have a 0 inverse table. + // This does not affect en/decoding, but allows AC strategy selection to be + // slightly simpler. + size_t xs = DequantMatrices::required_size_x[kind]; + size_t ys = DequantMatrices::required_size_y[kind]; + CoefficientLayout(&ys, &xs); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < ys; y++) { + for (size_t x = 0; x < xs; x++) { + inv_table[prev_pos + c * ys * xs * kDCTBlockSize + y * kBlockDim * xs + + x] = 0; + } + } + } + return true; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace jxl { +namespace { + +HWY_EXPORT(ComputeQuantTable); + +static constexpr const float kAlmostZero = 1e-8f; + +Status DecodeDctParams(BitReader* br, DctQuantWeightParams* params) { + params->num_distance_bands = + br->ReadFixedBits() + 1; + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < params->num_distance_bands; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, ¶ms->distance_bands[c][i])); + } + if (params->distance_bands[c][0] < kAlmostZero) { + return JXL_FAILURE("Distance band seed is too small"); + } + params->distance_bands[c][0] *= 64.0f; + } + return true; +} + +Status Decode(BitReader* br, QuantEncoding* encoding, size_t required_size_x, + size_t required_size_y, size_t idx, + ModularFrameDecoder* modular_frame_decoder) { + size_t required_size = required_size_x * required_size_y; + required_size_x *= kBlockDim; + required_size_y *= kBlockDim; + int mode = br->ReadFixedBits(); + switch (mode) { + case QuantEncoding::kQuantModeLibrary: { + encoding->predefined = br->ReadFixedBits(); + if (encoding->predefined >= kNumPredefinedTables) { + return JXL_FAILURE("Invalid predefined table"); + } + break; + } + case QuantEncoding::kQuantModeID: { + if (required_size != 1) return JXL_FAILURE("Invalid mode"); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 3; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->idweights[c][i])); + if (std::abs(encoding->idweights[c][i]) < kAlmostZero) { + return JXL_FAILURE("ID Quantizer is too small"); + } + encoding->idweights[c][i] *= 64; + } + } + break; + } + case QuantEncoding::kQuantModeDCT2: { + if (required_size != 1) return JXL_FAILURE("Invalid mode"); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 6; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->dct2weights[c][i])); + if (std::abs(encoding->dct2weights[c][i]) < kAlmostZero) { + return JXL_FAILURE("Quantizer is too small"); + } + encoding->dct2weights[c][i] *= 64; + } + } + break; + } + case QuantEncoding::kQuantModeDCT4X8: { + if (required_size != 1) return JXL_FAILURE("Invalid mode"); + for (size_t c = 0; c < 3; c++) { + JXL_RETURN_IF_ERROR( + F16Coder::Read(br, &encoding->dct4x8multipliers[c])); + if (std::abs(encoding->dct4x8multipliers[c]) < kAlmostZero) { + return JXL_FAILURE("DCT4X8 multiplier is too small"); + } + } + JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params)); + break; + } + case QuantEncoding::kQuantModeDCT4: { + if (required_size != 1) return JXL_FAILURE("Invalid mode"); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 2; i++) { + JXL_RETURN_IF_ERROR( + F16Coder::Read(br, &encoding->dct4multipliers[c][i])); + if (std::abs(encoding->dct4multipliers[c][i]) < kAlmostZero) { + return JXL_FAILURE("DCT4 multiplier is too small"); + } + } + } + JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params)); + break; + } + case QuantEncoding::kQuantModeAFV: { + if (required_size != 1) return JXL_FAILURE("Invalid mode"); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 9; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->afv_weights[c][i])); + } + for (size_t i = 0; i < 6; i++) { + encoding->afv_weights[c][i] *= 64; + } + } + JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params)); + JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params_afv_4x4)); + break; + } + case QuantEncoding::kQuantModeDCT: { + JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params)); + break; + } + case QuantEncoding::kQuantModeRAW: { + // Set mode early, to avoid mem-leak. + encoding->mode = QuantEncoding::kQuantModeRAW; + JXL_RETURN_IF_ERROR(ModularFrameDecoder::DecodeQuantTable( + required_size_x, required_size_y, br, encoding, idx, + modular_frame_decoder)); + break; + } + default: + return JXL_FAILURE("Invalid quantization table encoding"); + } + encoding->mode = QuantEncoding::Mode(mode); + return true; +} + +} // namespace + +// These definitions are needed before C++17. +constexpr size_t DequantMatrices::required_size_[]; +constexpr size_t DequantMatrices::required_size_x[]; +constexpr size_t DequantMatrices::required_size_y[]; +constexpr DequantMatrices::QuantTable DequantMatrices::kQuantTable[]; + +Status DequantMatrices::Decode(BitReader* br, + ModularFrameDecoder* modular_frame_decoder) { + size_t all_default = br->ReadBits(1); + size_t num_tables = all_default ? 0 : static_cast(kNum); + encodings_.clear(); + encodings_.resize(kNum, QuantEncoding::Library(0)); + for (size_t i = 0; i < num_tables; i++) { + JXL_RETURN_IF_ERROR( + jxl::Decode(br, &encodings_[i], required_size_x[i % kNum], + required_size_y[i % kNum], i, modular_frame_decoder)); + } + computed_mask_ = 0; + return true; +} + +Status DequantMatrices::DecodeDC(BitReader* br) { + bool all_default = br->ReadBits(1); + if (!br->AllReadsWithinBounds()) return JXL_FAILURE("EOS during DecodeDC"); + if (!all_default) { + for (size_t c = 0; c < 3; c++) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &dc_quant_[c])); + dc_quant_[c] *= 1.0f / 128.0f; + // Negative values and nearly zero are invalid values. + if (dc_quant_[c] < kAlmostZero) { + return JXL_FAILURE("Invalid dc_quant: coefficient is too small."); + } + inv_dc_quant_[c] = 1.0f / dc_quant_[c]; + } + } + return true; +} + +constexpr float V(float v) { return static_cast(v); } + +namespace { +struct DequantMatricesLibraryDef { + // DCT8 + static constexpr QuantEncodingInternal DCT() { + return QuantEncodingInternal::DCT(DctQuantWeightParams({{{{ + V(3150.0), + V(0.0), + V(-0.4), + V(-0.4), + V(-0.4), + V(-2.0), + }}, + {{ + V(560.0), + V(0.0), + V(-0.3), + V(-0.3), + V(-0.3), + V(-0.3), + }}, + {{ + V(512.0), + V(-2.0), + V(-1.0), + V(0.0), + V(-1.0), + V(-2.0), + }}}}, + 6)); + } + + // Identity + static constexpr QuantEncodingInternal IDENTITY() { + return QuantEncodingInternal::Identity({{{{ + V(280.0), + V(3160.0), + V(3160.0), + }}, + {{ + V(60.0), + V(864.0), + V(864.0), + }}, + {{ + V(18.0), + V(200.0), + V(200.0), + }}}}); + } + + // DCT2 + static constexpr QuantEncodingInternal DCT2X2() { + return QuantEncodingInternal::DCT2({{{{ + V(3840.0), + V(2560.0), + V(1280.0), + V(640.0), + V(480.0), + V(300.0), + }}, + {{ + V(960.0), + V(640.0), + V(320.0), + V(180.0), + V(140.0), + V(120.0), + }}, + {{ + V(640.0), + V(320.0), + V(128.0), + V(64.0), + V(32.0), + V(16.0), + }}}}); + } + + // DCT4 (quant_kind 3) + static constexpr QuantEncodingInternal DCT4X4() { + return QuantEncodingInternal::DCT4(DctQuantWeightParams({{{{ + V(2200.0), + V(0.0), + V(0.0), + V(0.0), + }}, + {{ + V(392.0), + V(0.0), + V(0.0), + V(0.0), + }}, + {{ + V(112.0), + V(-0.25), + V(-0.25), + V(-0.5), + }}}}, + 4), + /* kMul */ + {{{{ + V(1.0), + V(1.0), + }}, + {{ + V(1.0), + V(1.0), + }}, + {{ + V(1.0), + V(1.0), + }}}}); + } + + // DCT16 + static constexpr QuantEncodingInternal DCT16X16() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(8996.8725711814115328), + V(-1.3000777393353804), + V(-0.49424529824571225), + V(-0.439093774457103443), + V(-0.6350101832695744), + V(-0.90177264050827612), + V(-1.6162099239887414), + }}, + {{ + V(3191.48366296844234752), + V(-0.67424582104194355), + V(-0.80745813428471001), + V(-0.44925837484843441), + V(-0.35865440981033403), + V(-0.31322389111877305), + V(-0.37615025315725483), + }}, + {{ + V(1157.50408145487200256), + V(-2.0531423165804414), + V(-1.4), + V(-0.50687130033378396), + V(-0.42708730624733904), + V(-1.4856834539296244), + V(-4.9209142884401604), + }}}}, + 7)); + } + + // DCT32 + static constexpr QuantEncodingInternal DCT32X32() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(15718.40830982518931456), + V(-1.025), + V(-0.98), + V(-0.9012), + V(-0.4), + V(-0.48819395464), + V(-0.421064), + V(-0.27), + }}, + {{ + V(7305.7636810695983104), + V(-0.8041958212306401), + V(-0.7633036457487539), + V(-0.55660379990111464), + V(-0.49785304658857626), + V(-0.43699592683512467), + V(-0.40180866526242109), + V(-0.27321683125358037), + }}, + {{ + V(3803.53173721215041536), + V(-3.060733579805728), + V(-2.0413270132490346), + V(-2.0235650159727417), + V(-0.5495389509954993), + V(-0.4), + V(-0.4), + V(-0.3), + }}}}, + 8)); + } + + // DCT16X8 + static constexpr QuantEncodingInternal DCT8X16() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(7240.7734393502), + V(-0.7), + V(-0.7), + V(-0.2), + V(-0.2), + V(-0.2), + V(-0.5), + }}, + {{ + V(1448.15468787004), + V(-0.5), + V(-0.5), + V(-0.5), + V(-0.2), + V(-0.2), + V(-0.2), + }}, + {{ + V(506.854140754517), + V(-1.4), + V(-0.2), + V(-0.5), + V(-0.5), + V(-1.5), + V(-3.6), + }}}}, + 7)); + } + + // DCT32X8 + static constexpr QuantEncodingInternal DCT8X32() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(16283.2494710648897), + V(-1.7812845336559429), + V(-1.6309059012653515), + V(-1.0382179034313539), + V(-0.85), + V(-0.7), + V(-0.9), + V(-1.2360638576849587), + }}, + {{ + V(5089.15750884921511936), + V(-0.320049391452786891), + V(-0.35362849922161446), + V(-0.30340000000000003), + V(-0.61), + V(-0.5), + V(-0.5), + V(-0.6), + }}, + {{ + V(3397.77603275308720128), + V(-0.321327362693153371), + V(-0.34507619223117997), + V(-0.70340000000000003), + V(-0.9), + V(-1.0), + V(-1.0), + V(-1.1754605576265209), + }}}}, + 8)); + } + + // DCT32X16 + static constexpr QuantEncodingInternal DCT16X32() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(13844.97076442300573), + V(-0.97113799999999995), + V(-0.658), + V(-0.42026), + V(-0.22712), + V(-0.2206), + V(-0.226), + V(-0.6), + }}, + {{ + V(4798.964084220744293), + V(-0.61125308982767057), + V(-0.83770786552491361), + V(-0.79014862079498627), + V(-0.2692727459704829), + V(-0.38272769465388551), + V(-0.22924222653091453), + V(-0.20719098826199578), + }}, + {{ + V(1807.236946760964614), + V(-1.2), + V(-1.2), + V(-0.7), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}}, + 8)); + } + + // DCT4X8 and 8x4 + static constexpr QuantEncodingInternal DCT4X8() { + return QuantEncodingInternal::DCT4X8( + DctQuantWeightParams({{ + {{ + V(2198.050556016380522), + V(-0.96269623020744692), + V(-0.76194253026666783), + V(-0.6551140670773547), + }}, + {{ + V(764.3655248643528689), + V(-0.92630200888366945), + V(-0.9675229603596517), + V(-0.27845290869168118), + }}, + {{ + V(527.107573587542228), + V(-1.4594385811273854), + V(-1.450082094097871593), + V(-1.5843722511996204), + }}, + }}, + 4), + /* kMuls */ + {{ + V(1.0), + V(1.0), + V(1.0), + }}); + } + // AFV + static QuantEncodingInternal AFV0() { + return QuantEncodingInternal::AFV(DCT4X8().dct_params, DCT4X4().dct_params, + {{{{ + // 4x4/4x8 DC tendency. + V(3072.0), + V(3072.0), + // AFV corner. + V(256.0), + V(256.0), + V(256.0), + // AFV high freqs. + V(414.0), + V(0.0), + V(0.0), + V(0.0), + }}, + {{ + // 4x4/4x8 DC tendency. + V(1024.0), + V(1024.0), + // AFV corner. + V(50), + V(50), + V(50), + // AFV high freqs. + V(58.0), + V(0.0), + V(0.0), + V(0.0), + }}, + {{ + // 4x4/4x8 DC tendency. + V(384.0), + V(384.0), + // AFV corner. + V(12.0), + V(12.0), + V(12.0), + // AFV high freqs. + V(22.0), + V(-0.25), + V(-0.25), + V(-0.25), + }}}}); + } + + // DCT64 + static QuantEncodingInternal DCT64X64() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(0.9 * 26629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }}, + {{ + V(0.9 * 9311.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }}, + {{ + V(0.9 * 4992.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}}, + 8)); + } + + // DCT64X32 + static QuantEncodingInternal DCT32X64() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(0.65 * 23629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }}, + {{ + V(0.65 * 8611.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }}, + {{ + V(0.65 * 4492.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}}, + 8)); + } + // DCT128X128 + static QuantEncodingInternal DCT128X128() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(1.8 * 26629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }}, + {{ + V(1.8 * 9311.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }}, + {{ + V(1.8 * 4992.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}}, + 8)); + } + + // DCT128X64 + static QuantEncodingInternal DCT64X128() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(1.3 * 23629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }}, + {{ + V(1.3 * 8611.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }}, + {{ + V(1.3 * 4492.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}}, + 8)); + } + // DCT256X256 + static QuantEncodingInternal DCT256X256() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(3.6 * 26629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }}, + {{ + V(3.6 * 9311.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }}, + {{ + V(3.6 * 4992.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}}, + 8)); + } + + // DCT256X128 + static QuantEncodingInternal DCT128X256() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{{ + V(2.6 * 23629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }}, + {{ + V(2.6 * 8611.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }}, + {{ + V(2.6 * 4492.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}}, + 8)); + } +}; +} // namespace + +DequantMatrices::DequantLibraryInternal DequantMatrices::LibraryInit() { + static_assert(kNum == 17, + "Update this function when adding new quantization kinds."); + static_assert(kNumPredefinedTables == 1, + "Update this function when adding new quantization matrices to " + "the library."); + + // The library and the indices need to be kept in sync manually. + static_assert(0 == DCT, "Update the DequantLibrary array below."); + static_assert(1 == IDENTITY, "Update the DequantLibrary array below."); + static_assert(2 == DCT2X2, "Update the DequantLibrary array below."); + static_assert(3 == DCT4X4, "Update the DequantLibrary array below."); + static_assert(4 == DCT16X16, "Update the DequantLibrary array below."); + static_assert(5 == DCT32X32, "Update the DequantLibrary array below."); + static_assert(6 == DCT8X16, "Update the DequantLibrary array below."); + static_assert(7 == DCT8X32, "Update the DequantLibrary array below."); + static_assert(8 == DCT16X32, "Update the DequantLibrary array below."); + static_assert(9 == DCT4X8, "Update the DequantLibrary array below."); + static_assert(10 == AFV0, "Update the DequantLibrary array below."); + static_assert(11 == DCT64X64, "Update the DequantLibrary array below."); + static_assert(12 == DCT32X64, "Update the DequantLibrary array below."); + static_assert(13 == DCT128X128, "Update the DequantLibrary array below."); + static_assert(14 == DCT64X128, "Update the DequantLibrary array below."); + static_assert(15 == DCT256X256, "Update the DequantLibrary array below."); + static_assert(16 == DCT128X256, "Update the DequantLibrary array below."); + return DequantMatrices::DequantLibraryInternal{{ + DequantMatricesLibraryDef::DCT(), + DequantMatricesLibraryDef::IDENTITY(), + DequantMatricesLibraryDef::DCT2X2(), + DequantMatricesLibraryDef::DCT4X4(), + DequantMatricesLibraryDef::DCT16X16(), + DequantMatricesLibraryDef::DCT32X32(), + DequantMatricesLibraryDef::DCT8X16(), + DequantMatricesLibraryDef::DCT8X32(), + DequantMatricesLibraryDef::DCT16X32(), + DequantMatricesLibraryDef::DCT4X8(), + DequantMatricesLibraryDef::AFV0(), + DequantMatricesLibraryDef::DCT64X64(), + DequantMatricesLibraryDef::DCT32X64(), + // Same default for large transforms (128+) as for 64x* transforms. + DequantMatricesLibraryDef::DCT128X128(), + DequantMatricesLibraryDef::DCT64X128(), + DequantMatricesLibraryDef::DCT256X256(), + DequantMatricesLibraryDef::DCT128X256(), + }}; +} + +const QuantEncoding* DequantMatrices::Library() { + static const DequantMatrices::DequantLibraryInternal kDequantLibrary = + DequantMatrices::LibraryInit(); + // Downcast the result to a const QuantEncoding* from QuantEncodingInternal* + // since the subclass (QuantEncoding) doesn't add any new members and users + // will need to upcast to QuantEncodingInternal to access the members of that + // class. This allows to have kDequantLibrary as a constexpr value while still + // allowing to create QuantEncoding::RAW() instances that use std::vector in + // C++11. + return reinterpret_cast(kDequantLibrary.data()); +} + +DequantMatrices::DequantMatrices() { + encodings_.resize(size_t(QuantTable::kNum), QuantEncoding::Library(0)); + size_t pos = 0; + size_t offsets[kNum * 3]; + for (size_t i = 0; i < size_t(QuantTable::kNum); i++) { + size_t num = required_size_[i] * kDCTBlockSize; + for (size_t c = 0; c < 3; c++) { + offsets[3 * i + c] = pos + c * num; + } + pos += 3 * num; + } + for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) { + for (size_t c = 0; c < 3; c++) { + table_offsets_[i * 3 + c] = offsets[kQuantTable[i] * 3 + c]; + } + } +} + +Status DequantMatrices::EnsureComputed(uint32_t acs_mask) { + const QuantEncoding* library = Library(); + + if (!table_storage_) { + table_storage_ = hwy::AllocateAligned(2 * kTotalTableSize); + table_ = table_storage_.get(); + inv_table_ = table_storage_.get() + kTotalTableSize; + } + + size_t offsets[kNum * 3 + 1]; + size_t pos = 0; + for (size_t i = 0; i < kNum; i++) { + size_t num = required_size_[i] * kDCTBlockSize; + for (size_t c = 0; c < 3; c++) { + offsets[3 * i + c] = pos + c * num; + } + pos += 3 * num; + } + offsets[kNum * 3] = pos; + JXL_ASSERT(pos == kTotalTableSize); + + uint32_t kind_mask = 0; + for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) { + if (acs_mask & (1u << i)) { + kind_mask |= 1u << kQuantTable[i]; + } + } + uint32_t computed_kind_mask = 0; + for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) { + if (computed_mask_ & (1u << i)) { + computed_kind_mask |= 1u << kQuantTable[i]; + } + } + for (size_t table = 0; table < kNum; table++) { + if ((1 << table) & computed_kind_mask) continue; + if ((1 << table) & ~kind_mask) continue; + size_t pos = offsets[table * 3]; + if (encodings_[table].mode == QuantEncoding::kQuantModeLibrary) { + JXL_CHECK(HWY_DYNAMIC_DISPATCH(ComputeQuantTable)( + library[table], table_storage_.get(), + table_storage_.get() + kTotalTableSize, table, QuantTable(table), + &pos)); + } else { + JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(ComputeQuantTable)( + encodings_[table], table_storage_.get(), + table_storage_.get() + kTotalTableSize, table, QuantTable(table), + &pos)); + } + JXL_ASSERT(pos == offsets[table * 3 + 3]); + } + computed_mask_ |= acs_mask; + + return true; +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/quant_weights.h b/third_party/jpeg-xl/lib/jxl/quant_weights.h new file mode 100644 index 0000000000..d76fc1d1e6 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/quant_weights.h @@ -0,0 +1,448 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_QUANT_WEIGHTS_H_ +#define LIB_JXL_QUANT_WEIGHTS_H_ + +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/image.h" + +namespace jxl { + +template +constexpr T ArraySum(T (&a)[N], size_t i = N - 1) { + static_assert(N > 0, "Trying to compute the sum of an empty array"); + return i == 0 ? a[0] : a[i] + ArraySum(a, i - 1); +} + +static constexpr size_t kMaxQuantTableSize = AcStrategy::kMaxCoeffArea; +static constexpr size_t kNumPredefinedTables = 1; +static constexpr size_t kCeilLog2NumPredefinedTables = 0; +static constexpr size_t kLog2NumQuantModes = 3; + +struct DctQuantWeightParams { + static constexpr size_t kLog2MaxDistanceBands = 4; + static constexpr size_t kMaxDistanceBands = 1 + (1 << kLog2MaxDistanceBands); + typedef std::array, 3> + DistanceBandsArray; + + size_t num_distance_bands = 0; + DistanceBandsArray distance_bands = {}; + + constexpr DctQuantWeightParams() : num_distance_bands(0) {} + + constexpr DctQuantWeightParams(const DistanceBandsArray& dist_bands, + size_t num_dist_bands) + : num_distance_bands(num_dist_bands), distance_bands(dist_bands) {} + + template + explicit DctQuantWeightParams(const float dist_bands[3][num_dist_bands]) { + num_distance_bands = num_dist_bands; + for (size_t c = 0; c < 3; c++) { + memcpy(distance_bands[c].data(), dist_bands[c], + sizeof(float) * num_dist_bands); + } + } +}; + +// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) +struct QuantEncodingInternal { + enum Mode { + kQuantModeLibrary, + kQuantModeID, + kQuantModeDCT2, + kQuantModeDCT4, + kQuantModeDCT4X8, + kQuantModeAFV, + kQuantModeDCT, + kQuantModeRAW, + }; + + template + struct Tag {}; + + typedef std::array, 3> IdWeights; + typedef std::array, 3> DCT2Weights; + typedef std::array, 3> DCT4Multipliers; + typedef std::array, 3> AFVWeights; + typedef std::array DCT4x8Multipliers; + + static constexpr QuantEncodingInternal Library(uint8_t predefined) { + return ((predefined < kNumPredefinedTables) || + JXL_ABORT("Assert predefined < kNumPredefinedTables")), + QuantEncodingInternal(Tag(), predefined); + } + constexpr QuantEncodingInternal(Tag /* tag */, + uint8_t predefined) + : mode(kQuantModeLibrary), predefined(predefined) {} + + // Identity + // xybweights is an array of {xweights, yweights, bweights}. + static constexpr QuantEncodingInternal Identity(const IdWeights& xybweights) { + return QuantEncodingInternal(Tag(), xybweights); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const IdWeights& xybweights) + : mode(kQuantModeID), idweights(xybweights) {} + + // DCT2 + static constexpr QuantEncodingInternal DCT2(const DCT2Weights& xybweights) { + return QuantEncodingInternal(Tag(), xybweights); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const DCT2Weights& xybweights) + : mode(kQuantModeDCT2), dct2weights(xybweights) {} + + // DCT4 + static constexpr QuantEncodingInternal DCT4( + const DctQuantWeightParams& params, const DCT4Multipliers& xybmul) { + return QuantEncodingInternal(Tag(), params, xybmul); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const DctQuantWeightParams& params, + const DCT4Multipliers& xybmul) + : mode(kQuantModeDCT4), dct_params(params), dct4multipliers(xybmul) {} + + // DCT4x8 + static constexpr QuantEncodingInternal DCT4X8( + const DctQuantWeightParams& params, const DCT4x8Multipliers& xybmul) { + return QuantEncodingInternal(Tag(), params, xybmul); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const DctQuantWeightParams& params, + const DCT4x8Multipliers& xybmul) + : mode(kQuantModeDCT4X8), dct_params(params), dct4x8multipliers(xybmul) {} + + // DCT + static constexpr QuantEncodingInternal DCT( + const DctQuantWeightParams& params) { + return QuantEncodingInternal(Tag(), params); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const DctQuantWeightParams& params) + : mode(kQuantModeDCT), dct_params(params) {} + + // AFV + static constexpr QuantEncodingInternal AFV( + const DctQuantWeightParams& params4x8, + const DctQuantWeightParams& params4x4, const AFVWeights& weights) { + return QuantEncodingInternal(Tag(), params4x8, params4x4, + weights); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const DctQuantWeightParams& params4x8, + const DctQuantWeightParams& params4x4, + const AFVWeights& weights) + : mode(kQuantModeAFV), + dct_params(params4x8), + afv_weights(weights), + dct_params_afv_4x4(params4x4) {} + + // This constructor is not constexpr so it can't be used in any of the + // constexpr cases above. + explicit QuantEncodingInternal(Mode mode) : mode(mode) {} + + Mode mode; + + // Weights for DCT4+ tables. + DctQuantWeightParams dct_params; + + union { + // Weights for identity. + IdWeights idweights; + + // Weights for DCT2. + DCT2Weights dct2weights; + + // Extra multipliers for coefficients 01/10 and 11 for DCT4 and AFV. + DCT4Multipliers dct4multipliers; + + // Weights for AFV. {0, 1} are used directly for coefficients (0, 1) and (1, + // 0); {2, 3, 4} are used directly corner DC, (1,0) - (0,1) and (0, 1) + + // (1, 0) - (0, 0) inside the AFV block. Values from 5 to 8 are interpolated + // as in GetQuantWeights for DC and are used for other coefficients. + AFVWeights afv_weights = {}; + + // Extra multipliers for coefficients 01 or 10 for DCT4X8 and DCT8X4. + DCT4x8Multipliers dct4x8multipliers; + + // Only used in kQuantModeRAW mode. + struct { + // explicit quantization table (like in JPEG) + std::vector* qtable = nullptr; + float qtable_den = 1.f / (8 * 255); + } qraw; + }; + + // Weights for 4x4 sub-block in AFV. + DctQuantWeightParams dct_params_afv_4x4; + + union { + // Which predefined table to use. Only used if mode is kQuantModeLibrary. + uint8_t predefined = 0; + + // Which other quant table to copy; must copy from a table that comes before + // the current one. Only used if mode is kQuantModeCopy. + uint8_t source; + }; +}; + +class QuantEncoding final : public QuantEncodingInternal { + public: + QuantEncoding(const QuantEncoding& other) + : QuantEncodingInternal( + static_cast(other)) { + if (mode == kQuantModeRAW && qraw.qtable) { + // Need to make a copy of the passed *qtable. + qraw.qtable = new std::vector(*other.qraw.qtable); + } + } + QuantEncoding(QuantEncoding&& other) noexcept + : QuantEncodingInternal( + static_cast(other)) { + // Steal the qtable from the other object if any. + if (mode == kQuantModeRAW) { + other.qraw.qtable = nullptr; + } + } + QuantEncoding& operator=(const QuantEncoding& other) { + if (mode == kQuantModeRAW && qraw.qtable) { + delete qraw.qtable; + } + *static_cast(this) = + QuantEncodingInternal(static_cast(other)); + if (mode == kQuantModeRAW && qraw.qtable) { + // Need to make a copy of the passed *qtable. + qraw.qtable = new std::vector(*other.qraw.qtable); + } + return *this; + } + + ~QuantEncoding() { + if (mode == kQuantModeRAW && qraw.qtable) { + delete qraw.qtable; + } + } + + // Wrappers of the QuantEncodingInternal:: static functions that return a + // QuantEncoding instead. This is using the explicit and private cast from + // QuantEncodingInternal to QuantEncoding, which would be inlined anyway. + // In general, you should use this wrappers. The only reason to directly + // create a QuantEncodingInternal instance is if you need a constexpr version + // of this class. Note that RAW() is not supported in that case since it uses + // a std::vector. + static QuantEncoding Library(uint8_t predefined_arg) { + return QuantEncoding(QuantEncodingInternal::Library(predefined_arg)); + } + static QuantEncoding Identity(const IdWeights& xybweights) { + return QuantEncoding(QuantEncodingInternal::Identity(xybweights)); + } + static QuantEncoding DCT2(const DCT2Weights& xybweights) { + return QuantEncoding(QuantEncodingInternal::DCT2(xybweights)); + } + static QuantEncoding DCT4(const DctQuantWeightParams& params, + const DCT4Multipliers& xybmul) { + return QuantEncoding(QuantEncodingInternal::DCT4(params, xybmul)); + } + static QuantEncoding DCT4X8(const DctQuantWeightParams& params, + const DCT4x8Multipliers& xybmul) { + return QuantEncoding(QuantEncodingInternal::DCT4X8(params, xybmul)); + } + static QuantEncoding DCT(const DctQuantWeightParams& params) { + return QuantEncoding(QuantEncodingInternal::DCT(params)); + } + static QuantEncoding AFV(const DctQuantWeightParams& params4x8, + const DctQuantWeightParams& params4x4, + const AFVWeights& weights) { + return QuantEncoding( + QuantEncodingInternal::AFV(params4x8, params4x4, weights)); + } + + // RAW, note that this one is not a constexpr one. + static QuantEncoding RAW(const std::vector& qtable, int shift = 0) { + QuantEncoding encoding(kQuantModeRAW); + encoding.qraw.qtable = new std::vector(); + *encoding.qraw.qtable = qtable; + encoding.qraw.qtable_den = (1 << shift) * (1.f / (8 * 255)); + return encoding; + } + + private: + explicit QuantEncoding(const QuantEncodingInternal& other) + : QuantEncodingInternal(other) {} + + explicit QuantEncoding(QuantEncodingInternal::Mode mode_arg) + : QuantEncodingInternal(mode_arg) {} +}; + +// A constexpr QuantEncodingInternal instance is often downcasted to the +// QuantEncoding subclass even if the instance wasn't an instance of the +// subclass. This is safe because user will upcast to QuantEncodingInternal to +// access any of its members. +static_assert(sizeof(QuantEncoding) == sizeof(QuantEncodingInternal), + "Don't add any members to QuantEncoding"); + +// Let's try to keep these 2**N for possible future simplicity. +const float kInvDCQuant[3] = { + 4096.0f, + 512.0f, + 256.0f, +}; + +const float kDCQuant[3] = { + 1.0f / kInvDCQuant[0], + 1.0f / kInvDCQuant[1], + 1.0f / kInvDCQuant[2], +}; + +class ModularFrameEncoder; +class ModularFrameDecoder; + +class DequantMatrices { + public: + enum QuantTable : size_t { + DCT = 0, + IDENTITY, + DCT2X2, + DCT4X4, + DCT16X16, + DCT32X32, + // DCT16X8 + DCT8X16, + // DCT32X8 + DCT8X32, + // DCT32X16 + DCT16X32, + DCT4X8, + // DCT8X4 + AFV0, + // AFV1 + // AFV2 + // AFV3 + DCT64X64, + // DCT64X32, + DCT32X64, + DCT128X128, + // DCT128X64, + DCT64X128, + DCT256X256, + // DCT256X128, + DCT128X256, + kNum + }; + + static constexpr QuantTable kQuantTable[] = { + QuantTable::DCT, QuantTable::IDENTITY, QuantTable::DCT2X2, + QuantTable::DCT4X4, QuantTable::DCT16X16, QuantTable::DCT32X32, + QuantTable::DCT8X16, QuantTable::DCT8X16, QuantTable::DCT8X32, + QuantTable::DCT8X32, QuantTable::DCT16X32, QuantTable::DCT16X32, + QuantTable::DCT4X8, QuantTable::DCT4X8, QuantTable::AFV0, + QuantTable::AFV0, QuantTable::AFV0, QuantTable::AFV0, + QuantTable::DCT64X64, QuantTable::DCT32X64, QuantTable::DCT32X64, + QuantTable::DCT128X128, QuantTable::DCT64X128, QuantTable::DCT64X128, + QuantTable::DCT256X256, QuantTable::DCT128X256, QuantTable::DCT128X256, + }; + static_assert(AcStrategy::kNumValidStrategies == + sizeof(kQuantTable) / sizeof *kQuantTable, + "Update this array when adding or removing AC strategies."); + + DequantMatrices(); + + static const QuantEncoding* Library(); + + typedef std::array + DequantLibraryInternal; + // Return the array of library kNumPredefinedTables QuantEncoding entries as + // a constexpr array. Use Library() to obtain a pointer to the copy in the + // .cc file. + static DequantLibraryInternal LibraryInit(); + + // Returns aligned memory. + JXL_INLINE const float* Matrix(size_t quant_kind, size_t c) const { + JXL_DASSERT(quant_kind < AcStrategy::kNumValidStrategies); + JXL_DASSERT((1 << quant_kind) & computed_mask_); + return &table_[table_offsets_[quant_kind * 3 + c]]; + } + + JXL_INLINE const float* InvMatrix(size_t quant_kind, size_t c) const { + JXL_DASSERT(quant_kind < AcStrategy::kNumValidStrategies); + JXL_DASSERT((1 << quant_kind) & computed_mask_); + return &inv_table_[table_offsets_[quant_kind * 3 + c]]; + } + + // DC quants are used in modular mode for XYB multipliers. + JXL_INLINE float DCQuant(size_t c) const { return dc_quant_[c]; } + JXL_INLINE const float* DCQuants() const { return dc_quant_; } + + JXL_INLINE float InvDCQuant(size_t c) const { return inv_dc_quant_[c]; } + + // For encoder. + void SetEncodings(const std::vector& encodings) { + encodings_ = encodings; + computed_mask_ = 0; + } + + // For encoder. + void SetDCQuant(const float dc[3]) { + for (size_t c = 0; c < 3; c++) { + dc_quant_[c] = 1.0f / dc[c]; + inv_dc_quant_[c] = dc[c]; + } + } + + Status Decode(BitReader* br, + ModularFrameDecoder* modular_frame_decoder = nullptr); + Status DecodeDC(BitReader* br); + + const std::vector& encodings() const { return encodings_; } + + static constexpr size_t required_size_x[] = {1, 1, 1, 1, 2, 4, 1, 1, 2, + 1, 1, 8, 4, 16, 8, 32, 16}; + static_assert(kNum == sizeof(required_size_x) / sizeof(*required_size_x), + "Update this array when adding or removing quant tables."); + + static constexpr size_t required_size_y[] = {1, 1, 1, 1, 2, 4, 2, 4, 4, + 1, 1, 8, 8, 16, 16, 32, 32}; + static_assert(kNum == sizeof(required_size_y) / sizeof(*required_size_y), + "Update this array when adding or removing quant tables."); + + Status EnsureComputed(uint32_t acs_mask); + + private: + static constexpr size_t required_size_[] = { + 1, 1, 1, 1, 4, 16, 2, 4, 8, 1, 1, 64, 32, 256, 128, 1024, 512}; + static_assert(kNum == sizeof(required_size_) / sizeof(*required_size_), + "Update this array when adding or removing quant tables."); + static constexpr size_t kTotalTableSize = + ArraySum(required_size_) * kDCTBlockSize * 3; + + uint32_t computed_mask_ = 0; + // kTotalTableSize entries followed by kTotalTableSize for inv_table + hwy::AlignedFreeUniquePtr table_storage_; + const float* table_; + const float* inv_table_; + float dc_quant_[3] = {kDCQuant[0], kDCQuant[1], kDCQuant[2]}; + float inv_dc_quant_[3] = {kInvDCQuant[0], kInvDCQuant[1], kInvDCQuant[2]}; + size_t table_offsets_[AcStrategy::kNumValidStrategies * 3]; + std::vector encodings_; +}; + +} // namespace jxl + +#endif // LIB_JXL_QUANT_WEIGHTS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/quant_weights_test.cc b/third_party/jpeg-xl/lib/jxl/quant_weights_test.cc new file mode 100644 index 0000000000..f0497948a7 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/quant_weights_test.cc @@ -0,0 +1,240 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +#include "lib/jxl/quant_weights.h" + +#include + +#include +#include +#include // HWY_ALIGN_MAX +#include +#include + +#include "lib/jxl/base/random.h" +#include "lib/jxl/dct_for_test.h" +#include "lib/jxl/dec_transforms_testonly.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_transforms.h" + +namespace jxl { +namespace { + +template +void CheckSimilar(T a, T b) { + EXPECT_EQ(a, b); +} +// minimum exponent = -15. +template <> +void CheckSimilar(float a, float b) { + float m = std::max(std::abs(a), std::abs(b)); + // 10 bits of precision are used in the format. Relative error should be + // below 2^-10. + EXPECT_LE(std::abs(a - b), m / 1024.0f) << "a: " << a << " b: " << b; +} + +TEST(QuantWeightsTest, DC) { + DequantMatrices mat; + float dc_quant[3] = {1e+5, 1e+3, 1e+1}; + DequantMatricesSetCustomDC(&mat, dc_quant); + for (size_t c = 0; c < 3; c++) { + CheckSimilar(mat.InvDCQuant(c), dc_quant[c]); + } +} + +void RoundtripMatrices(const std::vector& encodings) { + ASSERT_TRUE(encodings.size() == DequantMatrices::kNum); + DequantMatrices mat; + CodecMetadata metadata; + FrameHeader frame_header(&metadata); + ModularFrameEncoder encoder(frame_header, CompressParams{}); + DequantMatricesSetCustom(&mat, encodings, &encoder); + const std::vector& encodings_dec = mat.encodings(); + for (size_t i = 0; i < encodings.size(); i++) { + const QuantEncoding& e = encodings[i]; + const QuantEncoding& d = encodings_dec[i]; + // Check values roundtripped correctly. + EXPECT_EQ(e.mode, d.mode); + EXPECT_EQ(e.predefined, d.predefined); + EXPECT_EQ(e.source, d.source); + + EXPECT_EQ(static_cast(e.dct_params.num_distance_bands), + static_cast(d.dct_params.num_distance_bands)); + for (size_t c = 0; c < 3; c++) { + for (size_t j = 0; j < DctQuantWeightParams::kMaxDistanceBands; j++) { + CheckSimilar(e.dct_params.distance_bands[c][j], + d.dct_params.distance_bands[c][j]); + } + } + + if (e.mode == QuantEncoding::kQuantModeRAW) { + EXPECT_FALSE(!e.qraw.qtable); + EXPECT_FALSE(!d.qraw.qtable); + EXPECT_EQ(e.qraw.qtable->size(), d.qraw.qtable->size()); + for (size_t j = 0; j < e.qraw.qtable->size(); j++) { + EXPECT_EQ((*e.qraw.qtable)[j], (*d.qraw.qtable)[j]); + } + EXPECT_NEAR(e.qraw.qtable_den, d.qraw.qtable_den, 1e-7f); + } else { + // modes different than kQuantModeRAW use one of the other fields used + // here, which all happen to be arrays of floats. + for (size_t c = 0; c < 3; c++) { + for (size_t j = 0; j < 3; j++) { + CheckSimilar(e.idweights[c][j], d.idweights[c][j]); + } + for (size_t j = 0; j < 6; j++) { + CheckSimilar(e.dct2weights[c][j], d.dct2weights[c][j]); + } + for (size_t j = 0; j < 2; j++) { + CheckSimilar(e.dct4multipliers[c][j], d.dct4multipliers[c][j]); + } + CheckSimilar(e.dct4x8multipliers[c], d.dct4x8multipliers[c]); + for (size_t j = 0; j < 9; j++) { + CheckSimilar(e.afv_weights[c][j], d.afv_weights[c][j]); + } + for (size_t j = 0; j < DctQuantWeightParams::kMaxDistanceBands; j++) { + CheckSimilar(e.dct_params_afv_4x4.distance_bands[c][j], + d.dct_params_afv_4x4.distance_bands[c][j]); + } + } + } + } +} + +TEST(QuantWeightsTest, AllDefault) { + std::vector encodings(DequantMatrices::kNum, + QuantEncoding::Library(0)); + RoundtripMatrices(encodings); +} + +void TestSingleQuantMatrix(DequantMatrices::QuantTable kind) { + std::vector encodings(DequantMatrices::kNum, + QuantEncoding::Library(0)); + encodings[kind] = DequantMatrices::Library()[kind]; + RoundtripMatrices(encodings); +} + +// Ensure we can reasonably represent default quant tables. +TEST(QuantWeightsTest, DCT) { TestSingleQuantMatrix(DequantMatrices::DCT); } +TEST(QuantWeightsTest, IDENTITY) { + TestSingleQuantMatrix(DequantMatrices::IDENTITY); +} +TEST(QuantWeightsTest, DCT2X2) { + TestSingleQuantMatrix(DequantMatrices::DCT2X2); +} +TEST(QuantWeightsTest, DCT4X4) { + TestSingleQuantMatrix(DequantMatrices::DCT4X4); +} +TEST(QuantWeightsTest, DCT16X16) { + TestSingleQuantMatrix(DequantMatrices::DCT16X16); +} +TEST(QuantWeightsTest, DCT32X32) { + TestSingleQuantMatrix(DequantMatrices::DCT32X32); +} +TEST(QuantWeightsTest, DCT8X16) { + TestSingleQuantMatrix(DequantMatrices::DCT8X16); +} +TEST(QuantWeightsTest, DCT8X32) { + TestSingleQuantMatrix(DequantMatrices::DCT8X32); +} +TEST(QuantWeightsTest, DCT16X32) { + TestSingleQuantMatrix(DequantMatrices::DCT16X32); +} +TEST(QuantWeightsTest, DCT4X8) { + TestSingleQuantMatrix(DequantMatrices::DCT4X8); +} +TEST(QuantWeightsTest, AFV0) { TestSingleQuantMatrix(DequantMatrices::AFV0); } +TEST(QuantWeightsTest, RAW) { + std::vector encodings(DequantMatrices::kNum, + QuantEncoding::Library(0)); + std::vector matrix(3 * 32 * 32); + Rng rng(0); + for (size_t i = 0; i < matrix.size(); i++) matrix[i] = rng.UniformI(1, 256); + encodings[DequantMatrices::kQuantTable[AcStrategy::DCT32X32]] = + QuantEncoding::RAW(matrix, 2); + RoundtripMatrices(encodings); +} + +class QuantWeightsTargetTest : public hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(QuantWeightsTargetTest); + +TEST_P(QuantWeightsTargetTest, DCTUniform) { + constexpr float kUniformQuant = 4; + float weights[3][2] = {{1.0f / kUniformQuant, 0}, + {1.0f / kUniformQuant, 0}, + {1.0f / kUniformQuant, 0}}; + DctQuantWeightParams dct_params(weights); + std::vector encodings(DequantMatrices::kNum, + QuantEncoding::DCT(dct_params)); + DequantMatrices dequant_matrices; + CodecMetadata metadata; + FrameHeader frame_header(&metadata); + ModularFrameEncoder encoder(frame_header, CompressParams{}); + DequantMatricesSetCustom(&dequant_matrices, encodings, &encoder); + JXL_CHECK(dequant_matrices.EnsureComputed(~0u)); + + const float dc_quant[3] = {1.0f / kUniformQuant, 1.0f / kUniformQuant, + 1.0f / kUniformQuant}; + DequantMatricesSetCustomDC(&dequant_matrices, dc_quant); + + HWY_ALIGN_MAX float scratch_space[16 * 16 * 2]; + + // DCT8 + { + HWY_ALIGN_MAX float pixels[64]; + std::iota(std::begin(pixels), std::end(pixels), 0); + HWY_ALIGN_MAX float coeffs[64]; + const AcStrategy::Type dct = AcStrategy::DCT; + TransformFromPixels(dct, pixels, 8, coeffs, scratch_space); + HWY_ALIGN_MAX double slow_coeffs[64]; + for (size_t i = 0; i < 64; i++) slow_coeffs[i] = pixels[i]; + DCTSlow<8>(slow_coeffs); + + for (size_t i = 0; i < 64; i++) { + // DCTSlow doesn't multiply/divide by 1/N, so we do it manually. + slow_coeffs[i] = roundf(slow_coeffs[i] / kUniformQuant) * kUniformQuant; + coeffs[i] = roundf(coeffs[i] / dequant_matrices.Matrix(dct, 0)[i]) * + dequant_matrices.Matrix(dct, 0)[i]; + } + IDCTSlow<8>(slow_coeffs); + TransformToPixels(dct, coeffs, pixels, 8, scratch_space); + for (size_t i = 0; i < 64; i++) { + EXPECT_NEAR(pixels[i], slow_coeffs[i], 1e-4); + } + } + + // DCT16 + { + HWY_ALIGN_MAX float pixels[64 * 4]; + std::iota(std::begin(pixels), std::end(pixels), 0); + HWY_ALIGN_MAX float coeffs[64 * 4]; + const AcStrategy::Type dct = AcStrategy::DCT16X16; + TransformFromPixels(dct, pixels, 16, coeffs, scratch_space); + HWY_ALIGN_MAX double slow_coeffs[64 * 4]; + for (size_t i = 0; i < 64 * 4; i++) slow_coeffs[i] = pixels[i]; + DCTSlow<16>(slow_coeffs); + + for (size_t i = 0; i < 64 * 4; i++) { + slow_coeffs[i] = roundf(slow_coeffs[i] / kUniformQuant) * kUniformQuant; + coeffs[i] = roundf(coeffs[i] / dequant_matrices.Matrix(dct, 0)[i]) * + dequant_matrices.Matrix(dct, 0)[i]; + } + + IDCTSlow<16>(slow_coeffs); + TransformToPixels(dct, coeffs, pixels, 16, scratch_space); + for (size_t i = 0; i < 64 * 4; i++) { + EXPECT_NEAR(pixels[i], slow_coeffs[i], 1e-4); + } + } + + // Check that all matrices have the same DC quantization, i.e. that they all + // have the same scaling. + for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) { + EXPECT_NEAR(dequant_matrices.Matrix(i, 0)[0], kUniformQuant, 1e-6); + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/quantizer-inl.h b/third_party/jpeg-xl/lib/jxl/quantizer-inl.h new file mode 100644 index 0000000000..64d273c552 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/quantizer-inl.h @@ -0,0 +1,74 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JXL_QUANTIZER_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_QUANTIZER_INL_H_ +#undef LIB_JXL_QUANTIZER_INL_H_ +#else +#define LIB_JXL_QUANTIZER_INL_H_ +#endif + +#include + +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::And; +using hwy::HWY_NAMESPACE::AndNot; +using hwy::HWY_NAMESPACE::ApproximateReciprocal; +using hwy::HWY_NAMESPACE::Gt; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::IfThenElseZero; +using hwy::HWY_NAMESPACE::Lt; +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::Vec; +using hwy::HWY_NAMESPACE::Xor; + +template +HWY_INLINE HWY_MAYBE_UNUSED Vec> AdjustQuantBias( + DI di, const size_t c, const Vec quant_i, + const float* HWY_RESTRICT biases) { + const Rebind df; + + const auto quant = ConvertTo(df, quant_i); + + // Compare |quant|, keep sign bit for negating result. + const auto kSign = BitCast(df, Set(di, INT32_MIN)); + const auto sign = And(quant, kSign); // TODO(janwas): = abs ^ orig + const auto abs_quant = AndNot(kSign, quant); + + // If |x| is 1, kZeroBias creates a different bias for each channel. + // We're implementing the following: + // if (quant == 0) return 0; + // if (quant == 1) return biases[c]; + // if (quant == -1) return -biases[c]; + // return quant - biases[3] / quant; + + // Integer comparison is not helpful because Clang incurs bypass penalties + // from unnecessarily mixing integer and float. + const auto is_01 = Lt(abs_quant, Set(df, 1.125f)); + const auto not_0 = Gt(abs_quant, Zero(df)); + + // Bitwise logic is faster than quant * biases[c]. + const auto one_bias = IfThenElseZero(not_0, Xor(Set(df, biases[c]), sign)); + + // About 2E-5 worse than ReciprocalNR or division. + const auto bias = + NegMulAdd(Set(df, biases[3]), ApproximateReciprocal(quant), quant); + + return IfThenElse(is_01, one_bias, bias); +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_QUANTIZER_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/quantizer.cc b/third_party/jpeg-xl/lib/jxl/quantizer.cc new file mode 100644 index 0000000000..153cf19b21 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/quantizer.cc @@ -0,0 +1,156 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/quantizer.h" + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/quant_weights.h" + +namespace jxl { + +static const int32_t kDefaultQuant = 64; + +constexpr int32_t Quantizer::kQuantMax; + +Quantizer::Quantizer(const DequantMatrices* dequant) + : Quantizer(dequant, kDefaultQuant, kGlobalScaleDenom / kDefaultQuant) {} + +Quantizer::Quantizer(const DequantMatrices* dequant, int quant_dc, + int global_scale) + : global_scale_(global_scale), quant_dc_(quant_dc), dequant_(dequant) { + JXL_ASSERT(dequant_ != nullptr); + RecomputeFromGlobalScale(); + inv_quant_dc_ = inv_global_scale_ / quant_dc_; + + memcpy(zero_bias_, kZeroBiasDefault, sizeof(kZeroBiasDefault)); +} + +void Quantizer::ComputeGlobalScaleAndQuant(float quant_dc, float quant_median, + float quant_median_absd) { + // Target value for the median value in the quant field. + const float kQuantFieldTarget = 5; + // We reduce the median of the quant field by the median absolute deviation: + // higher resolution on highly varying quant fields. + float scale = kGlobalScaleDenom * (quant_median - quant_median_absd) / + kQuantFieldTarget; + // Ensure that new_global_scale is positive and no more than 1<<15. + if (scale < 1) scale = 1; + if (scale > (1 << 15)) scale = 1 << 15; + int new_global_scale = static_cast(scale); + // Ensure that quant_dc_ will always be at least + // 0.625 * kGlobalScaleDenom/kGlobalScaleNumerator = 10. + const int scaled_quant_dc = + static_cast(quant_dc * kGlobalScaleNumerator * 1.6); + if (new_global_scale > scaled_quant_dc) { + new_global_scale = scaled_quant_dc; + if (new_global_scale <= 0) new_global_scale = 1; + } + global_scale_ = new_global_scale; + // Code below uses inv_global_scale_. + RecomputeFromGlobalScale(); + + float fval = quant_dc * inv_global_scale_ + 0.5f; + fval = std::min(1 << 16, fval); + const int new_quant_dc = static_cast(fval); + quant_dc_ = new_quant_dc; + + // quant_dc_ was updated, recompute values. + RecomputeFromGlobalScale(); +} + +void Quantizer::SetQuantFieldRect(const ImageF& qf, const Rect& rect, + ImageI* JXL_RESTRICT raw_quant_field) const { + for (size_t y = 0; y < rect.ysize(); ++y) { + const float* JXL_RESTRICT row_qf = rect.ConstRow(qf, y); + int32_t* JXL_RESTRICT row_qi = rect.Row(raw_quant_field, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + int val = ClampVal(row_qf[x] * inv_global_scale_ + 0.5f); + row_qi[x] = val; + } + } +} + +void Quantizer::SetQuantField(const float quant_dc, const ImageF& qf, + ImageI* JXL_RESTRICT raw_quant_field) { + std::vector data(qf.xsize() * qf.ysize()); + for (size_t y = 0; y < qf.ysize(); ++y) { + const float* JXL_RESTRICT row_qf = qf.Row(y); + for (size_t x = 0; x < qf.xsize(); ++x) { + float quant = row_qf[x]; + data[qf.xsize() * y + x] = quant; + } + } + std::nth_element(data.begin(), data.begin() + data.size() / 2, data.end()); + const float quant_median = data[data.size() / 2]; + std::vector deviations(data.size()); + for (size_t i = 0; i < data.size(); i++) { + deviations[i] = fabsf(data[i] - quant_median); + } + std::nth_element(deviations.begin(), + deviations.begin() + deviations.size() / 2, + deviations.end()); + const float quant_median_absd = deviations[deviations.size() / 2]; + ComputeGlobalScaleAndQuant(quant_dc, quant_median, quant_median_absd); + if (raw_quant_field) { + JXL_CHECK(SameSize(*raw_quant_field, qf)); + SetQuantFieldRect(qf, Rect(qf), raw_quant_field); + } +} + +void Quantizer::SetQuant(float quant_dc, float quant_ac, + ImageI* JXL_RESTRICT raw_quant_field) { + ComputeGlobalScaleAndQuant(quant_dc, quant_ac, 0); + int32_t val = ClampVal(quant_ac * inv_global_scale_ + 0.5f); + FillImage(val, raw_quant_field); +} + +Status QuantizerParams::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + BitsOffset(11, 1), BitsOffset(11, 2049), BitsOffset(12, 4097), + BitsOffset(16, 8193), 1, &global_scale)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), BitsOffset(5, 1), + BitsOffset(8, 1), BitsOffset(16, 1), 1, + &quant_dc)); + return true; +} + +QuantizerParams Quantizer::GetParams() const { + QuantizerParams params; + params.global_scale = global_scale_; + params.quant_dc = quant_dc_; + return params; +} + +Status Quantizer::Decode(BitReader* reader) { + QuantizerParams params; + JXL_RETURN_IF_ERROR(Bundle::Read(reader, ¶ms)); + global_scale_ = static_cast(params.global_scale); + quant_dc_ = static_cast(params.quant_dc); + RecomputeFromGlobalScale(); + return true; +} + +void Quantizer::DumpQuantizationMap(const ImageI& raw_quant_field) const { + printf("Global scale: %d (%.7f)\nDC quant: %d\n", global_scale_, + global_scale_ * 1.0 / kGlobalScaleDenom, quant_dc_); + printf("AC quantization Map:\n"); + for (size_t y = 0; y < raw_quant_field.ysize(); ++y) { + for (size_t x = 0; x < raw_quant_field.xsize(); ++x) { + printf(" %3d", raw_quant_field.Row(y)[x]); + } + printf("\n"); + } +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/quantizer.h b/third_party/jpeg-xl/lib/jxl/quantizer.h new file mode 100644 index 0000000000..d78ba7b3fc --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/quantizer.h @@ -0,0 +1,182 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_QUANTIZER_H_ +#define LIB_JXL_QUANTIZER_H_ + +#include +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/quant_weights.h" + +// Quantizes DC and AC coefficients, with separate quantization tables according +// to the quant_kind (which is currently computed from the AC strategy and the +// block index inside that strategy). + +namespace jxl { + +static constexpr int kGlobalScaleDenom = 1 << 16; +static constexpr int kGlobalScaleNumerator = 4096; + +// zero-biases for quantizing channels X, Y, B +static constexpr float kZeroBiasDefault[3] = {0.5f, 0.5f, 0.5f}; + +// Returns adjusted version of a quantized integer, such that its value is +// closer to the expected value of the original. +// The residuals of AC coefficients that we quantize are not uniformly +// distributed. Numerical experiments show that they have a distribution with +// the "shape" of 1/(1+x^2) [up to some coefficients]. This means that the +// expected value of a coefficient that gets quantized to x will not be x +// itself, but (at least with reasonable approximation): +// - 0 if x is 0 +// - x * biases[c] if x is 1 or -1 +// - x - biases[3]/x otherwise +// This follows from computing the distribution of the quantization bias, which +// can be approximated fairly well by /x when |x| is at least two. +static constexpr float kBiasNumerator = 0.145f; + +static constexpr float kDefaultQuantBias[4] = { + 1.0f - 0.05465007330715401f, + 1.0f - 0.07005449891748593f, + 1.0f - 0.049935103337343655f, + 0.145f, +}; + +struct QuantizerParams; + +class Quantizer { + public: + explicit Quantizer(const DequantMatrices* dequant); + Quantizer(const DequantMatrices* dequant, int quant_dc, int global_scale); + + static constexpr int32_t kQuantMax = 256; + + static JXL_INLINE int32_t ClampVal(float val) { + return static_cast( + std::max(1.0f, std::min(val, kQuantMax))); + } + + float ScaleGlobalScale(const float scale) { + int new_global_scale = static_cast(global_scale_ * scale + 0.5f); + float scale_out = new_global_scale * 1.0f / global_scale_; + global_scale_ = new_global_scale; + RecomputeFromGlobalScale(); + return scale_out; + } + + // Recomputes other derived fields after global_scale_ has changed. + void RecomputeFromGlobalScale() { + global_scale_float_ = global_scale_ * (1.0 / kGlobalScaleDenom); + inv_global_scale_ = 1.0 * kGlobalScaleDenom / global_scale_; + inv_quant_dc_ = inv_global_scale_ / quant_dc_; + for (size_t c = 0; c < 3; c++) { + mul_dc_[c] = GetDcStep(c); + inv_mul_dc_[c] = GetInvDcStep(c); + } + } + + // Returns scaling factor such that Scale() * (RawDC() or RawQuantField()) + // pixels yields the same float values returned by GetQuantField. + JXL_INLINE float Scale() const { return global_scale_float_; } + + // Reciprocal of Scale(). + JXL_INLINE float InvGlobalScale() const { return inv_global_scale_; } + + void SetQuantFieldRect(const ImageF& qf, const Rect& rect, + ImageI* JXL_RESTRICT raw_quant_field) const; + + void SetQuantField(float quant_dc, const ImageF& qf, + ImageI* JXL_RESTRICT raw_quant_field); + + void SetQuant(float quant_dc, float quant_ac, + ImageI* JXL_RESTRICT raw_quant_field); + + // Returns the DC quantization base value, which is currently global (not + // adaptive). The actual scale factor used to dequantize pixels in channel c + // is: inv_quant_dc() * dequant_->DCQuant(c). + float inv_quant_dc() const { return inv_quant_dc_; } + + // Dequantize by multiplying with this times dequant_matrix. + float inv_quant_ac(int32_t quant) const { return inv_global_scale_ / quant; } + + QuantizerParams GetParams() const; + + Status Decode(BitReader* reader); + + void DumpQuantizationMap(const ImageI& raw_quant_field) const; + + JXL_INLINE const float* DequantMatrix(size_t quant_kind, size_t c) const { + return dequant_->Matrix(quant_kind, c); + } + + JXL_INLINE const float* InvDequantMatrix(size_t quant_kind, size_t c) const { + return dequant_->InvMatrix(quant_kind, c); + } + + // Calculates DC quantization step. + JXL_INLINE float GetDcStep(size_t c) const { + return inv_quant_dc_ * dequant_->DCQuant(c); + } + JXL_INLINE float GetInvDcStep(size_t c) const { + return dequant_->InvDCQuant(c) * (global_scale_float_ * quant_dc_); + } + + JXL_INLINE const float* MulDC() const { return mul_dc_; } + JXL_INLINE const float* InvMulDC() const { return inv_mul_dc_; } + + JXL_INLINE void ClearDCMul() { + std::fill(mul_dc_, mul_dc_ + 4, 1.f); + std::fill(inv_mul_dc_, inv_mul_dc_ + 4, 1.f); + } + + void ComputeGlobalScaleAndQuant(float quant_dc, float quant_median, + float quant_median_absd); + + private: + float mul_dc_[4]; + float inv_mul_dc_[4]; + + // These are serialized: + int global_scale_; + int quant_dc_; + + // These are derived from global_scale_: + float inv_global_scale_; + float global_scale_float_; // reciprocal of inv_global_scale_ + float inv_quant_dc_; + + float zero_bias_[3]; + const DequantMatrices* dequant_; +}; + +struct QuantizerParams : public Fields { + QuantizerParams() { Bundle::Init(this); } + JXL_FIELDS_NAME(QuantizerParams) + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + uint32_t global_scale; + uint32_t quant_dc; +}; + +} // namespace jxl + +#endif // LIB_JXL_QUANTIZER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/quantizer_test.cc b/third_party/jpeg-xl/lib/jxl/quantizer_test.cc new file mode 100644 index 0000000000..f9cf2c838e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/quantizer_test.cc @@ -0,0 +1,81 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/quantizer.h" + +#include "lib/jxl/base/span.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_fields.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +void TestEquivalence(int qxsize, int qysize, const Quantizer& quantizer1, + const Quantizer& quantizer2) { + ASSERT_NEAR(quantizer1.inv_quant_dc(), quantizer2.inv_quant_dc(), 1e-7); +} + +TEST(QuantizerTest, QuantizerParams) { + for (uint32_t i = 1; i < 10000; ++i) { + QuantizerParams p; + p.global_scale = i; + size_t extension_bits = 0, total_bits = 0; + EXPECT_TRUE(Bundle::CanEncode(p, &extension_bits, &total_bits)); + EXPECT_EQ(0u, extension_bits); + EXPECT_GE(total_bits, 4u); + } +} + +TEST(QuantizerTest, BitStreamRoundtripSameQuant) { + const int qxsize = 8; + const int qysize = 8; + DequantMatrices dequant; + Quantizer quantizer1(&dequant); + ImageI raw_quant_field(qxsize, qysize); + quantizer1.SetQuant(0.17f, 0.17f, &raw_quant_field); + BitWriter writer; + QuantizerParams params = quantizer1.GetParams(); + EXPECT_TRUE(WriteQuantizerParams(params, &writer, 0, nullptr)); + writer.ZeroPadToByte(); + const size_t bits_written = writer.BitsWritten(); + Quantizer quantizer2(&dequant); + BitReader reader(writer.GetSpan()); + EXPECT_TRUE(quantizer2.Decode(&reader)); + EXPECT_TRUE(reader.JumpToByteBoundary()); + EXPECT_EQ(reader.TotalBitsConsumed(), bits_written); + EXPECT_TRUE(reader.Close()); + TestEquivalence(qxsize, qysize, quantizer1, quantizer2); +} + +TEST(QuantizerTest, BitStreamRoundtripRandomQuant) { + const int qxsize = 8; + const int qysize = 8; + DequantMatrices dequant; + Quantizer quantizer1(&dequant); + ImageI raw_quant_field(qxsize, qysize); + quantizer1.SetQuant(0.17f, 0.17f, &raw_quant_field); + float quant_dc = 0.17f; + ImageF qf(qxsize, qysize); + RandomFillImage(&qf, 0.0f, 1.0f); + quantizer1.SetQuantField(quant_dc, qf, &raw_quant_field); + BitWriter writer; + QuantizerParams params = quantizer1.GetParams(); + EXPECT_TRUE(WriteQuantizerParams(params, &writer, 0, nullptr)); + writer.ZeroPadToByte(); + const size_t bits_written = writer.BitsWritten(); + Quantizer quantizer2(&dequant); + BitReader reader(writer.GetSpan()); + EXPECT_TRUE(quantizer2.Decode(&reader)); + EXPECT_TRUE(reader.JumpToByteBoundary()); + EXPECT_EQ(reader.TotalBitsConsumed(), bits_written); + EXPECT_TRUE(reader.Close()); + TestEquivalence(qxsize, qysize, quantizer1, quantizer2); +} +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h b/third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h new file mode 100644 index 0000000000..176e24092c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h @@ -0,0 +1,98 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fast SIMD evaluation of rational polynomials for approximating functions. + +#if defined(LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_ +#undef LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_ +#else +#define LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_ +#endif + +#include + +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Div; +using hwy::HWY_NAMESPACE::MulAdd; + +// Primary template: default to actual division. +template +struct FastDivision { + HWY_INLINE V operator()(const V n, const V d) const { return n / d; } +}; +// Partial specialization for float vectors. +template +struct FastDivision { + // One Newton-Raphson iteration. + static HWY_INLINE V ReciprocalNR(const V x) { + const auto rcp = ApproximateReciprocal(x); + const auto sum = Add(rcp, rcp); + const auto x_rcp = Mul(x, rcp); + return NegMulAdd(x_rcp, rcp, sum); + } + + V operator()(const V n, const V d) const { +#if 1 // Faster on SKX + return Div(n, d); +#else + return n * ReciprocalNR(d); +#endif + } +}; + +// Approximates smooth functions via rational polynomials (i.e. dividing two +// polynomials). Evaluates polynomials via Horner's scheme, which is faster than +// Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows us to +// specify constants (replicated 4x) independently of the lane count. +template +HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x, + const T (&p)[NP], + const T (&q)[NQ]) { + constexpr size_t kDegP = NP / 4 - 1; + constexpr size_t kDegQ = NQ / 4 - 1; + auto yp = LoadDup128(d, &p[kDegP * 4]); + auto yq = LoadDup128(d, &q[kDegQ * 4]); + // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a + // compiler warning that the index is out of bounds since we are already + // checking that it is not out of bounds with (kDegP >= n) and the access + // will be optimized away. Similarly with q and kDegQ. + HWY_FENCE; + if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4))); + if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4))); + HWY_FENCE; + if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4))); + if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4))); + HWY_FENCE; + if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4))); + if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4))); + HWY_FENCE; + if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4))); + if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4))); + HWY_FENCE; + if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4))); + if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4))); + HWY_FENCE; + if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4))); + if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4))); + HWY_FENCE; + if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4))); + if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4))); + + return FastDivision()(yp, yq); +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); +#endif // LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc b/third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc new file mode 100644 index 0000000000..13fc044a55 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc @@ -0,0 +1,238 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/rational_polynomial_test.cc" +#include +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/rational_polynomial-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +using T = float; // required by EvalLog2 +using D = HWY_FULL(T); + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::GetLane; +using hwy::HWY_NAMESPACE::ShiftLeft; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::Sub; + +// Generic: only computes polynomial +struct EvalPoly { + template + T operator()(T x, const T (&p)[NP], const T (&q)[NQ]) const { + const HWY_FULL(T) d; + const auto vx = Set(d, x); + const auto approx = EvalRationalPolynomial(d, vx, p, q); + return GetLane(approx); + } +}; + +// Range reduction for log2 +struct EvalLog2 { + template + T operator()(T x, const T (&p)[NP], const T (&q)[NQ]) const { + const HWY_FULL(T) d; + auto vx = Set(d, x); + + const HWY_FULL(int32_t) di; + const auto x_bits = BitCast(di, vx); + // Cannot handle negative numbers / NaN. + JXL_DASSERT(AllTrue(di, Eq(Abs(x_bits), x_bits))); + + // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops + const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); // = 2/3 + // Shifted exponent = log2; also used to clear mantissa. + const auto exp_shifted = ShiftRight<23>(exp_bits); + const auto mantissa = BitCast(d, Sub(x_bits, ShiftLeft<23>(exp_shifted))); + const auto exp_val = ConvertTo(d, exp_shifted); + vx = Sub(mantissa, Set(d, 1.0f)); + + const auto approx = Add(EvalRationalPolynomial(d, vx, p, q), exp_val); + return GetLane(approx); + } +}; + +// Functions to approximate: + +T LinearToSrgb8Direct(T val) { + if (val < 0.0) return 0.0; + if (val >= 255.0) return 255.0; + if (val <= 10.0 / 12.92) return val * 12.92; + return 255.0 * (std::pow(val / 255.0, 1.0 / 2.4) * 1.055 - 0.055); +} + +T SimpleGamma(T v) { + static const T kGamma = 0.387494322593; + static const T limit = 43.01745241042018; + T bright = v - limit; + if (bright >= 0) { + static const T mul = 0.0383723643799; + v -= bright * mul; + } + static const T limit2 = 94.68634353321337; + T bright2 = v - limit2; + if (bright2 >= 0) { + static const T mul = 0.22885405968; + v -= bright2 * mul; + } + static const T offset = 0.156775786057; + static const T scale = 8.898059160493739; + T retval = scale * (offset + pow(v, kGamma)); + return retval; +} + +// Runs CaratheodoryFejer and verifies the polynomial using a lot of samples to +// return the biggest error. +template +T RunApproximation(T x0, T x1, const T (&p)[NP], const T (&q)[NQ], + const Eval& eval, T func_to_approx(T)) { + float maxerr = 0; + T lastPrint = 0; + // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter) + for (T x = x0; x <= x1; x += (x1 - x0) / 10000.0) { + const T f = func_to_approx(x); + const T g = eval(x, p, q); + maxerr = std::max(fabsf(g - f), maxerr); + if (x == x0 || x - lastPrint > (x1 - x0) / 20.0) { + printf("x: %11.6f, f: %11.6f, g: %11.6f, e: %11.6f\n", x, f, g, + fabs(g - f)); + lastPrint = x; + } + } + return maxerr; +} + +void TestSimpleGamma() { + const T p[4 * (6 + 1)] = { + HWY_REP4(-5.0646949363741811E-05), HWY_REP4(6.7369380528439771E-05), + HWY_REP4(8.9376652530412794E-05), HWY_REP4(2.1153513301520462E-06), + HWY_REP4(-6.9130322970386449E-08), HWY_REP4(3.9424752749293728E-10), + HWY_REP4(1.2360288207619576E-13)}; + + const T q[4 * (6 + 1)] = { + HWY_REP4(-6.6389733798591366E-06), HWY_REP4(1.3299859726565908E-05), + HWY_REP4(3.8538748358398873E-06), HWY_REP4(-2.8707687262928236E-08), + HWY_REP4(-6.6897385800005434E-10), HWY_REP4(6.1428748869186003E-12), + HWY_REP4(-2.5475738169252870E-15)}; + + const T err = RunApproximation(0.77, 274.579999999999984, p, q, EvalPoly(), + SimpleGamma); + EXPECT_LT(err, 0.05); +} + +void TestLinearToSrgb8Direct() { + const T p[4 * (5 + 1)] = { + HWY_REP4(-9.5357499040105154E-05), HWY_REP4(4.6761186249798248E-04), + HWY_REP4(2.5708174333943594E-04), HWY_REP4(1.5250087770436082E-05), + HWY_REP4(1.1946768008931187E-07), HWY_REP4(5.9916446295972850E-11)}; + + const T q[4 * (4 + 1)] = { + HWY_REP4(1.8932479758079768E-05), HWY_REP4(2.7312342474687321E-05), + HWY_REP4(4.3901204783327006E-06), HWY_REP4(1.0417787306920273E-07), + HWY_REP4(3.0084206762140419E-10)}; + + const T err = + RunApproximation(0.77, 255, p, q, EvalPoly(), LinearToSrgb8Direct); + EXPECT_LT(err, 0.05); +} + +void TestExp() { + const T p[4 * (2 + 1)] = {HWY_REP4(9.6266879665530902E-01), + HWY_REP4(4.8961265681586763E-01), + HWY_REP4(8.2619259189548433E-02)}; + const T q[4 * (2 + 1)] = {HWY_REP4(9.6259895571622622E-01), + HWY_REP4(-4.7272457588933831E-01), + HWY_REP4(7.4802088567547664E-02)}; + const T err = + RunApproximation(-1, 1, p, q, EvalPoly(), [](T x) { return T(exp(x)); }); + EXPECT_LT(err, 1E-4); +} + +void TestNegExp() { + // 4,3 is the min required for monotonicity; max error in 0,10: 751 ppm + // no benefit for k>50. + const T p[4 * (4 + 1)] = { + HWY_REP4(5.9580258551150123E-02), HWY_REP4(-2.5073728806886408E-02), + HWY_REP4(4.1561830213689248E-03), HWY_REP4(-3.1815408488900372E-04), + HWY_REP4(9.3866690094906802E-06)}; + const T q[4 * (3 + 1)] = { + HWY_REP4(5.9579108238812878E-02), HWY_REP4(3.4542074345478582E-02), + HWY_REP4(8.7263562483501714E-03), HWY_REP4(1.4095109143061216E-03)}; + + const T err = + RunApproximation(0, 10, p, q, EvalPoly(), [](T x) { return T(exp(-x)); }); + EXPECT_LT(err, sizeof(T) == 8 ? 2E-5 : 3E-5); +} + +void TestSin() { + const T p[4 * (6 + 1)] = { + HWY_REP4(1.5518122109203780E-05), HWY_REP4(2.3388958643675966E+00), + HWY_REP4(-8.6705520940849157E-01), HWY_REP4(-1.9702294764873535E-01), + HWY_REP4(1.2193404314472320E-01), HWY_REP4(-1.7373966109788839E-02), + HWY_REP4(7.8829435883034796E-04)}; + const T q[4 * (5 + 1)] = { + HWY_REP4(2.3394371422557279E+00), HWY_REP4(-8.7028221081288615E-01), + HWY_REP4(2.0052872219658430E-01), HWY_REP4(-3.2460335995264836E-02), + HWY_REP4(3.1546157932479282E-03), HWY_REP4(-1.6692542019380155E-04)}; + + const T err = RunApproximation(0, Pi(1) * 2, p, q, EvalPoly(), + [](T x) { return T(sin(x)); }); + EXPECT_LT(err, sizeof(T) == 8 ? 5E-4 : 7E-4); +} + +void TestLog() { + HWY_ALIGN const T p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06), + HWY_REP4(1.4287160470083755E+00), + HWY_REP4(7.4245873327820566E-01)}; + HWY_ALIGN const T q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01), + HWY_REP4(1.0096718572241148E+00), + HWY_REP4(1.7409343003366853E-01)}; + const T err = RunApproximation(1E-6, 1000, p, q, EvalLog2(), std::log2); + printf("%E\n", err); +} + +HWY_NOINLINE void TestRationalPolynomial() { + TestSimpleGamma(); + TestLinearToSrgb8Direct(); + TestExp(); + TestNegExp(); + TestSin(); + TestLog(); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class RationalPolynomialTest : public hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(RationalPolynomialTest); + +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestSimpleGamma); +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestLinearToSrgb8Direct); +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestExp); +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestNegExp); +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestSin); +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestLog); + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc new file mode 100644 index 0000000000..db60a458db --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc @@ -0,0 +1,865 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h" + +#include +#include +#include + +#include "lib/jxl/base/arch_macros.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { +std::pair +LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions( + std::pair in, size_t c, size_t stage) const { + std::pair ret; + std::pair shift = channel_shifts_[stage][c]; + ret.first = + ((in.first << base_color_shift_) + (1 << shift.first) - 1) >> shift.first; + ret.second = ((in.second << base_color_shift_) + (1 << shift.second) - 1) >> + shift.second; + return ret; +} + +std::pair LowMemoryRenderPipeline::BorderToStore( + size_t c) const { + auto ret = ColorDimensionsToChannelDimensions(group_border_, c, 0); + ret.first += padding_[0][c].first; + ret.second += padding_[0][c].second; + return ret; +} + +void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c, + const ImageF& in) { + size_t gy = group_id / frame_dimensions_.xsize_groups; + size_t gx = group_id % frame_dimensions_.xsize_groups; + size_t hshift = channel_shifts_[0][c].first; + size_t vshift = channel_shifts_[0][c].second; + size_t x0 = gx * GroupInputXSize(c); + size_t x1 = std::min((gx + 1) * GroupInputXSize(c), + DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift)); + size_t y0 = gy * GroupInputYSize(c); + size_t y1 = std::min((gy + 1) * GroupInputYSize(c), + DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift)); + + auto borders = BorderToStore(c); + size_t borderx_write = borders.first; + size_t bordery_write = borders.second; + + if (gy > 0) { + Rect from(group_data_x_border_, group_data_y_border_, x1 - x0, + bordery_write); + Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write); + CopyImageTo(from, in, to, &borders_horizontal_[c]); + } + if (gy + 1 < frame_dimensions_.ysize_groups) { + Rect from(group_data_x_border_, + group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0, + bordery_write); + Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write); + CopyImageTo(from, in, to, &borders_horizontal_[c]); + } + if (gx > 0) { + Rect from(group_data_x_border_, group_data_y_border_, borderx_write, + y1 - y0); + Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0); + CopyImageTo(from, in, to, &borders_vertical_[c]); + } + if (gx + 1 < frame_dimensions_.xsize_groups) { + Rect from(group_data_x_border_ + x1 - x0 - borderx_write, + group_data_y_border_, borderx_write, y1 - y0); + Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0); + CopyImageTo(from, in, to, &borders_vertical_[c]); + } +} + +void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c, + const Rect& r, ImageF* out) { + size_t gy = group_id / frame_dimensions_.xsize_groups; + size_t gx = group_id % frame_dimensions_.xsize_groups; + size_t hshift = channel_shifts_[0][c].first; + size_t vshift = channel_shifts_[0][c].second; + // Coordinates of the group in the image. + size_t x0 = gx * GroupInputXSize(c); + size_t x1 = std::min((gx + 1) * GroupInputXSize(c), + DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift)); + size_t y0 = gy * GroupInputYSize(c); + size_t y1 = std::min((gy + 1) * GroupInputYSize(c), + DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift)); + + size_t paddingx = padding_[0][c].first; + size_t paddingy = padding_[0][c].second; + + auto borders = BorderToStore(c); + size_t borderx_write = borders.first; + size_t bordery_write = borders.second; + + // Limits of the area to copy from, in image coordinates. + JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx); + size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift); + if (x0src != 0) { + x0src -= paddingx; + } + // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the + // right side of the image, so we use min() here. + size_t x1src = + DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift); + x1src = std::min(x1src + paddingx, + DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift)); + + // Similar computation for y. + JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy); + size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift); + if (y0src != 0) { + y0src -= paddingy; + } + size_t y1src = + DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift); + y1src = std::min(y1src + paddingy, + DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift)); + + // Copy other groups' borders from the border storage. + if (y0src < y0) { + JXL_DASSERT(gy > 0); + CopyImageTo( + Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src, bordery_write), + borders_horizontal_[c], + Rect(group_data_x_border_ + x0src - x0, + group_data_y_border_ - bordery_write, x1src - x0src, + bordery_write), + out); + } + if (y1src > y1) { + // When copying the bottom border we must not be on the bottom groups. + JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups); + CopyImageTo( + Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src, bordery_write), + borders_horizontal_[c], + Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0, + x1src - x0src, bordery_write), + out); + } + if (x0src < x0) { + JXL_DASSERT(gx > 0); + CopyImageTo( + Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write, y1src - y0src), + borders_vertical_[c], + Rect(group_data_x_border_ - borderx_write, + group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src), + out); + } + if (x1src > x1) { + // When copying the right border we must not be on the rightmost groups. + JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups); + CopyImageTo( + Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write, y1src - y0src), + borders_vertical_[c], + Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0, + borderx_write, y1src - y0src), + out); + } +} + +size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const { + return (frame_dimensions_.group_dim << base_color_shift_) >> + channel_shifts_[0][c].first; +} + +size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const { + return (frame_dimensions_.group_dim << base_color_shift_) >> + channel_shifts_[0][c].second; +} + +void LowMemoryRenderPipeline::EnsureBordersStorage() { + const auto& shifts = channel_shifts_[0]; + if (borders_horizontal_.size() < shifts.size()) { + borders_horizontal_.resize(shifts.size()); + borders_vertical_.resize(shifts.size()); + } + for (size_t c = 0; c < shifts.size(); c++) { + auto borders = BorderToStore(c); + size_t borderx = borders.first; + size_t bordery = borders.second; + JXL_DASSERT(frame_dimensions_.xsize_groups > 0); + size_t num_xborders = (frame_dimensions_.xsize_groups - 1) * 2; + JXL_DASSERT(frame_dimensions_.ysize_groups > 0); + size_t num_yborders = (frame_dimensions_.ysize_groups - 1) * 2; + size_t downsampled_xsize = + DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first); + size_t downsampled_ysize = DivCeil(frame_dimensions_.ysize_upsampled_padded, + 1 << shifts[c].second); + Rect horizontal = Rect(0, 0, downsampled_xsize, bordery * num_yborders); + if (!SameSize(horizontal, borders_horizontal_[c])) { + borders_horizontal_[c] = ImageF(horizontal.xsize(), horizontal.ysize()); + } + Rect vertical = Rect(0, 0, borderx * num_xborders, downsampled_ysize); + if (!SameSize(vertical, borders_vertical_[c])) { + borders_vertical_[c] = ImageF(vertical.xsize(), vertical.ysize()); + } + } +} + +void LowMemoryRenderPipeline::Init() { + group_border_ = {0, 0}; + base_color_shift_ = CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded / + frame_dimensions_.xsize_padded); + + const auto& shifts = channel_shifts_[0]; + + // Ensure that each channel has enough many border pixels. + for (size_t c = 0; c < shifts.size(); c++) { + group_border_.first = + std::max(group_border_.first, + DivCeil(padding_[0][c].first << channel_shifts_[0][c].first, + 1 << base_color_shift_)); + group_border_.second = + std::max(group_border_.second, + DivCeil(padding_[0][c].second << channel_shifts_[0][c].second, + 1 << base_color_shift_)); + } + + // Ensure that all channels have an integer number of border pixels in the + // input. + for (size_t c = 0; c < shifts.size(); c++) { + if (channel_shifts_[0][c].first >= base_color_shift_) { + group_border_.first = + RoundUpTo(group_border_.first, + 1 << (channel_shifts_[0][c].first - base_color_shift_)); + } + if (channel_shifts_[0][c].second >= base_color_shift_) { + group_border_.second = + RoundUpTo(group_border_.second, + 1 << (channel_shifts_[0][c].second - base_color_shift_)); + } + } + // Ensure that the X border on color channels is a multiple of kBlockDim or + // the vector size (required for EPF stages). Vectors on ARM NEON are never + // wider than 4 floats, so rounding to multiples of 4 is enough. +#if JXL_ARCH_ARM + constexpr size_t kGroupXAlign = 4; +#else + constexpr size_t kGroupXAlign = 16; +#endif + group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign); + // Allocate borders in group images that are just enough for storing the + // borders to be copied in, plus any rounding to ensure alignment. + std::pair max_border = {0, 0}; + for (size_t c = 0; c < shifts.size(); c++) { + max_border.first = std::max(BorderToStore(c).first, max_border.first); + max_border.second = std::max(BorderToStore(c).second, max_border.second); + } + group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign); + group_data_y_border_ = max_border.second; + + EnsureBordersStorage(); + group_border_assigner_.Init(frame_dimensions_); + + for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0; + first_trailing_stage_--) { + bool has_inout_c = false; + for (size_t c = 0; c < shifts.size(); c++) { + if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) == + RenderPipelineChannelMode::kInOut) { + has_inout_c = true; + } + } + if (has_inout_c) { + break; + } + } + + first_image_dim_stage_ = stages_.size(); + for (size_t i = 0; i < stages_.size(); i++) { + std::vector> input_sizes(shifts.size()); + for (size_t c = 0; c < shifts.size(); c++) { + input_sizes[c] = + std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled, + 1 << channel_shifts_[i][c].first), + DivCeil(frame_dimensions_.ysize_upsampled, + 1 << channel_shifts_[i][c].second)); + } + stages_[i]->SetInputSizes(input_sizes); + if (stages_[i]->SwitchToImageDimensions()) { + // We don't allow kInOut after switching to image dimensions. + JXL_ASSERT(i >= first_trailing_stage_); + first_image_dim_stage_ = i + 1; + stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_, + &frame_origin_); + break; + } + } + for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) { + if (stages_[i]->SwitchToImageDimensions()) { + JXL_ABORT("Cannot switch to image dimensions multiple times"); + } + std::vector> input_sizes(shifts.size()); + for (size_t c = 0; c < shifts.size(); c++) { + input_sizes[c] = {full_image_xsize_, full_image_ysize_}; + } + stages_[i]->SetInputSizes(input_sizes); + } + + anyc_.resize(stages_.size()); + for (size_t i = 0; i < stages_.size(); i++) { + for (size_t c = 0; c < shifts.size(); c++) { + if (stages_[i]->GetChannelMode(c) != + RenderPipelineChannelMode::kIgnored) { + anyc_[i] = c; + } + } + } + + stage_input_for_channel_ = std::vector>( + stages_.size(), std::vector(shifts.size())); + for (size_t c = 0; c < shifts.size(); c++) { + int input = -1; + for (size_t i = 0; i < stages_.size(); i++) { + stage_input_for_channel_[i][c] = input; + if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + input = i; + } + } + } + + image_rect_.resize(stages_.size()); + for (size_t i = 0; i < stages_.size(); i++) { + size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled, + 1 << channel_shifts_[i][anyc_[i]].first); + size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled, + 1 << channel_shifts_[i][anyc_[i]].second); + image_rect_[i] = Rect(0, 0, x1, y1); + } + + virtual_ypadding_for_output_.resize(stages_.size()); + xpadding_for_output_.resize(stages_.size()); + for (size_t c = 0; c < shifts.size(); c++) { + int ypad = 0; + int xpad = 0; + for (size_t i = stages_.size(); i-- > 0;) { + if (stages_[i]->GetChannelMode(c) != + RenderPipelineChannelMode::kIgnored) { + virtual_ypadding_for_output_[i] = + std::max(ypad, virtual_ypadding_for_output_[i]); + xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]); + } + if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) + + stages_[i]->settings_.border_y) + << channel_shifts_[i][c].second; + xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) + + stages_[i]->settings_.border_x; + } + } + } +} + +void LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num, + bool use_group_ids) { + const auto& shifts = channel_shifts_[0]; + + use_group_ids_ = use_group_ids; + size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num; + for (size_t t = group_data_.size(); t < num_buffers; t++) { + group_data_.emplace_back(); + group_data_[t].resize(shifts.size()); + for (size_t c = 0; c < shifts.size(); c++) { + group_data_[t][c] = ImageF(GroupInputXSize(c) + group_data_x_border_ * 2, + GroupInputYSize(c) + group_data_y_border_ * 2); + } + } + // TODO(veluca): avoid reallocating buffers if not needed. + stage_data_.resize(num); + size_t upsampling = 1u << base_color_shift_; + size_t group_dim = frame_dimensions_.group_dim * upsampling; + size_t padding = + 2 * group_data_x_border_ * upsampling + // maximum size of a rect + 2 * kRenderPipelineXOffset; // extra padding for processing + size_t stage_buffer_xsize = group_dim + padding; + for (size_t t = 0; t < num; t++) { + stage_data_[t].resize(shifts.size()); + for (size_t c = 0; c < shifts.size(); c++) { + stage_data_[t][c].resize(stages_.size()); + size_t next_y_border = 0; + for (size_t i = stages_.size(); i-- > 0;) { + if (stages_[i]->GetChannelMode(c) == + RenderPipelineChannelMode::kInOut) { + size_t stage_buffer_ysize = + 2 * next_y_border + (1 << stages_[i]->settings_.shift_y); + stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize); + next_y_border = stages_[i]->settings_.border_y; + stage_data_[t][c][i] = ImageF(stage_buffer_xsize, stage_buffer_ysize); + } + } + } + } + if (first_image_dim_stage_ != stages_.size()) { + RectT image_rect(0, 0, frame_dimensions_.xsize_upsampled, + frame_dimensions_.ysize_upsampled); + RectT full_image_rect(0, 0, full_image_xsize_, full_image_ysize_); + image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0); + image_rect = image_rect.Intersection(full_image_rect); + if (image_rect.xsize() == 0 || image_rect.ysize() == 0) { + image_rect = RectT(0, 0, 0, 0); + } + size_t left_padding = image_rect.x0(); + size_t middle_padding = group_dim; + size_t right_padding = full_image_xsize_ - image_rect.x1(); + size_t out_of_frame_xsize = + padding + + std::max(left_padding, std::max(middle_padding, right_padding)); + out_of_frame_data_.resize(num); + for (size_t t = 0; t < num; t++) { + out_of_frame_data_[t] = ImageF(out_of_frame_xsize, shifts.size()); + } + } +} + +std::vector> LowMemoryRenderPipeline::PrepareBuffers( + size_t group_id, size_t thread_id) { + std::vector> ret(channel_shifts_[0].size()); + const size_t gx = group_id % frame_dimensions_.xsize_groups; + const size_t gy = group_id / frame_dimensions_.xsize_groups; + for (size_t c = 0; c < channel_shifts_[0].size(); c++) { + ret[c].first = &group_data_[use_group_ids_ ? group_id : thread_id][c]; + ret[c].second = Rect(group_data_x_border_, group_data_y_border_, + GroupInputXSize(c), GroupInputYSize(c), + DivCeil(frame_dimensions_.xsize_upsampled, + 1 << channel_shifts_[0][c].first) - + gx * GroupInputXSize(c) + group_data_x_border_, + DivCeil(frame_dimensions_.ysize_upsampled, + 1 << channel_shifts_[0][c].second) - + gy * GroupInputYSize(c) + group_data_y_border_); + } + return ret; +} + +namespace { + +JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) { + if (group_y0 == 0 && (y < 0 || y + group_y0 >= image_ysize)) { + return Mirror(y, image_ysize); + } + if (y + group_y0 >= image_ysize) { + // Here we know that the one mirroring step is sufficient. + return 2 * image_ysize - (y + group_y0) - 1 - group_y0; + } + return y; +} + +JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0, + ssize_t group_xsize, ssize_t image_xsize) { + if (image_xsize <= borderx) { + if (group_x0 == 0) { + for (ssize_t ix = 0; ix < borderx; ix++) { + row[kRenderPipelineXOffset - ix - 1] = + row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)]; + } + } + if (group_xsize + borderx + group_x0 >= image_xsize) { + for (ssize_t ix = 0; ix < borderx; ix++) { + row[kRenderPipelineXOffset + image_xsize + ix - group_x0] = + row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) - + group_x0]; + } + } + } else { + // Here we know that the one mirroring step is sufficient. + if (group_x0 == 0) { + for (ssize_t ix = 0; ix < borderx; ix++) { + row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix]; + } + } + if (group_xsize + borderx + group_x0 >= image_xsize) { + for (ssize_t ix = 0; ix < borderx; ix++) { + row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] = + row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1]; + } + } + } +} + +// Information about where the *output* of each stage is stored. +class Rows { + public: + Rows(const std::vector>& stages, + const Rect data_max_color_channel_rect, int group_data_x_border, + int group_data_y_border, + const std::vector>& group_data_shift, + size_t base_color_shift, std::vector>& thread_data, + std::vector& input_data) { + size_t num_stages = stages.size(); + size_t num_channels = input_data.size(); + + JXL_ASSERT(thread_data.size() == num_channels); + JXL_ASSERT(group_data_shift.size() == num_channels); + +#if JXL_ENABLE_ASSERT + for (const auto& td : thread_data) { + JXL_ASSERT(td.size() == num_stages); + } +#endif + + rows_.resize(num_stages + 1, std::vector(num_channels)); + + for (size_t i = 0; i < num_stages; i++) { + for (size_t c = 0; c < input_data.size(); c++) { + if (stages[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + rows_[i + 1][c].ymod_minus_1 = thread_data[c][i].ysize() - 1; + rows_[i + 1][c].base_ptr = thread_data[c][i].Row(0); + rows_[i + 1][c].stride = thread_data[c][i].PixelsPerRow(); + } + } + } + + for (size_t c = 0; c < input_data.size(); c++) { + auto channel_group_data_rect = + data_max_color_channel_rect.As() + .Translate(-group_data_x_border, -group_data_y_border) + .ShiftLeft(base_color_shift) + .CeilShiftRight(group_data_shift[c]) + .Translate(group_data_x_border - ssize_t(kRenderPipelineXOffset), + group_data_y_border); + rows_[0][c].base_ptr = channel_group_data_rect.Row(&input_data[c], 0); + rows_[0][c].stride = input_data[c].PixelsPerRow(); + rows_[0][c].ymod_minus_1 = -1; + } + } + + // Stage -1 refers to the input data; all other values must be nonnegative and + // refer to the data for the output of that stage. + JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const { + JXL_DASSERT(stage >= -1); + const RowInfo& info = rows_[stage + 1][c]; + return info.base_ptr + ssize_t(info.stride) * (y & info.ymod_minus_1); + } + + private: + struct RowInfo { + // Pointer to beginning of the first row. + float* base_ptr; + // Modulo value for the y axis minus 1 (ymod is guaranteed to be a power of + // 2, which allows efficient mod computation by masking). + int ymod_minus_1; + // Number of floats per row. + size_t stride; + }; + std::vector> rows_; +}; + +} // namespace + +void LowMemoryRenderPipeline::RenderRect(size_t thread_id, + std::vector& input_data, + Rect data_max_color_channel_rect, + Rect image_max_color_channel_rect) { + // For each stage, the rect corresponding to the image area currently being + // processed, in the coordinates of that stage (i.e. with the scaling factor + // that that stage has). + std::vector group_rect; + group_rect.resize(stages_.size()); + Rect image_area_rect = + image_max_color_channel_rect.ShiftLeft(base_color_shift_) + .Crop(frame_dimensions_.xsize_upsampled, + frame_dimensions_.ysize_upsampled); + for (size_t i = 0; i < stages_.size(); i++) { + group_rect[i] = + image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]); + } + + ssize_t frame_x0 = + first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0; + ssize_t frame_y0 = + first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0; + size_t full_image_xsize = first_image_dim_stage_ == stages_.size() + ? frame_dimensions_.xsize_upsampled + : full_image_xsize_; + size_t full_image_ysize = first_image_dim_stage_ == stages_.size() + ? frame_dimensions_.ysize_upsampled + : full_image_ysize_; + + // Compute actual x-axis bounds for the current image area in the context of + // the full image this frame is part of. As the left boundary may be negative, + // we also create the x_pixels_skip value, defined as follows: + // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0; + // - full_image_x0 - x_pixels_skip is the position of the current frame area + // in the full image. + ssize_t full_image_x0 = frame_x0 + image_area_rect.x0(); + ssize_t x_pixels_skip = 0; + if (full_image_x0 < 0) { + x_pixels_skip = -full_image_x0; + full_image_x0 = 0; + } + ssize_t full_image_x1 = frame_x0 + image_area_rect.x1(); + full_image_x1 = std::min(full_image_x1, full_image_xsize); + + // If the current image area is entirely outside of the visible image, there + // is no point in proceeding. Note: this uses the assumption that if there is + // a stage with observable effects (i.e. a kInput stage), it only appears + // after the stage that switches to image dimensions. + if (full_image_x1 <= full_image_x0) return; + + // Data structures to hold information about input/output rows and their + // buffers. + Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_, + group_data_y_border_, channel_shifts_[0], base_color_shift_, + stage_data_[thread_id], input_data); + + std::vector input_rows(first_trailing_stage_ + + 1); + for (size_t i = 0; i < first_trailing_stage_; i++) { + input_rows[i].resize(input_data.size()); + } + input_rows[first_trailing_stage_].resize(input_data.size(), + std::vector(1)); + + // Maximum possible shift is 3. + RenderPipelineStage::RowInfo output_rows(input_data.size(), + std::vector(8)); + + // Fills in input_rows and output_rows for a given y value (relative to the + // start of the group, measured in actual pixels at the appropriate vertical + // scaling factor) and a given stage, applying mirroring if necessary. This + // function is somewhat inefficient for trailing kInOut or kInput stages, + // where just filling the input row once ought to be sufficient. + auto prepare_io_rows = [&](int y, size_t i) { + ssize_t bordery = stages_[i]->settings_.border_y; + size_t shifty = stages_[i]->settings_.shift_y; + auto make_row = [&](size_t c, ssize_t iy) { + size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(), + image_rect_[i].ysize()); + input_rows[i][c][iy] = + rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c); + ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x, + group_rect[i].x0(), group_rect[i].xsize(), + image_rect_[i].xsize()); + }; + for (size_t c = 0; c < input_data.size(); c++) { + RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c); + if (mode == RenderPipelineChannelMode::kIgnored) { + continue; + } + // If we already have rows from a previous iteration, we can just shift + // the rows by 1 and insert the new one. + if (input_rows[i][c].size() == 2 * size_t(bordery) + 1) { + for (ssize_t iy = 0; iy < 2 * bordery; iy++) { + input_rows[i][c][iy] = input_rows[i][c][iy + 1]; + } + make_row(c, bordery * 2); + } else { + input_rows[i][c].resize(2 * bordery + 1); + for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) { + make_row(c, iy); + } + } + + // If necessary, get the output buffers. + if (mode == RenderPipelineChannelMode::kInOut) { + for (size_t iy = 0; iy < (1u << shifty); iy++) { + output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c); + } + } + } + }; + + // We pretend that every stage has a vertical shift of 0, i.e. it is as tall + // as the final image. + // We call each such row a "virtual" row, because it may or may not correspond + // to an actual row of the current processing stage; actual processing happens + // when vy % (1<> channel_shifts_[i][anyc_[i]].second; + + ssize_t image_y = ssize_t(group_rect[i].y0()) + y; + // Do not produce rows in out-of-bounds areas. + if (image_y < 0 || image_y >= ssize_t(image_rect_[i].ysize())) { + continue; + } + + // Get the input/output rows and potentially apply mirroring to the input. + prepare_io_rows(y, i); + + // Produce output rows. + stages_[i]->ProcessRow(input_rows[i], output_rows, + xpadding_for_output_[i], group_rect[i].xsize(), + group_rect[i].x0(), image_y, thread_id); + } + + // Process trailing stages, i.e. the final set of non-kInOut stages; they + // all have the same input buffer and no need to use any mirroring. + + int y = vy - num_extra_rows; + + for (size_t c = 0; c < input_data.size(); c++) { + // Skip pixels that are not part of the actual final image area. + input_rows[first_trailing_stage_][c][0] = + rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y, + c) + + x_pixels_skip; + } + + // Check that we are not outside of the bounds for the current rendering + // rect. Not doing so might result in overwriting some rows that have been + // written (or will be written) by other threads. + if (y < 0 || y >= ssize_t(image_area_rect.ysize())) { + continue; + } + + // Avoid running pipeline stages on pixels that are outside the full image + // area. As trailing stages have no borders, this is a free optimization + // (and may be necessary for correctness, as some stages assume coordinates + // are within bounds). + ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y; + if (full_image_y < 0 || full_image_y >= ssize_t(full_image_ysize)) { + continue; + } + + for (size_t i = first_trailing_stage_; i < stages_.size(); i++) { + // Before the first_image_dim_stage_, coordinates are relative to the + // current frame. + size_t x0 = + i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0; + size_t y = + i < first_image_dim_stage_ ? full_image_y - frame_y0 : full_image_y; + stages_[i]->ProcessRow(input_rows[first_trailing_stage_], output_rows, + /*xextra=*/0, full_image_x1 - full_image_x0, x0, y, + thread_id); + } + } +} + +void LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) { + if (rect.xsize() == 0) return; + size_t numc = channel_shifts_[0].size(); + RenderPipelineStage::RowInfo input_rows(numc, std::vector(1)); + RenderPipelineStage::RowInfo output_rows; + + for (size_t c = 0; c < numc; c++) { + input_rows[c][0] = out_of_frame_data_[thread_id].Row(c); + } + + for (size_t y = 0; y < rect.ysize(); y++) { + stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow( + input_rows, rect.xsize(), rect.x0(), rect.y0() + y); + for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) { + stages_[i]->ProcessRow(input_rows, output_rows, + /*xextra=*/0, rect.xsize(), rect.x0(), + rect.y0() + y, thread_id); + } + } +} + +void LowMemoryRenderPipeline::ProcessBuffers(size_t group_id, + size_t thread_id) { + std::vector& input_data = + group_data_[use_group_ids_ ? group_id : thread_id]; + + // Copy the group borders to the border storage. + for (size_t c = 0; c < input_data.size(); c++) { + SaveBorders(group_id, c, input_data[c]); + } + + size_t gy = group_id / frame_dimensions_.xsize_groups; + size_t gx = group_id % frame_dimensions_.xsize_groups; + + if (first_image_dim_stage_ != stages_.size()) { + size_t group_dim = frame_dimensions_.group_dim << base_color_shift_; + RectT group_rect(gx * group_dim, gy * group_dim, group_dim, + group_dim); + RectT image_rect(0, 0, frame_dimensions_.xsize_upsampled, + frame_dimensions_.ysize_upsampled); + RectT full_image_rect(0, 0, full_image_xsize_, full_image_ysize_); + group_rect = group_rect.Translate(frame_origin_.x0, frame_origin_.y0); + image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0); + image_rect = image_rect.Intersection(full_image_rect); + group_rect = group_rect.Intersection(image_rect); + size_t x0 = group_rect.x0(); + size_t y0 = group_rect.y0(); + size_t x1 = group_rect.x1(); + size_t y1 = group_rect.y1(); + JXL_DEBUG_V(6, + "Rendering padding for full image rect %s " + "outside group rect %s", + Description(full_image_rect).c_str(), + Description(group_rect).c_str()); + + if (group_id == 0 && (image_rect.xsize() == 0 || image_rect.ysize() == 0)) { + // If this frame does not intersect with the full image, we have to + // initialize the whole image area with RenderPadding. + RenderPadding(thread_id, + Rect(0, 0, full_image_xsize_, full_image_ysize_)); + } + + // Render padding for groups that intersect with the full image. The case + // where no groups intersect was handled above. + if (group_rect.xsize() > 0 && group_rect.ysize() > 0) { + if (gx == 0 && gy == 0) { + RenderPadding(thread_id, Rect(0, 0, x0, y0)); + } + if (gy == 0) { + RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0)); + } + if (gx == 0) { + RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0)); + } + if (gx == 0 && gy + 1 == frame_dimensions_.ysize_groups) { + RenderPadding(thread_id, Rect(0, y1, x0, full_image_ysize_ - y1)); + } + if (gy + 1 == frame_dimensions_.ysize_groups) { + RenderPadding(thread_id, Rect(x0, y1, x1 - x0, full_image_ysize_ - y1)); + } + if (gy == 0 && gx + 1 == frame_dimensions_.xsize_groups) { + RenderPadding(thread_id, Rect(x1, 0, full_image_xsize_ - x1, y0)); + } + if (gx + 1 == frame_dimensions_.xsize_groups) { + RenderPadding(thread_id, Rect(x1, y0, full_image_xsize_ - x1, y1 - y0)); + } + if (gy + 1 == frame_dimensions_.ysize_groups && + gx + 1 == frame_dimensions_.xsize_groups) { + RenderPadding(thread_id, Rect(x1, y1, full_image_xsize_ - x1, + full_image_ysize_ - y1)); + } + } + } + + Rect ready_rects[GroupBorderAssigner::kMaxToFinalize]; + size_t num_ready_rects = 0; + group_border_assigner_.GroupDone(group_id, group_border_.first, + group_border_.second, ready_rects, + &num_ready_rects); + for (size_t i = 0; i < num_ready_rects; i++) { + const Rect& image_max_color_channel_rect = ready_rects[i]; + for (size_t c = 0; c < input_data.size(); c++) { + LoadBorders(group_id, c, image_max_color_channel_rect, &input_data[c]); + } + Rect data_max_color_channel_rect( + group_data_x_border_ + image_max_color_channel_rect.x0() - + gx * frame_dimensions_.group_dim, + group_data_y_border_ + image_max_color_channel_rect.y0() - + gy * frame_dimensions_.group_dim, + image_max_color_channel_rect.xsize(), + image_max_color_channel_rect.ysize()); + RenderRect(thread_id, input_data, data_max_color_channel_rect, + image_max_color_channel_rect); + } +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h new file mode 100644 index 0000000000..b386f7c078 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h @@ -0,0 +1,111 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_ +#define LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_ + +#include + +#include "lib/jxl/dec_group_border.h" +#include "lib/jxl/render_pipeline/render_pipeline.h" + +namespace jxl { + +// A multithreaded, low-memory rendering pipeline that only allocates a minimal +// amount of buffers. +class LowMemoryRenderPipeline final : public RenderPipeline { + private: + std::vector> PrepareBuffers( + size_t group_id, size_t thread_id) override; + + void PrepareForThreadsInternal(size_t num, bool use_group_ids) override; + + void ProcessBuffers(size_t group_id, size_t thread_id) override; + + void ClearDone(size_t i) override { group_border_assigner_.ClearDone(i); } + + void Init() override; + + void EnsureBordersStorage(); + size_t GroupInputXSize(size_t c) const; + size_t GroupInputYSize(size_t c) const; + void RenderRect(size_t thread_id, std::vector& input_data, + Rect data_max_color_channel_rect, + Rect image_max_color_channel_rect); + void RenderPadding(size_t thread_id, Rect rect); + + void SaveBorders(size_t group_id, size_t c, const ImageF& in); + void LoadBorders(size_t group_id, size_t c, const Rect& r, ImageF* out); + + std::pair ColorDimensionsToChannelDimensions( + std::pair in, size_t c, size_t stage) const; + + std::pair BorderToStore(size_t c) const; + + bool use_group_ids_; + + // Storage for borders between groups. Borders of adjacent groups are stacked + // together, e.g. bottom border of current group is followed by top border + // of next group. + std::vector borders_horizontal_; + std::vector borders_vertical_; + + // Manages the status of borders. + GroupBorderAssigner group_border_assigner_; + + // Size (in color-channel-pixels) of the border around each group that might + // be assigned to that group. + std::pair group_border_; + // base_color_shift_ defines the size of groups in terms of final image + // pixels. + size_t base_color_shift_; + + // Buffer for decoded pixel data for a group, indexed by [thread][channel] or + // [group][channel] depending on `use_group_ids_`. + std::vector> group_data_; + + // Borders for storing group data. + size_t group_data_x_border_; + size_t group_data_y_border_; + + // Buffers for intermediate rows for the various stages, indexed by + // [thread][channel][stage]. + std::vector>> stage_data_; + + // Buffers for out-of-frame data, indexed by [thread]; every row is a + // different channel. + std::vector out_of_frame_data_; + + // For each stage, a non-kIgnored channel. + std::vector anyc_; + + // Size of the image at each stage. + std::vector image_rect_; + + // For each stage, for each channel, keep track of the kInOut stage that + // produced the input to that stage (which corresponds to the buffer index + // containing the data). -1 if data comes from the original input. + std::vector> stage_input_for_channel_; + + // Number of (virtual) extra rows that must be processed at each stage + // to produce sufficient output for future stages. + std::vector virtual_ypadding_for_output_; + + // Same thing for columns, except these are real columns and not virtual ones. + std::vector xpadding_for_output_; + + // First stage that doesn't have any kInOut channel. + size_t first_trailing_stage_; + + // Origin and size of the frame after switching to image dimensions. + FrameOrigin frame_origin_; + size_t full_image_xsize_; + size_t full_image_ysize_; + size_t first_image_dim_stage_; +}; + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc new file mode 100644 index 0000000000..68b6ef613f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc @@ -0,0 +1,132 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/render_pipeline.h" + +#include + +#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h" +#include "lib/jxl/render_pipeline/simple_render_pipeline.h" +#include "lib/jxl/sanitizers.h" + +namespace jxl { + +void RenderPipeline::Builder::AddStage( + std::unique_ptr stage) { + stages_.push_back(std::move(stage)); +} + +std::unique_ptr RenderPipeline::Builder::Finalize( + FrameDimensions frame_dimensions) && { +#if JXL_ENABLE_ASSERT + // Check that the last stage is not an kInOut stage for any channel, and that + // there is at least one stage. + JXL_ASSERT(!stages_.empty()); + for (size_t c = 0; c < num_c_; c++) { + JXL_ASSERT(stages_.back()->GetChannelMode(c) != + RenderPipelineChannelMode::kInOut); + } +#endif + + std::unique_ptr res; + if (use_simple_implementation_) { + res = jxl::make_unique(); + } else { + res = jxl::make_unique(); + } + + res->padding_.resize(stages_.size()); + for (size_t i = stages_.size(); i-- > 0;) { + const auto& stage = stages_[i]; + res->padding_[i].resize(num_c_); + if (i + 1 == stages_.size()) { + continue; + } + for (size_t c = 0; c < num_c_; c++) { + if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + res->padding_[i][c].first = DivCeil(res->padding_[i + 1][c].first, + 1 << stage->settings_.shift_x) + + stage->settings_.border_x; + res->padding_[i][c].second = DivCeil(res->padding_[i + 1][c].second, + 1 << stage->settings_.shift_y) + + stage->settings_.border_y; + } else { + res->padding_[i][c] = res->padding_[i + 1][c]; + } + } + } + + res->frame_dimensions_ = frame_dimensions; + res->group_completed_passes_.resize(frame_dimensions.num_groups); + res->channel_shifts_.resize(stages_.size()); + res->channel_shifts_[0].resize(num_c_); + for (size_t i = 1; i < stages_.size(); i++) { + auto& stage = stages_[i - 1]; + for (size_t c = 0; c < num_c_; c++) { + if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + res->channel_shifts_[0][c].first += stage->settings_.shift_x; + res->channel_shifts_[0][c].second += stage->settings_.shift_y; + } + } + } + for (size_t i = 1; i < stages_.size(); i++) { + auto& stage = stages_[i - 1]; + res->channel_shifts_[i].resize(num_c_); + for (size_t c = 0; c < num_c_; c++) { + if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) { + res->channel_shifts_[i][c].first = + res->channel_shifts_[i - 1][c].first - stage->settings_.shift_x; + res->channel_shifts_[i][c].second = + res->channel_shifts_[i - 1][c].second - stage->settings_.shift_y; + } else { + res->channel_shifts_[i][c].first = res->channel_shifts_[i - 1][c].first; + res->channel_shifts_[i][c].second = + res->channel_shifts_[i - 1][c].second; + } + } + } + res->stages_ = std::move(stages_); + res->Init(); + return res; +} + +RenderPipelineInput RenderPipeline::GetInputBuffers(size_t group_id, + size_t thread_id) { + RenderPipelineInput ret; + JXL_DASSERT(group_id < group_completed_passes_.size()); + ret.group_id_ = group_id; + ret.thread_id_ = thread_id; + ret.pipeline_ = this; + ret.buffers_ = PrepareBuffers(group_id, thread_id); + return ret; +} + +void RenderPipeline::InputReady( + size_t group_id, size_t thread_id, + const std::vector>& buffers) { + JXL_DASSERT(group_id < group_completed_passes_.size()); + group_completed_passes_[group_id]++; + for (size_t i = 0; i < buffers.size(); ++i) { + (void)i; + JXL_CHECK_PLANE_INITIALIZED(*buffers[i].first, buffers[i].second, i); + } + + ProcessBuffers(group_id, thread_id); +} + +Status RenderPipeline::PrepareForThreads(size_t num, bool use_group_ids) { + for (const auto& stage : stages_) { + JXL_RETURN_IF_ERROR(stage->PrepareForThreads(num)); + } + PrepareForThreadsInternal(num, use_group_ids); + return true; +} + +void RenderPipelineInput::Done() { + JXL_ASSERT(pipeline_); + pipeline_->InputReady(group_id_, thread_id_, buffers_); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h new file mode 100644 index 0000000000..bf3ad4975e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h @@ -0,0 +1,139 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_ +#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_ + +#include + +#include "lib/jxl/image.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Interface to provide input to the rendering pipeline. When this object is +// destroyed, all the data in the provided ImageF's Rects must have been +// initialized. +class RenderPipelineInput { + public: + RenderPipelineInput(const RenderPipelineInput&) = delete; + RenderPipelineInput(RenderPipelineInput&& other) noexcept { + *this = std::move(other); + } + RenderPipelineInput& operator=(RenderPipelineInput&& other) noexcept { + pipeline_ = other.pipeline_; + group_id_ = other.group_id_; + thread_id_ = other.thread_id_; + buffers_ = std::move(other.buffers_); + other.pipeline_ = nullptr; + return *this; + } + + RenderPipelineInput() = default; + void Done(); + + const std::pair& GetBuffer(size_t c) const { + JXL_ASSERT(c < buffers_.size()); + return buffers_[c]; + } + + private: + RenderPipeline* pipeline_ = nullptr; + size_t group_id_; + size_t thread_id_; + std::vector> buffers_; + friend class RenderPipeline; +}; + +class RenderPipeline { + public: + class Builder { + public: + explicit Builder(size_t num_c) : num_c_(num_c) { JXL_ASSERT(num_c > 0); } + + // Adds a stage to the pipeline. Must be called at least once; the last + // added stage cannot have kInOut channels. + void AddStage(std::unique_ptr stage); + + // Enables using the simple (i.e. non-memory-efficient) implementation of + // the pipeline. + void UseSimpleImplementation() { use_simple_implementation_ = true; } + + // Finalizes setup of the pipeline. Shifts for all channels should be 0 at + // this point. + std::unique_ptr Finalize( + FrameDimensions frame_dimensions) &&; + + private: + std::vector> stages_; + size_t num_c_; + bool use_simple_implementation_ = false; + }; + + friend class Builder; + + virtual ~RenderPipeline() = default; + + Status IsInitialized() const { + for (const auto& stage : stages_) { + JXL_RETURN_IF_ERROR(stage->IsInitialized()); + } + return true; + } + + // Allocates storage to run with `num` threads. If `use_group_ids` is true, + // storage is allocated for each group, not each thread. The behaviour is + // undefined if calling this function multiple times with a different value + // for `use_group_ids`. + Status PrepareForThreads(size_t num, bool use_group_ids); + + // Retrieves a buffer where input data should be stored by the callee. When + // input has been provided for all buffers, the pipeline will complete its + // processing. This method may be called multiple times concurrently from + // different threads, provided that a different `thread_id` is given. + RenderPipelineInput GetInputBuffers(size_t group_id, size_t thread_id); + + size_t PassesWithAllInput() const { + return *std::min_element(group_completed_passes_.begin(), + group_completed_passes_.end()); + } + + virtual void ClearDone(size_t i) {} + + protected: + std::vector> stages_; + // Shifts for every channel at the input of each stage. + std::vector>> channel_shifts_; + + // Amount of (cumulative) padding required by each stage and channel, in + // either direction. + std::vector>> padding_; + + FrameDimensions frame_dimensions_; + + std::vector group_completed_passes_; + + friend class RenderPipelineInput; + + private: + void InputReady(size_t group_id, size_t thread_id, + const std::vector>& buffers); + + virtual std::vector> PrepareBuffers( + size_t group_id, size_t thread_id) = 0; + + virtual void ProcessBuffers(size_t group_id, size_t thread_id) = 0; + + // Note that this method may be called multiple times with different (or + // equal) `num`. + virtual void PrepareForThreadsInternal(size_t num, bool use_group_ids) = 0; + + // Called once frame dimensions and stages are known. + virtual void Init() {} +}; + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h new file mode 100644 index 0000000000..d1a0074161 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h @@ -0,0 +1,171 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_ +#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_ + +#include + +#include "lib/jxl/base/arch_macros.h" +#include "lib/jxl/frame_header.h" + +namespace jxl { + +// The first pixel in the input to RenderPipelineStage will be located at +// this position. Pixels before this position may be accessed as padding. +// This should be at least the RoundUpTo(maximum padding / 2, maximum vector +// size) times 2: this is realized when using Gaborish + EPF + upsampling + +// chroma subsampling. +#if JXL_ARCH_ARM +constexpr size_t kRenderPipelineXOffset = 16; +#else +constexpr size_t kRenderPipelineXOffset = 32; +#endif + +enum class RenderPipelineChannelMode { + // This channel is not modified by this stage. + kIgnored = 0, + // This channel is modified in-place. + kInPlace = 1, + // This channel is modified and written to a new buffer. + kInOut = 2, + // This channel is only read. These are the only stages that are assumed to + // have observable effects, i.e. calls to ProcessRow for other stages may be + // omitted if it can be shown they can't affect any kInput stage ProcessRow + // call that happens inside image boundaries. + kInput = 3, +}; + +class RenderPipeline; + +class RenderPipelineStage { + protected: + using Row = float*; + using ChannelRows = std::vector; + + public: + using RowInfo = std::vector; + struct Settings { + // Amount of padding required in the various directions by all channels + // that have kInOut mode. + size_t border_x = 0; + size_t border_y = 0; + + // Log2 of the number of columns/rows of output that this stage will produce + // for every input row for kInOut channels. + size_t shift_x = 0; + size_t shift_y = 0; + + static Settings ShiftX(size_t shift, size_t border) { + Settings settings; + settings.border_x = border; + settings.shift_x = shift; + return settings; + } + + static Settings ShiftY(size_t shift, size_t border) { + Settings settings; + settings.border_y = border; + settings.shift_y = shift; + return settings; + } + + static Settings Symmetric(size_t shift, size_t border) { + Settings settings; + settings.border_x = settings.border_y = border; + settings.shift_x = settings.shift_y = shift; + return settings; + } + + static Settings SymmetricBorderOnly(size_t border) { + return Symmetric(0, border); + } + }; + + virtual ~RenderPipelineStage() = default; + + // Processes one row of input, producing the appropriate number of rows of + // output. Input/output rows can be obtained by calls to + // `GetInputRow`/`GetOutputRow`. `xsize+2*xextra` represents the total number + // of pixels to be processed in the input row, where the first pixel is at + // position `kRenderPipelineXOffset-xextra`. All pixels in the + // `[kRenderPipelineXOffset-xextra-border_x, + // kRenderPipelineXOffset+xsize+xextra+border_x)` range are initialized and + // accessible. `xpos` and `ypos` represent the position of the first + // (non-extra, i.e. in position kRenderPipelineXOffset) pixel in the center + // row of the input in the full image. `xpos` is a multiple of + // `GroupBorderAssigner::kPaddingXRound`. If `settings_.temp_buffer_size` is + // nonzero, `temp` will point to an HWY-aligned buffer of at least that number + // of floats; concurrent calls will have different buffers. + virtual void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const = 0; + + // How each channel will be processed. Channels are numbered starting from + // color channels (always 3) and followed by all other channels. + virtual RenderPipelineChannelMode GetChannelMode(size_t c) const = 0; + + protected: + explicit RenderPipelineStage(Settings settings) : settings_(settings) {} + + virtual Status IsInitialized() const { return true; } + + // Informs the stage about the total size of each channel. Few stages will + // actually need to use this information. + virtual void SetInputSizes( + const std::vector>& input_sizes) {} + + virtual Status PrepareForThreads(size_t num_threads) { return true; } + + // Returns a pointer to the input row of channel `c` with offset `y`. + // `y` must be in [-settings_.border_y, settings_.border_y]. `c` must be such + // that `GetChannelMode(c) != kIgnored`. The returned pointer points to the + // offset-ed row (i.e. kRenderPipelineXOffset has been applied). + float* GetInputRow(const RowInfo& input_rows, size_t c, int offset) const { + JXL_DASSERT(GetChannelMode(c) != RenderPipelineChannelMode::kIgnored); + JXL_DASSERT(-offset <= static_cast(settings_.border_y)); + JXL_DASSERT(offset <= static_cast(settings_.border_y)); + return input_rows[c][settings_.border_y + offset] + kRenderPipelineXOffset; + } + // Similar to `GetInputRow`, but can only be used if `GetChannelMode(c) == + // kInOut`. Offset must be less than `1< +#include + +#include +#include +#include + +#include "lib/extras/codec.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/fake_parallel_runner_testonly.h" +#include "lib/jxl/icc_codec.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/jpeg/enc_jpeg_data.h" +#include "lib/jxl/render_pipeline/test_render_pipeline_stages.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +Status DecodeFile(const Span file, bool use_slow_pipeline, + CodecInOut* io, ThreadPool* pool) { + Status ret = true; + { + BitReader reader(file); + BitReaderScopedCloser reader_closer(&reader, &ret); + JXL_RETURN_IF_ERROR(reader.ReadFixedBits<16>() == 0x0AFF); + JXL_RETURN_IF_ERROR(ReadSizeHeader(&reader, &io->metadata.size)); + JXL_RETURN_IF_ERROR(ReadImageMetadata(&reader, &io->metadata.m)); + io->metadata.transform_data.nonserialized_xyb_encoded = + io->metadata.m.xyb_encoded; + JXL_RETURN_IF_ERROR(Bundle::Read(&reader, &io->metadata.transform_data)); + if (io->metadata.m.color_encoding.WantICC()) { + PaddedBytes icc; + JXL_RETURN_IF_ERROR(ReadICC(&reader, &icc)); + JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetICC(std::move(icc))); + } + PassesDecoderState dec_state; + JXL_RETURN_IF_ERROR( + dec_state.output_encoding_info.SetFromMetadata(io->metadata)); + JXL_RETURN_IF_ERROR(reader.JumpToByteBoundary()); + io->frames.clear(); + do { + io->frames.emplace_back(&io->metadata.m); + // Skip frames that are not displayed. + do { + size_t frame_start = reader.TotalBitsConsumed() / kBitsPerByte; + size_t size_left = file.size() - frame_start; + JXL_RETURN_IF_ERROR( + DecodeFrame(&dec_state, pool, file.data() + frame_start, size_left, + &io->frames.back(), io->metadata, use_slow_pipeline)); + reader.SkipBits(io->frames.back().decoded_bytes() * kBitsPerByte); + } while (dec_state.shared->frame_header.frame_type != + FrameType::kRegularFrame && + dec_state.shared->frame_header.frame_type != + FrameType::kSkipProgressive); + } while (!dec_state.shared->frame_header.is_last); + + if (io->frames.empty()) return JXL_FAILURE("Not enough data."); + + if (reader.TotalBitsConsumed() != file.size() * kBitsPerByte) { + return JXL_FAILURE("Reader position not at EOF."); + } + if (!reader.AllReadsWithinBounds()) { + return JXL_FAILURE("Reader out of bounds read."); + } + io->CheckMetadata(); + // reader is closed here. + } + return ret; +} + +TEST(RenderPipelineTest, Build) { + RenderPipeline::Builder builder(/*num_c=*/1); + builder.AddStage(jxl::make_unique()); + builder.AddStage(jxl::make_unique()); + builder.AddStage(jxl::make_unique()); + builder.UseSimpleImplementation(); + FrameDimensions frame_dimensions; + frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0, + /*max_hshift=*/0, /*max_vshift=*/0, + /*modular_mode=*/false, /*upsampling=*/1); + std::move(builder).Finalize(frame_dimensions); +} + +TEST(RenderPipelineTest, CallAllGroups) { + RenderPipeline::Builder builder(/*num_c=*/1); + builder.AddStage(jxl::make_unique()); + builder.AddStage(jxl::make_unique()); + builder.AddStage(jxl::make_unique()); + builder.UseSimpleImplementation(); + FrameDimensions frame_dimensions; + frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0, + /*max_hshift=*/0, /*max_vshift=*/0, + /*modular_mode=*/false, /*upsampling=*/1); + auto pipeline = std::move(builder).Finalize(frame_dimensions); + ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false)); + + for (size_t i = 0; i < frame_dimensions.num_groups; i++) { + auto input_buffers = pipeline->GetInputBuffers(i, 0); + FillPlane(0.0f, input_buffers.GetBuffer(0).first, + input_buffers.GetBuffer(0).second); + input_buffers.Done(); + } + + EXPECT_EQ(pipeline->PassesWithAllInput(), 1); +} + +TEST(RenderPipelineTest, BuildFast) { + RenderPipeline::Builder builder(/*num_c=*/1); + builder.AddStage(jxl::make_unique()); + builder.AddStage(jxl::make_unique()); + builder.AddStage(jxl::make_unique()); + FrameDimensions frame_dimensions; + frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0, + /*max_hshift=*/0, /*max_vshift=*/0, + /*modular_mode=*/false, /*upsampling=*/1); + std::move(builder).Finalize(frame_dimensions); +} + +TEST(RenderPipelineTest, CallAllGroupsFast) { + RenderPipeline::Builder builder(/*num_c=*/1); + builder.AddStage(jxl::make_unique()); + builder.AddStage(jxl::make_unique()); + builder.AddStage(jxl::make_unique()); + builder.UseSimpleImplementation(); + FrameDimensions frame_dimensions; + frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0, + /*max_hshift=*/0, /*max_vshift=*/0, + /*modular_mode=*/false, /*upsampling=*/1); + auto pipeline = std::move(builder).Finalize(frame_dimensions); + ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false)); + + for (size_t i = 0; i < frame_dimensions.num_groups; i++) { + auto input_buffers = pipeline->GetInputBuffers(i, 0); + FillPlane(0.0f, input_buffers.GetBuffer(0).first, + input_buffers.GetBuffer(0).second); + input_buffers.Done(); + } + + EXPECT_EQ(pipeline->PassesWithAllInput(), 1); +} + +struct RenderPipelineTestInputSettings { + // Input image. + std::string input_path; + size_t xsize, ysize; + bool jpeg_transcode = false; + // Encoding settings. + CompressParams cparams; + // Short name for the encoder settings. + std::string cparams_descr; + + bool add_spot_color = false; + + Splines splines; +}; + +class RenderPipelineTestParam + : public ::testing::TestWithParam {}; + +TEST_P(RenderPipelineTestParam, PipelineTest) { + RenderPipelineTestInputSettings config = GetParam(); + + // Use a parallel runner that randomly shuffles tasks to detect possible + // border handling bugs. + FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8); + ThreadPool pool(&JxlFakeParallelRunner, &fake_pool); + const PaddedBytes orig = jxl::test::ReadTestData(config.input_path); + + CodecInOut io; + if (config.jpeg_transcode) { + ASSERT_TRUE(jpeg::DecodeImageJPG(Span(orig), &io)); + } else { + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + } + io.ShrinkTo(config.xsize, config.ysize); + + if (config.add_spot_color) { + jxl::ImageF spot(config.xsize, config.ysize); + jxl::ZeroFillImage(&spot); + + for (size_t y = 0; y < config.ysize; y++) { + float* JXL_RESTRICT row = spot.Row(y); + for (size_t x = 0; x < config.xsize; x++) { + row[x] = ((x ^ y) & 255) * (1.f / 255.f); + } + } + ExtraChannelInfo info; + info.bit_depth.bits_per_sample = 8; + info.dim_shift = 0; + info.type = jxl::ExtraChannel::kSpotColor; + info.spot_color[0] = 0.5f; + info.spot_color[1] = 0.2f; + info.spot_color[2] = 1.f; + info.spot_color[3] = 0.5f; + + io.metadata.m.extra_channel_info.push_back(info); + std::vector ec; + ec.push_back(std::move(spot)); + io.frames[0].SetExtraChannels(std::move(ec)); + } + + PaddedBytes compressed; + + PassesEncoderState enc_state; + enc_state.shared.image_features.splines = config.splines; + ASSERT_TRUE(EncodeFile(config.cparams, &io, &enc_state, &compressed, + GetJxlCms(), /*aux_out=*/nullptr, &pool)); + + + CodecInOut io_default; + ASSERT_TRUE(DecodeFile(Span(compressed), + /*use_slow_pipeline=*/false, &io_default, &pool)); + CodecInOut io_slow_pipeline; + ASSERT_TRUE(DecodeFile(Span(compressed), + /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool)); + + ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size()); + for (size_t i = 0; i < io_default.frames.size(); i++) { +#if JXL_HIGH_PRECISION + constexpr float kMaxError = 1e-5; +#else + constexpr float kMaxError = 1e-4; +#endif + Image3F def = std::move(*io_default.frames[i].color()); + Image3F pip = std::move(*io_slow_pipeline.frames[i].color()); + JXL_ASSERT_OK(VerifyRelativeError(pip, def, kMaxError, kMaxError, _)); + for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size(); + ec++) { + JXL_ASSERT_OK(VerifyRelativeError( + io_slow_pipeline.frames[i].extra_channels()[ec], + io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _)); + } + } +} + +Splines CreateTestSplines() { + const ColorCorrelationMap cmap; + std::vector control_points{{9, 54}, {118, 159}, {97, 3}, + {10, 40}, {150, 25}, {120, 300}}; + const Spline spline{ + control_points, + /*color_dct=*/ + {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}}, + /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}}; + std::vector spline_data = {spline}; + std::vector quantized_splines; + std::vector starting_points; + for (const Spline& spline : spline_data) { + quantized_splines.emplace_back(spline, /*quantization_adjustment=*/0, + cmap.YtoXRatio(0), cmap.YtoBRatio(0)); + starting_points.push_back(spline.control_points.front()); + } + return Splines(/*quantization_adjustment=*/0, std::move(quantized_splines), + std::move(starting_points)); +} + +std::vector GeneratePipelineTests() { + std::vector all_tests; + + std::pair sizes[] = { + {3, 8}, {128, 128}, {256, 256}, {258, 258}, {533, 401}, {777, 777}, + }; + + for (auto size : sizes) { + RenderPipelineTestInputSettings settings; + settings.input_path = "jxl/flower/flower.png"; + settings.xsize = size.first; + settings.ysize = size.second; + + // Base settings. + settings.cparams.butteraugli_distance = 1.0; + settings.cparams.patches = Override::kOff; + settings.cparams.dots = Override::kOff; + settings.cparams.gaborish = Override::kOff; + settings.cparams.epf = 0; + settings.cparams.color_transform = ColorTransform::kXYB; + + { + auto s = settings; + s.cparams_descr = "NoGabNoEpfNoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.color_transform = ColorTransform::kNone; + s.cparams_descr = "NoGabNoEpfNoPatchesNoXYB"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.gaborish = Override::kOn; + s.cparams_descr = "GabNoEpfNoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.epf = 1; + s.cparams_descr = "NoGabEpf1NoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.epf = 2; + s.cparams_descr = "NoGabEpf2NoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.epf = 3; + s.cparams_descr = "NoGabEpf3NoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.gaborish = Override::kOn; + s.cparams.epf = 3; + s.cparams_descr = "GabEpf3NoPatches"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "Splines"; + s.splines = CreateTestSplines(); + all_tests.push_back(s); + } + + for (size_t ups : {2, 4, 8}) { + { + auto s = settings; + s.cparams.resampling = ups; + s.cparams_descr = "Ups" + std::to_string(ups); + all_tests.push_back(s); + } + { + auto s = settings; + s.cparams.resampling = ups; + s.cparams.epf = 1; + s.cparams_descr = "Ups" + std::to_string(ups) + "EPF1"; + all_tests.push_back(s); + } + { + auto s = settings; + s.cparams.resampling = ups; + s.cparams.gaborish = Override::kOn; + s.cparams.epf = 1; + s.cparams_descr = "Ups" + std::to_string(ups) + "GabEPF1"; + all_tests.push_back(s); + } + } + + { + auto s = settings; + s.cparams_descr = "Noise"; + s.cparams.photon_noise_iso = 3200; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "NoiseUps"; + s.cparams.photon_noise_iso = 3200; + s.cparams.resampling = 2; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "ModularLossless"; + s.cparams.modular_mode = true; + s.cparams.butteraugli_distance = 0; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "ProgressiveDC"; + s.cparams.progressive_dc = 1; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "ModularLossy"; + s.cparams.modular_mode = true; + s.cparams.butteraugli_distance = 1.f; + all_tests.push_back(s); + } + + { + auto s = settings; + s.input_path = "jxl/flower/flower_alpha.png"; + s.cparams_descr = "AlphaVarDCT"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.input_path = "jxl/flower/flower_alpha.png"; + s.cparams_descr = "AlphaVarDCTUpsamplingEPF"; + s.cparams.epf = 1; + s.cparams.ec_resampling = 2; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams.modular_mode = true; + s.cparams.butteraugli_distance = 0; + s.input_path = "jxl/flower/flower_alpha.png"; + s.cparams_descr = "AlphaLossless"; + all_tests.push_back(s); + } + + { + auto s = settings; + s.input_path = "jxl/flower/flower_alpha.png"; + s.cparams_descr = "AlphaDownsample"; + s.cparams.ec_resampling = 2; + all_tests.push_back(s); + } + + { + auto s = settings; + s.cparams_descr = "SpotColor"; + s.add_spot_color = true; + all_tests.push_back(s); + } + } + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + for (const char* input : {"jxl/flower/flower.png.im_q85_444.jpg", + "jxl/flower/flower.png.im_q85_420.jpg", + "jxl/flower/flower.png.im_q85_422.jpg", + "jxl/flower/flower.png.im_q85_440.jpg"}) { + RenderPipelineTestInputSettings settings; + settings.input_path = input; + settings.jpeg_transcode = true; + settings.xsize = 2268; + settings.ysize = 1512; + settings.cparams_descr = "Default"; + all_tests.push_back(settings); + } + +#endif + + { + RenderPipelineTestInputSettings settings; + settings.input_path = "jxl/grayscale_patches.png"; + settings.xsize = 1011; + settings.ysize = 277; + settings.cparams_descr = "Patches"; + all_tests.push_back(settings); + } + + { + RenderPipelineTestInputSettings settings; + settings.input_path = "jxl/grayscale_patches.png"; + settings.xsize = 1011; + settings.ysize = 277; + settings.cparams.photon_noise_iso = 1000; + settings.cparams_descr = "PatchesAndNoise"; + all_tests.push_back(settings); + } + + { + RenderPipelineTestInputSettings settings; + settings.input_path = "jxl/grayscale_patches.png"; + settings.xsize = 1011; + settings.ysize = 277; + settings.cparams.resampling = 2; + settings.cparams_descr = "PatchesAndUps2"; + all_tests.push_back(settings); + } + + return all_tests; +} + +std::ostream& operator<<(std::ostream& os, + const RenderPipelineTestInputSettings& c) { + std::string filename; + size_t pos = c.input_path.find_last_of('/'); + if (pos == std::string::npos) { + filename = c.input_path; + } else { + filename = c.input_path.substr(pos + 1); + } + std::replace_if( + filename.begin(), filename.end(), [](char c) { return !isalnum(c); }, + '_'); + os << filename << "_" << (c.jpeg_transcode ? "JPEG_" : "") << c.xsize << "x" + << c.ysize << "_" << c.cparams_descr; + return os; +} + +std::string PipelineTestDescription( + const testing::TestParamInfo& info) { + std::stringstream name; + name << info.param; + return name.str(); +} + +JXL_GTEST_INSTANTIATE_TEST_SUITE_P(RenderPipelineTest, RenderPipelineTestParam, + testing::ValuesIn(GeneratePipelineTests()), + PipelineTestDescription); + +TEST(RenderPipelineDecodingTest, Animation) { + FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8); + ThreadPool pool(&JxlFakeParallelRunner, &fake_pool); + + PaddedBytes compressed = + jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl"); + + CodecInOut io_default; + ASSERT_TRUE(DecodeFile(Span(compressed), + /*use_slow_pipeline=*/false, &io_default, &pool)); + CodecInOut io_slow_pipeline; + ASSERT_TRUE(DecodeFile(Span(compressed), + /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool)); + + ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size()); + for (size_t i = 0; i < io_default.frames.size(); i++) { +#if JXL_HIGH_PRECISION + constexpr float kMaxError = 1e-5; +#else + constexpr float kMaxError = 1e-4; +#endif + + Image3F fast_pipeline = std::move(*io_default.frames[i].color()); + Image3F slow_pipeline = std::move(*io_slow_pipeline.frames[i].color()); + JXL_ASSERT_OK(VerifyRelativeError(slow_pipeline, fast_pipeline, kMaxError, + kMaxError, _)) + for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size(); + ec++) { + JXL_ASSERT_OK(VerifyRelativeError( + io_slow_pipeline.frames[i].extra_channels()[ec], + io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _)); + } + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc new file mode 100644 index 0000000000..4495288860 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc @@ -0,0 +1,266 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/simple_render_pipeline.h" + +#include + +#include "lib/jxl/image_ops.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" +#include "lib/jxl/sanitizers.h" + +namespace jxl { + +void SimpleRenderPipeline::PrepareForThreadsInternal(size_t num, + bool use_group_ids) { + if (!channel_data_.empty()) { + return; + } + auto ch_size = [](size_t frame_size, size_t shift) { + return DivCeil(frame_size, 1 << shift) + kRenderPipelineXOffset * 2; + }; + for (size_t c = 0; c < channel_shifts_[0].size(); c++) { + channel_data_.push_back(ImageF( + ch_size(frame_dimensions_.xsize_upsampled, channel_shifts_[0][c].first), + ch_size(frame_dimensions_.ysize_upsampled, + channel_shifts_[0][c].second))); + msan::PoisonImage(channel_data_.back()); + } +} + +Rect SimpleRenderPipeline::MakeChannelRect(size_t group_id, size_t channel) { + size_t base_color_shift = + CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded / + frame_dimensions_.xsize_padded); + + const size_t gx = group_id % frame_dimensions_.xsize_groups; + const size_t gy = group_id / frame_dimensions_.xsize_groups; + size_t xgroupdim = (frame_dimensions_.group_dim << base_color_shift) >> + channel_shifts_[0][channel].first; + size_t ygroupdim = (frame_dimensions_.group_dim << base_color_shift) >> + channel_shifts_[0][channel].second; + return Rect( + kRenderPipelineXOffset + gx * xgroupdim, + kRenderPipelineXOffset + gy * ygroupdim, xgroupdim, ygroupdim, + kRenderPipelineXOffset + DivCeil(frame_dimensions_.xsize_upsampled, + 1 << channel_shifts_[0][channel].first), + kRenderPipelineXOffset + + DivCeil(frame_dimensions_.ysize_upsampled, + 1 << channel_shifts_[0][channel].second)); +} + +std::vector> SimpleRenderPipeline::PrepareBuffers( + size_t group_id, size_t thread_id) { + std::vector> ret; + for (size_t c = 0; c < channel_data_.size(); c++) { + ret.emplace_back(&channel_data_[c], MakeChannelRect(group_id, c)); + } + return ret; +} + +void SimpleRenderPipeline::ProcessBuffers(size_t group_id, size_t thread_id) { + for (size_t c = 0; c < channel_data_.size(); c++) { + Rect r = MakeChannelRect(group_id, c); + (void)r; + JXL_CHECK_PLANE_INITIALIZED(channel_data_[c], r, c); + } + + if (PassesWithAllInput() <= processed_passes_) return; + processed_passes_++; + + for (size_t stage_id = 0; stage_id < stages_.size(); stage_id++) { + const auto& stage = stages_[stage_id]; + // Prepare buffers for kInOut channels. + std::vector new_channels(channel_data_.size()); + std::vector output_channels(channel_data_.size()); + + std::vector> input_sizes(channel_data_.size()); + for (size_t c = 0; c < channel_data_.size(); c++) { + input_sizes[c] = + std::make_pair(channel_data_[c].xsize() - kRenderPipelineXOffset * 2, + channel_data_[c].ysize() - kRenderPipelineXOffset * 2); + } + + for (size_t c = 0; c < channel_data_.size(); c++) { + if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) { + continue; + } + // Ensure that the newly allocated channels are large enough to avoid + // problems with padding. + new_channels[c] = + ImageF(frame_dimensions_.xsize_upsampled_padded + + kRenderPipelineXOffset * 2 + hwy::kMaxVectorSize * 8, + frame_dimensions_.ysize_upsampled_padded + + kRenderPipelineXOffset * 2); + new_channels[c].ShrinkTo( + (input_sizes[c].first << stage->settings_.shift_x) + + kRenderPipelineXOffset * 2, + (input_sizes[c].second << stage->settings_.shift_y) + + kRenderPipelineXOffset * 2); + output_channels[c] = &new_channels[c]; + } + + auto get_row = [&](size_t c, int64_t y) { + return channel_data_[c].Row(kRenderPipelineXOffset + y) + + kRenderPipelineXOffset; + }; + + // Add mirrored pixes to all kInOut channels. + for (size_t c = 0; c < channel_data_.size(); c++) { + if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) { + continue; + } + // Horizontal mirroring. + for (size_t y = 0; y < input_sizes[c].second; y++) { + float* row = get_row(c, y); + for (size_t ix = 0; ix < stage->settings_.border_x; ix++) { + *(row - ix - 1) = row[Mirror(-ssize_t(ix) - 1, input_sizes[c].first)]; + } + for (size_t ix = 0; ix < stage->settings_.border_x; ix++) { + *(row + ix + input_sizes[c].first) = + row[Mirror(ix + input_sizes[c].first, input_sizes[c].first)]; + } + } + // Vertical mirroring. + for (int y = 0; y < static_cast(stage->settings_.border_y); y++) { + memcpy(get_row(c, -y - 1) - stage->settings_.border_x, + get_row(c, Mirror(-ssize_t(y) - 1, input_sizes[c].second)) - + stage->settings_.border_x, + sizeof(float) * + (input_sizes[c].first + 2 * stage->settings_.border_x)); + } + for (int y = 0; y < static_cast(stage->settings_.border_y); y++) { + memcpy( + get_row(c, input_sizes[c].second + y) - stage->settings_.border_x, + get_row(c, + Mirror(input_sizes[c].second + y, input_sizes[c].second)) - + stage->settings_.border_x, + sizeof(float) * + (input_sizes[c].first + 2 * stage->settings_.border_x)); + } + } + + size_t ysize = 0; + size_t xsize = 0; + for (size_t c = 0; c < channel_data_.size(); c++) { + if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) { + continue; + } + ysize = std::max(input_sizes[c].second, ysize); + xsize = std::max(input_sizes[c].first, xsize); + } + + JXL_ASSERT(ysize != 0); + JXL_ASSERT(xsize != 0); + + RenderPipelineStage::RowInfo input_rows(channel_data_.size()); + RenderPipelineStage::RowInfo output_rows(channel_data_.size()); + + // Run the pipeline. + { + stage->SetInputSizes(input_sizes); + int border_y = stage->settings_.border_y; + for (size_t y = 0; y < ysize; y++) { + // Prepare input rows. + for (size_t c = 0; c < channel_data_.size(); c++) { + if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) { + continue; + } + input_rows[c].resize(2 * border_y + 1); + for (int iy = -border_y; iy <= border_y; iy++) { + input_rows[c][iy + border_y] = + channel_data_[c].Row(y + kRenderPipelineXOffset + iy); + } + } + // Prepare output rows. + for (size_t c = 0; c < channel_data_.size(); c++) { + if (!output_channels[c]) continue; + output_rows[c].resize(1 << stage->settings_.shift_y); + for (size_t iy = 0; iy < output_rows[c].size(); iy++) { + output_rows[c][iy] = output_channels[c]->Row( + (y << stage->settings_.shift_y) + iy + kRenderPipelineXOffset); + } + } + stage->ProcessRow(input_rows, output_rows, /*xextra=*/0, xsize, + /*xpos=*/0, y, thread_id); + } + } + + // Move new channels to current channels. + for (size_t c = 0; c < channel_data_.size(); c++) { + if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) { + continue; + } + channel_data_[c] = std::move(new_channels[c]); + } + for (size_t c = 0; c < channel_data_.size(); c++) { + size_t next_stage = std::min(stage_id + 1, channel_shifts_.size() - 1); + size_t xsize = DivCeil(frame_dimensions_.xsize_upsampled, + 1 << channel_shifts_[next_stage][c].first); + size_t ysize = DivCeil(frame_dimensions_.ysize_upsampled, + 1 << channel_shifts_[next_stage][c].second); + channel_data_[c].ShrinkTo(xsize + 2 * kRenderPipelineXOffset, + ysize + 2 * kRenderPipelineXOffset); + JXL_CHECK_PLANE_INITIALIZED( + channel_data_[c], + Rect(kRenderPipelineXOffset, kRenderPipelineXOffset, xsize, ysize), + c); + } + + if (stage->SwitchToImageDimensions()) { + size_t image_xsize, image_ysize; + FrameOrigin frame_origin; + stage->GetImageDimensions(&image_xsize, &image_ysize, &frame_origin); + frame_dimensions_.Set(image_xsize, image_ysize, 0, 0, 0, false, 1); + std::vector old_channels = std::move(channel_data_); + channel_data_.clear(); + channel_data_.reserve(old_channels.size()); + for (size_t c = 0; c < old_channels.size(); c++) { + channel_data_.emplace_back(2 * kRenderPipelineXOffset + image_xsize, + 2 * kRenderPipelineXOffset + image_ysize); + } + for (size_t y = 0; y < image_ysize; ++y) { + for (size_t c = 0; c < channel_data_.size(); c++) { + output_rows[c].resize(1); + output_rows[c][0] = channel_data_[c].Row(kRenderPipelineXOffset + y); + } + // TODO(sboukortt): consider doing this only on the parts of the + // background that won't be occluded. + stage->ProcessPaddingRow(output_rows, image_xsize, 0, y); + } + ssize_t x0 = frame_origin.x0; + ssize_t y0 = frame_origin.y0; + size_t x0_fg = 0; + size_t y0_fg = 0; + if (x0 < 0) { + xsize += x0; + x0_fg -= x0; + x0 = 0; + } + if (x0 + xsize > image_xsize) { + xsize = image_xsize - x0; + } + if (y0 < 0) { + ysize += y0; + y0_fg -= x0; + y0 = 0; + } + if (y0 + ysize > image_ysize) { + ysize = image_ysize - y0; + } + const Rect rect_fg_relative_to_image = + Rect(x0, y0, xsize, ysize) + .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset); + const Rect rect_fg = + Rect(x0_fg, y0_fg, xsize, ysize) + .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset); + for (size_t c = 0; c < channel_data_.size(); c++) { + CopyImageTo(rect_fg, old_channels[c], rect_fg_relative_to_image, + &channel_data_[c]); + } + } + } +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h new file mode 100644 index 0000000000..10f4505912 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_ +#define LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_ + +#include + +#include "lib/jxl/render_pipeline/render_pipeline.h" + +namespace jxl { + +// A RenderPipeline that is "obviously correct"; it may use potentially large +// amounts of memory and be slow. It is intended to be used mostly for testing +// purposes. +class SimpleRenderPipeline : public RenderPipeline { + std::vector> PrepareBuffers( + size_t group_id, size_t thread_id) override; + + void ProcessBuffers(size_t group_id, size_t thread_id) override; + + void PrepareForThreadsInternal(size_t num, bool use_group_ids) override; + + // Full frame buffers. Both X and Y dimensions are padded by + // kRenderPipelineXOffset. + std::vector channel_data_; + size_t processed_passes_ = 0; + + private: + Rect MakeChannelRect(size_t group_id, size_t channel); +}; + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc new file mode 100644 index 0000000000..b6668c5625 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc @@ -0,0 +1,247 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_blending.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_blending.cc" +#include +#include + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/blending.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +class BlendingStage : public RenderPipelineStage { + public: + explicit BlendingStage(const PassesDecoderState* dec_state, + const ColorEncoding& frame_color_encoding) + : RenderPipelineStage(RenderPipelineStage::Settings()), + state_(*dec_state->shared) { + image_xsize_ = state_.frame_header.nonserialized_metadata->xsize(); + image_ysize_ = state_.frame_header.nonserialized_metadata->ysize(); + extra_channel_info_ = + &state_.frame_header.nonserialized_metadata->m.extra_channel_info; + info_ = state_.frame_header.blending_info; + const std::vector& ec_info = + state_.frame_header.extra_channel_blending_info; + const ImageBundle& bg = state_.reference_frames[info_.source].frame; + bg_ = &bg; + if (bg.xsize() == 0 || bg.ysize() == 0) { + zeroes_.resize(image_xsize_, 0.f); + } else if (state_.reference_frames[info_.source].ib_is_in_xyb) { + initialized_ = JXL_FAILURE( + "Trying to blend XYB reference frame %i and non-XYB frame", + info_.source); + return; + } else if (std::any_of(ec_info.begin(), ec_info.end(), + [this](const BlendingInfo& info) { + const ImageBundle& bg = + state_.reference_frames[info.source].frame; + return bg.xsize() == 0 || bg.ysize() == 0; + })) { + zeroes_.resize(image_xsize_, 0.f); + } + + auto verify_bg_size = [&](const ImageBundle& bg) -> Status { + if (bg.xsize() != 0 && bg.ysize() != 0 && + (bg.xsize() < image_xsize_ || bg.ysize() < image_ysize_ || + bg.origin.x0 != 0 || bg.origin.y0 != 0)) { + return JXL_FAILURE("Trying to use a %" PRIuS "x%" PRIuS + " crop as a background", + bg.xsize(), bg.ysize()); + } + return true; + }; + + Status ok = verify_bg_size(bg); + for (const auto& info : ec_info) { + const ImageBundle& bg = state_.reference_frames[info.source].frame; + if (!!ok) ok = verify_bg_size(bg); + } + if (!ok) { + initialized_ = ok; + return; + } + + if (state_.metadata->m.xyb_encoded) { + if (!dec_state->output_encoding_info.color_encoding_is_original) { + initialized_ = JXL_FAILURE("Blending in unsupported color space"); + return; + } + } + + blending_info_.resize(ec_info.size() + 1); + auto make_blending = [&](const BlendingInfo& info, PatchBlending* pb) { + pb->alpha_channel = info.alpha_channel; + pb->clamp = info.clamp; + switch (info.mode) { + case BlendMode::kReplace: { + pb->mode = PatchBlendMode::kReplace; + break; + } + case BlendMode::kAdd: { + pb->mode = PatchBlendMode::kAdd; + break; + } + case BlendMode::kMul: { + pb->mode = PatchBlendMode::kMul; + break; + } + case BlendMode::kBlend: { + pb->mode = PatchBlendMode::kBlendAbove; + break; + } + case BlendMode::kAlphaWeightedAdd: { + pb->mode = PatchBlendMode::kAlphaWeightedAddAbove; + break; + } + default: { + JXL_ABORT("Invalid blend mode"); // should have failed to decode + } + } + }; + make_blending(info_, &blending_info_[0]); + for (size_t i = 0; i < ec_info.size(); i++) { + make_blending(ec_info[i], &blending_info_[1 + i]); + } + } + + Status IsInitialized() const override { return initialized_; } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("Blend"); + JXL_ASSERT(initialized_); + const FrameOrigin& frame_origin = state_.frame_header.frame_origin; + ssize_t bg_xpos = frame_origin.x0 + static_cast(xpos); + ssize_t bg_ypos = frame_origin.y0 + static_cast(ypos); + int offset = 0; + if (bg_xpos + static_cast(xsize) <= 0 || + frame_origin.x0 >= static_cast(image_xsize_) || bg_ypos < 0 || + bg_ypos >= static_cast(image_ysize_)) { + return; + } + if (bg_xpos < 0) { + offset -= bg_xpos; + xsize += bg_xpos; + bg_xpos = 0; + } + if (bg_xpos + xsize > image_xsize_) { + xsize = + std::max(0, static_cast(image_xsize_) - bg_xpos); + } + std::vector bg_row_ptrs_(input_rows.size()); + std::vector fg_row_ptrs_(input_rows.size()); + size_t num_c = std::min(input_rows.size(), extra_channel_info_->size() + 3); + for (size_t c = 0; c < num_c; ++c) { + fg_row_ptrs_[c] = GetInputRow(input_rows, c, 0) + offset; + if (c < 3) { + bg_row_ptrs_[c] = bg_->xsize() != 0 && bg_->ysize() != 0 + ? bg_->color().ConstPlaneRow(c, bg_ypos) + bg_xpos + : zeroes_.data(); + } else { + const ImageBundle& ec_bg = + state_ + .reference_frames[state_.frame_header + .extra_channel_blending_info[c - 3] + .source] + .frame; + bg_row_ptrs_[c] = + ec_bg.xsize() != 0 && ec_bg.ysize() != 0 + ? ec_bg.extra_channels()[c - 3].ConstRow(bg_ypos) + bg_xpos + : zeroes_.data(); + } + } + PerformBlending(bg_row_ptrs_.data(), fg_row_ptrs_.data(), + fg_row_ptrs_.data(), 0, xsize, blending_info_[0], + blending_info_.data() + 1, *extra_channel_info_); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return RenderPipelineChannelMode::kInPlace; + } + + bool SwitchToImageDimensions() const override { return true; } + + void GetImageDimensions(size_t* xsize, size_t* ysize, + FrameOrigin* frame_origin) const override { + *xsize = image_xsize_; + *ysize = image_ysize_; + *frame_origin = state_.frame_header.frame_origin; + } + + void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize, size_t xpos, + size_t ypos) const override { + if (bg_->xsize() == 0 || bg_->ysize() == 0) { + for (size_t c = 0; c < 3; ++c) { + memset(GetInputRow(output_rows, c, 0), 0, xsize * sizeof(float)); + } + } else { + for (size_t c = 0; c < 3; ++c) { + memcpy(GetInputRow(output_rows, c, 0), + bg_->color().ConstPlaneRow(c, ypos) + xpos, + xsize * sizeof(float)); + } + } + for (size_t ec = 0; ec < extra_channel_info_->size(); ++ec) { + const ImageBundle& ec_bg = + state_ + .reference_frames + [state_.frame_header.extra_channel_blending_info[ec].source] + .frame; + if (ec_bg.xsize() == 0 || ec_bg.ysize() == 0) { + memset(GetInputRow(output_rows, 3 + ec, 0), 0, xsize * sizeof(float)); + } else { + memcpy(GetInputRow(output_rows, 3 + ec, 0), + ec_bg.extra_channels()[ec].ConstRow(ypos) + xpos, + xsize * sizeof(float)); + } + } + } + + const char* GetName() const override { return "Blending"; } + + private: + const PassesSharedState& state_; + BlendingInfo info_; + const ImageBundle* bg_; + Status initialized_ = true; + size_t image_xsize_; + size_t image_ysize_; + std::vector blending_info_; + const std::vector* extra_channel_info_; + std::vector zeroes_; +}; + +std::unique_ptr GetBlendingStage( + const PassesDecoderState* dec_state, + const ColorEncoding& frame_color_encoding) { + return jxl::make_unique(dec_state, frame_color_encoding); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetBlendingStage); + +std::unique_ptr GetBlendingStage( + const PassesDecoderState* dec_state, + const ColorEncoding& frame_color_encoding) { + return HWY_DYNAMIC_DISPATCH(GetBlendingStage)(dec_state, + frame_color_encoding); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h new file mode 100644 index 0000000000..c8db7490cd --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h @@ -0,0 +1,24 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_ + +#include + +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" +#include "lib/jxl/splines.h" + +namespace jxl { + +// Applies blending if applicable. +std::unique_ptr GetBlendingStage( + const PassesDecoderState* dec_state, + const ColorEncoding& frame_color_encoding); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc new file mode 100644 index 0000000000..9b73ee91f1 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc @@ -0,0 +1,129 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_chroma_upsampling.cc" +#include +#include + +#include "lib/jxl/simd_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; + +class HorizontalChromaUpsamplingStage : public RenderPipelineStage { + public: + explicit HorizontalChromaUpsamplingStage(size_t channel) + : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX( + /*shift=*/1, /*border=*/1)), + c_(channel) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("HorizontalChromaUpsampling"); + HWY_FULL(float) df; + xextra = RoundUpTo(xextra, Lanes(df)); + auto threefour = Set(df, 0.75f); + auto onefour = Set(df, 0.25f); + const float* row_in = GetInputRow(input_rows, c_, 0); + float* row_out = GetOutputRow(output_rows, c_, 0); + for (ssize_t x = -xextra; x < static_cast(xsize + xextra); + x += Lanes(df)) { + auto current = Mul(LoadU(df, row_in + x), threefour); + auto prev = LoadU(df, row_in + x - 1); + auto next = LoadU(df, row_in + x + 1); + auto left = MulAdd(onefour, prev, current); + auto right = MulAdd(onefour, next, current); + StoreInterleaved(df, left, right, row_out + x * 2); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c == c_ ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "HChromaUps"; } + + private: + size_t c_; +}; + +class VerticalChromaUpsamplingStage : public RenderPipelineStage { + public: + explicit VerticalChromaUpsamplingStage(size_t channel) + : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY( + /*shift=*/1, /*border=*/1)), + c_(channel) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("VerticalChromaUpsampling"); + HWY_FULL(float) df; + xextra = RoundUpTo(xextra, Lanes(df)); + auto threefour = Set(df, 0.75f); + auto onefour = Set(df, 0.25f); + const float* row_top = GetInputRow(input_rows, c_, -1); + const float* row_mid = GetInputRow(input_rows, c_, 0); + const float* row_bot = GetInputRow(input_rows, c_, 1); + float* row_out0 = GetOutputRow(output_rows, c_, 0); + float* row_out1 = GetOutputRow(output_rows, c_, 1); + for (ssize_t x = -xextra; x < static_cast(xsize + xextra); + x += Lanes(df)) { + auto it = LoadU(df, row_top + x); + auto im = LoadU(df, row_mid + x); + auto ib = LoadU(df, row_bot + x); + auto im_scaled = Mul(im, threefour); + Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x); + Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c == c_ ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "VChromaUps"; } + + private: + size_t c_; +}; + +std::unique_ptr GetChromaUpsamplingStage(size_t channel, + bool horizontal) { + if (horizontal) { + return jxl::make_unique(channel); + } else { + return jxl::make_unique(channel); + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetChromaUpsamplingStage); + +std::unique_ptr GetChromaUpsamplingStage(size_t channel, + bool horizontal) { + return HWY_DYNAMIC_DISPATCH(GetChromaUpsamplingStage)(channel, horizontal); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h new file mode 100644 index 0000000000..b8bfc15f5f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h @@ -0,0 +1,27 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_ +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Applies simple upsampling, either horizontal or vertical, to the given +// channel. +std::unique_ptr GetChromaUpsamplingStage(size_t channel, + bool horizontal); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc new file mode 100644 index 0000000000..d59c497843 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc @@ -0,0 +1,524 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_epf.h" + +#include "lib/jxl/epf.h" +#include "lib/jxl/sanitizers.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_epf.cc" +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +// TODO(veluca): In principle, vectors could be not capped, if we want to deal +// with having two different sigma values in a single vector. +using DF = HWY_CAPPED(float, 8); + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::AbsDiff; +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Div; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Vec; +using hwy::HWY_NAMESPACE::VFromD; +using hwy::HWY_NAMESPACE::ZeroIfNegative; + +JXL_INLINE Vec Weight(Vec sad, Vec inv_sigma, Vec thres) { + auto v = MulAdd(sad, inv_sigma, Set(DF(), 1.0f)); + return ZeroIfNegative(v); +} + +// 5x5 plus-shaped kernel with 5 SADs per pixel (3x3 plus-shaped). So this makes +// this filter a 7x7 filter. +class EPF0Stage : public RenderPipelineStage { + public: + EPF0Stage(const LoopFilter& lf, const ImageF& sigma) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/3)), + lf_(lf), + sigma_(&sigma) {} + + template + JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][7], ssize_t x, + Vec sad, Vec inv_sigma, + Vec* JXL_RESTRICT X, Vec* JXL_RESTRICT Y, + Vec* JXL_RESTRICT B, + Vec* JXL_RESTRICT w) const { + auto cx = aligned ? Load(DF(), rows[0][3 + row] + x) + : LoadU(DF(), rows[0][3 + row] + x); + auto cy = aligned ? Load(DF(), rows[1][3 + row] + x) + : LoadU(DF(), rows[1][3 + row] + x); + auto cb = aligned ? Load(DF(), rows[2][3 + row] + x) + : LoadU(DF(), rows[2][3 + row] + x); + + auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush)); + *w = Add(*w, weight); + *X = MulAdd(weight, cx, *X); + *Y = MulAdd(weight, cy, *Y); + *B = MulAdd(weight, cb, *B); + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + DF df; + + using V = decltype(Zero(df)); + V t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA, tB; + V* sads[12] = {&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, &tA, &tB}; + + xextra = RoundUpTo(xextra, Lanes(df)); + const float* JXL_RESTRICT row_sigma = + sigma_->Row(ypos / kBlockDim + kSigmaPadding); + + float sm = lf_.epf_pass0_sigma_scale * 1.65; + float bsm = sm * lf_.epf_border_sad_mul; + + HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm, + sm, sm, sm, bsm}; + HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm, + bsm, bsm, bsm, bsm}; + float* JXL_RESTRICT rows[3][7]; + for (size_t c = 0; c < 3; c++) { + for (int i = 0; i < 7; i++) { + rows[c][i] = GetInputRow(input_rows, c, i - 3); + } + } + + const float* sad_mul = + (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1) + ? sad_mul_border + : sad_mul_center; + + for (ssize_t x = -xextra; x < static_cast(xsize + xextra); + x += Lanes(df)) { + size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim; + size_t ix = (x + xpos) % kBlockDim; + + if (row_sigma[bx] < kMinSigma) { + for (size_t c = 0; c < 3; c++) { + auto px = Load(df, rows[c][3 + 0] + x); + StoreU(px, df, GetOutputRow(output_rows, c, 0) + x); + } + continue; + } + + const auto sm = Load(df, sad_mul + ix); + const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm); + + for (size_t i = 0; i < 12; i++) *sads[i] = Zero(df); + constexpr std::array sads_off[12] = { + {{-2, 0}}, {{-1, -1}}, {{-1, 0}}, {{-1, 1}}, {{0, -2}}, {{0, -1}}, + {{0, 1}}, {{0, 2}}, {{1, -1}}, {{1, 0}}, {{1, 1}}, {{2, 0}}, + }; + + // compute sads + // TODO(veluca): consider unrolling and optimizing this. + for (size_t c = 0; c < 3; c++) { + auto scale = Set(df, lf_.epf_channel_scale[c]); + for (size_t i = 0; i < 12; i++) { + auto sad = Zero(df); + constexpr std::array plus_off[] = { + {{0, 0}}, {{-1, 0}}, {{0, -1}}, {{1, 0}}, {{0, 1}}}; + for (size_t j = 0; j < 5; j++) { + const auto r11 = + LoadU(df, rows[c][3 + plus_off[j][0]] + x + plus_off[j][1]); + const auto c11 = + LoadU(df, rows[c][3 + sads_off[i][0] + plus_off[j][0]] + x + + sads_off[i][1] + plus_off[j][1]); + sad = Add(sad, AbsDiff(r11, c11)); + } + *sads[i] = MulAdd(sad, scale, *sads[i]); + } + } + const auto x_cc = Load(df, rows[0][3 + 0] + x); + const auto y_cc = Load(df, rows[1][3 + 0] + x); + const auto b_cc = Load(df, rows[2][3 + 0] + x); + + auto w = Set(df, 1); + auto X = x_cc; + auto Y = y_cc; + auto B = b_cc; + + for (size_t i = 0; i < 12; i++) { + AddPixel(/*row=*/sads_off[i][0], rows, + x + sads_off[i][1], *sads[i], inv_sigma, &X, + &Y, &B, &w); + } +#if JXL_HIGH_PRECISION + auto inv_w = Div(Set(df, 1.0f), w); +#else + auto inv_w = ApproximateReciprocal(w); +#endif + StoreU(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x); + StoreU(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x); + StoreU(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "EPF0"; } + + private: + LoopFilter lf_; + const ImageF* sigma_; +}; + +// 3x3 plus-shaped kernel with 5 SADs per pixel (also 3x3 plus-shaped). So this +// makes this filter a 5x5 filter. +class EPF1Stage : public RenderPipelineStage { + public: + EPF1Stage(const LoopFilter& lf, const ImageF& sigma) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/2)), + lf_(lf), + sigma_(&sigma) {} + + template + JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][5], ssize_t x, + Vec sad, Vec inv_sigma, + Vec* JXL_RESTRICT X, Vec* JXL_RESTRICT Y, + Vec* JXL_RESTRICT B, + Vec* JXL_RESTRICT w) const { + auto cx = aligned ? Load(DF(), rows[0][2 + row] + x) + : LoadU(DF(), rows[0][2 + row] + x); + auto cy = aligned ? Load(DF(), rows[1][2 + row] + x) + : LoadU(DF(), rows[1][2 + row] + x); + auto cb = aligned ? Load(DF(), rows[2][2 + row] + x) + : LoadU(DF(), rows[2][2 + row] + x); + + auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush)); + *w = Add(*w, weight); + *X = MulAdd(weight, cx, *X); + *Y = MulAdd(weight, cy, *Y); + *B = MulAdd(weight, cb, *B); + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + DF df; + xextra = RoundUpTo(xextra, Lanes(df)); + const float* JXL_RESTRICT row_sigma = + sigma_->Row(ypos / kBlockDim + kSigmaPadding); + + float sm = 1.65f; + float bsm = sm * lf_.epf_border_sad_mul; + + HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm, + sm, sm, sm, bsm}; + HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm, + bsm, bsm, bsm, bsm}; + + float* JXL_RESTRICT rows[3][5]; + for (size_t c = 0; c < 3; c++) { + for (int i = 0; i < 5; i++) { + rows[c][i] = GetInputRow(input_rows, c, i - 2); + } + } + + const float* sad_mul = + (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1) + ? sad_mul_border + : sad_mul_center; + + for (ssize_t x = -xextra; x < static_cast(xsize + xextra); + x += Lanes(df)) { + size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim; + size_t ix = (x + xpos) % kBlockDim; + + if (row_sigma[bx] < kMinSigma) { + for (size_t c = 0; c < 3; c++) { + auto px = Load(df, rows[c][2 + 0] + x); + Store(px, df, GetOutputRow(output_rows, c, 0) + x); + } + continue; + } + + const auto sm = Load(df, sad_mul + ix); + const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm); + auto sad0 = Zero(df); + auto sad1 = Zero(df); + auto sad2 = Zero(df); + auto sad3 = Zero(df); + + // compute sads + for (size_t c = 0; c < 3; c++) { + // center px = 22, px above = 21 + auto t = Undefined(df); + + const auto p20 = Load(df, rows[c][2 + -2] + x); + const auto p21 = Load(df, rows[c][2 + -1] + x); + auto sad0c = AbsDiff(p20, p21); // SAD 2, 1 + + const auto p11 = LoadU(df, rows[c][2 + -1] + x - 1); + auto sad1c = AbsDiff(p11, p21); // SAD 1, 2 + + const auto p31 = LoadU(df, rows[c][2 + -1] + x + 1); + auto sad2c = AbsDiff(p31, p21); // SAD 3, 2 + + const auto p02 = LoadU(df, rows[c][2 + 0] + x - 2); + const auto p12 = LoadU(df, rows[c][2 + 0] + x - 1); + sad1c = Add(sad1c, AbsDiff(p02, p12)); // SAD 1, 2 + sad0c = Add(sad0c, AbsDiff(p11, p12)); // SAD 2, 1 + + const auto p22 = LoadU(df, rows[c][2 + 0] + x); + t = AbsDiff(p12, p22); + sad1c = Add(sad1c, t); // SAD 1, 2 + sad2c = Add(sad2c, t); // SAD 3, 2 + t = AbsDiff(p22, p21); + auto sad3c = t; // SAD 2, 3 + sad0c = Add(sad0c, t); // SAD 2, 1 + + const auto p32 = LoadU(df, rows[c][2 + 0] + x + 1); + sad0c = Add(sad0c, AbsDiff(p31, p32)); // SAD 2, 1 + t = AbsDiff(p22, p32); + sad1c = Add(sad1c, t); // SAD 1, 2 + sad2c = Add(sad2c, t); // SAD 3, 2 + + const auto p42 = LoadU(df, rows[c][2 + 0] + x + 2); + sad2c = Add(sad2c, AbsDiff(p42, p32)); // SAD 3, 2 + + const auto p13 = LoadU(df, rows[c][2 + 1] + x - 1); + sad3c = Add(sad3c, AbsDiff(p13, p12)); // SAD 2, 3 + + const auto p23 = Load(df, rows[c][2 + 1] + x); + t = AbsDiff(p22, p23); + sad0c = Add(sad0c, t); // SAD 2, 1 + sad3c = Add(sad3c, t); // SAD 2, 3 + sad1c = Add(sad1c, AbsDiff(p13, p23)); // SAD 1, 2 + + const auto p33 = LoadU(df, rows[c][2 + 1] + x + 1); + sad2c = Add(sad2c, AbsDiff(p33, p23)); // SAD 3, 2 + sad3c = Add(sad3c, AbsDiff(p33, p32)); // SAD 2, 3 + + const auto p24 = Load(df, rows[c][2 + 2] + x); + sad3c = Add(sad3c, AbsDiff(p24, p23)); // SAD 2, 3 + + auto scale = Set(df, lf_.epf_channel_scale[c]); + sad0 = MulAdd(sad0c, scale, sad0); + sad1 = MulAdd(sad1c, scale, sad1); + sad2 = MulAdd(sad2c, scale, sad2); + sad3 = MulAdd(sad3c, scale, sad3); + } + const auto x_cc = Load(df, rows[0][2 + 0] + x); + const auto y_cc = Load(df, rows[1][2 + 0] + x); + const auto b_cc = Load(df, rows[2][2 + 0] + x); + + auto w = Set(df, 1); + auto X = x_cc; + auto Y = y_cc; + auto B = b_cc; + + // Top row + AddPixel(/*row=*/-1, rows, x, sad0, inv_sigma, &X, &Y, + &B, &w); + // Center + AddPixel(/*row=*/0, rows, x - 1, sad1, inv_sigma, &X, + &Y, &B, &w); + AddPixel(/*row=*/0, rows, x + 1, sad2, inv_sigma, &X, + &Y, &B, &w); + // Bottom + AddPixel(/*row=*/1, rows, x, sad3, inv_sigma, &X, &Y, + &B, &w); +#if JXL_HIGH_PRECISION + auto inv_w = Div(Set(df, 1.0f), w); +#else + auto inv_w = ApproximateReciprocal(w); +#endif + Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x); + Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x); + Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "EPF1"; } + + private: + LoopFilter lf_; + const ImageF* sigma_; +}; + +// 3x3 plus-shaped kernel with 1 SAD per pixel. So this makes this filter a 3x3 +// filter. +class EPF2Stage : public RenderPipelineStage { + public: + EPF2Stage(const LoopFilter& lf, const ImageF& sigma) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/1)), + lf_(lf), + sigma_(&sigma) {} + + template + JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][3], ssize_t x, + Vec rx, Vec ry, Vec rb, + Vec inv_sigma, Vec* JXL_RESTRICT X, + Vec* JXL_RESTRICT Y, Vec* JXL_RESTRICT B, + Vec* JXL_RESTRICT w) const { + auto cx = aligned ? Load(DF(), rows[0][1 + row] + x) + : LoadU(DF(), rows[0][1 + row] + x); + auto cy = aligned ? Load(DF(), rows[1][1 + row] + x) + : LoadU(DF(), rows[1][1 + row] + x); + auto cb = aligned ? Load(DF(), rows[2][1 + row] + x) + : LoadU(DF(), rows[2][1 + row] + x); + + auto sad = Mul(AbsDiff(cx, rx), Set(DF(), lf_.epf_channel_scale[0])); + sad = MulAdd(AbsDiff(cy, ry), Set(DF(), lf_.epf_channel_scale[1]), sad); + sad = MulAdd(AbsDiff(cb, rb), Set(DF(), lf_.epf_channel_scale[2]), sad); + + auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass2_zeroflush)); + + *w = Add(*w, weight); + *X = MulAdd(weight, cx, *X); + *Y = MulAdd(weight, cy, *Y); + *B = MulAdd(weight, cb, *B); + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + DF df; + xextra = RoundUpTo(xextra, Lanes(df)); + const float* JXL_RESTRICT row_sigma = + sigma_->Row(ypos / kBlockDim + kSigmaPadding); + + float sm = lf_.epf_pass2_sigma_scale * 1.65; + float bsm = sm * lf_.epf_border_sad_mul; + + HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm, + sm, sm, sm, bsm}; + HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm, + bsm, bsm, bsm, bsm}; + + float* JXL_RESTRICT rows[3][3]; + for (size_t c = 0; c < 3; c++) { + for (int i = 0; i < 3; i++) { + rows[c][i] = GetInputRow(input_rows, c, i - 1); + } + } + + const float* sad_mul = + (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1) + ? sad_mul_border + : sad_mul_center; + + for (ssize_t x = -xextra; x < static_cast(xsize + xextra); + x += Lanes(df)) { + size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim; + size_t ix = (x + xpos) % kBlockDim; + + if (row_sigma[bx] < kMinSigma) { + for (size_t c = 0; c < 3; c++) { + auto px = Load(df, rows[c][1 + 0] + x); + Store(px, df, GetOutputRow(output_rows, c, 0) + x); + } + continue; + } + + const auto sm = Load(df, sad_mul + ix); + const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm); + + const auto x_cc = Load(df, rows[0][1 + 0] + x); + const auto y_cc = Load(df, rows[1][1 + 0] + x); + const auto b_cc = Load(df, rows[2][1 + 0] + x); + + auto w = Set(df, 1); + auto X = x_cc; + auto Y = y_cc; + auto B = b_cc; + + // Top row + AddPixel(/*row=*/-1, rows, x, x_cc, y_cc, b_cc, + inv_sigma, &X, &Y, &B, &w); + // Center + AddPixel(/*row=*/0, rows, x - 1, x_cc, y_cc, b_cc, + inv_sigma, &X, &Y, &B, &w); + AddPixel(/*row=*/0, rows, x + 1, x_cc, y_cc, b_cc, + inv_sigma, &X, &Y, &B, &w); + // Bottom + AddPixel(/*row=*/1, rows, x, x_cc, y_cc, b_cc, + inv_sigma, &X, &Y, &B, &w); +#if JXL_HIGH_PRECISION + auto inv_w = Div(Set(df, 1.0f), w); +#else + auto inv_w = ApproximateReciprocal(w); +#endif + Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x); + Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x); + Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "EPF2"; } + + private: + LoopFilter lf_; + const ImageF* sigma_; +}; + +std::unique_ptr GetEPFStage0(const LoopFilter& lf, + const ImageF& sigma) { + return jxl::make_unique(lf, sigma); +} + +std::unique_ptr GetEPFStage1(const LoopFilter& lf, + const ImageF& sigma) { + return jxl::make_unique(lf, sigma); +} + +std::unique_ptr GetEPFStage2(const LoopFilter& lf, + const ImageF& sigma) { + return jxl::make_unique(lf, sigma); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetEPFStage0); +HWY_EXPORT(GetEPFStage1); +HWY_EXPORT(GetEPFStage2); + +std::unique_ptr GetEPFStage(const LoopFilter& lf, + const ImageF& sigma, + size_t epf_stage) { + JXL_ASSERT(lf.epf_iters != 0); + switch (epf_stage) { + case 0: + return HWY_DYNAMIC_DISPATCH(GetEPFStage0)(lf, sigma); + case 1: + return HWY_DYNAMIC_DISPATCH(GetEPFStage1)(lf, sigma); + case 2: + return HWY_DYNAMIC_DISPATCH(GetEPFStage2)(lf, sigma); + default: + JXL_ABORT("Invalid EPF stage"); + } +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h new file mode 100644 index 0000000000..c9d0d0c785 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_ +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/image.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Applies the `epf_stage`-th EPF step with the given settings and `sigma`. +// `sigma` will be accessed with an offset of (kSigmaPadding, kSigmaPadding), +// and should have (kSigmaBorder, kSigmaBorder) mirrored sigma values available +// around the main image. See also filters.(h|cc) +std::unique_ptr GetEPFStage(const LoopFilter& lf, + const ImageF& sigma, + size_t epf_stage); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc new file mode 100644 index 0000000000..c7b22c663b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc @@ -0,0 +1,191 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_from_linear.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_from_linear.cc" +#include +#include + +#include "lib/jxl/dec_tone_mapping-inl.h" +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::IfThenZeroElse; + +template +struct PerChannelOp { + explicit PerChannelOp(Op op) : op(op) {} + template + void Transform(D d, T* r, T* g, T* b) const { + *r = op.Transform(d, *r); + *g = op.Transform(d, *g); + *b = op.Transform(d, *b); + } + + Op op; +}; +template +PerChannelOp MakePerChannelOp(Op&& op) { + return PerChannelOp(std::forward(op)); +} + +struct OpLinear { + template + T Transform(D d, const T& linear) const { + return linear; + } +}; + +struct OpRgb { + template + T Transform(D d, const T& linear) const { +#if JXL_HIGH_PRECISION + return TF_SRGB().EncodedFromDisplay(d, linear); +#else + return FastLinearToSRGB(d, linear); +#endif + } +}; + +struct OpPq { + template + T Transform(D d, const T& linear) const { + return TF_PQ().EncodedFromDisplay(d, linear); + } +}; + +struct OpHlg { + explicit OpHlg(const float luminances[3], const float intensity_target) + : hlg_ootf_(HlgOOTF::ToSceneLight(/*display_luminance=*/intensity_target, + luminances)) {} + + template + void Transform(D d, T* r, T* g, T* b) const { + hlg_ootf_.Apply(r, g, b); + *r = TF_HLG().EncodedFromDisplay(d, *r); + *g = TF_HLG().EncodedFromDisplay(d, *g); + *b = TF_HLG().EncodedFromDisplay(d, *b); + } + HlgOOTF hlg_ootf_; +}; + +struct Op709 { + template + T Transform(D d, const T& linear) const { + return TF_709().EncodedFromDisplay(d, linear); + } +}; + +struct OpGamma { + const float inverse_gamma; + template + T Transform(D d, const T& linear) const { + return IfThenZeroElse(Le(linear, Set(d, 1e-5f)), + FastPowf(d, linear, Set(d, inverse_gamma))); + } +}; + +template +class FromLinearStage : public RenderPipelineStage { + public: + explicit FromLinearStage(Op op) + : RenderPipelineStage(RenderPipelineStage::Settings()), + op_(std::move(op)) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("FromLinear"); + const HWY_FULL(float) d; + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + // All calculations are lane-wise, still some might require + // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last + // vector tail. + msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + auto r = LoadU(d, row0 + x); + auto g = LoadU(d, row1 + x); + auto b = LoadU(d, row2 + x); + op_.Transform(d, &r, &g, &b); + StoreU(r, d, row0 + x); + StoreU(g, d, row1 + x); + StoreU(b, d, row2 + x); + } + msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "FromLinear"; } + + private: + Op op_; +}; + +template +std::unique_ptr> MakeFromLinearStage(Op&& op) { + return jxl::make_unique>(std::forward(op)); +} + +std::unique_ptr GetFromLinearStage( + const OutputEncodingInfo& output_encoding_info) { + if (output_encoding_info.color_encoding.tf.IsLinear()) { + return MakeFromLinearStage(MakePerChannelOp(OpLinear())); + } else if (output_encoding_info.color_encoding.tf.IsSRGB()) { + return MakeFromLinearStage(MakePerChannelOp(OpRgb())); + } else if (output_encoding_info.color_encoding.tf.IsPQ()) { + return MakeFromLinearStage(MakePerChannelOp(OpPq())); + } else if (output_encoding_info.color_encoding.tf.IsHLG()) { + return MakeFromLinearStage( + OpHlg(output_encoding_info.luminances, + output_encoding_info.desired_intensity_target)); + } else if (output_encoding_info.color_encoding.tf.Is709()) { + return MakeFromLinearStage(MakePerChannelOp(Op709())); + } else if (output_encoding_info.color_encoding.tf.IsGamma() || + output_encoding_info.color_encoding.tf.IsDCI()) { + return MakeFromLinearStage( + MakePerChannelOp(OpGamma{output_encoding_info.inverse_gamma})); + } else { + // This is a programming error. + JXL_ABORT("Invalid target encoding"); + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetFromLinearStage); + +std::unique_ptr GetFromLinearStage( + const OutputEncodingInfo& output_encoding_info) { + return HWY_DYNAMIC_DISPATCH(GetFromLinearStage)(output_encoding_info); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h new file mode 100644 index 0000000000..548ab50b8c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h @@ -0,0 +1,20 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_ + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Converts the color channels from linear to the specified output encoding. +std::unique_ptr GetFromLinearStage( + const OutputEncodingInfo& output_encoding_info); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc new file mode 100644 index 0000000000..fc90acb476 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc @@ -0,0 +1,122 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_gaborish.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_gaborish.cc" +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; + +class GaborishStage : public RenderPipelineStage { + public: + explicit GaborishStage(const LoopFilter& lf) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/1)) { + weights_[0] = 1; + weights_[1] = lf.gab_x_weight1; + weights_[2] = lf.gab_x_weight2; + weights_[3] = 1; + weights_[4] = lf.gab_y_weight1; + weights_[5] = lf.gab_y_weight2; + weights_[6] = 1; + weights_[7] = lf.gab_b_weight1; + weights_[8] = lf.gab_b_weight2; + // Normalize + for (size_t c = 0; c < 3; c++) { + const float div = + weights_[3 * c] + 4 * (weights_[3 * c + 1] + weights_[3 * c + 2]); + const float mul = 1.0f / div; + weights_[3 * c] *= mul; + weights_[3 * c + 1] *= mul; + weights_[3 * c + 2] *= mul; + } + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("Gaborish"); + + const HWY_FULL(float) d; + for (size_t c = 0; c < 3; c++) { + float* JXL_RESTRICT row_t = GetInputRow(input_rows, c, -1); + float* JXL_RESTRICT row_m = GetInputRow(input_rows, c, 0); + float* JXL_RESTRICT row_b = GetInputRow(input_rows, c, 1); + float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0); + const auto w0 = Set(d, weights_[3 * c + 0]); + const auto w1 = Set(d, weights_[3 * c + 1]); + const auto w2 = Set(d, weights_[3 * c + 2]); +// Group data need only be aligned to a block; for >=512 bit vectors, this may +// result in unaligned loads. +#if HWY_CAP_GE512 +#define LoadMaybeU LoadU +#else +#define LoadMaybeU Load +#endif + // Since GetInputRow(input_rows, c, {-1, 0, 1}) is aligned, rounding + // xextra up to Lanes(d) doesn't access anything problematic. + for (ssize_t x = -RoundUpTo(xextra, Lanes(d)); + x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + const auto t = LoadMaybeU(d, row_t + x); + const auto tl = LoadU(d, row_t + x - 1); + const auto tr = LoadU(d, row_t + x + 1); + const auto m = LoadMaybeU(d, row_m + x); + const auto l = LoadU(d, row_m + x - 1); + const auto r = LoadU(d, row_m + x + 1); + const auto b = LoadMaybeU(d, row_b + x); + const auto bl = LoadU(d, row_b + x - 1); + const auto br = LoadU(d, row_b + x + 1); + const auto sum0 = m; + const auto sum1 = Add(Add(l, r), Add(t, b)); + const auto sum2 = Add(Add(tl, tr), Add(bl, br)); + auto pixels = MulAdd(sum2, w2, MulAdd(sum1, w1, Mul(sum0, w0))); + Store(pixels, d, row_out + x); + } + } + } +#undef LoadMaybeU + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Gab"; } + + private: + float weights_[9]; +}; + +std::unique_ptr GetGaborishStage(const LoopFilter& lf) { + return jxl::make_unique(lf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetGaborishStage); + +std::unique_ptr GetGaborishStage(const LoopFilter& lf) { + JXL_ASSERT(lf.gab == 1); + return HWY_DYNAMIC_DISPATCH(GetGaborishStage)(lf); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h new file mode 100644 index 0000000000..761800f668 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h @@ -0,0 +1,25 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_ +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Applies decoder-side Gaborish with the given settings. `lf.gab` must be 1. +std::unique_ptr GetGaborishStage(const LoopFilter& lf); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc new file mode 100644 index 0000000000..187095cf61 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc @@ -0,0 +1,311 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_noise.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_noise.cc" +#include +#include + +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::Vec; +using hwy::HWY_NAMESPACE::ZeroIfNegative; + +using D = HWY_CAPPED(float, kBlockDim); +using DI = hwy::HWY_NAMESPACE::Rebind; +using DI8 = hwy::HWY_NAMESPACE::Repartition; + +// [0, max_value] +template +static HWY_INLINE V Clamp0ToMax(D d, const V x, const V max_value) { + const auto clamped = Min(x, max_value); + return ZeroIfNegative(clamped); +} + +// x is in [0+delta, 1+delta], delta ~= 0.06 +template +typename StrengthEval::V NoiseStrength(const StrengthEval& eval, + const typename StrengthEval::V x) { + return Clamp0ToMax(D(), eval(x), Set(D(), 1.0f)); +} + +// TODO(veluca): SIMD-fy. +class StrengthEvalLut { + public: + using V = Vec; + + explicit StrengthEvalLut(const NoiseParams& noise_params) +#if HWY_TARGET == HWY_SCALAR + : noise_params_(noise_params) +#endif + { +#if HWY_TARGET != HWY_SCALAR + uint32_t lut[8]; + memcpy(lut, noise_params.lut, sizeof(lut)); + for (size_t i = 0; i < 8; i++) { + low16_lut[2 * i] = (lut[i] >> 0) & 0xFF; + low16_lut[2 * i + 1] = (lut[i] >> 8) & 0xFF; + high16_lut[2 * i] = (lut[i] >> 16) & 0xFF; + high16_lut[2 * i + 1] = (lut[i] >> 24) & 0xFF; + } +#endif + } + + V operator()(const V vx) const { + constexpr size_t kScale = NoiseParams::kNumNoisePoints - 2; + auto scaled_vx = Max(Zero(D()), Mul(vx, Set(D(), kScale))); + auto floor_x = Floor(scaled_vx); + auto frac_x = Sub(scaled_vx, floor_x); + floor_x = IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), kScale), + floor_x); + frac_x = + IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), 1), frac_x); + auto floor_x_int = ConvertTo(DI(), floor_x); +#if HWY_TARGET == HWY_SCALAR + auto low = Set(D(), noise_params_.lut[floor_x_int.raw]); + auto hi = Set(D(), noise_params_.lut[floor_x_int.raw + 1]); +#else + // Set each lane's bytes to {0, 0, 2x+1, 2x}. + auto floorx_indices_low = + Add(Mul(floor_x_int, Set(DI(), 0x0202)), Set(DI(), 0x0100)); + // Set each lane's bytes to {2x+1, 2x, 0, 0}. + auto floorx_indices_hi = + Add(Mul(floor_x_int, Set(DI(), 0x02020000)), Set(DI(), 0x01000000)); + // load LUT + auto low16 = BitCast(DI(), LoadDup128(DI8(), low16_lut)); + auto lowm = Set(DI(), 0xFFFF); + auto hi16 = BitCast(DI(), LoadDup128(DI8(), high16_lut)); + auto him = Set(DI(), 0xFFFF0000); + // low = noise_params.lut[floor_x] + auto low = + BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm), + And(TableLookupBytes(hi16, floorx_indices_hi), him))); + // hi = noise_params.lut[floor_x+1] + floorx_indices_low = Add(floorx_indices_low, Set(DI(), 0x0202)); + floorx_indices_hi = Add(floorx_indices_hi, Set(DI(), 0x02020000)); + auto hi = + BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm), + And(TableLookupBytes(hi16, floorx_indices_hi), him))); +#endif + return MulAdd(Sub(hi, low), frac_x, low); + } + + private: +#if HWY_TARGET != HWY_SCALAR + // noise_params.lut transformed into two 16-bit lookup tables. + HWY_ALIGN uint8_t high16_lut[16]; + HWY_ALIGN uint8_t low16_lut[16]; +#else + const NoiseParams& noise_params_; +#endif +}; + +template +void AddNoiseToRGB(const D d, const Vec rnd_noise_r, + const Vec rnd_noise_g, const Vec rnd_noise_cor, + const Vec noise_strength_g, const Vec noise_strength_r, + float ytox, float ytob, float* JXL_RESTRICT out_x, + float* JXL_RESTRICT out_y, float* JXL_RESTRICT out_b) { + const auto kRGCorr = Set(d, 0.9921875f); // 127/128 + const auto kRGNCorr = Set(d, 0.0078125f); // 1/128 + + const auto red_noise = + Mul(noise_strength_r, + MulAdd(kRGNCorr, rnd_noise_r, Mul(kRGCorr, rnd_noise_cor))); + const auto green_noise = + Mul(noise_strength_g, + MulAdd(kRGNCorr, rnd_noise_g, Mul(kRGCorr, rnd_noise_cor))); + + auto vx = LoadU(d, out_x); + auto vy = LoadU(d, out_y); + auto vb = LoadU(d, out_b); + + const auto rg_noise = Add(red_noise, green_noise); + vx = Add(MulAdd(Set(d, ytox), rg_noise, Sub(red_noise, green_noise)), vx); + vy = Add(vy, rg_noise); + vb = MulAdd(Set(d, ytob), rg_noise, vb); + + StoreU(vx, d, out_x); + StoreU(vy, d, out_y); + StoreU(vb, d, out_b); +} + +class AddNoiseStage : public RenderPipelineStage { + public: + AddNoiseStage(const NoiseParams& noise_params, + const ColorCorrelationMap& cmap, size_t first_c) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/0)), + noise_params_(noise_params), + cmap_(cmap), + first_c_(first_c) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("Noise apply"); + + if (!noise_params_.HasAny()) return; + const StrengthEvalLut noise_model(noise_params_); + D d; + const auto half = Set(d, 0.5f); + + // With the prior subtract-random Laplacian approximation, rnd_* ranges were + // about [-1.5, 1.6]; Laplacian3 about doubles this to [-3.6, 3.6], so the + // normalizer is half of what it was before (0.5). + const auto norm_const = Set(d, 0.22f); + + float ytox = cmap_.YtoXRatio(0); + float ytob = cmap_.YtoBRatio(0); + + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + + float* JXL_RESTRICT row_x = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row_y = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row_b = GetInputRow(input_rows, 2, 0); + const float* JXL_RESTRICT row_rnd_r = + GetInputRow(input_rows, first_c_ + 0, 0); + const float* JXL_RESTRICT row_rnd_g = + GetInputRow(input_rows, first_c_ + 1, 0); + const float* JXL_RESTRICT row_rnd_c = + GetInputRow(input_rows, first_c_ + 2, 0); + // Needed by the calls to Floor() in StrengthEvalLut. Only arithmetic and + // shuffles are otherwise done on the data, so this is safe. + msan::UnpoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float)); + msan::UnpoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float)); + for (size_t x = 0; x < xsize_v; x += Lanes(d)) { + const auto vx = LoadU(d, row_x + x); + const auto vy = LoadU(d, row_y + x); + const auto in_g = Sub(vy, vx); + const auto in_r = Add(vy, vx); + const auto noise_strength_g = NoiseStrength(noise_model, Mul(in_g, half)); + const auto noise_strength_r = NoiseStrength(noise_model, Mul(in_r, half)); + const auto addit_rnd_noise_red = Mul(LoadU(d, row_rnd_r + x), norm_const); + const auto addit_rnd_noise_green = + Mul(LoadU(d, row_rnd_g + x), norm_const); + const auto addit_rnd_noise_correlated = + Mul(LoadU(d, row_rnd_c + x), norm_const); + AddNoiseToRGB(D(), addit_rnd_noise_red, addit_rnd_noise_green, + addit_rnd_noise_correlated, noise_strength_g, + noise_strength_r, ytox, ytob, row_x + x, row_y + x, + row_b + x); + } + msan::PoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float)); + msan::PoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float)); + msan::PoisonMemory(row_b + xsize, (xsize_v - xsize) * sizeof(float)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c >= first_c_ ? RenderPipelineChannelMode::kInput + : c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "AddNoise"; } + + private: + const NoiseParams& noise_params_; + const ColorCorrelationMap& cmap_; + size_t first_c_; +}; + +std::unique_ptr GetAddNoiseStage( + const NoiseParams& noise_params, const ColorCorrelationMap& cmap, + size_t noise_c_start) { + return jxl::make_unique(noise_params, cmap, noise_c_start); +} + +class ConvolveNoiseStage : public RenderPipelineStage { + public: + explicit ConvolveNoiseStage(size_t first_c) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/0, /*border=*/2)), + first_c_(first_c) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("Noise convolve"); + + const HWY_FULL(float) d; + for (size_t c = first_c_; c < first_c_ + 3; c++) { + float* JXL_RESTRICT rows[5]; + for (size_t i = 0; i < 5; i++) { + rows[i] = GetInputRow(input_rows, c, i - 2); + } + float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0); + for (ssize_t x = -RoundUpTo(xextra, Lanes(d)); + x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + const auto p00 = LoadU(d, rows[2] + x); + auto others = Zero(d); + // TODO(eustas): sum loaded values to reduce the calculation chain + for (ssize_t i = -2; i <= 2; i++) { + others = Add(others, LoadU(d, rows[0] + x + i)); + others = Add(others, LoadU(d, rows[1] + x + i)); + others = Add(others, LoadU(d, rows[3] + x + i)); + others = Add(others, LoadU(d, rows[4] + x + i)); + } + others = Add(others, LoadU(d, rows[2] + x - 2)); + others = Add(others, LoadU(d, rows[2] + x - 1)); + others = Add(others, LoadU(d, rows[2] + x + 1)); + others = Add(others, LoadU(d, rows[2] + x + 2)); + // 4 * (1 - box kernel) + auto pixels = MulAdd(others, Set(d, 0.16), Mul(p00, Set(d, -3.84))); + StoreU(pixels, d, row_out + x); + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c >= first_c_ ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "ConvNoise"; } + + private: + size_t first_c_; +}; + +std::unique_ptr GetConvolveNoiseStage( + size_t noise_c_start) { + return jxl::make_unique(noise_c_start); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetAddNoiseStage); +HWY_EXPORT(GetConvolveNoiseStage); + +std::unique_ptr GetAddNoiseStage( + const NoiseParams& noise_params, const ColorCorrelationMap& cmap, + size_t noise_c_start) { + return HWY_DYNAMIC_DISPATCH(GetAddNoiseStage)(noise_params, cmap, + noise_c_start); +} + +std::unique_ptr GetConvolveNoiseStage( + size_t noise_c_start) { + return HWY_DYNAMIC_DISPATCH(GetConvolveNoiseStage)(noise_c_start); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h new file mode 100644 index 0000000000..bd7797f991 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_ +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/dec_noise.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Adds noise to color channels. +std::unique_ptr GetAddNoiseStage( + const NoiseParams& noise_params, const ColorCorrelationMap& cmap, + size_t noise_c_start); + +// Applies a 5x5 subtract-box-filter convolution to the noise input channels. +std::unique_ptr GetConvolveNoiseStage( + size_t noise_c_start); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc new file mode 100644 index 0000000000..527be03839 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc @@ -0,0 +1,48 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_patches.h" + +namespace jxl { +namespace { +class PatchDictionaryStage : public RenderPipelineStage { + public: + PatchDictionaryStage(const PatchDictionary* patches, size_t num_channels) + : RenderPipelineStage(RenderPipelineStage::Settings()), + patches_(*patches), + num_channels_(num_channels) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("RenderPatches"); + JXL_ASSERT(xpos == 0 || xpos >= xextra); + size_t x0 = xpos ? xpos - xextra : 0; + std::vector row_ptrs(num_channels_); + for (size_t i = 0; i < num_channels_; i++) { + row_ptrs[i] = GetInputRow(input_rows, i, 0) + x0 - xpos; + } + patches_.AddOneRow(row_ptrs.data(), ypos, x0, xsize + xextra + xpos - x0); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < num_channels_ ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Patches"; } + + private: + const PatchDictionary& patches_; + const size_t num_channels_; +}; +} // namespace + +std::unique_ptr GetPatchesStage( + const PatchDictionary* patches, size_t num_channels) { + return jxl::make_unique(patches, num_channels); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h new file mode 100644 index 0000000000..b35abdc2eb --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_ + +#include + +#include "lib/jxl/patch_dictionary_internal.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Draws patches if applicable. +std::unique_ptr GetPatchesStage( + const PatchDictionary* patches, size_t num_channels); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc new file mode 100644 index 0000000000..d97d97e5f2 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc @@ -0,0 +1,63 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_splines.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_splines.cc" +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +class SplineStage : public RenderPipelineStage { + public: + explicit SplineStage(const Splines* splines) + : RenderPipelineStage(RenderPipelineStage::Settings()), + splines_(*splines) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("RenderSplines"); + float* row_x = GetInputRow(input_rows, 0, 0); + float* row_y = GetInputRow(input_rows, 1, 0); + float* row_b = GetInputRow(input_rows, 2, 0); + splines_.AddToRow(row_x, row_y, row_b, Rect(xpos, ypos, xsize, 1)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Splines"; } + + private: + const Splines& splines_; +}; + +std::unique_ptr GetSplineStage(const Splines* splines) { + return jxl::make_unique(splines); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetSplineStage); + +std::unique_ptr GetSplineStage(const Splines* splines) { + return HWY_DYNAMIC_DISPATCH(GetSplineStage)(splines); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h new file mode 100644 index 0000000000..363af393ec --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_ + +#include + +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" +#include "lib/jxl/splines.h" + +namespace jxl { + +// Draws splines if applicable. +std::unique_ptr GetSplineStage(const Splines* splines); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc new file mode 100644 index 0000000000..d4f6152994 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_spot.h" + +namespace jxl { +class SpotColorStage : public RenderPipelineStage { + public: + explicit SpotColorStage(size_t spot_c, const float* spot_color) + : RenderPipelineStage(RenderPipelineStage::Settings()), + spot_c_(spot_c), + spot_color_(spot_color) { + JXL_ASSERT(spot_c_ >= 3); + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + // TODO(veluca): add SIMD. + PROFILER_ZONE("RenderSpotColors"); + float scale = spot_color_[3]; + for (size_t c = 0; c < 3; c++) { + float* JXL_RESTRICT p = GetInputRow(input_rows, c, 0); + const float* JXL_RESTRICT s = GetInputRow(input_rows, spot_c_, 0); + for (ssize_t x = -xextra; x < ssize_t(xsize + xextra); x++) { + float mix = scale * s[x]; + p[x] = mix * spot_color_[c] + (1.0f - mix) * p[x]; + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : c == spot_c_ ? RenderPipelineChannelMode::kInput + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Spot"; } + + private: + size_t spot_c_; + const float* spot_color_; +}; + +std::unique_ptr GetSpotColorStage( + size_t spot_c, const float* spot_color) { + return jxl::make_unique(spot_c, spot_color); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h new file mode 100644 index 0000000000..3e79c75823 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_ + +#include + +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Render the spot color channels. +std::unique_ptr GetSpotColorStage(size_t spot_c, + const float* spot_color); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc new file mode 100644 index 0000000000..9f5b2b73dc --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc @@ -0,0 +1,202 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_to_linear.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_to_linear.cc" +#include +#include + +#include "lib/jxl/dec_tone_mapping-inl.h" +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::IfThenZeroElse; + +template +struct PerChannelOp { + explicit PerChannelOp(Op op) : op(op) {} + template + void Transform(D d, T* r, T* g, T* b) const { + *r = op.Transform(d, *r); + *g = op.Transform(d, *g); + *b = op.Transform(d, *b); + } + + Op op; +}; +template +PerChannelOp MakePerChannelOp(Op&& op) { + return PerChannelOp(std::forward(op)); +} + +struct OpLinear { + template + T Transform(D d, const T& encoded) const { + return encoded; + } +}; + +struct OpRgb { + template + T Transform(D d, const T& encoded) const { + return TF_SRGB().DisplayFromEncoded(encoded); + } +}; + +struct OpPq { + template + T Transform(D d, const T& encoded) const { + return TF_PQ().DisplayFromEncoded(d, encoded); + } +}; + +struct OpHlg { + explicit OpHlg(const float luminances[3], const float intensity_target) + : hlg_ootf_(HlgOOTF::FromSceneLight( + /*display_luminance=*/intensity_target, luminances)) {} + + template + void Transform(D d, T* r, T* g, T* b) const { + for (T* val : {r, g, b}) { + HWY_ALIGN float vals[MaxLanes(d)]; + Store(*val, d, vals); + for (size_t i = 0; i < Lanes(d); ++i) { + vals[i] = TF_HLG().DisplayFromEncoded(vals[i]); + } + *val = Load(d, vals); + } + hlg_ootf_.Apply(r, g, b); + } + HlgOOTF hlg_ootf_; +}; + +struct Op709 { + template + T Transform(D d, const T& encoded) const { + return TF_709().DisplayFromEncoded(d, encoded); + } +}; + +struct OpGamma { + const float gamma; + template + T Transform(D d, const T& encoded) const { + return IfThenZeroElse(Le(encoded, Set(d, 1e-5f)), + FastPowf(d, encoded, Set(d, gamma))); + } +}; + +struct OpInvalid { + template + void Transform(D d, T* r, T* g, T* b) const {} +}; + +template +class ToLinearStage : public RenderPipelineStage { + public: + explicit ToLinearStage(Op op) + : RenderPipelineStage(RenderPipelineStage::Settings()), + op_(std::move(op)) {} + + explicit ToLinearStage() + : RenderPipelineStage(RenderPipelineStage::Settings()), valid_(false) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("ToLinear"); + + const HWY_FULL(float) d; + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + // All calculations are lane-wise, still some might require + // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last + // vector tail. + msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + auto r = LoadU(d, row0 + x); + auto g = LoadU(d, row1 + x); + auto b = LoadU(d, row2 + x); + op_.Transform(d, &r, &g, &b); + StoreU(r, d, row0 + x); + StoreU(g, d, row1 + x); + StoreU(b, d, row2 + x); + } + msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "ToLinear"; } + + private: + Status IsInitialized() const override { return valid_; } + + Op op_; + bool valid_ = true; +}; + +template +std::unique_ptr> MakeToLinearStage(Op&& op) { + return jxl::make_unique>(std::forward(op)); +} + +std::unique_ptr GetToLinearStage( + const OutputEncodingInfo& output_encoding_info) { + if (output_encoding_info.color_encoding.tf.IsLinear()) { + return MakeToLinearStage(MakePerChannelOp(OpLinear())); + } else if (output_encoding_info.color_encoding.tf.IsSRGB()) { + return MakeToLinearStage(MakePerChannelOp(OpRgb())); + } else if (output_encoding_info.color_encoding.tf.IsPQ()) { + return MakeToLinearStage(MakePerChannelOp(OpPq())); + } else if (output_encoding_info.color_encoding.tf.IsHLG()) { + return MakeToLinearStage(OpHlg(output_encoding_info.luminances, + output_encoding_info.orig_intensity_target)); + } else if (output_encoding_info.color_encoding.tf.Is709()) { + return MakeToLinearStage(MakePerChannelOp(Op709())); + } else if (output_encoding_info.color_encoding.tf.IsGamma() || + output_encoding_info.color_encoding.tf.IsDCI()) { + return MakeToLinearStage( + MakePerChannelOp(OpGamma{1.f / output_encoding_info.inverse_gamma})); + } else { + return jxl::make_unique>(); + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetToLinearStage); + +std::unique_ptr GetToLinearStage( + const OutputEncodingInfo& output_encoding_info) { + return HWY_DYNAMIC_DISPATCH(GetToLinearStage)(output_encoding_info); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h new file mode 100644 index 0000000000..ccee7b09f0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_ + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Converts the color channels from `output_encoding_info.color_encoding` to +// linear. +std::unique_ptr GetToLinearStage( + const OutputEncodingInfo& output_encoding_info); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc new file mode 100644 index 0000000000..7609534a5b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc @@ -0,0 +1,151 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_tone_mapping.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_tone_mapping.cc" +#include +#include + +#include "lib/jxl/dec_tone_mapping-inl.h" +#include "lib/jxl/dec_xyb-inl.h" +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +class ToneMappingStage : public RenderPipelineStage { + public: + explicit ToneMappingStage(OutputEncodingInfo output_encoding_info) + : RenderPipelineStage(RenderPipelineStage::Settings()), + output_encoding_info_(std::move(output_encoding_info)) { + if (output_encoding_info_.desired_intensity_target == + output_encoding_info_.orig_intensity_target) { + // No tone mapping requested. + return; + } + if (output_encoding_info_.orig_color_encoding.tf.IsPQ() && + output_encoding_info_.desired_intensity_target < + output_encoding_info_.orig_intensity_target) { + tone_mapper_ = jxl::make_unique( + /*source_range=*/std::pair( + 0, output_encoding_info_.orig_intensity_target), + /*target_range=*/ + std::pair( + 0, output_encoding_info_.desired_intensity_target), + output_encoding_info_.luminances); + } else if (output_encoding_info_.orig_color_encoding.tf.IsHLG() && + !output_encoding_info_.color_encoding.tf.IsHLG()) { + hlg_ootf_ = jxl::make_unique( + /*source_luminance=*/output_encoding_info_.orig_intensity_target, + /*target_luminance=*/output_encoding_info_.desired_intensity_target, + output_encoding_info_.luminances); + } + + if (output_encoding_info_.color_encoding.tf.IsPQ() && + (tone_mapper_ || hlg_ootf_)) { + to_intensity_target_ = + 10000.f / output_encoding_info_.orig_intensity_target; + from_desired_intensity_target_ = + output_encoding_info_.desired_intensity_target / 10000.f; + } + } + + bool IsNeeded() const { return tone_mapper_ || hlg_ootf_; } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("ToneMapping"); + + if (!(tone_mapper_ || hlg_ootf_)) return; + + const HWY_FULL(float) d; + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + // All calculations are lane-wise, still some might require + // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last + // vector tail. + msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + auto r = LoadU(d, row0 + x); + auto g = LoadU(d, row1 + x); + auto b = LoadU(d, row2 + x); + if (tone_mapper_ || hlg_ootf_) { + r = Mul(r, Set(d, to_intensity_target_)); + g = Mul(g, Set(d, to_intensity_target_)); + b = Mul(b, Set(d, to_intensity_target_)); + if (tone_mapper_) { + tone_mapper_->ToneMap(&r, &g, &b); + } else { + JXL_ASSERT(hlg_ootf_); + hlg_ootf_->Apply(&r, &g, &b); + } + if (tone_mapper_ || hlg_ootf_->WarrantsGamutMapping()) { + GamutMap(&r, &g, &b, output_encoding_info_.luminances); + } + r = Mul(r, Set(d, from_desired_intensity_target_)); + g = Mul(g, Set(d, from_desired_intensity_target_)); + b = Mul(b, Set(d, from_desired_intensity_target_)); + } + StoreU(r, d, row0 + x); + StoreU(g, d, row1 + x); + StoreU(b, d, row2 + x); + } + msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "ToneMapping"; } + + private: + using ToneMapper = Rec2408ToneMapper; + OutputEncodingInfo output_encoding_info_; + std::unique_ptr tone_mapper_; + std::unique_ptr hlg_ootf_; + // When the target colorspace is PQ, 1 represents 10000 nits instead of + // orig_intensity_target. This temporarily changes this if the tone mappers + // require it. + float to_intensity_target_ = 1.f; + float from_desired_intensity_target_ = 1.f; +}; + +std::unique_ptr GetToneMappingStage( + const OutputEncodingInfo& output_encoding_info) { + auto stage = jxl::make_unique(output_encoding_info); + if (!stage->IsNeeded()) return nullptr; + return stage; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetToneMappingStage); + +std::unique_ptr GetToneMappingStage( + const OutputEncodingInfo& output_encoding_info) { + return HWY_DYNAMIC_DISPATCH(GetToneMappingStage)(output_encoding_info); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h new file mode 100644 index 0000000000..99824f8511 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_ +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Tone maps the image if appropriate. It must be in linear space and +// `output_encoding_info.luminances` must contain the luminance for the +// primaries of that space. It must also be encoded such that (1, 1, 1) +// represents `output_encoding_info.orig_intensity_target` nits, unless +// `output_encoding_info.color_encoding.tf.IsPQ()`, in which case (1, 1, 1) must +// represent 10000 nits. This corresponds to what XYBStage outputs. After this +// stage, (1, 1, 1) will represent +// `output_encoding_info.desired_intensity_target` nits, except in the PQ +// special case in which it remains 10000. +// +// If no tone mapping is necessary, this will return nullptr. +std::unique_ptr GetToneMappingStage( + const OutputEncodingInfo& output_encoding_info); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc new file mode 100644 index 0000000000..a75e259865 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc @@ -0,0 +1,187 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_upsampling.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_upsampling.cc" +#include +#include + +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/simd_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Clamp; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Min; +using hwy::HWY_NAMESPACE::MulAdd; + +class UpsamplingStage : public RenderPipelineStage { + public: + explicit UpsamplingStage(const CustomTransformData& ups_factors, size_t c, + size_t shift) + : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( + /*shift=*/shift, /*border=*/2)), + c_(c) { + const float* weights = shift == 1 ? ups_factors.upsampling2_weights + : shift == 2 ? ups_factors.upsampling4_weights + : ups_factors.upsampling8_weights; + size_t N = 1 << (shift - 1); + for (size_t i = 0; i < 5 * N; i++) { + for (size_t j = 0; j < 5 * N; j++) { + size_t y = std::min(i, j); + size_t x = std::max(i, j); + kernel_[j / 5][i / 5][j % 5][i % 5] = + weights[5 * N * y - y * (y - 1) / 2 + x - y]; + } + } + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("Upsampling"); + static HWY_FULL(float) df; + size_t shift = settings_.shift_x; + size_t N = 1 << shift; + const size_t xsize_v = RoundUpTo(xsize, Lanes(df)); + for (ssize_t iy = -2; iy <= 2; iy++) { + msan::UnpoisonMemory(GetInputRow(input_rows, c_, iy) + xsize + 2, + sizeof(float) * (xsize_v - xsize)); + } + JXL_ASSERT(xextra == 0); + ssize_t x0 = 0; + ssize_t x1 = xsize; + if (N == 2) { + ProcessRowImpl<2>(input_rows, output_rows, x0, x1); + } + if (N == 4) { + ProcessRowImpl<4>(input_rows, output_rows, x0, x1); + } + if (N == 8) { + ProcessRowImpl<8>(input_rows, output_rows, x0, x1); + } + for (size_t oy = 0; oy < N; oy++) { + float* dst_row = GetOutputRow(output_rows, c_, oy); + msan::PoisonMemory(dst_row + xsize * N, + sizeof(float) * (xsize_v - xsize) * N); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c == c_ ? RenderPipelineChannelMode::kInOut + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "Upsample"; } + + private: + template + JXL_INLINE float Kernel(size_t x, size_t y, ssize_t ix, ssize_t iy) const { + ix += 2; + iy += 2; + if (N == 2) { + return kernel_[0][0][y % 2 ? 4 - iy : iy][x % 2 ? 4 - ix : ix]; + } + if (N == 4) { + return kernel_[y % 4 < 2 ? y % 2 : 1 - y % 2] + [x % 4 < 2 ? x % 2 : 1 - x % 2][y % 4 < 2 ? iy : 4 - iy] + [x % 4 < 2 ? ix : 4 - ix]; + } + if (N == 8) { + return kernel_[y % 8 < 4 ? y % 4 : 3 - y % 4] + [x % 8 < 4 ? x % 4 : 3 - x % 4][y % 8 < 4 ? iy : 4 - iy] + [x % 8 < 4 ? ix : 4 - ix]; + } + JXL_ABORT("Invalid upsample"); + } + + template + void ProcessRowImpl(const RowInfo& input_rows, const RowInfo& output_rows, + ssize_t x0, ssize_t x1) const { + static HWY_FULL(float) df; + using V = hwy::HWY_NAMESPACE::Vec; + V ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7; + (void)ups2, (void)ups3, (void)ups4, (void)ups5, (void)ups6, (void)ups7; + V* ups[N]; + if (N >= 2) { + ups[0] = &ups0; + ups[1] = &ups1; + } + if (N >= 4) { + ups[2] = &ups2; + ups[3] = &ups3; + } + if (N == 8) { + ups[4] = &ups4; + ups[5] = &ups5; + ups[6] = &ups6; + ups[7] = &ups7; + } + for (size_t oy = 0; oy < N; oy++) { + float* dst_row = GetOutputRow(output_rows, c_, oy); + for (ssize_t x = x0; x < x1; x += Lanes(df)) { + for (size_t ox = 0; ox < N; ox++) { + auto result = Zero(df); + auto min = LoadU(df, GetInputRow(input_rows, c_, 0) + x); + auto max = min; + for (ssize_t iy = -2; iy <= 2; iy++) { + for (ssize_t ix = -2; ix <= 2; ix++) { + auto v = LoadU(df, GetInputRow(input_rows, c_, iy) + x + ix); + result = MulAdd(Set(df, Kernel(ox, oy, ix, iy)), v, result); + min = Min(v, min); + max = Max(v, max); + } + } + // Avoid overshooting. + *ups[ox] = Clamp(result, min, max); + } + if (N == 2) { + StoreInterleaved(df, ups0, ups1, dst_row + x * N); + } + if (N == 4) { + StoreInterleaved(df, ups0, ups1, ups2, ups3, dst_row + x * N); + } + if (N == 8) { + StoreInterleaved(df, ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7, + dst_row + x * N); + } + } + } + } + + size_t c_; + float kernel_[4][4][5][5]; +}; + +std::unique_ptr GetUpsamplingStage( + const CustomTransformData& ups_factors, size_t c, size_t shift) { + return jxl::make_unique(ups_factors, c, shift); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetUpsamplingStage); + +std::unique_ptr GetUpsamplingStage( + const CustomTransformData& ups_factors, size_t c, size_t shift) { + JXL_ASSERT(shift != 0); + JXL_ASSERT(shift <= 3); + return HWY_DYNAMIC_DISPATCH(GetUpsamplingStage)(ups_factors, c, shift); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h new file mode 100644 index 0000000000..7d5defd23c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h @@ -0,0 +1,26 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_ +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Upsamples the given channel by the given factor. +std::unique_ptr GetUpsamplingStage( + const CustomTransformData& ups_factors, size_t c, size_t shift); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc new file mode 100644 index 0000000000..902fc33b7e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc @@ -0,0 +1,601 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_write.h" + +#include "lib/jxl/alpha.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/sanitizers.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_write.cc" +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Clamp; +using hwy::HWY_NAMESPACE::Div; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::NearestInt; +using hwy::HWY_NAMESPACE::Or; +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::ShiftLeftSame; +using hwy::HWY_NAMESPACE::ShiftRightSame; + +class WriteToOutputStage : public RenderPipelineStage { + public: + WriteToOutputStage(const ImageOutput& main_output, size_t width, + size_t height, bool has_alpha, bool unpremul_alpha, + size_t alpha_c, Orientation undo_orientation, + const std::vector& extra_output) + : RenderPipelineStage(RenderPipelineStage::Settings()), + width_(width), + height_(height), + main_(main_output), + num_color_(main_.num_channels_ < 3 ? 1 : 3), + want_alpha_(main_.num_channels_ == 2 || main_.num_channels_ == 4), + has_alpha_(has_alpha), + unpremul_alpha_(unpremul_alpha), + alpha_c_(alpha_c), + flip_x_(ShouldFlipX(undo_orientation)), + flip_y_(ShouldFlipY(undo_orientation)), + transpose_(ShouldTranspose(undo_orientation)), + opaque_alpha_(kMaxPixelsPerCall, 1.0f) { + for (size_t ec = 0; ec < extra_output.size(); ++ec) { + if (extra_output[ec].callback.IsPresent() || extra_output[ec].buffer) { + Output extra(extra_output[ec]); + extra.channel_index_ = 3 + ec; + extra_channels_.push_back(extra); + } + } + } + + WriteToOutputStage(const WriteToOutputStage&) = delete; + WriteToOutputStage& operator=(const WriteToOutputStage&) = delete; + WriteToOutputStage(WriteToOutputStage&&) = delete; + WriteToOutputStage& operator=(WriteToOutputStage&&) = delete; + + ~WriteToOutputStage() override { + if (main_.run_opaque_) { + main_.pixel_callback_.destroy(main_.run_opaque_); + } + for (auto& extra : extra_channels_) { + if (extra.run_opaque_) { + extra.pixel_callback_.destroy(extra.run_opaque_); + } + } + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + JXL_DASSERT(xextra == 0); + JXL_DASSERT(main_.run_opaque_ || main_.buffer_); + if (ypos >= height_) return; + if (xpos >= width_) return; + if (flip_y_) { + ypos = height_ - 1u - ypos; + } + size_t limit = std::min(xsize, width_ - xpos); + for (size_t x0 = 0; x0 < limit; x0 += kMaxPixelsPerCall) { + size_t xstart = xpos + x0; + size_t len = std::min(kMaxPixelsPerCall, limit - x0); + + const float* line_buffers[4]; + for (size_t c = 0; c < num_color_; c++) { + line_buffers[c] = GetInputRow(input_rows, c, 0) + x0; + } + if (has_alpha_) { + line_buffers[num_color_] = GetInputRow(input_rows, alpha_c_, 0) + x0; + } else { + // opaque_alpha_ is a way to set all values to 1.0f. + line_buffers[num_color_] = opaque_alpha_.data(); + } + if (has_alpha_ && want_alpha_ && unpremul_alpha_) { + UnpremulAlpha(thread_id, len, line_buffers); + } + OutputBuffers(main_, thread_id, ypos, xstart, len, line_buffers); + for (const auto& extra : extra_channels_) { + line_buffers[0] = GetInputRow(input_rows, extra.channel_index_, 0) + x0; + OutputBuffers(extra, thread_id, ypos, xstart, len, line_buffers); + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + if (c < num_color_ || (has_alpha_ && c == alpha_c_)) { + return RenderPipelineChannelMode::kInput; + } + for (const auto& extra : extra_channels_) { + if (c == extra.channel_index_) { + return RenderPipelineChannelMode::kInput; + } + } + return RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "WritePixelCB"; } + + private: + struct Output { + Output(const ImageOutput& image_out) + : pixel_callback_(image_out.callback), + buffer_(image_out.buffer), + buffer_size_(image_out.buffer_size), + stride_(image_out.stride), + num_channels_(image_out.format.num_channels), + swap_endianness_(SwapEndianness(image_out.format.endianness)), + data_type_(image_out.format.data_type), + bits_per_sample_(image_out.bits_per_sample) {} + + Status PrepareForThreads(size_t num_threads) { + if (pixel_callback_.IsPresent()) { + run_opaque_ = + pixel_callback_.Init(num_threads, /*num_pixels=*/kMaxPixelsPerCall); + JXL_RETURN_IF_ERROR(run_opaque_ != nullptr); + } else { + JXL_RETURN_IF_ERROR(buffer_ != nullptr); + } + return true; + } + + PixelCallback pixel_callback_; + void* run_opaque_ = nullptr; + void* buffer_ = nullptr; + size_t buffer_size_; + size_t stride_; + size_t num_channels_; + bool swap_endianness_; + JxlDataType data_type_; + size_t bits_per_sample_; + size_t channel_index_; // used for extra_channels + }; + + Status PrepareForThreads(size_t num_threads) override { + JXL_RETURN_IF_ERROR(main_.PrepareForThreads(num_threads)); + for (auto& extra : extra_channels_) { + JXL_RETURN_IF_ERROR(extra.PrepareForThreads(num_threads)); + } + temp_out_.resize(num_threads); + for (CacheAlignedUniquePtr& temp : temp_out_) { + temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall * + main_.num_channels_); + } + if ((has_alpha_ && want_alpha_ && unpremul_alpha_) || flip_x_) { + temp_in_.resize(num_threads * main_.num_channels_); + for (CacheAlignedUniquePtr& temp : temp_in_) { + temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall); + } + } + return true; + } + static bool ShouldFlipX(Orientation undo_orientation) { + return (undo_orientation == Orientation::kFlipHorizontal || + undo_orientation == Orientation::kRotate180 || + undo_orientation == Orientation::kRotate270 || + undo_orientation == Orientation::kAntiTranspose); + } + static bool ShouldFlipY(Orientation undo_orientation) { + return (undo_orientation == Orientation::kFlipVertical || + undo_orientation == Orientation::kRotate180 || + undo_orientation == Orientation::kRotate90 || + undo_orientation == Orientation::kAntiTranspose); + } + static bool ShouldTranspose(Orientation undo_orientation) { + return (undo_orientation == Orientation::kTranspose || + undo_orientation == Orientation::kRotate90 || + undo_orientation == Orientation::kRotate270 || + undo_orientation == Orientation::kAntiTranspose); + } + + void UnpremulAlpha(size_t thread_id, size_t len, + const float** line_buffers) const { + const HWY_FULL(float) d; + auto one = Set(d, 1.0f); + float* temp_in[4]; + for (size_t c = 0; c < main_.num_channels_; ++c) { + size_t tix = thread_id * main_.num_channels_ + c; + temp_in[c] = reinterpret_cast(temp_in_[tix].get()); + memcpy(temp_in[c], line_buffers[c], sizeof(float) * len); + } + auto small_alpha = Set(d, kSmallAlpha); + for (size_t ix = 0; ix < len; ix += Lanes(d)) { + auto alpha = LoadU(d, temp_in[num_color_] + ix); + auto mul = Div(one, Max(small_alpha, alpha)); + for (size_t c = 0; c < num_color_; ++c) { + auto val = LoadU(d, temp_in[c] + ix); + StoreU(Mul(val, mul), d, temp_in[c] + ix); + } + } + for (size_t c = 0; c < main_.num_channels_; ++c) { + line_buffers[c] = temp_in[c]; + } + } + + void OutputBuffers(const Output& out, size_t thread_id, size_t ypos, + size_t xstart, size_t len, const float* input[4]) const { + if (flip_x_) { + FlipX(out, thread_id, len, &xstart, input); + } + if (out.data_type_ == JXL_TYPE_UINT8) { + uint8_t* JXL_RESTRICT temp = + reinterpret_cast(temp_out_[thread_id].get()); + StoreUnsignedRow(out, input, len, temp); + WriteToOutput(out, thread_id, ypos, xstart, len, temp); + } else if (out.data_type_ == JXL_TYPE_UINT16 || + out.data_type_ == JXL_TYPE_FLOAT16) { + uint16_t* JXL_RESTRICT temp = + reinterpret_cast(temp_out_[thread_id].get()); + if (out.data_type_ == JXL_TYPE_UINT16) { + StoreUnsignedRow(out, input, len, temp); + } else { + StoreFloat16Row(out, input, len, temp); + } + if (out.swap_endianness_) { + const HWY_FULL(uint16_t) du; + size_t output_len = len * out.num_channels_; + for (size_t j = 0; j < output_len; j += Lanes(du)) { + auto v = LoadU(du, temp + j); + auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8)); + StoreU(vswap, du, temp + j); + } + } + WriteToOutput(out, thread_id, ypos, xstart, len, temp); + } else if (out.data_type_ == JXL_TYPE_FLOAT) { + float* JXL_RESTRICT temp = + reinterpret_cast(temp_out_[thread_id].get()); + StoreFloatRow(out, input, len, temp); + if (out.swap_endianness_) { + size_t output_len = len * out.num_channels_; + for (size_t j = 0; j < output_len; ++j) { + temp[j] = BSwapFloat(temp[j]); + } + } + WriteToOutput(out, thread_id, ypos, xstart, len, temp); + } + } + + void FlipX(const Output& out, size_t thread_id, size_t len, size_t* xstart, + const float** line_buffers) const { + float* temp_in[4]; + for (size_t c = 0; c < out.num_channels_; ++c) { + size_t tix = thread_id * main_.num_channels_ + c; + temp_in[c] = reinterpret_cast(temp_in_[tix].get()); + if (temp_in[c] != line_buffers[c]) { + memcpy(temp_in[c], line_buffers[c], sizeof(float) * len); + } + } + size_t last = (len - 1u); + size_t num = (len / 2); + for (size_t i = 0; i < num; ++i) { + for (size_t c = 0; c < out.num_channels_; ++c) { + std::swap(temp_in[c][i], temp_in[c][last - i]); + } + } + for (size_t c = 0; c < out.num_channels_; ++c) { + line_buffers[c] = temp_in[c]; + } + *xstart = width_ - *xstart - len; + } + + template + void StoreUnsignedRow(const Output& out, const float* input[4], size_t len, + T* output) const { + const HWY_FULL(float) d; + auto zero = Zero(d); + auto one = Set(d, 1.0f); + auto mul = Set(d, (1u << (out.bits_per_sample_)) - 1); + const Rebind du; + const size_t padding = RoundUpTo(len, Lanes(d)) - len; + for (size_t c = 0; c < out.num_channels_; ++c) { + msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding); + } + if (out.num_channels_ == 1) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul); + StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]); + } + } else if (out.num_channels_ == 2) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul); + auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul); + StoreInterleaved2(DemoteTo(du, NearestInt(v0)), + DemoteTo(du, NearestInt(v1)), du, &output[2 * i]); + } + } else if (out.num_channels_ == 3) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul); + auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul); + auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul); + StoreInterleaved3(DemoteTo(du, NearestInt(v0)), + DemoteTo(du, NearestInt(v1)), + DemoteTo(du, NearestInt(v2)), du, &output[3 * i]); + } + } else if (out.num_channels_ == 4) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul); + auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul); + auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul); + auto v3 = Mul(Clamp(zero, LoadU(d, &input[3][i]), one), mul); + StoreInterleaved4(DemoteTo(du, NearestInt(v0)), + DemoteTo(du, NearestInt(v1)), + DemoteTo(du, NearestInt(v2)), + DemoteTo(du, NearestInt(v3)), du, &output[4 * i]); + } + } + msan::PoisonMemory(output + out.num_channels_ * len, + sizeof(output[0]) * out.num_channels_ * padding); + } + + void StoreFloat16Row(const Output& out, const float* input[4], size_t len, + uint16_t* output) const { + const HWY_FULL(float) d; + const Rebind du; + const Rebind df16; + const size_t padding = RoundUpTo(len, Lanes(d)) - len; + for (size_t c = 0; c < out.num_channels_; ++c) { + msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding); + } + if (out.num_channels_ == 1) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = LoadU(d, &input[0][i]); + StoreU(BitCast(du, DemoteTo(df16, v0)), du, &output[i]); + } + } else if (out.num_channels_ == 2) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = LoadU(d, &input[0][i]); + auto v1 = LoadU(d, &input[1][i]); + StoreInterleaved2(BitCast(du, DemoteTo(df16, v0)), + BitCast(du, DemoteTo(df16, v1)), du, &output[2 * i]); + } + } else if (out.num_channels_ == 3) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = LoadU(d, &input[0][i]); + auto v1 = LoadU(d, &input[1][i]); + auto v2 = LoadU(d, &input[2][i]); + StoreInterleaved3(BitCast(du, DemoteTo(df16, v0)), + BitCast(du, DemoteTo(df16, v1)), + BitCast(du, DemoteTo(df16, v2)), du, &output[3 * i]); + } + } else if (out.num_channels_ == 4) { + for (size_t i = 0; i < len; i += Lanes(d)) { + auto v0 = LoadU(d, &input[0][i]); + auto v1 = LoadU(d, &input[1][i]); + auto v2 = LoadU(d, &input[2][i]); + auto v3 = LoadU(d, &input[3][i]); + StoreInterleaved4(BitCast(du, DemoteTo(df16, v0)), + BitCast(du, DemoteTo(df16, v1)), + BitCast(du, DemoteTo(df16, v2)), + BitCast(du, DemoteTo(df16, v3)), du, &output[4 * i]); + } + } + msan::PoisonMemory(output + out.num_channels_ * len, + sizeof(output[0]) * out.num_channels_ * padding); + } + + void StoreFloatRow(const Output& out, const float* input[4], size_t len, + float* output) const { + const HWY_FULL(float) d; + if (out.num_channels_ == 1) { + memcpy(output, input[0], len * sizeof(output[0])); + } else if (out.num_channels_ == 2) { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved2(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), d, + &output[2 * i]); + } + } else if (out.num_channels_ == 3) { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved3(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), + LoadU(d, &input[2][i]), d, &output[3 * i]); + } + } else { + for (size_t i = 0; i < len; i += Lanes(d)) { + StoreInterleaved4(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), + LoadU(d, &input[2][i]), LoadU(d, &input[3][i]), d, + &output[4 * i]); + } + } + } + + template + void WriteToOutput(const Output& out, size_t thread_id, size_t ypos, + size_t xstart, size_t len, T* output) const { + if (transpose_) { + // TODO(szabadka) Buffer 8x8 chunks and transpose with SIMD. + if (out.run_opaque_) { + for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) { + out.pixel_callback_.run(out.run_opaque_, thread_id, ypos, xstart + i, + 1, output + j); + } + } else { + const size_t pixel_stride = out.num_channels_ * sizeof(T); + const size_t offset = xstart * out.stride_ + ypos * pixel_stride; + for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) { + const size_t ix = offset + i * out.stride_; + JXL_DASSERT(ix + pixel_stride <= out.buffer_size_); + memcpy(reinterpret_cast(out.buffer_) + ix, output + j, + pixel_stride); + } + } + } else { + if (out.run_opaque_) { + out.pixel_callback_.run(out.run_opaque_, thread_id, xstart, ypos, len, + output); + } else { + const size_t pixel_stride = out.num_channels_ * sizeof(T); + const size_t offset = ypos * out.stride_ + xstart * pixel_stride; + JXL_DASSERT(offset + len * pixel_stride <= out.buffer_size_); + memcpy(reinterpret_cast(out.buffer_) + offset, output, + len * pixel_stride); + } + } + } + + static constexpr size_t kMaxPixelsPerCall = 1024; + size_t width_; + size_t height_; + Output main_; // color + alpha + size_t num_color_; + bool want_alpha_; + bool has_alpha_; + bool unpremul_alpha_; + size_t alpha_c_; + bool flip_x_; + bool flip_y_; + bool transpose_; + std::vector extra_channels_; + std::vector opaque_alpha_; + std::vector temp_in_; + std::vector temp_out_; +}; + +constexpr size_t WriteToOutputStage::kMaxPixelsPerCall; + +std::unique_ptr GetWriteToOutputStage( + const ImageOutput& main_output, size_t width, size_t height, bool has_alpha, + bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation, + std::vector& extra_output) { + return jxl::make_unique( + main_output, width, height, has_alpha, unpremul_alpha, alpha_c, + undo_orientation, extra_output); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace jxl { + +HWY_EXPORT(GetWriteToOutputStage); + +namespace { +class WriteToImageBundleStage : public RenderPipelineStage { + public: + explicit WriteToImageBundleStage(ImageBundle* image_bundle, + ColorEncoding color_encoding) + : RenderPipelineStage(RenderPipelineStage::Settings()), + image_bundle_(image_bundle), + color_encoding_(std::move(color_encoding)) {} + + void SetInputSizes( + const std::vector>& input_sizes) override { +#if JXL_ENABLE_ASSERT + JXL_ASSERT(input_sizes.size() >= 3); + for (size_t c = 1; c < input_sizes.size(); c++) { + JXL_ASSERT(input_sizes[c].first == input_sizes[0].first); + JXL_ASSERT(input_sizes[c].second == input_sizes[0].second); + } +#endif + // TODO(eustas): what should we do in the case of "want only ECs"? + image_bundle_->SetFromImage( + Image3F(input_sizes[0].first, input_sizes[0].second), color_encoding_); + // TODO(veluca): consider not reallocating ECs if not needed. + image_bundle_->extra_channels().clear(); + for (size_t c = 3; c < input_sizes.size(); c++) { + image_bundle_->extra_channels().emplace_back(input_sizes[c].first, + input_sizes[c].second); + } + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + for (size_t c = 0; c < 3; c++) { + memcpy(image_bundle_->color()->PlaneRow(c, ypos) + xpos - xextra, + GetInputRow(input_rows, c, 0) - xextra, + sizeof(float) * (xsize + 2 * xextra)); + } + for (size_t ec = 0; ec < image_bundle_->extra_channels().size(); ec++) { + JXL_ASSERT(image_bundle_->extra_channels()[ec].xsize() >= + xpos + xsize + xextra); + memcpy(image_bundle_->extra_channels()[ec].Row(ypos) + xpos - xextra, + GetInputRow(input_rows, 3 + ec, 0) - xextra, + sizeof(float) * (xsize + 2 * xextra)); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return RenderPipelineChannelMode::kInput; + } + + const char* GetName() const override { return "WriteIB"; } + + private: + ImageBundle* image_bundle_; + ColorEncoding color_encoding_; +}; + +class WriteToImage3FStage : public RenderPipelineStage { + public: + explicit WriteToImage3FStage(Image3F* image) + : RenderPipelineStage(RenderPipelineStage::Settings()), image_(image) {} + + void SetInputSizes( + const std::vector>& input_sizes) override { +#if JXL_ENABLE_ASSERT + JXL_ASSERT(input_sizes.size() >= 3); + for (size_t c = 1; c < 3; ++c) { + JXL_ASSERT(input_sizes[c].first == input_sizes[0].first); + JXL_ASSERT(input_sizes[c].second == input_sizes[0].second); + } +#endif + *image_ = Image3F(input_sizes[0].first, input_sizes[0].second); + } + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + for (size_t c = 0; c < 3; c++) { + memcpy(image_->PlaneRow(c, ypos) + xpos - xextra, + GetInputRow(input_rows, c, 0) - xextra, + sizeof(float) * (xsize + 2 * xextra)); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInput + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "WriteI3F"; } + + private: + Image3F* image_; +}; + +} // namespace + +std::unique_ptr GetWriteToImageBundleStage( + ImageBundle* image_bundle, ColorEncoding color_encoding) { + return jxl::make_unique(image_bundle, + std::move(color_encoding)); +} + +std::unique_ptr GetWriteToImage3FStage(Image3F* image) { + return jxl::make_unique(image); +} + +std::unique_ptr GetWriteToOutputStage( + const ImageOutput& main_output, size_t width, size_t height, bool has_alpha, + bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation, + std::vector& extra_output) { + return HWY_DYNAMIC_DISPATCH(GetWriteToOutputStage)( + main_output, width, height, has_alpha, unpremul_alpha, alpha_c, + undo_orientation, extra_output); +} + +} // namespace jxl + +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h new file mode 100644 index 0000000000..c5f844ebe8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_ + +#include + +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +std::unique_ptr GetWriteToImageBundleStage( + ImageBundle* image_bundle, ColorEncoding color_encoding); + +// Gets a stage to write color channels to an Image3F. +std::unique_ptr GetWriteToImage3FStage(Image3F* image); + +// Gets a stage to write to a pixel callback or image buffer. +std::unique_ptr GetWriteToOutputStage( + const ImageOutput& main_output, size_t width, size_t height, bool has_alpha, + bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation, + std::vector& extra_output); + +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc new file mode 100644 index 0000000000..15cfc75b18 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc @@ -0,0 +1,176 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_xyb.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_xyb.cc" +#include +#include + +#include "lib/jxl/dec_xyb-inl.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/sanitizers.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +class XYBStage : public RenderPipelineStage { + public: + explicit XYBStage(const OutputEncodingInfo& output_encoding_info) + : RenderPipelineStage(RenderPipelineStage::Settings()), + opsin_params_(output_encoding_info.opsin_params), + output_is_xyb_(output_encoding_info.color_encoding.GetColorSpace() == + ColorSpace::kXYB) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("UndoXYB"); + + const HWY_FULL(float) d; + JXL_ASSERT(xextra == 0); + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + // All calculations are lane-wise, still some might require + // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last + // vector tail. + msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + // TODO(eustas): when using frame origin, addresses might be unaligned; + // making them aligned will void performance penalty. + if (output_is_xyb_) { + const auto scale_x = Set(d, kScaledXYBScale[0]); + const auto scale_y = Set(d, kScaledXYBScale[1]); + const auto scale_bmy = Set(d, kScaledXYBScale[2]); + const auto offset_x = Set(d, kScaledXYBOffset[0]); + const auto offset_y = Set(d, kScaledXYBOffset[1]); + const auto offset_bmy = Set(d, kScaledXYBOffset[2]); + for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + const auto in_x = LoadU(d, row0 + x); + const auto in_y = LoadU(d, row1 + x); + const auto in_b = LoadU(d, row2 + x); + auto out_x = Mul(Add(in_x, offset_x), scale_x); + auto out_y = Mul(Add(in_y, offset_y), scale_y); + auto out_b = Mul(Add(Sub(in_b, in_y), offset_bmy), scale_bmy); + StoreU(out_x, d, row0 + x); + StoreU(out_y, d, row1 + x); + StoreU(out_b, d, row2 + x); + } + } else { + for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) { + const auto in_opsin_x = LoadU(d, row0 + x); + const auto in_opsin_y = LoadU(d, row1 + x); + const auto in_opsin_b = LoadU(d, row2 + x); + auto r = Undefined(d); + auto g = Undefined(d); + auto b = Undefined(d); + XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params_, &r, &g, + &b); + StoreU(r, d, row0 + x); + StoreU(g, d, row1 + x); + StoreU(b, d, row2 + x); + } + } + msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "XYB"; } + + private: + const OpsinParams opsin_params_; + const bool output_is_xyb_; +}; + +std::unique_ptr GetXYBStage( + const OutputEncodingInfo& output_encoding_info) { + return jxl::make_unique(output_encoding_info); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetXYBStage); + +std::unique_ptr GetXYBStage( + const OutputEncodingInfo& output_encoding_info) { + return HWY_DYNAMIC_DISPATCH(GetXYBStage)(output_encoding_info); +} + +namespace { +class FastXYBStage : public RenderPipelineStage { + public: + FastXYBStage(uint8_t* rgb, size_t stride, size_t width, size_t height, + bool rgba, bool has_alpha, size_t alpha_c) + : RenderPipelineStage(RenderPipelineStage::Settings()), + rgb_(rgb), + stride_(stride), + width_(width), + height_(height), + rgba_(rgba), + has_alpha_(has_alpha), + alpha_c_(alpha_c) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + if (ypos >= height_) return; + JXL_ASSERT(xextra == 0); + const float* xyba[4] = { + GetInputRow(input_rows, 0, 0), GetInputRow(input_rows, 1, 0), + GetInputRow(input_rows, 2, 0), + has_alpha_ ? GetInputRow(input_rows, alpha_c_, 0) : nullptr}; + uint8_t* out_buf = rgb_ + stride_ * ypos + (rgba_ ? 4 : 3) * xpos; + FastXYBTosRGB8(xyba, out_buf, rgba_, + xsize + xpos <= width_ ? xsize : width_ - xpos); + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 || (has_alpha_ && c == alpha_c_) + ? RenderPipelineChannelMode::kInput + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "FastXYB"; } + + private: + uint8_t* rgb_; + size_t stride_; + size_t width_; + size_t height_; + bool rgba_; + bool has_alpha_; + size_t alpha_c_; + std::vector opaque_alpha_; +}; + +} // namespace + +std::unique_ptr GetFastXYBTosRGB8Stage( + uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba, + bool has_alpha, size_t alpha_c) { + JXL_ASSERT(HasFastXYBTosRGB8()); + return make_unique(rgb, stride, width, height, rgba, has_alpha, + alpha_c); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h new file mode 100644 index 0000000000..7b06345c36 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h @@ -0,0 +1,26 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_ +#include + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Converts the color channels from XYB to linear with appropriate primaries. +std::unique_ptr GetXYBStage( + const OutputEncodingInfo& output_encoding_info); + +// Gets a stage to convert with fixed point arithmetic from XYB to sRGB8 and +// write to a uint8 buffer. +std::unique_ptr GetFastXYBTosRGB8Stage( + uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba, + bool has_alpha, size_t alpha_c); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc new file mode 100644 index 0000000000..5cba4a7d41 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc @@ -0,0 +1,85 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/render_pipeline/stage_ycbcr.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_ycbcr.cc" +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::MulAdd; + +class kYCbCrStage : public RenderPipelineStage { + public: + kYCbCrStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + PROFILER_ZONE("UndoYCbCr"); + + const HWY_FULL(float) df; + + // Full-range BT.601 as defined by JFIF Clause 7: + // https://www.itu.int/rec/T-REC-T.871-201105-I/en + const auto c128 = Set(df, 128.0f / 255); + const auto crcr = Set(df, 1.402f); + const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f); + const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f); + const auto cbcb = Set(df, 1.772f); + + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + // TODO(eustas): when using frame origin, addresses might be unaligned; + // making them aligned will void performance penalty. + for (size_t x = 0; x < xsize; x += Lanes(df)) { + const auto y_vec = Add(LoadU(df, row1 + x), c128); + const auto cb_vec = LoadU(df, row0 + x); + const auto cr_vec = LoadU(df, row2 + x); + const auto r_vec = MulAdd(crcr, cr_vec, y_vec); + const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec)); + const auto b_vec = MulAdd(cbcb, cb_vec, y_vec); + StoreU(r_vec, df, row0 + x); + StoreU(g_vec, df, row1 + x); + StoreU(b_vec, df, row2 + x); + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return c < 3 ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; + } + + const char* GetName() const override { return "YCbCr"; } +}; + +std::unique_ptr GetYCbCrStage() { + return jxl::make_unique(); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(GetYCbCrStage); + +std::unique_ptr GetYCbCrStage() { + return HWY_DYNAMIC_DISPATCH(GetYCbCrStage)(); +} + +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h new file mode 100644 index 0000000000..9320c9723f --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h @@ -0,0 +1,25 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_ +#define LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_ +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +// Converts the color channels from YCbCr to RGB. +std::unique_ptr GetYCbCrStage(); +} // namespace jxl + +#endif // LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h new file mode 100644 index 0000000000..789a52f8b2 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h @@ -0,0 +1,101 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/render_pipeline/render_pipeline_stage.h" + +namespace jxl { + +class UpsampleXSlowStage : public RenderPipelineStage { + public: + UpsampleXSlowStage() + : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX(1, 1)) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + for (size_t c = 0; c < input_rows.size(); c++) { + const float* row = GetInputRow(input_rows, c, 0); + float* row_out = GetOutputRow(output_rows, c, 0); + for (int64_t x = -xextra; x < (int64_t)(xsize + xextra); x++) { + float xp = *(row + x - 1); + float xc = *(row + x); + float xn = *(row + x + 1); + float xout0 = xp * 0.25f + xc * 0.75f; + float xout1 = xc * 0.75f + xn * 0.25f; + *(row_out + 2 * x + 0) = xout0; + *(row_out + 2 * x + 1) = xout1; + } + } + } + + const char* GetName() const override { return "TEST::UpsampleXSlowStage"; } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return RenderPipelineChannelMode::kInOut; + } +}; + +class UpsampleYSlowStage : public RenderPipelineStage { + public: + UpsampleYSlowStage() + : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY(1, 1)) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + for (size_t c = 0; c < input_rows.size(); c++) { + const float* rowp = GetInputRow(input_rows, c, -1); + const float* rowc = GetInputRow(input_rows, c, 0); + const float* rown = GetInputRow(input_rows, c, 1); + float* row_out0 = GetOutputRow(output_rows, c, 0); + float* row_out1 = GetOutputRow(output_rows, c, 1); + for (int64_t x = -xextra; x < (int64_t)(xsize + xextra); x++) { + float xp = *(rowp + x); + float xc = *(rowc + x); + float xn = *(rown + x); + float yout0 = xp * 0.25f + xc * 0.75f; + float yout1 = xc * 0.75f + xn * 0.25f; + *(row_out0 + x) = yout0; + *(row_out1 + x) = yout1; + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return RenderPipelineChannelMode::kInOut; + } + + const char* GetName() const override { return "TEST::UpsampleYSlowStage"; } +}; + +class Check0FinalStage : public RenderPipelineStage { + public: + Check0FinalStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {} + + void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, + size_t xextra, size_t xsize, size_t xpos, size_t ypos, + size_t thread_id) const final { + for (size_t c = 0; c < input_rows.size(); c++) { + for (size_t x = 0; x < xsize; x++) { + JXL_CHECK(fabsf(GetInputRow(input_rows, c, 0)[x]) < 1e-8); + } + } + } + + RenderPipelineChannelMode GetChannelMode(size_t c) const final { + return RenderPipelineChannelMode::kInput; + } + const char* GetName() const override { return "TEST::Check0FinalStage"; } +}; + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/roundtrip_test.cc b/third_party/jpeg-xl/lib/jxl/roundtrip_test.cc new file mode 100644 index 0000000000..f1529b500c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/roundtrip_test.cc @@ -0,0 +1,839 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include +#include +#include + +#include // std::abs +#include +#include +#include +#include + +#include "lib/extras/codec.h" +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_comparator.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/encode_internal.h" +#include "lib/jxl/icc_codec.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace { + +// Converts a test image to a CodecInOut. +// icc_profile can be empty to automatically deduce profile from the pixel +// format, or filled in to force this ICC profile +jxl::CodecInOut ConvertTestImage(const std::vector& buf, + const size_t xsize, const size_t ysize, + const JxlPixelFormat& pixel_format, + const jxl::PaddedBytes& icc_profile) { + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + + bool is_gray = pixel_format.num_channels < 3; + bool has_alpha = + pixel_format.num_channels == 2 || pixel_format.num_channels == 4; + + io.metadata.m.color_encoding.SetColorSpace(is_gray ? jxl::ColorSpace::kGray + : jxl::ColorSpace::kRGB); + if (has_alpha) { + // Note: alpha > 16 not yet supported by the C++ codec + switch (pixel_format.data_type) { + case JXL_TYPE_UINT8: + io.metadata.m.SetAlphaBits(8); + break; + case JXL_TYPE_UINT16: + case JXL_TYPE_FLOAT: + case JXL_TYPE_FLOAT16: + io.metadata.m.SetAlphaBits(16); + break; + default: + ADD_FAILURE() << "Roundtrip tests for data type " + << pixel_format.data_type << " not yet implemented."; + } + } + size_t bitdepth = 0; + switch (pixel_format.data_type) { + case JXL_TYPE_FLOAT: + bitdepth = 32; + io.metadata.m.SetFloat32Samples(); + break; + case JXL_TYPE_FLOAT16: + bitdepth = 16; + io.metadata.m.SetFloat16Samples(); + break; + case JXL_TYPE_UINT8: + bitdepth = 8; + io.metadata.m.SetUintSamples(8); + break; + case JXL_TYPE_UINT16: + bitdepth = 16; + io.metadata.m.SetUintSamples(16); + break; + default: + ADD_FAILURE() << "Roundtrip tests for data type " + << pixel_format.data_type << " not yet implemented."; + } + jxl::ColorEncoding color_encoding; + if (!icc_profile.empty()) { + jxl::PaddedBytes icc_profile_copy(icc_profile); + EXPECT_TRUE(color_encoding.SetICC(std::move(icc_profile_copy))); + } else if (pixel_format.data_type == JXL_TYPE_FLOAT) { + color_encoding = jxl::ColorEncoding::LinearSRGB(is_gray); + } else { + color_encoding = jxl::ColorEncoding::SRGB(is_gray); + } + EXPECT_TRUE( + ConvertFromExternal(jxl::Span(buf.data(), buf.size()), + xsize, ysize, color_encoding, + /*bits_per_sample=*/bitdepth, pixel_format, + /*pool=*/nullptr, &io.Main())); + return io; +} + +template +T ConvertTestPixel(float val); + +template <> +float ConvertTestPixel(const float val) { + return val; +} + +template <> +uint16_t ConvertTestPixel(const float val) { + return (uint16_t)(val * UINT16_MAX); +} + +template <> +uint8_t ConvertTestPixel(const float val) { + return (uint8_t)(val * UINT8_MAX); +} + +// Returns a test image. +template +std::vector GetTestImage(const size_t xsize, const size_t ysize, + const JxlPixelFormat& pixel_format) { + std::vector pixels(xsize * ysize * pixel_format.num_channels); + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + for (size_t chan = 0; chan < pixel_format.num_channels; chan++) { + float val; + switch (chan % 4) { + case 0: + val = static_cast(y) / static_cast(ysize); + break; + case 1: + val = static_cast(x) / static_cast(xsize); + break; + case 2: + val = static_cast(x + y) / static_cast(xsize + ysize); + break; + case 3: + val = static_cast(x * y) / static_cast(xsize * ysize); + break; + } + pixels[(y * xsize + x) * pixel_format.num_channels + chan] = + ConvertTestPixel(val); + } + } + } + std::vector bytes(pixels.size() * sizeof(T)); + memcpy(bytes.data(), pixels.data(), sizeof(T) * pixels.size()); + return bytes; +} + +void EncodeWithEncoder(JxlEncoder* enc, std::vector* compressed) { + compressed->resize(64); + uint8_t* next_out = compressed->data(); + size_t avail_out = compressed->size() - (next_out - compressed->data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed->data(); + compressed->resize(compressed->size() * 2); + next_out = compressed->data() + offset; + avail_out = compressed->size() - offset; + } + } + compressed->resize(next_out - compressed->data()); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); +} + +// Generates some pixels using some dimensions and pixel_format, +// compresses them, and verifies that the decoded version is similar to the +// original pixels. +// TODO(firsching): change this to be a parameterized test, like in +// decode_test.cc +template +void VerifyRoundtripCompression( + const size_t xsize, const size_t ysize, + const JxlPixelFormat& input_pixel_format, + const JxlPixelFormat& output_pixel_format, const bool lossless, + const bool use_container, const uint32_t resampling = 1, + const bool already_downsampled = false, + const std::vector>& + extra_channels = {}) { + size_t orig_xsize = xsize; + size_t orig_ysize = ysize; + if (already_downsampled) { + orig_xsize = jxl::DivCeil(xsize, resampling); + orig_ysize = jxl::DivCeil(ysize, resampling); + } + + JxlPixelFormat extra_channel_pixel_format = input_pixel_format; + extra_channel_pixel_format.num_channels = 1; + const std::vector extra_channel_bytes = + GetTestImage(xsize, ysize, extra_channel_pixel_format); + const std::vector original_bytes = + GetTestImage(orig_xsize, orig_ysize, input_pixel_format); + jxl::CodecInOut original_io = ConvertTestImage( + original_bytes, orig_xsize, orig_ysize, input_pixel_format, {}); + + JxlEncoder* enc = JxlEncoderCreate(nullptr); + EXPECT_NE(nullptr, enc); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc, use_container)); + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &input_pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = lossless; + uint32_t num_channels = input_pixel_format.num_channels; + size_t has_interleaved_alpha = num_channels == 2 || num_channels == 4; + JxlPixelFormat output_pixel_format_with_extra_channel_alpha = + output_pixel_format; + + // In the case where we have an alpha channel, but it is provided as an extra + // channel and not interleaved, we do two things here: + // 1. modify the original_io to have the correct alpha channel + // 2. change the output_format_with_extra_alpha to have an alpha channel + bool alpha_in_extra_channels_vector = false; + for (const auto& extra_channel : extra_channels) { + if (extra_channel.first == JXL_CHANNEL_ALPHA) { + alpha_in_extra_channels_vector = true; + } + } + if (alpha_in_extra_channels_vector && !has_interleaved_alpha) { + jxl::ImageF alpha_channel(xsize, ysize); + EXPECT_TRUE(jxl::ConvertFromExternal( + jxl::Span(extra_channel_bytes.data(), + extra_channel_bytes.size()), + xsize, ysize, basic_info.bits_per_sample, extra_channel_pixel_format, 0, + /*pool=*/nullptr, &alpha_channel)); + + original_io.metadata.m.SetAlphaBits(basic_info.bits_per_sample); + original_io.Main().SetAlpha(std::move(alpha_channel)); + output_pixel_format_with_extra_channel_alpha.num_channels++; + } + // Those are the num_extra_channels including a potential alpha channel. + basic_info.num_extra_channels = extra_channels.size() + has_interleaved_alpha; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info)); + EXPECT_EQ(enc->metadata.m.num_extra_channels, + extra_channels.size() + has_interleaved_alpha); + JxlColorEncoding color_encoding; + if (input_pixel_format.data_type == JXL_TYPE_FLOAT) { + JxlColorEncodingSetToLinearSRGB( + &color_encoding, + /*is_gray=*/input_pixel_format.num_channels < 3); + } else { + JxlColorEncodingSetToSRGB(&color_encoding, + /*is_gray=*/input_pixel_format.num_channels < 3); + } + + std::vector channel_infos; + for (const auto& extra_channel : extra_channels) { + auto channel_type = extra_channel.first; + JxlExtraChannelInfo channel_info; + JxlEncoderInitExtraChannelInfo(channel_type, &channel_info); + channel_info.bits_per_sample = (lossless ? basic_info.bits_per_sample : 8); + channel_info.exponent_bits_per_sample = + (lossless ? basic_info.exponent_bits_per_sample : 0); + channel_infos.push_back(channel_info); + } + for (size_t index = 0; index < channel_infos.size(); index++) { + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetExtraChannelInfo(enc, index + has_interleaved_alpha, + &channel_infos[index])); + std::string name = extra_channels[index].second; + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetExtraChannelName(enc, index + has_interleaved_alpha, + name.c_str(), name.length())); + } + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding)); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc, nullptr); + JxlEncoderSetFrameLossless(frame_settings, lossless); + if (resampling > 1) { + EXPECT_EQ( + JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_RESAMPLING, resampling)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderFrameSettingsSetOption( + frame_settings, JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED, + already_downsampled)); + } + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(frame_settings, &input_pixel_format, + (void*)original_bytes.data(), + original_bytes.size())); + EXPECT_EQ(frame_settings->enc->input_queue.back() + .frame->frame.extra_channels() + .size(), + has_interleaved_alpha + extra_channels.size()); + EXPECT_EQ(frame_settings->enc->input_queue.empty(), false); + for (size_t index = 0; index < channel_infos.size(); index++) { + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetExtraChannelBuffer( + frame_settings, &extra_channel_pixel_format, + (void*)extra_channel_bytes.data(), extra_channel_bytes.size(), + index + has_interleaved_alpha)); + } + JxlEncoderCloseInput(enc); + EXPECT_EQ(frame_settings->enc->input_queue.back() + .frame->frame.extra_channels() + .size(), + has_interleaved_alpha + extra_channels.size()); + std::vector compressed; + EncodeWithEncoder(enc, &compressed); + JxlEncoderDestroy(enc); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_NE(nullptr, dec); + + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | + JXL_DEC_COLOR_ENCODING | + JXL_DEC_FULL_IMAGE)); + + JxlDecoderSetInput(dec, next_in, avail_in); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize( + dec, &output_pixel_format_with_extra_channel_alpha, &buffer_size)); + if (&input_pixel_format == &output_pixel_format_with_extra_channel_alpha && + !already_downsampled) { + EXPECT_EQ(buffer_size, original_bytes.size()); + } + + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + EXPECT_EQ(extra_channels.size() + has_interleaved_alpha, + info.num_extra_channels); + + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + + size_t icc_profile_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, &output_pixel_format_with_extra_channel_alpha, + JXL_COLOR_PROFILE_TARGET_DATA, &icc_profile_size)); + jxl::PaddedBytes icc_profile(icc_profile_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsICCProfile( + dec, &output_pixel_format, JXL_COLOR_PROFILE_TARGET_DATA, + icc_profile.data(), icc_profile.size())); + + std::vector decoded_bytes(buffer_size); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer( + dec, &output_pixel_format_with_extra_channel_alpha, + decoded_bytes.data(), decoded_bytes.size())); + std::vector> extra_channel_decoded_bytes( + info.num_extra_channels - has_interleaved_alpha); + + for (size_t index = has_interleaved_alpha; index < info.num_extra_channels; + index++) { + JxlExtraChannelInfo channel_info; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetExtraChannelInfo(dec, index, &channel_info)); + EXPECT_EQ(channel_info.type, + extra_channels[index - has_interleaved_alpha].first); + std::string input_name = + extra_channels[index - has_interleaved_alpha].second; + const size_t name_length = channel_info.name_length; + EXPECT_EQ(input_name.size(), name_length); + std::vector output_name(name_length + 1); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetExtraChannelName(dec, index, output_name.data(), + output_name.size())); + EXPECT_EQ(0, + memcmp(input_name.data(), output_name.data(), input_name.size())); + size_t extra_buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderExtraChannelBufferSize(dec, &output_pixel_format, + &extra_buffer_size, index)); + std::vector extra_decoded_bytes(extra_buffer_size); + extra_channel_decoded_bytes[index - has_interleaved_alpha] = + std::move(extra_decoded_bytes); + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderSetExtraChannelBuffer( + dec, &output_pixel_format, + extra_channel_decoded_bytes[index - has_interleaved_alpha].data(), + extra_channel_decoded_bytes[index - has_interleaved_alpha].size(), + index)); + } + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + // Check if there are no further errors after getting the full image, e.g. + // check that the final codestream box is actually marked as last. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + JxlDecoderDestroy(dec); + + jxl::CodecInOut decoded_io = ConvertTestImage( + decoded_bytes, xsize, ysize, output_pixel_format_with_extra_channel_alpha, + icc_profile); + + if (already_downsampled) { + jxl::Image3F* color = decoded_io.Main().color(); + jxl::DownsampleImage(color, resampling); + if (decoded_io.Main().HasAlpha()) { + jxl::ImageF* alpha = decoded_io.Main().alpha(); + jxl::DownsampleImage(alpha, resampling); + } + decoded_io.SetSize(color->xsize(), color->ysize()); + } + + if (lossless && !already_downsampled) { + JXL_EXPECT_OK(jxl::SamePixels(*original_io.Main().color(), + *decoded_io.Main().color(), _)); + } else { + jxl::ButteraugliParams ba; + float butteraugli_score = ButteraugliDistance( + original_io.frames, decoded_io.frames, ba, jxl::GetJxlCms(), + /*distmap=*/nullptr, nullptr); + EXPECT_LE(butteraugli_score, 2.0f); + } + JxlPixelFormat extra_channel_output_pixel_format = output_pixel_format; + extra_channel_output_pixel_format.num_channels = 1; + for (auto& extra_channel : extra_channel_decoded_bytes) { + EXPECT_EQ(extra_channel.size(), extra_channel_bytes.size()); + if (lossless) { + EXPECT_EQ(jxl::test::ComparePixels(extra_channel.data(), + extra_channel_bytes.data(), xsize, + ysize, extra_channel_pixel_format, + extra_channel_output_pixel_format), + 0u); + EXPECT_EQ(extra_channel, extra_channel_bytes); + } + } +} + +} // namespace + +TEST(RoundtripTest, FloatFrameRoundtripTest) { + std::vector>> + extra_channels_cases = {{}, + {{JXL_CHANNEL_ALPHA, "my extra alpha channel"}}, + {{JXL_CHANNEL_CFA, "my cfa channel"}}, + {{JXL_CHANNEL_DEPTH, "depth"}, + {JXL_CHANNEL_SELECTION_MASK, "mask"}, + {JXL_CHANNEL_BLACK, "black"}, + {JXL_CHANNEL_CFA, "my cfa channel"}, + {JXL_CHANNEL_OPTIONAL, "optional channel"}}, + {{JXL_CHANNEL_DEPTH, "very deep"}}}; + for (int use_container = 0; use_container < 2; use_container++) { + for (int lossless = 0; lossless < 2; lossless++) { + for (uint32_t num_channels = 1; num_channels < 5; num_channels++) { + for (auto& extra_channels : extra_channels_cases) { + uint32_t has_alpha = static_cast(num_channels % 2 == 0); + uint32_t total_extra_channels = has_alpha + extra_channels.size(); + // There's no support (yet) for lossless extra float + // channels, so we don't test it. + if (total_extra_channels == 0 || !lossless) { + JxlPixelFormat pixel_format = JxlPixelFormat{ + num_channels, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0}; + VerifyRoundtripCompression( + 63, 129, pixel_format, pixel_format, (bool)lossless, + (bool)use_container, 1, false, extra_channels); + } + } + } + } + } +} + +TEST(RoundtripTest, Uint16FrameRoundtripTest) { + std::vector>> + extra_channels_cases = {{}, + {{JXL_CHANNEL_ALPHA, "my extra alpha channel"}}, + {{JXL_CHANNEL_CFA, "my cfa channel"}}, + {{JXL_CHANNEL_CFA, "my cfa channel"}, + {JXL_CHANNEL_BLACK, "k_channel"}}, + {{JXL_CHANNEL_DEPTH, "very deep"}}}; + for (int use_container = 0; use_container < 2; use_container++) { + for (int lossless = 0; lossless < 2; lossless++) { + for (uint32_t num_channels = 1; num_channels < 5; num_channels++) { + for (auto& extra_channels : extra_channels_cases) { + JxlPixelFormat pixel_format = JxlPixelFormat{ + num_channels, JXL_TYPE_UINT16, JXL_NATIVE_ENDIAN, 0}; + VerifyRoundtripCompression( + 63, 129, pixel_format, pixel_format, (bool)lossless, + (bool)use_container, 1, false, extra_channels); + } + } + } + } +} + +TEST(RoundtripTest, Uint8FrameRoundtripTest) { + std::vector>> + extra_channels_cases = {{}, + {{JXL_CHANNEL_THERMAL, "temperature"}}, + {{JXL_CHANNEL_ALPHA, "my extra alpha channel"}}, + {{JXL_CHANNEL_CFA, "my cfa channel"}}, + {{JXL_CHANNEL_CFA, "my cfa channel"}, + {JXL_CHANNEL_BLACK, "k_channel"}}, + {{JXL_CHANNEL_DEPTH, "very deep"}}}; + for (int use_container = 0; use_container < 2; use_container++) { + for (int lossless = 0; lossless < 2; lossless++) { + for (uint32_t num_channels = 1; num_channels < 5; num_channels++) { + for (auto& extra_channels : extra_channels_cases) { + JxlPixelFormat pixel_format = JxlPixelFormat{ + num_channels, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0}; + VerifyRoundtripCompression( + 63, 129, pixel_format, pixel_format, (bool)lossless, + (bool)use_container, 1, false, extra_channels); + } + } + } + } +} + +TEST(RoundtripTest, TestNonlinearSrgbAsXybEncoded) { + for (int use_container = 0; use_container < 2; use_container++) { + for (uint32_t num_channels = 1; num_channels < 5; num_channels++) { + JxlPixelFormat pixel_format_in = + JxlPixelFormat{num_channels, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0}; + JxlPixelFormat pixel_format_out = + JxlPixelFormat{num_channels, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0}; + VerifyRoundtripCompression( + 63, 129, pixel_format_in, pixel_format_out, + /*lossless=*/false, (bool)use_container, {}); + } + } +} + +TEST(RoundtripTest, Resampling) { + JxlPixelFormat pixel_format = + JxlPixelFormat{3, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0}; + VerifyRoundtripCompression(63, 129, pixel_format, pixel_format, + /*lossless=*/false, + /*use_container=*/false, 2, + /*already_downsampled=*/false); + + // TODO(lode): also make this work for odd sizes. This requires a fix in + // enc_frame.cc to not set custom_size_or_origin to true due to even/odd + // mismatch. + VerifyRoundtripCompression(64, 128, pixel_format, pixel_format, + /*lossless=*/true, + /*use_container=*/false, 2, + /*already_downsampled=*/true); +} + +TEST(RoundtripTest, ExtraBoxesTest) { + JxlPixelFormat pixel_format = + JxlPixelFormat{4, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0}; + const size_t xsize = 61; + const size_t ysize = 71; + + const std::vector original_bytes = + GetTestImage(xsize, ysize, pixel_format); + jxl::CodecInOut original_io = + ConvertTestImage(original_bytes, xsize, ysize, pixel_format, {}); + + JxlEncoder* enc = JxlEncoderCreate(nullptr); + EXPECT_NE(nullptr, enc); + + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc, true)); + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = false; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info)); + JxlColorEncoding color_encoding; + if (pixel_format.data_type == JXL_TYPE_FLOAT) { + JxlColorEncodingSetToLinearSRGB(&color_encoding, + /*is_gray=*/pixel_format.num_channels < 3); + } else { + JxlColorEncodingSetToSRGB(&color_encoding, + /*is_gray=*/pixel_format.num_channels < 3); + } + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding)); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc, nullptr); + JxlEncoderSetFrameLossless(frame_settings, false); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(frame_settings, &pixel_format, + (void*)original_bytes.data(), + original_bytes.size())); + JxlEncoderCloseInput(enc); + + std::vector compressed; + EncodeWithEncoder(enc, &compressed); + JxlEncoderDestroy(enc); + + std::vector extra_data(1023); + jxl::AppendBoxHeader(jxl::MakeBoxType("crud"), extra_data.size(), false, + &compressed); + compressed.insert(compressed.end(), extra_data.begin(), extra_data.end()); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_NE(nullptr, dec); + + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | + JXL_DEC_COLOR_ENCODING | + JXL_DEC_FULL_IMAGE)); + + JxlDecoderSetInput(dec, next_in, avail_in); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &pixel_format, &buffer_size)); + EXPECT_EQ(buffer_size, original_bytes.size()); + + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + + size_t icc_profile_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize(dec, &pixel_format, + JXL_COLOR_PROFILE_TARGET_DATA, + &icc_profile_size)); + jxl::PaddedBytes icc_profile(icc_profile_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsICCProfile( + dec, &pixel_format, JXL_COLOR_PROFILE_TARGET_DATA, + icc_profile.data(), icc_profile.size())); + + std::vector decoded_bytes(buffer_size); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(dec, &pixel_format, + decoded_bytes.data(), + decoded_bytes.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + + JxlDecoderDestroy(dec); + + jxl::CodecInOut decoded_io = + ConvertTestImage(decoded_bytes, xsize, ysize, pixel_format, icc_profile); + + jxl::ButteraugliParams ba; + float butteraugli_score = ButteraugliDistance( + original_io.frames, decoded_io.frames, ba, jxl::GetJxlCms(), + /*distmap=*/nullptr, nullptr); + EXPECT_LE(butteraugli_score, 2.0f); +} + +static const unsigned char kEncodedTestProfile[] = { + 0x1f, 0x8b, 0x1, 0x13, 0x10, 0x0, 0x0, 0x0, 0x20, 0x4c, 0xcc, 0x3, + 0xe7, 0xa0, 0xa5, 0xa2, 0x90, 0xa4, 0x27, 0xe8, 0x79, 0x1d, 0xe3, 0x26, + 0x57, 0x54, 0xef, 0x0, 0xe8, 0x97, 0x2, 0xce, 0xa1, 0xd7, 0x85, 0x16, + 0xb4, 0x29, 0x94, 0x58, 0xf2, 0x56, 0xc0, 0x76, 0xea, 0x23, 0xec, 0x7c, + 0x73, 0x51, 0x41, 0x40, 0x23, 0x21, 0x95, 0x4, 0x75, 0x12, 0xc9, 0xcc, + 0x16, 0xbd, 0xb6, 0x99, 0xad, 0xf8, 0x75, 0x35, 0xb6, 0x42, 0xae, 0xae, + 0xae, 0x86, 0x56, 0xf8, 0xcc, 0x16, 0x30, 0xb3, 0x45, 0xad, 0xd, 0x40, + 0xd6, 0xd1, 0xd6, 0x99, 0x40, 0xbe, 0xe2, 0xdc, 0x31, 0x7, 0xa6, 0xb9, + 0x27, 0x92, 0x38, 0x0, 0x3, 0x5e, 0x2c, 0xbe, 0xe6, 0xfb, 0x19, 0xbf, + 0xf3, 0x6d, 0xbc, 0x4d, 0x64, 0xe5, 0xba, 0x76, 0xde, 0x31, 0x65, 0x66, + 0x14, 0xa6, 0x3a, 0xc5, 0x8f, 0xb1, 0xb4, 0xba, 0x1f, 0xb1, 0xb8, 0xd4, + 0x75, 0xba, 0x18, 0x86, 0x95, 0x3c, 0x26, 0xf6, 0x25, 0x62, 0x53, 0xfd, + 0x9c, 0x94, 0x76, 0xf6, 0x95, 0x2c, 0xb1, 0xfd, 0xdc, 0xc0, 0xe4, 0x3f, + 0xb3, 0xff, 0x67, 0xde, 0xd5, 0x94, 0xcc, 0xb0, 0x83, 0x2f, 0x28, 0x93, + 0x92, 0x3, 0xa1, 0x41, 0x64, 0x60, 0x62, 0x70, 0x80, 0x87, 0xaf, 0xe7, + 0x60, 0x4a, 0x20, 0x23, 0xb3, 0x11, 0x7, 0x38, 0x38, 0xd4, 0xa, 0x66, + 0xb5, 0x93, 0x41, 0x90, 0x19, 0x17, 0x18, 0x60, 0xa5, 0xb, 0x7a, 0x24, + 0xaa, 0x20, 0x81, 0xac, 0xa9, 0xa1, 0x70, 0xa6, 0x12, 0x8a, 0x4a, 0xa3, + 0xa0, 0xf9, 0x9a, 0x97, 0xe7, 0xa8, 0xac, 0x8, 0xa8, 0xc4, 0x2a, 0x86, + 0xa7, 0x69, 0x1e, 0x67, 0xe6, 0xbe, 0xa4, 0xd3, 0xff, 0x91, 0x61, 0xf6, + 0x8a, 0xe6, 0xb5, 0xb3, 0x61, 0x9f, 0x19, 0x17, 0x98, 0x27, 0x6b, 0xe9, + 0x8, 0x98, 0xe1, 0x21, 0x4a, 0x9, 0xb5, 0xd7, 0xca, 0xfa, 0x94, 0xd0, + 0x69, 0x1a, 0xeb, 0x52, 0x1, 0x4e, 0xf5, 0xf6, 0xdf, 0x7f, 0xe7, 0x29, + 0x70, 0xee, 0x4, 0xda, 0x2f, 0xa4, 0xff, 0xfe, 0xbb, 0x6f, 0xa8, 0xff, + 0xfe, 0xdb, 0xaf, 0x8, 0xf6, 0x72, 0xa1, 0x40, 0x5d, 0xf0, 0x2d, 0x8, + 0x82, 0x5b, 0x87, 0xbd, 0x10, 0x8, 0xe9, 0x7, 0xee, 0x4b, 0x80, 0xda, + 0x4a, 0x4, 0xc5, 0x5e, 0xa0, 0xb7, 0x1e, 0x60, 0xb0, 0x59, 0x76, 0x60, + 0xb, 0x2e, 0x19, 0x8a, 0x2e, 0x1c, 0xe6, 0x6, 0x20, 0xb8, 0x64, 0x18, + 0x2a, 0xcf, 0x51, 0x94, 0xd4, 0xee, 0xc3, 0xfe, 0x39, 0x74, 0xd4, 0x2b, + 0x48, 0xc9, 0x83, 0x4c, 0x9b, 0xd0, 0x4c, 0x35, 0x10, 0xe3, 0x9, 0xf7, + 0x72, 0xf0, 0x7a, 0xe, 0xbf, 0x7d, 0x36, 0x2e, 0x19, 0x7e, 0x3f, 0xc, + 0xf7, 0x93, 0xe7, 0xf4, 0x1d, 0x32, 0xc6, 0xb0, 0x89, 0xad, 0xe0, 0x28, + 0xc1, 0xa7, 0x59, 0xe3, 0x0, +}; + +TEST(RoundtripTest, TestICCProfile) { + // JxlEncoderSetICCProfile parses the ICC profile, so a valid profile is + // needed. The profile should be passed correctly through the roundtrip. + jxl::BitReader reader(jxl::Span(kEncodedTestProfile, + sizeof(kEncodedTestProfile))); + jxl::PaddedBytes icc; + ASSERT_TRUE(ReadICC(&reader, &icc)); + ASSERT_TRUE(reader.Close()); + + JxlPixelFormat format = + JxlPixelFormat{3, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0}; + + size_t xsize = 25; + size_t ysize = 37; + const std::vector original_bytes = + GetTestImage(xsize, ysize, format); + + JxlEncoder* enc = JxlEncoderCreate(nullptr); + EXPECT_NE(nullptr, enc); + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = JXL_TRUE; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info)); + + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetICCProfile(enc, icc.data(), icc.size())); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc, nullptr); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(frame_settings, &format, + (void*)original_bytes.data(), + original_bytes.size())); + JxlEncoderCloseInput(enc); + + std::vector compressed; + EncodeWithEncoder(enc, &compressed); + JxlEncoderDestroy(enc); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_NE(nullptr, dec); + + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | + JXL_DEC_COLOR_ENCODING | + JXL_DEC_FULL_IMAGE)); + + JxlDecoderSetInput(dec, next_in, avail_in); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(buffer_size, original_bytes.size()); + + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + + size_t dec_icc_size; + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_icc_size)); + EXPECT_EQ(icc.size(), dec_icc_size); + jxl::PaddedBytes dec_icc(dec_icc_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsICCProfile(dec, &format, + JXL_COLOR_PROFILE_TARGET_ORIGINAL, + dec_icc.data(), dec_icc.size())); + + std::vector decoded_bytes(buffer_size); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, decoded_bytes.data(), + decoded_bytes.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(icc, dec_icc); + + JxlDecoderDestroy(dec); +} + +#if JPEGXL_ENABLE_JPEG // Loading .jpg files requires libjpeg support. +TEST(RoundtripTest, JXL_TRANSCODE_JPEG_TEST(TestJPEGReconstruction)) { + const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg"; + const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path); + jxl::CodecInOut orig_io; + ASSERT_TRUE( + SetFromBytes(jxl::Span(orig), &orig_io, /*pool=*/nullptr)); + + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + JxlEncoderFrameSettings* frame_settings = + JxlEncoderFrameSettingsCreate(enc.get(), NULL); + + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc.get(), JXL_TRUE)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size())); + JxlEncoderCloseInput(enc.get()); + + std::vector compressed; + EncodeWithEncoder(enc.get(), &compressed); + + JxlDecoderPtr dec = JxlDecoderMake(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec.get(), JXL_DEC_JPEG_RECONSTRUCTION | JXL_DEC_FULL_IMAGE)); + JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size()); + EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec.get())); + std::vector reconstructed_buffer(128); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data(), + reconstructed_buffer.size())); + size_t used = 0; + JxlDecoderStatus dec_process_result = JXL_DEC_JPEG_NEED_MORE_OUTPUT; + while (dec_process_result == JXL_DEC_JPEG_NEED_MORE_OUTPUT) { + used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get()); + reconstructed_buffer.resize(reconstructed_buffer.size() * 2); + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data() + used, + reconstructed_buffer.size() - used)); + dec_process_result = JxlDecoderProcessInput(dec.get()); + } + ASSERT_EQ(JXL_DEC_FULL_IMAGE, dec_process_result); + used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get()); + ASSERT_EQ(used, orig.size()); + EXPECT_EQ(0, memcmp(reconstructed_buffer.data(), orig.data(), used)); +} +#endif // JPEGXL_ENABLE_JPEG diff --git a/third_party/jpeg-xl/lib/jxl/sanitizers.h b/third_party/jpeg-xl/lib/jxl/sanitizers.h new file mode 100644 index 0000000000..ce0bd8dc63 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/sanitizers.h @@ -0,0 +1,242 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_SANITIZERS_H_ +#define LIB_JXL_SANITIZERS_H_ + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/sanitizer_definitions.h" +#include "lib/jxl/image.h" + +#if JXL_MEMORY_SANITIZER +#include + +#include +#include +#include + +#include "lib/jxl/base/status.h" +#include "sanitizer/msan_interface.h" +#endif + +namespace jxl { +namespace msan { + +#if JXL_MEMORY_SANITIZER + +// Chosen so that kSanitizerSentinel is four copies of kSanitizerSentinelByte. +constexpr uint8_t kSanitizerSentinelByte = 0x48; +constexpr float kSanitizerSentinel = 205089.125f; + +static JXL_INLINE JXL_MAYBE_UNUSED void PoisonMemory(const volatile void* m, + size_t size) { + __msan_poison(m, size); +} + +static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonMemory(const volatile void* m, + size_t size) { + __msan_unpoison(m, size); +} + +static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonCStr(const char* c) { + do { + UnpoisonMemory(c, 1); + } while (*c++); +} + +static JXL_INLINE JXL_MAYBE_UNUSED void MemoryIsInitialized( + const volatile void* m, size_t size) { + __msan_check_mem_is_initialized(m, size); +} + +// Mark all the bytes of an image (including padding) as poisoned bytes. +static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const PlaneBase& im) { + PoisonMemory(im.bytes(), im.bytes_per_row() * im.ysize()); +} + +template +static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const Image3& im) { + PoisonImage(im.Plane(0)); + PoisonImage(im.Plane(1)); + PoisonImage(im.Plane(2)); +} + +// Print the uninitialized regions of an image. +template +static JXL_INLINE JXL_MAYBE_UNUSED void PrintImageUninitialized( + const Plane& im) { + fprintf(stderr, + "Uninitialized regions for image of size %" PRIu64 "x%" PRIu64 ":\n", + static_cast(im.xsize()), static_cast(im.ysize())); + + // A segment of uninitialized pixels in a row, in the format [first, second). + typedef std::pair PixelSegment; + + // Helper class to merge and print a list of rows of PixelSegment that may be + // the same over big ranges of rows. This compacts the output to ranges of + // rows like "[y0, y1): [x0, x1) [x2, x3)". + class RowsMerger { + public: + // Add a new row the list of rows. If the row is the same as the previous + // one it will be merged showing a range of rows [y0, y1), but if the new + // row is different the current range of rows (if any) will be printed and a + // new one will be started. + void AddRow(size_t y, std::vector&& new_row) { + if (start_y_ != -1 && new_row != segments_) { + PrintRow(y); + } + if (new_row.empty()) { + // Skip ranges with no uninitialized pixels. + start_y_ = -1; + segments_.clear(); + return; + } + if (start_y_ == -1) { + start_y_ = y; + segments_ = std::move(new_row); + } + } + + // Print the contents of the range of rows [start_y_, end_y) if any. + void PrintRow(size_t end_y) { + if (start_y_ == -1) return; + if (segments_.empty()) { + start_y_ = -1; + return; + } + if (end_y - start_y_ > 1) { + fprintf(stderr, " y=[%" PRId64 ", %" PRIu64 "):", + static_cast(start_y_), static_cast(end_y)); + } else { + fprintf(stderr, " y=[%" PRId64 "]:", static_cast(start_y_)); + } + for (const auto& seg : segments_) { + if (seg.first + 1 == seg.second) { + fprintf(stderr, " [%" PRId64 "]", static_cast(seg.first)); + } else { + fprintf(stderr, " [%" PRId64 ", %" PRIu64 ")", + static_cast(seg.first), + static_cast(seg.second)); + } + } + fprintf(stderr, "\n"); + start_y_ = -1; + } + + private: + std::vector segments_; + // Row number of the first row in the range of rows that have |segments| as + // the undefined segments. + ssize_t start_y_ = -1; + } rows_merger; + + class SegmentsMerger { + public: + void AddValue(size_t x) { + if (row.empty() || row.back().second != x) { + row.emplace_back(x, x + 1); + } else { + row.back().second = x + 1; + } + } + + std::vector row; + }; + + for (size_t y = 0; y < im.ysize(); y++) { + auto* row = im.Row(y); + SegmentsMerger seg_merger; + size_t x = 0; + while (x < im.xsize()) { + intptr_t ret = + __msan_test_shadow(row + x, (im.xsize() - x) * sizeof(row[0])); + if (ret < 0) break; + size_t next_x = x + ret / sizeof(row[0]); + seg_merger.AddValue(next_x); + x = next_x + 1; + } + rows_merger.AddRow(y, std::move(seg_merger.row)); + } + rows_merger.PrintRow(im.ysize()); +} + +// Check that all the pixels in the provided rect of the image are initialized +// (not poisoned). If any of the values is poisoned it will abort. +template +static JXL_INLINE JXL_MAYBE_UNUSED void CheckImageInitialized( + const Plane& im, const Rect& r, size_t c, const char* message) { + JXL_ASSERT(r.x0() <= im.xsize()); + JXL_ASSERT(r.x0() + r.xsize() <= im.xsize()); + JXL_ASSERT(r.y0() <= im.ysize()); + JXL_ASSERT(r.y0() + r.ysize() <= im.ysize()); + for (size_t y = r.y0(); y < r.y0() + r.ysize(); y++) { + const auto* row = im.Row(y); + intptr_t ret = __msan_test_shadow(row + r.x0(), sizeof(*row) * r.xsize()); + if (ret != -1) { + JXL_DEBUG( + 1, + "Checking an image of %" PRIu64 " x %" PRIu64 ", rect x0=%" PRIu64 + ", y0=%" PRIu64 + ", " + "xsize=%" PRIu64 ", ysize=%" PRIu64, + static_cast(im.xsize()), static_cast(im.ysize()), + static_cast(r.x0()), static_cast(r.y0()), + static_cast(r.xsize()), static_cast(r.ysize())); + size_t x = ret / sizeof(*row); + JXL_DEBUG(1, + "CheckImageInitialized failed at x=%" PRIu64 ", y=%" PRIu64 + ", c=%" PRIu64 ": %s", + static_cast(r.x0() + x), static_cast(y), + static_cast(c), message ? message : ""); + PrintImageUninitialized(im); + } + // This will report an error if memory is not initialized. + __msan_check_mem_is_initialized(row + r.x0(), sizeof(*row) * r.xsize()); + } +} + +template +static JXL_INLINE JXL_MAYBE_UNUSED void CheckImageInitialized( + const Image3& im, const Rect& r, const char* message) { + for (size_t c = 0; c < 3; c++) { + std::string str_message(message); + str_message += " c=" + std::to_string(c); + CheckImageInitialized(im.Plane(c), r, c, str_message.c_str()); + } +} + +#define JXL_CHECK_IMAGE_INITIALIZED(im, r) \ + ::jxl::msan::CheckImageInitialized(im, r, "im=" #im ", r=" #r); + +#define JXL_CHECK_PLANE_INITIALIZED(im, r, c) \ + ::jxl::msan::CheckImageInitialized(im, r, c, "im=" #im ", r=" #r ", c=" #c); + +#else // JXL_MEMORY_SANITIZER + +// In non-msan mode these functions don't use volatile since it is not needed +// for the empty functions. + +static JXL_INLINE JXL_MAYBE_UNUSED void PoisonMemory(const void*, size_t) {} +static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonMemory(const void*, size_t) {} +static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonCStr(const char*) {} +static JXL_INLINE JXL_MAYBE_UNUSED void MemoryIsInitialized(const void*, + size_t) {} + +static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const PlaneBase& im) {} +template +static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const Plane& im) {} + +#define JXL_CHECK_IMAGE_INITIALIZED(im, r) +#define JXL_CHECK_PLANE_INITIALIZED(im, r, c) + +#endif + +} // namespace msan +} // namespace jxl + +#endif // LIB_JXL_SANITIZERS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/simd_util-inl.h b/third_party/jpeg-xl/lib/jxl/simd_util-inl.h new file mode 100644 index 0000000000..77b207ffe8 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/simd_util-inl.h @@ -0,0 +1,349 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Misc utilities for SIMD operations + +#if defined(LIB_JXL_SIMD_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_SIMD_UTIL_INL_H_ +#undef LIB_JXL_SIMD_UTIL_INL_H_ +#else +#define LIB_JXL_SIMD_UTIL_INL_H_ +#endif + +#include + +#include "lib/jxl/base/compiler_specific.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +#if HWY_CAP_GE512 +using hwy::HWY_NAMESPACE::Half; +using hwy::HWY_NAMESPACE::Vec; +template +HWY_INLINE Vec>> Quarter(const DF df, V v) { + using HF = Half; + using HHF = Half; + auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v); + return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half); +} + +template +HWY_INLINE Vec Concat4(const DF df, V v0, V v1, V v2, V v3) { + using HF = Half; + return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0)); +} + +#endif + +// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be +// aligned. +template +void StoreInterleaved(const DF df, V v0, V v1, T* mem) { + static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); +#if HWY_TARGET == HWY_SCALAR + Store(v0, df, mem); + Store(v1, df, mem + 1); +#elif !HWY_CAP_GE256 + Store(InterleaveLower(df, v0, v1), df, mem); + Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df)); +#else + if (!HWY_CAP_GE512 || Lanes(df) == 8) { + auto t0 = InterleaveLower(df, v0, v1); + auto t1 = InterleaveUpper(df, v0, v1); + Store(ConcatLowerLower(df, t1, t0), df, mem); + Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df)); + } else { +#if HWY_CAP_GE512 + auto t0 = InterleaveLower(df, v0, v1); + auto t1 = InterleaveUpper(df, v0, v1); + Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1), + Quarter<1>(df, t0), Quarter<1>(df, t1)), + df, mem); + Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1), + Quarter<3>(df, t0), Quarter<3>(df, t1)), + df, mem + Lanes(df)); +#endif + } +#endif +} + +// Stores v0[0], v1[0], v2[0], v3[0], v0[1] ... to mem, in this order. Mem must +// be aligned. +template +void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, T* mem) { + static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); +#if HWY_TARGET == HWY_SCALAR + Store(v0, df, mem); + Store(v1, df, mem + 1); + Store(v2, df, mem + 2); + Store(v3, df, mem + 3); +#elif !HWY_CAP_GE256 + auto t0 = InterleaveLower(df, v0, v2); + auto t1 = InterleaveLower(df, v1, v3); + auto t2 = InterleaveUpper(df, v0, v2); + auto t3 = InterleaveUpper(df, v1, v3); + Store(InterleaveLower(df, t0, t1), df, mem); + Store(InterleaveUpper(df, t0, t1), df, mem + Lanes(df)); + Store(InterleaveLower(df, t2, t3), df, mem + 2 * Lanes(df)); + Store(InterleaveUpper(df, t2, t3), df, mem + 3 * Lanes(df)); +#elif !HWY_CAP_GE512 + auto t0 = InterleaveLower(df, v0, v2); + auto t1 = InterleaveLower(df, v1, v3); + auto t2 = InterleaveUpper(df, v0, v2); + auto t3 = InterleaveUpper(df, v1, v3); + + auto m0 = InterleaveLower(df, t0, t1); + auto m1 = InterleaveUpper(df, t0, t1); + auto m2 = InterleaveLower(df, t2, t3); + auto m3 = InterleaveUpper(df, t2, t3); + + Store(ConcatLowerLower(df, m1, m0), df, mem); + Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df)); + Store(ConcatUpperUpper(df, m1, m0), df, mem + 2 * Lanes(df)); + Store(ConcatUpperUpper(df, m3, m2), df, mem + 3 * Lanes(df)); +#else + auto t0 = InterleaveLower(df, v0, v2); + auto t1 = InterleaveLower(df, v1, v3); + auto t2 = InterleaveUpper(df, v0, v2); + auto t3 = InterleaveUpper(df, v1, v3); + + auto m0 = InterleaveLower(df, t0, t1); + auto m1 = InterleaveUpper(df, t0, t1); + auto m2 = InterleaveLower(df, t2, t3); + auto m3 = InterleaveUpper(df, t2, t3); + + Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2), + Quarter<0>(df, m3)), + df, mem); + Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2), + Quarter<1>(df, m3)), + df, mem + Lanes(df)); + Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2), + Quarter<2>(df, m3)), + df, mem + 2 * Lanes(df)); + Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2), + Quarter<3>(df, m3)), + df, mem + 3 * Lanes(df)); +#endif +} + +// Stores v0[0], v1[0], v2[0], v3[0], v4[0], v5[0], v6[0], v7[0], v0[1] ... to +// mem, in this order. Mem must be aligned. +template +void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, V v4, V v5, V v6, + V v7, float* mem) { +#if HWY_TARGET == HWY_SCALAR + Store(v0, df, mem); + Store(v1, df, mem + 1); + Store(v2, df, mem + 2); + Store(v3, df, mem + 3); + Store(v4, df, mem + 4); + Store(v5, df, mem + 5); + Store(v6, df, mem + 6); + Store(v7, df, mem + 7); +#elif !HWY_CAP_GE256 + auto t0 = InterleaveLower(df, v0, v4); + auto t1 = InterleaveLower(df, v1, v5); + auto t2 = InterleaveLower(df, v2, v6); + auto t3 = InterleaveLower(df, v3, v7); + auto t4 = InterleaveUpper(df, v0, v4); + auto t5 = InterleaveUpper(df, v1, v5); + auto t6 = InterleaveUpper(df, v2, v6); + auto t7 = InterleaveUpper(df, v3, v7); + + auto w0 = InterleaveLower(df, t0, t2); + auto w1 = InterleaveLower(df, t1, t3); + auto w2 = InterleaveUpper(df, t0, t2); + auto w3 = InterleaveUpper(df, t1, t3); + auto w4 = InterleaveLower(df, t4, t6); + auto w5 = InterleaveLower(df, t5, t7); + auto w6 = InterleaveUpper(df, t4, t6); + auto w7 = InterleaveUpper(df, t5, t7); + + Store(InterleaveLower(df, w0, w1), df, mem); + Store(InterleaveUpper(df, w0, w1), df, mem + Lanes(df)); + Store(InterleaveLower(df, w2, w3), df, mem + 2 * Lanes(df)); + Store(InterleaveUpper(df, w2, w3), df, mem + 3 * Lanes(df)); + Store(InterleaveLower(df, w4, w5), df, mem + 4 * Lanes(df)); + Store(InterleaveUpper(df, w4, w5), df, mem + 5 * Lanes(df)); + Store(InterleaveLower(df, w6, w7), df, mem + 6 * Lanes(df)); + Store(InterleaveUpper(df, w6, w7), df, mem + 7 * Lanes(df)); +#elif !HWY_CAP_GE512 + auto t0 = InterleaveLower(df, v0, v4); + auto t1 = InterleaveLower(df, v1, v5); + auto t2 = InterleaveLower(df, v2, v6); + auto t3 = InterleaveLower(df, v3, v7); + auto t4 = InterleaveUpper(df, v0, v4); + auto t5 = InterleaveUpper(df, v1, v5); + auto t6 = InterleaveUpper(df, v2, v6); + auto t7 = InterleaveUpper(df, v3, v7); + + auto w0 = InterleaveLower(df, t0, t2); + auto w1 = InterleaveLower(df, t1, t3); + auto w2 = InterleaveUpper(df, t0, t2); + auto w3 = InterleaveUpper(df, t1, t3); + auto w4 = InterleaveLower(df, t4, t6); + auto w5 = InterleaveLower(df, t5, t7); + auto w6 = InterleaveUpper(df, t4, t6); + auto w7 = InterleaveUpper(df, t5, t7); + + auto m0 = InterleaveLower(df, w0, w1); + auto m1 = InterleaveUpper(df, w0, w1); + auto m2 = InterleaveLower(df, w2, w3); + auto m3 = InterleaveUpper(df, w2, w3); + auto m4 = InterleaveLower(df, w4, w5); + auto m5 = InterleaveUpper(df, w4, w5); + auto m6 = InterleaveLower(df, w6, w7); + auto m7 = InterleaveUpper(df, w6, w7); + + Store(ConcatLowerLower(df, m1, m0), df, mem); + Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df)); + Store(ConcatLowerLower(df, m5, m4), df, mem + 2 * Lanes(df)); + Store(ConcatLowerLower(df, m7, m6), df, mem + 3 * Lanes(df)); + Store(ConcatUpperUpper(df, m1, m0), df, mem + 4 * Lanes(df)); + Store(ConcatUpperUpper(df, m3, m2), df, mem + 5 * Lanes(df)); + Store(ConcatUpperUpper(df, m5, m4), df, mem + 6 * Lanes(df)); + Store(ConcatUpperUpper(df, m7, m6), df, mem + 7 * Lanes(df)); +#else + auto t0 = InterleaveLower(df, v0, v4); + auto t1 = InterleaveLower(df, v1, v5); + auto t2 = InterleaveLower(df, v2, v6); + auto t3 = InterleaveLower(df, v3, v7); + auto t4 = InterleaveUpper(df, v0, v4); + auto t5 = InterleaveUpper(df, v1, v5); + auto t6 = InterleaveUpper(df, v2, v6); + auto t7 = InterleaveUpper(df, v3, v7); + + auto w0 = InterleaveLower(df, t0, t2); + auto w1 = InterleaveLower(df, t1, t3); + auto w2 = InterleaveUpper(df, t0, t2); + auto w3 = InterleaveUpper(df, t1, t3); + auto w4 = InterleaveLower(df, t4, t6); + auto w5 = InterleaveLower(df, t5, t7); + auto w6 = InterleaveUpper(df, t4, t6); + auto w7 = InterleaveUpper(df, t5, t7); + + auto m0 = InterleaveLower(df, w0, w1); + auto m1 = InterleaveUpper(df, w0, w1); + auto m2 = InterleaveLower(df, w2, w3); + auto m3 = InterleaveUpper(df, w2, w3); + auto m4 = InterleaveLower(df, w4, w5); + auto m5 = InterleaveUpper(df, w4, w5); + auto m6 = InterleaveLower(df, w6, w7); + auto m7 = InterleaveUpper(df, w6, w7); + + Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2), + Quarter<0>(df, m3)), + df, mem); + Store(Concat4(df, Quarter<0>(df, m4), Quarter<0>(df, m5), Quarter<0>(df, m6), + Quarter<0>(df, m7)), + df, mem + Lanes(df)); + Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2), + Quarter<1>(df, m3)), + df, mem + 2 * Lanes(df)); + Store(Concat4(df, Quarter<1>(df, m4), Quarter<1>(df, m5), Quarter<1>(df, m6), + Quarter<1>(df, m7)), + df, mem + 3 * Lanes(df)); + Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2), + Quarter<2>(df, m3)), + df, mem + 4 * Lanes(df)); + Store(Concat4(df, Quarter<2>(df, m4), Quarter<2>(df, m5), Quarter<2>(df, m6), + Quarter<2>(df, m7)), + df, mem + 5 * Lanes(df)); + Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2), + Quarter<3>(df, m3)), + df, mem + 6 * Lanes(df)); + Store(Concat4(df, Quarter<3>(df, m4), Quarter<3>(df, m5), Quarter<3>(df, m6), + Quarter<3>(df, m7)), + df, mem + 7 * Lanes(df)); +#endif +} + +#if HWY_CAP_GE256 +JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from, + int32_t* JXL_RESTRICT to, size_t fromstride) { + const HWY_CAPPED(int32_t, 8) d; + auto i0 = Load(d, from); + auto i1 = Load(d, from + 1 * fromstride); + auto i2 = Load(d, from + 2 * fromstride); + auto i3 = Load(d, from + 3 * fromstride); + auto i4 = Load(d, from + 4 * fromstride); + auto i5 = Load(d, from + 5 * fromstride); + auto i6 = Load(d, from + 6 * fromstride); + auto i7 = Load(d, from + 7 * fromstride); + + const auto q0 = InterleaveLower(d, i0, i2); + const auto q1 = InterleaveLower(d, i1, i3); + const auto q2 = InterleaveUpper(d, i0, i2); + const auto q3 = InterleaveUpper(d, i1, i3); + const auto q4 = InterleaveLower(d, i4, i6); + const auto q5 = InterleaveLower(d, i5, i7); + const auto q6 = InterleaveUpper(d, i4, i6); + const auto q7 = InterleaveUpper(d, i5, i7); + + const auto r0 = InterleaveLower(d, q0, q1); + const auto r1 = InterleaveUpper(d, q0, q1); + const auto r2 = InterleaveLower(d, q2, q3); + const auto r3 = InterleaveUpper(d, q2, q3); + const auto r4 = InterleaveLower(d, q4, q5); + const auto r5 = InterleaveUpper(d, q4, q5); + const auto r6 = InterleaveLower(d, q6, q7); + const auto r7 = InterleaveUpper(d, q6, q7); + + i0 = ConcatLowerLower(d, r4, r0); + i1 = ConcatLowerLower(d, r5, r1); + i2 = ConcatLowerLower(d, r6, r2); + i3 = ConcatLowerLower(d, r7, r3); + i4 = ConcatUpperUpper(d, r4, r0); + i5 = ConcatUpperUpper(d, r5, r1); + i6 = ConcatUpperUpper(d, r6, r2); + i7 = ConcatUpperUpper(d, r7, r3); + + Store(i0, d, to); + Store(i1, d, to + 1 * 8); + Store(i2, d, to + 2 * 8); + Store(i3, d, to + 3 * 8); + Store(i4, d, to + 4 * 8); + Store(i5, d, to + 5 * 8); + Store(i6, d, to + 6 * 8); + Store(i7, d, to + 7 * 8); +} +#elif HWY_TARGET != HWY_SCALAR +JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from, + int32_t* JXL_RESTRICT to, size_t fromstride) { + const HWY_CAPPED(int32_t, 4) d; + for (size_t n = 0; n < 8; n += 4) { + for (size_t m = 0; m < 8; m += 4) { + auto p0 = Load(d, from + n * fromstride + m); + auto p1 = Load(d, from + (n + 1) * fromstride + m); + auto p2 = Load(d, from + (n + 2) * fromstride + m); + auto p3 = Load(d, from + (n + 3) * fromstride + m); + const auto q0 = InterleaveLower(d, p0, p2); + const auto q1 = InterleaveLower(d, p1, p3); + const auto q2 = InterleaveUpper(d, p0, p2); + const auto q3 = InterleaveUpper(d, p1, p3); + + const auto r0 = InterleaveLower(d, q0, q1); + const auto r1 = InterleaveUpper(d, q0, q1); + const auto r2 = InterleaveLower(d, q2, q3); + const auto r3 = InterleaveUpper(d, q2, q3); + Store(r0, d, to + m * 8 + n); + Store(r1, d, to + (1 + m) * 8 + n); + Store(r2, d, to + (2 + m) * 8 + n); + Store(r3, d, to + (3 + m) * 8 + n); + } + } +} + +#endif + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_SIMD_UTIL_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/simd_util_test.cc b/third_party/jpeg-xl/lib/jxl/simd_util_test.cc new file mode 100644 index 0000000000..b81f5d1279 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/simd_util_test.cc @@ -0,0 +1,84 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/simd_util_test.cc" +#include + +#include "lib/jxl/simd_util-inl.h" + +// Test utils +#include +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +HWY_NOINLINE void TestInterleave2() { + HWY_FULL(float) d; + auto vec1 = Iota(d, 0 * 128.0); + auto vec2 = Iota(d, 1 * 128.0); + HWY_ALIGN float mem[MaxLanes(d) * 2]; + StoreInterleaved(d, vec1, vec2, mem); + for (size_t i = 0; i < Lanes(d); i++) { + for (size_t j = 0; j < 2; j++) { + EXPECT_EQ(mem[2 * i + j], j * 128 + i) << "i: " << i << " j: " << j; + } + } +} +HWY_NOINLINE void TestInterleave4() { + HWY_FULL(float) d; + auto vec1 = Iota(d, 0 * 128.0); + auto vec2 = Iota(d, 1 * 128.0); + auto vec3 = Iota(d, 2 * 128.0); + auto vec4 = Iota(d, 3 * 128.0); + HWY_ALIGN float mem[MaxLanes(d) * 4]; + StoreInterleaved(d, vec1, vec2, vec3, vec4, mem); + for (size_t i = 0; i < Lanes(d); i++) { + for (size_t j = 0; j < 4; j++) { + EXPECT_EQ(mem[4 * i + j], j * 128 + i) << "i: " << i << " j: " << j; + } + } +} +HWY_NOINLINE void TestInterleave8() { + HWY_FULL(float) d; + auto vec1 = Iota(d, 0 * 128.0); + auto vec2 = Iota(d, 1 * 128.0); + auto vec3 = Iota(d, 2 * 128.0); + auto vec4 = Iota(d, 3 * 128.0); + auto vec5 = Iota(d, 4 * 128.0); + auto vec6 = Iota(d, 5 * 128.0); + auto vec7 = Iota(d, 6 * 128.0); + auto vec8 = Iota(d, 7 * 128.0); + HWY_ALIGN float mem[MaxLanes(d) * 8]; + StoreInterleaved(d, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, mem); + for (size_t i = 0; i < Lanes(d); i++) { + for (size_t j = 0; j < 8; j++) { + EXPECT_EQ(mem[8 * i + j], j * 128 + i) << "i: " << i << " j: " << j; + } + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class SimdUtilTargetTest : public hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(SimdUtilTargetTest); + +HWY_EXPORT_AND_TEST_P(SimdUtilTargetTest, TestInterleave2); +HWY_EXPORT_AND_TEST_P(SimdUtilTargetTest, TestInterleave4); +HWY_EXPORT_AND_TEST_P(SimdUtilTargetTest, TestInterleave8); + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/speed_tier_test.cc b/third_party/jpeg-xl/lib/jxl/speed_tier_test.cc new file mode 100644 index 0000000000..b3f30c3e4c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/speed_tier_test.cc @@ -0,0 +1,108 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include "lib/extras/codec.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +struct SpeedTierTestParams { + explicit SpeedTierTestParams(const SpeedTier speed_tier, + const bool shrink8 = false) + : speed_tier(speed_tier), shrink8(shrink8) {} + SpeedTier speed_tier; + bool shrink8; +}; + +std::ostream& operator<<(std::ostream& os, SpeedTierTestParams params) { + auto previous_flags = os.flags(); + os << std::boolalpha; + os << "SpeedTierTestParams{" << static_cast(params.speed_tier) + << ", /*shrink8=*/" << params.shrink8 << "}"; + os.flags(previous_flags); + return os; +} + +class SpeedTierTest : public testing::TestWithParam {}; + +JXL_GTEST_INSTANTIATE_TEST_SUITE_P( + SpeedTierTestInstantiation, SpeedTierTest, + testing::Values(SpeedTierTestParams{SpeedTier::kCheetah, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kCheetah, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kThunder, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kThunder, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kLightning, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kLightning, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kFalcon, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kFalcon, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kHare, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kHare, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kWombat, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kWombat, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kSquirrel, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kSquirrel, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kKitten, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kKitten, + /*shrink8=*/false}, + // Only downscaled image for Tortoise mode. + SpeedTierTestParams{SpeedTier::kTortoise, + /*shrink8=*/true})); + +TEST_P(SpeedTierTest, Roundtrip) { + const PaddedBytes orig = jxl::test::ReadTestData( + "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + test::ThreadPoolForTests pool(8); + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + const SpeedTierTestParams& params = GetParam(); + + if (params.shrink8) { + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + } + + CompressParams cparams; + cparams.speed_tier = params.speed_tier; + + CodecInOut io2; + JXL_EXPECT_OK(test::Roundtrip(&io, cparams, {}, &io2, _)); + + // Can be 2.2 in non-hare mode. + EXPECT_LE( + ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(), + /*distmap=*/nullptr, /*pool=*/nullptr), + 2.8); +} +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/splines.cc b/third_party/jpeg-xl/lib/jxl/splines.cc new file mode 100644 index 0000000000..04d1df8e49 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/splines.cc @@ -0,0 +1,694 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/splines.h" + +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/opsin_params.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/splines.cc" +#include +#include + +#include "lib/jxl/fast_math-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::MulSub; +using hwy::HWY_NAMESPACE::Sqrt; +using hwy::HWY_NAMESPACE::Sub; + +// Given a set of DCT coefficients, this returns the result of performing cosine +// interpolation on the original samples. +float ContinuousIDCT(const float dct[32], const float t) { + // We compute here the DCT-3 of the `dct` vector, rescaled by a factor of + // sqrt(32). This is such that an input vector vector {x, 0, ..., 0} produces + // a constant result of x. dct[0] was scaled in Dequantize() to allow uniform + // treatment of all the coefficients. + constexpr float kMultipliers[32] = { + kPi / 32 * 0, kPi / 32 * 1, kPi / 32 * 2, kPi / 32 * 3, kPi / 32 * 4, + kPi / 32 * 5, kPi / 32 * 6, kPi / 32 * 7, kPi / 32 * 8, kPi / 32 * 9, + kPi / 32 * 10, kPi / 32 * 11, kPi / 32 * 12, kPi / 32 * 13, kPi / 32 * 14, + kPi / 32 * 15, kPi / 32 * 16, kPi / 32 * 17, kPi / 32 * 18, kPi / 32 * 19, + kPi / 32 * 20, kPi / 32 * 21, kPi / 32 * 22, kPi / 32 * 23, kPi / 32 * 24, + kPi / 32 * 25, kPi / 32 * 26, kPi / 32 * 27, kPi / 32 * 28, kPi / 32 * 29, + kPi / 32 * 30, kPi / 32 * 31, + }; + HWY_CAPPED(float, 32) df; + auto result = Zero(df); + const auto tandhalf = Set(df, t + 0.5f); + for (int i = 0; i < 32; i += Lanes(df)) { + auto cos_arg = Mul(LoadU(df, kMultipliers + i), tandhalf); + auto cos = FastCosf(df, cos_arg); + auto local_res = Mul(LoadU(df, dct + i), cos); + result = MulAdd(Set(df, kSqrt2), local_res, result); + } + return GetLane(SumOfLanes(df, result)); +} + +template +void DrawSegment(DF df, const SplineSegment& segment, const bool add, + const size_t y, const size_t x, float* JXL_RESTRICT rows[3]) { + Rebind di; + const auto inv_sigma = Set(df, segment.inv_sigma); + const auto half = Set(df, 0.5f); + const auto one_over_2s2 = Set(df, 0.353553391f); + const auto sigma_over_4_times_intensity = + Set(df, segment.sigma_over_4_times_intensity); + const auto dx = Sub(ConvertTo(df, Iota(di, x)), Set(df, segment.center_x)); + const auto dy = Set(df, y - segment.center_y); + const auto sqd = MulAdd(dx, dx, Mul(dy, dy)); + const auto distance = Sqrt(sqd); + const auto one_dimensional_factor = + Sub(FastErff(df, Mul(MulAdd(distance, half, one_over_2s2), inv_sigma)), + FastErff(df, Mul(MulSub(distance, half, one_over_2s2), inv_sigma))); + auto local_intensity = + Mul(sigma_over_4_times_intensity, + Mul(one_dimensional_factor, one_dimensional_factor)); + for (size_t c = 0; c < 3; ++c) { + const auto cm = Set(df, add ? segment.color[c] : -segment.color[c]); + const auto in = LoadU(df, rows[c] + x); + StoreU(MulAdd(cm, local_intensity, in), df, rows[c] + x); + } +} + +void DrawSegment(const SplineSegment& segment, const bool add, const size_t y, + const ssize_t x0, ssize_t x1, float* JXL_RESTRICT rows[3]) { + ssize_t x = + std::max(x0, segment.center_x - segment.maximum_distance + 0.5f); + // one-past-the-end + x1 = + std::min(x1, segment.center_x + segment.maximum_distance + 1.5f); + HWY_FULL(float) df; + for (; x + static_cast(Lanes(df)) <= x1; x += Lanes(df)) { + DrawSegment(df, segment, add, y, x, rows); + } + for (; x < x1; ++x) { + DrawSegment(HWY_CAPPED(float, 1)(), segment, add, y, x, rows); + } +} + +void ComputeSegments(const Spline::Point& center, const float intensity, + const float color[3], const float sigma, + std::vector& segments, + std::vector>& segments_by_y) { + // Sanity check sigma, inverse sigma and intensity + if (!(std::isfinite(sigma) && sigma != 0.0f && std::isfinite(1.0f / sigma) && + std::isfinite(intensity))) { + return; + } +#if JXL_HIGH_PRECISION + constexpr float kDistanceExp = 5; +#else + // About 30% faster. + constexpr float kDistanceExp = 3; +#endif + // We cap from below colors to at least 0.01. + float max_color = 0.01f; + for (size_t c = 0; c < 3; c++) { + max_color = std::max(max_color, std::abs(color[c] * intensity)); + } + // Distance beyond which max_color*intensity*exp(-d^2 / (2 * sigma^2)) drops + // below 10^-kDistanceExp. + const float maximum_distance = + std::sqrt(-2 * sigma * sigma * + (std::log(0.1) * kDistanceExp - std::log(max_color))); + SplineSegment segment; + segment.center_y = center.y; + segment.center_x = center.x; + memcpy(segment.color, color, sizeof(segment.color)); + segment.inv_sigma = 1.0f / sigma; + segment.sigma_over_4_times_intensity = .25f * sigma * intensity; + segment.maximum_distance = maximum_distance; + ssize_t y0 = center.y - maximum_distance + .5f; + ssize_t y1 = center.y + maximum_distance + 1.5f; // one-past-the-end + for (ssize_t y = std::max(y0, 0); y < y1; y++) { + segments_by_y.emplace_back(y, segments.size()); + } + segments.push_back(segment); +} + +void DrawSegments(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y, + float* JXL_RESTRICT row_b, const Rect& image_rect, + const bool add, const SplineSegment* segments, + const size_t* segment_indices, + const size_t* segment_y_start) { + JXL_ASSERT(image_rect.ysize() == 1); + float* JXL_RESTRICT rows[3] = {row_x - image_rect.x0(), + row_y - image_rect.x0(), + row_b - image_rect.x0()}; + size_t y = image_rect.y0(); + for (size_t i = segment_y_start[y]; i < segment_y_start[y + 1]; i++) { + DrawSegment(segments[segment_indices[i]], add, y, image_rect.x0(), + image_rect.x0() + image_rect.xsize(), rows); + } +} + +void SegmentsFromPoints( + const Spline& spline, + const std::vector>& points_to_draw, + const float arc_length, std::vector& segments, + std::vector>& segments_by_y) { + const float inv_arc_length = 1.0f / arc_length; + int k = 0; + for (const auto& point_to_draw : points_to_draw) { + const Spline::Point& point = point_to_draw.first; + const float multiplier = point_to_draw.second; + const float progress_along_arc = + std::min(1.f, (k * kDesiredRenderingDistance) * inv_arc_length); + ++k; + float color[3]; + for (size_t c = 0; c < 3; ++c) { + color[c] = + ContinuousIDCT(spline.color_dct[c], (32 - 1) * progress_along_arc); + } + const float sigma = + ContinuousIDCT(spline.sigma_dct, (32 - 1) * progress_along_arc); + ComputeSegments(point, multiplier, color, sigma, segments, segments_by_y); + } +} +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(SegmentsFromPoints); +HWY_EXPORT(DrawSegments); + +namespace { + +// It is not in spec, but reasonable limit to avoid overflows. +template +Status ValidateSplinePointPos(const T& x, const T& y) { + constexpr T kSplinePosLimit = 1u << 23; + if ((x >= kSplinePosLimit) || (x <= -kSplinePosLimit) || + (y >= kSplinePosLimit) || (y <= -kSplinePosLimit)) { + return JXL_FAILURE("Spline coordinates out of bounds"); + } + return true; +} + +// Maximum number of spline control points per frame is +// std::min(kMaxNumControlPoints, xsize * ysize / 2) +constexpr size_t kMaxNumControlPoints = 1u << 20u; +constexpr size_t kMaxNumControlPointsPerPixelRatio = 2; + +float AdjustedQuant(const int32_t adjustment) { + return (adjustment >= 0) ? (1.f + .125f * adjustment) + : 1.f / (1.f - .125f * adjustment); +} + +float InvAdjustedQuant(const int32_t adjustment) { + return (adjustment >= 0) ? 1.f / (1.f + .125f * adjustment) + : (1.f - .125f * adjustment); +} + +// X, Y, B, sigma. +static constexpr float kChannelWeight[] = {0.0042f, 0.075f, 0.07f, .3333f}; + +Status DecodeAllStartingPoints(std::vector* const points, + BitReader* const br, ANSSymbolReader* reader, + const std::vector& context_map, + const size_t num_splines) { + points->clear(); + points->reserve(num_splines); + int64_t last_x = 0; + int64_t last_y = 0; + for (size_t i = 0; i < num_splines; i++) { + int64_t x = + reader->ReadHybridUint(kStartingPositionContext, br, context_map); + int64_t y = + reader->ReadHybridUint(kStartingPositionContext, br, context_map); + if (i != 0) { + x = UnpackSigned(x) + last_x; + y = UnpackSigned(y) + last_y; + } + JXL_RETURN_IF_ERROR(ValidateSplinePointPos(x, y)); + points->emplace_back(static_cast(x), static_cast(y)); + last_x = x; + last_y = y; + } + return true; +} + +struct Vector { + float x, y; + Vector operator-() const { return {-x, -y}; } + Vector operator+(const Vector& other) const { + return {x + other.x, y + other.y}; + } + float SquaredNorm() const { return x * x + y * y; } +}; +Vector operator*(const float k, const Vector& vec) { + return {k * vec.x, k * vec.y}; +} + +Spline::Point operator+(const Spline::Point& p, const Vector& vec) { + return {p.x + vec.x, p.y + vec.y}; +} +Vector operator-(const Spline::Point& a, const Spline::Point& b) { + return {a.x - b.x, a.y - b.y}; +} + +// TODO(eustas): avoid making a copy of "points". +void DrawCentripetalCatmullRomSpline(std::vector points, + std::vector& result) { + if (points.empty()) return; + if (points.size() == 1) { + result.push_back(points[0]); + return; + } + // Number of points to compute between each control point. + static constexpr int kNumPoints = 16; + result.reserve((points.size() - 1) * kNumPoints + 1); + points.insert(points.begin(), points[0] + (points[0] - points[1])); + points.push_back(points[points.size() - 1] + + (points[points.size() - 1] - points[points.size() - 2])); + // points has at least 4 elements at this point. + for (size_t start = 0; start < points.size() - 3; ++start) { + // 4 of them are used, and we draw from p[1] to p[2]. + const Spline::Point* const p = &points[start]; + result.push_back(p[1]); + float d[3]; + float t[4]; + t[0] = 0; + for (int k = 0; k < 3; ++k) { + // TODO(eustas): for each segment delta is calculated 3 times... + // TODO(eustas): restrict d[k] with reasonable limit and spec it. + d[k] = std::sqrt(hypotf(p[k + 1].x - p[k].x, p[k + 1].y - p[k].y)); + t[k + 1] = t[k] + d[k]; + } + for (int i = 1; i < kNumPoints; ++i) { + const float tt = d[0] + (static_cast(i) / kNumPoints) * d[1]; + Spline::Point a[3]; + for (int k = 0; k < 3; ++k) { + // TODO(eustas): reciprocal multiplication would be faster. + a[k] = p[k] + ((tt - t[k]) / d[k]) * (p[k + 1] - p[k]); + } + Spline::Point b[2]; + for (int k = 0; k < 2; ++k) { + b[k] = a[k] + ((tt - t[k]) / (d[k] + d[k + 1])) * (a[k + 1] - a[k]); + } + result.push_back(b[0] + ((tt - t[1]) / d[1]) * (b[1] - b[0])); + } + } + result.push_back(points[points.size() - 2]); +} + +// Move along the line segments defined by `points`, `kDesiredRenderingDistance` +// pixels at a time, and call `functor` with each point and the actual distance +// to the previous point (which will always be kDesiredRenderingDistance except +// possibly for the very last point). +// TODO(eustas): this method always adds the last point, but never the first +// (unless those are one); I believe both ends matter. +template +void ForEachEquallySpacedPoint(const Points& points, const Functor& functor) { + JXL_ASSERT(!points.empty()); + Spline::Point current = points.front(); + functor(current, kDesiredRenderingDistance); + auto next = points.begin(); + while (next != points.end()) { + const Spline::Point* previous = ¤t; + float arclength_from_previous = 0.f; + for (;;) { + if (next == points.end()) { + functor(*previous, arclength_from_previous); + return; + } + const float arclength_to_next = + std::sqrt((*next - *previous).SquaredNorm()); + if (arclength_from_previous + arclength_to_next >= + kDesiredRenderingDistance) { + current = + *previous + ((kDesiredRenderingDistance - arclength_from_previous) / + arclength_to_next) * + (*next - *previous); + functor(current, kDesiredRenderingDistance); + break; + } + arclength_from_previous += arclength_to_next; + previous = &*next; + ++next; + } + } +} + +} // namespace + +QuantizedSpline::QuantizedSpline(const Spline& original, + const int32_t quantization_adjustment, + const float y_to_x, const float y_to_b) { + JXL_ASSERT(!original.control_points.empty()); + control_points_.reserve(original.control_points.size() - 1); + const Spline::Point& starting_point = original.control_points.front(); + int previous_x = static_cast(roundf(starting_point.x)), + previous_y = static_cast(roundf(starting_point.y)); + int previous_delta_x = 0, previous_delta_y = 0; + for (auto it = original.control_points.begin() + 1; + it != original.control_points.end(); ++it) { + const int new_x = static_cast(roundf(it->x)); + const int new_y = static_cast(roundf(it->y)); + const int new_delta_x = new_x - previous_x; + const int new_delta_y = new_y - previous_y; + control_points_.emplace_back(new_delta_x - previous_delta_x, + new_delta_y - previous_delta_y); + previous_delta_x = new_delta_x; + previous_delta_y = new_delta_y; + previous_x = new_x; + previous_y = new_y; + } + + const auto to_int = [](float v) -> int { + return static_cast(roundf(v)); + }; + + const auto quant = AdjustedQuant(quantization_adjustment); + const auto inv_quant = InvAdjustedQuant(quantization_adjustment); + for (int c : {1, 0, 2}) { + float factor = (c == 0) ? y_to_x : (c == 1) ? 0 : y_to_b; + for (int i = 0; i < 32; ++i) { + const float dct_factor = (i == 0) ? kSqrt2 : 1.0f; + const float inv_dct_factor = (i == 0) ? kSqrt0_5 : 1.0f; + auto restored_y = + color_dct_[1][i] * inv_dct_factor * kChannelWeight[1] * inv_quant; + auto decorellated = original.color_dct[c][i] - factor * restored_y; + color_dct_[c][i] = + to_int(decorellated * dct_factor * quant / kChannelWeight[c]); + } + } + for (int i = 0; i < 32; ++i) { + const float dct_factor = (i == 0) ? kSqrt2 : 1.0f; + sigma_dct_[i] = + to_int(original.sigma_dct[i] * dct_factor * quant / kChannelWeight[3]); + } +} + +Status QuantizedSpline::Dequantize(const Spline::Point& starting_point, + const int32_t quantization_adjustment, + const float y_to_x, const float y_to_b, + const uint64_t image_size, + uint64_t* total_estimated_area_reached, + Spline& result) const { + result.control_points.clear(); + result.control_points.reserve(control_points_.size() + 1); + float px = roundf(starting_point.x); + float py = roundf(starting_point.y); + JXL_RETURN_IF_ERROR(ValidateSplinePointPos(px, py)); + int current_x = static_cast(px); + int current_y = static_cast(py); + result.control_points.push_back(Spline::Point{static_cast(current_x), + static_cast(current_y)}); + int current_delta_x = 0, current_delta_y = 0; + size_t manhattan_distance = 0; + for (const auto& point : control_points_) { + current_delta_x += point.first; + current_delta_y += point.second; + manhattan_distance += abs(current_delta_x) + abs(current_delta_y); + JXL_RETURN_IF_ERROR( + ValidateSplinePointPos(current_delta_x, current_delta_y)); + current_x += current_delta_x; + current_y += current_delta_y; + JXL_RETURN_IF_ERROR(ValidateSplinePointPos(current_x, current_y)); + result.control_points.push_back(Spline::Point{ + static_cast(current_x), static_cast(current_y)}); + } + + const auto inv_quant = InvAdjustedQuant(quantization_adjustment); + for (int c = 0; c < 3; ++c) { + for (int i = 0; i < 32; ++i) { + const float inv_dct_factor = (i == 0) ? kSqrt0_5 : 1.0f; + result.color_dct[c][i] = + color_dct_[c][i] * inv_dct_factor * kChannelWeight[c] * inv_quant; + } + } + for (int i = 0; i < 32; ++i) { + result.color_dct[0][i] += y_to_x * result.color_dct[1][i]; + result.color_dct[2][i] += y_to_b * result.color_dct[1][i]; + } + uint64_t width_estimate = 0; + + uint64_t color[3] = {}; + for (int c = 0; c < 3; ++c) { + for (int i = 0; i < 32; ++i) { + color[c] += + static_cast(ceil(inv_quant * std::abs(color_dct_[c][i]))); + } + } + color[0] += static_cast(ceil(abs(y_to_x))) * color[1]; + color[2] += static_cast(ceil(abs(y_to_b))) * color[1]; + // This is not taking kChannelWeight into account, but up to constant factors + // it gives an indication of the influence of the color values on the area + // that will need to be rendered. + uint64_t logcolor = std::max( + uint64_t(1), + static_cast(CeilLog2Nonzero( + uint64_t(1) + std::max(color[1], std::max(color[0], color[2]))))); + + for (int i = 0; i < 32; ++i) { + const float inv_dct_factor = (i == 0) ? kSqrt0_5 : 1.0f; + result.sigma_dct[i] = + sigma_dct_[i] * inv_dct_factor * kChannelWeight[3] * inv_quant; + // If we include the factor kChannelWeight[3]=.3333f here, we get a + // realistic area estimate. We leave it out to simplify the calculations, + // and understand that this way we underestimate the area by a factor of + // 1/(0.3333*0.3333). This is taken into account in the limits below. + uint64_t weight = std::max( + uint64_t(1), + static_cast(ceil(inv_quant * std::abs(sigma_dct_[i])))); + width_estimate += weight * weight * logcolor; + } + *total_estimated_area_reached += (width_estimate * manhattan_distance); + if (*total_estimated_area_reached > + std::min((1024 * image_size + (uint64_t(1) << 32)), + (uint64_t(1) << 42))) { + return JXL_FAILURE("Too large total_estimated_area_reached: %" PRIu64, + *total_estimated_area_reached); + } + + return true; +} + +Status QuantizedSpline::Decode(const std::vector& context_map, + ANSSymbolReader* const decoder, + BitReader* const br, + const size_t max_control_points, + size_t* total_num_control_points) { + const size_t num_control_points = + decoder->ReadHybridUint(kNumControlPointsContext, br, context_map); + *total_num_control_points += num_control_points; + if (*total_num_control_points > max_control_points) { + return JXL_FAILURE("Too many control points: %" PRIuS, + *total_num_control_points); + } + control_points_.resize(num_control_points); + // Maximal image dimension. + constexpr int64_t kDeltaLimit = 1u << 30; + for (std::pair& control_point : control_points_) { + control_point.first = UnpackSigned( + decoder->ReadHybridUint(kControlPointsContext, br, context_map)); + control_point.second = UnpackSigned( + decoder->ReadHybridUint(kControlPointsContext, br, context_map)); + // Check delta-deltas are not outrageous; it is not in spec, but there is + // no reason to allow larger values. + if ((control_point.first >= kDeltaLimit) || + (control_point.first <= -kDeltaLimit) || + (control_point.second >= kDeltaLimit) || + (control_point.second <= -kDeltaLimit)) { + return JXL_FAILURE("Spline delta-delta is out of bounds"); + } + } + + const auto decode_dct = [decoder, br, &context_map](int dct[32]) -> Status { + for (int i = 0; i < 32; ++i) { + dct[i] = + UnpackSigned(decoder->ReadHybridUint(kDCTContext, br, context_map)); + } + return true; + }; + for (int c = 0; c < 3; ++c) { + JXL_RETURN_IF_ERROR(decode_dct(color_dct_[c])); + } + JXL_RETURN_IF_ERROR(decode_dct(sigma_dct_)); + return true; +} + +void Splines::Clear() { + quantization_adjustment_ = 0; + splines_.clear(); + starting_points_.clear(); + segments_.clear(); + segment_indices_.clear(); + segment_y_start_.clear(); +} + +Status Splines::Decode(jxl::BitReader* br, const size_t num_pixels) { + std::vector context_map; + ANSCode code; + JXL_RETURN_IF_ERROR( + DecodeHistograms(br, kNumSplineContexts, &code, &context_map)); + ANSSymbolReader decoder(&code, br); + const size_t num_splines = + 1 + decoder.ReadHybridUint(kNumSplinesContext, br, context_map); + size_t max_control_points = std::min( + kMaxNumControlPoints, num_pixels / kMaxNumControlPointsPerPixelRatio); + if (num_splines > max_control_points) { + return JXL_FAILURE("Too many splines: %" PRIuS, num_splines); + } + JXL_RETURN_IF_ERROR(DecodeAllStartingPoints(&starting_points_, br, &decoder, + context_map, num_splines)); + + quantization_adjustment_ = UnpackSigned( + decoder.ReadHybridUint(kQuantizationAdjustmentContext, br, context_map)); + + splines_.clear(); + splines_.reserve(num_splines); + size_t num_control_points = num_splines; + for (size_t i = 0; i < num_splines; ++i) { + QuantizedSpline spline; + JXL_RETURN_IF_ERROR(spline.Decode(context_map, &decoder, br, + max_control_points, &num_control_points)); + splines_.push_back(std::move(spline)); + } + + JXL_RETURN_IF_ERROR(decoder.CheckANSFinalState()); + + if (!HasAny()) { + return JXL_FAILURE("Decoded splines but got none"); + } + + return true; +} + +void Splines::AddTo(Image3F* const opsin, const Rect& opsin_rect, + const Rect& image_rect) const { + return Apply(opsin, opsin_rect, image_rect); +} +void Splines::AddToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y, + float* JXL_RESTRICT row_b, const Rect& image_row) const { + return ApplyToRow(row_x, row_y, row_b, image_row); +} + +void Splines::SubtractFrom(Image3F* const opsin) const { + return Apply(opsin, Rect(*opsin), Rect(*opsin)); +} + +Status Splines::InitializeDrawCache(const size_t image_xsize, + const size_t image_ysize, + const ColorCorrelationMap& cmap) { + // TODO(veluca): avoid storing segments that are entirely outside image + // boundaries. + segments_.clear(); + segment_indices_.clear(); + segment_y_start_.clear(); + std::vector> segments_by_y; + std::vector intermediate_points; + uint64_t total_estimated_area_reached = 0; + std::vector splines; + for (size_t i = 0; i < splines_.size(); ++i) { + Spline spline; + JXL_RETURN_IF_ERROR(splines_[i].Dequantize( + starting_points_[i], quantization_adjustment_, cmap.YtoXRatio(0), + cmap.YtoBRatio(0), image_xsize * image_ysize, + &total_estimated_area_reached, spline)); + if (std::adjacent_find(spline.control_points.begin(), + spline.control_points.end()) != + spline.control_points.end()) { + // Otherwise division by zero might occur. Once control points coincide, + // the direction of curve is undefined... + return JXL_FAILURE( + "identical successive control points in spline %" PRIuS, i); + } + splines.push_back(spline); + } + // TODO(firsching) Change this into a JXL_FAILURE for level 5 codestreams. + if (total_estimated_area_reached > + std::min((8 * image_xsize * image_ysize + (uint64_t(1) << 25)), + (uint64_t(1) << 30))) { + JXL_WARNING( + "Large total_estimated_area_reached, expect slower decoding: %" PRIu64, + total_estimated_area_reached); + } + + for (Spline& spline : splines) { + std::vector> points_to_draw; + auto add_point = [&](const Spline::Point& point, const float multiplier) { + points_to_draw.emplace_back(point, multiplier); + }; + intermediate_points.clear(); + DrawCentripetalCatmullRomSpline(spline.control_points, intermediate_points); + ForEachEquallySpacedPoint(intermediate_points, add_point); + const float arc_length = + (points_to_draw.size() - 2) * kDesiredRenderingDistance + + points_to_draw.back().second; + if (arc_length <= 0.f) { + // This spline wouldn't have any effect. + continue; + } + HWY_DYNAMIC_DISPATCH(SegmentsFromPoints) + (spline, points_to_draw, arc_length, segments_, segments_by_y); + } + + // TODO(eustas): consider linear sorting here. + std::sort(segments_by_y.begin(), segments_by_y.end()); + segment_indices_.resize(segments_by_y.size()); + segment_y_start_.resize(image_ysize + 1); + for (size_t i = 0; i < segments_by_y.size(); i++) { + segment_indices_[i] = segments_by_y[i].second; + size_t y = segments_by_y[i].first; + if (y < image_ysize) { + segment_y_start_[y + 1]++; + } + } + for (size_t y = 0; y < image_ysize; y++) { + segment_y_start_[y + 1] += segment_y_start_[y]; + } + return true; +} + +template +void Splines::ApplyToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y, + float* JXL_RESTRICT row_b, + const Rect& image_row) const { + if (segments_.empty()) return; + JXL_ASSERT(image_row.ysize() == 1); + for (size_t iy = 0; iy < image_row.ysize(); iy++) { + HWY_DYNAMIC_DISPATCH(DrawSegments) + (row_x, row_y, row_b, image_row.Line(iy), add, segments_.data(), + segment_indices_.data(), segment_y_start_.data()); + } +} + +template +void Splines::Apply(Image3F* const opsin, const Rect& opsin_rect, + const Rect& image_rect) const { + if (segments_.empty()) return; + for (size_t iy = 0; iy < image_rect.ysize(); iy++) { + const size_t y0 = opsin_rect.Line(iy).y0(); + const size_t x0 = opsin_rect.x0(); + ApplyToRow(opsin->PlaneRow(0, y0) + x0, opsin->PlaneRow(1, y0) + x0, + opsin->PlaneRow(2, y0) + x0, image_rect.Line(iy)); + } +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/third_party/jpeg-xl/lib/jxl/splines.h b/third_party/jpeg-xl/lib/jxl/splines.h new file mode 100644 index 0000000000..c8dad3417c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/splines.h @@ -0,0 +1,148 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_SPLINES_H_ +#define LIB_JXL_SPLINES_H_ + +#include +#include + +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/image.h" + +namespace jxl { + +static constexpr float kDesiredRenderingDistance = 1.f; + +enum SplineEntropyContexts : size_t { + kQuantizationAdjustmentContext = 0, + kStartingPositionContext, + kNumSplinesContext, + kNumControlPointsContext, + kControlPointsContext, + kDCTContext, + kNumSplineContexts +}; + +struct Spline { + struct Point { + Point() : x(0.0f), y(0.0f) {} + Point(float x, float y) : x(x), y(y) {} + float x, y; + bool operator==(const Point& other) const { + return std::fabs(x - other.x) < 1e-3f && std::fabs(y - other.y) < 1e-3f; + } + }; + std::vector control_points; + // X, Y, B. + float color_dct[3][32]; + // Splines are draws by normalized Gaussian splatting. This controls the + // Gaussian's parameter along the spline. + float sigma_dct[32]; +}; + +class QuantizedSplineEncoder; + +class QuantizedSpline { + public: + QuantizedSpline() = default; + explicit QuantizedSpline(const Spline& original, + int32_t quantization_adjustment, float y_to_x, + float y_to_b); + + Status Dequantize(const Spline::Point& starting_point, + int32_t quantization_adjustment, float y_to_x, float y_to_b, + uint64_t image_size, uint64_t* total_estimated_area_reached, + Spline& result) const; + + Status Decode(const std::vector& context_map, + ANSSymbolReader* decoder, BitReader* br, + size_t max_control_points, size_t* total_num_control_points); + + private: + friend class QuantizedSplineEncoder; + + std::vector> + control_points_; // Double delta-encoded. + int color_dct_[3][32] = {}; + int sigma_dct_[32] = {}; +}; + +// A single "drawable unit" of a spline, i.e. a line of the region in which we +// render each Gaussian. The structure doesn't actually depend on the exact +// row, which allows reuse for different y values (which are tracked +// separately). +struct SplineSegment { + float center_x, center_y; + float maximum_distance; + float inv_sigma; + float sigma_over_4_times_intensity; + float color[3]; +}; + +class Splines { + public: + Splines() = default; + explicit Splines(const int32_t quantization_adjustment, + std::vector splines, + std::vector starting_points) + : quantization_adjustment_(quantization_adjustment), + splines_(std::move(splines)), + starting_points_(std::move(starting_points)) {} + + bool HasAny() const { return !splines_.empty(); } + + void Clear(); + + Status Decode(BitReader* br, size_t num_pixels); + + void AddTo(Image3F* opsin, const Rect& opsin_rect, + const Rect& image_rect) const; + void AddToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y, + float* JXL_RESTRICT row_b, const Rect& image_row) const; + void SubtractFrom(Image3F* opsin) const; + + const std::vector& QuantizedSplines() const { + return splines_; + } + const std::vector& StartingPoints() const { + return starting_points_; + } + + int32_t GetQuantizationAdjustment() const { return quantization_adjustment_; } + + Status InitializeDrawCache(size_t image_xsize, size_t image_ysize, + const ColorCorrelationMap& cmap); + + private: + template + void ApplyToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y, + float* JXL_RESTRICT row_b, const Rect& image_row) const; + template + void Apply(Image3F* opsin, const Rect& opsin_rect, + const Rect& image_rect) const; + + // If positive, quantization weights are multiplied by 1 + this/8, which + // increases precision. If negative, they are divided by 1 - this/8. If 0, + // they are unchanged. + int32_t quantization_adjustment_ = 0; + std::vector splines_; + std::vector starting_points_; + std::vector segments_; + std::vector segment_indices_; + std::vector segment_y_start_; +}; + +} // namespace jxl + +#endif // LIB_JXL_SPLINES_H_ diff --git a/third_party/jpeg-xl/lib/jxl/splines_gbench.cc b/third_party/jpeg-xl/lib/jxl/splines_gbench.cc new file mode 100644 index 0000000000..78ff6d41c0 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/splines_gbench.cc @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "benchmark/benchmark.h" +#include "lib/jxl/splines.h" + +namespace jxl { +namespace { + +constexpr int kQuantizationAdjustment = 0; +const ColorCorrelationMap* const cmap = new ColorCorrelationMap; +const float kYToX = cmap->YtoXRatio(0); +const float kYToB = cmap->YtoBRatio(0); + +void BM_Splines(benchmark::State& state) { + const size_t n = state.range(); + + std::vector spline_data = { + {/*control_points=*/{ + {9, 54}, {118, 159}, {97, 3}, {10, 40}, {150, 25}, {120, 300}}, + /*color_dct=*/ + {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}}, + /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}}}; + std::vector quantized_splines; + std::vector starting_points; + for (const Spline& spline : spline_data) { + quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX, + kYToB); + starting_points.push_back(spline.control_points.front()); + } + Splines splines(kQuantizationAdjustment, std::move(quantized_splines), + std::move(starting_points)); + + Image3F drawing_area(320, 320); + ZeroFillImage(&drawing_area); + for (auto _ : state) { + for (size_t i = 0; i < n; ++i) { + JXL_CHECK(splines.InitializeDrawCache(drawing_area.xsize(), + drawing_area.ysize(), *cmap)); + splines.AddTo(&drawing_area, Rect(drawing_area), Rect(drawing_area)); + } + } + + state.SetItemsProcessed(n * state.iterations()); +} + +BENCHMARK(BM_Splines)->Range(1, 1 << 10); + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/splines_test.cc b/third_party/jpeg-xl/lib/jxl/splines_test.cc new file mode 100644 index 0000000000..8d6bc7ed1c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/splines_test.cc @@ -0,0 +1,348 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/splines.h" + +#include "lib/extras/codec.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +namespace jxl { + +std::ostream& operator<<(std::ostream& os, const Spline::Point& p) { + return os << "(" << p.x << ", " << p.y << ")"; +} + +std::ostream& operator<<(std::ostream& os, const Spline& spline) { + return os << "(spline with " << spline.control_points.size() + << " control points)"; +} + +namespace { + +using ::testing::AllOf; +using ::testing::Field; +using ::testing::FloatNear; +using ::testing::Pointwise; + +constexpr int kQuantizationAdjustment = 0; +const ColorCorrelationMap* const cmap = new ColorCorrelationMap; +const float kYToX = cmap->YtoXRatio(0); +const float kYToB = cmap->YtoBRatio(0); + +constexpr float kTolerance = 0.003125; + +std::vector DequantizeSplines(const Splines& splines) { + const auto& quantized_splines = splines.QuantizedSplines(); + const auto& starting_points = splines.StartingPoints(); + JXL_CHECK(quantized_splines.size() == starting_points.size()); + + std::vector dequantized; + uint64_t total = 0; + for (size_t i = 0; i < quantized_splines.size(); ++i) { + dequantized.emplace_back(); + JXL_CHECK(quantized_splines[i].Dequantize( + starting_points[i], kQuantizationAdjustment, kYToX, kYToB, 2u << 30u, + &total, dequantized.back())); + } + return dequantized; +} + +MATCHER(ControlPointIs, "") { + const Spline::Point& actual = std::get<0>(arg); + const Spline::Point& expected = std::get<1>(arg); + return testing::ExplainMatchResult( + AllOf(Field(&Spline::Point::x, FloatNear(expected.x, kTolerance)), + Field(&Spline::Point::y, FloatNear(expected.y, kTolerance))), + actual, result_listener); +} + +MATCHER(ControlPointsMatch, "") { + const Spline& actual = std::get<0>(arg); + const Spline& expected = std::get<1>(arg); + return testing::ExplainMatchResult( + Field(&Spline::control_points, + Pointwise(ControlPointIs(), expected.control_points)), + actual, result_listener); +} + +MATCHER(SplinesMatch, "") { + const Spline& actual = std::get<0>(arg); + const Spline& expected = std::get<1>(arg); + if (!testing::ExplainMatchResult(ControlPointsMatch(), arg, + result_listener)) { + return false; + } + for (int i = 0; i < 3; ++i) { + size_t color_dct_size = + sizeof(expected.color_dct[i]) / sizeof(expected.color_dct[i][0]); + for (size_t j = 0; j < color_dct_size; j++) { + testing::StringMatchResultListener color_dct_listener; + if (!testing::ExplainMatchResult( + FloatNear(expected.color_dct[i][j], kTolerance), + actual.color_dct[i][j], &color_dct_listener)) { + *result_listener << ", where color_dct[" << i << "][" << j + << "] don't match, " << color_dct_listener.str(); + return false; + } + } + } + size_t sigma_dct_size = + sizeof(expected.sigma_dct) / sizeof(expected.sigma_dct[0]); + for (size_t i = 0; i < sigma_dct_size; i++) { + testing::StringMatchResultListener sigma_listener; + if (!testing::ExplainMatchResult( + FloatNear(expected.sigma_dct[i], kTolerance), actual.sigma_dct[i], + &sigma_listener)) { + *result_listener << ", where sigma_dct[" << i << "] don't match, " + << sigma_listener.str(); + return false; + } + } + return true; +} + +} // namespace + +TEST(SplinesTest, Serialization) { + std::vector spline_data = { + {/*control_points=*/{ + {109, 54}, {218, 159}, {80, 3}, {110, 274}, {94, 185}, {17, 277}}, + /*color_dct=*/ + {{36.3, 39.7, 23.2, 67.5, 4.4, 71.5, 62.3, 32.3, 92.2, 10.1, 10.8, + 9.2, 6.1, 10.5, 79.1, 7, 24.6, 90.8, 5.5, 84, 43.8, 49, + 33.5, 78.9, 54.5, 77.9, 62.1, 51.4, 36.4, 14.3, 83.7, 35.4}, + {9.4, 53.4, 9.5, 74.9, 72.7, 26.7, 7.9, 0.9, 84.9, 23.2, 26.5, + 31.1, 91, 11.7, 74.1, 39.3, 23.7, 82.5, 4.8, 2.7, 61.2, 96.4, + 13.7, 66.7, 62.9, 82.4, 5.9, 98.7, 21.5, 7.9, 51.7, 63.1}, + {48, 39.3, 6.9, 26.3, 33.3, 6.2, 1.7, 98.9, 59.9, 59.6, 95, + 61.3, 82.7, 53, 6.1, 30.4, 34.7, 96.9, 93.4, 17, 38.8, 80.8, + 63, 18.6, 43.6, 32.3, 61, 20.2, 24.3, 28.3, 69.1, 62.4}}, + /*sigma_dct=*/{32.7, 21.5, 44.4, 1.8, 45.8, 90.6, 29.3, 59.2, + 23.7, 85.2, 84.8, 27.2, 42.1, 84.1, 50.6, 17.6, + 93.7, 4.9, 2.6, 69.8, 94.9, 52, 24.3, 18.8, + 12.1, 95.7, 28.5, 81.4, 89.9, 31.4, 74.8, 52}}, + {/*control_points=*/{{172, 309}, + {196, 277}, + {42, 238}, + {114, 350}, + {307, 290}, + {316, 269}, + {124, 66}, + {233, 267}}, + /*color_dct=*/ + {{15, 28.9, 22, 6.6, 41.8, 83, 8.6, 56.8, 68.9, 9.7, 5.4, + 19.8, 70.8, 90, 52.5, 65.2, 7.8, 23.5, 26.4, 72.2, 64.7, 87.1, + 1.3, 67.5, 46, 68.4, 65.4, 35.5, 29.1, 13, 41.6, 23.9}, + {47.7, 79.4, 62.7, 29.1, 96.8, 18.5, 17.6, 15.2, 80.5, 56, 96.2, + 59.9, 26.7, 96.1, 92.3, 42.1, 35.8, 54, 23.2, 55, 76, 35.8, + 58.4, 88.7, 2.4, 78.1, 95.6, 27.5, 6.6, 78.5, 24.1, 69.8}, + {43.8, 96.5, 0.9, 95.1, 49.1, 71.2, 25.1, 33.6, 75.2, 95, 82.1, + 19.7, 10.5, 44.9, 50, 93.3, 83.5, 99.5, 64.6, 54, 3.5, 99.7, + 45.3, 82.1, 22.4, 37.9, 60, 32.2, 12.6, 4.6, 65.5, 96.4}}, + /*sigma_dct=*/{72.5, 2.6, 41.7, 2.2, 39.7, 79.1, 69.6, 19.9, + 92.3, 71.5, 41.9, 62.1, 30, 49.4, 70.3, 45.3, + 62.5, 47.2, 46.7, 41.2, 90.8, 46.8, 91.2, 55, + 8.1, 69.6, 25.4, 84.7, 61.7, 27.6, 3.7, 46.9}}, + {/*control_points=*/{{100, 186}, + {257, 97}, + {170, 49}, + {25, 169}, + {309, 104}, + {232, 237}, + {385, 101}, + {122, 168}, + {26, 300}, + {390, 88}}, + /*color_dct=*/ + {{16.9, 64.8, 4.2, 10.6, 23.5, 17, 79.3, 5.7, 60.4, 16.6, 94.9, + 63.7, 87.6, 10.5, 3.8, 61.1, 22.9, 81.9, 80.4, 40.5, 45.9, 25.4, + 39.8, 30, 50.2, 90.4, 27.9, 93.7, 65.1, 48.2, 22.3, 43.9}, + {24.9, 66, 3.5, 90.2, 97.1, 15.8, 35.6, 0.6, 68, 39.6, 24.4, + 85.9, 57.7, 77.6, 47.5, 67.9, 4.3, 5.4, 91.2, 58.5, 0.1, 52.2, + 3.5, 47.8, 63.2, 43.5, 85.8, 35.8, 50.2, 35.9, 19.2, 48.2}, + {82.8, 44.9, 76.4, 39.5, 94.1, 14.3, 89.8, 10, 10.5, 74.5, 56.3, + 65.8, 7.8, 23.3, 52.8, 99.3, 56.8, 46, 76.7, 13.5, 67, 22.4, + 29.9, 43.3, 70.3, 26, 74.3, 53.9, 62, 19.1, 49.3, 46.7}}, + /*sigma_dct=*/{83.5, 1.7, 25.1, 18.7, 46.5, 75.3, 28, 62.3, + 50.3, 23.3, 85.6, 96, 45.8, 33.1, 33.4, 52.9, + 26.3, 58.5, 19.6, 70, 92.6, 22.5, 57, 21.6, + 76.8, 87.5, 22.9, 66.3, 35.7, 35.6, 56.8, 67.2}}, + }; + + std::vector quantized_splines; + std::vector starting_points; + for (const Spline& spline : spline_data) { + quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX, + kYToB); + starting_points.push_back(spline.control_points.front()); + } + + Splines splines(kQuantizationAdjustment, std::move(quantized_splines), + std::move(starting_points)); + const std::vector quantized_spline_data = DequantizeSplines(splines); + EXPECT_THAT(quantized_spline_data, + Pointwise(ControlPointsMatch(), spline_data)); + + BitWriter writer; + EncodeSplines(splines, &writer, kLayerSplines, HistogramParams(), nullptr); + writer.ZeroPadToByte(); + const size_t bits_written = writer.BitsWritten(); + + printf("Wrote %" PRIuS " bits of splines.\n", bits_written); + + BitReader reader(writer.GetSpan()); + Splines decoded_splines; + ASSERT_TRUE(decoded_splines.Decode(&reader, /*num_pixels=*/1000)); + ASSERT_TRUE(reader.JumpToByteBoundary()); + EXPECT_EQ(reader.TotalBitsConsumed(), bits_written); + ASSERT_TRUE(reader.Close()); + + const std::vector decoded_spline_data = + DequantizeSplines(decoded_splines); + EXPECT_THAT(decoded_spline_data, + Pointwise(SplinesMatch(), quantized_spline_data)); +} + +#ifdef JXL_CRASH_ON_ERROR +TEST(SplinesTest, DISABLED_TooManySplinesTest) { +#else +TEST(SplinesTest, TooManySplinesTest) { +#endif + // This is more than the limit for 1000 pixels. + const size_t kNumSplines = 300; + + std::vector quantized_splines; + std::vector starting_points; + for (size_t i = 0; i < kNumSplines; i++) { + Spline spline = { + /*control_points=*/{{1.f + i, 2}, {10.f + i, 25}, {30.f + i, 300}}, + /*color_dct=*/ + {{1.f, 0.2f, 0.1f}, {35.7f, 10.3f}, {35.7f, 7.8f}}, + /*sigma_dct=*/{10.f, 0.f, 0.f, 2.f}}; + quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX, + kYToB); + starting_points.push_back(spline.control_points.front()); + } + + Splines splines(kQuantizationAdjustment, std::move(quantized_splines), + std::move(starting_points)); + BitWriter writer; + EncodeSplines(splines, &writer, kLayerSplines, + HistogramParams(SpeedTier::kFalcon, 1), nullptr); + writer.ZeroPadToByte(); + // Re-read splines. + BitReader reader(writer.GetSpan()); + Splines decoded_splines; + EXPECT_FALSE(decoded_splines.Decode(&reader, /*num_pixels=*/1000)); + EXPECT_TRUE(reader.Close()); +} + +#ifdef JXL_CRASH_ON_ERROR +TEST(SplinesTest, DISABLED_DuplicatePoints) { +#else +TEST(SplinesTest, DuplicatePoints) { +#endif + std::vector control_points{ + {9, 54}, {118, 159}, {97, 3}, // Repeated. + {97, 3}, {10, 40}, {150, 25}, {120, 300}}; + Spline spline{control_points, + /*color_dct=*/ + {{1.f, 0.2f, 0.1f}, {35.7f, 10.3f}, {35.7f, 7.8f}}, + /*sigma_dct=*/{10.f, 0.f, 0.f, 2.f}}; + std::vector spline_data{spline}; + std::vector quantized_splines; + std::vector starting_points; + for (const Spline& spline : spline_data) { + quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX, + kYToB); + starting_points.push_back(spline.control_points.front()); + } + Splines splines(kQuantizationAdjustment, std::move(quantized_splines), + std::move(starting_points)); + + Image3F image(320, 320); + ZeroFillImage(&image); + EXPECT_FALSE( + splines.InitializeDrawCache(image.xsize(), image.ysize(), *cmap)); +} + +TEST(SplinesTest, Drawing) { + CodecInOut io_expected; + const PaddedBytes orig = jxl::test::ReadTestData("jxl/splines.pfm"); + ASSERT_TRUE(SetFromBytes(Span(orig), &io_expected, + /*pool=*/nullptr)); + + std::vector control_points{{9, 54}, {118, 159}, {97, 3}, + {10, 40}, {150, 25}, {120, 300}}; + // Use values that survive quant/decorellation roundtrip. + const Spline spline{ + control_points, + /*color_dct=*/ + {{0.4989345073699951171875000f, 0.4997999966144561767578125f}, + {0.4772970676422119140625000f, 0.f, 0.5250000357627868652343750f}, + {-0.0176776945590972900390625f, 0.4900000095367431640625000f, + 0.5250000357627868652343750f}}, + /*sigma_dct=*/ + {0.9427147507667541503906250f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.6665999889373779296875000f}}; + std::vector spline_data = {spline}; + std::vector quantized_splines; + std::vector starting_points; + for (const Spline& spline : spline_data) { + quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX, + kYToB); + starting_points.push_back(spline.control_points.front()); + } + Splines splines(kQuantizationAdjustment, std::move(quantized_splines), + std::move(starting_points)); + + Image3F image(320, 320); + ZeroFillImage(&image); + ASSERT_TRUE(splines.InitializeDrawCache(image.xsize(), image.ysize(), *cmap)); + splines.AddTo(&image, Rect(image), Rect(image)); + + CodecInOut io_actual; + io_actual.SetFromImage(CopyImage(image), ColorEncoding::SRGB()); + ASSERT_TRUE(io_actual.frames[0].TransformTo(io_expected.Main().c_current(), + GetJxlCms())); + + JXL_ASSERT_OK(VerifyRelativeError( + *io_expected.Main().color(), *io_actual.Main().color(), 1e-2f, 1e-1f, _)); +} + +TEST(SplinesTest, ClearedEveryFrame) { + CodecInOut io_expected; + const PaddedBytes bytes_expected = + jxl::test::ReadTestData("jxl/spline_on_first_frame.png"); + ASSERT_TRUE(SetFromBytes(Span(bytes_expected), &io_expected, + /*pool=*/nullptr)); + CodecInOut io_actual; + const PaddedBytes bytes_actual = + jxl::test::ReadTestData("jxl/spline_on_first_frame.jxl"); + ASSERT_TRUE( + test::DecodeFile({}, Span(bytes_actual), &io_actual)); + + ASSERT_TRUE( + io_actual.frames[0].TransformTo(ColorEncoding::SRGB(), GetJxlCms())); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < io_actual.ysize(); ++y) { + float* const JXL_RESTRICT row = io_actual.Main().color()->PlaneRow(c, y); + for (size_t x = 0; x < io_actual.xsize(); ++x) { + row[x] = Clamp1(row[x], 0.f, 1.f); + } + } + } + JXL_ASSERT_OK(VerifyRelativeError( + *io_expected.Main().color(), *io_actual.Main().color(), 1e-2f, 1e-1f, _)); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/test_image.cc b/third_party/jpeg-xl/lib/jxl/test_image.cc new file mode 100644 index 0000000000..af1d1293ef --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/test_image.cc @@ -0,0 +1,453 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/test_image.h" + +#include + +#include +#include +#include + +#include "lib/extras/dec/color_description.h" +#include "lib/extras/dec/color_hints.h" +#include "lib/extras/dec/decode.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/random.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" + +namespace jxl { +namespace test { + +namespace { + +void StoreValue(float val, size_t bits_per_sample, JxlPixelFormat format, + uint8_t** out) { + const float mul = (1u << bits_per_sample) - 1; + if (format.data_type == JXL_TYPE_UINT8) { + **out = val * mul; + } else if (format.data_type == JXL_TYPE_UINT16) { + uint16_t uval = val * mul; + if (SwapEndianness(format.endianness)) { + uval = JXL_BSWAP16(uval); + } + memcpy(*out, &uval, 2); + } else if (format.data_type == JXL_TYPE_FLOAT) { + // TODO(szabadka) Add support for custom bits / exponent bits floats. + if (SwapEndianness(format.endianness)) { + val = BSwapFloat(val); + } + memcpy(*out, &val, 4); + } else { + // TODO(szabadka) Add support for FLOAT16. + } + *out += extras::PackedImage::BitsPerChannel(format.data_type) / 8; +} + +void FillPackedImage(size_t bits_per_sample, uint16_t seed, + extras::PackedImage* image) { + const size_t xsize = image->xsize; + const size_t ysize = image->ysize; + const JxlPixelFormat format = image->format; + + // Cause more significant image difference for successive seeds. + Rng generator(seed); + + // Returns random integer in interval [0, max_value) + auto rngu = [&generator](size_t max_value) -> size_t { + return generator.UniformU(0, max_value); + }; + + // Returns random float in interval [0.0, max_value) + auto rngf = [&generator](float max_value) { + return generator.UniformF(0.0f, max_value); + }; + + // Dark background gradient color + float r0 = rngf(0.5f); + float g0 = rngf(0.5f); + float b0 = rngf(0.5f); + float a0 = rngf(0.5f); + float r1 = rngf(0.5f); + float g1 = rngf(0.5f); + float b1 = rngf(0.5f); + float a1 = rngf(0.5f); + + // Circle with different color + size_t circle_x = rngu(xsize); + size_t circle_y = rngu(ysize); + size_t circle_r = rngu(std::min(xsize, ysize)); + + // Rectangle with random noise + size_t rect_x0 = rngu(xsize); + size_t rect_y0 = rngu(ysize); + size_t rect_x1 = rngu(xsize); + size_t rect_y1 = rngu(ysize); + if (rect_x1 < rect_x0) std::swap(rect_x0, rect_y1); + if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1); + + // Create pixel content to test, actual content does not matter as long as it + // can be compared after roundtrip. + uint8_t* out = reinterpret_cast(image->pixels()); + const float imul16 = 1.0f / 65536.0f; + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + float r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize; + float g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize; + float b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize; + float a = a0 * (ysize - y - 1) / ysize + a1 * y / ysize; + // put some shape in there for visual debugging + if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) < + circle_r * circle_r) { + r = std::min(1.0f, ((65535 - x * y) ^ seed) * imul16); + g = std::min(1.0f, ((x << 8) + y + seed) * imul16); + b = std::min(1.0f, ((y << 8) + x * seed) * imul16); + a = std::min(1.0f, (32768 + x * 256 - y) * imul16); + } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) { + r = rngf(1.0f); + g = rngf(1.0f); + b = rngf(1.0f); + a = rngf(1.0f); + } + if (format.num_channels == 1) { + StoreValue(g, bits_per_sample, format, &out); + } else if (format.num_channels == 2) { + StoreValue(g, bits_per_sample, format, &out); + StoreValue(a, bits_per_sample, format, &out); + } else if (format.num_channels == 3) { + StoreValue(r, bits_per_sample, format, &out); + StoreValue(g, bits_per_sample, format, &out); + StoreValue(b, bits_per_sample, format, &out); + } else if (format.num_channels == 4) { + StoreValue(r, bits_per_sample, format, &out); + StoreValue(g, bits_per_sample, format, &out); + StoreValue(b, bits_per_sample, format, &out); + StoreValue(a, bits_per_sample, format, &out); + } + } + } +} + +} // namespace + +std::vector GetSomeTestImage(size_t xsize, size_t ysize, + size_t num_channels, uint16_t seed) { + // Cause more significant image difference for successive seeds. + Rng generator(seed); + + // Returns random integer in interval [0, max_value) + auto rng = [&generator](size_t max_value) -> size_t { + return generator.UniformU(0, max_value); + }; + + // Dark background gradient color + uint16_t r0 = rng(32768); + uint16_t g0 = rng(32768); + uint16_t b0 = rng(32768); + uint16_t a0 = rng(32768); + uint16_t r1 = rng(32768); + uint16_t g1 = rng(32768); + uint16_t b1 = rng(32768); + uint16_t a1 = rng(32768); + + // Circle with different color + size_t circle_x = rng(xsize); + size_t circle_y = rng(ysize); + size_t circle_r = rng(std::min(xsize, ysize)); + + // Rectangle with random noise + size_t rect_x0 = rng(xsize); + size_t rect_y0 = rng(ysize); + size_t rect_x1 = rng(xsize); + size_t rect_y1 = rng(ysize); + if (rect_x1 < rect_x0) std::swap(rect_x0, rect_y1); + if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1); + + size_t num_pixels = xsize * ysize; + // 16 bits per channel, big endian, 4 channels + std::vector pixels(num_pixels * num_channels * 2); + // Create pixel content to test, actual content does not matter as long as it + // can be compared after roundtrip. + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + uint16_t r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize; + uint16_t g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize; + uint16_t b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize; + uint16_t a = a0 * (ysize - y - 1) / ysize + a1 * y / ysize; + // put some shape in there for visual debugging + if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) < + circle_r * circle_r) { + r = (65535 - x * y) ^ seed; + g = (x << 8) + y + seed; + b = (y << 8) + x * seed; + a = 32768 + x * 256 - y; + } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) { + r = rng(65536); + g = rng(65536); + b = rng(65536); + a = rng(65536); + } + size_t i = (y * xsize + x) * 2 * num_channels; + pixels[i + 0] = (r >> 8); + pixels[i + 1] = (r & 255); + if (num_channels >= 2) { + // This may store what is called 'g' in the alpha channel of a 2-channel + // image, but that's ok since the content is arbitrary + pixels[i + 2] = (g >> 8); + pixels[i + 3] = (g & 255); + } + if (num_channels >= 3) { + pixels[i + 4] = (b >> 8); + pixels[i + 5] = (b & 255); + } + if (num_channels >= 4) { + pixels[i + 6] = (a >> 8); + pixels[i + 7] = (a & 255); + } + } + } + return pixels; +} + +TestImage::TestImage() { + SetChannels(3); + SetAllBitDepths(8); + SetColorEncoding("RGB_D65_SRG_Rel_SRG"); +} + +TestImage& TestImage::DecodeFromBytes(const PaddedBytes& bytes) { + ColorEncoding c_enc; + JXL_CHECK( + ConvertExternalToInternalColorEncoding(ppf_.color_encoding, &c_enc)); + extras::ColorHints color_hints; + color_hints.Add("color_space", Description(c_enc)); + JXL_CHECK( + extras::DecodeBytes(Span(bytes), color_hints, &ppf_)); + return *this; +} + +TestImage& TestImage::ClearMetadata() { + ppf_.metadata = extras::PackedMetadata(); + return *this; +} + +TestImage& TestImage::SetDimensions(size_t xsize, size_t ysize) { + if (xsize <= ppf_.info.xsize && ysize <= ppf_.info.ysize) { + for (auto& frame : ppf_.frames) { + CropLayerInfo(xsize, ysize, &frame.frame_info.layer_info); + CropImage(xsize, ysize, &frame.color); + for (auto& ec : frame.extra_channels) { + CropImage(xsize, ysize, &ec); + } + } + } else { + JXL_CHECK(ppf_.info.xsize == 0 && ppf_.info.ysize == 0); + } + ppf_.info.xsize = xsize; + ppf_.info.ysize = ysize; + return *this; +} + +TestImage& TestImage::SetChannels(size_t num_channels) { + JXL_CHECK(ppf_.frames.empty()); + JXL_CHECK(!ppf_.preview_frame); + ppf_.info.num_color_channels = num_channels < 3 ? 1 : 3; + ppf_.info.num_extra_channels = num_channels - ppf_.info.num_color_channels; + if (ppf_.info.num_extra_channels > 0 && ppf_.info.alpha_bits == 0) { + ppf_.info.alpha_bits = ppf_.info.bits_per_sample; + ppf_.info.alpha_exponent_bits = ppf_.info.exponent_bits_per_sample; + } + ppf_.extra_channels_info.clear(); + for (size_t i = 1; i < ppf_.info.num_extra_channels; ++i) { + extras::PackedExtraChannel ec; + ec.index = i; + JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &ec.ec_info); + if (ec.ec_info.bits_per_sample == 0) { + ec.ec_info.bits_per_sample = ppf_.info.bits_per_sample; + ec.ec_info.exponent_bits_per_sample = ppf_.info.exponent_bits_per_sample; + } + ppf_.extra_channels_info.emplace_back(std::move(ec)); + } + format_.num_channels = std::min(static_cast(4), num_channels); + if (ppf_.info.num_color_channels == 1 && + ppf_.color_encoding.color_space != JXL_COLOR_SPACE_GRAY) { + SetColorEncoding("Gra_D65_Rel_SRG"); + } + return *this; +} + +// Sets the same bit depth on color, alpha and all extra channels. +TestImage& TestImage::SetAllBitDepths(uint32_t bits_per_sample, + uint32_t exponent_bits_per_sample) { + ppf_.info.bits_per_sample = bits_per_sample; + ppf_.info.exponent_bits_per_sample = exponent_bits_per_sample; + if (ppf_.info.num_extra_channels > 0) { + ppf_.info.alpha_bits = bits_per_sample; + ppf_.info.alpha_exponent_bits = exponent_bits_per_sample; + } + for (size_t i = 0; i < ppf_.extra_channels_info.size(); ++i) { + extras::PackedExtraChannel& ec = ppf_.extra_channels_info[i]; + ec.ec_info.bits_per_sample = bits_per_sample; + ec.ec_info.exponent_bits_per_sample = exponent_bits_per_sample; + } + format_.data_type = DefaultDataType(ppf_.info); + return *this; +} + +TestImage& TestImage::SetDataType(JxlDataType data_type) { + format_.data_type = data_type; + return *this; +} + +TestImage& TestImage::SetEndianness(JxlEndianness endianness) { + format_.endianness = endianness; + return *this; +} + +TestImage& TestImage::SetColorEncoding(const std::string& description) { + JXL_CHECK(ParseDescription(description, &ppf_.color_encoding)); + ColorEncoding c_enc; + JXL_CHECK( + ConvertExternalToInternalColorEncoding(ppf_.color_encoding, &c_enc)); + JXL_CHECK(c_enc.CreateICC()); + PaddedBytes icc = c_enc.ICC(); + ppf_.icc.assign(icc.begin(), icc.end()); + return *this; +} + +TestImage& TestImage::CoalesceGIFAnimationWithAlpha() { + extras::PackedFrame canvas = ppf_.frames[0].Copy(); + JXL_CHECK(canvas.color.format.num_channels == 3); + JXL_CHECK(canvas.color.format.data_type == JXL_TYPE_UINT8); + JXL_CHECK(canvas.extra_channels.size() == 1); + for (size_t i = 1; i < ppf_.frames.size(); i++) { + const extras::PackedFrame& frame = ppf_.frames[i]; + JXL_CHECK(frame.extra_channels.size() == 1); + const JxlLayerInfo& layer_info = frame.frame_info.layer_info; + extras::PackedFrame rendered = canvas.Copy(); + uint8_t* pixels_rendered = + reinterpret_cast(rendered.color.pixels()); + const uint8_t* pixels_frame = + reinterpret_cast(frame.color.pixels()); + uint8_t* alpha_rendered = + reinterpret_cast(rendered.extra_channels[0].pixels()); + const uint8_t* alpha_frame = + reinterpret_cast(frame.extra_channels[0].pixels()); + for (size_t y = 0; y < frame.color.ysize; y++) { + for (size_t x = 0; x < frame.color.xsize; x++) { + size_t idx_frame = y * frame.color.xsize + x; + size_t idx_rendered = ((layer_info.crop_y0 + y) * rendered.color.xsize + + (layer_info.crop_x0 + x)); + if (alpha_frame[idx_frame] != 0) { + memcpy(&pixels_rendered[idx_rendered * 3], + &pixels_frame[idx_frame * 3], 3); + alpha_rendered[idx_rendered] = alpha_frame[idx_frame]; + } + } + } + if (layer_info.save_as_reference != 0) { + canvas = rendered.Copy(); + } + ppf_.frames[i] = std::move(rendered); + } + return *this; +} + +TestImage::Frame::Frame(TestImage* parent, bool is_preview, size_t index) + : parent_(parent), is_preview_(is_preview), index_(index) {} + +void TestImage::Frame::ZeroFill() { + memset(frame().color.pixels(), 0, frame().color.pixels_size); + for (auto& ec : frame().extra_channels) { + memset(ec.pixels(), 0, ec.pixels_size); + } +} + +void TestImage::Frame::RandomFill(uint16_t seed) { + FillPackedImage(ppf().info.bits_per_sample, seed, &frame().color); + for (size_t i = 0; i < ppf().extra_channels_info.size(); ++i) { + FillPackedImage(ppf().extra_channels_info[i].ec_info.bits_per_sample, + seed + 1 + i, &frame().extra_channels[i]); + } +} + +void TestImage::Frame::SetValue(size_t y, size_t x, size_t c, float val) { + const extras::PackedImage& color = frame().color; + JxlPixelFormat format = color.format; + JXL_CHECK(y < ppf().info.ysize); + JXL_CHECK(x < ppf().info.xsize); + JXL_CHECK(c < format.num_channels); + size_t pwidth = extras::PackedImage::BitsPerChannel(format.data_type) / 8; + size_t idx = ((y * color.xsize + x) * format.num_channels + c) * pwidth; + uint8_t* pixels = reinterpret_cast(frame().color.pixels()); + uint8_t* p = pixels + idx; + StoreValue(val, ppf().info.bits_per_sample, frame().color.format, &p); +} + +TestImage::Frame TestImage::AddFrame() { + size_t index = ppf_.frames.size(); + extras::PackedFrame frame(ppf_.info.xsize, ppf_.info.ysize, format_); + for (size_t i = 0; i < ppf_.extra_channels_info.size(); ++i) { + JxlPixelFormat ec_format = {1, format_.data_type, format_.endianness, 0}; + extras::PackedImage image(ppf_.info.xsize, ppf_.info.ysize, ec_format); + frame.extra_channels.emplace_back(std::move(image)); + } + ppf_.frames.emplace_back(std::move(frame)); + return Frame(this, false, index); +} + +TestImage::Frame TestImage::AddPreview(size_t xsize, size_t ysize) { + extras::PackedFrame frame(xsize, ysize, format_); + for (size_t i = 0; i < ppf_.extra_channels_info.size(); ++i) { + JxlPixelFormat ec_format = {1, format_.data_type, format_.endianness, 0}; + extras::PackedImage image(xsize, ysize, ec_format); + frame.extra_channels.emplace_back(std::move(image)); + } + ppf_.preview_frame = make_unique(std::move(frame)); + return Frame(this, true, 0); +} + +void TestImage::CropLayerInfo(size_t xsize, size_t ysize, JxlLayerInfo* info) { + if (info->crop_x0 < static_cast(xsize)) { + info->xsize = std::min(info->xsize, xsize - info->crop_x0); + } else { + info->xsize = 0; + } + if (info->crop_y0 < static_cast(ysize)) { + info->ysize = std::min(info->ysize, ysize - info->crop_y0); + } else { + info->ysize = 0; + } +} + +void TestImage::CropImage(size_t xsize, size_t ysize, + extras::PackedImage* image) { + size_t new_stride = (image->stride / image->xsize) * xsize; + uint8_t* buf = reinterpret_cast(image->pixels()); + for (size_t y = 0; y < ysize; ++y) { + memmove(&buf[y * new_stride], &buf[y * image->stride], new_stride); + } + image->xsize = xsize; + image->ysize = ysize; + image->stride = new_stride; + image->pixels_size = ysize * new_stride; +} + +JxlDataType TestImage::DefaultDataType(const JxlBasicInfo& info) { + if (info.bits_per_sample == 16 && info.exponent_bits_per_sample == 5) { + return JXL_TYPE_FLOAT16; + } else if (info.exponent_bits_per_sample > 0 || info.bits_per_sample > 16) { + return JXL_TYPE_FLOAT; + } else if (info.bits_per_sample > 8) { + return JXL_TYPE_UINT16; + } else { + return JXL_TYPE_UINT8; + } +} + +} // namespace test +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/test_image.h b/third_party/jpeg-xl/lib/jxl/test_image.h new file mode 100644 index 0000000000..0106a4b341 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/test_image.h @@ -0,0 +1,94 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_TEST_IMAGE_H_ +#define LIB_JXL_TEST_IMAGE_H_ + +#include +#include +#include +#include + +#include +#include + +#include "lib/extras/packed_image.h" +#include "lib/jxl/base/padded_bytes.h" + +namespace jxl { +namespace test { + +// Returns a test image with some autogenerated pixel content, using 16 bits per +// channel, big endian order, 1 to 4 channels +// The seed parameter allows to create images with different pixel content. +std::vector GetSomeTestImage(size_t xsize, size_t ysize, + size_t num_channels, uint16_t seed); + +class TestImage { + public: + TestImage(); + + extras::PackedPixelFile& ppf() { return ppf_; } + + TestImage& DecodeFromBytes(const PaddedBytes& bytes); + + TestImage& ClearMetadata(); + + TestImage& SetDimensions(size_t xsize, size_t ysize); + + TestImage& SetChannels(size_t num_channels); + + // Sets the same bit depth on color, alpha and all extra channels. + TestImage& SetAllBitDepths(uint32_t bits_per_sample, + uint32_t exponent_bits_per_sample = 0); + + TestImage& SetDataType(JxlDataType data_type); + + TestImage& SetEndianness(JxlEndianness endianness); + + TestImage& SetColorEncoding(const std::string& description); + + TestImage& CoalesceGIFAnimationWithAlpha(); + + class Frame { + public: + Frame(TestImage* parent, bool is_preview, size_t index); + + void ZeroFill(); + void RandomFill(uint16_t seed = 177); + + void SetValue(size_t y, size_t x, size_t c, float val); + + private: + extras::PackedPixelFile& ppf() const { return parent_->ppf(); } + + extras::PackedFrame& frame() { + return is_preview_ ? *ppf().preview_frame : ppf().frames[index_]; + } + + TestImage* parent_; + bool is_preview_; + size_t index_; + }; + + Frame AddFrame(); + + Frame AddPreview(size_t xsize, size_t ysize); + + private: + extras::PackedPixelFile ppf_; + JxlPixelFormat format_ = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + static void CropLayerInfo(size_t xsize, size_t ysize, JxlLayerInfo* info); + + static void CropImage(size_t xsize, size_t ysize, extras::PackedImage* image); + + static JxlDataType DefaultDataType(const JxlBasicInfo& info); +}; + +} // namespace test +} // namespace jxl + +#endif // LIB_JXL_TEST_IMAGE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/test_utils.cc b/third_party/jpeg-xl/lib/jxl/test_utils.cc new file mode 100644 index 0000000000..223641a6a5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/test_utils.cc @@ -0,0 +1,673 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/test_utils.h" + +#include +#include + +#include "lib/extras/packed_image_convert.h" +#include "lib/jxl/base/file_io.h" +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_butteraugli_pnorm.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/enc_file.h" + +#if !defined(TEST_DATA_PATH) +#include "tools/cpp/runfiles/runfiles.h" +#endif + +namespace jxl { +namespace test { + +#if defined(TEST_DATA_PATH) +std::string GetTestDataPath(const std::string& filename) { + return std::string(TEST_DATA_PATH "/") + filename; +} +#else +using bazel::tools::cpp::runfiles::Runfiles; +const std::unique_ptr kRunfiles(Runfiles::Create("")); +std::string GetTestDataPath(const std::string& filename) { + std::string root(JPEGXL_ROOT_PACKAGE "/testdata/"); + return kRunfiles->Rlocation(root + filename); +} +#endif + +PaddedBytes ReadTestData(const std::string& filename) { + std::string full_path = GetTestDataPath(filename); + PaddedBytes data; + fprintf(stderr, "ReadTestData %s\n", full_path.c_str()); + JXL_CHECK(jxl::ReadFile(full_path, &data)); + printf("Test data %s is %d bytes long.\n", filename.c_str(), + static_cast(data.size())); + return data; +} + +Status DecodeFile(extras::JXLDecompressParams dparams, + const Span file, CodecInOut* JXL_RESTRICT io, + ThreadPool* pool) { + SetThreadParallelRunner(dparams, pool); + extras::PackedPixelFile ppf; + JXL_RETURN_IF_ERROR(DecodeImageJXL(file.data(), file.size(), dparams, + /*decoded_bytes=*/nullptr, &ppf)); + JXL_RETURN_IF_ERROR(ConvertPackedPixelFileToCodecInOut(ppf, pool, io)); + return true; +} + +void JxlBasicInfoSetFromPixelFormat(JxlBasicInfo* basic_info, + const JxlPixelFormat* pixel_format) { + JxlEncoderInitBasicInfo(basic_info); + switch (pixel_format->data_type) { + case JXL_TYPE_FLOAT: + basic_info->bits_per_sample = 32; + basic_info->exponent_bits_per_sample = 8; + break; + case JXL_TYPE_FLOAT16: + basic_info->bits_per_sample = 16; + basic_info->exponent_bits_per_sample = 5; + break; + case JXL_TYPE_UINT8: + basic_info->bits_per_sample = 8; + basic_info->exponent_bits_per_sample = 0; + break; + case JXL_TYPE_UINT16: + basic_info->bits_per_sample = 16; + basic_info->exponent_bits_per_sample = 0; + break; + default: + JXL_ABORT("Unhandled JxlDataType"); + } + if (pixel_format->num_channels < 3) { + basic_info->num_color_channels = 1; + } else { + basic_info->num_color_channels = 3; + } + if (pixel_format->num_channels == 2 || pixel_format->num_channels == 4) { + basic_info->alpha_exponent_bits = basic_info->exponent_bits_per_sample; + basic_info->alpha_bits = basic_info->bits_per_sample; + basic_info->num_extra_channels = 1; + } else { + basic_info->alpha_exponent_bits = 0; + basic_info->alpha_bits = 0; + } +} + +ColorEncoding ColorEncodingFromDescriptor(const ColorEncodingDescriptor& desc) { + ColorEncoding c; + c.SetColorSpace(desc.color_space); + if (desc.color_space != ColorSpace::kXYB) { + c.white_point = desc.white_point; + c.primaries = desc.primaries; + c.tf.SetTransferFunction(desc.tf); + } + c.rendering_intent = desc.rendering_intent; + JXL_CHECK(c.CreateICC()); + return c; +} + +namespace { +void CheckSameEncodings(const std::vector& a, + const std::vector& b, + const std::string& check_name, + std::stringstream& failures) { + JXL_CHECK(a.size() == b.size()); + for (size_t i = 0; i < a.size(); ++i) { + if ((a[i].ICC() == b[i].ICC()) || + ((a[i].primaries == b[i].primaries) && a[i].tf.IsSame(b[i].tf))) { + continue; + } + failures << "CheckSameEncodings " << check_name << ": " << i + << "-th encoding mismatch\n"; + } +} +} // namespace + +bool Roundtrip(const CodecInOut* io, const CompressParams& cparams, + extras::JXLDecompressParams dparams, + CodecInOut* JXL_RESTRICT io2, std::stringstream& failures, + size_t* compressed_size, ThreadPool* pool, AuxOut* aux_out) { + if (compressed_size) { + *compressed_size = static_cast(-1); + } + PaddedBytes compressed; + + std::vector original_metadata_encodings; + std::vector original_current_encodings; + std::vector metadata_encodings_1; + std::vector metadata_encodings_2; + std::vector current_encodings_2; + original_metadata_encodings.reserve(io->frames.size()); + original_current_encodings.reserve(io->frames.size()); + metadata_encodings_1.reserve(io->frames.size()); + metadata_encodings_2.reserve(io->frames.size()); + current_encodings_2.reserve(io->frames.size()); + + for (const ImageBundle& ib : io->frames) { + // Remember original encoding, will be returned by decoder. + original_metadata_encodings.push_back(ib.metadata()->color_encoding); + // c_current should not change during encoding. + original_current_encodings.push_back(ib.c_current()); + } + + std::unique_ptr enc_state = + jxl::make_unique(); + JXL_CHECK(EncodeFile(cparams, io, enc_state.get(), &compressed, GetJxlCms(), + aux_out, pool)); + + for (const ImageBundle& ib1 : io->frames) { + metadata_encodings_1.push_back(ib1.metadata()->color_encoding); + } + + // Should still be in the same color space after encoding. + CheckSameEncodings(metadata_encodings_1, original_metadata_encodings, + "original vs after encoding", failures); + + JXL_CHECK(DecodeFile(dparams, Span(compressed), io2, pool)); + JXL_CHECK(io2->frames.size() == io->frames.size()); + + for (const ImageBundle& ib2 : io2->frames) { + metadata_encodings_2.push_back(ib2.metadata()->color_encoding); + current_encodings_2.push_back(ib2.c_current()); + } + + // We always produce the original color encoding if a color transform hook is + // set. + CheckSameEncodings(current_encodings_2, original_current_encodings, + "current: original vs decoded", failures); + + // Decoder returns the originals passed to the encoder. + CheckSameEncodings(metadata_encodings_2, original_metadata_encodings, + "metadata: original vs decoded", failures); + + if (compressed_size) { + *compressed_size = compressed.size(); + } + + return failures.str().empty(); +} + +size_t Roundtrip(const extras::PackedPixelFile& ppf_in, + extras::JXLCompressParams cparams, + extras::JXLDecompressParams dparams, ThreadPool* pool, + extras::PackedPixelFile* ppf_out) { + SetThreadParallelRunner(cparams, pool); + SetThreadParallelRunner(dparams, pool); + std::vector compressed; + JXL_CHECK(extras::EncodeImageJXL(cparams, ppf_in, /*jpeg_bytes=*/nullptr, + &compressed)); + size_t decoded_bytes = 0; + JXL_CHECK(extras::DecodeImageJXL(compressed.data(), compressed.size(), + dparams, &decoded_bytes, ppf_out)); + JXL_CHECK(decoded_bytes == compressed.size()); + return compressed.size(); +} + +std::vector AllEncodings() { + std::vector all_encodings; + all_encodings.reserve(300); + ColorEncoding c; + + for (ColorSpace cs : Values()) { + if (cs == ColorSpace::kUnknown || cs == ColorSpace::kXYB) continue; + c.SetColorSpace(cs); + + for (WhitePoint wp : Values()) { + if (wp == WhitePoint::kCustom) continue; + if (c.ImplicitWhitePoint() && c.white_point != wp) continue; + c.white_point = wp; + + for (Primaries primaries : Values()) { + if (primaries == Primaries::kCustom) continue; + if (!c.HasPrimaries()) continue; + c.primaries = primaries; + + for (TransferFunction tf : Values()) { + if (tf == TransferFunction::kUnknown) continue; + if (c.tf.SetImplicit() && + (c.tf.IsGamma() || c.tf.GetTransferFunction() != tf)) { + continue; + } + c.tf.SetTransferFunction(tf); + + for (RenderingIntent ri : Values()) { + ColorEncodingDescriptor cdesc; + cdesc.color_space = cs; + cdesc.white_point = wp; + cdesc.primaries = primaries; + cdesc.tf = tf; + cdesc.rendering_intent = ri; + all_encodings.push_back(cdesc); + } + } + } + } + } + + return all_encodings; +} + +jxl::CodecInOut SomeTestImageToCodecInOut(const std::vector& buf, + size_t num_channels, size_t xsize, + size_t ysize) { + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetAlphaBits(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB( + /*is_gray=*/num_channels == 1 || num_channels == 2); + JxlPixelFormat format = {static_cast(num_channels), JXL_TYPE_UINT16, + JXL_BIG_ENDIAN, 0}; + JXL_CHECK(ConvertFromExternal( + jxl::Span(buf.data(), buf.size()), xsize, ysize, + jxl::ColorEncoding::SRGB(/*is_gray=*/num_channels < 3), + /*bits_per_sample=*/16, format, + /*pool=*/nullptr, + /*ib=*/&io.Main())); + return io; +} + +bool Near(double expected, double value, double max_dist) { + double dist = expected > value ? expected - value : value - expected; + return dist <= max_dist; +} + +float LoadFloat16(uint16_t bits16) { + const uint32_t sign = bits16 >> 15; + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + // Subnormal or zero + if (biased_exp == 0) { + const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024)); + return sign ? -subnormal : subnormal; + } + + // Normalized: convert the representation directly (faster than ldexp/tables). + const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t mantissa32 = mantissa << (23 - 10); + const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; + + float result; + memcpy(&result, &bits32, 4); + return result; +} + +float LoadLEFloat16(const uint8_t* p) { + uint16_t bits16 = LoadLE16(p); + return LoadFloat16(bits16); +} + +float LoadBEFloat16(const uint8_t* p) { + uint16_t bits16 = LoadBE16(p); + return LoadFloat16(bits16); +} + +size_t GetPrecision(JxlDataType data_type) { + switch (data_type) { + case JXL_TYPE_UINT8: + return 8; + case JXL_TYPE_UINT16: + return 16; + case JXL_TYPE_FLOAT: + // Floating point mantissa precision + return 24; + case JXL_TYPE_FLOAT16: + return 11; + default: + JXL_ABORT("Unhandled JxlDataType"); + } +} + +size_t GetDataBits(JxlDataType data_type) { + switch (data_type) { + case JXL_TYPE_UINT8: + return 8; + case JXL_TYPE_UINT16: + return 16; + case JXL_TYPE_FLOAT: + return 32; + case JXL_TYPE_FLOAT16: + return 16; + default: + JXL_ABORT("Unhandled JxlDataType"); + } +} + +std::vector ConvertToRGBA32(const uint8_t* pixels, size_t xsize, + size_t ysize, const JxlPixelFormat& format, + double factor) { + std::vector result(xsize * ysize * 4); + size_t num_channels = format.num_channels; + bool gray = num_channels == 1 || num_channels == 2; + bool alpha = num_channels == 2 || num_channels == 4; + JxlEndianness endianness = format.endianness; + // Compute actual type: + if (endianness == JXL_NATIVE_ENDIAN) { + endianness = IsLittleEndian() ? JXL_LITTLE_ENDIAN : JXL_BIG_ENDIAN; + } + + size_t stride = + xsize * jxl::DivCeil(GetDataBits(format.data_type) * num_channels, + jxl::kBitsPerByte); + if (format.align > 1) stride = jxl::RoundUpTo(stride, format.align); + + if (format.data_type == JXL_TYPE_UINT8) { + // Multiplier to bring to 0-1.0 range + double mul = factor > 0.0 ? factor : 1.0 / 255.0; + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t j = (y * xsize + x) * 4; + size_t i = y * stride + x * num_channels; + double r = pixels[i]; + double g = gray ? r : pixels[i + 1]; + double b = gray ? r : pixels[i + 2]; + double a = alpha ? pixels[i + num_channels - 1] : 255; + result[j + 0] = r * mul; + result[j + 1] = g * mul; + result[j + 2] = b * mul; + result[j + 3] = a * mul; + } + } + } else if (format.data_type == JXL_TYPE_UINT16) { + JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN); + // Multiplier to bring to 0-1.0 range + double mul = factor > 0.0 ? factor : 1.0 / 65535.0; + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t j = (y * xsize + x) * 4; + size_t i = y * stride + x * num_channels * 2; + double r, g, b, a; + if (endianness == JXL_BIG_ENDIAN) { + r = (pixels[i + 0] << 8) + pixels[i + 1]; + g = gray ? r : (pixels[i + 2] << 8) + pixels[i + 3]; + b = gray ? r : (pixels[i + 4] << 8) + pixels[i + 5]; + a = alpha ? (pixels[i + num_channels * 2 - 2] << 8) + + pixels[i + num_channels * 2 - 1] + : 65535; + } else { + r = (pixels[i + 1] << 8) + pixels[i + 0]; + g = gray ? r : (pixels[i + 3] << 8) + pixels[i + 2]; + b = gray ? r : (pixels[i + 5] << 8) + pixels[i + 4]; + a = alpha ? (pixels[i + num_channels * 2 - 1] << 8) + + pixels[i + num_channels * 2 - 2] + : 65535; + } + result[j + 0] = r * mul; + result[j + 1] = g * mul; + result[j + 2] = b * mul; + result[j + 3] = a * mul; + } + } + } else if (format.data_type == JXL_TYPE_FLOAT) { + JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN); + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t j = (y * xsize + x) * 4; + size_t i = y * stride + x * num_channels * 4; + double r, g, b, a; + if (endianness == JXL_BIG_ENDIAN) { + r = LoadBEFloat(pixels + i); + g = gray ? r : LoadBEFloat(pixels + i + 4); + b = gray ? r : LoadBEFloat(pixels + i + 8); + a = alpha ? LoadBEFloat(pixels + i + num_channels * 4 - 4) : 1.0; + } else { + r = LoadLEFloat(pixels + i); + g = gray ? r : LoadLEFloat(pixels + i + 4); + b = gray ? r : LoadLEFloat(pixels + i + 8); + a = alpha ? LoadLEFloat(pixels + i + num_channels * 4 - 4) : 1.0; + } + result[j + 0] = r; + result[j + 1] = g; + result[j + 2] = b; + result[j + 3] = a; + } + } + } else if (format.data_type == JXL_TYPE_FLOAT16) { + JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN); + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t j = (y * xsize + x) * 4; + size_t i = y * stride + x * num_channels * 2; + double r, g, b, a; + if (endianness == JXL_BIG_ENDIAN) { + r = LoadBEFloat16(pixels + i); + g = gray ? r : LoadBEFloat16(pixels + i + 2); + b = gray ? r : LoadBEFloat16(pixels + i + 4); + a = alpha ? LoadBEFloat16(pixels + i + num_channels * 2 - 2) : 1.0; + } else { + r = LoadLEFloat16(pixels + i); + g = gray ? r : LoadLEFloat16(pixels + i + 2); + b = gray ? r : LoadLEFloat16(pixels + i + 4); + a = alpha ? LoadLEFloat16(pixels + i + num_channels * 2 - 2) : 1.0; + } + result[j + 0] = r; + result[j + 1] = g; + result[j + 2] = b; + result[j + 3] = a; + } + } + } else { + JXL_ASSERT(false); // Unsupported type + } + return result; +} + +size_t ComparePixels(const uint8_t* a, const uint8_t* b, size_t xsize, + size_t ysize, const JxlPixelFormat& format_a, + const JxlPixelFormat& format_b, + double threshold_multiplier) { + // Convert both images to equal full precision for comparison. + std::vector a_full = ConvertToRGBA32(a, xsize, ysize, format_a); + std::vector b_full = ConvertToRGBA32(b, xsize, ysize, format_b); + bool gray_a = format_a.num_channels < 3; + bool gray_b = format_b.num_channels < 3; + bool alpha_a = !(format_a.num_channels & 1); + bool alpha_b = !(format_b.num_channels & 1); + size_t bits_a = GetPrecision(format_a.data_type); + size_t bits_b = GetPrecision(format_b.data_type); + size_t bits = std::min(bits_a, bits_b); + // How much distance is allowed in case of pixels with lower bit depths, given + // that the double precision float images use range 0-1.0. + // E.g. in case of 1-bit this is 0.5 since 0.499 must map to 0 and 0.501 must + // map to 1. + double precision = 0.5 * threshold_multiplier / ((1ull << bits) - 1ull); + if (format_a.data_type == JXL_TYPE_FLOAT16 || + format_b.data_type == JXL_TYPE_FLOAT16) { + // Lower the precision for float16, because it currently looks like the + // scalar and wasm implementations of hwy have 1 less bit of precision + // than the x86 implementations. + // TODO(lode): Set the required precision back to 11 bits when possible. + precision = 0.5 * threshold_multiplier / ((1ull << (bits - 1)) - 1ull); + } + size_t numdiff = 0; + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + size_t i = (y * xsize + x) * 4; + bool ok = true; + if (gray_a || gray_b) { + if (!Near(a_full[i + 0], b_full[i + 0], precision)) ok = false; + // If the input was grayscale and the output not, then the output must + // have all channels equal. + if (gray_a && b_full[i + 0] != b_full[i + 1] && + b_full[i + 2] != b_full[i + 2]) { + ok = false; + } + } else { + if (!Near(a_full[i + 0], b_full[i + 0], precision) || + !Near(a_full[i + 1], b_full[i + 1], precision) || + !Near(a_full[i + 2], b_full[i + 2], precision)) { + ok = false; + } + } + if (alpha_a && alpha_b) { + if (!Near(a_full[i + 3], b_full[i + 3], precision)) ok = false; + } else { + // If the input had no alpha channel, the output should be opaque + // after roundtrip. + if (alpha_b && !Near(1.0, b_full[i + 3], precision)) ok = false; + } + if (!ok) numdiff++; + } + } + return numdiff; +} + +double DistanceRMS(const uint8_t* a, const uint8_t* b, size_t xsize, + size_t ysize, const JxlPixelFormat& format) { + // Convert both images to equal full precision for comparison. + std::vector a_full = ConvertToRGBA32(a, xsize, ysize, format); + std::vector b_full = ConvertToRGBA32(b, xsize, ysize, format); + double sum = 0.0; + for (size_t y = 0; y < ysize; y++) { + double row_sum = 0.0; + for (size_t x = 0; x < xsize; x++) { + size_t i = (y * xsize + x) * 4; + for (size_t c = 0; c < format.num_channels; ++c) { + double diff = a_full[i + c] - b_full[i + c]; + row_sum += diff * diff; + } + } + sum += row_sum; + } + sum /= (xsize * ysize); + return sqrt(sum); +} + +float ButteraugliDistance(const extras::PackedPixelFile& a, + const extras::PackedPixelFile& b, ThreadPool* pool) { + CodecInOut io0; + JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, pool, &io0)); + CodecInOut io1; + JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, pool, &io1)); + // TODO(eustas): simplify? + return ButteraugliDistance(io0.frames, io1.frames, ButteraugliParams(), + GetJxlCms(), + /*distmap=*/nullptr, pool); +} + +float Butteraugli3Norm(const extras::PackedPixelFile& a, + const extras::PackedPixelFile& b, ThreadPool* pool) { + CodecInOut io0; + JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, pool, &io0)); + CodecInOut io1; + JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, pool, &io1)); + ButteraugliParams ba; + ImageF distmap; + ButteraugliDistance(io0.frames, io1.frames, ba, GetJxlCms(), &distmap, pool); + return ComputeDistanceP(distmap, ba, 3); +} + +float ComputeDistance2(const extras::PackedPixelFile& a, + const extras::PackedPixelFile& b) { + CodecInOut io0; + JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, nullptr, &io0)); + CodecInOut io1; + JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, nullptr, &io1)); + return ComputeDistance2(io0.Main(), io1.Main(), GetJxlCms()); +} + +bool SameAlpha(const extras::PackedPixelFile& a, + const extras::PackedPixelFile& b) { + JXL_CHECK(a.info.xsize == b.info.xsize); + JXL_CHECK(a.info.ysize == b.info.ysize); + JXL_CHECK(a.info.alpha_bits == b.info.alpha_bits); + JXL_CHECK(a.info.alpha_exponent_bits == b.info.alpha_exponent_bits); + JXL_CHECK(a.info.alpha_bits > 0); + JXL_CHECK(a.frames.size() == b.frames.size()); + for (size_t i = 0; i < a.frames.size(); ++i) { + const extras::PackedImage& color_a = a.frames[i].color; + const extras::PackedImage& color_b = b.frames[i].color; + JXL_CHECK(color_a.format.num_channels == color_b.format.num_channels); + JXL_CHECK(color_a.format.data_type == color_b.format.data_type); + JXL_CHECK(color_a.format.endianness == color_b.format.endianness); + JXL_CHECK(color_a.pixels_size == color_b.pixels_size); + size_t pwidth = + extras::PackedImage::BitsPerChannel(color_a.format.data_type) / 8; + size_t num_color = color_a.format.num_channels < 3 ? 1 : 3; + const uint8_t* p_a = reinterpret_cast(color_a.pixels()); + const uint8_t* p_b = reinterpret_cast(color_b.pixels()); + for (size_t y = 0; y < a.info.ysize; ++y) { + for (size_t x = 0; x < a.info.xsize; ++x) { + size_t idx = + ((y * a.info.xsize + x) * color_a.format.num_channels + num_color) * + pwidth; + if (memcmp(&p_a[idx], &p_b[idx], pwidth) != 0) { + return false; + } + } + } + } + return true; +} + +bool SamePixels(const extras::PackedImage& a, const extras::PackedImage& b) { + JXL_CHECK(a.xsize == b.xsize); + JXL_CHECK(a.ysize == b.ysize); + JXL_CHECK(a.format.num_channels == b.format.num_channels); + JXL_CHECK(a.format.data_type == b.format.data_type); + JXL_CHECK(a.format.endianness == b.format.endianness); + JXL_CHECK(a.pixels_size == b.pixels_size); + const uint8_t* p_a = reinterpret_cast(a.pixels()); + const uint8_t* p_b = reinterpret_cast(b.pixels()); + for (size_t y = 0; y < a.ysize; ++y) { + for (size_t x = 0; x < a.xsize; ++x) { + size_t idx = (y * a.xsize + x) * a.pixel_stride(); + if (memcmp(&p_a[idx], &p_b[idx], a.pixel_stride()) != 0) { + printf("Mismatch at row %" PRIuS " col %" PRIuS "\n", y, x); + printf(" a: "); + for (size_t j = 0; j < a.pixel_stride(); ++j) { + printf(" %3u", p_a[idx + j]); + } + printf("\n b: "); + for (size_t j = 0; j < a.pixel_stride(); ++j) { + printf(" %3u", p_b[idx + j]); + } + printf("\n"); + return false; + } + } + } + return true; +} + +bool SamePixels(const extras::PackedPixelFile& a, + const extras::PackedPixelFile& b) { + JXL_CHECK(a.info.xsize == b.info.xsize); + JXL_CHECK(a.info.ysize == b.info.ysize); + JXL_CHECK(a.info.bits_per_sample == b.info.bits_per_sample); + JXL_CHECK(a.info.exponent_bits_per_sample == b.info.exponent_bits_per_sample); + JXL_CHECK(a.frames.size() == b.frames.size()); + for (size_t i = 0; i < a.frames.size(); ++i) { + const auto& frame_a = a.frames[i]; + const auto& frame_b = b.frames[i]; + if (!SamePixels(frame_a.color, frame_b.color)) { + return false; + } + JXL_CHECK(frame_a.extra_channels.size() == frame_b.extra_channels.size()); + for (size_t j = 0; j < frame_a.extra_channels.size(); ++j) { + if (!SamePixels(frame_a.extra_channels[i], frame_b.extra_channels[i])) { + return false; + } + } + } + return true; +} + +} // namespace test + +bool operator==(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b) { + if (a.size() != b.size()) return false; + if (memcmp(a.data(), b.data(), a.size()) != 0) return false; + return true; +} + +// Allow using EXPECT_EQ on jxl::PaddedBytes +bool operator!=(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b) { + return !(a == b); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/test_utils.h b/third_party/jpeg-xl/lib/jxl/test_utils.h new file mode 100644 index 0000000000..8c5cd434f5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/test_utils.h @@ -0,0 +1,175 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_TEST_UTILS_H_ +#define LIB_JXL_TEST_UTILS_H_ + +// TODO(eustas): reduce includes (move to .cc) + +// Macros and functions useful for tests. + +#include +#include +#include +#include + +#include +#include + +#include "lib/extras/dec/jxl.h" +#include "lib/extras/enc/jxl.h" +#include "lib/extras/packed_image.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/enc_params.h" + +namespace jxl { + +struct AuxOut; + +namespace test { + +std::string GetTestDataPath(const std::string& filename); +PaddedBytes ReadTestData(const std::string& filename); + +void JxlBasicInfoSetFromPixelFormat(JxlBasicInfo* basic_info, + const JxlPixelFormat* pixel_format); + +template +void SetThreadParallelRunner(Params params, ThreadPool* pool) { + if (pool && !params.runner_opaque) { + params.runner = pool->runner(); + params.runner_opaque = pool->runner_opaque(); + } +} + +Status DecodeFile(extras::JXLDecompressParams dparams, + const Span file, CodecInOut* JXL_RESTRICT io, + ThreadPool* pool = nullptr); + +bool Roundtrip(const CodecInOut* io, const CompressParams& cparams, + extras::JXLDecompressParams dparams, + CodecInOut* JXL_RESTRICT io2, std::stringstream& failures, + size_t* compressed_size = nullptr, ThreadPool* pool = nullptr, + AuxOut* aux_out = nullptr); + +// Returns compressed size [bytes]. +size_t Roundtrip(const extras::PackedPixelFile& ppf_in, + extras::JXLCompressParams cparams, + extras::JXLDecompressParams dparams, ThreadPool* pool, + extras::PackedPixelFile* ppf_out); + +// A POD descriptor of a ColorEncoding. Only used in tests as the return value +// of AllEncodings(). +struct ColorEncodingDescriptor { + ColorSpace color_space; + WhitePoint white_point; + Primaries primaries; + TransferFunction tf; + RenderingIntent rendering_intent; +}; + +ColorEncoding ColorEncodingFromDescriptor(const ColorEncodingDescriptor& desc); + +// Define the operator<< for tests. +static inline ::std::ostream& operator<<(::std::ostream& os, + const ColorEncodingDescriptor& c) { + return os << "ColorEncoding/" << Description(ColorEncodingFromDescriptor(c)); +} + +// Returns ColorEncodingDescriptors, which are only used in tests. To obtain a +// ColorEncoding object call ColorEncodingFromDescriptor and then call +// ColorEncoding::CreateProfile() on that object to generate a profile. +std::vector AllEncodings(); + +// Returns a CodecInOut based on the buf, xsize, ysize, and the assumption +// that the buffer was created using `GetSomeTestImage`. +jxl::CodecInOut SomeTestImageToCodecInOut(const std::vector& buf, + size_t num_channels, size_t xsize, + size_t ysize); + +bool Near(double expected, double value, double max_dist); + +// Based on highway scalar implementation, for testing +float LoadFloat16(uint16_t bits16); + +float LoadLEFloat16(const uint8_t* p); + +float LoadBEFloat16(const uint8_t* p); + +size_t GetPrecision(JxlDataType data_type); + +size_t GetDataBits(JxlDataType data_type); + +// Procedure to convert pixels to double precision, not efficient, but +// well-controlled for testing. It uses double, to be able to represent all +// precisions needed for the maximum data types the API supports: uint32_t +// integers, and, single precision float. The values are in range 0-1 for SDR. +std::vector ConvertToRGBA32(const uint8_t* pixels, size_t xsize, + size_t ysize, const JxlPixelFormat& format, + double factor = 0.0); + +// Returns amount of pixels which differ between the two pictures. Image b is +// the image after roundtrip after roundtrip, image a before roundtrip. There +// are more strict requirements for the alpha channel and grayscale values of +// the output image. +size_t ComparePixels(const uint8_t* a, const uint8_t* b, size_t xsize, + size_t ysize, const JxlPixelFormat& format_a, + const JxlPixelFormat& format_b, + double threshold_multiplier = 1.0); + +double DistanceRMS(const uint8_t* a, const uint8_t* b, size_t xsize, + size_t ysize, const JxlPixelFormat& format); + +float ButteraugliDistance(const extras::PackedPixelFile& a, + const extras::PackedPixelFile& b, + ThreadPool* pool = nullptr); + +float Butteraugli3Norm(const extras::PackedPixelFile& a, + const extras::PackedPixelFile& b, + ThreadPool* pool = nullptr); + +float ComputeDistance2(const extras::PackedPixelFile& a, + const extras::PackedPixelFile& b); + +bool SameAlpha(const extras::PackedPixelFile& a, + const extras::PackedPixelFile& b); + +bool SamePixels(const extras::PackedImage& a, const extras::PackedImage& b); + +bool SamePixels(const extras::PackedPixelFile& a, + const extras::PackedPixelFile& b); + +class ThreadPoolForTests { + public: + explicit ThreadPoolForTests(int num_threads) { + runner_ = + JxlThreadParallelRunnerMake(/* memory_manager */ nullptr, num_threads); + pool_ = + jxl::make_unique(JxlThreadParallelRunner, runner_.get()); + } + ThreadPoolForTests(const ThreadPoolForTests&) = delete; + ThreadPoolForTests& operator&(const ThreadPoolForTests&) = delete; + ThreadPool* operator&() { return pool_.get(); } + + private: + JxlThreadParallelRunnerPtr runner_; + std::unique_ptr pool_; +}; + +} // namespace test + +bool operator==(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b); + +// Allow using EXPECT_EQ on jxl::PaddedBytes +bool operator!=(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b); + +} // namespace jxl + +#endif // LIB_JXL_TEST_UTILS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/testing.h b/third_party/jpeg-xl/lib/jxl/testing.h new file mode 100644 index 0000000000..d10b0c3c54 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/testing.h @@ -0,0 +1,73 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_TESTING_H_ +#define LIB_JXL_TESTING_H_ + +// GTest/GMock specific macros / wrappers. + +// gmock unconditionally redefines those macros (to wrong values). +// Lets include it only here and mitigate the problem. +#pragma push_macro("PRIdS") +#pragma push_macro("PRIuS") +#include "gmock/gmock.h" +#pragma pop_macro("PRIuS") +#pragma pop_macro("PRIdS") + +#include + +#include "gtest/gtest.h" + +#ifdef JXL_DISABLE_SLOW_TESTS +#define JXL_SLOW_TEST(X) DISABLED_##X +#else +#define JXL_SLOW_TEST(X) X +#endif // JXL_DISABLE_SLOW_TESTS + +#if JPEGXL_ENABLE_TRANSCODE_JPEG +#define JXL_TRANSCODE_JPEG_TEST(X) X +#else +#define JXL_TRANSCODE_JPEG_TEST(X) DISABLED_##X +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +#if JPEGXL_ENABLE_BOXES +#define JXL_BOXES_TEST(X) X +#else +#define JXL_BOXES_TEST(X) DISABLED_##X +#endif // JPEGXL_ENABLE_BOXES + +#ifdef THREAD_SANITIZER +#define JXL_TSAN_SLOW_TEST(X) DISABLED_##X +#else +#define JXL_TSAN_SLOW_TEST(X) X +#endif // THREAD_SANITIZER + +// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead +// used INSTANTIATE_TEST_CASE_P which is now deprecated. +#ifdef INSTANTIATE_TEST_SUITE_P +#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P +#else +#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P +#endif + +// Ensures that we don't make our test bounds too lax, effectively disabling the +// tests. +MATCHER_P(IsSlightlyBelow, max, "") { + return max * 0.75 <= arg && arg <= max * 1.0; +} + +#define JXL_EXPECT_OK(F) \ + { \ + std::stringstream _; \ + EXPECT_TRUE(F) << _.str(); \ + } + +#define JXL_ASSERT_OK(F) \ + { \ + std::stringstream _; \ + ASSERT_TRUE(F) << _.str(); \ + } + +#endif // LIB_JXL_TESTING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/tf_gbench.cc b/third_party/jpeg-xl/lib/jxl/tf_gbench.cc new file mode 100644 index 0000000000..9c010d460a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/tf_gbench.cc @@ -0,0 +1,143 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "benchmark/benchmark.h" +#include "lib/jxl/image_ops.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/tf_gbench.cc" +#include +#include + +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +#define RUN_BENCHMARK(F) \ + constexpr size_t kNum = 1 << 12; \ + HWY_FULL(float) d; \ + /* Three parallel runs, as this will run on R, G and B. */ \ + auto sum1 = Zero(d); \ + auto sum2 = Zero(d); \ + auto sum3 = Zero(d); \ + for (auto _ : state) { \ + auto x = Set(d, 1e-5); \ + auto v1 = Set(d, 1e-5); \ + auto v2 = Set(d, 1.1e-5); \ + auto v3 = Set(d, 1.2e-5); \ + for (size_t i = 0; i < kNum; i++) { \ + sum1 += F(d, v1); \ + sum2 += F(d, v2); \ + sum3 += F(d, v3); \ + v1 += x; \ + v2 += x; \ + v3 += x; \ + } \ + } \ + /* floats per second */ \ + state.SetItemsProcessed(kNum* state.iterations() * Lanes(d) * 3); \ + benchmark::DoNotOptimize(sum1 + sum2 + sum3); + +#define RUN_BENCHMARK_SCALAR(F) \ + constexpr size_t kNum = 1 << 12; \ + /* Three parallel runs, as this will run on R, G and B. */ \ + float sum1 = 0, sum2 = 0, sum3 = 0; \ + for (auto _ : state) { \ + float x = 1e-5; \ + float v1 = 1e-5; \ + float v2 = 1.1e-5; \ + float v3 = 1.2e-5; \ + for (size_t i = 0; i < kNum; i++) { \ + sum1 += F(v1); \ + sum2 += F(v2); \ + sum3 += F(v3); \ + v1 += x; \ + v2 += x; \ + v3 += x; \ + } \ + } \ + /* floats per second */ \ + state.SetItemsProcessed(kNum* state.iterations() * 3); \ + benchmark::DoNotOptimize(sum1 + sum2 + sum3); + +HWY_NOINLINE void BM_FastSRGB(benchmark::State& state) { + RUN_BENCHMARK(FastLinearToSRGB); +} + +HWY_NOINLINE void BM_TFSRGB(benchmark::State& state) { + RUN_BENCHMARK(TF_SRGB().EncodedFromDisplay); +} + +HWY_NOINLINE void BM_PQDFE(benchmark::State& state) { + RUN_BENCHMARK(TF_PQ().DisplayFromEncoded); +} + +HWY_NOINLINE void BM_PQEFD(benchmark::State& state) { + RUN_BENCHMARK(TF_PQ().EncodedFromDisplay); +} + +HWY_NOINLINE void BM_PQSlowDFE(benchmark::State& state) { + RUN_BENCHMARK_SCALAR(TF_PQ().DisplayFromEncoded); +} + +HWY_NOINLINE void BM_PQSlowEFD(benchmark::State& state) { + RUN_BENCHMARK_SCALAR(TF_PQ().EncodedFromDisplay); +} +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +namespace { + +HWY_EXPORT(BM_FastSRGB); +HWY_EXPORT(BM_TFSRGB); +HWY_EXPORT(BM_PQDFE); +HWY_EXPORT(BM_PQEFD); +HWY_EXPORT(BM_PQSlowDFE); +HWY_EXPORT(BM_PQSlowEFD); + +float SRGB_pow(float x) { + return x < 0.0031308f ? 12.92f * x : 1.055f * powf(x, 1.0f / 2.4f) - 0.055f; +} + +void BM_FastSRGB(benchmark::State& state) { + HWY_DYNAMIC_DISPATCH(BM_FastSRGB)(state); +} +void BM_TFSRGB(benchmark::State& state) { + HWY_DYNAMIC_DISPATCH(BM_TFSRGB)(state); +} +void BM_PQDFE(benchmark::State& state) { + HWY_DYNAMIC_DISPATCH(BM_PQDFE)(state); +} +void BM_PQEFD(benchmark::State& state) { + HWY_DYNAMIC_DISPATCH(BM_PQEFD)(state); +} +void BM_PQSlowDFE(benchmark::State& state) { + HWY_DYNAMIC_DISPATCH(BM_PQSlowDFE)(state); +} +void BM_PQSlowEFD(benchmark::State& state) { + HWY_DYNAMIC_DISPATCH(BM_PQSlowEFD)(state); +} + +void BM_SRGB_pow(benchmark::State& state) { RUN_BENCHMARK_SCALAR(SRGB_pow); } + +BENCHMARK(BM_FastSRGB); +BENCHMARK(BM_TFSRGB); +BENCHMARK(BM_SRGB_pow); +BENCHMARK(BM_PQDFE); +BENCHMARK(BM_PQEFD); +BENCHMARK(BM_PQSlowDFE); +BENCHMARK(BM_PQSlowEFD); + +} // namespace +} // namespace jxl +#endif diff --git a/third_party/jpeg-xl/lib/jxl/toc.cc b/third_party/jpeg-xl/lib/jxl/toc.cc new file mode 100644 index 0000000000..fd7740c144 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/toc.cc @@ -0,0 +1,105 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/toc.h" + +#include + +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/fields.h" + +namespace jxl { +size_t MaxBits(const size_t num_sizes) { + const size_t entry_bits = U32Coder::MaxEncodedBits(kTocDist) * num_sizes; + // permutation bit (not its tokens!), padding, entries, padding. + return 1 + kBitsPerByte + entry_bits + kBitsPerByte; +} + +Status ReadToc(size_t toc_entries, BitReader* JXL_RESTRICT reader, + std::vector* JXL_RESTRICT sizes, + std::vector* JXL_RESTRICT permutation) { + if (toc_entries > 65536) { + // Prevent out of memory if invalid JXL codestream causes a bogus amount + // of toc_entries such as 2720436919446 to be computed. + // TODO(lode): verify whether 65536 is a reasonable upper bound + return JXL_FAILURE("too many toc entries"); + } + + sizes->clear(); + sizes->resize(toc_entries); + if (reader->TotalBitsConsumed() >= reader->TotalBytes() * kBitsPerByte) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, "Not enough bytes for TOC"); + } + const auto check_bit_budget = [&](size_t num_entries) -> Status { + // U32Coder reads 2 bits to recognize variant and kTocDist cheapest variant + // is Bits(10), this way at least 12 bits are required per toc-entry. + size_t minimal_bit_cost = num_entries * (2 + 10); + size_t bit_budget = reader->TotalBytes() * 8; + size_t expenses = reader->TotalBitsConsumed(); + if ((expenses <= bit_budget) && + (minimal_bit_cost <= bit_budget - expenses)) { + return true; + } + return JXL_STATUS(StatusCode::kNotEnoughBytes, "Not enough bytes for TOC"); + }; + + JXL_DASSERT(toc_entries > 0); + if (reader->ReadFixedBits<1>() == 1) { + JXL_RETURN_IF_ERROR(check_bit_budget(toc_entries)); + permutation->resize(toc_entries); + JXL_RETURN_IF_ERROR(DecodePermutation(/*skip=*/0, toc_entries, + permutation->data(), reader)); + } + JXL_RETURN_IF_ERROR(reader->JumpToByteBoundary()); + JXL_RETURN_IF_ERROR(check_bit_budget(toc_entries)); + for (size_t i = 0; i < toc_entries; ++i) { + (*sizes)[i] = U32Coder::Read(kTocDist, reader); + } + JXL_RETURN_IF_ERROR(reader->JumpToByteBoundary()); + JXL_RETURN_IF_ERROR(check_bit_budget(0)); + return true; +} + +Status ReadGroupOffsets(size_t toc_entries, BitReader* JXL_RESTRICT reader, + std::vector* JXL_RESTRICT offsets, + std::vector* JXL_RESTRICT sizes, + uint64_t* total_size) { + std::vector permutation; + JXL_RETURN_IF_ERROR(ReadToc(toc_entries, reader, sizes, &permutation)); + + offsets->clear(); + offsets->resize(toc_entries); + + // Prefix sum starting with 0 and ending with the offset of the last group + uint64_t offset = 0; + for (size_t i = 0; i < toc_entries; ++i) { + if (offset + (*sizes)[i] < offset) { + return JXL_FAILURE("group offset overflow"); + } + (*offsets)[i] = offset; + offset += (*sizes)[i]; + } + if (total_size) { + *total_size = offset; + } + + if (!permutation.empty()) { + std::vector permuted_offsets; + std::vector permuted_sizes; + permuted_offsets.reserve(toc_entries); + permuted_sizes.reserve(toc_entries); + for (coeff_order_t index : permutation) { + permuted_offsets.push_back((*offsets)[index]); + permuted_sizes.push_back((*sizes)[index]); + } + std::swap(*offsets, permuted_offsets); + std::swap(*sizes, permuted_sizes); + } + + return true; +} +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/toc.h b/third_party/jpeg-xl/lib/jxl/toc.h new file mode 100644 index 0000000000..a97197ad45 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/toc.h @@ -0,0 +1,55 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_TOC_H_ +#define LIB_JXL_TOC_H_ + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +// (2+bits) = 2,3,4 bytes so encoders can patch TOC after encoding. +// 30 is sufficient for 4K channels of uncompressed 16-bit samples. +constexpr U32Enc kTocDist(Bits(10), BitsOffset(14, 1024), BitsOffset(22, 17408), + BitsOffset(30, 4211712)); + +size_t MaxBits(const size_t num_sizes); + +// TODO(veluca): move these to FrameDimensions. +static JXL_INLINE size_t AcGroupIndex(size_t pass, size_t group, + size_t num_groups, size_t num_dc_groups, + bool has_ac_global) { + return 1 + num_dc_groups + static_cast(has_ac_global) + + pass * num_groups + group; +} + +static JXL_INLINE size_t NumTocEntries(size_t num_groups, size_t num_dc_groups, + size_t num_passes, bool has_ac_global) { + if (num_groups == 1 && num_passes == 1) return 1; + return AcGroupIndex(0, 0, num_groups, num_dc_groups, has_ac_global) + + num_groups * num_passes; +} + +Status ReadToc(size_t toc_entries, BitReader* JXL_RESTRICT reader, + std::vector* JXL_RESTRICT sizes, + std::vector* JXL_RESTRICT permutation); + +Status ReadGroupOffsets(size_t toc_entries, BitReader* JXL_RESTRICT reader, + std::vector* JXL_RESTRICT offsets, + std::vector* JXL_RESTRICT sizes, + uint64_t* total_size); + +} // namespace jxl + +#endif // LIB_JXL_TOC_H_ diff --git a/third_party/jpeg-xl/lib/jxl/toc_test.cc b/third_party/jpeg-xl/lib/jxl/toc_test.cc new file mode 100644 index 0000000000..a7f0f2c27b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/toc_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/toc.h" + +#include "lib/jxl/base/random.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_aux_out.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/testing.h" + +namespace jxl { +namespace { + +void Roundtrip(size_t num_entries, bool permute, Rng* rng) { + // Generate a random permutation. + std::vector permutation(num_entries); + std::vector inv_permutation(num_entries); + for (size_t i = 0; i < num_entries; i++) { + permutation[i] = i; + inv_permutation[i] = i; + } + if (permute) { + rng->Shuffle(permutation.data(), permutation.size()); + for (size_t i = 0; i < num_entries; i++) { + inv_permutation[permutation[i]] = i; + } + } + + // Generate num_entries groups of random (byte-aligned) length + std::vector group_codes(num_entries); + for (BitWriter& writer : group_codes) { + const size_t max_bits = (*rng)() & 0xFFF; + BitWriter::Allotment allotment(&writer, max_bits + kBitsPerByte); + size_t i = 0; + for (; i + BitWriter::kMaxBitsPerCall < max_bits; + i += BitWriter::kMaxBitsPerCall) { + writer.Write(BitWriter::kMaxBitsPerCall, 0); + } + for (; i < max_bits; i += 1) { + writer.Write(/*n_bits=*/1, 0); + } + writer.ZeroPadToByte(); + AuxOut aux_out; + allotment.ReclaimAndCharge(&writer, 0, &aux_out); + } + + BitWriter writer; + AuxOut aux_out; + ASSERT_TRUE(WriteGroupOffsets(group_codes, permute ? &permutation : nullptr, + &writer, &aux_out)); + + BitReader reader(writer.GetSpan()); + std::vector group_offsets; + std::vector group_sizes; + uint64_t total_size; + ASSERT_TRUE(ReadGroupOffsets(num_entries, &reader, &group_offsets, + &group_sizes, &total_size)); + ASSERT_EQ(num_entries, group_offsets.size()); + ASSERT_EQ(num_entries, group_sizes.size()); + EXPECT_TRUE(reader.Close()); + + uint64_t prefix_sum = 0; + for (size_t i = 0; i < num_entries; ++i) { + EXPECT_EQ(prefix_sum, group_offsets[inv_permutation[i]]); + + EXPECT_EQ(0u, group_codes[i].BitsWritten() % kBitsPerByte); + prefix_sum += group_codes[i].BitsWritten() / kBitsPerByte; + + if (i + 1 < num_entries) { + EXPECT_EQ( + group_offsets[inv_permutation[i]] + group_sizes[inv_permutation[i]], + group_offsets[inv_permutation[i + 1]]); + } + } + EXPECT_EQ(prefix_sum, total_size); +} + +TEST(TocTest, Test) { + Rng rng(0); + for (size_t num_entries = 1; num_entries < 10; ++num_entries) { + for (bool permute : std::vector{false, true}) { + Roundtrip(num_entries, permute, &rng); + } + } +} + +} // namespace +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h b/third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h new file mode 100644 index 0000000000..9f4c10c76d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h @@ -0,0 +1,413 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Transfer functions for color encodings. + +#if defined(LIB_JXL_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_TRANSFER_FUNCTIONS_INL_H_ +#undef LIB_JXL_TRANSFER_FUNCTIONS_INL_H_ +#else +#define LIB_JXL_TRANSFER_FUNCTIONS_INL_H_ +#endif + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/rational_polynomial-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::And; +using hwy::HWY_NAMESPACE::AndNot; +using hwy::HWY_NAMESPACE::Gt; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::Lt; +using hwy::HWY_NAMESPACE::Or; +using hwy::HWY_NAMESPACE::Sqrt; +using hwy::HWY_NAMESPACE::TableLookupBytes; + +// Definitions for BT.2100-2 transfer functions (used inside/outside SIMD): +// "display" is linear light (nits) normalized to [0, 1]. +// "encoded" is a nonlinear encoding (e.g. PQ) in [0, 1]. +// "scene" is a linear function of photon counts, normalized to [0, 1]. + +// Despite the stated ranges, we need unbounded transfer functions: see +// http://www.littlecms.com/CIC18_UnboundedCMM.pdf. Inputs can be negative or +// above 1 due to chromatic adaptation. To avoid severe round-trip errors caused +// by clamping, we mirror negative inputs via copysign (f(-x) = -f(x), see +// https://developer.apple.com/documentation/coregraphics/cgcolorspace/1644735-extendedsrgb) +// and extend the function domains above 1. + +// Hybrid Log-Gamma. +class TF_HLG { + public: + // EOTF. e = encoded. + JXL_INLINE double DisplayFromEncoded(const double e) const { + return OOTF(InvOETF(e)); + } + + // Inverse EOTF. d = display. + JXL_INLINE double EncodedFromDisplay(const double d) const { + return OETF(InvOOTF(d)); + } + + // Maximum error 5e-7. + template + JXL_INLINE V EncodedFromDisplay(D d, V x) const { + const hwy::HWY_NAMESPACE::Rebind du; + const V kSign = BitCast(d, Set(du, 0x80000000u)); + const V original_sign = And(x, kSign); + x = AndNot(kSign, x); // abs + const V below_div12 = Sqrt(Mul(Set(d, 3.0f), x)); + const V e = + MulAdd(Set(d, kA * 0.693147181f), + FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC)); + const V magnitude = IfThenElse(Le(x, Set(d, kDiv12)), below_div12, e); + return Or(AndNot(kSign, magnitude), original_sign); + } + + private: + // OETF (defines the HLG approach). s = scene, returns encoded. + JXL_INLINE double OETF(double s) const { + if (s == 0.0) return 0.0; + const double original_sign = s; + s = std::abs(s); + + if (s <= kDiv12) return copysignf(std::sqrt(3.0 * s), original_sign); + + const double e = kA * std::log(12 * s - kB) + kC; + JXL_ASSERT(e > 0.0); + return copysignf(e, original_sign); + } + + // e = encoded, returns scene. + JXL_INLINE double InvOETF(double e) const { + if (e == 0.0) return 0.0; + const double original_sign = e; + e = std::abs(e); + + if (e <= 0.5) return copysignf(e * e * (1.0 / 3), original_sign); + + const double s = (std::exp((e - kC) * kRA) + kB) * kDiv12; + JXL_ASSERT(s >= 0); + return copysignf(s, original_sign); + } + + // s = scene, returns display. + JXL_INLINE double OOTF(const double s) const { + // The actual (red channel) OOTF is RD = alpha * YS^(gamma-1) * RS, where + // YS = 0.2627 * RS + 0.6780 * GS + 0.0593 * BS. Let alpha = 1 so we return + // "display" (normalized [0, 1]) instead of nits. Our transfer function + // interface does not allow a dependency on YS. Fortunately, the system + // gamma at 334 nits is 1.0, so this reduces to RD = RS. + return s; + } + + // d = display, returns scene. + JXL_INLINE double InvOOTF(const double d) const { + return d; // see OOTF(). + } + + static constexpr double kA = 0.17883277; + static constexpr double kRA = 1.0 / kA; + static constexpr double kB = 1 - 4 * kA; + static constexpr double kC = 0.5599107295; + static constexpr double kDiv12 = 1.0 / 12; +}; + +class TF_709 { + public: + JXL_INLINE double EncodedFromDisplay(const double d) const { + if (d < kThresh) return kMulLow * d; + return kMulHi * std::pow(d, kPowHi) + kSub; + } + + // Maximum error 1e-6. + template + JXL_INLINE V EncodedFromDisplay(D d, V x) const { + auto low = Mul(Set(d, kMulLow), x); + auto hi = + MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub)); + return IfThenElse(Le(x, Set(d, kThresh)), low, hi); + } + + template + JXL_INLINE V DisplayFromEncoded(D d, V x) const { + auto low = Mul(Set(d, kInvMulLow), x); + auto hi = FastPowf(d, MulAdd(x, Set(d, kInvMulHi), Set(d, kInvAdd)), + Set(d, kInvPowHi)); + return IfThenElse(Lt(x, Set(d, kInvThresh)), low, hi); + } + + private: + static constexpr double kThresh = 0.018; + static constexpr double kMulLow = 4.5; + static constexpr double kMulHi = 1.099; + static constexpr double kPowHi = 0.45; + static constexpr double kSub = -0.099; + + static constexpr double kInvThresh = 0.081; + static constexpr double kInvMulLow = 1 / 4.5; + static constexpr double kInvMulHi = 1 / 1.099; + static constexpr double kInvPowHi = 1 / 0.45; + static constexpr double kInvAdd = 0.099 * kInvMulHi; +}; + +// Perceptual Quantization +class TF_PQ { + public: + // EOTF (defines the PQ approach). e = encoded. + JXL_INLINE double DisplayFromEncoded(double e) const { + if (e == 0.0) return 0.0; + const double original_sign = e; + e = std::abs(e); + + const double xp = std::pow(e, 1.0 / kM2); + const double num = std::max(xp - kC1, 0.0); + const double den = kC2 - kC3 * xp; + JXL_DASSERT(den != 0.0); + const double d = std::pow(num / den, 1.0 / kM1); + JXL_DASSERT(d >= 0.0); // Equal for e ~= 1E-9 + return copysignf(d, original_sign); + } + + // Maximum error 3e-6 + template + JXL_INLINE V DisplayFromEncoded(D d, V x) const { + const hwy::HWY_NAMESPACE::Rebind du; + const V kSign = BitCast(d, Set(du, 0x80000000u)); + const V original_sign = And(x, kSign); + x = AndNot(kSign, x); // abs + // 4-over-4-degree rational polynomial approximation on x+x*x. This improves + // the maximum error by about 5x over a rational polynomial for x. + auto xpxx = MulAdd(x, x, x); + HWY_ALIGN constexpr float p[(4 + 1) * 4] = { + HWY_REP4(2.62975656e-04f), HWY_REP4(-6.23553089e-03f), + HWY_REP4(7.38602301e-01f), HWY_REP4(2.64553172e+00f), + HWY_REP4(5.50034862e-01f), + }; + HWY_ALIGN constexpr float q[(4 + 1) * 4] = { + HWY_REP4(4.21350107e+02f), HWY_REP4(-4.28736818e+02f), + HWY_REP4(1.74364667e+02f), HWY_REP4(-3.39078883e+01f), + HWY_REP4(2.67718770e+00f), + }; + auto magnitude = EvalRationalPolynomial(d, xpxx, p, q); + return Or(AndNot(kSign, magnitude), original_sign); + } + + // Inverse EOTF. d = display. + JXL_INLINE double EncodedFromDisplay(double d) const { + if (d == 0.0) return 0.0; + const double original_sign = d; + d = std::abs(d); + + const double xp = std::pow(d, kM1); + const double num = kC1 + xp * kC2; + const double den = 1.0 + xp * kC3; + const double e = std::pow(num / den, kM2); + JXL_DASSERT(e > 0.0); + return copysignf(e, original_sign); + } + + // Maximum error 7e-7. + template + JXL_INLINE V EncodedFromDisplay(D d, V x) const { + const hwy::HWY_NAMESPACE::Rebind du; + const V kSign = BitCast(d, Set(du, 0x80000000u)); + const V original_sign = And(x, kSign); + x = AndNot(kSign, x); // abs + // 4-over-4-degree rational polynomial approximation on x**0.25, with two + // different polynomials above and below 1e-4. + auto xto025 = Sqrt(Sqrt(x)); + HWY_ALIGN constexpr float p[(4 + 1) * 4] = { + HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f), + HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f), + HWY_REP4(4.838434e+01f), + }; + HWY_ALIGN constexpr float q[(4 + 1) * 4] = { + HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f), + HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f), + HWY_REP4(2.590418e+01f), + }; + + HWY_ALIGN constexpr float plo[(4 + 1) * 4] = { + HWY_REP4(9.863406e-06f), HWY_REP4(3.881234e-01f), + HWY_REP4(1.352821e+02f), HWY_REP4(6.889862e+04f), + HWY_REP4(-2.864824e+05f), + }; + HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = { + HWY_REP4(3.371868e+01f), HWY_REP4(1.477719e+03f), + HWY_REP4(1.608477e+04f), HWY_REP4(-4.389884e+04f), + HWY_REP4(-2.072546e+05f), + }; + + auto magnitude = IfThenElse(Lt(x, Set(d, 1e-4f)), + EvalRationalPolynomial(d, xto025, plo, qlo), + EvalRationalPolynomial(d, xto025, p, q)); + return Or(AndNot(kSign, magnitude), original_sign); + } + + private: + static constexpr double kM1 = 2610.0 / 16384; + static constexpr double kM2 = (2523.0 / 4096) * 128; + static constexpr double kC1 = 3424.0 / 4096; + static constexpr double kC2 = (2413.0 / 4096) * 32; + static constexpr double kC3 = (2392.0 / 4096) * 32; +}; + +// sRGB +class TF_SRGB { + public: + template + JXL_INLINE V DisplayFromEncoded(V x) const { + const HWY_FULL(float) d; + const HWY_FULL(uint32_t) du; + const V kSign = BitCast(d, Set(du, 0x80000000u)); + const V original_sign = And(x, kSign); + x = AndNot(kSign, x); // abs + + // TODO(janwas): range reduction + // Computed via af_cheb_rational (k=100); replicated 4x. + HWY_ALIGN constexpr float p[(4 + 1) * 4] = { + 2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f, + 1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f, + 1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f, + 7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f, + 8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f, + }; + HWY_ALIGN constexpr float q[(4 + 1) * 4] = { + 2.631846970e-01f, 2.631846970e-01f, 2.631846970e-01f, + 2.631846970e-01f, 1.076976492e+00f, 1.076976492e+00f, + 1.076976492e+00f, 1.076976492e+00f, 4.987528350e-01f, + 4.987528350e-01f, 4.987528350e-01f, 4.987528350e-01f, + -5.512498495e-02f, -5.512498495e-02f, -5.512498495e-02f, + -5.512498495e-02f, 6.521209011e-03f, 6.521209011e-03f, + 6.521209011e-03f, 6.521209011e-03f, + }; + const V linear = Mul(x, Set(d, kLowDivInv)); + const V poly = EvalRationalPolynomial(d, x, p, q); + const V magnitude = + IfThenElse(Gt(x, Set(d, kThreshSRGBToLinear)), poly, linear); + return Or(AndNot(kSign, magnitude), original_sign); + } + + // Error ~5e-07 + template + JXL_INLINE V EncodedFromDisplay(D d, V x) const { + const hwy::HWY_NAMESPACE::Rebind du; + const V kSign = BitCast(d, Set(du, 0x80000000u)); + const V original_sign = And(x, kSign); + x = AndNot(kSign, x); // abs + + // Computed via af_cheb_rational (k=100); replicated 4x. + HWY_ALIGN constexpr float p[(4 + 1) * 4] = { + -5.135152395e-04f, -5.135152395e-04f, -5.135152395e-04f, + -5.135152395e-04f, 5.287254571e-03f, 5.287254571e-03f, + 5.287254571e-03f, 5.287254571e-03f, 3.903842876e-01f, + 3.903842876e-01f, 3.903842876e-01f, 3.903842876e-01f, + 1.474205315e+00f, 1.474205315e+00f, 1.474205315e+00f, + 1.474205315e+00f, 7.352629620e-01f, 7.352629620e-01f, + 7.352629620e-01f, 7.352629620e-01f, + }; + HWY_ALIGN constexpr float q[(4 + 1) * 4] = { + 1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f, + 3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f, + 1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f, + 9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f, + 2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f, + }; + const V linear = Mul(x, Set(d, kLowDiv)); + const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q); + const V magnitude = + IfThenElse(Gt(x, Set(d, kThreshLinearToSRGB)), poly, linear); + return Or(AndNot(kSign, magnitude), original_sign); + } + + private: + static constexpr float kThreshSRGBToLinear = 0.04045f; + static constexpr float kThreshLinearToSRGB = 0.0031308f; + static constexpr float kLowDiv = 12.92f; + static constexpr float kLowDivInv = 1.0f / kLowDiv; +}; + +// Linear to sRGB conversion with error of at most 1.2e-4. +template +V FastLinearToSRGB(D d, V v) { + const hwy::HWY_NAMESPACE::Rebind du; + const hwy::HWY_NAMESPACE::Rebind di; + // Convert to 0.25 - 0.5 range. + auto v025_05 = BitCast( + d, And(Or(BitCast(du, v), Set(du, 0x3e800000)), Set(du, 0x3effffff))); + // third degree polynomial approximation between 0.25 and 0.5 + // of 1.055/2^(7/2.4) * x^(1/2.4) * 0.5. A degree 4 polynomial only improves + // accuracy by about 3x. + auto d1 = MulAdd(v025_05, Set(d, 0.059914046f), Set(d, -0.108894556f)); + auto d2 = MulAdd(d1, v025_05, Set(d, 0.107963754f)); + auto pow = MulAdd(d2, v025_05, Set(d, 0.018092343f)); + // Compute extra multiplier depending on exponent. Valid exponent range for + // [0.0031308f, 1.0) is 0...8 after subtracting 118. + // The next three constants contain a representation of the powers of + // 2**(1/2.4) = 2**(5/12) times two; in particular, bits from 26 to 31 are + // always the same and in k2to512powers_basebits, and the two arrays contain + // the next groups of 8 bits. This ends up being a 22-bit representation (with + // a mantissa of 13 bits). The choice of polynomial to approximate is such + // that the multiplication factor has the highest 5 bits constant, and that + // the factor for the lowest possible exponent is a power of two (thus making + // the additional bits 0, which is used to correctly merge back together the + // floats). + constexpr uint32_t k2to512powers_basebits = 0x40000000; + HWY_ALIGN constexpr uint8_t k2to512powers_25to18bits[16] = { + 0x0, 0xa, 0x19, 0x26, 0x32, 0x41, 0x4d, 0x5c, + 0x68, 0x75, 0x83, 0x8f, 0xa0, 0xaa, 0xb9, 0xc6, + }; + HWY_ALIGN constexpr uint8_t k2to512powers_17to10bits[16] = { + 0x0, 0xb7, 0x4, 0xd, 0xcb, 0xe7, 0x41, 0x68, + 0x51, 0xd1, 0xeb, 0xf2, 0x0, 0xb7, 0x4, 0xd, + }; + // Note that vld1q_s8_x2 on ARM seems to actually be slower. +#if HWY_TARGET != HWY_SCALAR + using hwy::HWY_NAMESPACE::ShiftLeft; + using hwy::HWY_NAMESPACE::ShiftRight; + // Every lane of exp is now (if cast to byte) {0, 0, 0, }. + auto exp = Sub(ShiftRight<23>(BitCast(di, v)), Set(di, 118)); + auto pow25to18bits = TableLookupBytes( + LoadDup128(di, + reinterpret_cast(k2to512powers_25to18bits)), + exp); + auto pow17to10bits = TableLookupBytes( + LoadDup128(di, + reinterpret_cast(k2to512powers_17to10bits)), + exp); + // Now, pow* contain {0, 0, 0, }. Here + // we take advantage of the fact that each table has its position 0 equal to + // 0. + // We can now just reassemble the float. + auto mul = BitCast( + d, Or(Or(ShiftLeft<18>(pow25to18bits), ShiftLeft<10>(pow17to10bits)), + Set(di, k2to512powers_basebits))); +#else + // Fallback for scalar. + uint32_t exp = ((BitCast(di, v).raw >> 23) - 118) & 0xf; + auto mul = BitCast(d, Set(di, (k2to512powers_25to18bits[exp] << 18) | + (k2to512powers_17to10bits[exp] << 10) | + k2to512powers_basebits)); +#endif + return IfThenElse(Lt(v, Set(d, 0.0031308f)), Mul(v, Set(d, 12.92f)), + MulAdd(pow, mul, Set(d, -0.055))); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_TRANSFER_FUNCTIONS_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/transpose-inl.h b/third_party/jpeg-xl/lib/jxl/transpose-inl.h new file mode 100644 index 0000000000..4674420737 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/transpose-inl.h @@ -0,0 +1,203 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Block transpose for DCT/IDCT + +#if defined(LIB_JXL_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_TRANSPOSE_INL_H_ +#undef LIB_JXL_TRANSPOSE_INL_H_ +#else +#define LIB_JXL_TRANSPOSE_INL_H_ +#endif + +#include + +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/dct_block-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +#ifndef JXL_INLINE_TRANSPOSE +// Workaround for issue #42 - (excessive?) inlining causes invalid codegen. +#if defined(__arm__) +#define JXL_INLINE_TRANSPOSE HWY_NOINLINE +#else +#define JXL_INLINE_TRANSPOSE HWY_INLINE +#endif +#endif // JXL_INLINE_TRANSPOSE + +// Simple wrapper that ensures that a function will not be inlined. +template +JXL_NOINLINE void NoInlineWrapper(const T& f, const Args&... args) { + return f(args...); +} + +template +struct TransposeSimdTag {}; + +// TODO(veluca): it's not super useful to have this in the SIMD namespace. +template +JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag, + const From& from, const To& to, + size_t ROWSp, size_t COLSp) { + size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0; + size_t COLS = COLS_or_0 == 0 ? COLSp : COLS_or_0; + for (size_t n = 0; n < ROWS; ++n) { + for (size_t m = 0; m < COLS; ++m) { + to.Write(from.Read(n, m), m, n); + } + } +} + +// TODO(veluca): AVX3? +#if HWY_CAP_GE256 +constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) { + return ROWS % 8 == 0 && COLS % 8 == 0; +} + +template +JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag, + const From& from, const To& to, + size_t ROWSp, size_t COLSp) { + size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0; + size_t COLS = COLS_or_0 == 0 ? COLSp : COLS_or_0; + static_assert(MaxLanes(BlockDesc<8>()) == 8, "Invalid descriptor size"); + static_assert(ROWS_or_0 % 8 == 0, "Invalid number of rows"); + static_assert(COLS_or_0 % 8 == 0, "Invalid number of columns"); + for (size_t n = 0; n < ROWS; n += 8) { + for (size_t m = 0; m < COLS; m += 8) { + const BlockDesc<8> d; + auto i0 = from.LoadPart(d, n + 0, m + 0); + auto i1 = from.LoadPart(d, n + 1, m + 0); + auto i2 = from.LoadPart(d, n + 2, m + 0); + auto i3 = from.LoadPart(d, n + 3, m + 0); + auto i4 = from.LoadPart(d, n + 4, m + 0); + auto i5 = from.LoadPart(d, n + 5, m + 0); + auto i6 = from.LoadPart(d, n + 6, m + 0); + auto i7 = from.LoadPart(d, n + 7, m + 0); + // Surprisingly, this straightforward implementation (24 cycles on port5) + // is faster than load128+insert and LoadDup128+ConcatUpperLower+blend. + const auto q0 = InterleaveLower(d, i0, i2); + const auto q1 = InterleaveLower(d, i1, i3); + const auto q2 = InterleaveUpper(d, i0, i2); + const auto q3 = InterleaveUpper(d, i1, i3); + const auto q4 = InterleaveLower(d, i4, i6); + const auto q5 = InterleaveLower(d, i5, i7); + const auto q6 = InterleaveUpper(d, i4, i6); + const auto q7 = InterleaveUpper(d, i5, i7); + + const auto r0 = InterleaveLower(d, q0, q1); + const auto r1 = InterleaveUpper(d, q0, q1); + const auto r2 = InterleaveLower(d, q2, q3); + const auto r3 = InterleaveUpper(d, q2, q3); + const auto r4 = InterleaveLower(d, q4, q5); + const auto r5 = InterleaveUpper(d, q4, q5); + const auto r6 = InterleaveLower(d, q6, q7); + const auto r7 = InterleaveUpper(d, q6, q7); + + i0 = ConcatLowerLower(d, r4, r0); + i1 = ConcatLowerLower(d, r5, r1); + i2 = ConcatLowerLower(d, r6, r2); + i3 = ConcatLowerLower(d, r7, r3); + i4 = ConcatUpperUpper(d, r4, r0); + i5 = ConcatUpperUpper(d, r5, r1); + i6 = ConcatUpperUpper(d, r6, r2); + i7 = ConcatUpperUpper(d, r7, r3); + to.StorePart(d, i0, m + 0, n + 0); + to.StorePart(d, i1, m + 1, n + 0); + to.StorePart(d, i2, m + 2, n + 0); + to.StorePart(d, i3, m + 3, n + 0); + to.StorePart(d, i4, m + 4, n + 0); + to.StorePart(d, i5, m + 5, n + 0); + to.StorePart(d, i6, m + 6, n + 0); + to.StorePart(d, i7, m + 7, n + 0); + } + } +} +#elif HWY_TARGET != HWY_SCALAR +constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) { + return ROWS % 4 == 0 && COLS % 4 == 0; +} + +template +JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag, + const From& from, const To& to, + size_t ROWSp, size_t COLSp) { + size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0; + size_t COLS = COLS_or_0 == 0 ? COLSp : COLS_or_0; + static_assert(MaxLanes(BlockDesc<4>()) == 4, "Invalid descriptor size"); + static_assert(ROWS_or_0 % 4 == 0, "Invalid number of rows"); + static_assert(COLS_or_0 % 4 == 0, "Invalid number of columns"); + for (size_t n = 0; n < ROWS; n += 4) { + for (size_t m = 0; m < COLS; m += 4) { + const BlockDesc<4> d; + const auto p0 = from.LoadPart(d, n + 0, m + 0); + const auto p1 = from.LoadPart(d, n + 1, m + 0); + const auto p2 = from.LoadPart(d, n + 2, m + 0); + const auto p3 = from.LoadPart(d, n + 3, m + 0); + + const auto q0 = InterleaveLower(d, p0, p2); + const auto q1 = InterleaveLower(d, p1, p3); + const auto q2 = InterleaveUpper(d, p0, p2); + const auto q3 = InterleaveUpper(d, p1, p3); + + const auto r0 = InterleaveLower(d, q0, q1); + const auto r1 = InterleaveUpper(d, q0, q1); + const auto r2 = InterleaveLower(d, q2, q3); + const auto r3 = InterleaveUpper(d, q2, q3); + + to.StorePart(d, r0, m + 0, n + 0); + to.StorePart(d, r1, m + 1, n + 0); + to.StorePart(d, r2, m + 2, n + 0); + to.StorePart(d, r3, m + 3, n + 0); + } + } +} +#else +constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) { return false; } +#endif + +template +struct Transpose { + template + static void Run(const From& from, const To& to) { + // This does not guarantee anything, just saves from the most stupid + // mistakes. + JXL_DASSERT(from.Address(0, 0) != to.Address(0, 0)); + TransposeSimdTag tag; + GenericTransposeBlock(tag, from, to, N, M); + } +}; + +// Avoid inlining and unrolling transposes for large blocks. +template +struct Transpose< + N, M, typename std::enable_if<(N >= 8 && M >= 8 && N * M >= 512)>::type> { + template + static void Run(const From& from, const To& to) { + // This does not guarantee anything, just saves from the most stupid + // mistakes. + JXL_DASSERT(from.Address(0, 0) != to.Address(0, 0)); + TransposeSimdTag tag; + constexpr void (*transpose)(TransposeSimdTag, + const From&, const To&, size_t, size_t) = + GenericTransposeBlock<0, 0, From, To>; + NoInlineWrapper(transpose, tag, from, to, N, M); + } +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_TRANSPOSE_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/version.h.in b/third_party/jpeg-xl/lib/jxl/version.h.in new file mode 100644 index 0000000000..d077abec79 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/version.h.in @@ -0,0 +1,39 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +/** @addtogroup libjxl_common + * @{ + * @file version.h + * @brief libjxl version information + */ + +#ifndef JXL_VERSION_H_ +#define JXL_VERSION_H_ + +#define JPEGXL_MAJOR_VERSION @JPEGXL_MAJOR_VERSION@ ///< JPEG XL Major version +#define JPEGXL_MINOR_VERSION @JPEGXL_MINOR_VERSION@ ///< JPEG XL Minor version +#define JPEGXL_PATCH_VERSION @JPEGXL_PATCH_VERSION@ ///< JPEG XL Patch version + +/** Can be used to conditionally compile code for a specific JXL version + * @param[maj] major version + * @param[min] minor version + * + * @code + * #if JPEGXL_NUMERIC_VERSION < JPEGXL_COMPUTE_NUMERIC_VERSION(0,8,0) + * // use old/deprecated api + * #else + * // use current api + * #endif + * @endcode + */ +#define JPEGXL_COMPUTE_NUMERIC_VERSION(major,minor,patch) ((major<<24) | (minor<<16) | (patch<<8) | 0) + +/* Numeric representation of the version */ +#define JPEGXL_NUMERIC_VERSION JPEGXL_COMPUTE_NUMERIC_VERSION(JPEGXL_MAJOR_VERSION,JPEGXL_MINOR_VERSION,JPEGXL_PATCH_VERSION) + +#endif /* JXL_VERSION_H_ */ + +/** @}*/ diff --git a/third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h b/third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h new file mode 100644 index 0000000000..a473d591f2 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h @@ -0,0 +1,103 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fast but weak random generator. + +#if defined(LIB_JXL_XORSHIFT128PLUS_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_XORSHIFT128PLUS_INL_H_ +#undef LIB_JXL_XORSHIFT128PLUS_INL_H_ +#else +#define LIB_JXL_XORSHIFT128PLUS_INL_H_ +#endif + +#include + +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::ShiftLeft; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::Xor; + +// Adapted from https://github.com/vpxyz/xorshift/blob/master/xorshift128plus/ +// (MIT-license) +class Xorshift128Plus { + public: + // 8 independent generators (= single iteration for AVX-512) + enum { N = 8 }; + + explicit HWY_MAYBE_UNUSED Xorshift128Plus(const uint64_t seed) { + // Init state using SplitMix64 generator + s0_[0] = SplitMix64(seed + 0x9E3779B97F4A7C15ull); + s1_[0] = SplitMix64(s0_[0]); + for (size_t i = 1; i < N; ++i) { + s0_[i] = SplitMix64(s1_[i - 1]); + s1_[i] = SplitMix64(s0_[i]); + } + } + + HWY_MAYBE_UNUSED Xorshift128Plus(const uint32_t seed1, const uint32_t seed2, + const uint32_t seed3, const uint32_t seed4) { + // Init state using SplitMix64 generator + s0_[0] = SplitMix64(((static_cast(seed1) << 32) + seed2) + + 0x9E3779B97F4A7C15ull); + s1_[0] = SplitMix64(((static_cast(seed3) << 32) + seed4) + + 0x9E3779B97F4A7C15ull); + for (size_t i = 1; i < N; ++i) { + s0_[i] = SplitMix64(s0_[i - 1]); + s1_[i] = SplitMix64(s1_[i - 1]); + } + } + + HWY_INLINE HWY_MAYBE_UNUSED void Fill(uint64_t* HWY_RESTRICT random_bits) { +#if HWY_CAP_INTEGER64 + const HWY_FULL(uint64_t) d; + for (size_t i = 0; i < N; i += Lanes(d)) { + auto s1 = Load(d, s0_ + i); + const auto s0 = Load(d, s1_ + i); + const auto bits = Add(s1, s0); // b, c + Store(s0, d, s0_ + i); + s1 = Xor(s1, ShiftLeft<23>(s1)); + Store(bits, d, random_bits + i); + s1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0)))); + Store(s1, d, s1_ + i); + } +#else + for (size_t i = 0; i < N; ++i) { + auto s1 = s0_[i]; + const auto s0 = s1_[i]; + const auto bits = s1 + s0; // b, c + s0_[i] = s0; + s1 ^= s1 << 23; + random_bits[i] = bits; + s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5); + s1_[i] = s1; + } +#endif + } + + private: + static uint64_t SplitMix64(uint64_t z) { + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBull; + return z ^ (z >> 31); + } + + HWY_ALIGN uint64_t s0_[N]; + HWY_ALIGN uint64_t s1_[N]; +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_XORSHIFT128PLUS_INL_H_ diff --git a/third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc b/third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc new file mode 100644 index 0000000000..2b0c78b1d1 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc @@ -0,0 +1,378 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/xorshift128plus_test.cc" +#include +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/xorshift128plus-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Or; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::Sub; + +// Define to nonzero in order to print the (new) golden outputs. +#define PRINT_RESULTS 0 + +const size_t kVectors = 64; + +#if PRINT_RESULTS + +template +void Print(const uint64_t (&result)[kNumLanes]) { + printf("{ "); + for (int i = 0; i < kNumLanes; ++i) { + if (i != 0) { + printf(", "); + } + printf("0x%016llXull", result[i]); + } + printf("},\n"); +} + +#else // PRINT_RESULTS + +const uint64_t kExpected[kVectors][Xorshift128Plus::N] = { + {0x6E901576D477CBB1ull, 0xE9E53789195DA2A2ull, 0xB681F6DDA5E0AE99ull, + 0x8EFD18CE21FD6896ull, 0xA898A80DF75CF532ull, 0x50CEB2C9E2DE7E32ull, + 0x3CA7C2FEB25C0DD0ull, 0xA4D0866B80B4D836ull}, + {0x8CD6A1E6233D3A26ull, 0x3D4603ADE98B112Dull, 0xDC427AF674019E36ull, + 0xE28B4D230705AC53ull, 0x7297E9BBA88783DDull, 0x34D3D23CFCD9B41Aull, + 0x5A223615ADBE96B8ull, 0xE5EB529027CFBD01ull}, + {0xC1894CF00DFAC6A2ull, 0x18EDF8AE9085E404ull, 0x8E936625296B4CCDull, + 0x31971EF3A14A899Bull, 0xBE87535FCE0BF26Aull, 0x576F7A752BC6649Full, + 0xA44CBADCE0C6B937ull, 0x3DBA819BB17A353Aull}, + {0x27CE38DFCC1C5EB6ull, 0x920BEB5606340256ull, 0x3986CBC40C9AFC2Cull, + 0xE22BCB3EEB1E191Eull, 0x6E1FCDD3602A8FBAull, 0x052CB044E5415A29ull, + 0x46266646EFB9ECD7ull, 0x8F44914618D29335ull}, + {0xDD30AEDF72A362C5ull, 0xBC1D824E16BB98F4ull, 0x9EA6009C2AA3D2F1ull, + 0xF65C0FBBE17AF081ull, 0x22424D06A8738991ull, 0x8A62763F2B7611D2ull, + 0x2F3E89F722637939ull, 0x84D338BEF50AFD50ull}, + {0x00F46494898E2B0Bull, 0x81239DC4FB8E8003ull, 0x414AD93EC5773FE7ull, + 0x791473C450E4110Full, 0x87F127BF68C959ACull, 0x6429282D695EF67Bull, + 0x661082E11546CBA8ull, 0x5815D53FA5436BFDull}, + {0xB3DEADAB9BE6E0F9ull, 0xAA1B7B8F7CED0202ull, 0x4C5ED437699D279Eull, + 0xA4471727F1CB39D3ull, 0xE439DA193F802F70ull, 0xF89401BB04FA6493ull, + 0x3B08045A4FE898BAull, 0x32137BFE98227950ull}, + {0xFBAE4A092897FEF3ull, 0x0639F6CE56E71C8Eull, 0xF0AD6465C07F0C1Eull, + 0xFF8E28563361DCE5ull, 0xC2013DB7F86BC6B9ull, 0x8EFCC0503330102Full, + 0x3F6B767EA5C4DA40ull, 0xB9864B950B2232E1ull}, + {0x76EB58DE8E5EC22Aull, 0x9BBBF49A18B32F4Full, 0xC8405F02B2B2FAB9ull, + 0xC3E122A5F146BC34ull, 0xC90BB046660F5765ull, 0xB933981310DBECCFull, + 0x5A2A7BFC9126FD1Cull, 0x8BB388C94DF87901ull}, + {0x753EB89AD63EF3C3ull, 0xF24AAF40C89D65ADull, 0x23F68931C1A6AA6Dull, + 0xF47E79BF702C6DD0ull, 0xA3AD113244EE7EAEull, 0xD42CBEA28F793DC3ull, + 0xD896FCF1820F497Cull, 0x042B86D2818948C1ull}, + {0x8F2A4FC5A4265763ull, 0xEC499E6F95EAA10Cull, 0xE3786D4ECCD0DEB5ull, + 0xC725C53D3AC4CC43ull, 0x065A4ACBBF83610Eull, 0x35C61C9FEF167129ull, + 0x7B720AEAA7D70048ull, 0x14206B841377D039ull}, + {0xAD27D78BF96055F6ull, 0x5F43B20FF47ADCD4ull, 0xE184C2401E2BF71Eull, + 0x30B263D78990045Dull, 0xC22F00EBFF9BA201ull, 0xAE7F86522B53A562ull, + 0x2853312BC039F0A4ull, 0x868D619E6549C3C8ull}, + {0xFD5493D8AE9A8371ull, 0x773D5E224DF61B3Bull, 0x5377C54FBB1A8280ull, + 0xCAD4DE3B8265CAFAull, 0xCDF3F19C91EBD5F6ull, 0xC8EA0F182D73BD78ull, + 0x220502D593433FF1ull, 0xB81205E612DC31B1ull}, + {0x8F32A39EAEDA4C70ull, 0x1D4B0914AA4DAC7Full, 0x56EF1570F3A8B405ull, + 0x29812CB17404A592ull, 0x97A2AAF69CAE90F2ull, 0x12BF5E02778BBFE5ull, + 0x9D4B55AD42A05FD2ull, 0x06C2BAB5E6086620ull}, + {0x8DB4B9648302B253ull, 0xD756AD9E3AEA12C7ull, 0x68709B7F11D4B188ull, + 0x7CC299DDCD707A4Bull, 0x97B860C370A7661Dull, 0xCECD314FC20E64F5ull, + 0x55F412CDFB4C7EC3ull, 0x55EE97591193B525ull}, + {0xCF70F3ACA96E6254ull, 0x022FEDECA2E09F46ull, 0x686823DB60AE1ECFull, + 0xFD36190D3739830Eull, 0x74E1C09027F68120ull, 0xB5883A835C093842ull, + 0x93E1EFB927E9E4E3ull, 0xB2721E249D7E5EBEull}, + {0x69B6E21C44188CB8ull, 0x5D6CFB853655A7AAull, 0x3E001A0B425A66DCull, + 0x8C57451103A5138Full, 0x7BF8B4BE18EAB402ull, 0x494102EB8761A365ull, + 0xB33796A9F6A81F0Eull, 0x10005AB3BCCFD960ull}, + {0xB2CF25740AE965DCull, 0x6F7C1DF7EF53D670ull, 0x648DD6087AC2251Eull, + 0x040955D9851D487Dull, 0xBD550FC7E21A7F66ull, 0x57408F484DEB3AB5ull, + 0x481E24C150B506C1ull, 0x72C0C3EAF91A40D6ull}, + {0x1997A481858A5D39ull, 0x539718F4BEF50DC1ull, 0x2EC4DC4787E7E368ull, + 0xFF1CE78879419845ull, 0xE219A93DD6F6DD30ull, 0x85328618D02FEC1Aull, + 0xC86E02D969181B20ull, 0xEBEC8CD8BBA34E6Eull}, + {0x28B55088A16CE947ull, 0xDD25AC11E6350195ull, 0xBD1F176694257B1Cull, + 0x09459CCF9FCC9402ull, 0xF8047341E386C4E4ull, 0x7E8E9A9AD984C6C0ull, + 0xA4661E95062AA092ull, 0x70A9947005ED1152ull}, + {0x4C01CF75DBE98CCDull, 0x0BA076CDFC7373B9ull, 0x6C5E7A004B57FB59ull, + 0x336B82297FD3BC56ull, 0x7990C0BE74E8D60Full, 0xF0275CC00EC5C8C8ull, + 0x6CF29E682DFAD2E9ull, 0xFA4361524BD95D72ull}, + {0x631D2A19FF62F018ull, 0x41C43863B985B3FAull, 0xE052B2267038EFD9ull, + 0xE2A535FAC575F430ull, 0xE004EEA90B1FF5B8ull, 0x42DFE2CA692A1F26ull, + 0x90FB0BFC9A189ECCull, 0x4484102BD3536BD0ull}, + {0xD027134E9ACCA5A5ull, 0xBBAB4F966D476A9Bull, 0x713794A96E03D693ull, + 0x9F6335E6B94CD44Aull, 0xC5090C80E7471617ull, 0x6D9C1B0C87B58E33ull, + 0x1969CE82E31185A5ull, 0x2099B97E87754EBEull}, + {0x60EBAF4ED934350Full, 0xC26FBF0BA5E6ECFFull, 0x9E54150F0312EC57ull, + 0x0973B48364ED0041ull, 0x800A523241426CFCull, 0x03AB5EC055F75989ull, + 0x8CF315935DEEB40Aull, 0x83D3FC0190BD1409ull}, + {0x26D35394CF720A51ull, 0xCE9EAA15243CBAFEull, 0xE2B45FBAF21B29E0ull, + 0xDB92E98EDE73F9E0ull, 0x79B16F5101C26387ull, 0x1AC15959DE88C86Full, + 0x387633AEC6D6A580ull, 0xA6FC05807BFC5EB8ull}, + {0x2D26C8E47C6BADA9ull, 0x820E6EC832D52D73ull, 0xB8432C3E0ED0EE5Bull, + 0x0F84B3C4063AAA87ull, 0xF393E4366854F651ull, 0x749E1B4D2366A567ull, + 0x805EACA43480D004ull, 0x244EBF3AA54400A5ull}, + {0xBFDC3763AA79F75Aull, 0x9E3A74CC751F41DBull, 0xF401302A149DBC55ull, + 0x6B25F7973D7BF7BCull, 0x13371D34FDBC3DAEull, 0xC5E1998C8F484DCDull, + 0x7031B8AE5C364464ull, 0x3847F0C4F3DA2C25ull}, + {0x24C6387D2C0F1225ull, 0x77CCE960255C67A4ull, 0x21A0947E497B10EBull, + 0xBB5DB73A825A9D7Eull, 0x26294A41999E553Dull, 0x3953E0089F87D925ull, + 0x3DAE6E5D4E5EAAFEull, 0x74B545460341A7AAull}, + {0x710E5EB08A7DB820ull, 0x7E43C4E77CAEA025ull, 0xD4C91529C8B060C1ull, + 0x09AE26D8A7B0CA29ull, 0xAB9F356BB360A772ull, 0xB68834A25F19F6E9ull, + 0x79B8D9894C5734E2ull, 0xC6847E7C8FFD265Full}, + {0x10C4BCB06A5111E6ull, 0x57CB50955B6A2516ull, 0xEF53C87798B6995Full, + 0xAB38E15BBD8D0197ull, 0xA51C6106EFF73C93ull, 0x83D7F0E2270A7134ull, + 0x0923FD330397FCE5ull, 0xF9DE54EDFE58FB45ull}, + {0x07D44833ACCD1A94ull, 0xAAD3C9E945E2F9F3ull, 0xABF4C879B876AA37ull, + 0xF29C69A21B301619ull, 0x2DDCE959111C788Bull, 0x7CEDB48F8AC1729Bull, + 0x93F3BA9A02B659BEull, 0xF20A87FF17933CBEull}, + {0x8E96EBE93180CFE6ull, 0x94CAA12873937079ull, 0x05F613D9380D4189ull, + 0xBCAB40C1DC79F38Aull, 0x0AD8907B7C61D19Eull, 0x88534E189D103910ull, + 0x2DB2FAABA160AB8Full, 0xA070E7506B06F15Cull}, + {0x6FB1FCDAFFEF87A9ull, 0xE735CF25337A090Dull, 0x172C6EDCEFEF1825ull, + 0x76957EA49EF0542Dull, 0x819BF4CD250F7C49ull, 0xD6FF23E4AD00C4D4ull, + 0xE79673C1EC358FF0ull, 0xAC9C048144337938ull}, + {0x4C5387FF258B3AF4ull, 0xEDB68FAEC2CB1AA3ull, 0x02A624E67B4E1DA4ull, + 0x5C44797A38E08AF2ull, 0x36546A70E9411B4Bull, 0x47C17B24D2FD9675ull, + 0x101957AAA020CA26ull, 0x47A1619D4779F122ull}, + {0xF84B8BCDC92D9A3Cull, 0x951D7D2C74B3066Bull, 0x7AC287C06EDDD9B2ull, + 0x4C38FC476608D38Full, 0x224D793B19CB4BCDull, 0x835A255899BF1A41ull, + 0x4AD250E9F62DB4ABull, 0xD9B44F4B58781096ull}, + {0xABBAF99A8EB5C6B8ull, 0xFB568E900D3A9F56ull, 0x11EDF63D23C5DF11ull, + 0xA9C3011D3FA7C5A8ull, 0xAEDD3CF11AFFF725ull, 0xABCA472B5F1EDD6Bull, + 0x0600B6BB5D879804ull, 0xDB4DE007F22191A0ull}, + {0xD76CC9EFF0CE9392ull, 0xF5E0A772B59BA49Aull, 0x7D1AE1ED0C1261B5ull, + 0x79224A33B5EA4F4Aull, 0x6DD825D80C40EA60ull, 0x47FC8E747E51C953ull, + 0x695C05F72888BF98ull, 0x1A012428440B9015ull}, + {0xD754DD61F9B772BFull, 0xC4A2FCF4C0F9D4EBull, 0x461167CDF67A24A2ull, + 0x434748490EBCB9D4ull, 0x274DD9CDCA5781DEull, 0x36BAC63BA9A85209ull, + 0x30324DAFDA36B70Full, 0x337570DB4FE6DAB3ull}, + {0xF46CBDD57C551546ull, 0x8E02507E676DA3E3ull, 0xD826245A8C15406Dull, + 0xDFB38A5B71113B72ull, 0x5EA38454C95B16B5ull, 0x28C054FB87ABF3E1ull, + 0xAA2724C0BA1A8096ull, 0xECA83EC980304F2Full}, + {0x6AA76EC294EB3303ull, 0x42D4CDB2A8032E3Bull, 0x7999EDF75DCD8735ull, + 0xB422BFFE696CCDCCull, 0x8F721461FD7CCDFEull, 0x148E1A5814FDE253ull, + 0x4DC941F4375EF8FFull, 0x27B2A9E0EB5B49CFull}, + {0xCEA592EF9343EBE1ull, 0xF7D38B5FA7698903ull, 0x6CCBF352203FEAB6ull, + 0x830F3095FCCDA9C5ull, 0xDBEEF4B81B81C8F4ull, 0x6D7EB9BCEECA5CF9ull, + 0xC58ABB0FBE436C69ull, 0xE4B97E6DB2041A4Bull}, + {0x7E40FC772978AF14ull, 0xCDDA4BBAE28354A1ull, 0xE4F993B832C32613ull, + 0xD3608093C68A4B35ull, 0x9A3B60E01BEE3699ull, 0x03BEF248F3288713ull, + 0x70B9294318F3E9B4ull, 0x8D2ABB913B8610DEull}, + {0x37F209128E7D8B2Cull, 0x81D2AB375BD874BCull, 0xA716A1B7373F7408ull, + 0x0CEE97BEC4706540ull, 0xA40C5FD9CDBC1512ull, 0x73CAF6C8918409E7ull, + 0x45E11BCEDF0BBAA1ull, 0x612C612BFF6E6605ull}, + {0xF8ECB14A12D0F649ull, 0xDA683CD7C01BA1ACull, 0xA2203F7510E124C1ull, + 0x7F83E52E162F3C78ull, 0x77D2BB73456ACADBull, 0x37FC34FC840BBA6Full, + 0x3076BC7D4C6EBC1Full, 0x4F514123632B5FA9ull}, + {0x44D789DED935E884ull, 0xF8291591E09FEC9Full, 0xD9CED2CF32A2E4B7ull, + 0x95F70E1EB604904Aull, 0xDE438FE43C14F6ABull, 0x4C8D23E4FAFCF8D8ull, + 0xC716910A3067EB86ull, 0x3D6B7915315095D3ull}, + {0x3170FDBADAB92095ull, 0x8F1963933FC5650Bull, 0x72F94F00ABECFEABull, + 0x6E3AE826C6AAB4CEull, 0xA677A2BF31068258ull, 0x9660CDC4F363AF10ull, + 0xD81A15A152379EF1ull, 0x5D7D285E1080A3F9ull}, + {0xDAD5DDFF9A2249B3ull, 0x6F9721D926103FAEull, 0x1418CBB83FFA349Aull, + 0xE71A30AD48C012B2ull, 0xBE76376C63751132ull, 0x3496467ACA713AE6ull, + 0x8D7EC01369F991A3ull, 0xD8C73A88B96B154Eull}, + {0x8B5D9C74AEB4833Aull, 0xF914FB3F867B912Full, 0xB894EA034936B1DCull, + 0x8A16D21BE51C4F5Bull, 0x31FF048ED582D98Eull, 0xB95AB2F4DC65B820ull, + 0x04082B9170561AF7ull, 0xA215610A5DC836FAull}, + {0xB2ADE592C092FAACull, 0x7A1E683BCBF13294ull, 0xC7A4DBF86858C096ull, + 0x3A49940F97BFF316ull, 0xCAE5C06B82C46703ull, 0xC7F413A0F951E2BDull, + 0x6665E7BB10EB5916ull, 0x86F84A5A94EDE319ull}, + {0x4EA199D8FAA79CA3ull, 0xDFA26E5BF1981704ull, 0x0F5E081D37FA4E01ull, + 0x9CB632F89CD675CDull, 0x4A09DB89D48C0304ull, 0x88142742EA3C7672ull, + 0xAC4F149E6D2E9BDBull, 0x6D9E1C23F8B1C6C6ull}, + {0xD58BE47B92DEC0E9ull, 0x8E57573645E34328ull, 0x4CC094CCB5FB5126ull, + 0x5F1D66AF6FB40E3Cull, 0x2BA15509132D3B00ull, 0x0D6545646120E567ull, + 0x3CF680C45C223666ull, 0x96B28E32930179DAull}, + {0x5900C45853AC7990ull, 0x61881E3E8B7FF169ull, 0x4DE5F835DF2230FFull, + 0x4427A9E7932F73FFull, 0x9B641BAD379A8C8Dull, 0xDF271E5BF98F4E5Cull, + 0xDFDA16DB830FF5EEull, 0x371C7E7CFB89C0E9ull}, + {0x4410A8576247A250ull, 0x6AD2DA12B45AC0D9ull, 0x18DFC72AAC85EECCull, + 0x06FC8BB2A0EF25C8ull, 0xEB287619C85E6118ull, 0x19553ECA67F25A2Cull, + 0x3B9557F1DCEC5BAAull, 0x7BAD9E8B710D1079ull}, + {0x34F365D66BD22B28ull, 0xE6E124B9F10F835Dull, 0x0573C38ABF2B24DCull, + 0xD32E6AF10A0125AEull, 0x383590ACEA979519ull, 0x8376ED7A39E28205ull, + 0xF0B7F184DCBDA435ull, 0x062A203390E31794ull}, + {0xA2AFFD7E41918760ull, 0x7F90FC1BD0819C86ull, 0x5033C08E5A969533ull, + 0x2707AF5C6D039590ull, 0x57BBD5980F17DF9Cull, 0xD3FE6E61D763268Aull, + 0x9E0A0AE40F335A3Bull, 0x43CF4EB0A99613C5ull}, + {0xD4D2A397CE1A7C2Eull, 0x3DF7CE7CC3212DADull, 0x0880F0D5D356C75Aull, + 0xA8AFC44DD03B1346ull, 0x79263B46C13A29E0ull, 0x11071B3C0ED58E7Aull, + 0xED46DC9F538406BFull, 0x2C94974F2B94843Dull}, + {0xE246E13C39AB5D5Eull, 0xAC1018489D955B20ull, 0x8601B558771852B8ull, + 0x110BD4C06DB40173ull, 0x738FC8A18CCA0EBBull, 0x6673E09BE0EA76E5ull, + 0x024BC7A0C7527877ull, 0x45E6B4652E2EC34Eull}, + {0xD1ED26A1A375CDC8ull, 0xAABC4E896A617CB8ull, 0x0A9C9E8E57D753C6ull, + 0xA3774A75FEB4C30Eull, 0x30B816C01C93E49Eull, 0xF405BABC06D2408Cull, + 0xCC0CE6B4CE788ABCull, 0x75E7922D0447956Cull}, + {0xD07C1676A698BC95ull, 0x5F9AEA4840E2D860ull, 0xD5FC10D58BDF6F02ull, + 0xF190A2AD4BC2EEA7ull, 0x0C24D11F51726931ull, 0xDB646899A16B6512ull, + 0x7BC10670047B1DD8ull, 0x2413A5ABCD45F092ull}, + {0x4E66892190CFD923ull, 0xF10162440365EC8Eull, 0x158ACA5A6A2280AEull, + 0x0D60ED11C0224166ull, 0x7CD2E9A71B9D7488ull, 0x450D7289706AB2A3ull, + 0x88FAE34EC9A0D7DCull, 0x96FF9103575A97DAull}, + {0x77990FAC6046C446ull, 0xB174B5FB30C76676ull, 0xE352CE3EB56CF82Aull, + 0xC6039B6873A9A082ull, 0xE3F80F3AE333148Aull, 0xB853BA24BA3539B9ull, + 0xE8863E52ECCB0C74ull, 0x309B4CC1092CC245ull}, + {0xBC2B70BEE8388D9Full, 0xE48D92AE22216DCEull, 0xF15F3BF3E2C15D8Full, + 0x1DD964D4812D8B24ull, 0xD56AF02FB4665E4Cull, 0x98002200595BD9A3ull, + 0x049246D50BB8FA12ull, 0x1B542DF485B579B9ull}, + {0x2347409ADFA8E497ull, 0x36015C2211D62498ull, 0xE9F141F32EB82690ull, + 0x1F839912D0449FB9ull, 0x4E4DCFFF2D02D97Cull, 0xF8A03AB4C0F625C9ull, + 0x0605F575795DAC5Cull, 0x4746C9BEA0DDA6B1ull}, + {0xCA5BB519ECE7481Bull, 0xFD496155E55CA945ull, 0xF753B9DBB1515F81ull, + 0x50549E8BAC0F70E7ull, 0x8614FB0271E21C60ull, 0x60C72947EB0F0070ull, + 0xA6511C10AEE742B6ull, 0x48FB48F2CACCB43Eull}}; + +#endif // PRINT_RESULTS + +// Ensures Xorshift128+ returns consistent and unchanging values. +void TestGolden() { + HWY_ALIGN Xorshift128Plus rng(12345); + for (uint64_t vector = 0; vector < kVectors; ++vector) { + HWY_ALIGN uint64_t lanes[Xorshift128Plus::N]; + rng.Fill(lanes); +#if PRINT_RESULTS + Print(lanes); +#else + for (size_t i = 0; i < Xorshift128Plus::N; ++i) { + ASSERT_EQ(kExpected[vector][i], lanes[i]) + << "Where vector=" << vector << " i=" << i; + } +#endif + } +} + +// Output changes when given different seeds +void TestSeedChanges() { + HWY_ALIGN uint64_t lanes[Xorshift128Plus::N]; + + std::vector first; + constexpr size_t kNumSeeds = 16384; + first.reserve(kNumSeeds); + + // All 14-bit seeds + for (size_t seed = 0; seed < kNumSeeds; ++seed) { + HWY_ALIGN Xorshift128Plus rng(seed); + + rng.Fill(lanes); + first.push_back(lanes[0]); + } + + // All outputs are unique + ASSERT_EQ(kNumSeeds, first.size()); + std::sort(first.begin(), first.end()); + first.erase(std::unique(first.begin(), first.end()), first.end()); + EXPECT_EQ(kNumSeeds, first.size()); +} + +void TestFloat() { + test::ThreadPoolForTests pool(8); + +#ifdef JXL_DISABLE_SLOW_TESTS + const uint32_t kMaxSeed = 256; +#else // JXL_DISABLE_SLOW_TESTS + const uint32_t kMaxSeed = 4096; +#endif // JXL_DISABLE_SLOW_TESTS + EXPECT_TRUE(RunOnPool( + &pool, 0, kMaxSeed, ThreadPool::NoInit, + [](const uint32_t seed, size_t /*thread*/) { + HWY_ALIGN Xorshift128Plus rng(seed); + + const HWY_FULL(uint32_t) du; + const HWY_FULL(float) df; + HWY_ALIGN uint64_t batch[Xorshift128Plus::N]; + HWY_ALIGN float lanes[MaxLanes(df)]; + double sum = 0.0; + size_t count = 0; + const size_t kReps = 2000; + for (size_t reps = 0; reps < kReps; ++reps) { + rng.Fill(batch); + for (size_t i = 0; i < Xorshift128Plus::N * 2; i += Lanes(df)) { + const auto bits = + Load(du, reinterpret_cast(batch) + i); + // 1.0 + 23 random mantissa bits = [1, 2) + const auto rand12 = + BitCast(df, Or(ShiftRight<9>(bits), Set(du, 0x3F800000))); + const auto rand01 = Sub(rand12, Set(df, 1.0f)); + Store(rand01, df, lanes); + for (float lane : lanes) { + sum += lane; + count += 1; + EXPECT_LE(lane, 1.0f); + EXPECT_GE(lane, 0.0f); + } + } + } + + // Verify average (uniform distribution) + EXPECT_NEAR(0.5, sum / count, 0.00702); + }, + "TestXorShift")); +} + +// Not more than one 64-bit zero +void TestNotZero() { + test::ThreadPoolForTests pool(8); + +#ifdef JXL_DISABLE_SLOW_TESTS + const uint32_t kMaxSeed = 500; +#else // JXL_DISABLE_SLOW_TESTS + const uint32_t kMaxSeed = 2000; +#endif // JXL_DISABLE_SLOW_TESTS + EXPECT_TRUE(RunOnPool( + &pool, 0, kMaxSeed, ThreadPool::NoInit, + [](const uint32_t task, size_t /*thread*/) { + HWY_ALIGN uint64_t lanes[Xorshift128Plus::N]; + + HWY_ALIGN Xorshift128Plus rng(task); + size_t num_zero = 0; + for (size_t vectors = 0; vectors < 10000; ++vectors) { + rng.Fill(lanes); + for (uint64_t lane : lanes) { + num_zero += static_cast(lane == 0); + } + } + EXPECT_LE(num_zero, 1u); + }, + "TestNotZero")); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class Xorshift128Test : public hwy::TestWithParamTarget {}; + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(Xorshift128Test); + +HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestNotZero); +HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestGolden); +HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestSeedChanges); +HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestFloat); + +} // namespace jxl +#endif -- cgit v1.2.3