From 8dd16259287f58f9273002717ec4d27e97127719 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 12 Jun 2024 07:43:14 +0200 Subject: Merging upstream version 127.0. Signed-off-by: Daniel Baumann --- third_party/aom/aom/aom_image.h | 36 +- third_party/aom/aom/src/aom_image.c | 43 +- third_party/aom/aom_dsp/aom_dsp.cmake | 3 + third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 2 +- third_party/aom/aom_dsp/arm/aom_convolve8_neon.c | 401 +- .../aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c | 428 +- .../aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c | 334 +- .../aom/aom_dsp/flow_estimation/arm/disflow_neon.c | 104 +- .../aom/aom_dsp/flow_estimation/arm/disflow_neon.h | 127 + .../aom/aom_dsp/flow_estimation/arm/disflow_sve.c | 268 + third_party/aom/aom_dsp/pyramid.c | 31 +- third_party/aom/aom_dsp/x86/synonyms.h | 1 - third_party/aom/aom_util/aom_pthread.h | 1 + third_party/aom/aom_util/aom_thread.h | 2 - third_party/aom/av1/av1.cmake | 2 + third_party/aom/av1/av1_cx_iface.c | 1 + .../common/arm/compound_convolve_neon_dotprod.c | 55 +- .../aom/av1/common/arm/convolve_neon_dotprod.c | 49 +- third_party/aom/av1/common/av1_rtcd_defs.pl | 7 +- third_party/aom/av1/common/resize.c | 58 +- third_party/aom/av1/common/resize.h | 10 + third_party/aom/av1/common/x86/resize_avx2.c | 411 + .../aom/av1/encoder/arm/neon/highbd_pickrst_neon.c | 5 +- third_party/aom/av1/encoder/arm/neon/pickrst_sve.c | 590 + third_party/aom/av1/encoder/enc_enums.h | 4 + third_party/aom/av1/encoder/encodeframe.c | 4 +- third_party/aom/av1/encoder/encoder.h | 2 +- third_party/aom/av1/encoder/ethread.c | 7 +- third_party/aom/av1/encoder/global_motion.h | 7 +- third_party/aom/av1/encoder/nonrd_pickmode.c | 34 +- third_party/aom/av1/encoder/partition_search.c | 20 +- third_party/aom/av1/encoder/picklpf.c | 2 + third_party/aom/av1/encoder/pickrst.c | 21 +- third_party/aom/av1/encoder/speed_features.c | 2 +- third_party/aom/av1/encoder/tune_vmaf.c | 4 +- third_party/aom/av1/encoder/x86/pickrst_avx2.c | 12 +- 
third_party/aom/av1/encoder/x86/pickrst_sse4.c | 18 +- third_party/aom/test/aom_image_test.cc | 65 + third_party/aom/test/disflow_test.cc | 5 + third_party/aom/test/ethread_test.cc | 5 +- third_party/aom/test/frame_resize_test.cc | 157 + third_party/aom/test/test.cmake | 1 + third_party/aom/test/wiener_test.cc | 61 +- third_party/dav1d/meson.build | 2 + third_party/dav1d/meson_options.txt | 5 + third_party/dav1d/src/arm/64/mc.S | 4 +- third_party/dav1d/src/arm/64/mc_dotprod.S | 1413 + third_party/dav1d/src/arm/64/msac.S | 21 +- third_party/dav1d/src/arm/itx.h | 63 - third_party/dav1d/src/arm/mc.h | 85 +- third_party/dav1d/src/cdf.c | 1378 +- third_party/dav1d/src/cdf.h | 48 +- third_party/dav1d/src/decode.c | 95 +- third_party/dav1d/src/internal.h | 9 +- third_party/dav1d/src/itx.h | 63 + third_party/dav1d/src/lf_mask.c | 6 +- third_party/dav1d/src/meson.build | 1 + third_party/dav1d/src/refmvs.c | 4 +- third_party/dav1d/src/riscv/itx.h | 63 - third_party/dav1d/src/x86/ipred_avx2.asm | 3 +- third_party/dav1d/src/x86/itx.h | 64 - third_party/dav1d/src/x86/mc16_avx2.asm | 1602 +- third_party/dav1d/src/x86/mc_avx2.asm | 1475 +- third_party/dav1d/src/x86/mc_avx512.asm | 3739 +- third_party/dav1d/tests/meson.build | 2 +- third_party/gemmology/gemmology.h | 9 +- third_party/gemmology/moz.yaml | 4 +- third_party/jpeg-xl/AUTHORS | 2 + third_party/jpeg-xl/CMakeLists.txt | 12 +- third_party/jpeg-xl/examples/decode_progressive.cc | 8 +- third_party/jpeg-xl/lib/extras/dec/apng.cc | 1226 +- third_party/jpeg-xl/lib/extras/dec/apng.h | 3 +- .../jpeg-xl/lib/extras/dec/color_description.cc | 38 +- third_party/jpeg-xl/lib/extras/dec/decode.cc | 9 + third_party/jpeg-xl/lib/extras/dec/decode.h | 2 + third_party/jpeg-xl/lib/extras/dec/gif.cc | 1 + third_party/jpeg-xl/lib/extras/dec/jpg.cc | 3 +- third_party/jpeg-xl/lib/extras/dec/jxl.cc | 4 +- third_party/jpeg-xl/lib/extras/dec/pnm.cc | 10 +- third_party/jpeg-xl/lib/extras/enc/encode.cc | 8 + third_party/jpeg-xl/lib/extras/enc/encode.h 
| 2 + third_party/jpeg-xl/lib/extras/enc/jpegli.cc | 4 + third_party/jpeg-xl/lib/extras/enc/jpg.cc | 7 +- third_party/jpeg-xl/lib/extras/jpegli_test.cc | 2 +- third_party/jpeg-xl/lib/extras/metrics.cc | 1 + .../jpeg-xl/lib/extras/packed_image_convert.cc | 1 + .../jpeg-xl/lib/include/jxl/color_encoding.h | 2 - third_party/jpeg-xl/lib/jpegli/color_transform.cc | 316 +- third_party/jpeg-xl/lib/jpegli/common.h | 7 +- third_party/jpeg-xl/lib/jpegli/decode.cc | 49 +- third_party/jpeg-xl/lib/jpegli/decode.h | 1 + third_party/jpeg-xl/lib/jpegli/decode_api_test.cc | 24 +- third_party/jpeg-xl/lib/jpegli/decode_internal.h | 4 +- third_party/jpeg-xl/lib/jpegli/decode_marker.cc | 6 +- third_party/jpeg-xl/lib/jpegli/encode.cc | 40 +- third_party/jpeg-xl/lib/jpegli/encode.h | 1 + third_party/jpeg-xl/lib/jpegli/encode_api_test.cc | 47 +- .../jpeg-xl/lib/jpegli/error_handling_test.cc | 22 +- .../jpeg-xl/lib/jpegli/input_suspension_test.cc | 14 +- .../jpeg-xl/lib/jpegli/libjpeg_test_util.cc | 7 +- .../jpeg-xl/lib/jpegli/output_suspension_test.cc | 12 + .../jpeg-xl/lib/jpegli/source_manager_test.cc | 3 +- third_party/jpeg-xl/lib/jpegli/streaming_test.cc | 10 + third_party/jpeg-xl/lib/jpegli/test_utils.cc | 63 +- third_party/jpeg-xl/lib/jpegli/test_utils.h | 28 +- .../jpeg-xl/lib/jpegli/transcode_api_test.cc | 9 + third_party/jpeg-xl/lib/jxl/ac_context.h | 11 +- third_party/jpeg-xl/lib/jxl/ac_strategy.h | 3 + third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc | 8 +- third_party/jpeg-xl/lib/jxl/ans_common.h | 11 +- third_party/jpeg-xl/lib/jxl/ans_test.cc | 3 +- .../jpeg-xl/lib/jxl/base/compiler_specific.h | 10 +- third_party/jpeg-xl/lib/jxl/base/exif.h | 6 +- third_party/jpeg-xl/lib/jxl/base/include_jpeglib.h | 20 + third_party/jpeg-xl/lib/jxl/base/matrix_ops.h | 6 +- third_party/jpeg-xl/lib/jxl/base/rect.h | 194 + third_party/jpeg-xl/lib/jxl/base/span.h | 7 +- third_party/jpeg-xl/lib/jxl/bit_reader_test.cc | 1 + third_party/jpeg-xl/lib/jxl/bits_test.cc | 3 + 
third_party/jpeg-xl/lib/jxl/blending.cc | 9 + third_party/jpeg-xl/lib/jxl/blending.h | 2 + .../jpeg-xl/lib/jxl/butteraugli/butteraugli.cc | 10 +- third_party/jpeg-xl/lib/jxl/cache_aligned.h | 2 - third_party/jpeg-xl/lib/jxl/chroma_from_luma.h | 7 +- third_party/jpeg-xl/lib/jxl/cms/jxl_cms_internal.h | 16 +- third_party/jpeg-xl/lib/jxl/cms/opsin_params.h | 3 + third_party/jpeg-xl/lib/jxl/cms/tone_mapping.h | 1 + third_party/jpeg-xl/lib/jxl/coeff_order.h | 17 +- third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h | 4 +- .../jpeg-xl/lib/jxl/color_encoding_internal.h | 1 + .../jpeg-xl/lib/jxl/color_management_test.cc | 4 +- third_party/jpeg-xl/lib/jxl/compressed_dc.cc | 1 + third_party/jpeg-xl/lib/jxl/compressed_dc.h | 5 +- third_party/jpeg-xl/lib/jxl/convolve-inl.h | 2 + third_party/jpeg-xl/lib/jxl/convolve.h | 1 + third_party/jpeg-xl/lib/jxl/convolve_separable5.cc | 1 + third_party/jpeg-xl/lib/jxl/convolve_slow.cc | 4 +- third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc | 1 + third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc | 1 + third_party/jpeg-xl/lib/jxl/convolve_test.cc | 3 + third_party/jpeg-xl/lib/jxl/dct_for_test.h | 3 +- third_party/jpeg-xl/lib/jxl/dct_util.h | 5 +- third_party/jpeg-xl/lib/jxl/dec_ans.h | 7 +- third_party/jpeg-xl/lib/jxl/dec_frame.cc | 1 + third_party/jpeg-xl/lib/jxl/dec_group.cc | 6 +- third_party/jpeg-xl/lib/jxl/dec_group_border.cc | 7 + third_party/jpeg-xl/lib/jxl/dec_group_border.h | 9 +- third_party/jpeg-xl/lib/jxl/dec_modular.cc | 4 +- third_party/jpeg-xl/lib/jxl/dec_modular.h | 11 +- third_party/jpeg-xl/lib/jxl/dec_noise.cc | 13 +- third_party/jpeg-xl/lib/jxl/dec_noise.h | 6 +- third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h | 6 +- third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h | 4 +- .../jpeg-xl/lib/jxl/dec_transforms_testonly.cc | 1 - .../jpeg-xl/lib/jxl/dec_transforms_testonly.h | 3 +- third_party/jpeg-xl/lib/jxl/dec_xyb.cc | 5 +- third_party/jpeg-xl/lib/jxl/dec_xyb.h | 2 + third_party/jpeg-xl/lib/jxl/decode_test.cc | 4 +- 
third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc | 6 +- third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h | 3 + .../jpeg-xl/lib/jxl/enc_adaptive_quantization.cc | 3 +- .../jpeg-xl/lib/jxl/enc_adaptive_quantization.h | 3 +- .../jpeg-xl/lib/jxl/enc_ar_control_field.cc | 6 +- third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h | 5 +- third_party/jpeg-xl/lib/jxl/enc_cache.cc | 6 +- third_party/jpeg-xl/lib/jxl/enc_cache.h | 5 +- .../jpeg-xl/lib/jxl/enc_chroma_from_luma.cc | 6 +- third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h | 2 + third_party/jpeg-xl/lib/jxl/enc_cluster.h | 8 +- third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc | 4 +- third_party/jpeg-xl/lib/jxl/enc_coeff_order.h | 10 +- third_party/jpeg-xl/lib/jxl/enc_comparator.h | 2 + third_party/jpeg-xl/lib/jxl/enc_debug_image.cc | 5 +- third_party/jpeg-xl/lib/jxl/enc_debug_image.h | 4 +- third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc | 4 +- third_party/jpeg-xl/lib/jxl/enc_detect_dots.h | 7 +- third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc | 6 +- third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h | 4 +- third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc | 12 +- third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h | 11 +- third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc | 3 +- third_party/jpeg-xl/lib/jxl/enc_fields.cc | 2 +- third_party/jpeg-xl/lib/jxl/enc_frame.cc | 22 +- third_party/jpeg-xl/lib/jxl/enc_gaborish.cc | 5 +- third_party/jpeg-xl/lib/jxl/enc_gaborish.h | 1 + third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc | 1 + third_party/jpeg-xl/lib/jxl/enc_group.cc | 2 +- third_party/jpeg-xl/lib/jxl/enc_group.h | 3 +- third_party/jpeg-xl/lib/jxl/enc_heuristics.cc | 5 +- third_party/jpeg-xl/lib/jxl/enc_heuristics.h | 2 +- third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc | 1 + third_party/jpeg-xl/lib/jxl/enc_image_bundle.h | 1 + third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc | 2 + third_party/jpeg-xl/lib/jxl/enc_modular.cc | 9 +- third_party/jpeg-xl/lib/jxl/enc_modular.h | 1 + third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc | 4 + 
.../jpeg-xl/lib/jxl/enc_patch_dictionary.cc | 5 +- third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h | 8 +- third_party/jpeg-xl/lib/jxl/enc_photon_noise.h | 4 +- .../jpeg-xl/lib/jxl/enc_photon_noise_test.cc | 1 - third_party/jpeg-xl/lib/jxl/enc_quant_weights.h | 2 + third_party/jpeg-xl/lib/jxl/enc_xyb.cc | 1 + third_party/jpeg-xl/lib/jxl/encode_internal.h | 2 +- third_party/jpeg-xl/lib/jxl/encode_test.cc | 6 +- third_party/jpeg-xl/lib/jxl/epf.cc | 18 +- third_party/jpeg-xl/lib/jxl/epf.h | 2 +- .../lib/jxl/fake_parallel_runner_testonly.h | 2 +- third_party/jpeg-xl/lib/jxl/fields.cc | 2 +- third_party/jpeg-xl/lib/jxl/fields.h | 3 - third_party/jpeg-xl/lib/jxl/fields_test.cc | 11 +- third_party/jpeg-xl/lib/jxl/frame_dimensions.h | 2 +- third_party/jpeg-xl/lib/jxl/frame_header.h | 10 +- third_party/jpeg-xl/lib/jxl/icc_codec.h | 2 +- third_party/jpeg-xl/lib/jxl/image.h | 184 +- third_party/jpeg-xl/lib/jxl/image_bundle.h | 5 +- third_party/jpeg-xl/lib/jxl/image_metadata.h | 1 + third_party/jpeg-xl/lib/jxl/image_ops.h | 1 + third_party/jpeg-xl/lib/jxl/image_ops_test.cc | 6 +- third_party/jpeg-xl/lib/jxl/image_test_utils.h | 8 +- third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc | 2 +- third_party/jpeg-xl/lib/jxl/jxl_test.cc | 78 +- third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc | 7 +- .../lib/jxl/modular/encoding/context_predict.h | 10 +- .../lib/jxl/modular/encoding/enc_debug_tree.cc | 3 +- third_party/jpeg-xl/lib/jxl/modular/options.h | 4 +- .../lib/jxl/modular/transform/enc_palette.cc | 1 + .../jpeg-xl/lib/jxl/modular/transform/palette.cc | 4 + .../jpeg-xl/lib/jxl/modular/transform/palette.h | 8 +- .../jpeg-xl/lib/jxl/modular/transform/squeeze.h | 4 +- .../jpeg-xl/lib/jxl/modular/transform/transform.cc | 2 +- .../jpeg-xl/lib/jxl/modular/transform/transform.h | 6 +- third_party/jpeg-xl/lib/jxl/opsin_image_test.cc | 2 + third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc | 2 + third_party/jpeg-xl/lib/jxl/passes_test.cc | 5 +- third_party/jpeg-xl/lib/jxl/preview_test.cc | 
2 + third_party/jpeg-xl/lib/jxl/quant_weights.h | 4 +- third_party/jpeg-xl/lib/jxl/quant_weights_test.cc | 7 + third_party/jpeg-xl/lib/jxl/quantizer.cc | 4 +- third_party/jpeg-xl/lib/jxl/quantizer.h | 14 +- third_party/jpeg-xl/lib/jxl/quantizer_test.cc | 8 + .../render_pipeline/low_memory_render_pipeline.cc | 7 + .../render_pipeline/low_memory_render_pipeline.h | 9 +- .../lib/jxl/render_pipeline/render_pipeline.cc | 4 + .../lib/jxl/render_pipeline/render_pipeline.h | 12 +- .../jxl/render_pipeline/render_pipeline_stage.h | 4 +- .../jxl/render_pipeline/render_pipeline_test.cc | 2 +- .../jxl/render_pipeline/simple_render_pipeline.cc | 6 + .../jxl/render_pipeline/simple_render_pipeline.h | 7 +- .../jpeg-xl/lib/jxl/render_pipeline/stage_noise.h | 11 +- .../lib/jxl/render_pipeline/stage_splines.cc | 8 + .../jpeg-xl/lib/jxl/render_pipeline/stage_write.h | 6 +- .../render_pipeline/test_render_pipeline_stages.h | 6 +- third_party/jpeg-xl/lib/jxl/roundtrip_test.cc | 3 +- third_party/jpeg-xl/lib/jxl/sanitizers.h | 8 +- third_party/jpeg-xl/lib/jxl/speed_tier_test.cc | 2 +- third_party/jpeg-xl/lib/jxl/splines.cc | 3 +- third_party/jpeg-xl/lib/jxl/splines.h | 1 + third_party/jpeg-xl/lib/jxl/splines_gbench.cc | 5 +- third_party/jpeg-xl/lib/jxl/splines_test.cc | 1 + third_party/jpeg-xl/lib/jxl/test_image.cc | 3 + third_party/jpeg-xl/lib/jxl/test_image.h | 3 +- third_party/jpeg-xl/lib/jxl/test_utils.h | 12 + third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h | 4 +- third_party/jpeg-xl/lib/jxl_extras.cmake | 2 +- third_party/jpeg-xl/lib/jxl_lists.bzl | 2 + third_party/jpeg-xl/lib/jxl_lists.cmake | 2 + third_party/jpeg-xl/lib/jxl_tests.cmake | 2 +- .../lib/threads/thread_parallel_runner_test.cc | 3 + third_party/libwebrtc/AUTHORS | 1 + third_party/libwebrtc/DEPS | 120 +- third_party/libwebrtc/README.moz-ff-commit | 510 + third_party/libwebrtc/README.mozilla | 340 + third_party/libwebrtc/api/BUILD.gn | 25 +- third_party/libwebrtc/api/audio_codecs/BUILD.gn | 1 + 
.../libwebrtc/api/audio_codecs/audio_encoder.h | 15 +- third_party/libwebrtc/api/candidate.cc | 32 +- third_party/libwebrtc/api/candidate.h | 22 +- third_party/libwebrtc/api/candidate_unittest.cc | 56 + third_party/libwebrtc/api/crypto_params.h | 43 - .../libwebrtc/api/frame_transformer_interface.h | 6 +- third_party/libwebrtc/api/neteq/neteq.cc | 3 +- third_party/libwebrtc/api/neteq/neteq.h | 31 +- .../libwebrtc/api/peer_connection_interface.h | 17 +- .../api/rtc_event_log_output_file_unittest.cc | 3 +- third_party/libwebrtc/api/stats/attribute.h | 54 +- third_party/libwebrtc/api/stats/rtc_stats.h | 16 +- third_party/libwebrtc/api/stats/rtc_stats_member.h | 185 - third_party/libwebrtc/api/stats/rtcstats_objects.h | 417 +- .../libwebrtc/api/test/mock_frame_transformer.h | 45 + .../api/test/mock_peerconnectioninterface.h | 4 + .../api/test/mock_video_decoder_factory.h | 13 +- third_party/libwebrtc/api/test/pclf/BUILD.gn | 2 + third_party/libwebrtc/api/test/video/BUILD.gn | 1 + .../test/video/function_video_decoder_factory.h | 19 +- third_party/libwebrtc/api/transport/BUILD.gn | 6 + .../api/transport/bandwidth_estimation_settings.h | 27 + .../bandwidth_estimation_settings_gn/moz.build | 198 + third_party/libwebrtc/api/video_codecs/BUILD.gn | 5 + .../libwebrtc/api/video_codecs/av1_profile.cc | 6 +- .../libwebrtc/api/video_codecs/av1_profile.h | 3 - .../libwebrtc/api/video_codecs/sdp_video_format.cc | 42 +- .../libwebrtc/api/video_codecs/test/BUILD.gn | 3 + ...o_decoder_software_fallback_wrapper_unittest.cc | 15 +- .../libwebrtc/api/video_codecs/video_codec.cc | 30 + .../libwebrtc/api/video_codecs/video_codec.h | 4 + .../api/video_codecs/video_codecs_api_gn/moz.build | 1 + .../api/video_codecs/video_decoder_factory.cc | 45 + .../api/video_codecs/video_decoder_factory.h | 31 +- .../video_decoder_factory_template_dav1d_adapter.h | 2 +- .../video_decoder_software_fallback_wrapper.cc | 23 +- .../video_decoder_software_fallback_wrapper.h | 8 + 
.../video_encoder_software_fallback_wrapper.cc | 18 +- third_party/libwebrtc/audio/BUILD.gn | 6 +- third_party/libwebrtc/audio/audio_send_stream.cc | 118 +- third_party/libwebrtc/audio/audio_send_stream.h | 28 +- .../libwebrtc/audio/audio_send_stream_unittest.cc | 71 +- third_party/libwebrtc/audio/channel_receive.cc | 7 - ..._receive_frame_transformer_delegate_unittest.cc | 4 +- .../libwebrtc/audio/channel_receive_unittest.cc | 2 +- .../channel_send_frame_transformer_delegate.cc | 14 +- ...nel_send_frame_transformer_delegate_unittest.cc | 10 +- .../libwebrtc/audio/channel_send_unittest.cc | 2 +- third_party/libwebrtc/call/BUILD.gn | 3 +- .../call/rtp_transport_controller_send.cc | 75 +- .../libwebrtc/call/rtp_transport_controller_send.h | 6 + .../call/rtp_transport_controller_send_interface.h | 11 + third_party/libwebrtc/call/rtp_video_sender.cc | 88 +- third_party/libwebrtc/call/rtp_video_sender.h | 9 +- .../libwebrtc/call/rtp_video_sender_interface.h | 8 +- .../libwebrtc/call/rtp_video_sender_unittest.cc | 81 +- .../call/test/mock_rtp_transport_controller_send.h | 9 + third_party/libwebrtc/call/version.cc | 2 +- third_party/libwebrtc/call/video_send_stream.h | 19 +- .../libwebrtc/common_audio/wav_file_unittest.cc | 9 +- .../common_video/h265/h265_bitstream_parser.cc | 3 +- .../libwebrtc/common_video/h265/h265_common.h | 10 +- third_party/libwebrtc/examples/BUILD.gn | 5 + third_party/libwebrtc/experiments/field_trials.py | 15 + .../libwebrtc/infra/specs/client.webrtc.json | 620 +- .../infra/specs/internal.client.webrtc.json | 120 +- third_party/libwebrtc/infra/specs/mixins.pyl | 21 +- .../libwebrtc/infra/specs/mixins_webrtc.pyl | 4 +- third_party/libwebrtc/infra/specs/test_suites.pyl | 2 +- .../libwebrtc/infra/specs/tryserver.webrtc.json | 708 +- third_party/libwebrtc/infra/specs/variants.pyl | 8 +- third_party/libwebrtc/infra/specs/waterfalls.pyl | 32 +- third_party/libwebrtc/logging/BUILD.gn | 6 +- .../encoder/rtc_event_log_encoder_legacy.cc | 11 +- 
.../encoder/rtc_event_log_encoder_new_format.cc | 37 +- .../encoder/rtc_event_log_encoder_new_format.h | 10 +- .../encoder/rtc_event_log_encoder_unittest.cc | 177 +- .../events/rtc_event_ice_candidate_pair_config.cc | 8 +- .../events/rtc_event_ice_candidate_pair_config.h | 15 +- .../libwebrtc/logging/rtc_event_log/ice_logger.cc | 3 +- .../logging/rtc_event_log/rtc_event_log_impl.cc | 2 +- .../logging/rtc_event_log/rtc_event_log_parser.cc | 82 +- .../logging/rtc_event_log/rtc_event_log_parser.h | 9 + .../rtc_event_log/rtc_event_log_unittest_helper.cc | 56 +- .../rtc_event_log/rtc_event_log_unittest_helper.h | 10 + third_party/libwebrtc/media/BUILD.gn | 3 + third_party/libwebrtc/media/base/codec_unittest.cc | 10 +- .../libwebrtc/media/base/media_constants.cc | 6 + third_party/libwebrtc/media/base/media_constants.h | 6 + .../libwebrtc/media/engine/fake_webrtc_call.cc | 11 - .../libwebrtc/media/engine/fake_webrtc_call.h | 1 - .../media/engine/internal_decoder_factory.cc | 7 +- .../engine/internal_decoder_factory_unittest.cc | 2 +- .../media/engine/multiplex_codec_factory.cc | 17 +- .../media/engine/multiplex_codec_factory.h | 5 +- .../engine/multiplex_codec_factory_unittest.cc | 13 +- .../media/engine/simulcast_encoder_adapter.cc | 16 +- .../libwebrtc/media/engine/webrtc_video_engine.cc | 41 +- .../libwebrtc/media/engine/webrtc_video_engine.h | 7 +- .../media/engine/webrtc_video_engine_unittest.cc | 140 - .../libwebrtc/media/engine/webrtc_voice_engine.cc | 4 +- .../libwebrtc/modules/audio_coding/BUILD.gn | 3 - .../modules/audio_coding/acm2/acm_receiver.cc | 6 +- .../audio_coding/acm2/acm_receiver_unittest.cc | 59 - .../audio_coding/codecs/g711/audio_decoder_pcm.cc | 10 + .../audio_coding/codecs/g711/audio_decoder_pcm.h | 4 + .../audio_coding/codecs/g722/audio_decoder_g722.cc | 5 + .../audio_coding/codecs/g722/audio_decoder_g722.h | 2 + .../audio_coding/codecs/opus/audio_decoder_opus.cc | 23 +- .../audio_coding/codecs/opus/audio_decoder_opus.h | 3 + 
.../codecs/pcm16b/audio_decoder_pcm16b.cc | 5 + .../codecs/pcm16b/audio_decoder_pcm16b.h | 2 + .../modules/audio_coding/neteq/background_noise.cc | 29 +- .../modules/audio_coding/neteq/background_noise.h | 4 +- .../modules/audio_coding/neteq/decision_logic.cc | 14 +- .../audio_coding/neteq/decision_logic_unittest.cc | 13 +- .../neteq/mock/mock_packet_arrival_history.h | 9 +- .../modules/audio_coding/neteq/neteq_impl.cc | 140 +- .../modules/audio_coding/neteq/neteq_impl.h | 15 +- .../modules/audio_coding/neteq/neteq_unittest.cc | 9 +- .../audio_coding/neteq/packet_arrival_history.cc | 131 +- .../audio_coding/neteq/packet_arrival_history.h | 79 +- .../neteq/packet_arrival_history_unittest.cc | 50 +- .../modules/audio_coding/neteq/post_decode_vad.cc | 90 - .../modules/audio_coding/neteq/post_decode_vad.h | 71 - .../audio_coding/neteq/post_decode_vad_unittest.cc | 25 - .../neteq/tools/neteq_replacement_input.cc | 2 +- .../modules/audio_coding/neteq_gn/moz.build | 1 - .../include/test_audio_device_unittest.cc | 20 +- .../agc2/input_volume_controller.h | 2 +- .../audio_processing/audio_processing_impl.cc | 2 +- .../goog_cc/goog_cc_network_control.cc | 3 +- .../goog_cc/loss_based_bwe_v2.cc | 11 +- .../goog_cc/loss_based_bwe_v2.h | 2 + .../goog_cc/loss_based_bwe_v2_test.cc | 36 + .../goog_cc/probe_controller.cc | 31 +- .../goog_cc/probe_controller.h | 3 +- .../goog_cc/probe_controller_unittest.cc | 36 + .../goog_cc/send_side_bandwidth_estimation.cc | 8 +- .../goog_cc/send_side_bandwidth_estimation.h | 1 + .../desktop_capture/win/dxgi_output_duplicator.cc | 10 +- .../libwebrtc/modules/pacing/bitrate_prober.cc | 32 +- .../libwebrtc/modules/pacing/bitrate_prober.h | 6 + .../libwebrtc/modules/pacing/pacing_controller.cc | 4 + .../libwebrtc/modules/pacing/pacing_controller.h | 3 + .../modules/pacing/pacing_controller_unittest.cc | 41 +- .../libwebrtc/modules/pacing/packet_router.cc | 10 + .../libwebrtc/modules/pacing/packet_router.h | 2 + 
.../modules/pacing/packet_router_unittest.cc | 25 + .../modules/pacing/task_queue_paced_sender.cc | 5 + .../modules/pacing/task_queue_paced_sender.h | 3 + third_party/libwebrtc/modules/rtp_rtcp/BUILD.gn | 12 +- .../source/create_video_rtp_depacketizer.cc | 8 +- .../source/frame_transformer_factory_unittest.cc | 2 +- .../rtp_rtcp/source/rtp_packet_h265_common.h | 54 + .../modules/rtp_rtcp/source/rtp_packetizer_h265.cc | 58 +- .../source/rtp_packetizer_h265_unittest.cc | 91 +- ...er_video_frame_transformer_delegate_unittest.cc | 2 +- .../rtp_rtcp/source/rtp_sender_video_unittest.cc | 2 +- ...receiver_frame_transformer_delegate_unittest.cc | 2 +- .../rtp_rtcp/source/video_rtp_depacketizer_h265.cc | 244 + .../rtp_rtcp/source/video_rtp_depacketizer_h265.h | 28 + .../source/video_rtp_depacketizer_h265_unittest.cc | 400 + .../video_capture/linux/video_capture_v4l2.cc | 46 +- .../video_capture/linux/video_capture_v4l2.h | 8 +- .../libwebrtc/modules/video_coding/BUILD.gn | 26 +- .../video_coding/codecs/av1/libaom_av1_encoder.cc | 18 +- .../codecs/av1/libaom_av1_encoder_unittest.cc | 25 + .../video_coding/codecs/av1/libaom_av1_unittest.cc | 1 + .../multiplex/include/multiplex_decoder_adapter.h | 5 +- .../codecs/multiplex/multiplex_decoder_adapter.cc | 7 +- .../multiplex/test/multiplex_adapter_unittest.cc | 12 +- .../video_coding/codecs/test/video_codec_test.cc | 20 +- .../video_encoder_decoder_instantiation_tests.cc | 8 +- .../codecs/test/videocodec_test_fixture_impl.cc | 6 +- .../modules/video_coding/codecs/vp8/include/vp8.h | 5 + .../video_coding/codecs/vp8/libvpx_vp8_decoder.cc | 30 +- .../video_coding/codecs/vp8/libvpx_vp8_decoder.h | 6 + .../codecs/vp8/libvpx_vp8_simulcast_test.cc | 4 +- .../codecs/vp8/test/vp8_impl_unittest.cc | 3 +- .../video_coding/codecs/vp9/libvpx_vp9_encoder.cc | 48 +- .../video_coding/codecs/vp9/libvpx_vp9_encoder.h | 8 + .../modules/video_coding/codecs/vp9/svc_config.cc | 3 + .../video_coding/codecs/vp9/svc_config_unittest.cc | 20 + 
.../codecs/vp9/test/vp9_impl_unittest.cc | 109 + .../modules/video_coding/h264_packet_buffer.cc | 287 - .../modules/video_coding/h264_packet_buffer.h | 56 - .../video_coding/h264_packet_buffer_unittest.cc | 778 - .../modules/video_coding/h26x_packet_buffer.cc | 337 + .../modules/video_coding/h26x_packet_buffer.h | 57 + .../video_coding/h26x_packet_buffer_unittest.cc | 1058 + .../video_coding/include/video_error_codes.h | 4 - .../include/video_error_codes_utils.cc | 46 + .../video_coding/include/video_error_codes_utils.h | 22 + .../utility/simulcast_test_fixture_impl.cc | 5 +- .../video_codec_initializer_unittest.cc | 21 + .../video_codec_interface_gn/moz.build | 1 + third_party/libwebrtc/moz-patch-stack/0001.patch | 14 +- third_party/libwebrtc/moz-patch-stack/0012.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0013.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0021.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0023.patch | 6 +- third_party/libwebrtc/moz-patch-stack/0030.patch | 84 +- third_party/libwebrtc/moz-patch-stack/0032.patch | 11 +- third_party/libwebrtc/moz-patch-stack/0033.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0034.patch | 26 +- third_party/libwebrtc/moz-patch-stack/0042.patch | 8 +- third_party/libwebrtc/moz-patch-stack/0044.patch | 6 +- third_party/libwebrtc/moz-patch-stack/0050.patch | 6 +- third_party/libwebrtc/moz-patch-stack/0052.patch | 12 +- third_party/libwebrtc/moz-patch-stack/0054.patch | 2 +- third_party/libwebrtc/moz-patch-stack/0059.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0061.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0064.patch | 2 +- third_party/libwebrtc/moz-patch-stack/0066.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0068.patch | 6 +- third_party/libwebrtc/moz-patch-stack/0069.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0070.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0071.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0075.patch | 4 +- 
third_party/libwebrtc/moz-patch-stack/0076.patch | 2 +- third_party/libwebrtc/moz-patch-stack/0081.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0085.patch | 4 +- third_party/libwebrtc/moz-patch-stack/0086.patch | 2 +- third_party/libwebrtc/moz-patch-stack/0091.patch | 115 +- third_party/libwebrtc/moz-patch-stack/0092.patch | 54 +- third_party/libwebrtc/moz-patch-stack/0093.patch | 62 +- third_party/libwebrtc/moz-patch-stack/0094.patch | 55 +- third_party/libwebrtc/moz-patch-stack/0095.patch | 44 +- third_party/libwebrtc/moz-patch-stack/0096.patch | 35385 +++- third_party/libwebrtc/moz-patch-stack/0097.patch | 35372 +--- third_party/libwebrtc/moz-patch-stack/0098.patch | 78 +- third_party/libwebrtc/moz-patch-stack/0099.patch | 88 +- third_party/libwebrtc/moz-patch-stack/0100.patch | 145 +- third_party/libwebrtc/moz-patch-stack/0101.patch | 212 +- third_party/libwebrtc/moz-patch-stack/0102.patch | 134 +- third_party/libwebrtc/moz-patch-stack/0103.patch | 101 +- third_party/libwebrtc/moz-patch-stack/0104.patch | 80 +- third_party/libwebrtc/moz-patch-stack/0105.patch | 136 +- third_party/libwebrtc/moz-patch-stack/0106.patch | 301 +- third_party/libwebrtc/moz-patch-stack/0107.patch | 55 +- third_party/libwebrtc/moz-patch-stack/0108.patch | 27 - third_party/libwebrtc/moz-patch-stack/0109.patch | 243 - third_party/libwebrtc/moz-patch-stack/0110.patch | 207 - .../541f202354.no-op-cherry-pick-msg | 1 - .../74a4038ead.no-op-cherry-pick-msg | 1 + .../958c9ac546.no-op-cherry-pick-msg | 1 - .../de3c726121.no-op-cherry-pick-msg | 1 - third_party/libwebrtc/moz.build | 1 + .../zero_checksum_acceptable_chunk_parameter.cc | 28 +- .../zero_checksum_acceptable_chunk_parameter.h | 16 +- ...ero_checksum_acceptable_chunk_parameter_test.cc | 19 +- .../libwebrtc/net/dcsctp/packet/sctp_packet.cc | 4 +- .../net/dcsctp/packet/sctp_packet_test.cc | 26 +- .../libwebrtc/net/dcsctp/public/dcsctp_options.h | 15 +- third_party/libwebrtc/net/dcsctp/public/types.h | 21 + 
.../libwebrtc/net/dcsctp/socket/capabilities.h | 2 +- .../libwebrtc/net/dcsctp/socket/dcsctp_socket.cc | 25 +- .../net/dcsctp/socket/dcsctp_socket_test.cc | 36 +- .../net/dcsctp/socket/heartbeat_handler_test.cc | 3 +- third_party/libwebrtc/p2p/BUILD.gn | 1039 +- .../libwebrtc/p2p/base/basic_ice_controller.cc | 4 +- .../libwebrtc/p2p/base/basic_ice_controller.h | 1 - .../p2p/base/basic_packet_socket_factory.cc | 6 +- .../libwebrtc/p2p/base/candidate_pair_interface.h | 12 + third_party/libwebrtc/p2p/base/connection.cc | 54 +- third_party/libwebrtc/p2p/base/connection.h | 44 +- .../libwebrtc/p2p/base/fake_port_allocator.h | 5 +- third_party/libwebrtc/p2p/base/p2p_constants.h | 15 + .../libwebrtc/p2p/base/p2p_transport_channel.cc | 13 +- .../libwebrtc/p2p/base/p2p_transport_channel.h | 6 +- .../p2p/base/p2p_transport_channel_unittest.cc | 283 +- third_party/libwebrtc/p2p/base/port.cc | 39 +- third_party/libwebrtc/p2p/base/port.h | 81 +- third_party/libwebrtc/p2p/base/port_allocator.cc | 16 +- third_party/libwebrtc/p2p/base/port_allocator.h | 20 +- .../libwebrtc/p2p/base/port_allocator_unittest.cc | 5 +- third_party/libwebrtc/p2p/base/port_interface.h | 75 + third_party/libwebrtc/p2p/base/port_unittest.cc | 11 +- .../p2p/base/regathering_controller_unittest.cc | 2 - .../libwebrtc/p2p/base/stun_port_unittest.cc | 2 +- third_party/libwebrtc/p2p/base/tcp_port.cc | 3 +- .../libwebrtc/p2p/base/transport_description.h | 11 - .../p2p/base/transport_description_factory.cc | 92 +- .../p2p/base/transport_description_factory.h | 15 +- .../base/transport_description_factory_unittest.cc | 112 +- third_party/libwebrtc/p2p/base/turn_port.cc | 1 + third_party/libwebrtc/p2p/base/turn_port.h | 3 +- .../libwebrtc/p2p/client/basic_port_allocator.cc | 8 +- .../p2p/client/basic_port_allocator_unittest.cc | 48 +- third_party/libwebrtc/pc/BUILD.gn | 154 +- third_party/libwebrtc/pc/dtls_srtp_transport.h | 1 - third_party/libwebrtc/pc/jsep_transport.cc | 75 +- 
third_party/libwebrtc/pc/jsep_transport.h | 10 - .../libwebrtc/pc/jsep_transport_controller.cc | 20 +- .../libwebrtc/pc/jsep_transport_controller.h | 2 +- .../libwebrtc/pc/jsep_transport_unittest.cc | 171 +- third_party/libwebrtc/pc/media_session.cc | 428 +- third_party/libwebrtc/pc/media_session.h | 5 - third_party/libwebrtc/pc/media_session_unittest.cc | 482 +- third_party/libwebrtc/pc/peer_connection.cc | 100 +- third_party/libwebrtc/pc/peer_connection.h | 2 + .../pc/peer_connection_crypto_unittest.cc | 284 +- .../peer_connection_encodings_integrationtest.cc | 89 + .../libwebrtc/pc/peer_connection_ice_unittest.cc | 4 +- .../pc/peer_connection_integrationtest.cc | 34 +- .../pc/peer_connection_interface_unittest.cc | 49 +- third_party/libwebrtc/pc/peer_connection_proxy.h | 4 + third_party/libwebrtc/pc/rtc_stats_collector.cc | 26 +- .../libwebrtc/pc/rtc_stats_collector_unittest.cc | 5 +- .../libwebrtc/pc/rtc_stats_integrationtest.cc | 17 +- third_party/libwebrtc/pc/rtc_stats_traversal.cc | 3 +- third_party/libwebrtc/pc/rtp_receiver.cc | 5 +- third_party/libwebrtc/pc/rtp_transceiver.cc | 29 + .../libwebrtc/pc/rtp_transceiver_unittest.cc | 75 + third_party/libwebrtc/pc/sdp_offer_answer.cc | 25 +- third_party/libwebrtc/pc/session_description.h | 9 - third_party/libwebrtc/pc/srtp_filter.cc | 280 - third_party/libwebrtc/pc/srtp_filter.h | 147 - third_party/libwebrtc/pc/srtp_filter_unittest.cc | 472 - third_party/libwebrtc/pc/srtp_transport.h | 3 - .../libwebrtc/pc/test/fake_peer_connection_base.h | 3 + .../pc/test/mock_peer_connection_internal.h | 4 + third_party/libwebrtc/pc/test/svc_e2e_tests.cc | 3 +- third_party/libwebrtc/pc/used_ids.h | 11 +- third_party/libwebrtc/pc/used_ids_unittest.cc | 9 +- third_party/libwebrtc/pc/webrtc_sdp.cc | 95 +- third_party/libwebrtc/pc/webrtc_sdp_unittest.cc | 313 +- .../pc/webrtc_session_description_factory.cc | 65 +- .../pc/webrtc_session_description_factory.h | 7 +- third_party/libwebrtc/rtc_base/BUILD.gn | 5 +- 
third_party/libwebrtc/rtc_base/ip_address.h | 4 +- third_party/libwebrtc/rtc_base/nat_server.cc | 16 +- .../libwebrtc/rtc_base/nat_socket_factory.cc | 58 +- .../libwebrtc/rtc_base/nat_socket_factory.h | 6 +- third_party/libwebrtc/rtc_base/nat_unittest.cc | 10 +- .../libwebrtc/rtc_base/physical_socket_server.cc | 25 + .../libwebrtc/rtc_base/physical_socket_server.h | 2 + third_party/libwebrtc/rtc_base/proxy_server.cc | 4 - third_party/libwebrtc/rtc_base/proxy_server.h | 16 - third_party/libwebrtc/rtc_base/proxy_unittest.cc | 68 - .../libwebrtc/rtc_base/server_socket_adapters.cc | 131 - .../libwebrtc/rtc_base/server_socket_adapters.h | 34 - third_party/libwebrtc/rtc_base/socket.h | 11 +- third_party/libwebrtc/rtc_base/socket_adapters.cc | 194 - third_party/libwebrtc/rtc_base/socket_adapters.h | 36 - third_party/libwebrtc/rtc_base/socket_unittest.cc | 14 +- .../rtc_base/system/file_wrapper_unittest.cc | 3 +- .../libwebrtc/rtc_tools/network_tester/BUILD.gn | 1 + .../rtc_tools/rtc_event_log_to_text/converter.cc | 8 +- .../rtc_tools/rtc_event_log_visualizer/analyzer.cc | 32 +- third_party/libwebrtc/sdk/BUILD.gn | 15 + .../sdk/android/api/org/webrtc/Dav1dDecoder.java | 2 +- .../android/api/org/webrtc/LibvpxVp8Decoder.java | 6 +- .../android/api/org/webrtc/LibvpxVp9Decoder.java | 2 +- .../org/webrtc/SoftwareVideoDecoderFactory.java | 14 +- .../sdk/android/api/org/webrtc/VideoDecoder.java | 6 +- .../api/org/webrtc/VideoDecoderFallback.java | 7 +- .../api/org/webrtc/WrappedNativeVideoDecoder.java | 2 +- .../api/peerconnection/RTCPeerConnectionFactory.h | 13 + .../api/peerconnection/RTCPeerConnectionFactory.mm | 29 + .../peerconnection/RTCRtpCapabilities+Private.h | 32 + .../objc/api/peerconnection/RTCRtpCapabilities.h | 31 + .../objc/api/peerconnection/RTCRtpCapabilities.mm | 60 + .../peerconnection/RTCRtpCodecCapability+Private.h | 33 + .../api/peerconnection/RTCRtpCodecCapability.h | 58 + .../api/peerconnection/RTCRtpCodecCapability.mm | 116 + 
.../RTCRtpHeaderExtensionCapability+Private.h | 34 + .../RTCRtpHeaderExtensionCapability.h | 33 + .../RTCRtpHeaderExtensionCapability.mm | 56 + .../objc/api/peerconnection/RTCRtpTransceiver.h | 7 + .../objc/api/peerconnection/RTCRtpTransceiver.mm | 11 + .../unittests/RTCPeerConnectionFactory_xctest.m | 193 + third_party/libwebrtc/stats/BUILD.gn | 4 +- third_party/libwebrtc/stats/attribute.cc | 19 +- third_party/libwebrtc/stats/rtc_stats_member.cc | 62 - .../libwebrtc/stats/rtc_stats_report_unittest.cc | 7 +- third_party/libwebrtc/stats/rtc_stats_unittest.cc | 101 +- third_party/libwebrtc/stats/test/rtc_test_stats.h | 33 +- third_party/libwebrtc/test/BUILD.gn | 26 +- third_party/libwebrtc/test/call_test.cc | 4 +- third_party/libwebrtc/test/fuzzers/BUILD.gn | 6 + .../test/fuzzers/h265_depacketizer_fuzzer.cc | 19 + .../libwebrtc/test/fuzzers/neteq_signal_fuzzer.cc | 1 - .../test/fuzzers/rtp_format_h264_fuzzer.cc | 150 +- .../test/fuzzers/rtp_format_vp8_fuzzer.cc | 146 +- .../test/fuzzers/rtp_format_vp9_fuzzer.cc | 146 +- third_party/libwebrtc/test/mock_audio_encoder.h | 4 + .../libwebrtc/test/mock_frame_transformer.h | 45 - .../libwebrtc/test/mock_transformable_frame.h | 41 - third_party/libwebrtc/test/network/BUILD.gn | 3 + third_party/libwebrtc/test/pc/e2e/BUILD.gn | 2 + .../libwebrtc/test/pc/e2e/analyzer/video/BUILD.gn | 1 + .../video/quality_analyzing_video_decoder.cc | 6 +- .../video/quality_analyzing_video_decoder.h | 5 +- .../pc/e2e/network_quality_metrics_reporter.cc | 10 - .../test/pc/e2e/network_quality_metrics_reporter.h | 2 - .../test/pc/e2e/peer_connection_quality_test.cc | 5 +- ...stats_based_network_quality_metrics_reporter.cc | 10 - .../stats_based_network_quality_metrics_reporter.h | 3 - third_party/libwebrtc/test/peer_scenario/BUILD.gn | 2 + .../test/peer_scenario/peer_scenario_client.cc | 12 +- .../test/peer_scenario/peer_scenario_client.h | 1 + .../test/peer_scenario/signaling_route.cc | 39 +- .../libwebrtc/test/peer_scenario/signaling_route.h | 17 
+- .../test/peer_scenario/tests/bwe_ramp_up_test.cc | 150 + .../peer_scenario/tests/unsignaled_stream_test.cc | 1 - .../libwebrtc/test/scenario/video_stream.cc | 4 - .../libwebrtc/test/testsupport/file_utils.cc | 17 +- .../libwebrtc/test/testsupport/file_utils.h | 5 + .../test/testsupport/file_utils_unittest.cc | 38 +- .../libwebrtc/test/testsupport/test_artifacts.cc | 9 +- third_party/libwebrtc/test/video_codec_tester.cc | 177 +- third_party/libwebrtc/test/video_codec_tester.h | 2 + .../libwebrtc/test/video_codec_tester_unittest.cc | 195 +- third_party/libwebrtc/video/BUILD.gn | 3 +- .../video/end_to_end_tests/codec_tests.cc | 12 +- .../end_to_end_tests/multi_codec_receive_tests.cc | 3 +- .../libwebrtc/video/rtp_video_stream_receiver2.cc | 21 +- .../video/rtp_video_stream_receiver2_unittest.cc | 2 +- third_party/libwebrtc/video/video_quality_test.cc | 12 +- third_party/libwebrtc/video/video_quality_test.h | 2 + .../libwebrtc/video/video_receive_stream2.cc | 2 +- .../video/video_receive_stream2_unittest.cc | 14 +- .../libwebrtc/video/video_send_stream_impl.cc | 70 +- .../libwebrtc/video/video_send_stream_impl.h | 5 +- .../video/video_send_stream_impl_unittest.cc | 250 +- .../libwebrtc/video/video_stream_encoder.cc | 55 +- third_party/moz.build | 2 + third_party/perfetto/LICENSE | 189 + third_party/perfetto/moz.build | 19 + third_party/perfetto/moz.yaml | 44 + third_party/perfetto/moz_attributes.patch | 31 + third_party/perfetto/sdk/perfetto.cc | 64804 +++++++ third_party/perfetto/sdk/perfetto.h | 164605 ++++++++++++++++++ .../glean_parser-13.0.1.dist-info/AUTHORS.md | 17 - .../glean_parser-13.0.1.dist-info/LICENSE | 373 - .../glean_parser-13.0.1.dist-info/METADATA | 790 - .../glean_parser-13.0.1.dist-info/RECORD | 48 - .../glean_parser-13.0.1.dist-info/WHEEL | 5 - .../glean_parser-13.0.1.dist-info/entry_points.txt | 2 - .../glean_parser-13.0.1.dist-info/top_level.txt | 1 - .../glean_parser-14.0.1.dist-info/AUTHORS.md | 17 + .../glean_parser-14.0.1.dist-info/LICENSE 
| 373 + .../glean_parser-14.0.1.dist-info/METADATA | 799 + .../glean_parser-14.0.1.dist-info/RECORD | 48 + .../glean_parser-14.0.1.dist-info/WHEEL | 5 + .../glean_parser-14.0.1.dist-info/entry_points.txt | 2 + .../glean_parser-14.0.1.dist-info/top_level.txt | 1 + .../glean_parser/glean_parser/javascript_server.py | 16 +- .../python/glean_parser/glean_parser/kotlin.py | 25 + .../python/glean_parser/glean_parser/parser.py | 24 +- .../python/glean_parser/glean_parser/pings.py | 6 + .../glean_parser/schemas/pings.2-0-0.schema.yaml | 22 + .../python/glean_parser/glean_parser/swift.py | 23 +- .../templates/javascript_server.jinja2 | 54 +- .../glean_parser/templates/kotlin.jinja2 | 66 +- .../glean_parser/templates/rust.jinja2 | 2 +- .../glean_parser/templates/swift.jinja2 | 49 +- .../python/glean_parser/glean_parser/util.py | 2 + third_party/python/poetry.lock | 15 +- third_party/python/requirements.in | 4 +- third_party/python/requirements.txt | 13 +- .../taskcluster_taskgraph-6.3.0.dist-info/LICENSE | 373 - .../taskcluster_taskgraph-6.3.0.dist-info/METADATA | 28 - .../taskcluster_taskgraph-6.3.0.dist-info/RECORD | 80 - .../taskcluster_taskgraph-6.3.0.dist-info/WHEEL | 5 - .../entry_points.txt | 2 - .../top_level.txt | 1 - .../taskcluster_taskgraph-8.0.1.dist-info/LICENSE | 373 + .../taskcluster_taskgraph-8.0.1.dist-info/METADATA | 123 + .../taskcluster_taskgraph-8.0.1.dist-info/RECORD | 79 + .../taskcluster_taskgraph-8.0.1.dist-info/WHEEL | 5 + .../entry_points.txt | 2 + .../top_level.txt | 1 + .../taskcluster_taskgraph/taskgraph/__init__.py | 2 +- .../taskgraph/actions/add_new_jobs.py | 2 +- .../taskgraph/actions/cancel.py | 4 +- .../taskgraph/actions/cancel_all.py | 4 +- .../taskgraph/actions/rebuild_cached_tasks.py | 2 +- .../taskgraph/actions/registry.py | 34 +- .../taskgraph/actions/retrigger.py | 26 +- .../taskgraph/actions/util.py | 15 +- .../taskcluster_taskgraph/taskgraph/config.py | 20 +- .../taskcluster_taskgraph/taskgraph/create.py | 2 +- 
.../taskcluster_taskgraph/taskgraph/decision.py | 23 +- .../taskcluster_taskgraph/taskgraph/docker.py | 48 +- .../taskgraph/files_changed.py | 91 - .../taskcluster_taskgraph/taskgraph/generator.py | 8 +- .../taskgraph/loader/default.py | 4 +- .../python/taskcluster_taskgraph/taskgraph/main.py | 83 +- .../taskcluster_taskgraph/taskgraph/morph.py | 1 + .../taskgraph/optimize/base.py | 12 + .../taskgraph/optimize/strategies.py | 16 +- .../taskcluster_taskgraph/taskgraph/parameters.py | 5 +- .../taskgraph/run-task/run-task | 14 +- .../taskgraph/target_tasks.py | 8 +- .../taskgraph/transforms/__init__.py | 3 - .../taskgraph/transforms/base.py | 2 +- .../taskgraph/transforms/code_review.py | 10 +- .../taskgraph/transforms/docker_image.py | 16 +- .../taskgraph/transforms/fetch.py | 65 +- .../taskgraph/transforms/from_deps.py | 21 +- .../taskgraph/transforms/job/__init__.py | 453 - .../taskgraph/transforms/job/common.py | 171 - .../taskgraph/transforms/job/index_search.py | 37 - .../taskgraph/transforms/job/run_task.py | 231 - .../taskgraph/transforms/job/toolchain.py | 175 - .../taskgraph/transforms/run/__init__.py | 451 + .../taskgraph/transforms/run/common.py | 165 + .../taskgraph/transforms/run/index_search.py | 37 + .../taskgraph/transforms/run/run_task.py | 231 + .../taskgraph/transforms/run/toolchain.py | 175 + .../taskgraph/transforms/task.py | 61 +- .../taskgraph/transforms/task_context.py | 10 +- .../taskgraph/util/archive.py | 52 +- .../taskgraph/util/cached_tasks.py | 45 +- .../taskgraph/util/decision.py | 79 - .../taskcluster_taskgraph/taskgraph/util/docker.py | 21 +- .../taskcluster_taskgraph/taskgraph/util/hash.py | 11 +- .../taskgraph/util/keyed_by.py | 16 +- .../taskgraph/util/memoize.py | 35 +- .../taskgraph/util/parameterization.py | 21 +- .../taskcluster_taskgraph/taskgraph/util/schema.py | 12 +- .../taskgraph/util/set_name.py | 34 + .../taskcluster_taskgraph/taskgraph/util/shell.py | 2 +- .../taskgraph/util/taskcluster.py | 71 +- 
.../taskcluster_taskgraph/taskgraph/util/time.py | 4 +- .../taskgraph/util/treeherder.py | 15 +- .../taskcluster_taskgraph/taskgraph/util/vcs.py | 36 +- .../taskcluster_taskgraph/taskgraph/util/verify.py | 27 +- .../taskcluster_taskgraph/taskgraph/util/yaml.py | 5 +- third_party/rust/ahash/.cargo-checksum.json | 2 +- third_party/rust/ahash/Cargo.toml | 74 +- third_party/rust/ahash/LICENSE-MIT | 2 +- third_party/rust/ahash/README.md | 19 +- third_party/rust/ahash/build.rs | 31 +- third_party/rust/ahash/src/aes_hash.rs | 65 +- third_party/rust/ahash/src/convert.rs | 11 +- third_party/rust/ahash/src/fallback_hash.rs | 51 +- third_party/rust/ahash/src/hash_map.rs | 150 +- third_party/rust/ahash/src/hash_quality_test.rs | 95 +- third_party/rust/ahash/src/hash_set.rs | 65 +- third_party/rust/ahash/src/lib.rs | 243 +- third_party/rust/ahash/src/operations.rs | 98 +- third_party/rust/ahash/src/random_state.rs | 511 +- third_party/rust/ahash/src/specialize.rs | 23 +- third_party/rust/ahash/tests/bench.rs | 205 +- third_party/rust/ahash/tests/map_tests.rs | 115 +- third_party/rust/ahash/tests/nopanic.rs | 35 +- .../rust/any_all_workaround/.cargo-checksum.json | 1 + third_party/rust/any_all_workaround/Cargo.toml | 28 + third_party/rust/any_all_workaround/LICENSE-APACHE | 201 + third_party/rust/any_all_workaround/LICENSE-MIT | 25 + .../rust/any_all_workaround/LICENSE-MIT-QCMS | 21 + third_party/rust/any_all_workaround/README.md | 13 + third_party/rust/any_all_workaround/build.rs | 7 + third_party/rust/any_all_workaround/src/lib.rs | 110 + third_party/rust/audio-mixer/.cargo-checksum.json | 2 +- third_party/rust/audio-mixer/Cargo.lock | 207 +- third_party/rust/audio-mixer/Cargo.toml | 2 +- third_party/rust/audio-mixer/src/channel.rs | 4 +- third_party/rust/audio-mixer/src/coefficient.rs | 239 +- third_party/rust/bindgen/clang.rs | 2 +- third_party/rust/bitflags/.cargo-checksum.json | 2 +- third_party/rust/bitflags/CHANGELOG.md | 26 + third_party/rust/bitflags/Cargo.lock | 165 
+- third_party/rust/bitflags/Cargo.toml | 6 +- third_party/rust/bitflags/README.md | 2 +- third_party/rust/bitflags/src/external.rs | 24 +- third_party/rust/bitflags/src/internal.rs | 10 +- third_party/rust/bitflags/src/lib.rs | 46 +- third_party/rust/bitflags/src/parser.rs | 85 + third_party/rust/bitflags/src/public.rs | 42 +- third_party/rust/bitflags/src/tests/all.rs | 23 + third_party/rust/bitflags/src/tests/bits.rs | 36 + third_party/rust/bitflags/src/tests/complement.rs | 53 + third_party/rust/bitflags/src/tests/contains.rs | 108 + third_party/rust/bitflags/src/tests/difference.rs | 92 + third_party/rust/bitflags/src/tests/empty.rs | 23 + third_party/rust/bitflags/src/tests/eq.rs | 10 + third_party/rust/bitflags/src/tests/extend.rs | 42 + third_party/rust/bitflags/src/tests/flags.rs | 46 + third_party/rust/bitflags/src/tests/fmt.rs | 97 + third_party/rust/bitflags/src/tests/from_bits.rs | 45 + .../rust/bitflags/src/tests/from_bits_retain.rs | 38 + .../rust/bitflags/src/tests/from_bits_truncate.rs | 42 + third_party/rust/bitflags/src/tests/from_name.rs | 42 + third_party/rust/bitflags/src/tests/insert.rs | 91 + .../rust/bitflags/src/tests/intersection.rs | 79 + third_party/rust/bitflags/src/tests/intersects.rs | 91 + third_party/rust/bitflags/src/tests/is_all.rs | 32 + third_party/rust/bitflags/src/tests/is_empty.rs | 31 + third_party/rust/bitflags/src/tests/iter.rs | 209 + third_party/rust/bitflags/src/tests/parser.rs | 332 + third_party/rust/bitflags/src/tests/remove.rs | 100 + .../bitflags/src/tests/symmetric_difference.rs | 110 + third_party/rust/bitflags/src/tests/union.rs | 71 + third_party/rust/bitflags/src/traits.rs | 1 + third_party/rust/byteorder/.cargo-checksum.json | 2 +- third_party/rust/byteorder/CHANGELOG.md | 4 + third_party/rust/byteorder/Cargo.toml | 31 +- third_party/rust/byteorder/README.md | 16 +- third_party/rust/byteorder/benches/bench.rs | 2 + third_party/rust/byteorder/src/lib.rs | 231 +- 
.../rust/core-foundation-sys/.cargo-checksum.json | 2 +- third_party/rust/core-foundation-sys/Cargo.toml | 19 +- third_party/rust/core-foundation-sys/build.rs | 14 - third_party/rust/core-foundation-sys/src/array.rs | 119 +- .../core-foundation-sys/src/attributed_string.rs | 99 +- third_party/rust/core-foundation-sys/src/bag.rs | 101 + third_party/rust/core-foundation-sys/src/base.rs | 88 +- .../rust/core-foundation-sys/src/binary_heap.rs | 83 + .../rust/core-foundation-sys/src/bit_vector.rs | 74 + third_party/rust/core-foundation-sys/src/bundle.rs | 215 +- .../rust/core-foundation-sys/src/calendar.rs | 128 + .../rust/core-foundation-sys/src/characterset.rs | 102 +- third_party/rust/core-foundation-sys/src/data.rs | 59 +- third_party/rust/core-foundation-sys/src/date.rs | 10 +- .../rust/core-foundation-sys/src/date_formatter.rs | 147 + .../rust/core-foundation-sys/src/dictionary.rs | 124 +- third_party/rust/core-foundation-sys/src/error.rs | 38 +- .../rust/core-foundation-sys/src/file_security.rs | 71 + .../rust/core-foundation-sys/src/filedescriptor.rs | 56 +- third_party/rust/core-foundation-sys/src/lib.rs | 48 +- third_party/rust/core-foundation-sys/src/locale.rs | 146 + .../rust/core-foundation-sys/src/mach_port.rs | 63 +- .../rust/core-foundation-sys/src/messageport.rs | 123 +- .../core-foundation-sys/src/notification_center.rs | 89 + third_party/rust/core-foundation-sys/src/number.rs | 74 +- .../core-foundation-sys/src/number_formatter.rs | 163 + third_party/rust/core-foundation-sys/src/plugin.rs | 99 + .../rust/core-foundation-sys/src/preferences.rs | 103 + .../rust/core-foundation-sys/src/propertylist.rs | 102 +- .../rust/core-foundation-sys/src/runloop.rs | 220 +- third_party/rust/core-foundation-sys/src/set.rs | 64 +- third_party/rust/core-foundation-sys/src/socket.rs | 188 + third_party/rust/core-foundation-sys/src/stream.rs | 281 + third_party/rust/core-foundation-sys/src/string.rs | 757 +- .../core-foundation-sys/src/string_tokenizer.rs | 91 + 
.../rust/core-foundation-sys/src/timezone.rs | 76 +- third_party/rust/core-foundation-sys/src/tree.rs | 74 + third_party/rust/core-foundation-sys/src/url.rs | 468 +- .../rust/core-foundation-sys/src/url_enumerator.rs | 62 + .../core-foundation-sys/src/user_notification.rs | 138 + third_party/rust/core-foundation-sys/src/uuid.rs | 74 +- .../rust/core-foundation-sys/src/xml_node.rs | 147 + .../rust/core-foundation-sys/src/xml_parser.rs | 174 + .../rust/core-foundation/.cargo-checksum.json | 2 +- third_party/rust/core-foundation/Cargo.toml | 19 +- third_party/rust/core-foundation/src/array.rs | 97 +- .../rust/core-foundation/src/attributed_string.rs | 51 +- third_party/rust/core-foundation/src/base.rs | 84 +- third_party/rust/core-foundation/src/boolean.rs | 17 +- third_party/rust/core-foundation/src/bundle.rs | 65 +- .../rust/core-foundation/src/characterset.rs | 4 +- third_party/rust/core-foundation/src/data.rs | 53 +- third_party/rust/core-foundation/src/date.rs | 25 +- third_party/rust/core-foundation/src/dictionary.rs | 204 +- third_party/rust/core-foundation/src/error.rs | 15 +- .../rust/core-foundation/src/filedescriptor.rs | 90 +- third_party/rust/core-foundation/src/lib.rs | 24 +- third_party/rust/core-foundation/src/mach_port.rs | 13 +- third_party/rust/core-foundation/src/number.rs | 54 +- .../rust/core-foundation/src/propertylist.rs | 109 +- third_party/rust/core-foundation/src/runloop.rs | 128 +- third_party/rust/core-foundation/src/set.rs | 32 +- third_party/rust/core-foundation/src/string.rs | 100 +- third_party/rust/core-foundation/src/timezone.rs | 25 +- third_party/rust/core-foundation/src/url.rs | 75 +- third_party/rust/core-foundation/src/uuid.rs | 32 +- .../rust/core-graphics-types/.cargo-checksum.json | 2 +- third_party/rust/core-graphics-types/Cargo.toml | 10 +- .../rust/core-graphics-types/LICENSE-APACHE | 201 + third_party/rust/core-graphics-types/LICENSE-MIT | 25 + third_party/rust/core-graphics-types/src/base.rs | 4 +- 
.../rust/core-graphics-types/src/geometry.rs | 67 +- third_party/rust/core-graphics-types/src/lib.rs | 2 +- .../rust/cssparser-macros/.cargo-checksum.json | 2 +- third_party/rust/cssparser-macros/Cargo.toml | 4 +- third_party/rust/cssparser/.cargo-checksum.json | 2 +- .../rust/cssparser/.github/workflows/main.yml | 78 - third_party/rust/cssparser/Cargo.toml | 41 +- third_party/rust/cssparser/README.md | 4 +- third_party/rust/cssparser/docs/.nojekyll | 0 third_party/rust/cssparser/src/color.rs | 63 +- third_party/rust/cssparser/src/cow_rc_str.rs | 6 +- third_party/rust/cssparser/src/macros.rs | 4 +- third_party/rust/cssparser/src/nth.rs | 18 +- third_party/rust/cssparser/src/parser.rs | 56 +- .../rust/cssparser/src/rules_and_declarations.rs | 87 +- third_party/rust/cssparser/src/serializer.rs | 36 +- third_party/rust/cssparser/src/tests.rs | 74 +- third_party/rust/cssparser/src/tokenizer.rs | 86 +- third_party/rust/cssparser/src/unicode_range.rs | 20 +- .../rust/cubeb-coreaudio/.cargo-checksum.json | 2 +- third_party/rust/cubeb-coreaudio/Cargo.toml | 5 +- .../rust/cubeb-coreaudio/src/backend/mixer.rs | 53 +- .../rust/cubeb-coreaudio/src/backend/mod.rs | 172 +- .../rust/cubeb-coreaudio/src/backend/tests/api.rs | 3 +- third_party/rust/d3d12/.cargo-checksum.json | 2 +- third_party/rust/d3d12/Cargo.toml | 2 +- third_party/rust/encoding_rs/.cargo-checksum.json | 2 +- third_party/rust/encoding_rs/Cargo.toml | 19 +- third_party/rust/encoding_rs/README.md | 34 +- third_party/rust/encoding_rs/src/ascii.rs | 388 +- third_party/rust/encoding_rs/src/handles.rs | 36 +- third_party/rust/encoding_rs/src/lib.rs | 13 +- third_party/rust/encoding_rs/src/mem.rs | 18 +- third_party/rust/encoding_rs/src/simd_funcs.rs | 146 +- third_party/rust/encoding_rs/src/single_byte.rs | 64 +- third_party/rust/encoding_rs/src/x_user_defined.rs | 10 +- third_party/rust/equivalent/.cargo-checksum.json | 1 + third_party/rust/equivalent/Cargo.toml | 27 + third_party/rust/equivalent/LICENSE-APACHE | 201 
+ third_party/rust/equivalent/LICENSE-MIT | 25 + third_party/rust/equivalent/README.md | 25 + third_party/rust/equivalent/src/lib.rs | 113 + .../rust/fallible_collections/.cargo-checksum.json | 2 +- third_party/rust/fallible_collections/Cargo.toml | 7 +- third_party/rust/fallible_collections/README.md | 3 +- third_party/rust/fallible_collections/src/arc.rs | 35 +- third_party/rust/fallible_collections/src/boxed.rs | 15 +- .../rust/fallible_collections/src/btree/node.rs | 29 +- third_party/rust/fallible_collections/src/lib.rs | 24 +- .../fallible_collections/src/try_reserve_error.rs | 19 + third_party/rust/fallible_collections/src/vec.rs | 4 +- third_party/rust/getrandom/.cargo-checksum.json | 2 +- third_party/rust/getrandom/CHANGELOG.md | 39 + third_party/rust/getrandom/Cargo.toml | 3 +- third_party/rust/getrandom/LICENSE-MIT | 2 +- third_party/rust/getrandom/README.md | 23 +- third_party/rust/getrandom/src/3ds.rs | 8 - third_party/rust/getrandom/src/apple-other.rs | 27 +- third_party/rust/getrandom/src/bsd_arandom.rs | 8 - third_party/rust/getrandom/src/custom.rs | 9 - third_party/rust/getrandom/src/dragonfly.rs | 8 - third_party/rust/getrandom/src/error.rs | 14 +- third_party/rust/getrandom/src/error_impls.rs | 9 - third_party/rust/getrandom/src/espidf.rs | 8 - third_party/rust/getrandom/src/fuchsia.rs | 8 - third_party/rust/getrandom/src/hermit.rs | 24 +- third_party/rust/getrandom/src/hurd.rs | 8 - third_party/rust/getrandom/src/js.rs | 8 +- third_party/rust/getrandom/src/lazy.rs | 56 + third_party/rust/getrandom/src/lib.rs | 96 +- third_party/rust/getrandom/src/linux_android.rs | 47 +- .../getrandom/src/linux_android_with_fallback.rs | 33 + third_party/rust/getrandom/src/macos.rs | 40 +- third_party/rust/getrandom/src/openbsd.rs | 8 - third_party/rust/getrandom/src/rdrand.rs | 13 +- third_party/rust/getrandom/src/solaris_illumos.rs | 8 - third_party/rust/getrandom/src/solid.rs | 8 - third_party/rust/getrandom/src/use_file.rs | 21 +- 
third_party/rust/getrandom/src/util.rs | 68 +- third_party/rust/getrandom/src/util_libc.rs | 41 +- third_party/rust/getrandom/src/vita.rs | 8 - third_party/rust/getrandom/src/vxworks.rs | 8 - third_party/rust/getrandom/src/wasi.rs | 8 - third_party/rust/getrandom/src/windows.rs | 9 +- third_party/rust/getrandom/tests/rdrand.rs | 2 + third_party/rust/glean-core/.cargo-checksum.json | 2 +- third_party/rust/glean-core/Cargo.toml | 4 +- .../rust/glean-core/src/common_metric_data.rs | 5 +- third_party/rust/glean-core/src/core/mod.rs | 29 +- third_party/rust/glean-core/src/database/mod.rs | 65 + third_party/rust/glean-core/src/glean.udl | 22 +- third_party/rust/glean-core/src/internal_pings.rs | 17 +- third_party/rust/glean-core/src/lib.rs | 42 +- third_party/rust/glean-core/src/lib_unit_tests.rs | 118 +- .../src/metrics/metrics_enabled_config.rs | 46 - third_party/rust/glean-core/src/metrics/mod.rs | 21 +- third_party/rust/glean-core/src/metrics/object.rs | 29 + third_party/rust/glean-core/src/metrics/ping.rs | 74 +- .../src/metrics/remote_settings_config.rs | 52 + third_party/rust/glean-core/src/ping/mod.rs | 3 + .../rust/glean-core/src/upload/directory.rs | 6 +- third_party/rust/glean-core/src/upload/mod.rs | 24 + third_party/rust/glean-core/tests/event.rs | 4 + third_party/rust/glean-core/tests/ping.rs | 46 +- third_party/rust/glean-core/tests/ping_maker.rs | 23 +- third_party/rust/glean/.cargo-checksum.json | 2 +- third_party/rust/glean/Cargo.toml | 4 +- third_party/rust/glean/src/lib.rs | 10 +- third_party/rust/glean/src/private/ping.rs | 11 + third_party/rust/glean/src/test.rs | 29 +- third_party/rust/glean/tests/init_fails.rs | 5 +- third_party/rust/glean/tests/never_init.rs | 5 +- third_party/rust/glean/tests/no_time_to_init.rs | 5 +- third_party/rust/glean/tests/schema.rs | 3 +- third_party/rust/glean/tests/simple.rs | 5 +- .../rust/glean/tests/test-shutdown-blocking.sh | 2 +- .../rust/glean/tests/test-thread-crashing.sh | 2 +- 
third_party/rust/glean/tests/upload_timing.rs | 5 +- .../rust/gpu-descriptor-types/.cargo-checksum.json | 2 +- third_party/rust/gpu-descriptor-types/Cargo.toml | 23 +- .../rust/gpu-descriptor-types/src/device.rs | 128 +- third_party/rust/gpu-descriptor-types/src/lib.rs | 12 +- third_party/rust/gpu-descriptor-types/src/types.rs | 86 +- .../rust/gpu-descriptor/.cargo-checksum.json | 2 +- third_party/rust/gpu-descriptor/Cargo.toml | 14 +- third_party/rust/gpu-descriptor/src/allocator.rs | 38 +- third_party/rust/hashbrown/.cargo-checksum.json | 2 +- third_party/rust/hashbrown/CHANGELOG.md | 166 +- third_party/rust/hashbrown/Cargo.toml | 40 +- third_party/rust/hashbrown/README.md | 77 +- third_party/rust/hashbrown/benches/bench.rs | 2 +- .../rust/hashbrown/src/external_trait_impls/mod.rs | 2 + .../src/external_trait_impls/rayon/map.rs | 47 +- .../src/external_trait_impls/rayon/mod.rs | 1 + .../src/external_trait_impls/rayon/raw.rs | 23 +- .../src/external_trait_impls/rayon/set.rs | 34 +- .../src/external_trait_impls/rayon/table.rs | 252 + .../src/external_trait_impls/rkyv/hash_map.rs | 125 + .../src/external_trait_impls/rkyv/hash_set.rs | 123 + .../hashbrown/src/external_trait_impls/rkyv/mod.rs | 2 + .../hashbrown/src/external_trait_impls/serde.rs | 63 +- third_party/rust/hashbrown/src/lib.rs | 78 +- third_party/rust/hashbrown/src/macros.rs | 2 +- third_party/rust/hashbrown/src/map.rs | 1324 +- third_party/rust/hashbrown/src/raw/alloc.rs | 57 +- third_party/rust/hashbrown/src/raw/bitmask.rs | 99 +- third_party/rust/hashbrown/src/raw/generic.rs | 59 +- third_party/rust/hashbrown/src/raw/mod.rs | 3319 +- third_party/rust/hashbrown/src/raw/neon.rs | 124 + third_party/rust/hashbrown/src/raw/sse2.rs | 31 +- third_party/rust/hashbrown/src/rustc_entry.rs | 32 +- third_party/rust/hashbrown/src/scopeguard.rs | 14 +- third_party/rust/hashbrown/src/set.rs | 516 +- third_party/rust/hashbrown/src/table.rs | 2070 + .../rust/hashbrown/tests/equivalent_trait.rs | 53 + 
third_party/rust/hashbrown/tests/raw.rs | 11 + third_party/rust/hashbrown/tests/rayon.rs | 4 +- third_party/rust/hashbrown/tests/set.rs | 2 +- third_party/rust/hashlink/.cargo-checksum.json | 2 +- third_party/rust/hashlink/CHANGELOG.md | 3 + third_party/rust/hashlink/Cargo.toml | 8 +- third_party/rust/hashlink/tests/serde.rs | 10 +- third_party/rust/indexmap/.cargo-checksum.json | 2 +- third_party/rust/indexmap/Cargo.toml | 47 +- third_party/rust/indexmap/README.md | 10 +- third_party/rust/indexmap/RELEASES.md | 618 +- third_party/rust/indexmap/build.rs | 8 - third_party/rust/indexmap/src/arbitrary.rs | 2 + third_party/rust/indexmap/src/borsh.rs | 122 + third_party/rust/indexmap/src/equivalent.rs | 27 - third_party/rust/indexmap/src/lib.rs | 132 +- third_party/rust/indexmap/src/macros.rs | 36 +- third_party/rust/indexmap/src/map.rs | 1410 +- third_party/rust/indexmap/src/map/core.rs | 460 +- third_party/rust/indexmap/src/map/core/entry.rs | 481 + third_party/rust/indexmap/src/map/core/raw.rs | 174 +- .../rust/indexmap/src/map/core/raw_entry_v1.rs | 652 + third_party/rust/indexmap/src/map/iter.rs | 713 + third_party/rust/indexmap/src/map/mutable.rs | 87 + third_party/rust/indexmap/src/map/serde_seq.rs | 138 + third_party/rust/indexmap/src/map/slice.rs | 539 + third_party/rust/indexmap/src/map/tests.rs | 727 + third_party/rust/indexmap/src/mutable_keys.rs | 75 - third_party/rust/indexmap/src/rayon/map.rs | 184 +- third_party/rust/indexmap/src/rayon/mod.rs | 2 + third_party/rust/indexmap/src/rayon/set.rs | 111 +- third_party/rust/indexmap/src/serde.rs | 43 +- third_party/rust/indexmap/src/serde_seq.rs | 112 - third_party/rust/indexmap/src/set.rs | 1465 +- third_party/rust/indexmap/src/set/iter.rs | 626 + third_party/rust/indexmap/src/set/mutable.rs | 86 + third_party/rust/indexmap/src/set/slice.rs | 340 + third_party/rust/indexmap/src/set/tests.rs | 723 + third_party/rust/indexmap/src/util.rs | 22 + third_party/rust/indexmap/tests/quick.rs | 198 +- 
.../rust/interrupt-support/.cargo-checksum.json | 2 +- third_party/rust/interrupt-support/Cargo.toml | 5 + third_party/rust/interrupt-support/build.rs | 8 + .../interrupt-support/src/interrupt_support.udl | 5 + third_party/rust/interrupt-support/src/shutdown.rs | 5 +- third_party/rust/interrupt-support/src/sql.rs | 7 + third_party/rust/libc/.cargo-checksum.json | 2 +- third_party/rust/libc/Cargo.toml | 2 +- .../rust/libc/src/unix/linux_like/android/mod.rs | 33 + .../src/unix/linux_like/linux/arch/generic/mod.rs | 6 +- .../rust/libc/src/unix/linux_like/linux/mod.rs | 2 + third_party/rust/libc/src/unix/newlib/vita/mod.rs | 2 + third_party/rust/libloading/.cargo-checksum.json | 2 +- third_party/rust/libloading/Cargo.toml | 16 +- third_party/rust/libloading/README.mkd | 7 +- third_party/rust/libloading/src/changelog.rs | 45 + third_party/rust/libloading/src/error.rs | 8 +- third_party/rust/libloading/src/lib.rs | 8 +- third_party/rust/libloading/src/os/unix/consts.rs | 12 +- third_party/rust/libloading/src/os/unix/mod.rs | 186 +- third_party/rust/libloading/src/os/windows/mod.rs | 154 +- third_party/rust/libloading/tests/functions.rs | 20 +- third_party/rust/metal/.cargo-checksum.json | 2 +- third_party/rust/metal/Cargo.lock | 273 +- third_party/rust/metal/Cargo.toml | 7 +- third_party/rust/metal/README.md | 7 - third_party/rust/metal/src/device.rs | 3 +- third_party/rust/metal/src/lib.rs | 8 +- third_party/rust/metal/src/library.rs | 9 +- third_party/rust/metal/src/mps.rs | 7 +- third_party/rust/metal/src/sync.rs | 4 +- .../rust/minidump-common/.cargo-checksum.json | 2 +- third_party/rust/minidump-common/Cargo.toml | 4 +- .../rust/minidump-common/src/errors/macos.rs | 25 +- third_party/rust/minidump-common/src/format.rs | 24 + .../rust/minidump-writer/.cargo-checksum.json | 2 +- third_party/rust/minidump-writer/CHANGELOG.md | 40 +- third_party/rust/minidump-writer/Cargo.lock | 1763 +- third_party/rust/minidump-writer/Cargo.toml | 27 +- 
third_party/rust/minidump-writer/README.md | 26 +- third_party/rust/minidump-writer/deny.toml | 6 +- third_party/rust/minidump-writer/src/bin/test.rs | 26 +- .../rust/minidump-writer/src/linux/dso_debug.rs | 5 - .../rust/minidump-writer/src/linux/maps_reader.rs | 147 +- .../minidump-writer/src/linux/minidump_writer.rs | 38 +- .../minidump-writer/src/linux/ptrace_dumper.rs | 81 +- .../minidump-writer/src/linux/sections/mappings.rs | 21 +- .../rust/minidump-writer/src/linux/thread_info.rs | 22 +- third_party/rust/minidump-writer/src/mac/mach.rs | 3 +- .../minidump-writer/src/mac/streams/exception.rs | 8 +- .../minidump-writer/src/mac/streams/module_list.rs | 8 + .../src/mac/streams/thread_names.rs | 4 +- .../minidump-writer/src/windows/minidump_writer.rs | 14 +- .../rust/minidump-writer/tests/common/mod.rs | 67 +- .../minidump-writer/tests/linux_minidump_writer.rs | 106 +- .../rust/minidump-writer/tests/ptrace_dumper.rs | 21 +- third_party/rust/naga/.cargo-checksum.json | 2 +- third_party/rust/naga/Cargo.toml | 9 +- third_party/rust/naga/src/arena.rs | 17 +- third_party/rust/naga/src/back/dot/mod.rs | 91 + third_party/rust/naga/src/back/glsl/features.rs | 23 + third_party/rust/naga/src/back/glsl/mod.rs | 148 +- third_party/rust/naga/src/back/hlsl/conv.rs | 5 + third_party/rust/naga/src/back/hlsl/help.rs | 94 +- third_party/rust/naga/src/back/hlsl/mod.rs | 17 + third_party/rust/naga/src/back/hlsl/writer.rs | 315 +- third_party/rust/naga/src/back/mod.rs | 17 + third_party/rust/naga/src/back/msl/mod.rs | 27 +- third_party/rust/naga/src/back/msl/writer.rs | 192 +- .../rust/naga/src/back/pipeline_constants.rs | 957 + third_party/rust/naga/src/back/spv/block.rs | 32 +- third_party/rust/naga/src/back/spv/helpers.rs | 53 +- third_party/rust/naga/src/back/spv/instructions.rs | 103 + third_party/rust/naga/src/back/spv/mod.rs | 47 +- third_party/rust/naga/src/back/spv/subgroup.rs | 207 + third_party/rust/naga/src/back/spv/writer.rs | 68 +- 
third_party/rust/naga/src/back/wgsl/writer.rs | 131 +- third_party/rust/naga/src/block.rs | 6 + third_party/rust/naga/src/compact/expressions.rs | 27 +- third_party/rust/naga/src/compact/functions.rs | 6 +- third_party/rust/naga/src/compact/mod.rs | 49 +- third_party/rust/naga/src/compact/statements.rs | 67 + third_party/rust/naga/src/error.rs | 74 + third_party/rust/naga/src/front/glsl/context.rs | 48 +- third_party/rust/naga/src/front/glsl/error.rs | 18 +- third_party/rust/naga/src/front/glsl/functions.rs | 10 + third_party/rust/naga/src/front/glsl/mod.rs | 4 +- third_party/rust/naga/src/front/glsl/parser.rs | 17 +- .../naga/src/front/glsl/parser/declarations.rs | 9 +- .../rust/naga/src/front/glsl/parser/functions.rs | 9 +- .../rust/naga/src/front/glsl/parser_tests.rs | 22 +- third_party/rust/naga/src/front/glsl/types.rs | 17 +- third_party/rust/naga/src/front/glsl/variables.rs | 1 - third_party/rust/naga/src/front/spv/convert.rs | 5 + third_party/rust/naga/src/front/spv/error.rs | 10 +- third_party/rust/naga/src/front/spv/function.rs | 13 +- third_party/rust/naga/src/front/spv/image.rs | 13 +- third_party/rust/naga/src/front/spv/mod.rs | 455 +- third_party/rust/naga/src/front/spv/null.rs | 8 +- third_party/rust/naga/src/front/wgsl/error.rs | 29 +- third_party/rust/naga/src/front/wgsl/index.rs | 1 + third_party/rust/naga/src/front/wgsl/lower/mod.rs | 414 +- third_party/rust/naga/src/front/wgsl/mod.rs | 11 + third_party/rust/naga/src/front/wgsl/parse/ast.rs | 9 + third_party/rust/naga/src/front/wgsl/parse/conv.rs | 28 + third_party/rust/naga/src/front/wgsl/parse/mod.rs | 106 +- third_party/rust/naga/src/front/wgsl/to_wgsl.rs | 3 +- third_party/rust/naga/src/lib.rs | 174 +- .../rust/naga/src/proc/constant_evaluator.rs | 589 +- third_party/rust/naga/src/proc/index.rs | 4 +- third_party/rust/naga/src/proc/mod.rs | 105 +- third_party/rust/naga/src/proc/terminator.rs | 3 + third_party/rust/naga/src/proc/typifier.rs | 8 + third_party/rust/naga/src/span.rs | 12 +- 
third_party/rust/naga/src/valid/analyzer.rs | 53 +- third_party/rust/naga/src/valid/expression.rs | 39 +- third_party/rust/naga/src/valid/function.rs | 268 +- third_party/rust/naga/src/valid/handles.rs | 91 +- third_party/rust/naga/src/valid/interface.rs | 48 +- third_party/rust/naga/src/valid/mod.rs | 228 +- third_party/rust/naga/src/valid/type.rs | 3 +- third_party/rust/neqo-common/.cargo-checksum.json | 2 +- third_party/rust/neqo-common/Cargo.toml | 13 +- third_party/rust/neqo-common/src/datagram.rs | 2 +- third_party/rust/neqo-common/src/fuzz.rs | 43 + third_party/rust/neqo-common/src/lib.rs | 4 + third_party/rust/neqo-common/src/tos.rs | 27 + third_party/rust/neqo-crypto/.cargo-checksum.json | 2 +- third_party/rust/neqo-crypto/Cargo.toml | 5 +- third_party/rust/neqo-crypto/src/aead_null.rs | 2 - third_party/rust/neqo-crypto/src/agentio.rs | 2 +- third_party/rust/neqo-crypto/src/constants.rs | 2 +- third_party/rust/neqo-crypto/src/p11.rs | 26 +- third_party/rust/neqo-http3/.cargo-checksum.json | 2 +- third_party/rust/neqo-http3/Cargo.toml | 6 +- .../rust/neqo-http3/src/buffered_send_stream.rs | 51 +- third_party/rust/neqo-http3/src/connection.rs | 20 +- .../rust/neqo-http3/src/connection_client.rs | 20 +- .../tests/webtransport/negotiation.rs | 7 +- third_party/rust/neqo-http3/src/send_message.rs | 4 +- third_party/rust/neqo-http3/src/server.rs | 4 +- third_party/rust/neqo-http3/tests/httpconn.rs | 4 +- third_party/rust/neqo-qpack/.cargo-checksum.json | 2 +- third_party/rust/neqo-qpack/Cargo.toml | 6 +- .../rust/neqo-transport/.cargo-checksum.json | 2 +- third_party/rust/neqo-transport/Cargo.toml | 22 +- .../rust/neqo-transport/src/cc/classic_cc.rs | 105 +- third_party/rust/neqo-transport/src/cc/mod.rs | 3 + .../rust/neqo-transport/src/cc/tests/cubic.rs | 34 +- .../rust/neqo-transport/src/cc/tests/new_reno.rs | 89 +- .../rust/neqo-transport/src/connection/mod.rs | 285 +- .../rust/neqo-transport/src/connection/state.rs | 39 +- 
.../rust/neqo-transport/src/connection/tests/cc.rs | 36 +- .../neqo-transport/src/connection/tests/close.rs | 25 +- .../src/connection/tests/datagram.rs | 14 +- .../neqo-transport/src/connection/tests/ecn.rs | 392 + .../src/connection/tests/handshake.rs | 26 +- .../neqo-transport/src/connection/tests/keys.rs | 10 +- .../src/connection/tests/migration.rs | 8 +- .../neqo-transport/src/connection/tests/mod.rs | 31 +- .../neqo-transport/src/connection/tests/stream.rs | 9 +- .../rust/neqo-transport/src/connection/tests/vn.rs | 8 +- third_party/rust/neqo-transport/src/ecn.rs | 225 + third_party/rust/neqo-transport/src/events.rs | 4 +- third_party/rust/neqo-transport/src/frame.rs | 143 +- third_party/rust/neqo-transport/src/lib.rs | 27 +- third_party/rust/neqo-transport/src/packet/mod.rs | 1 + third_party/rust/neqo-transport/src/path.rs | 77 +- third_party/rust/neqo-transport/src/qlog.rs | 73 +- third_party/rust/neqo-transport/src/recovery.rs | 52 +- third_party/rust/neqo-transport/src/send_stream.rs | 18 +- third_party/rust/neqo-transport/src/sender.rs | 5 + third_party/rust/neqo-transport/src/server.rs | 7 + third_party/rust/neqo-transport/src/tracking.rs | 47 +- .../rust/neqo-transport/tests/common/mod.rs | 108 - .../rust/neqo-transport/tests/connection.rs | 41 +- third_party/rust/neqo-transport/tests/network.rs | 10 +- third_party/rust/neqo-transport/tests/retry.rs | 20 +- third_party/rust/neqo-transport/tests/server.rs | 93 +- third_party/rust/nix/.cargo-checksum.json | 2 +- third_party/rust/nix/CHANGELOG.md | 205 +- third_party/rust/nix/Cargo.toml | 16 +- third_party/rust/nix/README.md | 4 +- third_party/rust/nix/build.rs | 25 + third_party/rust/nix/src/dir.rs | 27 +- third_party/rust/nix/src/env.rs | 7 +- third_party/rust/nix/src/errno.rs | 1114 +- third_party/rust/nix/src/fcntl.rs | 561 +- third_party/rust/nix/src/features.rs | 12 +- third_party/rust/nix/src/ifaddrs.rs | 38 +- third_party/rust/nix/src/kmod.rs | 4 + third_party/rust/nix/src/lib.rs | 34 +- 
third_party/rust/nix/src/macros.rs | 8 +- third_party/rust/nix/src/mount/bsd.rs | 51 +- third_party/rust/nix/src/mount/linux.rs | 2 +- third_party/rust/nix/src/mount/mod.rs | 22 +- third_party/rust/nix/src/mqueue.rs | 15 +- third_party/rust/nix/src/net/if_.rs | 329 +- third_party/rust/nix/src/poll.rs | 61 +- third_party/rust/nix/src/poll_timeout.rs | 224 + third_party/rust/nix/src/pty.rs | 15 +- third_party/rust/nix/src/sched.rs | 43 +- third_party/rust/nix/src/sys/aio.rs | 80 +- third_party/rust/nix/src/sys/epoll.rs | 25 +- third_party/rust/nix/src/sys/event.rs | 171 +- third_party/rust/nix/src/sys/eventfd.rs | 77 +- third_party/rust/nix/src/sys/fanotify.rs | 416 + third_party/rust/nix/src/sys/inotify.rs | 16 +- third_party/rust/nix/src/sys/ioctl/bsd.rs | 4 +- third_party/rust/nix/src/sys/ioctl/linux.rs | 2 + third_party/rust/nix/src/sys/ioctl/mod.rs | 112 +- third_party/rust/nix/src/sys/memfd.rs | 43 + third_party/rust/nix/src/sys/mman.rs | 320 +- third_party/rust/nix/src/sys/mod.rs | 82 +- third_party/rust/nix/src/sys/personality.rs | 2 - third_party/rust/nix/src/sys/prctl.rs | 23 +- third_party/rust/nix/src/sys/ptrace/bsd.rs | 42 +- third_party/rust/nix/src/sys/ptrace/linux.rs | 58 +- third_party/rust/nix/src/sys/ptrace/mod.rs | 20 +- third_party/rust/nix/src/sys/quota.rs | 6 +- third_party/rust/nix/src/sys/reboot.rs | 175 +- third_party/rust/nix/src/sys/resource.rs | 109 +- third_party/rust/nix/src/sys/select.rs | 267 +- third_party/rust/nix/src/sys/sendfile.rs | 116 +- third_party/rust/nix/src/sys/signal.rs | 461 +- third_party/rust/nix/src/sys/signalfd.rs | 66 +- third_party/rust/nix/src/sys/socket/addr.rs | 873 +- third_party/rust/nix/src/sys/socket/mod.rs | 943 +- third_party/rust/nix/src/sys/socket/sockopt.rs | 454 +- third_party/rust/nix/src/sys/stat.rs | 73 +- third_party/rust/nix/src/sys/statfs.rs | 363 +- third_party/rust/nix/src/sys/statvfs.rs | 48 +- third_party/rust/nix/src/sys/termios.rs | 570 +- third_party/rust/nix/src/sys/time.rs | 121 +- 
third_party/rust/nix/src/sys/timerfd.rs | 2 +- third_party/rust/nix/src/sys/uio.rs | 49 +- third_party/rust/nix/src/sys/utsname.rs | 23 +- third_party/rust/nix/src/sys/wait.rs | 58 +- third_party/rust/nix/src/time.rs | 236 +- third_party/rust/nix/src/unistd.rs | 1313 +- third_party/rust/nix/test/common/mod.rs | 10 +- third_party/rust/nix/test/sys/mod.rs | 54 +- third_party/rust/nix/test/sys/test_aio.rs | 66 +- third_party/rust/nix/test/sys/test_aio_drop.rs | 3 +- third_party/rust/nix/test/sys/test_event.rs | 41 + third_party/rust/nix/test/sys/test_fanotify.rs | 149 + third_party/rust/nix/test/sys/test_ioctl.rs | 33 +- third_party/rust/nix/test/sys/test_mman.rs | 55 +- third_party/rust/nix/test/sys/test_ptrace.rs | 12 +- third_party/rust/nix/test/sys/test_resource.rs | 43 + third_party/rust/nix/test/sys/test_select.rs | 247 +- third_party/rust/nix/test/sys/test_signal.rs | 320 +- third_party/rust/nix/test/sys/test_signalfd.rs | 63 + third_party/rust/nix/test/sys/test_socket.rs | 474 +- third_party/rust/nix/test/sys/test_sockopt.rs | 454 +- third_party/rust/nix/test/sys/test_statfs.rs | 99 + third_party/rust/nix/test/sys/test_statvfs.rs | 13 + third_party/rust/nix/test/sys/test_termios.rs | 13 +- third_party/rust/nix/test/sys/test_time.rs | 91 + third_party/rust/nix/test/sys/test_timer.rs | 102 + third_party/rust/nix/test/sys/test_uio.rs | 36 +- third_party/rust/nix/test/sys/test_utsname.rs | 17 + third_party/rust/nix/test/sys/test_wait.rs | 16 +- third_party/rust/nix/test/test.rs | 32 +- third_party/rust/nix/test/test_dir.rs | 4 +- third_party/rust/nix/test/test_errno.rs | 16 + third_party/rust/nix/test/test_fcntl.rs | 182 +- third_party/rust/nix/test/test_mount.rs | 390 +- third_party/rust/nix/test/test_mq.rs | 20 +- third_party/rust/nix/test/test_net.rs | 8 +- third_party/rust/nix/test/test_poll.rs | 31 +- third_party/rust/nix/test/test_pty.rs | 24 +- third_party/rust/nix/test/test_resource.rs | 34 - third_party/rust/nix/test/test_sendfile.rs | 90 +- 
third_party/rust/nix/test/test_stat.rs | 104 +- third_party/rust/nix/test/test_time.rs | 49 +- third_party/rust/nix/test/test_timer.rs | 102 - third_party/rust/nix/test/test_unistd.rs | 260 +- .../rust/objc_exception/.cargo-checksum.json | 1 - third_party/rust/objc_exception/Cargo.toml | 25 - third_party/rust/objc_exception/build.rs | 7 - third_party/rust/objc_exception/extern/exception.m | 21 - third_party/rust/objc_exception/src/lib.rs | 100 - third_party/rust/owning_ref/.cargo-checksum.json | 1 - third_party/rust/owning_ref/CHANGELOG.md | 8 - third_party/rust/owning_ref/Cargo.toml | 24 - third_party/rust/owning_ref/LICENSE | 21 - third_party/rust/owning_ref/README.md | 64 - third_party/rust/owning_ref/src/lib.rs | 2016 - third_party/rust/packed_simd/.cargo-checksum.json | 1 - third_party/rust/packed_simd/Cargo.toml | 83 - third_party/rust/packed_simd/LICENSE-APACHE | 201 - third_party/rust/packed_simd/LICENSE-MIT | 25 - third_party/rust/packed_simd/README.md | 144 - third_party/rust/packed_simd/bors.toml | 3 - third_party/rust/packed_simd/build.rs | 6 - third_party/rust/packed_simd/ci/all.sh | 71 - .../rust/packed_simd/ci/android-install-ndk.sh | 21 - .../rust/packed_simd/ci/android-install-sdk.sh | 60 - .../rust/packed_simd/ci/android-sysimage.sh | 56 - third_party/rust/packed_simd/ci/benchmark.sh | 32 - .../ci/deploy_and_run_on_ios_simulator.rs | 176 - .../ci/docker/aarch64-linux-android/Dockerfile | 47 - .../ci/docker/aarch64-unknown-linux-gnu/Dockerfile | 14 - .../ci/docker/arm-unknown-linux-gnueabi/Dockerfile | 15 - .../docker/arm-unknown-linux-gnueabihf/Dockerfile | 13 - .../ci/docker/armv7-linux-androideabi/Dockerfile | 47 - .../armv7-unknown-linux-gnueabihf/Dockerfile | 13 - .../ci/docker/i586-unknown-linux-gnu/Dockerfile | 7 - .../ci/docker/i686-unknown-linux-gnu/Dockerfile | 7 - .../ci/docker/mips-unknown-linux-gnu/Dockerfile | 13 - .../mips64-unknown-linux-gnuabi64/Dockerfile | 10 - .../mips64el-unknown-linux-gnuabi64/Dockerfile | 10 - 
.../ci/docker/mipsel-unknown-linux-musl/Dockerfile | 25 - .../ci/docker/powerpc-unknown-linux-gnu/Dockerfile | 13 - .../docker/powerpc64-unknown-linux-gnu/Dockerfile | 17 - .../powerpc64le-unknown-linux-gnu/Dockerfile | 11 - .../ci/docker/s390x-unknown-linux-gnu/Dockerfile | 20 - .../ci/docker/sparc64-unknown-linux-gnu/Dockerfile | 18 - .../thumbv7neon-linux-androideabi/Dockerfile | 47 - .../thumbv7neon-unknown-linux-gnueabihf/Dockerfile | 13 - .../ci/docker/wasm32-unknown-unknown/Dockerfile | 39 - .../ci/docker/x86_64-linux-android/Dockerfile | 31 - .../x86_64-unknown-linux-gnu-emulated/Dockerfile | 16 - .../ci/docker/x86_64-unknown-linux-gnu/Dockerfile | 10 - third_party/rust/packed_simd/ci/dox.sh | 27 - third_party/rust/packed_simd/ci/linux-s390x.sh | 18 - third_party/rust/packed_simd/ci/linux-sparc64.sh | 17 - third_party/rust/packed_simd/ci/lld-shim.rs | 11 - third_party/rust/packed_simd/ci/max_line_width.sh | 17 - third_party/rust/packed_simd/ci/run-docker.sh | 38 - third_party/rust/packed_simd/ci/run.sh | 99 - third_party/rust/packed_simd/ci/run_examples.sh | 51 - third_party/rust/packed_simd/ci/runtest-android.rs | 45 - .../rust/packed_simd/ci/setup_benchmarks.sh | 7 - third_party/rust/packed_simd/ci/test-runner-linux | 24 - third_party/rust/packed_simd/contributing.md | 67 - third_party/rust/packed_simd/perf-guide/book.toml | 12 - .../rust/packed_simd/perf-guide/src/SUMMARY.md | 21 - .../rust/packed_simd/perf-guide/src/ascii.css | 4 - .../packed_simd/perf-guide/src/bound_checks.md | 22 - .../perf-guide/src/float-math/approx.md | 8 - .../packed_simd/perf-guide/src/float-math/fma.md | 6 - .../packed_simd/perf-guide/src/float-math/fp.md | 3 - .../packed_simd/perf-guide/src/float-math/svml.md | 7 - .../packed_simd/perf-guide/src/introduction.md | 26 - .../rust/packed_simd/perf-guide/src/prof/linux.md | 107 - .../rust/packed_simd/perf-guide/src/prof/mca.md | 100 - .../packed_simd/perf-guide/src/prof/profiling.md | 14 - 
.../perf-guide/src/target-feature/attribute.md | 5 - .../perf-guide/src/target-feature/features.md | 13 - .../perf-guide/src/target-feature/inlining.md | 5 - .../perf-guide/src/target-feature/practice.md | 31 - .../perf-guide/src/target-feature/runtime.md | 5 - .../perf-guide/src/target-feature/rustflags.md | 77 - .../packed_simd/perf-guide/src/vert-hor-ops.md | 76 - third_party/rust/packed_simd/rust-toolchain | 1 - third_party/rust/packed_simd/rustfmt.toml | 5 - third_party/rust/packed_simd/src/api.rs | 309 - third_party/rust/packed_simd/src/api/bit_manip.rs | 129 - third_party/rust/packed_simd/src/api/bitmask.rs | 79 - third_party/rust/packed_simd/src/api/cast.rs | 108 - .../rust/packed_simd/src/api/cast/macros.rs | 82 - third_party/rust/packed_simd/src/api/cast/v128.rs | 302 - third_party/rust/packed_simd/src/api/cast/v16.rs | 68 - third_party/rust/packed_simd/src/api/cast/v256.rs | 298 - third_party/rust/packed_simd/src/api/cast/v32.rs | 132 - third_party/rust/packed_simd/src/api/cast/v512.rs | 209 - third_party/rust/packed_simd/src/api/cast/v64.rs | 208 - third_party/rust/packed_simd/src/api/cmp.rs | 16 - third_party/rust/packed_simd/src/api/cmp/eq.rs | 27 - third_party/rust/packed_simd/src/api/cmp/ord.rs | 43 - .../rust/packed_simd/src/api/cmp/partial_eq.rs | 65 - .../rust/packed_simd/src/api/cmp/partial_ord.rs | 230 - .../rust/packed_simd/src/api/cmp/vertical.rs | 114 - third_party/rust/packed_simd/src/api/default.rs | 30 - third_party/rust/packed_simd/src/api/fmt.rs | 12 - third_party/rust/packed_simd/src/api/fmt/binary.rs | 54 - third_party/rust/packed_simd/src/api/fmt/debug.rs | 60 - .../rust/packed_simd/src/api/fmt/lower_hex.rs | 54 - third_party/rust/packed_simd/src/api/fmt/octal.rs | 54 - .../rust/packed_simd/src/api/fmt/upper_hex.rs | 54 - third_party/rust/packed_simd/src/api/from.rs | 7 - .../rust/packed_simd/src/api/from/from_array.rs | 124 - .../rust/packed_simd/src/api/from/from_vector.rs | 67 - third_party/rust/packed_simd/src/api/hash.rs | 49 - 
third_party/rust/packed_simd/src/api/into_bits.rs | 59 - .../packed_simd/src/api/into_bits/arch_specific.rs | 345 - .../rust/packed_simd/src/api/into_bits/macros.rs | 74 - .../rust/packed_simd/src/api/into_bits/v128.rs | 232 - .../rust/packed_simd/src/api/into_bits/v16.rs | 9 - .../rust/packed_simd/src/api/into_bits/v256.rs | 232 - .../rust/packed_simd/src/api/into_bits/v32.rs | 13 - .../rust/packed_simd/src/api/into_bits/v512.rs | 232 - .../rust/packed_simd/src/api/into_bits/v64.rs | 18 - third_party/rust/packed_simd/src/api/math.rs | 4 - third_party/rust/packed_simd/src/api/math/float.rs | 64 - .../rust/packed_simd/src/api/math/float/abs.rs | 31 - .../rust/packed_simd/src/api/math/float/consts.rs | 74 - .../rust/packed_simd/src/api/math/float/cos.rs | 44 - .../rust/packed_simd/src/api/math/float/exp.rs | 33 - .../rust/packed_simd/src/api/math/float/ln.rs | 33 - .../rust/packed_simd/src/api/math/float/mul_add.rs | 44 - .../packed_simd/src/api/math/float/mul_adde.rs | 48 - .../rust/packed_simd/src/api/math/float/powf.rs | 36 - .../rust/packed_simd/src/api/math/float/recpre.rs | 36 - .../rust/packed_simd/src/api/math/float/rsqrte.rs | 40 - .../rust/packed_simd/src/api/math/float/sin.rs | 50 - .../rust/packed_simd/src/api/math/float/sqrt.rs | 35 - .../rust/packed_simd/src/api/math/float/sqrte.rs | 44 - .../rust/packed_simd/src/api/math/float/tanh.rs | 29 - third_party/rust/packed_simd/src/api/minimal.rs | 6 - .../rust/packed_simd/src/api/minimal/iuf.rs | 169 - .../rust/packed_simd/src/api/minimal/mask.rs | 176 - .../rust/packed_simd/src/api/minimal/ptr.rs | 1373 - third_party/rust/packed_simd/src/api/ops.rs | 32 - .../packed_simd/src/api/ops/scalar_arithmetic.rs | 203 - .../rust/packed_simd/src/api/ops/scalar_bitwise.rs | 162 - .../packed_simd/src/api/ops/scalar_mask_bitwise.rs | 140 - .../rust/packed_simd/src/api/ops/scalar_shifts.rs | 106 - .../packed_simd/src/api/ops/vector_arithmetic.rs | 148 - .../rust/packed_simd/src/api/ops/vector_bitwise.rs | 129 - 
.../src/api/ops/vector_float_min_max.rs | 74 - .../packed_simd/src/api/ops/vector_int_min_max.rs | 57 - .../packed_simd/src/api/ops/vector_mask_bitwise.rs | 116 - .../rust/packed_simd/src/api/ops/vector_neg.rs | 43 - .../rust/packed_simd/src/api/ops/vector_rotates.rs | 92 - .../rust/packed_simd/src/api/ops/vector_shifts.rs | 106 - third_party/rust/packed_simd/src/api/ptr.rs | 4 - .../rust/packed_simd/src/api/ptr/gather_scatter.rs | 216 - third_party/rust/packed_simd/src/api/reductions.rs | 12 - .../rust/packed_simd/src/api/reductions/bitwise.rs | 151 - .../src/api/reductions/float_arithmetic.rs | 313 - .../src/api/reductions/integer_arithmetic.rs | 193 - .../rust/packed_simd/src/api/reductions/mask.rs | 89 - .../rust/packed_simd/src/api/reductions/min_max.rs | 360 - third_party/rust/packed_simd/src/api/select.rs | 73 - third_party/rust/packed_simd/src/api/shuffle.rs | 184 - .../rust/packed_simd/src/api/shuffle1_dyn.rs | 159 - third_party/rust/packed_simd/src/api/slice.rs | 7 - .../rust/packed_simd/src/api/slice/from_slice.rs | 202 - .../packed_simd/src/api/slice/write_to_slice.rs | 196 - third_party/rust/packed_simd/src/api/swap_bytes.rs | 192 - third_party/rust/packed_simd/src/codegen.rs | 62 - .../rust/packed_simd/src/codegen/bit_manip.rs | 347 - third_party/rust/packed_simd/src/codegen/llvm.rs | 122 - third_party/rust/packed_simd/src/codegen/math.rs | 3 - .../rust/packed_simd/src/codegen/math/float.rs | 19 - .../rust/packed_simd/src/codegen/math/float/abs.rs | 103 - .../rust/packed_simd/src/codegen/math/float/cos.rs | 103 - .../packed_simd/src/codegen/math/float/cos_pi.rs | 87 - .../rust/packed_simd/src/codegen/math/float/exp.rs | 112 - .../rust/packed_simd/src/codegen/math/float/ln.rs | 112 - .../packed_simd/src/codegen/math/float/macros.rs | 470 - .../packed_simd/src/codegen/math/float/mul_add.rs | 109 - .../packed_simd/src/codegen/math/float/mul_adde.rs | 60 - .../packed_simd/src/codegen/math/float/powf.rs | 112 - 
.../rust/packed_simd/src/codegen/math/float/sin.rs | 103 - .../src/codegen/math/float/sin_cos_pi.rs | 188 - .../packed_simd/src/codegen/math/float/sin_pi.rs | 87 - .../packed_simd/src/codegen/math/float/sqrt.rs | 103 - .../packed_simd/src/codegen/math/float/sqrte.rs | 67 - .../packed_simd/src/codegen/math/float/tanh.rs | 120 - .../packed_simd/src/codegen/pointer_sized_int.rs | 28 - .../rust/packed_simd/src/codegen/reductions.rs | 1 - .../packed_simd/src/codegen/reductions/mask.rs | 69 - .../src/codegen/reductions/mask/aarch64.rs | 81 - .../packed_simd/src/codegen/reductions/mask/arm.rs | 56 - .../src/codegen/reductions/mask/fallback.rs | 8 - .../src/codegen/reductions/mask/fallback_impl.rs | 237 - .../packed_simd/src/codegen/reductions/mask/x86.rs | 216 - .../src/codegen/reductions/mask/x86/avx.rs | 95 - .../src/codegen/reductions/mask/x86/avx2.rs | 35 - .../src/codegen/reductions/mask/x86/sse.rs | 35 - .../src/codegen/reductions/mask/x86/sse2.rs | 68 - .../rust/packed_simd/src/codegen/shuffle.rs | 150 - .../rust/packed_simd/src/codegen/shuffle1_dyn.rs | 408 - .../rust/packed_simd/src/codegen/swap_bytes.rs | 149 - third_party/rust/packed_simd/src/codegen/v128.rs | 46 - third_party/rust/packed_simd/src/codegen/v16.rs | 7 - third_party/rust/packed_simd/src/codegen/v256.rs | 78 - third_party/rust/packed_simd/src/codegen/v32.rs | 11 - third_party/rust/packed_simd/src/codegen/v512.rs | 145 - third_party/rust/packed_simd/src/codegen/v64.rs | 21 - third_party/rust/packed_simd/src/codegen/vPtr.rs | 35 - third_party/rust/packed_simd/src/codegen/vSize.rs | 16 - third_party/rust/packed_simd/src/lib.rs | 348 - third_party/rust/packed_simd/src/masks.rs | 126 - third_party/rust/packed_simd/src/sealed.rs | 42 - third_party/rust/packed_simd/src/testing.rs | 8 - third_party/rust/packed_simd/src/testing/macros.rs | 44 - third_party/rust/packed_simd/src/testing/utils.rs | 130 - third_party/rust/packed_simd/src/v128.rs | 80 - third_party/rust/packed_simd/src/v16.rs | 16 - 
third_party/rust/packed_simd/src/v256.rs | 86 - third_party/rust/packed_simd/src/v32.rs | 29 - third_party/rust/packed_simd/src/v512.rs | 99 - third_party/rust/packed_simd/src/v64.rs | 66 - third_party/rust/packed_simd/src/vPtr.rs | 34 - third_party/rust/packed_simd/src/vSize.rs | 53 - third_party/rust/packed_simd/tests/endianness.rs | 268 - third_party/rust/plist/Cargo.toml | 2 +- third_party/rust/plist/src/lib.rs | 2 + third_party/rust/plist/src/stream/binary_writer.rs | 2 +- third_party/rust/prio/.cargo-checksum.json | 2 +- third_party/rust/prio/Cargo.toml | 66 +- third_party/rust/prio/README.md | 27 +- third_party/rust/prio/benches/cycle_counts.rs | 63 +- third_party/rust/prio/benches/speed_tests.rs | 127 +- .../rust/prio/documentation/field_parameters.sage | 117 + third_party/rust/prio/src/codec.rs | 210 +- third_party/rust/prio/src/dp.rs | 1 + third_party/rust/prio/src/dp/distributions.rs | 22 +- third_party/rust/prio/src/fft.rs | 1 + third_party/rust/prio/src/field.rs | 379 +- third_party/rust/prio/src/field/field255.rs | 31 +- third_party/rust/prio/src/flp.rs | 241 +- third_party/rust/prio/src/flp/types.rs | 460 +- .../rust/prio/src/flp/types/fixedpoint_l2.rs | 131 +- third_party/rust/prio/src/idpf.rs | 152 +- third_party/rust/prio/src/lib.rs | 7 + third_party/rust/prio/src/polynomial.rs | 4 +- third_party/rust/prio/src/prng.rs | 71 +- third_party/rust/prio/src/topology/ping_pong.rs | 125 +- third_party/rust/prio/src/vdaf.rs | 437 +- third_party/rust/prio/src/vdaf/dummy.rs | 147 +- third_party/rust/prio/src/vdaf/poplar1.rs | 306 +- third_party/rust/prio/src/vdaf/prio2.rs | 30 +- third_party/rust/prio/src/vdaf/prio2/client.rs | 16 +- third_party/rust/prio/src/vdaf/prio2/server.rs | 50 +- .../rust/prio/src/vdaf/prio2/test_vector.rs | 32 +- third_party/rust/prio/src/vdaf/prio3.rs | 630 +- third_party/rust/prio/src/vdaf/prio3_test.rs | 162 +- .../prio/src/vdaf/test_vec/07/IdpfPoplar_0.json | 52 - .../rust/prio/src/vdaf/test_vec/07/Poplar1_0.json | 56 - 
.../rust/prio/src/vdaf/test_vec/07/Poplar1_1.json | 64 - .../rust/prio/src/vdaf/test_vec/07/Poplar1_2.json | 64 - .../rust/prio/src/vdaf/test_vec/07/Poplar1_3.json | 76 - .../prio/src/vdaf/test_vec/07/Prio3Count_0.json | 39 - .../prio/src/vdaf/test_vec/07/Prio3Count_1.json | 45 - .../src/vdaf/test_vec/07/Prio3Histogram_0.json | 52 - .../src/vdaf/test_vec/07/Prio3Histogram_1.json | 89 - .../prio/src/vdaf/test_vec/07/Prio3SumVec_0.json | 194 - .../prio/src/vdaf/test_vec/07/Prio3SumVec_1.json | 146 - .../rust/prio/src/vdaf/test_vec/07/Prio3Sum_0.json | 40 - .../rust/prio/src/vdaf/test_vec/07/Prio3Sum_1.json | 46 - .../src/vdaf/test_vec/07/XofFixedKeyAes128.json | 8 - .../prio/src/vdaf/test_vec/07/XofShake128.json | 8 - .../prio/src/vdaf/test_vec/08/IdpfPoplar_0.json | 52 + .../rust/prio/src/vdaf/test_vec/08/Poplar1_0.json | 56 + .../rust/prio/src/vdaf/test_vec/08/Poplar1_1.json | 64 + .../rust/prio/src/vdaf/test_vec/08/Poplar1_2.json | 64 + .../rust/prio/src/vdaf/test_vec/08/Poplar1_3.json | 76 + .../prio/src/vdaf/test_vec/08/Prio3Count_0.json | 39 + .../prio/src/vdaf/test_vec/08/Prio3Count_1.json | 45 + .../src/vdaf/test_vec/08/Prio3Histogram_0.json | 52 + .../src/vdaf/test_vec/08/Prio3Histogram_1.json | 89 + .../prio/src/vdaf/test_vec/08/Prio3SumVec_0.json | 194 + .../prio/src/vdaf/test_vec/08/Prio3SumVec_1.json | 146 + .../rust/prio/src/vdaf/test_vec/08/Prio3Sum_0.json | 40 + .../rust/prio/src/vdaf/test_vec/08/Prio3Sum_1.json | 46 + .../src/vdaf/test_vec/08/XofFixedKeyAes128.json | 8 + .../src/vdaf/test_vec/08/XofTurboShake128.json | 8 + .../src/vdaf/test_vec/XofHmacSha256Aes128.json | 8 + third_party/rust/prio/src/vdaf/xof.rs | 148 +- third_party/rust/prio/src/vidpf.rs | 827 + third_party/rust/prio/tests/discrete_gauss.rs | 25 +- .../tests/test_vectors/discrete_gauss_100.json | 92 +- .../tests/test_vectors/discrete_gauss_2.342.json | 54 +- .../prio/tests/test_vectors/discrete_gauss_3.json | 62 +- .../test_vectors/discrete_gauss_41293847.json | 98 +- 
.../prio/tests/test_vectors/discrete_gauss_9.json | 80 +- .../discrete_gauss_9999999999999999999999.json | 98 +- third_party/rust/qlog/.cargo-checksum.json | 2 +- third_party/rust/qlog/Cargo.toml | 2 +- third_party/rust/qlog/src/events/mod.rs | 3 + third_party/rust/qlog/src/events/quic.rs | 21 +- third_party/rust/qlog/src/lib.rs | 28 +- third_party/rust/relevancy/.cargo-checksum.json | 2 +- third_party/rust/relevancy/Cargo.toml | 13 + third_party/rust/relevancy/src/db.rs | 68 +- third_party/rust/relevancy/src/error.rs | 15 + third_party/rust/relevancy/src/ingest.rs | 394 + third_party/rust/relevancy/src/interest.rs | 152 +- third_party/rust/relevancy/src/lib.rs | 80 +- .../rust/relevancy/src/populate_interests.rs | 157 - third_party/rust/relevancy/src/relevancy.udl | 17 +- third_party/rust/relevancy/src/rs.rs | 60 + third_party/rust/relevancy/src/url_hash.rs | 15 +- third_party/rust/relevancy/test-data | Bin 192 -> 188 bytes .../rust/remote_settings/.cargo-checksum.json | 2 +- third_party/rust/remote_settings/src/client.rs | 89 +- third_party/rust/remote_settings/src/config.rs | 30 +- third_party/rust/remote_settings/src/error.rs | 2 + third_party/rust/remote_settings/src/lib.rs | 17 +- .../rust/remote_settings/src/remote_settings.udl | 10 + third_party/rust/serde/.cargo-checksum.json | 2 +- third_party/rust/serde/Cargo.toml | 4 +- third_party/rust/serde/build.rs | 6 + third_party/rust/serde/src/de/impls.rs | 119 +- third_party/rust/serde/src/de/mod.rs | 8 +- third_party/rust/serde/src/lib.rs | 5 +- third_party/rust/serde/src/private/de.rs | 7 + third_party/rust/serde/src/private/doc.rs | 5 +- third_party/rust/serde/src/private/ser.rs | 132 +- third_party/rust/serde/src/ser/fmt.rs | 16 +- third_party/rust/serde/src/ser/impls.rs | 51 +- third_party/rust/serde/src/ser/impossible.rs | 32 +- third_party/rust/serde/src/ser/mod.rs | 79 +- third_party/rust/serde_derive/.cargo-checksum.json | 2 +- third_party/rust/serde_derive/Cargo.toml | 3 +- 
third_party/rust/serde_derive/src/lib.rs | 2 +- third_party/rust/serde_derive/src/ser.rs | 4 +- third_party/rust/serde_json/.cargo-checksum.json | 2 +- third_party/rust/serde_json/Cargo.toml | 33 +- third_party/rust/serde_json/README.md | 2 +- third_party/rust/serde_json/build.rs | 39 +- third_party/rust/serde_json/src/de.rs | 295 +- third_party/rust/serde_json/src/error.rs | 128 +- .../rust/serde_json/src/features_check/error.rs | 1 - .../rust/serde_json/src/features_check/mod.rs | 13 - third_party/rust/serde_json/src/io/core.rs | 2 +- .../rust/serde_json/src/lexical/algorithm.rs | 5 +- third_party/rust/serde_json/src/lexical/bignum.rs | 1 + third_party/rust/serde_json/src/lexical/digit.rs | 5 +- third_party/rust/serde_json/src/lexical/errors.rs | 3 +- .../rust/serde_json/src/lexical/exponent.rs | 4 +- .../rust/serde_json/src/lexical/large_powers32.rs | 2 +- .../rust/serde_json/src/lexical/large_powers64.rs | 2 +- third_party/rust/serde_json/src/lexical/math.rs | 4 +- third_party/rust/serde_json/src/lexical/num.rs | 13 +- .../rust/serde_json/src/lexical/rounding.rs | 2 +- third_party/rust/serde_json/src/lib.rs | 35 +- third_party/rust/serde_json/src/macros.rs | 3 +- third_party/rust/serde_json/src/map.rs | 131 +- third_party/rust/serde_json/src/number.rs | 76 +- third_party/rust/serde_json/src/raw.rs | 270 +- third_party/rust/serde_json/src/read.rs | 6 +- third_party/rust/serde_json/src/ser.rs | 122 +- third_party/rust/serde_json/src/value/de.rs | 94 +- third_party/rust/serde_json/src/value/from.rs | 37 +- third_party/rust/serde_json/src/value/index.rs | 2 +- third_party/rust/serde_json/src/value/mod.rs | 30 +- .../rust/serde_json/src/value/partial_eq.rs | 14 +- third_party/rust/serde_json/src/value/ser.rs | 41 +- third_party/rust/serde_json/tests/lexical.rs | 6 +- .../rust/serde_json/tests/lexical/exponent.rs | 34 +- third_party/rust/serde_json/tests/lexical/parse.rs | 2 +- third_party/rust/serde_json/tests/map.rs | 1 - 
.../rust/serde_json/tests/regression/issue1004.rs | 12 + .../rust/serde_json/tests/regression/issue520.rs | 2 +- .../rust/serde_json/tests/regression/issue795.rs | 5 +- .../rust/serde_json/tests/regression/issue845.rs | 2 +- third_party/rust/serde_json/tests/test.rs | 245 +- .../rust/serde_json/tests/ui/parse_key.stderr | 2 +- third_party/rust/sfv/.cargo-checksum.json | 2 +- third_party/rust/sfv/Cargo.toml | 4 +- third_party/rust/sql-support/.cargo-checksum.json | 2 +- third_party/rust/sql-support/Cargo.toml | 1 + third_party/rust/sql-support/src/lazy.rs | 151 + third_party/rust/sql-support/src/lib.rs | 10 +- third_party/rust/sql-support/src/open_database.rs | 204 +- third_party/rust/suggest/.cargo-checksum.json | 2 +- third_party/rust/suggest/src/benchmarks/client.rs | 1 + third_party/rust/suggest/src/db.rs | 6 + third_party/rust/suggest/src/lib.rs | 2 +- third_party/rust/suggest/src/schema.rs | 454 +- third_party/rust/suggest/src/store.rs | 216 +- third_party/rust/suggest/src/suggest.udl | 15 + .../rust/thiserror-impl/.cargo-checksum.json | 2 +- third_party/rust/thiserror-impl/Cargo.toml | 2 +- third_party/rust/thiserror-impl/src/ast.rs | 2 +- third_party/rust/thiserror-impl/src/attr.rs | 15 +- third_party/rust/thiserror-impl/src/expand.rs | 4 +- third_party/rust/thiserror/.cargo-checksum.json | 2 +- third_party/rust/thiserror/Cargo.toml | 4 +- third_party/rust/thiserror/build.rs | 20 +- third_party/rust/thiserror/src/lib.rs | 2 +- third_party/rust/thiserror/tests/test_lints.rs | 2 + .../uniffi-example-arithmetic/.cargo-checksum.json | 1 - .../rust/uniffi-example-arithmetic/Cargo.toml | 42 - .../rust/uniffi-example-arithmetic/build.rs | 7 - .../uniffi-example-arithmetic/src/arithmetic.udl | 16 - .../rust/uniffi-example-arithmetic/src/lib.rs | 34 - .../tests/bindings/test_arithmetic.kts | 29 - .../tests/bindings/test_arithmetic.py | 37 - .../tests/bindings/test_arithmetic.rb | 31 - .../tests/bindings/test_arithmetic.swift | 32 - 
.../tests/test_generated_bindings.rs | 6 - .../rust/uniffi-example-arithmetic/uniffi.toml | 2 - .../uniffi-example-geometry/.cargo-checksum.json | 1 - .../rust/uniffi-example-geometry/Cargo.toml | 39 - third_party/rust/uniffi-example-geometry/build.rs | 7 - .../rust/uniffi-example-geometry/src/geometry.udl | 15 - .../rust/uniffi-example-geometry/src/lib.rs | 47 - .../tests/bindings/test_geometry.kts | 10 - .../tests/bindings/test_geometry.py | 10 - .../tests/bindings/test_geometry.rb | 16 - .../tests/bindings/test_geometry.swift | 10 - .../tests/test_generated_bindings.rs | 6 - .../uniffi-example-rondpoint/.cargo-checksum.json | 1 - .../rust/uniffi-example-rondpoint/Cargo.toml | 39 - third_party/rust/uniffi-example-rondpoint/build.rs | 7 - .../rust/uniffi-example-rondpoint/src/lib.rs | 293 - .../uniffi-example-rondpoint/src/rondpoint.udl | 146 - .../tests/bindings/test_rondpoint.kts | 250 - .../tests/bindings/test_rondpoint.py | 146 - .../tests/bindings/test_rondpoint.rb | 147 - .../tests/bindings/test_rondpoint.swift | 232 - .../tests/test_generated_bindings.rs | 6 - .../uniffi-example-sprites/.cargo-checksum.json | 1 - third_party/rust/uniffi-example-sprites/Cargo.toml | 39 - third_party/rust/uniffi-example-sprites/build.rs | 7 - third_party/rust/uniffi-example-sprites/src/lib.rs | 65 - .../rust/uniffi-example-sprites/src/sprites.udl | 22 - .../tests/bindings/test_sprites.kts | 25 - .../tests/bindings/test_sprites.py | 17 - .../tests/bindings/test_sprites.rb | 22 - .../tests/bindings/test_sprites.swift | 16 - .../tests/test_generated_bindings.rs | 6 - .../uniffi-example-todolist/.cargo-checksum.json | 1 - .../rust/uniffi-example-todolist/Cargo.toml | 43 - third_party/rust/uniffi-example-todolist/build.rs | 7 - .../rust/uniffi-example-todolist/src/lib.rs | 150 - .../rust/uniffi-example-todolist/src/todolist.udl | 38 - .../tests/bindings/test_todolist.kts | 83 - .../tests/bindings/test_todolist.py | 44 - .../tests/bindings/test_todolist.rb | 47 - 
.../tests/bindings/test_todolist.swift | 69 - .../tests/test_generated_bindings.rs | 6 - third_party/rust/wasm-encoder/.cargo-checksum.json | 2 +- third_party/rust/wasm-encoder/Cargo.toml | 4 +- .../rust/wasm-encoder/src/component/types.rs | 16 +- third_party/rust/wasm-encoder/src/core/code.rs | 60 +- third_party/rust/wasm-encoder/src/core/data.rs | 1 + third_party/rust/wasm-encoder/src/core/globals.rs | 13 +- third_party/rust/wasm-encoder/src/core/imports.rs | 1 + third_party/rust/wasm-encoder/src/core/memories.rs | 25 +- third_party/rust/wasm-encoder/src/core/types.rs | 15 +- third_party/rust/wasm-smith/.cargo-checksum.json | 2 +- third_party/rust/wasm-smith/Cargo.toml | 10 +- third_party/rust/wasm-smith/src/component.rs | 17 +- third_party/rust/wasm-smith/src/config.rs | 9 +- third_party/rust/wasm-smith/src/core.rs | 202 +- .../rust/wasm-smith/src/core/code_builder.rs | 77 +- .../wasm-smith/src/core/code_builder/no_traps.rs | 10 +- third_party/rust/wasm-smith/src/core/terminate.rs | 1 + third_party/rust/wasm-smith/src/lib.rs | 6 + third_party/rust/wasm-smith/tests/common/mod.rs | 52 +- third_party/rust/wasm-smith/tests/component.rs | 8 +- third_party/rust/wasm-smith/tests/core.rs | 55 +- third_party/rust/wasm-smith/tests/exports.rs | 32 +- third_party/rust/wast/.cargo-checksum.json | 2 +- third_party/rust/wast/Cargo.toml | 8 +- third_party/rust/wast/src/component/binary.rs | 30 +- third_party/rust/wast/src/component/resolve.rs | 3 +- third_party/rust/wast/src/component/types.rs | 16 +- third_party/rust/wast/src/component/wast.rs | 10 +- third_party/rust/wast/src/core/binary.rs | 84 +- third_party/rust/wast/src/core/expr.rs | 158 +- third_party/rust/wast/src/core/memory.rs | 11 +- .../src/core/resolve/deinline_import_export.rs | 6 +- third_party/rust/wast/src/core/resolve/names.rs | 28 +- third_party/rust/wast/src/core/resolve/types.rs | 4 - third_party/rust/wast/src/core/types.rs | 78 +- third_party/rust/wast/src/core/wast.rs | 18 +- 
third_party/rust/wast/src/lib.rs | 5 + third_party/rust/wast/src/token.rs | 20 +- third_party/rust/wast/src/wast.rs | 75 +- third_party/rust/wast/src/wat.rs | 8 + third_party/rust/wast/tests/parse-fail.rs | 42 +- third_party/rust/wgpu-core/.cargo-checksum.json | 2 +- third_party/rust/wgpu-core/Cargo.toml | 10 +- third_party/rust/wgpu-core/src/any_surface.rs | 95 - third_party/rust/wgpu-core/src/binding_model.rs | 2 - .../rust/wgpu-core/src/command/allocator.rs | 67 + third_party/rust/wgpu-core/src/command/bundle.rs | 21 +- third_party/rust/wgpu-core/src/command/clear.rs | 9 + third_party/rust/wgpu-core/src/command/compute.rs | 318 +- .../rust/wgpu-core/src/command/compute_command.rs | 322 + third_party/rust/wgpu-core/src/command/mod.rs | 249 +- third_party/rust/wgpu-core/src/command/query.rs | 11 +- third_party/rust/wgpu-core/src/command/render.rs | 168 +- third_party/rust/wgpu-core/src/command/transfer.rs | 35 + .../rust/wgpu-core/src/device/any_device.rs | 2 +- third_party/rust/wgpu-core/src/device/bgl.rs | 2 +- third_party/rust/wgpu-core/src/device/global.rs | 77 +- third_party/rust/wgpu-core/src/device/life.rs | 123 +- third_party/rust/wgpu-core/src/device/mod.rs | 37 - third_party/rust/wgpu-core/src/device/queue.rs | 169 +- third_party/rust/wgpu-core/src/device/resource.rs | 544 +- third_party/rust/wgpu-core/src/global.rs | 10 +- third_party/rust/wgpu-core/src/hal_api.rs | 20 +- third_party/rust/wgpu-core/src/hub.rs | 36 +- third_party/rust/wgpu-core/src/id.rs | 47 +- third_party/rust/wgpu-core/src/identity.rs | 40 +- third_party/rust/wgpu-core/src/instance.rs | 357 +- third_party/rust/wgpu-core/src/lib.rs | 4 +- third_party/rust/wgpu-core/src/lock/mod.rs | 41 + third_party/rust/wgpu-core/src/lock/rank.rs | 170 + third_party/rust/wgpu-core/src/lock/ranked.rs | 397 + third_party/rust/wgpu-core/src/lock/vanilla.rs | 121 + third_party/rust/wgpu-core/src/pipeline.rs | 102 +- third_party/rust/wgpu-core/src/pool.rs | 12 +- third_party/rust/wgpu-core/src/present.rs | 
30 +- third_party/rust/wgpu-core/src/registry.rs | 89 +- third_party/rust/wgpu-core/src/resource.rs | 74 +- third_party/rust/wgpu-core/src/snatch.rs | 88 +- third_party/rust/wgpu-core/src/storage.rs | 5 +- third_party/rust/wgpu-core/src/track/buffer.rs | 24 +- third_party/rust/wgpu-core/src/track/metadata.rs | 10 +- third_party/rust/wgpu-core/src/track/mod.rs | 45 +- third_party/rust/wgpu-core/src/track/stateless.rs | 22 +- third_party/rust/wgpu-core/src/track/texture.rs | 4 +- third_party/rust/wgpu-core/src/validation.rs | 3 +- third_party/rust/wgpu-hal/.cargo-checksum.json | 2 +- third_party/rust/wgpu-hal/Cargo.toml | 19 +- third_party/rust/wgpu-hal/README.md | 129 +- third_party/rust/wgpu-hal/examples/halmark/main.rs | 6 + .../wgpu-hal/examples/ray-traced-triangle/main.rs | 2 + third_party/rust/wgpu-hal/src/auxil/dxgi/conv.rs | 1 + third_party/rust/wgpu-hal/src/dx12/adapter.rs | 99 +- third_party/rust/wgpu-hal/src/dx12/conv.rs | 2 +- third_party/rust/wgpu-hal/src/dx12/device.rs | 33 +- third_party/rust/wgpu-hal/src/dx12/mod.rs | 3 +- third_party/rust/wgpu-hal/src/dx12/types.rs | 22 + third_party/rust/wgpu-hal/src/gles/adapter.rs | 33 +- third_party/rust/wgpu-hal/src/gles/conv.rs | 1 + third_party/rust/wgpu-hal/src/gles/device.rs | 37 +- third_party/rust/wgpu-hal/src/gles/egl.rs | 74 +- third_party/rust/wgpu-hal/src/gles/mod.rs | 1 + third_party/rust/wgpu-hal/src/gles/queue.rs | 71 +- third_party/rust/wgpu-hal/src/gles/wgl.rs | 2 + third_party/rust/wgpu-hal/src/lib.rs | 346 +- third_party/rust/wgpu-hal/src/metal/adapter.rs | 22 +- third_party/rust/wgpu-hal/src/metal/conv.rs | 1 + third_party/rust/wgpu-hal/src/metal/device.rs | 24 +- third_party/rust/wgpu-hal/src/metal/mod.rs | 12 +- third_party/rust/wgpu-hal/src/vulkan/adapter.rs | 123 +- third_party/rust/wgpu-hal/src/vulkan/command.rs | 5 + third_party/rust/wgpu-hal/src/vulkan/conv.rs | 1 + third_party/rust/wgpu-hal/src/vulkan/device.rs | 30 +- third_party/rust/wgpu-hal/src/vulkan/mod.rs | 109 +- 
third_party/rust/wgpu-types/.cargo-checksum.json | 2 +- third_party/rust/wgpu-types/Cargo.toml | 4 +- third_party/rust/wgpu-types/src/lib.rs | 100 +- .../rust/zerocopy-derive/.cargo-checksum.json | 1 + third_party/rust/zerocopy-derive/Cargo.toml | 44 + third_party/rust/zerocopy-derive/LICENSE-APACHE | 202 + third_party/rust/zerocopy-derive/LICENSE-BSD | 24 + third_party/rust/zerocopy-derive/LICENSE-MIT | 26 + third_party/rust/zerocopy-derive/src/ext.rs | 53 + third_party/rust/zerocopy-derive/src/lib.rs | 882 + third_party/rust/zerocopy-derive/src/repr.rs | 311 + .../rust/zerocopy-derive/tests/enum_as_bytes.rs | 101 + .../rust/zerocopy-derive/tests/enum_from_zeroes.rs | 35 + .../zerocopy-derive/tests/enum_known_layout.rs | 46 + .../rust/zerocopy-derive/tests/enum_unaligned.rs | 47 + third_party/rust/zerocopy-derive/tests/hygiene.rs | 43 + .../zerocopy-derive/tests/paths_and_modules.rs | 38 + .../rust/zerocopy-derive/tests/priv_in_pub.rs | 24 + .../rust/zerocopy-derive/tests/struct_as_bytes.rs | 161 + .../zerocopy-derive/tests/struct_from_bytes.rs | 79 + .../zerocopy-derive/tests/struct_from_zeroes.rs | 77 + .../zerocopy-derive/tests/struct_known_layout.rs | 65 + .../rust/zerocopy-derive/tests/struct_unaligned.rs | 100 + third_party/rust/zerocopy-derive/tests/trybuild.rs | 19 + .../tests/ui-msrv/derive_transparent.rs | 40 + .../tests/ui-msrv/derive_transparent.stderr | 71 + .../rust/zerocopy-derive/tests/ui-msrv/enum.rs | 194 + .../rust/zerocopy-derive/tests/ui-msrv/enum.stderr | 199 + .../tests/ui-msrv/enum_from_bytes_u8_too_few.rs | 272 + .../ui-msrv/enum_from_bytes_u8_too_few.stderr | 11 + .../tests/ui-msrv/late_compile_pass.rs | 75 + .../tests/ui-msrv/late_compile_pass.stderr | 74 + .../tests/ui-msrv/mid_compile_pass.rs | 61 + .../tests/ui-msrv/mid_compile_pass.stderr | 104 + .../rust/zerocopy-derive/tests/ui-msrv/struct.rs | 99 + .../zerocopy-derive/tests/ui-msrv/struct.stderr | 113 + .../rust/zerocopy-derive/tests/ui-msrv/union.rs | 73 + 
.../zerocopy-derive/tests/ui-msrv/union.stderr | 42 + .../tests/ui-nightly/derive_transparent.rs | 40 + .../tests/ui-nightly/derive_transparent.stderr | 111 + .../rust/zerocopy-derive/tests/ui-nightly/enum.rs | 194 + .../zerocopy-derive/tests/ui-nightly/enum.stderr | 201 + .../tests/ui-nightly/enum_from_bytes_u8_too_few.rs | 272 + .../ui-nightly/enum_from_bytes_u8_too_few.stderr | 11 + .../tests/ui-nightly/late_compile_pass.rs | 75 + .../tests/ui-nightly/late_compile_pass.stderr | 150 + .../tests/ui-nightly/mid_compile_pass.rs | 61 + .../tests/ui-nightly/mid_compile_pass.stderr | 104 + .../zerocopy-derive/tests/ui-nightly/struct.rs | 99 + .../zerocopy-derive/tests/ui-nightly/struct.stderr | 143 + .../rust/zerocopy-derive/tests/ui-nightly/union.rs | 73 + .../zerocopy-derive/tests/ui-nightly/union.stderr | 48 + .../tests/ui-stable/derive_transparent.rs | 40 + .../tests/ui-stable/derive_transparent.stderr | 111 + .../rust/zerocopy-derive/tests/ui-stable/enum.rs | 194 + .../zerocopy-derive/tests/ui-stable/enum.stderr | 201 + .../tests/ui-stable/enum_from_bytes_u8_too_few.rs | 272 + .../ui-stable/enum_from_bytes_u8_too_few.stderr | 11 + .../tests/ui-stable/late_compile_pass.rs | 75 + .../tests/ui-stable/late_compile_pass.stderr | 144 + .../tests/ui-stable/mid_compile_pass.rs | 61 + .../tests/ui-stable/mid_compile_pass.stderr | 104 + .../rust/zerocopy-derive/tests/ui-stable/struct.rs | 99 + .../zerocopy-derive/tests/ui-stable/struct.stderr | 131 + .../rust/zerocopy-derive/tests/ui-stable/union.rs | 73 + .../zerocopy-derive/tests/ui-stable/union.stderr | 41 + .../rust/zerocopy-derive/tests/union_as_bytes.rs | 75 + .../rust/zerocopy-derive/tests/union_from_bytes.rs | 72 + .../zerocopy-derive/tests/union_from_zeroes.rs | 72 + .../zerocopy-derive/tests/union_known_layout.rs | 65 + .../rust/zerocopy-derive/tests/union_unaligned.rs | 77 + third_party/rust/zerocopy-derive/tests/util.rs | 20 + third_party/rust/zerocopy/.cargo-checksum.json | 1 + 
third_party/rust/zerocopy/CONTRIBUTING.md | 215 + third_party/rust/zerocopy/Cargo.toml | 87 + third_party/rust/zerocopy/INTERNAL.md | 44 + third_party/rust/zerocopy/LICENSE-APACHE | 202 + third_party/rust/zerocopy/LICENSE-BSD | 24 + third_party/rust/zerocopy/LICENSE-MIT | 26 + third_party/rust/zerocopy/POLICIES.md | 103 + third_party/rust/zerocopy/README.md | 154 + third_party/rust/zerocopy/cargo.sh | 120 + third_party/rust/zerocopy/clippy.toml | 10 + third_party/rust/zerocopy/generate-readme.sh | 50 + third_party/rust/zerocopy/rustfmt.toml | 19 + third_party/rust/zerocopy/src/byteorder.rs | 1075 + third_party/rust/zerocopy/src/lib.rs | 8256 + third_party/rust/zerocopy/src/macro_util.rs | 670 + third_party/rust/zerocopy/src/macros.rs | 417 + .../post_monomorphization_compile_fail_tests.rs | 118 + .../zerocopy/src/third_party/rust/LICENSE-APACHE | 176 + .../rust/zerocopy/src/third_party/rust/LICENSE-MIT | 23 + .../zerocopy/src/third_party/rust/README.fuchsia | 7 + .../rust/zerocopy/src/third_party/rust/layout.rs | 45 + third_party/rust/zerocopy/src/util.rs | 808 + third_party/rust/zerocopy/src/wrappers.rs | 503 + .../rust/zerocopy/testdata/include_value/data | 1 + third_party/rust/zerocopy/tests/trybuild.rs | 41 + .../tests/ui-msrv/include_value_not_from_bytes.rs | 12 + .../ui-msrv/include_value_not_from_bytes.stderr | 12 + .../tests/ui-msrv/include_value_wrong_size.rs | 11 + .../tests/ui-msrv/include_value_wrong_size.stderr | 9 + .../tests/ui-msrv/invalid-impls/invalid-impls.rs | 29 + .../ui-msrv/invalid-impls/invalid-impls.stderr | 127 + .../rust/zerocopy/tests/ui-msrv/max-align.rs | 99 + .../rust/zerocopy/tests/ui-msrv/max-align.stderr | 5 + .../tests/ui-msrv/transmute-dst-not-frombytes.rs | 18 + .../ui-msrv/transmute-dst-not-frombytes.stderr | 12 + .../ui-msrv/transmute-mut-alignment-increase.rs | 19 + .../transmute-mut-alignment-increase.stderr | 36 + .../zerocopy/tests/ui-msrv/transmute-mut-const.rs | 20 + .../tests/ui-msrv/transmute-mut-const.stderr | 41 + 
.../tests/ui-msrv/transmute-mut-dst-generic.rs | 18 + .../tests/ui-msrv/transmute-mut-dst-generic.stderr | 19 + .../ui-msrv/transmute-mut-dst-not-a-reference.rs | 17 + .../transmute-mut-dst-not-a-reference.stderr | 39 + .../tests/ui-msrv/transmute-mut-dst-not-asbytes.rs | 24 + .../ui-msrv/transmute-mut-dst-not-asbytes.stderr | 12 + .../ui-msrv/transmute-mut-dst-not-frombytes.rs | 24 + .../ui-msrv/transmute-mut-dst-not-frombytes.stderr | 12 + .../tests/ui-msrv/transmute-mut-dst-unsized.rs | 17 + .../tests/ui-msrv/transmute-mut-dst-unsized.stderr | 108 + .../ui-msrv/transmute-mut-illegal-lifetime.rs | 15 + .../ui-msrv/transmute-mut-illegal-lifetime.stderr | 9 + .../tests/ui-msrv/transmute-mut-size-decrease.rs | 17 + .../ui-msrv/transmute-mut-size-decrease.stderr | 36 + .../tests/ui-msrv/transmute-mut-size-increase.rs | 17 + .../ui-msrv/transmute-mut-size-increase.stderr | 36 + .../tests/ui-msrv/transmute-mut-src-dst-generic.rs | 19 + .../ui-msrv/transmute-mut-src-dst-generic.stderr | 19 + .../transmute-mut-src-dst-not-references.rs | 17 + .../transmute-mut-src-dst-not-references.stderr | 12 + .../tests/ui-msrv/transmute-mut-src-dst-unsized.rs | 17 + .../ui-msrv/transmute-mut-src-dst-unsized.stderr | 237 + .../tests/ui-msrv/transmute-mut-src-generic.rs | 18 + .../tests/ui-msrv/transmute-mut-src-generic.stderr | 10 + .../tests/ui-msrv/transmute-mut-src-immutable.rs | 18 + .../ui-msrv/transmute-mut-src-immutable.stderr | 11 + .../ui-msrv/transmute-mut-src-not-a-reference.rs | 17 + .../transmute-mut-src-not-a-reference.stderr | 12 + .../tests/ui-msrv/transmute-mut-src-not-asbytes.rs | 24 + .../ui-msrv/transmute-mut-src-not-asbytes.stderr | 25 + .../ui-msrv/transmute-mut-src-not-frombytes.rs | 24 + .../ui-msrv/transmute-mut-src-not-frombytes.stderr | 25 + .../tests/ui-msrv/transmute-mut-src-unsized.rs | 16 + .../tests/ui-msrv/transmute-mut-src-unsized.stderr | 198 + .../tests/ui-msrv/transmute-ptr-to-usize.rs | 20 + .../tests/ui-msrv/transmute-ptr-to-usize.stderr | 37 + 
.../ui-msrv/transmute-ref-alignment-increase.rs | 19 + .../transmute-ref-alignment-increase.stderr | 9 + .../tests/ui-msrv/transmute-ref-dst-generic.rs | 18 + .../tests/ui-msrv/transmute-ref-dst-generic.stderr | 19 + .../tests/ui-msrv/transmute-ref-dst-mutable.rs | 19 + .../tests/ui-msrv/transmute-ref-dst-mutable.stderr | 29 + .../ui-msrv/transmute-ref-dst-not-a-reference.rs | 17 + .../transmute-ref-dst-not-a-reference.stderr | 29 + .../ui-msrv/transmute-ref-dst-not-frombytes.rs | 18 + .../ui-msrv/transmute-ref-dst-not-frombytes.stderr | 12 + .../tests/ui-msrv/transmute-ref-dst-unsized.rs | 17 + .../tests/ui-msrv/transmute-ref-dst-unsized.stderr | 94 + .../ui-msrv/transmute-ref-illegal-lifetime.rs | 15 + .../ui-msrv/transmute-ref-illegal-lifetime.stderr | 9 + .../tests/ui-msrv/transmute-ref-size-decrease.rs | 17 + .../ui-msrv/transmute-ref-size-decrease.stderr | 9 + .../tests/ui-msrv/transmute-ref-size-increase.rs | 17 + .../ui-msrv/transmute-ref-size-increase.stderr | 9 + .../tests/ui-msrv/transmute-ref-src-dst-generic.rs | 19 + .../ui-msrv/transmute-ref-src-dst-generic.stderr | 19 + .../transmute-ref-src-dst-not-references.rs | 17 + .../transmute-ref-src-dst-not-references.stderr | 42 + .../tests/ui-msrv/transmute-ref-src-dst-unsized.rs | 17 + .../ui-msrv/transmute-ref-src-dst-unsized.stderr | 195 + .../tests/ui-msrv/transmute-ref-src-generic.rs | 18 + .../tests/ui-msrv/transmute-ref-src-generic.stderr | 19 + .../ui-msrv/transmute-ref-src-not-a-reference.rs | 17 + .../transmute-ref-src-not-a-reference.stderr | 12 + .../tests/ui-msrv/transmute-ref-src-not-asbytes.rs | 18 + .../ui-msrv/transmute-ref-src-not-asbytes.stderr | 25 + .../tests/ui-msrv/transmute-ref-src-unsized.rs | 16 + .../tests/ui-msrv/transmute-ref-src-unsized.stderr | 170 + .../tests/ui-msrv/transmute-size-decrease.rs | 19 + .../tests/ui-msrv/transmute-size-decrease.stderr | 9 + .../tests/ui-msrv/transmute-size-increase.rs | 19 + .../tests/ui-msrv/transmute-size-increase.stderr | 9 + 
.../tests/ui-msrv/transmute-src-not-asbytes.rs | 18 + .../tests/ui-msrv/transmute-src-not-asbytes.stderr | 25 + .../ui-nightly/include_value_not_from_bytes.rs | 12 + .../ui-nightly/include_value_not_from_bytes.stderr | 25 + .../tests/ui-nightly/include_value_wrong_size.rs | 11 + .../ui-nightly/include_value_wrong_size.stderr | 9 + .../ui-nightly/invalid-impls/invalid-impls.rs | 29 + .../ui-nightly/invalid-impls/invalid-impls.stderr | 107 + .../rust/zerocopy/tests/ui-nightly/max-align.rs | 99 + .../zerocopy/tests/ui-nightly/max-align.stderr | 5 + .../ui-nightly/transmute-dst-not-frombytes.rs | 18 + .../ui-nightly/transmute-dst-not-frombytes.stderr | 25 + .../ui-nightly/transmute-mut-alignment-increase.rs | 19 + .../transmute-mut-alignment-increase.stderr | 9 + .../tests/ui-nightly/transmute-mut-const.rs | 20 + .../tests/ui-nightly/transmute-mut-const.stderr | 42 + .../tests/ui-nightly/transmute-mut-dst-generic.rs | 18 + .../ui-nightly/transmute-mut-dst-generic.stderr | 19 + .../transmute-mut-dst-not-a-reference.rs | 17 + .../transmute-mut-dst-not-a-reference.stderr | 39 + .../ui-nightly/transmute-mut-dst-not-asbytes.rs | 24 + .../transmute-mut-dst-not-asbytes.stderr | 25 + .../ui-nightly/transmute-mut-dst-not-frombytes.rs | 24 + .../transmute-mut-dst-not-frombytes.stderr | 25 + .../tests/ui-nightly/transmute-mut-dst-unsized.rs | 17 + .../ui-nightly/transmute-mut-dst-unsized.stderr | 86 + .../ui-nightly/transmute-mut-illegal-lifetime.rs | 15 + .../transmute-mut-illegal-lifetime.stderr | 12 + .../ui-nightly/transmute-mut-size-decrease.rs | 17 + .../ui-nightly/transmute-mut-size-decrease.stderr | 9 + .../ui-nightly/transmute-mut-size-increase.rs | 17 + .../ui-nightly/transmute-mut-size-increase.stderr | 9 + .../ui-nightly/transmute-mut-src-dst-generic.rs | 19 + .../transmute-mut-src-dst-generic.stderr | 19 + .../transmute-mut-src-dst-not-references.rs | 17 + .../transmute-mut-src-dst-not-references.stderr | 15 + .../ui-nightly/transmute-mut-src-dst-unsized.rs | 17 + 
.../transmute-mut-src-dst-unsized.stderr | 231 + .../tests/ui-nightly/transmute-mut-src-generic.rs | 18 + .../ui-nightly/transmute-mut-src-generic.stderr | 10 + .../ui-nightly/transmute-mut-src-immutable.rs | 18 + .../ui-nightly/transmute-mut-src-immutable.stderr | 11 + .../transmute-mut-src-not-a-reference.rs | 17 + .../transmute-mut-src-not-a-reference.stderr | 15 + .../ui-nightly/transmute-mut-src-not-asbytes.rs | 24 + .../transmute-mut-src-not-asbytes.stderr | 48 + .../ui-nightly/transmute-mut-src-not-frombytes.rs | 24 + .../transmute-mut-src-not-frombytes.stderr | 48 + .../tests/ui-nightly/transmute-mut-src-unsized.rs | 16 + .../ui-nightly/transmute-mut-src-unsized.stderr | 158 + .../tests/ui-nightly/transmute-ptr-to-usize.rs | 20 + .../tests/ui-nightly/transmute-ptr-to-usize.stderr | 30 + .../ui-nightly/transmute-ref-alignment-increase.rs | 19 + .../transmute-ref-alignment-increase.stderr | 9 + .../tests/ui-nightly/transmute-ref-dst-generic.rs | 18 + .../ui-nightly/transmute-ref-dst-generic.stderr | 19 + .../tests/ui-nightly/transmute-ref-dst-mutable.rs | 19 + .../ui-nightly/transmute-ref-dst-mutable.stderr | 29 + .../transmute-ref-dst-not-a-reference.rs | 17 + .../transmute-ref-dst-not-a-reference.stderr | 29 + .../ui-nightly/transmute-ref-dst-not-frombytes.rs | 18 + .../transmute-ref-dst-not-frombytes.stderr | 25 + .../tests/ui-nightly/transmute-ref-dst-unsized.rs | 17 + .../ui-nightly/transmute-ref-dst-unsized.stderr | 69 + .../ui-nightly/transmute-ref-illegal-lifetime.rs | 15 + .../transmute-ref-illegal-lifetime.stderr | 12 + .../ui-nightly/transmute-ref-size-decrease.rs | 17 + .../ui-nightly/transmute-ref-size-decrease.stderr | 9 + .../ui-nightly/transmute-ref-size-increase.rs | 17 + .../ui-nightly/transmute-ref-size-increase.stderr | 9 + .../ui-nightly/transmute-ref-src-dst-generic.rs | 19 + .../transmute-ref-src-dst-generic.stderr | 19 + .../transmute-ref-src-dst-not-references.rs | 17 + .../transmute-ref-src-dst-not-references.stderr | 45 + 
.../ui-nightly/transmute-ref-src-dst-unsized.rs | 17 + .../transmute-ref-src-dst-unsized.stderr | 183 + .../tests/ui-nightly/transmute-ref-src-generic.rs | 18 + .../ui-nightly/transmute-ref-src-generic.stderr | 19 + .../transmute-ref-src-not-a-reference.rs | 17 + .../transmute-ref-src-not-a-reference.stderr | 15 + .../ui-nightly/transmute-ref-src-not-asbytes.rs | 18 + .../transmute-ref-src-not-asbytes.stderr | 48 + .../tests/ui-nightly/transmute-ref-src-unsized.rs | 16 + .../ui-nightly/transmute-ref-src-unsized.stderr | 127 + .../tests/ui-nightly/transmute-size-decrease.rs | 19 + .../ui-nightly/transmute-size-decrease.stderr | 9 + .../tests/ui-nightly/transmute-size-increase.rs | 19 + .../ui-nightly/transmute-size-increase.stderr | 9 + .../tests/ui-nightly/transmute-src-not-asbytes.rs | 18 + .../ui-nightly/transmute-src-not-asbytes.stderr | 48 + .../ui-stable/include_value_not_from_bytes.rs | 12 + .../ui-stable/include_value_not_from_bytes.stderr | 25 + .../tests/ui-stable/include_value_wrong_size.rs | 11 + .../ui-stable/include_value_wrong_size.stderr | 9 + .../tests/ui-stable/invalid-impls/invalid-impls.rs | 29 + .../ui-stable/invalid-impls/invalid-impls.stderr | 107 + .../rust/zerocopy/tests/ui-stable/max-align.rs | 99 + .../rust/zerocopy/tests/ui-stable/max-align.stderr | 5 + .../tests/ui-stable/transmute-dst-not-frombytes.rs | 18 + .../ui-stable/transmute-dst-not-frombytes.stderr | 25 + .../ui-stable/transmute-mut-alignment-increase.rs | 19 + .../transmute-mut-alignment-increase.stderr | 9 + .../tests/ui-stable/transmute-mut-const.rs | 20 + .../tests/ui-stable/transmute-mut-const.stderr | 41 + .../tests/ui-stable/transmute-mut-dst-generic.rs | 18 + .../ui-stable/transmute-mut-dst-generic.stderr | 19 + .../ui-stable/transmute-mut-dst-not-a-reference.rs | 17 + .../transmute-mut-dst-not-a-reference.stderr | 39 + .../ui-stable/transmute-mut-dst-not-asbytes.rs | 24 + .../ui-stable/transmute-mut-dst-not-asbytes.stderr | 25 + 
.../ui-stable/transmute-mut-dst-not-frombytes.rs | 24 + .../transmute-mut-dst-not-frombytes.stderr | 25 + .../tests/ui-stable/transmute-mut-dst-unsized.rs | 17 + .../ui-stable/transmute-mut-dst-unsized.stderr | 106 + .../ui-stable/transmute-mut-illegal-lifetime.rs | 15 + .../transmute-mut-illegal-lifetime.stderr | 12 + .../tests/ui-stable/transmute-mut-size-decrease.rs | 17 + .../ui-stable/transmute-mut-size-decrease.stderr | 9 + .../tests/ui-stable/transmute-mut-size-increase.rs | 17 + .../ui-stable/transmute-mut-size-increase.stderr | 9 + .../ui-stable/transmute-mut-src-dst-generic.rs | 19 + .../ui-stable/transmute-mut-src-dst-generic.stderr | 19 + .../transmute-mut-src-dst-not-references.rs | 17 + .../transmute-mut-src-dst-not-references.stderr | 15 + .../ui-stable/transmute-mut-src-dst-unsized.rs | 17 + .../ui-stable/transmute-mut-src-dst-unsized.stderr | 288 + .../tests/ui-stable/transmute-mut-src-generic.rs | 18 + .../ui-stable/transmute-mut-src-generic.stderr | 10 + .../tests/ui-stable/transmute-mut-src-immutable.rs | 18 + .../ui-stable/transmute-mut-src-immutable.stderr | 11 + .../ui-stable/transmute-mut-src-not-a-reference.rs | 17 + .../transmute-mut-src-not-a-reference.stderr | 15 + .../ui-stable/transmute-mut-src-not-asbytes.rs | 24 + .../ui-stable/transmute-mut-src-not-asbytes.stderr | 48 + .../ui-stable/transmute-mut-src-not-frombytes.rs | 24 + .../transmute-mut-src-not-frombytes.stderr | 48 + .../tests/ui-stable/transmute-mut-src-unsized.rs | 16 + .../ui-stable/transmute-mut-src-unsized.stderr | 195 + .../tests/ui-stable/transmute-ptr-to-usize.rs | 20 + .../tests/ui-stable/transmute-ptr-to-usize.stderr | 30 + .../ui-stable/transmute-ref-alignment-increase.rs | 19 + .../transmute-ref-alignment-increase.stderr | 9 + .../tests/ui-stable/transmute-ref-dst-generic.rs | 18 + .../ui-stable/transmute-ref-dst-generic.stderr | 19 + .../tests/ui-stable/transmute-ref-dst-mutable.rs | 19 + .../ui-stable/transmute-ref-dst-mutable.stderr | 29 + 
.../ui-stable/transmute-ref-dst-not-a-reference.rs | 17 + .../transmute-ref-dst-not-a-reference.stderr | 29 + .../ui-stable/transmute-ref-dst-not-frombytes.rs | 18 + .../transmute-ref-dst-not-frombytes.stderr | 25 + .../tests/ui-stable/transmute-ref-dst-unsized.rs | 17 + .../ui-stable/transmute-ref-dst-unsized.stderr | 89 + .../ui-stable/transmute-ref-illegal-lifetime.rs | 15 + .../transmute-ref-illegal-lifetime.stderr | 12 + .../tests/ui-stable/transmute-ref-size-decrease.rs | 17 + .../ui-stable/transmute-ref-size-decrease.stderr | 9 + .../tests/ui-stable/transmute-ref-size-increase.rs | 17 + .../ui-stable/transmute-ref-size-increase.stderr | 9 + .../ui-stable/transmute-ref-src-dst-generic.rs | 19 + .../ui-stable/transmute-ref-src-dst-generic.stderr | 19 + .../transmute-ref-src-dst-not-references.rs | 17 + .../transmute-ref-src-dst-not-references.stderr | 45 + .../ui-stable/transmute-ref-src-dst-unsized.rs | 17 + .../ui-stable/transmute-ref-src-dst-unsized.stderr | 240 + .../tests/ui-stable/transmute-ref-src-generic.rs | 18 + .../ui-stable/transmute-ref-src-generic.stderr | 19 + .../ui-stable/transmute-ref-src-not-a-reference.rs | 17 + .../transmute-ref-src-not-a-reference.stderr | 15 + .../ui-stable/transmute-ref-src-not-asbytes.rs | 18 + .../ui-stable/transmute-ref-src-not-asbytes.stderr | 48 + .../tests/ui-stable/transmute-ref-src-unsized.rs | 16 + .../ui-stable/transmute-ref-src-unsized.stderr | 164 + .../tests/ui-stable/transmute-size-decrease.rs | 19 + .../tests/ui-stable/transmute-size-decrease.stderr | 9 + .../tests/ui-stable/transmute-size-increase.rs | 19 + .../tests/ui-stable/transmute-size-increase.stderr | 9 + .../tests/ui-stable/transmute-src-not-asbytes.rs | 18 + .../ui-stable/transmute-src-not-asbytes.stderr | 48 + .../xsimd/arch/generic/xsimd_generic_math.hpp | 2 +- .../xsimd/arch/generic/xsimd_generic_memory.hpp | 38 +- third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp | 38 +- .../xsimd/include/xsimd/arch/xsimd_avx2.hpp | 20 +- 
.../xsimd/include/xsimd/arch/xsimd_avx512bw.hpp | 16 +- .../xsimd/include/xsimd/arch/xsimd_avx512f.hpp | 48 +- .../xsimd/include/xsimd/arch/xsimd_emulated.hpp | 757 + third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp | 4 + .../xsimd/include/xsimd/arch/xsimd_neon.hpp | 8 +- .../xsimd/include/xsimd/arch/xsimd_neon64.hpp | 70 +- third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp | 12 +- .../xsimd/include/xsimd/arch/xsimd_scalar.hpp | 114 +- .../xsimd/include/xsimd/arch/xsimd_sse2.hpp | 97 +- .../xsimd/include/xsimd/arch/xsimd_sse4_1.hpp | 14 +- .../xsimd/include/xsimd/arch/xsimd_ssse3.hpp | 16 +- third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp | 12 +- .../xsimd/include/xsimd/arch/xsimd_wasm.hpp | 32 +- .../include/xsimd/types/xsimd_all_registers.hpp | 4 + .../xsimd/include/xsimd/types/xsimd_api.hpp | 29 +- .../include/xsimd/types/xsimd_batch_constant.hpp | 122 +- .../xsimd/types/xsimd_emulated_register.hpp | 80 + third_party/xsimd/moz.yaml | 4 +- third_party/zstd/moz.build | 1 - 2373 files changed, 378506 insertions(+), 100758 deletions(-) create mode 100644 third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.h create mode 100644 third_party/aom/aom_dsp/flow_estimation/arm/disflow_sve.c create mode 100644 third_party/aom/av1/common/x86/resize_avx2.c create mode 100644 third_party/aom/av1/encoder/arm/neon/pickrst_sve.c create mode 100644 third_party/aom/test/frame_resize_test.cc create mode 100644 third_party/dav1d/src/arm/64/mc_dotprod.S create mode 100644 third_party/jpeg-xl/lib/jxl/base/include_jpeglib.h create mode 100644 third_party/jpeg-xl/lib/jxl/base/rect.h create mode 100644 third_party/libwebrtc/api/candidate_unittest.cc delete mode 100644 third_party/libwebrtc/api/crypto_params.h delete mode 100644 third_party/libwebrtc/api/stats/rtc_stats_member.h create mode 100644 third_party/libwebrtc/api/test/mock_frame_transformer.h create mode 100644 third_party/libwebrtc/api/transport/bandwidth_estimation_settings.h create mode 100644 
third_party/libwebrtc/api/transport/bandwidth_estimation_settings_gn/moz.build create mode 100644 third_party/libwebrtc/api/video_codecs/video_decoder_factory.cc delete mode 100644 third_party/libwebrtc/modules/audio_coding/neteq/post_decode_vad.cc delete mode 100644 third_party/libwebrtc/modules/audio_coding/neteq/post_decode_vad.h delete mode 100644 third_party/libwebrtc/modules/audio_coding/neteq/post_decode_vad_unittest.cc create mode 100644 third_party/libwebrtc/modules/rtp_rtcp/source/rtp_packet_h265_common.h create mode 100644 third_party/libwebrtc/modules/rtp_rtcp/source/video_rtp_depacketizer_h265.cc create mode 100644 third_party/libwebrtc/modules/rtp_rtcp/source/video_rtp_depacketizer_h265.h create mode 100644 third_party/libwebrtc/modules/rtp_rtcp/source/video_rtp_depacketizer_h265_unittest.cc delete mode 100644 third_party/libwebrtc/modules/video_coding/h264_packet_buffer.cc delete mode 100644 third_party/libwebrtc/modules/video_coding/h264_packet_buffer.h delete mode 100644 third_party/libwebrtc/modules/video_coding/h264_packet_buffer_unittest.cc create mode 100644 third_party/libwebrtc/modules/video_coding/h26x_packet_buffer.cc create mode 100644 third_party/libwebrtc/modules/video_coding/h26x_packet_buffer.h create mode 100644 third_party/libwebrtc/modules/video_coding/h26x_packet_buffer_unittest.cc create mode 100644 third_party/libwebrtc/modules/video_coding/include/video_error_codes_utils.cc create mode 100644 third_party/libwebrtc/modules/video_coding/include/video_error_codes_utils.h delete mode 100644 third_party/libwebrtc/moz-patch-stack/0108.patch delete mode 100644 third_party/libwebrtc/moz-patch-stack/0109.patch delete mode 100644 third_party/libwebrtc/moz-patch-stack/0110.patch delete mode 100644 third_party/libwebrtc/moz-patch-stack/541f202354.no-op-cherry-pick-msg create mode 100644 third_party/libwebrtc/moz-patch-stack/74a4038ead.no-op-cherry-pick-msg delete mode 100644 
third_party/libwebrtc/moz-patch-stack/958c9ac546.no-op-cherry-pick-msg delete mode 100644 third_party/libwebrtc/moz-patch-stack/de3c726121.no-op-cherry-pick-msg delete mode 100644 third_party/libwebrtc/pc/srtp_filter.cc delete mode 100644 third_party/libwebrtc/pc/srtp_filter.h delete mode 100644 third_party/libwebrtc/pc/srtp_filter_unittest.cc delete mode 100644 third_party/libwebrtc/rtc_base/proxy_unittest.cc create mode 100644 third_party/libwebrtc/sdk/objc/api/peerconnection/RTCRtpCapabilities+Private.h create mode 100644 third_party/libwebrtc/sdk/objc/api/peerconnection/RTCRtpCapabilities.h create mode 100644 third_party/libwebrtc/sdk/objc/api/peerconnection/RTCRtpCapabilities.mm create mode 100644 third_party/libwebrtc/sdk/objc/api/peerconnection/RTCRtpCodecCapability+Private.h create mode 100644 third_party/libwebrtc/sdk/objc/api/peerconnection/RTCRtpCodecCapability.h create mode 100644 third_party/libwebrtc/sdk/objc/api/peerconnection/RTCRtpCodecCapability.mm create mode 100644 third_party/libwebrtc/sdk/objc/api/peerconnection/RTCRtpHeaderExtensionCapability+Private.h create mode 100644 third_party/libwebrtc/sdk/objc/api/peerconnection/RTCRtpHeaderExtensionCapability.h create mode 100644 third_party/libwebrtc/sdk/objc/api/peerconnection/RTCRtpHeaderExtensionCapability.mm delete mode 100644 third_party/libwebrtc/stats/rtc_stats_member.cc create mode 100644 third_party/libwebrtc/test/fuzzers/h265_depacketizer_fuzzer.cc delete mode 100644 third_party/libwebrtc/test/mock_frame_transformer.h delete mode 100644 third_party/libwebrtc/test/mock_transformable_frame.h create mode 100644 third_party/perfetto/LICENSE create mode 100644 third_party/perfetto/moz.build create mode 100644 third_party/perfetto/moz.yaml create mode 100644 third_party/perfetto/moz_attributes.patch create mode 100644 third_party/perfetto/sdk/perfetto.cc create mode 100644 third_party/perfetto/sdk/perfetto.h delete mode 100644 
third_party/python/glean_parser/glean_parser-13.0.1.dist-info/AUTHORS.md delete mode 100644 third_party/python/glean_parser/glean_parser-13.0.1.dist-info/LICENSE delete mode 100644 third_party/python/glean_parser/glean_parser-13.0.1.dist-info/METADATA delete mode 100644 third_party/python/glean_parser/glean_parser-13.0.1.dist-info/RECORD delete mode 100644 third_party/python/glean_parser/glean_parser-13.0.1.dist-info/WHEEL delete mode 100644 third_party/python/glean_parser/glean_parser-13.0.1.dist-info/entry_points.txt delete mode 100644 third_party/python/glean_parser/glean_parser-13.0.1.dist-info/top_level.txt create mode 100644 third_party/python/glean_parser/glean_parser-14.0.1.dist-info/AUTHORS.md create mode 100644 third_party/python/glean_parser/glean_parser-14.0.1.dist-info/LICENSE create mode 100644 third_party/python/glean_parser/glean_parser-14.0.1.dist-info/METADATA create mode 100644 third_party/python/glean_parser/glean_parser-14.0.1.dist-info/RECORD create mode 100644 third_party/python/glean_parser/glean_parser-14.0.1.dist-info/WHEEL create mode 100644 third_party/python/glean_parser/glean_parser-14.0.1.dist-info/entry_points.txt create mode 100644 third_party/python/glean_parser/glean_parser-14.0.1.dist-info/top_level.txt delete mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-6.3.0.dist-info/LICENSE delete mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-6.3.0.dist-info/METADATA delete mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-6.3.0.dist-info/RECORD delete mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-6.3.0.dist-info/WHEEL delete mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-6.3.0.dist-info/entry_points.txt delete mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-6.3.0.dist-info/top_level.txt create mode 100644 
third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-8.0.1.dist-info/LICENSE create mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-8.0.1.dist-info/METADATA create mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-8.0.1.dist-info/RECORD create mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-8.0.1.dist-info/WHEEL create mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-8.0.1.dist-info/entry_points.txt create mode 100644 third_party/python/taskcluster_taskgraph/taskcluster_taskgraph-8.0.1.dist-info/top_level.txt delete mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/files_changed.py delete mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/__init__.py delete mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/common.py delete mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/index_search.py delete mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/run_task.py delete mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/transforms/job/toolchain.py create mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/transforms/run/__init__.py create mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/transforms/run/common.py create mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/transforms/run/index_search.py create mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/transforms/run/run_task.py create mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/transforms/run/toolchain.py delete mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/util/decision.py create mode 100644 third_party/python/taskcluster_taskgraph/taskgraph/util/set_name.py create mode 100644 third_party/rust/any_all_workaround/.cargo-checksum.json create mode 100644 
third_party/rust/any_all_workaround/Cargo.toml create mode 100644 third_party/rust/any_all_workaround/LICENSE-APACHE create mode 100644 third_party/rust/any_all_workaround/LICENSE-MIT create mode 100644 third_party/rust/any_all_workaround/LICENSE-MIT-QCMS create mode 100644 third_party/rust/any_all_workaround/README.md create mode 100644 third_party/rust/any_all_workaround/build.rs create mode 100644 third_party/rust/any_all_workaround/src/lib.rs create mode 100644 third_party/rust/bitflags/src/tests/all.rs create mode 100644 third_party/rust/bitflags/src/tests/bits.rs create mode 100644 third_party/rust/bitflags/src/tests/complement.rs create mode 100644 third_party/rust/bitflags/src/tests/contains.rs create mode 100644 third_party/rust/bitflags/src/tests/difference.rs create mode 100644 third_party/rust/bitflags/src/tests/empty.rs create mode 100644 third_party/rust/bitflags/src/tests/eq.rs create mode 100644 third_party/rust/bitflags/src/tests/extend.rs create mode 100644 third_party/rust/bitflags/src/tests/flags.rs create mode 100644 third_party/rust/bitflags/src/tests/fmt.rs create mode 100644 third_party/rust/bitflags/src/tests/from_bits.rs create mode 100644 third_party/rust/bitflags/src/tests/from_bits_retain.rs create mode 100644 third_party/rust/bitflags/src/tests/from_bits_truncate.rs create mode 100644 third_party/rust/bitflags/src/tests/from_name.rs create mode 100644 third_party/rust/bitflags/src/tests/insert.rs create mode 100644 third_party/rust/bitflags/src/tests/intersection.rs create mode 100644 third_party/rust/bitflags/src/tests/intersects.rs create mode 100644 third_party/rust/bitflags/src/tests/is_all.rs create mode 100644 third_party/rust/bitflags/src/tests/is_empty.rs create mode 100644 third_party/rust/bitflags/src/tests/iter.rs create mode 100644 third_party/rust/bitflags/src/tests/parser.rs create mode 100644 third_party/rust/bitflags/src/tests/remove.rs create mode 100644 third_party/rust/bitflags/src/tests/symmetric_difference.rs 
create mode 100644 third_party/rust/bitflags/src/tests/union.rs delete mode 100644 third_party/rust/core-foundation-sys/build.rs create mode 100644 third_party/rust/core-foundation-sys/src/bag.rs create mode 100644 third_party/rust/core-foundation-sys/src/binary_heap.rs create mode 100644 third_party/rust/core-foundation-sys/src/bit_vector.rs create mode 100644 third_party/rust/core-foundation-sys/src/calendar.rs create mode 100644 third_party/rust/core-foundation-sys/src/date_formatter.rs create mode 100644 third_party/rust/core-foundation-sys/src/file_security.rs create mode 100644 third_party/rust/core-foundation-sys/src/locale.rs create mode 100644 third_party/rust/core-foundation-sys/src/notification_center.rs create mode 100644 third_party/rust/core-foundation-sys/src/number_formatter.rs create mode 100644 third_party/rust/core-foundation-sys/src/plugin.rs create mode 100644 third_party/rust/core-foundation-sys/src/preferences.rs create mode 100644 third_party/rust/core-foundation-sys/src/socket.rs create mode 100644 third_party/rust/core-foundation-sys/src/stream.rs create mode 100644 third_party/rust/core-foundation-sys/src/string_tokenizer.rs create mode 100644 third_party/rust/core-foundation-sys/src/tree.rs create mode 100644 third_party/rust/core-foundation-sys/src/url_enumerator.rs create mode 100644 third_party/rust/core-foundation-sys/src/user_notification.rs create mode 100644 third_party/rust/core-foundation-sys/src/xml_node.rs create mode 100644 third_party/rust/core-foundation-sys/src/xml_parser.rs create mode 100644 third_party/rust/core-graphics-types/LICENSE-APACHE create mode 100644 third_party/rust/core-graphics-types/LICENSE-MIT delete mode 100644 third_party/rust/cssparser/.github/workflows/main.yml delete mode 100644 third_party/rust/cssparser/docs/.nojekyll create mode 100644 third_party/rust/equivalent/.cargo-checksum.json create mode 100644 third_party/rust/equivalent/Cargo.toml create mode 100644 
third_party/rust/equivalent/LICENSE-APACHE create mode 100644 third_party/rust/equivalent/LICENSE-MIT create mode 100644 third_party/rust/equivalent/README.md create mode 100644 third_party/rust/equivalent/src/lib.rs create mode 100644 third_party/rust/fallible_collections/src/try_reserve_error.rs create mode 100644 third_party/rust/getrandom/src/lazy.rs create mode 100644 third_party/rust/getrandom/src/linux_android_with_fallback.rs delete mode 100644 third_party/rust/glean-core/src/metrics/metrics_enabled_config.rs create mode 100644 third_party/rust/glean-core/src/metrics/remote_settings_config.rs create mode 100644 third_party/rust/hashbrown/src/external_trait_impls/rayon/table.rs create mode 100644 third_party/rust/hashbrown/src/external_trait_impls/rkyv/hash_map.rs create mode 100644 third_party/rust/hashbrown/src/external_trait_impls/rkyv/hash_set.rs create mode 100644 third_party/rust/hashbrown/src/external_trait_impls/rkyv/mod.rs create mode 100644 third_party/rust/hashbrown/src/raw/neon.rs create mode 100644 third_party/rust/hashbrown/src/table.rs create mode 100644 third_party/rust/hashbrown/tests/equivalent_trait.rs create mode 100644 third_party/rust/hashbrown/tests/raw.rs delete mode 100644 third_party/rust/indexmap/build.rs create mode 100644 third_party/rust/indexmap/src/borsh.rs delete mode 100644 third_party/rust/indexmap/src/equivalent.rs create mode 100644 third_party/rust/indexmap/src/map/core/entry.rs create mode 100644 third_party/rust/indexmap/src/map/core/raw_entry_v1.rs create mode 100644 third_party/rust/indexmap/src/map/iter.rs create mode 100644 third_party/rust/indexmap/src/map/mutable.rs create mode 100644 third_party/rust/indexmap/src/map/serde_seq.rs create mode 100644 third_party/rust/indexmap/src/map/slice.rs create mode 100644 third_party/rust/indexmap/src/map/tests.rs delete mode 100644 third_party/rust/indexmap/src/mutable_keys.rs delete mode 100644 third_party/rust/indexmap/src/serde_seq.rs create mode 100644 
third_party/rust/indexmap/src/set/iter.rs create mode 100644 third_party/rust/indexmap/src/set/mutable.rs create mode 100644 third_party/rust/indexmap/src/set/slice.rs create mode 100644 third_party/rust/indexmap/src/set/tests.rs create mode 100644 third_party/rust/interrupt-support/build.rs create mode 100644 third_party/rust/interrupt-support/src/interrupt_support.udl create mode 100644 third_party/rust/naga/src/back/pipeline_constants.rs create mode 100644 third_party/rust/naga/src/back/spv/subgroup.rs create mode 100644 third_party/rust/naga/src/error.rs create mode 100644 third_party/rust/neqo-common/src/fuzz.rs create mode 100644 third_party/rust/neqo-transport/src/connection/tests/ecn.rs create mode 100644 third_party/rust/neqo-transport/src/ecn.rs create mode 100644 third_party/rust/nix/build.rs create mode 100644 third_party/rust/nix/src/poll_timeout.rs create mode 100644 third_party/rust/nix/src/sys/fanotify.rs create mode 100644 third_party/rust/nix/test/sys/test_event.rs create mode 100644 third_party/rust/nix/test/sys/test_fanotify.rs create mode 100644 third_party/rust/nix/test/sys/test_resource.rs create mode 100644 third_party/rust/nix/test/sys/test_statfs.rs create mode 100644 third_party/rust/nix/test/sys/test_statvfs.rs create mode 100644 third_party/rust/nix/test/sys/test_time.rs create mode 100644 third_party/rust/nix/test/sys/test_timer.rs create mode 100644 third_party/rust/nix/test/sys/test_utsname.rs create mode 100644 third_party/rust/nix/test/test_errno.rs delete mode 100644 third_party/rust/nix/test/test_resource.rs delete mode 100644 third_party/rust/nix/test/test_timer.rs delete mode 100644 third_party/rust/objc_exception/.cargo-checksum.json delete mode 100644 third_party/rust/objc_exception/Cargo.toml delete mode 100644 third_party/rust/objc_exception/build.rs delete mode 100644 third_party/rust/objc_exception/extern/exception.m delete mode 100644 third_party/rust/objc_exception/src/lib.rs delete mode 100644 
third_party/rust/owning_ref/.cargo-checksum.json delete mode 100644 third_party/rust/owning_ref/CHANGELOG.md delete mode 100644 third_party/rust/owning_ref/Cargo.toml delete mode 100644 third_party/rust/owning_ref/LICENSE delete mode 100644 third_party/rust/owning_ref/README.md delete mode 100644 third_party/rust/owning_ref/src/lib.rs delete mode 100644 third_party/rust/packed_simd/.cargo-checksum.json delete mode 100644 third_party/rust/packed_simd/Cargo.toml delete mode 100644 third_party/rust/packed_simd/LICENSE-APACHE delete mode 100644 third_party/rust/packed_simd/LICENSE-MIT delete mode 100644 third_party/rust/packed_simd/README.md delete mode 100644 third_party/rust/packed_simd/bors.toml delete mode 100644 third_party/rust/packed_simd/build.rs delete mode 100755 third_party/rust/packed_simd/ci/all.sh delete mode 100644 third_party/rust/packed_simd/ci/android-install-ndk.sh delete mode 100644 third_party/rust/packed_simd/ci/android-install-sdk.sh delete mode 100644 third_party/rust/packed_simd/ci/android-sysimage.sh delete mode 100755 third_party/rust/packed_simd/ci/benchmark.sh delete mode 100644 third_party/rust/packed_simd/ci/deploy_and_run_on_ios_simulator.rs delete mode 100644 third_party/rust/packed_simd/ci/docker/aarch64-linux-android/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/aarch64-unknown-linux-gnu/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/arm-unknown-linux-gnueabi/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/armv7-linux-androideabi/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/i586-unknown-linux-gnu/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/i686-unknown-linux-gnu/Dockerfile delete mode 100644 
third_party/rust/packed_simd/ci/docker/mips-unknown-linux-gnu/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/mipsel-unknown-linux-musl/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/powerpc-unknown-linux-gnu/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/s390x-unknown-linux-gnu/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/sparc64-unknown-linux-gnu/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/thumbv7neon-linux-androideabi/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/thumbv7neon-unknown-linux-gnueabihf/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/wasm32-unknown-unknown/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/x86_64-linux-android/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile delete mode 100644 third_party/rust/packed_simd/ci/docker/x86_64-unknown-linux-gnu/Dockerfile delete mode 100755 third_party/rust/packed_simd/ci/dox.sh delete mode 100644 third_party/rust/packed_simd/ci/linux-s390x.sh delete mode 100644 third_party/rust/packed_simd/ci/linux-sparc64.sh delete mode 100644 third_party/rust/packed_simd/ci/lld-shim.rs delete mode 100755 third_party/rust/packed_simd/ci/max_line_width.sh delete mode 100755 third_party/rust/packed_simd/ci/run-docker.sh delete mode 100755 third_party/rust/packed_simd/ci/run.sh delete mode 100644 third_party/rust/packed_simd/ci/run_examples.sh delete mode 100644 
third_party/rust/packed_simd/ci/runtest-android.rs delete mode 100755 third_party/rust/packed_simd/ci/setup_benchmarks.sh delete mode 100755 third_party/rust/packed_simd/ci/test-runner-linux delete mode 100644 third_party/rust/packed_simd/contributing.md delete mode 100644 third_party/rust/packed_simd/perf-guide/book.toml delete mode 100644 third_party/rust/packed_simd/perf-guide/src/SUMMARY.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/ascii.css delete mode 100644 third_party/rust/packed_simd/perf-guide/src/bound_checks.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/float-math/approx.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/float-math/fma.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/float-math/fp.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/float-math/svml.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/introduction.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/prof/linux.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/prof/mca.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/prof/profiling.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/target-feature/attribute.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/target-feature/features.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/target-feature/inlining.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/target-feature/practice.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/target-feature/runtime.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/target-feature/rustflags.md delete mode 100644 third_party/rust/packed_simd/perf-guide/src/vert-hor-ops.md delete mode 100644 third_party/rust/packed_simd/rust-toolchain delete mode 100644 third_party/rust/packed_simd/rustfmt.toml delete mode 100644 third_party/rust/packed_simd/src/api.rs delete 
mode 100644 third_party/rust/packed_simd/src/api/bit_manip.rs delete mode 100644 third_party/rust/packed_simd/src/api/bitmask.rs delete mode 100644 third_party/rust/packed_simd/src/api/cast.rs delete mode 100644 third_party/rust/packed_simd/src/api/cast/macros.rs delete mode 100644 third_party/rust/packed_simd/src/api/cast/v128.rs delete mode 100644 third_party/rust/packed_simd/src/api/cast/v16.rs delete mode 100644 third_party/rust/packed_simd/src/api/cast/v256.rs delete mode 100644 third_party/rust/packed_simd/src/api/cast/v32.rs delete mode 100644 third_party/rust/packed_simd/src/api/cast/v512.rs delete mode 100644 third_party/rust/packed_simd/src/api/cast/v64.rs delete mode 100644 third_party/rust/packed_simd/src/api/cmp.rs delete mode 100644 third_party/rust/packed_simd/src/api/cmp/eq.rs delete mode 100644 third_party/rust/packed_simd/src/api/cmp/ord.rs delete mode 100644 third_party/rust/packed_simd/src/api/cmp/partial_eq.rs delete mode 100644 third_party/rust/packed_simd/src/api/cmp/partial_ord.rs delete mode 100644 third_party/rust/packed_simd/src/api/cmp/vertical.rs delete mode 100644 third_party/rust/packed_simd/src/api/default.rs delete mode 100644 third_party/rust/packed_simd/src/api/fmt.rs delete mode 100644 third_party/rust/packed_simd/src/api/fmt/binary.rs delete mode 100644 third_party/rust/packed_simd/src/api/fmt/debug.rs delete mode 100644 third_party/rust/packed_simd/src/api/fmt/lower_hex.rs delete mode 100644 third_party/rust/packed_simd/src/api/fmt/octal.rs delete mode 100644 third_party/rust/packed_simd/src/api/fmt/upper_hex.rs delete mode 100644 third_party/rust/packed_simd/src/api/from.rs delete mode 100644 third_party/rust/packed_simd/src/api/from/from_array.rs delete mode 100644 third_party/rust/packed_simd/src/api/from/from_vector.rs delete mode 100644 third_party/rust/packed_simd/src/api/hash.rs delete mode 100644 third_party/rust/packed_simd/src/api/into_bits.rs delete mode 100644 
third_party/rust/packed_simd/src/api/into_bits/arch_specific.rs delete mode 100644 third_party/rust/packed_simd/src/api/into_bits/macros.rs delete mode 100644 third_party/rust/packed_simd/src/api/into_bits/v128.rs delete mode 100644 third_party/rust/packed_simd/src/api/into_bits/v16.rs delete mode 100644 third_party/rust/packed_simd/src/api/into_bits/v256.rs delete mode 100644 third_party/rust/packed_simd/src/api/into_bits/v32.rs delete mode 100644 third_party/rust/packed_simd/src/api/into_bits/v512.rs delete mode 100644 third_party/rust/packed_simd/src/api/into_bits/v64.rs delete mode 100644 third_party/rust/packed_simd/src/api/math.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/abs.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/consts.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/cos.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/exp.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/ln.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/mul_add.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/mul_adde.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/powf.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/recpre.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/rsqrte.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/sin.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/sqrt.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/sqrte.rs delete mode 100644 third_party/rust/packed_simd/src/api/math/float/tanh.rs delete mode 100644 third_party/rust/packed_simd/src/api/minimal.rs delete mode 100644 third_party/rust/packed_simd/src/api/minimal/iuf.rs delete mode 100644 third_party/rust/packed_simd/src/api/minimal/mask.rs delete mode 
100644 third_party/rust/packed_simd/src/api/minimal/ptr.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/scalar_arithmetic.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/scalar_bitwise.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/scalar_mask_bitwise.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/scalar_shifts.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/vector_arithmetic.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/vector_bitwise.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/vector_float_min_max.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/vector_int_min_max.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/vector_mask_bitwise.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/vector_neg.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/vector_rotates.rs delete mode 100644 third_party/rust/packed_simd/src/api/ops/vector_shifts.rs delete mode 100644 third_party/rust/packed_simd/src/api/ptr.rs delete mode 100644 third_party/rust/packed_simd/src/api/ptr/gather_scatter.rs delete mode 100644 third_party/rust/packed_simd/src/api/reductions.rs delete mode 100644 third_party/rust/packed_simd/src/api/reductions/bitwise.rs delete mode 100644 third_party/rust/packed_simd/src/api/reductions/float_arithmetic.rs delete mode 100644 third_party/rust/packed_simd/src/api/reductions/integer_arithmetic.rs delete mode 100644 third_party/rust/packed_simd/src/api/reductions/mask.rs delete mode 100644 third_party/rust/packed_simd/src/api/reductions/min_max.rs delete mode 100644 third_party/rust/packed_simd/src/api/select.rs delete mode 100644 third_party/rust/packed_simd/src/api/shuffle.rs delete mode 100644 third_party/rust/packed_simd/src/api/shuffle1_dyn.rs delete mode 100644 third_party/rust/packed_simd/src/api/slice.rs delete mode 100644 
third_party/rust/packed_simd/src/api/slice/from_slice.rs delete mode 100644 third_party/rust/packed_simd/src/api/slice/write_to_slice.rs delete mode 100644 third_party/rust/packed_simd/src/api/swap_bytes.rs delete mode 100644 third_party/rust/packed_simd/src/codegen.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/bit_manip.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/llvm.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/abs.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/cos.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/cos_pi.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/exp.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/ln.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/macros.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/mul_add.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/mul_adde.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/powf.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/sin.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/sin_cos_pi.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/sin_pi.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/sqrt.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/sqrte.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/math/float/tanh.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/pointer_sized_int.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/reductions.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/reductions/mask.rs delete mode 
100644 third_party/rust/packed_simd/src/codegen/reductions/mask/aarch64.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/reductions/mask/arm.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/reductions/mask/fallback.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/reductions/mask/fallback_impl.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/reductions/mask/x86.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/reductions/mask/x86/avx.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/reductions/mask/x86/avx2.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/reductions/mask/x86/sse.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/reductions/mask/x86/sse2.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/shuffle.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/shuffle1_dyn.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/swap_bytes.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/v128.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/v16.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/v256.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/v32.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/v512.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/v64.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/vPtr.rs delete mode 100644 third_party/rust/packed_simd/src/codegen/vSize.rs delete mode 100644 third_party/rust/packed_simd/src/lib.rs delete mode 100644 third_party/rust/packed_simd/src/masks.rs delete mode 100644 third_party/rust/packed_simd/src/sealed.rs delete mode 100644 third_party/rust/packed_simd/src/testing.rs delete mode 100644 third_party/rust/packed_simd/src/testing/macros.rs delete mode 100644 third_party/rust/packed_simd/src/testing/utils.rs delete mode 100644 third_party/rust/packed_simd/src/v128.rs delete mode 100644 
third_party/rust/packed_simd/src/v16.rs delete mode 100644 third_party/rust/packed_simd/src/v256.rs delete mode 100644 third_party/rust/packed_simd/src/v32.rs delete mode 100644 third_party/rust/packed_simd/src/v512.rs delete mode 100644 third_party/rust/packed_simd/src/v64.rs delete mode 100644 third_party/rust/packed_simd/src/vPtr.rs delete mode 100644 third_party/rust/packed_simd/src/vSize.rs delete mode 100644 third_party/rust/packed_simd/tests/endianness.rs create mode 100755 third_party/rust/prio/documentation/field_parameters.sage delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/IdpfPoplar_0.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Poplar1_0.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Poplar1_1.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Poplar1_2.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Poplar1_3.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Prio3Count_0.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Prio3Count_1.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Prio3Histogram_0.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Prio3Histogram_1.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Prio3SumVec_0.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Prio3SumVec_1.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Prio3Sum_0.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/Prio3Sum_1.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/XofFixedKeyAes128.json delete mode 100644 third_party/rust/prio/src/vdaf/test_vec/07/XofShake128.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/IdpfPoplar_0.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Poplar1_0.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Poplar1_1.json create mode 100644 
third_party/rust/prio/src/vdaf/test_vec/08/Poplar1_2.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Poplar1_3.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Prio3Count_0.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Prio3Count_1.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Prio3Histogram_0.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Prio3Histogram_1.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Prio3SumVec_0.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Prio3SumVec_1.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Prio3Sum_0.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/Prio3Sum_1.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/XofFixedKeyAes128.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/08/XofTurboShake128.json create mode 100644 third_party/rust/prio/src/vdaf/test_vec/XofHmacSha256Aes128.json create mode 100644 third_party/rust/prio/src/vidpf.rs create mode 100644 third_party/rust/relevancy/src/ingest.rs delete mode 100644 third_party/rust/relevancy/src/populate_interests.rs create mode 100644 third_party/rust/relevancy/src/rs.rs delete mode 100644 third_party/rust/serde_json/src/features_check/error.rs delete mode 100644 third_party/rust/serde_json/src/features_check/mod.rs create mode 100644 third_party/rust/serde_json/tests/regression/issue1004.rs create mode 100644 third_party/rust/sql-support/src/lazy.rs delete mode 100644 third_party/rust/uniffi-example-arithmetic/.cargo-checksum.json delete mode 100644 third_party/rust/uniffi-example-arithmetic/Cargo.toml delete mode 100644 third_party/rust/uniffi-example-arithmetic/build.rs delete mode 100644 third_party/rust/uniffi-example-arithmetic/src/arithmetic.udl delete mode 100644 third_party/rust/uniffi-example-arithmetic/src/lib.rs delete mode 100644 
third_party/rust/uniffi-example-arithmetic/tests/bindings/test_arithmetic.kts delete mode 100644 third_party/rust/uniffi-example-arithmetic/tests/bindings/test_arithmetic.py delete mode 100644 third_party/rust/uniffi-example-arithmetic/tests/bindings/test_arithmetic.rb delete mode 100644 third_party/rust/uniffi-example-arithmetic/tests/bindings/test_arithmetic.swift delete mode 100644 third_party/rust/uniffi-example-arithmetic/tests/test_generated_bindings.rs delete mode 100644 third_party/rust/uniffi-example-arithmetic/uniffi.toml delete mode 100644 third_party/rust/uniffi-example-geometry/.cargo-checksum.json delete mode 100644 third_party/rust/uniffi-example-geometry/Cargo.toml delete mode 100644 third_party/rust/uniffi-example-geometry/build.rs delete mode 100644 third_party/rust/uniffi-example-geometry/src/geometry.udl delete mode 100644 third_party/rust/uniffi-example-geometry/src/lib.rs delete mode 100644 third_party/rust/uniffi-example-geometry/tests/bindings/test_geometry.kts delete mode 100644 third_party/rust/uniffi-example-geometry/tests/bindings/test_geometry.py delete mode 100644 third_party/rust/uniffi-example-geometry/tests/bindings/test_geometry.rb delete mode 100644 third_party/rust/uniffi-example-geometry/tests/bindings/test_geometry.swift delete mode 100644 third_party/rust/uniffi-example-geometry/tests/test_generated_bindings.rs delete mode 100644 third_party/rust/uniffi-example-rondpoint/.cargo-checksum.json delete mode 100644 third_party/rust/uniffi-example-rondpoint/Cargo.toml delete mode 100644 third_party/rust/uniffi-example-rondpoint/build.rs delete mode 100644 third_party/rust/uniffi-example-rondpoint/src/lib.rs delete mode 100644 third_party/rust/uniffi-example-rondpoint/src/rondpoint.udl delete mode 100644 third_party/rust/uniffi-example-rondpoint/tests/bindings/test_rondpoint.kts delete mode 100644 third_party/rust/uniffi-example-rondpoint/tests/bindings/test_rondpoint.py delete mode 100644 
third_party/rust/uniffi-example-rondpoint/tests/bindings/test_rondpoint.rb delete mode 100644 third_party/rust/uniffi-example-rondpoint/tests/bindings/test_rondpoint.swift delete mode 100644 third_party/rust/uniffi-example-rondpoint/tests/test_generated_bindings.rs delete mode 100644 third_party/rust/uniffi-example-sprites/.cargo-checksum.json delete mode 100644 third_party/rust/uniffi-example-sprites/Cargo.toml delete mode 100644 third_party/rust/uniffi-example-sprites/build.rs delete mode 100644 third_party/rust/uniffi-example-sprites/src/lib.rs delete mode 100644 third_party/rust/uniffi-example-sprites/src/sprites.udl delete mode 100644 third_party/rust/uniffi-example-sprites/tests/bindings/test_sprites.kts delete mode 100644 third_party/rust/uniffi-example-sprites/tests/bindings/test_sprites.py delete mode 100644 third_party/rust/uniffi-example-sprites/tests/bindings/test_sprites.rb delete mode 100644 third_party/rust/uniffi-example-sprites/tests/bindings/test_sprites.swift delete mode 100644 third_party/rust/uniffi-example-sprites/tests/test_generated_bindings.rs delete mode 100644 third_party/rust/uniffi-example-todolist/.cargo-checksum.json delete mode 100644 third_party/rust/uniffi-example-todolist/Cargo.toml delete mode 100644 third_party/rust/uniffi-example-todolist/build.rs delete mode 100644 third_party/rust/uniffi-example-todolist/src/lib.rs delete mode 100644 third_party/rust/uniffi-example-todolist/src/todolist.udl delete mode 100644 third_party/rust/uniffi-example-todolist/tests/bindings/test_todolist.kts delete mode 100644 third_party/rust/uniffi-example-todolist/tests/bindings/test_todolist.py delete mode 100644 third_party/rust/uniffi-example-todolist/tests/bindings/test_todolist.rb delete mode 100644 third_party/rust/uniffi-example-todolist/tests/bindings/test_todolist.swift delete mode 100644 third_party/rust/uniffi-example-todolist/tests/test_generated_bindings.rs delete mode 100644 third_party/rust/wgpu-core/src/any_surface.rs create mode 
100644 third_party/rust/wgpu-core/src/command/allocator.rs create mode 100644 third_party/rust/wgpu-core/src/command/compute_command.rs create mode 100644 third_party/rust/wgpu-core/src/lock/mod.rs create mode 100644 third_party/rust/wgpu-core/src/lock/rank.rs create mode 100644 third_party/rust/wgpu-core/src/lock/ranked.rs create mode 100644 third_party/rust/wgpu-core/src/lock/vanilla.rs create mode 100644 third_party/rust/zerocopy-derive/.cargo-checksum.json create mode 100644 third_party/rust/zerocopy-derive/Cargo.toml create mode 100644 third_party/rust/zerocopy-derive/LICENSE-APACHE create mode 100644 third_party/rust/zerocopy-derive/LICENSE-BSD create mode 100644 third_party/rust/zerocopy-derive/LICENSE-MIT create mode 100644 third_party/rust/zerocopy-derive/src/ext.rs create mode 100644 third_party/rust/zerocopy-derive/src/lib.rs create mode 100644 third_party/rust/zerocopy-derive/src/repr.rs create mode 100644 third_party/rust/zerocopy-derive/tests/enum_as_bytes.rs create mode 100644 third_party/rust/zerocopy-derive/tests/enum_from_zeroes.rs create mode 100644 third_party/rust/zerocopy-derive/tests/enum_known_layout.rs create mode 100644 third_party/rust/zerocopy-derive/tests/enum_unaligned.rs create mode 100644 third_party/rust/zerocopy-derive/tests/hygiene.rs create mode 100644 third_party/rust/zerocopy-derive/tests/paths_and_modules.rs create mode 100644 third_party/rust/zerocopy-derive/tests/priv_in_pub.rs create mode 100644 third_party/rust/zerocopy-derive/tests/struct_as_bytes.rs create mode 100644 third_party/rust/zerocopy-derive/tests/struct_from_bytes.rs create mode 100644 third_party/rust/zerocopy-derive/tests/struct_from_zeroes.rs create mode 100644 third_party/rust/zerocopy-derive/tests/struct_known_layout.rs create mode 100644 third_party/rust/zerocopy-derive/tests/struct_unaligned.rs create mode 100644 third_party/rust/zerocopy-derive/tests/trybuild.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/derive_transparent.rs 
create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/derive_transparent.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/enum.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/enum.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/enum_from_bytes_u8_too_few.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/enum_from_bytes_u8_too_few.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/late_compile_pass.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/late_compile_pass.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/mid_compile_pass.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/mid_compile_pass.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/struct.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/struct.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/union.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-msrv/union.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/derive_transparent.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/derive_transparent.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/enum.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/enum.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/enum_from_bytes_u8_too_few.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/enum_from_bytes_u8_too_few.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/late_compile_pass.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/late_compile_pass.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/mid_compile_pass.rs create mode 100644 
third_party/rust/zerocopy-derive/tests/ui-nightly/mid_compile_pass.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/struct.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/struct.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/union.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-nightly/union.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/derive_transparent.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/derive_transparent.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/enum.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/enum.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/enum_from_bytes_u8_too_few.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/enum_from_bytes_u8_too_few.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/late_compile_pass.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/late_compile_pass.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/mid_compile_pass.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/mid_compile_pass.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/struct.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/struct.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/union.rs create mode 100644 third_party/rust/zerocopy-derive/tests/ui-stable/union.stderr create mode 100644 third_party/rust/zerocopy-derive/tests/union_as_bytes.rs create mode 100644 third_party/rust/zerocopy-derive/tests/union_from_bytes.rs create mode 100644 third_party/rust/zerocopy-derive/tests/union_from_zeroes.rs create mode 100644 third_party/rust/zerocopy-derive/tests/union_known_layout.rs create mode 100644 
third_party/rust/zerocopy-derive/tests/union_unaligned.rs create mode 100644 third_party/rust/zerocopy-derive/tests/util.rs create mode 100644 third_party/rust/zerocopy/.cargo-checksum.json create mode 100644 third_party/rust/zerocopy/CONTRIBUTING.md create mode 100644 third_party/rust/zerocopy/Cargo.toml create mode 100644 third_party/rust/zerocopy/INTERNAL.md create mode 100644 third_party/rust/zerocopy/LICENSE-APACHE create mode 100644 third_party/rust/zerocopy/LICENSE-BSD create mode 100644 third_party/rust/zerocopy/LICENSE-MIT create mode 100644 third_party/rust/zerocopy/POLICIES.md create mode 100644 third_party/rust/zerocopy/README.md create mode 100755 third_party/rust/zerocopy/cargo.sh create mode 100644 third_party/rust/zerocopy/clippy.toml create mode 100755 third_party/rust/zerocopy/generate-readme.sh create mode 100644 third_party/rust/zerocopy/rustfmt.toml create mode 100644 third_party/rust/zerocopy/src/byteorder.rs create mode 100644 third_party/rust/zerocopy/src/lib.rs create mode 100644 third_party/rust/zerocopy/src/macro_util.rs create mode 100644 third_party/rust/zerocopy/src/macros.rs create mode 100644 third_party/rust/zerocopy/src/post_monomorphization_compile_fail_tests.rs create mode 100644 third_party/rust/zerocopy/src/third_party/rust/LICENSE-APACHE create mode 100644 third_party/rust/zerocopy/src/third_party/rust/LICENSE-MIT create mode 100644 third_party/rust/zerocopy/src/third_party/rust/README.fuchsia create mode 100644 third_party/rust/zerocopy/src/third_party/rust/layout.rs create mode 100644 third_party/rust/zerocopy/src/util.rs create mode 100644 third_party/rust/zerocopy/src/wrappers.rs create mode 100644 third_party/rust/zerocopy/testdata/include_value/data create mode 100644 third_party/rust/zerocopy/tests/trybuild.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/include_value_not_from_bytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/include_value_not_from_bytes.stderr create mode 100644 
third_party/rust/zerocopy/tests/ui-msrv/include_value_wrong_size.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/include_value_wrong_size.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/invalid-impls/invalid-impls.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/invalid-impls/invalid-impls.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/max-align.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/max-align.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-dst-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-dst-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-alignment-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-alignment-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-const.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-const.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-dst-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-dst-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-dst-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-dst-not-asbytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-dst-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-dst-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-dst-unsized.stderr create mode 100644 
third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-illegal-lifetime.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-illegal-lifetime.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-size-decrease.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-size-decrease.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-size-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-size-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-dst-not-references.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-dst-not-references.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-immutable.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-immutable.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-not-asbytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-not-frombytes.rs create mode 100644 
third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-mut-src-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ptr-to-usize.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ptr-to-usize.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-alignment-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-alignment-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-dst-mutable.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-dst-mutable.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-dst-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-dst-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-dst-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-dst-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-illegal-lifetime.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-illegal-lifetime.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-size-decrease.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-size-decrease.stderr create mode 100644 
third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-size-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-size-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-dst-not-references.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-dst-not-references.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-not-asbytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-ref-src-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-size-decrease.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-size-decrease.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-size-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-size-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-msrv/transmute-src-not-asbytes.rs create mode 100644 
third_party/rust/zerocopy/tests/ui-msrv/transmute-src-not-asbytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/include_value_not_from_bytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/include_value_not_from_bytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/include_value_wrong_size.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/include_value_wrong_size.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/invalid-impls/invalid-impls.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/invalid-impls/invalid-impls.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/max-align.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/max-align.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-dst-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-dst-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-alignment-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-alignment-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-const.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-const.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-dst-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-dst-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-dst-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-dst-not-asbytes.stderr create mode 100644 
third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-dst-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-dst-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-illegal-lifetime.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-illegal-lifetime.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-size-decrease.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-size-decrease.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-size-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-size-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-dst-not-references.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-dst-not-references.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-immutable.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-immutable.stderr create mode 100644 
third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-not-asbytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-mut-src-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ptr-to-usize.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ptr-to-usize.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-alignment-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-alignment-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-dst-mutable.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-dst-mutable.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-dst-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-dst-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-dst-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-dst-not-frombytes.stderr create mode 100644 
third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-illegal-lifetime.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-illegal-lifetime.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-size-decrease.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-size-decrease.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-size-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-size-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-dst-not-references.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-dst-not-references.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-not-asbytes.stderr create mode 100644 
third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-ref-src-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-size-decrease.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-size-decrease.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-size-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-size-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-src-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-nightly/transmute-src-not-asbytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/include_value_not_from_bytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/include_value_not_from_bytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/include_value_wrong_size.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/include_value_wrong_size.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/invalid-impls/invalid-impls.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/invalid-impls/invalid-impls.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/max-align.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/max-align.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-dst-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-dst-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-alignment-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-alignment-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-const.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-const.stderr create mode 
100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-dst-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-dst-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-dst-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-dst-not-asbytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-dst-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-dst-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-illegal-lifetime.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-illegal-lifetime.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-size-decrease.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-size-decrease.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-size-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-size-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-dst-not-references.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-dst-not-references.stderr create mode 100644 
third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-immutable.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-immutable.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-not-asbytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-mut-src-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ptr-to-usize.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ptr-to-usize.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-alignment-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-alignment-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-dst-mutable.rs create mode 100644 
third_party/rust/zerocopy/tests/ui-stable/transmute-ref-dst-mutable.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-dst-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-dst-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-dst-not-frombytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-dst-not-frombytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-illegal-lifetime.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-illegal-lifetime.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-size-decrease.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-size-decrease.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-size-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-size-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-dst-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-dst-generic.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-dst-not-references.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-dst-not-references.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-dst-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-dst-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-generic.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-generic.stderr 
create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-not-a-reference.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-not-a-reference.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-not-asbytes.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-unsized.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-ref-src-unsized.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-size-decrease.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-size-decrease.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-size-increase.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-size-increase.stderr create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-src-not-asbytes.rs create mode 100644 third_party/rust/zerocopy/tests/ui-stable/transmute-src-not-asbytes.stderr create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_emulated.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_emulated_register.hpp (limited to 'third_party') diff --git a/third_party/aom/aom/aom_image.h b/third_party/aom/aom/aom_image.h index d5f0c087e6..68fb312222 100644 --- a/third_party/aom/aom/aom_image.h +++ b/third_party/aom/aom/aom_image.h @@ -103,7 +103,8 @@ typedef enum aom_transfer_characteristics { AOM_CICP_TC_SMPTE_428 = 17, /**< SMPTE ST 428 */ AOM_CICP_TC_HLG = 18, /**< BT.2100 HLG, ARIB STD-B67 */ AOM_CICP_TC_RESERVED_19 = 19 /**< For future use (values 19-255) */ -} aom_transfer_characteristics_t; /**< alias for enum aom_transfer_function */ +} aom_transfer_characteristics_t; /**< alias for enum + aom_transfer_characteristics */ /*!\brief List of supported matrix coefficients */ typedef enum aom_matrix_coefficients { @@ 
-125,7 +126,7 @@ typedef enum aom_matrix_coefficients { AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */ AOM_CICP_MC_ICTCP = 14, /**< BT.2100 ICtCp */ AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */ -} aom_matrix_coefficients_t; +} aom_matrix_coefficients_t; /**< alias for enum aom_matrix_coefficients */ /*!\brief List of supported color range */ typedef enum aom_color_range { @@ -144,7 +145,8 @@ typedef enum aom_chroma_sample_position { /**< sample, between two vertical samples */ AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */ AOM_CSP_RESERVED = 3 /**< Reserved value */ -} aom_chroma_sample_position_t; /**< alias for enum aom_transfer_function */ +} aom_chroma_sample_position_t; /**< alias for enum aom_chroma_sample_position + */ /*!\brief List of insert flags for Metadata * @@ -244,10 +246,13 @@ typedef struct aom_image { * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image (stride). + * each row in the image (stride). Must not exceed + * 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be @@ -267,10 +272,12 @@ aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). 
* \param[in] align Alignment, in bytes, of each row in the image - * (stride). + * (stride). Must not exceed 65536. * \param[in] img_data Storage to use for the image * * \return Returns a pointer to the initialized image descriptor. If the img @@ -291,12 +298,17 @@ aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image (stride). + * each row in the image (stride). Must not exceed + * 65536. * \param[in] size_align Alignment, in pixels, of the image width and height. + * Must not exceed 65536. * \param[in] border A border that is padded on four sides of the image. + * Must not exceed 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be diff --git a/third_party/aom/aom/src/aom_image.c b/third_party/aom/aom/src/aom_image.c index 3b1c33d056..1d3b7df245 100644 --- a/third_party/aom/aom/src/aom_image.c +++ b/third_party/aom/aom/src/aom_image.c @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include #include #include #include @@ -36,13 +37,20 @@ static aom_image_t *img_alloc_helper( /* NOTE: In this function, bit_depth is either 8 or 16 (if * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12. 
*/ - unsigned int h, w, s, xcs, ycs, bps, bit_depth; - unsigned int stride_in_bytes; + unsigned int xcs, ycs, bps, bit_depth; if (img != NULL) memset(img, 0, sizeof(aom_image_t)); if (fmt == AOM_IMG_FMT_NONE) goto fail; + /* Impose maximum values on input parameters so that this function can + * perform arithmetic operations without worrying about overflows. + */ + if (d_w > 0x08000000 || d_h > 0x08000000 || buf_align > 65536 || + stride_align > 65536 || size_align > 65536 || border > 65536) { + goto fail; + } + /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -105,12 +113,17 @@ static aom_image_t *img_alloc_helper( } /* Calculate storage sizes given the chroma subsampling */ - w = align_image_dimension(d_w, xcs, size_align); - h = align_image_dimension(d_h, ycs, size_align); - - s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / bit_depth; - s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1); - stride_in_bytes = s * bit_depth / 8; + const unsigned int w = align_image_dimension(d_w, xcs, size_align); + assert(d_w <= w); + const unsigned int h = align_image_dimension(d_h, ycs, size_align); + assert(d_h <= h); + + uint64_t s = (uint64_t)w + 2 * border; + s = (fmt & AOM_IMG_FMT_PLANAR) ? 
s : s * bps / bit_depth; + s = s * bit_depth / 8; + s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1); + if (s > INT_MAX) goto fail; + const int stride_in_bytes = (int)s; /* Allocate the new image */ if (!img) { @@ -232,7 +245,7 @@ int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, img->planes[AOM_PLANE_Y] = data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y]; - data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y]; + data += ((size_t)img->h + 2 * border) * img->stride[AOM_PLANE_Y]; unsigned int uv_border_h = border >> img->y_chroma_shift; unsigned int uv_x = x >> img->x_chroma_shift; @@ -244,14 +257,14 @@ int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, } else if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) { img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; - data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_U]; img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; } else { img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; - data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_V]; img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; @@ -291,15 +304,15 @@ void aom_img_free(aom_image_t *img) { } int aom_img_plane_width(const aom_image_t *img, int plane) { - if (plane > 0 && img->x_chroma_shift > 0) - return (img->d_w + 1) >> img->x_chroma_shift; + if (plane > 0) + return (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift; else return img->d_w; } int aom_img_plane_height(const aom_image_t *img, int plane) { - if (plane > 0 && img->y_chroma_shift > 0) - return (img->d_h + 1) >> img->y_chroma_shift; + if (plane > 0) + return (img->d_h + 
img->y_chroma_shift) >> img->y_chroma_shift; else return img->d_h; } diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index de987cbd23..27099d36b2 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -205,6 +205,9 @@ if(CONFIG_AV1_ENCODER) list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SVE + "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_sve.c") endif() list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index 7e746e9cb9..b75bdc5a19 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -1799,7 +1799,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_compute_correlation sse4_1 avx2/; add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v"; - specialize qw/aom_compute_flow_at_point sse4_1 avx2 neon/; + specialize qw/aom_compute_flow_at_point sse4_1 avx2 neon sve/; } } # CONFIG_AV1_ENCODER diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c index 7441108b01..6a177b2e6b 100644 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c @@ -20,6 +20,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" @@ -31,14 +32,14 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi 
= vget_high_s16(filter); - int16x4_t sum; - sum = vmul_lane_s16(s0, filter_lo, 0); + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); sum = vmla_lane_s16(sum, s1, filter_lo, 1); sum = vmla_lane_s16(sum, s2, filter_lo, 2); sum = vmla_lane_s16(sum, s5, filter_hi, 1); sum = vmla_lane_s16(sum, s6, filter_hi, 2); sum = vmla_lane_s16(sum, s7, filter_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); return sum; @@ -51,65 +52,56 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); - int16x8_t sum; - sum = vmulq_lane_s16(s0, filter_lo, 0); + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { +static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int w, + int h) { const int16x8_t filter = vld1q_s16(filter_x); - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)x_step_q4; - (void)filter_y; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1); - if (h == 4) { - uint8x8_t t0, t1, t2, t3, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3; load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); 
transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); src += 7; do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); 
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); transpose_elems_inplace_u8_4x4(&d01, &d23); @@ -123,39 +115,40 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4; dst += 4; w -= 4; } while (w != 0); } else { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - if (w == 4) { do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, 
s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3); @@ -169,48 +162,49 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 8; } while (h > 0); } else { - uint8x8_t d4, d5, d6, d7; - int16x8_t s11, s12, s13, s14; - int width; - const uint8_t *s; - uint8_t *d; - do { - load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - width = w; - s = src + 7; - d = dst; + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); 
+ + s += 7; do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + uint8x8_t d6 = + convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); + uint8x8_t d7 = + convolve8_8(s7, s8, s9, 
s10, s11, s12, s13, s14, filter); transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -224,6 +218,7 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s12; s5 = s13; s6 = s14; + s += 8; d += 8; width -= 8; @@ -236,6 +231,137 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t filter) { + int16x4_t sum = vmul_lane_s16(s0, filter, 0); + sum = vmla_lane_s16(sum, s1, filter, 1); + sum = vmla_lane_s16(sum, s2, filter, 2); + sum = vmla_lane_s16(sum, s3, filter, 3); + + return sum; +} + +static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t filter) { + int16x8_t sum = vmulq_lane_s16(s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int w, + int h) { + // All filter values are even, halve to reduce intermediate precision + // requirements. 
+ const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1); + + if (w == 4) { + do { + int16x8_t t0 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 0 * src_stride))); + int16x8_t t1 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 1 * src_stride))); + + int16x4_t s0[4], s1[4]; + s0[0] = vget_low_s16(t0); + s0[1] = vget_low_s16(vextq_s16(t0, t0, 1)); + s0[2] = vget_low_s16(vextq_s16(t0, t0, 2)); + s0[3] = vget_low_s16(vextq_s16(t0, t0, 3)); + + s1[0] = vget_low_s16(t1); + s1[1] = vget_low_s16(vextq_s16(t1, t1, 1)); + s1[2] = vget_low_s16(vextq_s16(t1, t1, 2)); + s1[3] = vget_low_s16(vextq_s16(t1, t1, 3)); + + int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter); + int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + int16x8_t t0 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride))); + int16x8_t t1 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride))); + + s += 8; + do { + int16x8_t t2 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride))); + int16x8_t t3 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride))); + + int16x8_t s0[4], s1[4]; + s0[0] = t0; + s0[1] = vextq_s16(t0, t2, 1); + s0[2] = vextq_s16(t0, t2, 2); + s0[3] = vextq_s16(t0, t2, 3); + + s1[0] = t1; + s1[1] = vextq_s16(t1, t3, 1); + s1[2] = vextq_s16(t1, t3, 2); + s1[3] = vextq_s16(t1, t3, 3); + + uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); + uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); + + store_u8_8x2(d, dst_stride, d0, d1); + + t0 = t2; + t1 = t3; + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 
2 * dst_stride; + h -= 2; + } while (h > 0); + } +} + +void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)x_step_q4; + (void)filter_y; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1); + + if (get_filter_taps_convolve8(filter_x) <= 4) { + convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w, + h); + } else { + convolve8_horiz_8tap_neon(src, src_stride, dst, dst_stride, filter_x, w, h); + } +} + void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -253,33 +379,33 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; if (w == 4) { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); - s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); - s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + int16x4_t s5 = 
vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); src += 7 * src_stride; do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -291,42 +417,40 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int height; - const uint8_t *s; - uint8_t *d; - do { + uint8x8_t t0, 
t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - height = h; - s = src + 7 * src_stride; - d = dst; + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int height = h; + const uint8_t *s = src + 7 * src_stride; + uint8_t *d = dst; do { load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -337,6 +461,7 @@ void aom_convolve8_vert_neon(const uint8_t *src, 
ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; d += 4 * dst_stride; height -= 4; diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c index c82125ba17..120c479798 100644 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -24,81 +24,72 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +// Filter values always sum to 128. +#define FILTER_WEIGHT 128 + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, - const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[2]; - int32x4_t sum; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); - sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1); - - /* Further narrowing and packing is performed by the caller. */ +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); + + // Further narrowing and packing is performed by the caller. 
return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, - const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); - sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0); - sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -108,10 +99,6 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -123,19 +110,17 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1); if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); - 
t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -145,23 +130,20 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h > 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { - width = w; - s = src; - d = dst; + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -177,83 +159,88 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, 
- int8x8_t a3, int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); + int8x8_t a3, int8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0]; + + *b = vreinterpretq_s8_s16(a0123); } static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b0, - int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 
27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); + int8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8x2_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)); + + *b0 = vreinterpretq_s8_s16(a0123.val[0]); + *b1 = vreinterpretq_s8_s16(a0123.val[1]); } -static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filter) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum; +static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. - /* Accumulate dot product into 'correction' to account for range clamp. 
*/ - sum = vdotq_lane_s32(correction, samples_lo, filter, 0); - sum = vdotq_lane_s32(sum, samples_hi, filter, 1); + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0); + sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, - const int8x16_t samples0_hi, - const int8x16_t samples1_lo, - const int8x16_t samples1_hi, - const int32x4_t correction, - const int8x8_t filter) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0); - sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0); - sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. 
+ int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -263,10 +250,7 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); int8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -279,62 +263,58 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // Clamp sample range to [-128, 127] for 8-bit signed dot product. 
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. int8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s4567, s5678, s6789, s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. 
samples_LUT.val[0] = s3456; samples_LUT.val[1] = s78910; s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - int16x4_t d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); - int16x4_t d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); - int16x4_t d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); - int16x4_t d3 = - convolve8_4_sdot_partial(s3456, s78910, correction, filter); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -345,8 +325,6 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - do { int height = h; const uint8_t *s = src; @@ -356,44 +334,38 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. 
*/ - int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. 
samples_LUT.val[0] = s3456_lo; samples_LUT.val[1] = s78910_lo; s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); @@ -406,19 +378,19 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - uint8x8_t d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, - s4567_hi, correction, filter); - uint8x8_t d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, - s5678_hi, correction, filter); - uint8x8_t d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, - s6789_hi, correction, filter); - uint8x8_t d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, - s78910_hi, correction, filter); + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c index df6e4d2ab5..68e031461d 100644 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -23,69 +23,60 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples, - const int8x8_t filter, - const uint8x16x2_t permute_tbl) { - uint8x16_t permuted_samples[2]; - int32x4_t sum; - - /* Permute samples ready for dot product. 
*/ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; - sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); - sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1); + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples, - const int8x8_t filter, - const uint8x16x3_t permute_tbl) { - uint8x16_t permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); - - /* First 4 output values. */ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); - sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1); - /* Second 4 output values. 
*/ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0); - sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; + + // First 4 output values. + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + // Narrow and re-pack. 
+ int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -95,7 +86,6 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - uint8x16_t s0, s1, s2, s3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -107,19 +97,17 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1); if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filter, perm_tbl); - t1 = convolve8_4_usdot(s1, filter, perm_tbl); - t2 = convolve8_4_usdot(s2, filter, perm_tbl); - t3 = convolve8_4_usdot(s3, filter, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -129,23 +117,20 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h > 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { - width = w; - s = src; - d = dst; + int 
width = w; + const uint8_t *s = src; + uint8_t *d = dst; do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filter, perm_tbl); - d1 = convolve8_8_usdot(s1, filter, perm_tbl); - d2 = convolve8_8_usdot(s2, filter, perm_tbl); - d3 = convolve8_8_usdot(s3, filter, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -162,79 +147,83 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
- */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b = vqtbl2q_u8(samples, permute_tbl); + uint8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0]; + + *b = vreinterpretq_u8_u16(a0123); } static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
- */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); + uint8x16_t *b0, uint8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8x2_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)); + + *b0 = vreinterpretq_u8_u16(a0123.val[0]); + *b1 = vreinterpretq_u8_u16(a0123.val[1]); } -static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, - const uint8x16_t samples_hi, - const int8x8_t filter) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum; - - sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0); - sum = vusdotq_lane_s32(sum, samples_hi, filter, 1); +static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. 
return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, - const uint8x16_t samples0_hi, - const uint8x16_t samples1_lo, - const uint8x16_t samples1_hi, - const int8x8_t filter) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; - - /* First 4 output values. */ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0); - sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0); - sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + + // First 4 output values. + int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. 
+ int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -244,7 +233,7 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); uint8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -257,47 +246,44 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. uint8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s4567, s5678, s6789, s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. 
*/ + // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456; samples_LUT.val[1] = s78910; s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - int16x4_t d0 = convolve8_4_usdot_partial(s0123, s4567, filter); - int16x4_t d1 = convolve8_4_usdot_partial(s1234, s5678, filter); - int16x4_t d2 = convolve8_4_usdot_partial(s2345, s6789, filter); - int16x4_t d3 = convolve8_4_usdot_partial(s3456, s78910, filter); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -308,8 +294,6 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - do { int height = h; const uint8_t *s = src; @@ -319,19 +303,14 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t s7, s8, s9, s10; @@ -339,10 +318,9 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. 
samples_LUT.val[0] = s3456_lo; samples_LUT.val[1] = s78910_lo; s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); @@ -355,19 +333,19 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - uint8x8_t d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, - s4567_hi, filter); - uint8x8_t d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, - s5678_hi, filter); - uint8x8_t d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, - s6789_hi, filter); - uint8x8_t d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, - s78910_hi, filter); + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c index 62729133e3..5758d2887f 100644 --- a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c +++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c @@ -16,36 +16,10 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/flow_estimation/arm/disflow_neon.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" -static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { - // Check that the fractional position is in range. - // - // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. - // Mathematically, this implies that 0 <= x < 1. However, in practice it is - // possible to have x == 1 due to floating point rounding. This is fine, - // and we still interpolate correctly if we allow x = 1. - assert(0 <= x && x <= 1); - - double x2 = x * x; - double x3 = x2 * x; - kernel[0] = -0.5 * x + x2 - 0.5 * x3; - kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; - kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; - kernel[3] = -0.5 * x2 + 0.5 * x3; -} - -static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { - double kernel_dbl[4]; - get_cubic_kernel_dbl(x, kernel_dbl); - - kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); - kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); - kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); - kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); -} - // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. 
// This function returns the sum of squared pixel differences between @@ -157,82 +131,6 @@ static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, } } -static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - - // Horizontal filter, using kernel {1, 0, -1}. - const uint8_t *src_start = src - 1 * src_stride - 1; - - for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { - uint8x16_t s = vld1q_u8(src_start + i * src_stride); - uint8x8_t s0 = vget_low_u8(s); - uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); - - // Given that the kernel is {1, 0, -1} the convolution is a simple - // subtraction. - int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2)); - - vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff); - } - - // Vertical filter, using kernel {1, 2, 1}. - // This kernel can be split into two 2-taps kernels of value {1, 1}. - // That way we need only 3 add operations to perform the convolution, one of - // which can be reused for the next line. - int16x8_t s0 = vld1q_s16(tmp); - int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE); - int16x8_t sum01 = vaddq_s16(s0, s1); - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE); - - int16x8_t sum12 = vaddq_s16(s1, s2); - int16x8_t sum = vaddq_s16(sum01, sum12); - - vst1q_s16(dst + i * dst_stride, sum); - - sum01 = sum12; - s1 = s2; - } -} - -static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - - // Horizontal filter, using kernel {1, 2, 1}. - // This kernel can be split into two 2-taps kernels of value {1, 1}. - // That way we need only 3 add operations to perform the convolution. 
- const uint8_t *src_start = src - 1 * src_stride - 1; - - for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { - uint8x16_t s = vld1q_u8(src_start + i * src_stride); - uint8x8_t s0 = vget_low_u8(s); - uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1)); - uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); - - uint16x8_t sum01 = vaddl_u8(s0, s1); - uint16x8_t sum12 = vaddl_u8(s1, s2); - uint16x8_t sum = vaddq_u16(sum01, sum12); - - vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum)); - } - - // Vertical filter, using kernel {1, 0, -1}. - // Load the whole block at once to avoid redundant loads during convolution. - int16x8_t t[10]; - load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4], - &t[5], &t[6], &t[7], &t[8], &t[9]); - - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - // Given that the kernel is {1, 0, -1} the convolution is a simple - // subtraction. - int16x8_t diff = vsubq_s16(t[i], t[i + 2]); - - vst1q_s16(dst + i * dst_stride, diff); - } -} - // Computes the components of the system of equations used to solve for // a flow vector. // diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.h b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.h new file mode 100644 index 0000000000..d991a13460 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ +#define AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ + +#include "aom_dsp/flow_estimation/disflow.h" + +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { + // Check that the fractional position is in range. + // + // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. + // Mathematically, this implies that 0 <= x < 1. However, in practice it is + // possible to have x == 1 due to floating point rounding. This is fine, + // and we still interpolate correctly if we allow x = 1. + assert(0 <= x && x <= 1); + + double x2 = x * x; + double x3 = x2 * x; + kernel[0] = -0.5 * x + x2 - 0.5 * x3; + kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; + kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; + kernel[3] = -0.5 * x2 + 0.5 * x3; +} + +static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { + double kernel_dbl[4]; + get_cubic_kernel_dbl(x, kernel_dbl); + + kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); + kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); + kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); + kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); +} + +static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + + // Horizontal filter, using kernel {1, 0, -1}. + const uint8_t *src_start = src - 1 * src_stride - 1; + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { + uint8x16_t s = vld1q_u8(src_start + i * src_stride); + uint8x8_t s0 = vget_low_u8(s); + uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); + + // Given that the kernel is {1, 0, -1} the convolution is a simple + // subtraction. 
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2)); + + vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff); + } + + // Vertical filter, using kernel {1, 2, 1}. + // This kernel can be split into two 2-taps kernels of value {1, 1}. + // That way we need only 3 add operations to perform the convolution, one of + // which can be reused for the next line. + int16x8_t s0 = vld1q_s16(tmp); + int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE); + int16x8_t sum01 = vaddq_s16(s0, s1); + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE); + + int16x8_t sum12 = vaddq_s16(s1, s2); + int16x8_t sum = vaddq_s16(sum01, sum12); + + vst1q_s16(dst + i * dst_stride, sum); + + sum01 = sum12; + s1 = s2; + } +} + +static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + + // Horizontal filter, using kernel {1, 2, 1}. + // This kernel can be split into two 2-taps kernels of value {1, 1}. + // That way we need only 3 add operations to perform the convolution. + const uint8_t *src_start = src - 1 * src_stride - 1; + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { + uint8x16_t s = vld1q_u8(src_start + i * src_stride); + uint8x8_t s0 = vget_low_u8(s); + uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1)); + uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); + + uint16x8_t sum01 = vaddl_u8(s0, s1); + uint16x8_t sum12 = vaddl_u8(s1, s2); + uint16x8_t sum = vaddq_u16(sum01, sum12); + + vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum)); + } + + // Vertical filter, using kernel {1, 0, -1}. + // Load the whole block at once to avoid redundant loads during convolution. 
+ int16x8_t t[10]; + load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4], + &t[5], &t[6], &t[7], &t[8], &t[9]); + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + // Given that the kernel is {1, 0, -1} the convolution is a simple + // subtraction. + int16x8_t diff = vsubq_s16(t[i], t[i + 2]); + + vst1q_s16(dst + i * dst_stride, diff); + } +} + +#endif // AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_sve.c b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_sve.c new file mode 100644 index 0000000000..7b01e90d12 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_sve.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/flow_estimation/disflow.h" + +#include +#include +#include + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/flow_estimation/arm/disflow_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. 
+static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, int16_t *dt) { + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + int h_kernel[4]; + int v_kernel[4]; + get_cubic_kernel_int(u_frac, h_kernel); + get_cubic_kernel_int(v_frac, v_kernel); + + int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). + const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution. 
+ const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1); + const int16x4_t h_kernel_s16 = vmovn_s32(vld1q_s32(h_kernel)); + const int16x8_t h_filter = vcombine_s16(h_kernel_s16, vdup_n_s16(0)); + const uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) { + svuint16_t r0 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 0); + svuint16_t r1 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 1); + svuint16_t r2 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 2); + svuint16_t r3 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 3); + + int16x8_t s0 = vreinterpretq_s16_u16(svget_neonq_u16(r0)); + int16x8_t s1 = vreinterpretq_s16_u16(svget_neonq_u16(r1)); + int16x8_t s2 = vreinterpretq_s16_u16(svget_neonq_u16(r2)); + int16x8_t s3 = vreinterpretq_s16_u16(svget_neonq_u16(r3)); + + int64x2_t sum04 = aom_svdot_lane_s16(vdupq_n_s64(0), s0, h_filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(vdupq_n_s64(0), s1, h_filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(vdupq_n_s64(0), s2, h_filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(vdupq_n_s64(0), s3, h_filter, 0); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + // 6 is the maximum allowable number of extra bits which will avoid + // the intermediate values overflowing an int16_t. The most extreme + // intermediate value occurs when: + // * The input pixels are [0, 255, 255, 0] + // * u_frac = 0.5 + // In this case, the un-scaled output is 255 * 1.125 = 286.875. + // As an integer with 6 fractional bits, that is 18360, which fits + // in an int16_t. But with 7 fractional bits it would be 36720, + // which is too large. + int16x8_t res = vcombine_s16(vrshrn_n_s32(res0, DISFLOW_INTERP_BITS - 6), + vrshrn_n_s32(res1, DISFLOW_INTERP_BITS - 6)); + + res = aom_tbl_s16(res, idx); + + vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, res); + } + + // Vertical convolution. 
+ int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel)); + int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { + int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE); + int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE); + int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE); + int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE); + + int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3); + + int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3); + + uint8x8_t s = vld1_u8(src + (i + y) * stride + x); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3)); + + // This time, we have to round off the 6 extra bits which were kept + // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits + // of precision to match the scale of the dx and dy arrays. + sum_lo = vrshrq_n_s32(sum_lo, + DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); + sum_hi = vrshrq_n_s32(sum_hi, + DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); + int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16)); + int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16)); + vst1q_s16(dt + i * DISFLOW_PATCH_SIZE, + vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi))); + } +} + +// Computes the components of the system of equations used to solve for +// a flow vector. 
+// +// The flow equations are a least-squares system, derived as follows: +// +// For each pixel in the patch, we calculate the current error `dt`, +// and the x and y gradients `dx` and `dy` of the source patch. +// This means that, to first order, the squared error for this pixel is +// +// (dt + u * dx + v * dy)^2 +// +// where (u, v) are the incremental changes to the flow vector. +// +// We then want to find the values of u and v which minimize the sum +// of the squared error across all pixels. Conveniently, this fits exactly +// into the form of a least squares problem, with one equation +// +// u * dx + v * dy = -dt +// +// for each pixel. +// +// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE, +// and absorbing the - sign elsewhere, this results in the least squares system +// +// M = |sum(dx * dx) sum(dx * dy)| +// |sum(dx * dy) sum(dy * dy)| +// +// b = |sum(dx * dt)| +// |sum(dy * dt)| +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M_inv) { + int64x2_t sum[3] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0) }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t x = vld1q_s16(dx + i * dx_stride); + int16x8_t y = vld1q_s16(dy + i * dy_stride); + + sum[0] = aom_sdotq_s16(sum[0], x, x); + sum[1] = aom_sdotq_s16(sum[1], x, y); + sum[2] = aom_sdotq_s16(sum[2], y, y); + } + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[1], sum[2]); + int32x4_t res = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. + // + // Setting the regularization strength k to 1 seems to work well here, as + // typical values coming from the other equations are very large (1e5 to + // 1e6, with an upper limit of around 6e7, at the time of writing). 
+ // It also preserves the property that all matrix values are whole numbers, + // which is convenient for integerized SIMD implementation. + + double M0 = (double)vgetq_lane_s32(res, 0) + 1; + double M1 = (double)vgetq_lane_s32(res, 1); + double M2 = (double)vgetq_lane_s32(res, 2); + double M3 = (double)vgetq_lane_s32(res, 3) + 1; + + // Invert matrix M. + double det = (M0 * M3) - (M1 * M2); + assert(det >= 1); + const double det_inv = 1 / det; + + M_inv[0] = M3 * det_inv; + M_inv[1] = -M1 * det_inv; + M_inv[2] = -M2 * det_inv; + M_inv[3] = M0 * det_inv; +} + +static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + const int16_t *dt, int dt_stride, + int *b) { + int64x2_t b_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t dx16 = vld1q_s16(dx + i * dx_stride); + int16x8_t dy16 = vld1q_s16(dy + i * dy_stride); + int16x8_t dt16 = vld1q_s16(dt + i * dt_stride); + + b_s64[0] = aom_sdotq_s16(b_s64[0], dx16, dt16); + b_s64[1] = aom_sdotq_s16(b_s64[1], dy16, dt16); + } + + b_s64[0] = vpaddq_s64(b_s64[0], b_s64[1]); + vst1_s32(b, vmovn_s64(b_s64[0])); +} + +void aom_compute_flow_at_point_sve(const uint8_t *src, const uint8_t *ref, + int x, int y, int width, int height, + int stride, double *u, double *v) { + double M_inv[4]; + int b[2]; + int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + + // Compute gradients within this patch + const uint8_t *src_patch = &src[y * stride + x]; + sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); + sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); + + compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M_inv); + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt); + compute_flow_vector(dx, DISFLOW_PATCH_SIZE, 
dy, DISFLOW_PATCH_SIZE, dt, + DISFLOW_PATCH_SIZE, b); + + // Solve flow equations to find a better estimate for the flow vector + // at this point + const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; + const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; + *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); + *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); + + if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { + // Stop iteration when we're close to convergence + break; + } + } +} diff --git a/third_party/aom/aom_dsp/pyramid.c b/third_party/aom/aom_dsp/pyramid.c index 5de001dbd5..05ddbb2f5f 100644 --- a/third_party/aom/aom_dsp/pyramid.c +++ b/third_party/aom/aom_dsp/pyramid.c @@ -305,6 +305,7 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, // Fill in the remaining levels through progressive downsampling for (int level = already_filled_levels; level < n_levels; ++level) { + bool mem_status = false; PyramidLayer *prev_layer = &frame_pyr->layers[level - 1]; uint8_t *prev_buffer = prev_layer->buffer; int prev_stride = prev_layer->stride; @@ -315,6 +316,11 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, int this_height = this_layer->height; int this_stride = this_layer->stride; + // The width and height of the previous layer that needs to be considered to + // derive the current layer frame. + const int input_layer_width = this_width << 1; + const int input_layer_height = this_height << 1; + // Compute the this pyramid level by downsampling the current level. // // We downsample by a factor of exactly 2, clipping the rightmost and @@ -329,13 +335,30 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, // 2) Up/downsampling by a factor of 2 can be implemented much more // efficiently than up/downsampling by a generic ratio. 
// TODO(rachelbarker): Use optimized downsample-by-2 function - if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1, - prev_stride, this_buffer, this_height, this_width, - this_stride)) { - // If we can't allocate memory, we'll have to terminate early + + // SIMD support has been added specifically for cases where the downsample + // factor is exactly 2. In such instances, horizontal and vertical resizing + // is performed utilizing the down2_symeven() function, which considers the + // even dimensions of the input layer. + if (should_resize_by_half(input_layer_height, input_layer_width, + this_height, this_width)) { + assert(input_layer_height % 2 == 0 && input_layer_width % 2 == 0 && + "Input width or height cannot be odd."); + mem_status = av1_resize_plane_to_half( + prev_buffer, input_layer_height, input_layer_width, prev_stride, + this_buffer, this_height, this_width, this_stride); + } else { + mem_status = av1_resize_plane(prev_buffer, input_layer_height, + input_layer_width, prev_stride, this_buffer, + this_height, this_width, this_stride); + } + + // Terminate early in cases of memory allocation failure. + if (!mem_status) { frame_pyr->filled_levels = n_levels; return -1; } + fill_border(this_buffer, this_width, this_height, this_stride); } diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index 74318de2e5..f9bc9ac733 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -46,7 +46,6 @@ static INLINE __m128i xx_loadu_128(const void *a) { return _mm_loadu_si128((const __m128i *)a); } - // _mm_loadu_si64 has been introduced in GCC 9, reimplement the function // manually on older compilers. 
#if !defined(__clang__) && __GNUC_MAJOR__ < 9 diff --git a/third_party/aom/aom_util/aom_pthread.h b/third_party/aom/aom_util/aom_pthread.h index 99deeb292a..e755487ae3 100644 --- a/third_party/aom/aom_util/aom_pthread.h +++ b/third_party/aom/aom_util/aom_pthread.h @@ -28,6 +28,7 @@ extern "C" { #define NOMINMAX #undef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#include // NOLINT #include // NOLINT #include // NOLINT #include // NOLINT diff --git a/third_party/aom/aom_util/aom_thread.h b/third_party/aom/aom_util/aom_thread.h index 92e162f121..80ed314752 100644 --- a/third_party/aom/aom_util/aom_thread.h +++ b/third_party/aom/aom_util/aom_thread.h @@ -21,8 +21,6 @@ extern "C" { #endif -#define MAX_NUM_THREADS 64 - // State of the worker thread object typedef enum { AVX_WORKER_STATUS_NOT_OK = 0, // object is unusable diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake index 32645f6065..b6cf974aa7 100644 --- a/third_party/aom/av1/av1.cmake +++ b/third_party/aom/av1/av1.cmake @@ -302,6 +302,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c" "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c" + "${AOM_ROOT}/av1/common/x86/resize_avx2.c" "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c" "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c" "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") @@ -375,6 +376,7 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD list(APPEND AOM_AV1_ENCODER_INTRIN_SVE "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c" + "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_sve.c" "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_sve.c") list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c index 2b6b1504e6..39c03c9ecb 100644 --- a/third_party/aom/av1/av1_cx_iface.c +++ b/third_party/aom/av1/av1_cx_iface.c @@ -32,6 +32,7 @@ #include "av1/common/enums.h" #include 
"av1/common/scale.h" #include "av1/encoder/bitstream.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/encoder_utils.h" diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c index 3aeffbb0e6..40befdf44e 100644 --- a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c +++ b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c @@ -80,17 +80,15 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; - const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2)); // Dot product constants and other shims. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - // Fold horiz_const into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) - const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const + - (1 << ((ROUND0_BITS - 1) - 1))); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Halve the total because we will halve the filter values. 
+ const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; @@ -334,15 +332,14 @@ static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; @@ -455,15 +452,14 @@ static INLINE void dist_wtd_convolve_x_avg_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) 
+ // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; @@ -574,15 +570,14 @@ static INLINE void dist_wtd_convolve_x_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values.
int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; diff --git a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c index c29229eb09..132da2442b 100644 --- a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c +++ b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c @@ -102,14 +102,12 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( const int8x16_t filter = vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); - const int32_t correction_s32 = - vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)), - vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS)))); - // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right - // shift by FILTER_BITS - instead of a first rounding right shift by + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - // ROUND0_BITS. - int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1))); + int32x4_t correction = + vdupq_n_s32((128 << FILTER_BITS) + (1 << (ROUND0_BITS - 1))); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); @@ -274,16 +272,13 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, } const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - // Dot product constants. 
- const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - // This shim of (1 << ((ROUND0_BITS - 1) - 1) enables us to use a single - // rounding right shift by FILTER_BITS - instead of a first rounding right - // shift by ROUND0_BITS, followed by second rounding right shift by - // FILTER_BITS - ROUND0_BITS. - // The outermost -1 is needed because we will halve the filter values. + // Dot product constants: + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we will halve the filter values. const int32x4_t correction = - vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); if (w <= 4) { @@ -465,16 +460,13 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), vmovn_s16(x_filter_s16.val[1])); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // - which are generally faster than rounding shifts on modern CPUs. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Dot product constants. 
- const int32x4_t correct_tmp = - vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)), - vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7))); const int32x4_t correction = - vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const); + vdupq_n_s32((128 << FILTER_BITS) + horiz_const); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); @@ -621,16 +613,15 @@ static INLINE void convolve_2d_sr_horiz_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { const int bd = 8; - // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. - // The outermost -1 is needed because we halved the filter values. - const int32_t horiz_const = - ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); // Dot product constants. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const); + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Halve the total because we will halve the filter values. 
+ const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl index c0831330d1..6a0043c761 100644 --- a/third_party/aom/av1/common/av1_rtcd_defs.pl +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -458,7 +458,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats"; - specialize qw/av1_compute_stats sse4_1 avx2 neon/; + specialize qw/av1_compute_stats sse4_1 avx2 neon sve/; add_proto qw/void av1_calc_proj_params/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params"; specialize qw/av1_calc_proj_params sse4_1 avx2 neon/; add_proto qw/int64_t av1_lowbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; @@ -469,7 +469,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2 neon/; add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/; - add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const 
uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; + add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/; } } @@ -554,6 +554,9 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/; } +add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col"; +specialize qw/resize_vert_dir avx2/; + add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/; diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c index 441323ab1f..2b48b9fff4 100644 --- a/third_party/aom/av1/common/resize.c +++ b/third_party/aom/av1/common/resize.c @@ -18,6 +18,7 @@ #include #include "config/aom_config.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" @@ -216,10 +217,6 @@ const int16_t av1_resize_filter_normative[( // Filters for interpolation (full-band) - no filtering for integer pixels #define filteredinterp_filters1000 av1_resize_filter_normative -// Filters for factor of 2 downsampling. 
-static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; -static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; - static const InterpKernel *choose_interp_filter(int in_length, int out_length) { int out_length16 = out_length * 16; if (out_length16 >= in_length * 16) @@ -524,6 +521,59 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { } } +bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int width2, int start_col) { + bool mem_status = true; + uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height); + uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2); + if (arrbuf == NULL || arrbuf2 == NULL) { + mem_status = false; + goto Error; + } + + for (int i = start_col; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + down2_symeven(arrbuf, height, arrbuf2); + fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(arrbuf); + aom_free(arrbuf2); + return mem_status; +} + +void resize_horz_dir(const uint8_t *const input, int in_stride, uint8_t *intbuf, + int height, int filtered_length, int width2) { + for (int i = 0; i < height; ++i) + down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i); +} + +bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride) { + uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(*intbuf) * width2 * height); + if (intbuf == NULL) { + return false; + } + + // Resize in the horizontal direction + resize_horz_dir(input, in_stride, intbuf, height, width, width2); + // Resize in the vertical direction + bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2, + width2, 0 /*start_col*/); + aom_free(intbuf); + return mem_status; +} + +// Check if both the output width and height are half of input width and +// height 
respectively. +bool should_resize_by_half(int height, int width, int height2, int width2) { + const bool is_width_by_2 = get_down2_length(width, 1) == width2; + const bool is_height_by_2 = get_down2_length(height, 1) == height2; + return (is_width_by_2 && is_height_by_2); +} + bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h index d573a538bf..de71f5d539 100644 --- a/third_party/aom/av1/common/resize.h +++ b/third_party/aom/av1/common/resize.h @@ -20,6 +20,10 @@ extern "C" { #endif +// Filters for factor of 2 downsampling. +static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; +static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; + bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); @@ -93,6 +97,12 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom); void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, bool alloc_pyramid); +bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride); + +bool should_resize_by_half(int height, int width, int height2, int width2); + // Returns 1 if a superres upscaled frame is scaled and 0 otherwise. static INLINE int av1_superres_scaled(const AV1_COMMON *cm) { // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling diff --git a/third_party/aom/av1/common/x86/resize_avx2.c b/third_party/aom/av1/common/x86/resize_avx2.c new file mode 100644 index 0000000000..c44edb88d9 --- /dev/null +++ b/third_party/aom/av1/common/x86/resize_avx2.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/resize.h" + +#include "aom_dsp/x86/synonyms.h" + +#define CAST_HI(x) _mm256_castsi128_si256(x) +#define CAST_LOW(x) _mm256_castsi256_si128(x) + +#define PROCESS_RESIZE_Y_WD16 \ + const int idx1 = AOMMIN(height - 1, i + 5); \ + const int idx2 = AOMMIN(height - 1, i + 6); \ + l6 = l10; \ + l7 = l11; \ + l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride)); \ + l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride)); \ + \ + /* g0... g15 | i0... i15 */ \ + const __m256i s68 = \ + _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20); \ + /* h0... h15 | j0... j15 */ \ + const __m256i s79 = \ + _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20); \ + \ + /* g0h0... g7g7 | i0j0... i7j */ \ + s[3] = _mm256_unpacklo_epi8(s68, s79); \ + /* g8h8... g15g15 | i8j8... i15j15 */ \ + s[8] = _mm256_unpackhi_epi8(s68, s79); \ + \ + __m256i res_out[2] = { 0 }; \ + resize_y_convolve(s, coeffs_y, res_out); \ + \ + /* r00... r07 */ \ + __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ + /* r20... r27 */ \ + __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ + \ + res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ + res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ + \ + __m256i res_out_b[2] = { 0 }; \ + resize_y_convolve(s + 5, coeffs_y, res_out_b); \ + \ + /* r08... 
r015 */ \ + __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \ + /* r28... r215 */ \ + __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \ + res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits); \ + res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits); \ + \ + /* r00... r03 r20... r23 | r04... r07 r24... r27 */ \ + __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ + /* r08... r012 r28... r212 | r013... r015 r213... r215 */ \ + __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2); \ + /* r00... r07 | r20... r27 */ \ + res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8); \ + /* r08... r015 | r28... r215 */ \ + res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8); \ + /* r00... r015 | r20... r215 */ \ + res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1); \ + res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel); \ + res_8bit0 = _mm256_max_epu8(res_8bit0, zero); + +#define PROCESS_RESIZE_Y_WD8 \ + const int idx1 = AOMMIN(height - 1, i + 5); \ + const int idx2 = AOMMIN(height - 1, i + 6); \ + l6 = l10; \ + l7 = l11; \ + l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride)); \ + l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride)); \ + \ + /* g0h0... g7h7 */ \ + s67 = _mm_unpacklo_epi8(l6, l7); \ + /* i0j0...i7j7 */ \ + __m128i s89 = _mm_unpacklo_epi8(l8, l9); \ + \ + /* g0h0...g7g7 | i0j0...i7j7 */ \ + s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \ + \ + __m256i res_out[2] = { 0 }; \ + resize_y_convolve(s, coeffs_y, res_out); \ + \ + /* r00... 
r07 */ \ + __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ + /* r20...r27 */ \ + __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ + res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ + res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ + \ + /* r00...r03 r20...r23 | r04...r07 r24...r27 */ \ + res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ + /* r00...r07 | r20...r27 */ \ + res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8); \ + res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1); \ + res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \ + res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero); + +static INLINE void resize_y_convolve(const __m256i *const s, + const __m256i *const coeffs, + __m256i *res_out) { + const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]); + const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]); + + const __m256i dst_0 = _mm256_add_epi16(res_0, res_1); + const __m256i dst_1 = _mm256_add_epi16(res_2, res_3); + // The sum of convolve operation crosses signed 16bit. Hence, the addition + // should happen in 32bit. 
+ const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0)); + const __m256i dst_01 = + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1)); + const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1)); + const __m256i dst_11 = + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1)); + + res_out[0] = _mm256_add_epi32(dst_00, dst_10); + res_out[1] = _mm256_add_epi32(dst_01, dst_11); +} + +static INLINE void prepare_filter_coeffs(const int16_t *filter, + __m256i *const coeffs /* [4] */) { + // f0 f1 f2 f3 x x x x + const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter); + // f0 f1 f2 f3 f0 f1 f2 f3 + const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44); + // f0 f1 f2 f3 f1 f0 f3 f2 + const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1); + + const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1); + + // f0 f1 f0 f1 .. + coeffs[2] = _mm256_broadcastw_epi16(filter_8bit); + // f2 f3 f2 f3 .. + coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2)); + // f3 f2 f3 f2 .. + coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6)); + // f1 f0 f1 f0 .. + coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4)); +} + +bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int stride, int start_col) { + assert(start_col <= stride); + // For the GM tool, the input layer height or width is assured to be an even + // number. Hence the function 'down2_symodd()' is not invoked and SIMD + // optimization of the same is not implemented. + // When the input height is less than 8 and even, the potential input + // heights are limited to 2, 4, or 6. These scenarios require separate + // handling due to padding requirements. Invoking the C function here will + // eliminate the need for conditional statements within the subsequent SIMD + // code to manage these cases.
+ if (height & 1 || height < 8) { + return resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, start_col); + } + + __m256i s[10], coeffs_y[4]; + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const uint8_t max_pixel = 255; + const __m256i clip_pixel = _mm256_set1_epi8(max_pixel); + const __m256i zero = _mm256_setzero_si256(); + + prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); + + const int num_col16 = stride / 16; + int remain_col = stride % 16; + // The core vertical SIMD processes 4 input rows simultaneously to generate + // output corresponding to 2 rows. To streamline the core loop and eliminate + // the need for conditional checks, the remaining rows (4 or 6) are processed + // separately. + const int remain_row = (height % 4 == 0) ? 4 : 6; + + for (int j = start_col; j < stride - remain_col; j += 16) { + const uint8_t *data = &intbuf[j]; + const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with the last available row at the top. 
+ const __m128i l0 = l3; + const __m128i l1 = l3; + const __m128i l2 = l3; + const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride)); + + __m128i l6, l7, l8, l9; + __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride)); + __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride)); + __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride)); + + // a0...a15 | c0...c15 + const __m256i s02 = + _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20); + // b0...b15 | d0...d15 + const __m256i s13 = + _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20); + // c0...c15 | e0...e15 + const __m256i s24 = + _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20); + // d0...d15 | f0...f15 + const __m256i s35 = + _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20); + // e0...e15 | g0...g15 + const __m256i s46 = + _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20); + // f0...f15 | h0...h15 + const __m256i s57 = + _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20); + + // a0b0...a7b7 | c0d0...c7d7 + s[0] = _mm256_unpacklo_epi8(s02, s13); + // c0d0...c7d7 | e0f0...e7f7 + s[1] = _mm256_unpacklo_epi8(s24, s35); + // e0f0...e7f7 | g0h0...g7h7 + s[2] = _mm256_unpacklo_epi8(s46, s57); + + // a8b8...a15b15 | c8d8...c15d15 + s[5] = _mm256_unpackhi_epi8(s02, s13); + // c8d8...c15d15 | e8f8...e15f15 + s[6] = _mm256_unpackhi_epi8(s24, s35); + // e8f8...e15f15 | g8h8...g15h15 + s[7] = _mm256_unpackhi_epi8(s46, s57); + + // height to be processed here + const int process_ht = height - remain_row; + for (int i = 0; i < process_ht; i += 4) { + PROCESS_RESIZE_Y_WD16 + + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + CAST_LOW(res_8bit0)); + + _mm_storeu_si128( + (__m128i *)&output[(i / 2) * out_stride + j + out_stride], + _mm256_extracti128_si256(res_8bit0, 1)); + + // Load the required data for processing of next 4 input rows. 
+ const int idx7 = AOMMIN(height - 1, i + 7); + const int idx8 = AOMMIN(height - 1, i + 8); + l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride)); + l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride)); + + const __m256i s810 = + _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); + const __m256i s911 = + _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_unpacklo_epi8(s810, s911); + // i8j8... i15j15 | k8l8... k15l15 + s[9] = _mm256_unpackhi_epi8(s810, s911); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + } + + // Process the remaining last 4 or 6 rows here. + int i = process_ht; + while (i < height - 1) { + PROCESS_RESIZE_Y_WD16 + + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + CAST_LOW(res_8bit0)); + i += 2; + + const int is_store_valid = (i < height - 1); + if (is_store_valid) + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + _mm256_extracti128_si256(res_8bit0, 1)); + i += 2; + + // Check if there is any remaining height to process. If so, perform the + // necessary data loading for processing the next row. + if (i < height - 1) { + l10 = l11 = l9; + const __m256i s810 = + _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); + const __m256i s911 = + _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_unpacklo_epi8(s810, s911); + // i8j8... i15j15 | k8l8... k15l15 + s[9] = _mm256_unpackhi_epi8(s810, s911); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + } + } + } + + if (remain_col > 7) { + const int processed_wd = num_col16 * 16; + remain_col = stride % 8; + + const uint8_t *data = &intbuf[processed_wd]; + + const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with available top-most row. 
+ const __m128i l0 = l3; + const __m128i l1 = l3; + const __m128i l2 = l3; + const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride)); + + __m128i l6, l7, l8, l9; + __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride)); + __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride)); + __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride)); + + // a0b0...a7b7 + const __m128i s01 = _mm_unpacklo_epi8(l0, l1); + // c0d0...c7d7 + const __m128i s23 = _mm_unpacklo_epi8(l2, l3); + // e0f0...e7f7 + const __m128i s45 = _mm_unpacklo_epi8(l4, l5); + // g0h0...g7h7 + __m128i s67 = _mm_unpacklo_epi8(l10, l11); + + // a0b0...a7b7 | c0d0...c7d7 + s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20); + // c0d0...c7d7 | e0f0...e7f7 + s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20); + // e0f0...e7f7 | g0h0...g7h7 + s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20); + + // height to be processed here + const int process_ht = height - remain_row; + for (int i = 0; i < process_ht; i += 4) { + PROCESS_RESIZE_Y_WD8 + + _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], + CAST_LOW(res_a_round_1)); + + _mm_storel_epi64( + (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride], + _mm256_extracti128_si256(res_a_round_1, 1)); + + const int idx7 = AOMMIN(height - 1, i + 7); + const int idx8 = AOMMIN(height - 1, i + 8); + l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride)); + l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride)); + + // k0l0... k7l7 + const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + } + + // Process the remaining last 4 or 6 rows here. 
+ int i = process_ht; + while (i < height - 1) { + PROCESS_RESIZE_Y_WD8 + + _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], + CAST_LOW(res_a_round_1)); + + i += 2; + + const int is_store_valid = (i < height - 1); + if (is_store_valid) + _mm_storel_epi64( + (__m128i *)&output[(i / 2) * out_stride + processed_wd], + _mm256_extracti128_si256(res_a_round_1, 1)); + i += 2; + + // Check rows are still remaining for processing. If yes do the required + // load of data for the next iteration. + if (i < height - 1) { + l10 = l11 = l9; + // k0l0... k7l7 + const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + } + } + } + + if (remain_col) + return resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, stride - remain_col); + + return true; +} diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c index 47b5f5cfb7..8b0d3bcc7e 100644 --- a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c +++ b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c @@ -1008,10 +1008,13 @@ static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride, } void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { + (void)dgd_avg; + (void)src_avg; assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED); const int wiener_halfwin = wiener_win >> 1; diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_sve.c b/third_party/aom/av1/encoder/arm/neon/pickrst_sve.c new file mode 100644 index 0000000000..a519ecc5f5 --- /dev/null +++ 
b/third_party/aom/av1/encoder/arm/neon/pickrst_sve.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +static INLINE uint8_t find_average_sve(const uint8_t *src, int src_stride, + int width, int height) { + uint32x4_t avg_u32 = vdupq_n_u32(0); + uint8x16_t ones = vdupq_n_u8(1); + + // Use a predicate to compute the last columns. + svbool_t pattern = svwhilelt_b8_u32(0, width % 16); + + int h = height; + do { + int j = width; + const uint8_t *src_ptr = src; + while (j >= 16) { + uint8x16_t s = vld1q_u8(src_ptr); + avg_u32 = vdotq_u32(avg_u32, s, ones); + + j -= 16; + src_ptr += 16; + } + uint8x16_t s_end = svget_neonq_u8(svld1_u8(pattern, src_ptr)); + avg_u32 = vdotq_u32(avg_u32, s_end, ones); + + src += src_stride; + } while (--h != 0); + return (uint8_t)(vaddlvq_u32(avg_u32) / (width * height)); +} + +static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, + int16_t *buf_avg, int buf_avg_stride, + int width, int height, + int downsample_factor) { + uint8x8_t avg_u8 = vdup_n_u8(avg); + + // Use a predicate to compute the last columns. 
+ svbool_t pattern = svwhilelt_b8_u32(0, width % 8); + + uint8x8_t avg_end = vget_low_u8(svget_neonq_u8(svdup_n_u8_z(pattern, avg))); + + do { + int j = width; + const uint8_t *buf_ptr = buf; + int16_t *buf_avg_ptr = buf_avg; + while (j >= 8) { + uint8x8_t d = vld1_u8(buf_ptr); + vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8))); + + j -= 8; + buf_ptr += 8; + buf_avg_ptr += 8; + } + uint8x8_t d_end = vget_low_u8(svget_neonq_u8(svld1_u8(pattern, buf_ptr))); + vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d_end, avg_end))); + + buf += buf_stride; + buf_avg += buf_avg_stride; + height -= downsample_factor; + } while (height > 0); +} + +static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp, + const int wiener_win2, const int scale) { + for (int i = 0; i < wiener_win2 - 2; i = i + 2) { + // Transpose the first 2x2 square. It needs a special case as the element + // of the bottom left is on the diagonal. + int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1); + int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1); + + int64x2_t tr_row = aom_vtrn2q_s64(row0, row1); + + vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0)); + vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row); + + // Transpose and store all the remaining 2x2 squares of the line. + for (int j = i + 3; j < wiener_win2; j = j + 2) { + row0 = vld1q_s64(H_tmp + i * wiener_win2 + j); + row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j); + + int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1); + int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1); + + vst1q_s64(H_tmp + j * wiener_win2 + i, tr_row0); + vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1); + } + } + for (int i = 0; i < wiener_win2 * wiener_win2; i++) { + H[i] += H_tmp[i] * scale; + } +} + +// Transpose the matrix that has just been computed and accumulate it in M. 
+static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn, + const int wiener_win, int scale) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *M++ += (int64_t)(M_trn[tr_idx] * scale); + } + } +} + +// Swap each half of the dgd vectors so that we can accumulate the result of +// the dot-products directly in the destination matrix. +static INLINE int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { + int16x8_t dgd_trn0 = vreinterpretq_s16_s64( + vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); + int16x8_t dgd_trn1 = vreinterpretq_s16_s64( + vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); + + return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 }; +} + +static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], + int64_t *M, int row) { + const int wiener_win = 5; + + int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); + int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); + + int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); + cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 0, cross_corr01); + + int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); + + int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); + cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 2, cross_corr23); + + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]); + M[row * wiener_win + 4] += vaddvq_s64(m4); +} + +static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], + int64_t *M, int row) { + const int wiener_win = 7; + + int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); + int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); + + int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); + cross_corr01 = 
aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 0, cross_corr01); + + int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); + + int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); + cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 2, cross_corr23); + + int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4); + int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]); + + int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0); + cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 4, cross_corr45); + + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]); + M[row * wiener_win + 6] += vaddvq_s64(m6); +} + +static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, + const int wiener_win, + const int wiener_win2) { + for (int row0 = 0; row0 < wiener_win; row0++) { + for (int row1 = row0; row1 < wiener_win; row1++) { + int auto_cov_idx = + (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; + + int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]); + H[auto_cov_idx] += vaddvq_s64(auto_cov); + } + } +} + +static INLINE void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1, + int row0, int row1, int64_t *H) { + for (int col0 = 0; col0 < 5; col0++) { + int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5); + + int64x2_t h01 = vld1q_s64(H + auto_cov_idx); + int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); + + int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); + auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx, auto_cov01); + + int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); + + int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], 
dgd0[col0], 0); + auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 2, auto_cov23); + + int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]); + H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4); + } +} + +static INLINE void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, + int row0, int row1, int64_t *H) { + for (int col0 = 0; col0 < 7; col0++) { + int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7); + + int64x2_t h01 = vld1q_s64(H + auto_cov_idx); + int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); + + int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); + auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx, auto_cov01); + + int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); + + int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); + auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 2, auto_cov23); + + int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4); + int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]); + + int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0); + auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 4, auto_cov45); + + int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]); + H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6); + } +} + +// This function computes two matrices: the cross-correlation between the src +// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). +// +// M is of size 7 * 7. It needs to be filled such that multiplying one element +// from src with each element of a row of the wiener window will fill one +// column of M. 
However this is not very convenient in terms of memory +// accesses, as it means we do contiguous loads of dgd but strided stores to M. +// As a result, we use an intermediate matrix M_trn which is instead filled +// such that one row of the wiener window gives one row of M_trn. Once fully +// computed, M_trn is then transposed to return M. +// +// H is of size 49 * 49. It is filled by multiplying every pair of elements of +// the wiener window together. Since it is a symmetric matrix, we only compute +// the upper triangle, and then copy it down to the lower one. Here we fill it +// by taking each different pair of columns, and multiplying all the elements of +// the first one with all the elements of the second one, with a special case +// when multiplying a column by itself. +static INLINE void compute_stats_win7_sve(int16_t *dgd_avg, int dgd_avg_stride, + int16_t *src_avg, int src_avg_stride, + int width, int height, int64_t *M, + int64_t *H, int downsample_factor) { + const int wiener_win = 7; + const int wiener_win2 = wiener_win * wiener_win; + + // Use a predicate to compute the last columns of the block for H. + svbool_t pattern = svwhilelt_b16_u32(0, width % 8); + + // Use intermediate matrices for H and M to perform the computation, they + // will be accumulated into the original H and M at the end. + int64_t M_trn[49]; + memset(M_trn, 0, sizeof(M_trn)); + + int64_t H_tmp[49 * 49]; + memset(H_tmp, 0, sizeof(H_tmp)); + + do { + // Cross-correlation (M). + for (int row = 0; row < wiener_win; row++) { + int j = 0; + while (j < width) { + int16x8_t dgd[7]; + load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1], + &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]); + int16x8_t s = vld1q_s16(src_avg + j); + + // Compute all the elements of one row of M. + compute_M_one_row_win7(s, dgd, M_trn, row); + + j += 8; + } + } + + // Auto-covariance (H). 
+ int j = 0; + while (j <= width - 8) { + for (int col0 = 0; col0 < wiener_win; col0++) { + int16x8_t dgd0[7]; + load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1], + &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]); + + // Perform computation of the first column with itself (28 elements). + // For the first column this will fill the upper triangle of the 7x7 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 7x7 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[7]; + load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]); + + // Compute all elements from the combination of both columns (49 + // elements). + compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp); + } + } + j += 8; + } + + if (j < width) { + // Process remaining columns using a predicate to discard excess elements. + for (int col0 = 0; col0 < wiener_win; col0++) { + // Load first column. 
+ int16x8_t dgd0[7]; + dgd0[0] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0)); + dgd0[1] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0)); + dgd0[2] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0)); + dgd0[3] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0)); + dgd0[4] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0)); + dgd0[5] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0)); + dgd0[6] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0)); + + // Perform computation of the first column with itself (28 elements). + // For the first column this will fill the upper triangle of the 7x7 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 7x7 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[7]; + load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]); + + // Compute all elements from the combination of both columns (49 + // elements). + compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp); + } + } + } + dgd_avg += downsample_factor * dgd_avg_stride; + src_avg += src_avg_stride; + } while (--height != 0); + + // Transpose M_trn. + acc_transpose_M(M, M_trn, 7, downsample_factor); + + // Copy upper triangle of H in the lower one. + copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor); +} + +// This function computes two matrices: the cross-correlation between the src +// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). 
+// +// M is of size 5 * 5. It needs to be filled such that multiplying one element +// from src with each element of a row of the wiener window will fill one +// column of M. However this is not very convenient in terms of memory +// accesses, as it means we do contiguous loads of dgd but strided stores to M. +// As a result, we use an intermediate matrix M_trn which is instead filled +// such that one row of the wiener window gives one row of M_trn. Once fully +// computed, M_trn is then transposed to return M. +// +// H is of size 25 * 25. It is filled by multiplying every pair of elements of +// the wiener window together. Since it is a symmetric matrix, we only compute +// the upper triangle, and then copy it down to the lower one. Here we fill it +// by taking each different pair of columns, and multiplying all the elements of +// the first one with all the elements of the second one, with a special case +// when multiplying a column by itself. +static INLINE void compute_stats_win5_sve(int16_t *dgd_avg, int dgd_avg_stride, + int16_t *src_avg, int src_avg_stride, + int width, int height, int64_t *M, + int64_t *H, int downsample_factor) { + const int wiener_win = 5; + const int wiener_win2 = wiener_win * wiener_win; + + // Use a predicate to compute the last columns of the block for H. + svbool_t pattern = svwhilelt_b16_u32(0, width % 8); + + // Use intermediate matrices for H and M to perform the computation, they + // will be accumulated into the original H and M at the end. + int64_t M_trn[25]; + memset(M_trn, 0, sizeof(M_trn)); + + int64_t H_tmp[25 * 25]; + memset(H_tmp, 0, sizeof(H_tmp)); + + do { + // Cross-correlation (M). + for (int row = 0; row < wiener_win; row++) { + int j = 0; + while (j < width) { + int16x8_t dgd[5]; + load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1], + &dgd[2], &dgd[3], &dgd[4]); + int16x8_t s = vld1q_s16(src_avg + j); + + // Compute all the elements of one row of M. 
+ compute_M_one_row_win5(s, dgd, M_trn, row); + + j += 8; + } + } + + // Auto-covariance (H). + int j = 0; + while (j <= width - 8) { + for (int col0 = 0; col0 < wiener_win; col0++) { + // Load first column. + int16x8_t dgd0[5]; + load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1], + &dgd0[2], &dgd0[3], &dgd0[4]); + + // Perform computation of the first column with itself (15 elements). + // For the first column this will fill the upper triangle of the 5x5 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 5x5 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[5]; + load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4]); + + // Compute all elements from the combination of both columns (25 + // elements). + compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp); + } + } + j += 8; + } + + // Process remaining columns using a predicate to discard excess elements. + if (j < width) { + for (int col0 = 0; col0 < wiener_win; col0++) { + int16x8_t dgd0[5]; + dgd0[0] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0)); + dgd0[1] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0)); + dgd0[2] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0)); + dgd0[3] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0)); + dgd0[4] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0)); + + // Perform computation of the first column with itself (15 elements). 
+ // For the first column this will fill the upper triangle of the 5x5 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 5x5 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[5]; + load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4]); + + // Compute all elements from the combination of both columns (25 + // elements). + compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp); + } + } + } + dgd_avg += downsample_factor * dgd_avg_stride; + src_avg += src_avg_stride; + } while (--height != 0); + + // Transpose M_trn. + acc_transpose_M(M, M_trn, 5, downsample_factor); + + // Copy upper triangle of H in the lower one. + copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor); +} + +void av1_compute_stats_sve(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA); + + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = wiener_win >> 1; + const int32_t width = h_end - h_start; + const int32_t height = v_end - v_start; + const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start]; + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + memset(M, 0, sizeof(*M) * wiener_win * wiener_win); + + const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height); + const int downsample_factor = + use_downsampled_wiener_stats ? 
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + + // dgd_avg and src_avg have been memset to zero before calling this + // function, so round up the stride to the next multiple of 8 so that we + // don't have to worry about a tail loop when computing M. + const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8; + const int src_avg_stride = (width & ~7) + 8; + + // Compute (dgd - avg) and store it in dgd_avg. + // The wiener window will slide along the dgd frame, centered on each pixel. + // For the top left pixel and all the pixels on the side of the frame this + // means half of the window will be outside of the frame. As such the actual + // buffer that we need to subtract the avg from will be 2 * wiener_halfwin + // wider and 2 * wiener_halfwin higher than the original dgd buffer. + const int vert_offset = v_start - wiener_halfwin; + const int horiz_offset = h_start - wiener_halfwin; + const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; + compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride, + width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1); + + // Compute (src - avg), downsample if necessary and store in src-avg. + const uint8_t *src_start = src + h_start + v_start * src_stride; + compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg, + src_avg_stride, width, height, downsample_factor); + + const int downsample_height = height / downsample_factor; + + // Since the height is not necessarily a multiple of the downsample factor, + // the last line of src will be scaled according to how many rows remain. 
+ const int downsample_remainder = height % downsample_factor; + + if (wiener_win == WIENER_WIN) { + compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, + width, downsample_height, M, H, downsample_factor); + } else { + compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, + width, downsample_height, M, H, downsample_factor); + } + + if (downsample_remainder > 0) { + const int remainder_offset = height - downsample_remainder; + if (wiener_win == WIENER_WIN) { + compute_stats_win7_sve( + dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride, + src_avg + downsample_height * src_avg_stride, src_avg_stride, width, + 1, M, H, downsample_remainder); + } else { + compute_stats_win5_sve( + dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride, + src_avg + downsample_height * src_avg_stride, src_avg_stride, width, + 1, M, H, downsample_remainder); + } + } +} diff --git a/third_party/aom/av1/encoder/enc_enums.h b/third_party/aom/av1/encoder/enc_enums.h index 20cefa16a5..0a8b0f258a 100644 --- a/third_party/aom/av1/encoder/enc_enums.h +++ b/third_party/aom/av1/encoder/enc_enums.h @@ -12,10 +12,14 @@ #ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_ #define AOM_AV1_ENCODER_ENC_ENUMS_H_ +#include "aom_ports/mem.h" + #ifdef __cplusplus extern "C" { #endif +#define MAX_NUM_THREADS 64 + // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code. 
enum { diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c index a9214f77c2..07382eb6cc 100644 --- a/third_party/aom/av1/encoder/encodeframe.c +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -537,7 +537,9 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, // Set the partition if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || (sf->rt_sf.use_fast_fixed_part && x->sb_force_fixed_part == 1 && - !frame_is_intra_only(cm))) { + (!frame_is_intra_only(cm) && + (!cpi->ppi->use_svc || + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)))) { // set a fixed-size partition av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size; diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h index 4de5d426ce..a919bd906a 100644 --- a/third_party/aom/av1/encoder/encoder.h +++ b/third_party/aom/av1/encoder/encoder.h @@ -37,6 +37,7 @@ #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" @@ -74,7 +75,6 @@ #endif #include "aom/internal/aom_codec_internal.h" -#include "aom_util/aom_thread.h" #ifdef __cplusplus extern "C" { diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c index 755535ba51..1d0092a5ed 100644 --- a/third_party/aom/av1/encoder/ethread.c +++ b/third_party/aom/av1/encoder/ethread.c @@ -19,6 +19,7 @@ #include "av1/encoder/allintra_vis.h" #include "av1/encoder/bitstream.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encoder.h" @@ -2520,7 +2521,7 @@ void av1_tf_do_filtering_mt(AV1_COMP *cpi) { static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, 
int *frame_idx, int cur_dir) { GlobalMotionInfo *gm_info = &cpi->gm_info; - JobInfo *job_info = &cpi->mt_info.gm_sync.job_info; + GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; int total_refs = gm_info->num_ref_frames[cur_dir]; int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir]; @@ -2551,7 +2552,7 @@ static int gm_mt_worker_hook(void *arg1, void *unused) { AV1_COMP *cpi = thread_data->cpi; GlobalMotionInfo *gm_info = &cpi->gm_info; AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync; - JobInfo *job_info = &gm_sync->job_info; + GlobalMotionJobInfo *job_info = &gm_sync->job_info; int thread_id = thread_data->thread_id; GlobalMotionData *gm_thread_data = &thread_data->td->gm_data; #if CONFIG_MULTITHREAD @@ -2689,7 +2690,7 @@ static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) { // Implements multi-threading for global motion. void av1_global_motion_estimation_mt(AV1_COMP *cpi) { - JobInfo *job_info = &cpi->mt_info.gm_sync.job_info; + GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; av1_zero(*job_info); diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h index de46a0e1f2..2645f93e3c 100644 --- a/third_party/aom/av1/encoder/global_motion.h +++ b/third_party/aom/av1/encoder/global_motion.h @@ -14,9 +14,8 @@ #include "aom/aom_integer.h" #include "aom_dsp/flow_estimation/flow_estimation.h" -#include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" -#include "aom_util/aom_thread.h" +#include "av1/encoder/enc_enums.h" #ifdef __cplusplus extern "C" { @@ -58,11 +57,11 @@ typedef struct { // next_frame_to_process[i] will hold the count of next reference frame to be // processed in the direction 'i'. int8_t next_frame_to_process[MAX_DIRECTIONS]; -} JobInfo; +} GlobalMotionJobInfo; typedef struct { // Data related to assigning jobs for global motion multi-threading. 
- JobInfo job_info; + GlobalMotionJobInfo job_info; #if CONFIG_MULTITHREAD // Mutex lock used while dispatching jobs. diff --git a/third_party/aom/av1/encoder/nonrd_pickmode.c b/third_party/aom/av1/encoder/nonrd_pickmode.c index 57c74f66d5..08ecb8495a 100644 --- a/third_party/aom/av1/encoder/nonrd_pickmode.c +++ b/third_party/aom/av1/encoder/nonrd_pickmode.c @@ -1886,14 +1886,17 @@ static AOM_INLINE int skip_mode_by_low_temp( static AOM_INLINE int skip_mode_by_bsize_and_ref_frame( PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize, - int extra_prune, unsigned int sse_zeromv_norm, int more_prune) { + int extra_prune, unsigned int sse_zeromv_norm, int more_prune, + int skip_nearmv) { const unsigned int thresh_skip_golden = 500; if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden && mode == NEWMV) return 1; - if (bsize == BLOCK_128X128 && mode == NEWMV) return 1; + if ((bsize == BLOCK_128X128 && mode == NEWMV) || + (skip_nearmv && mode == NEARMV)) + return 1; // Skip testing non-LAST if this flag is set. if (extra_prune) { @@ -2361,6 +2364,18 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( (*this_mode != GLOBALMV || *ref_frame != LAST_FRAME)) return true; + // Skip the mode if use reference frame mask flag is not set. + if (!search_state->use_ref_frame_mask[*ref_frame]) return true; + + // Skip mode for some modes and reference frames when + // force_zeromv_skip_for_blk flag is true. 
+ if (x->force_zeromv_skip_for_blk && + ((!(*this_mode == NEARESTMV && + search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && + *this_mode != GLOBALMV) || + *ref_frame != LAST_FRAME)) + return true; + if (x->sb_me_block && *ref_frame == LAST_FRAME) { // We want to make sure to test the superblock MV: // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they @@ -2400,18 +2415,6 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( mi->ref_frame[0] = *ref_frame; mi->ref_frame[1] = *ref_frame2; - // Skip the mode if use reference frame mask flag is not set. - if (!search_state->use_ref_frame_mask[*ref_frame]) return true; - - // Skip mode for some modes and reference frames when - // force_zeromv_skip_for_blk flag is true. - if (x->force_zeromv_skip_for_blk && - ((!(*this_mode == NEARESTMV && - search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && - *this_mode != GLOBALMV) || - *ref_frame != LAST_FRAME)) - return true; - // Skip compound mode based on variance of previously evaluated single // reference modes. if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred && @@ -2478,7 +2481,8 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( // properties. if (skip_mode_by_bsize_and_ref_frame( *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search, - sse_zeromv_norm, rt_sf->nonrd_aggressive_skip)) + sse_zeromv_norm, rt_sf->nonrd_aggressive_skip, + rt_sf->increase_source_sad_thresh)) return true; // Skip mode based on low temporal variance and souce sad. diff --git a/third_party/aom/av1/encoder/partition_search.c b/third_party/aom/av1/encoder/partition_search.c index 61d49a23f2..30ea7d9140 100644 --- a/third_party/aom/av1/encoder/partition_search.c +++ b/third_party/aom/av1/encoder/partition_search.c @@ -2323,8 +2323,9 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, } if (cpi->sf.rt_sf.skip_cdef_sb) { // cdef_strength is initialized to 1 which means skip_cdef, and is updated - // here. 
Check to see is skipping cdef is allowed. - // Always allow cdef_skip for seg_skip = 1. + // here. Check to see is skipping cdef is allowed. Never skip on slide/scene + // change, near a key frame, or when color sensitivity is set. Always allow + // cdef_skip for seg_skip = 1. const int allow_cdef_skipping = seg_skip || (cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad && @@ -2338,8 +2339,16 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, MB_MODE_INFO **mi_sb = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb); - // Do not skip if intra or new mv is picked, or color sensitivity is set. - // Never skip on slide/scene change. + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + unsigned int thresh_spatial_var = + (cpi->oxcf.speed >= 11 && !is_720p_or_larger && + cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) + ? 400 + : UINT_MAX; + // For skip_cdef_sb = 1: do not skip if allow_cdef_skipping is false or + // intra or new mv is picked, with possible conidition on spatial variance. + // For skip_cdef_sb >= 2: more aggressive mode to always skip unless + // allow_cdef_skipping is false and source_variance is non-zero. if (cpi->sf.rt_sf.skip_cdef_sb >= 2) { mi_sb[0]->cdef_strength = mi_sb[0]->cdef_strength && @@ -2347,7 +2356,8 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, } else { mi_sb[0]->cdef_strength = mi_sb[0]->cdef_strength && allow_cdef_skipping && - !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV); + !(x->source_variance < thresh_spatial_var && + (mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV)); } // Store in the pickmode context. 
ctx->mic.cdef_strength = mi_sb[0]->cdef_strength; diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c index a504535028..ce0357163d 100644 --- a/third_party/aom/av1/encoder/picklpf.c +++ b/third_party/aom/av1/encoder/picklpf.c @@ -257,6 +257,8 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, inter_frame_multiplier = inter_frame_multiplier << 1; else if (cpi->rc.frame_source_sad > 50000) inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1); + } else if (cpi->sf.rt_sf.use_fast_fixed_part) { + inter_frame_multiplier = inter_frame_multiplier << 1; } // These values were determined by linear fitting the result of the // searched level for 8 bit depth: diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c index b0d0d0bb78..a431c4dada 100644 --- a/third_party/aom/av1/encoder/pickrst.c +++ b/third_party/aom/av1/encoder/pickrst.c @@ -1044,10 +1044,13 @@ void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, #if CONFIG_AV1_HIGHBITDEPTH void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { + (void)dgd_avg; + (void)src_avg; int i, j, k, l; int32_t Y[WIENER_WIN2]; const int wiener_win2 = wiener_win * wiener_win; @@ -1659,9 +1662,10 @@ static AOM_INLINE void search_wiener( // functions. Optimize intrinsics of HBD design similar to LBD (i.e., // pre-calculate d and s buffers and avoid most of the C operations). 
av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, - rsc->src_buffer, limits->h_start, limits->h_end, - limits->v_start, limits->v_end, rsc->dgd_stride, - rsc->src_stride, M, H, cm->seq_params->bit_depth); + rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, + limits->h_start, limits->h_end, limits->v_start, + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, + H, cm->seq_params->bit_depth); } else { av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, limits->h_start, @@ -2081,10 +2085,9 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { // and height aligned to multiple of 16 is considered for intrinsic purpose. rsc.dgd_avg = NULL; rsc.src_avg = NULL; -#if HAVE_AVX2 || HAVE_NEON - // The buffers allocated below are used during Wiener filter processing of low - // bitdepth path. Hence, allocate the same when Wiener filter is enabled in - // low bitdepth path. +#if HAVE_AVX2 + // The buffers allocated below are used during Wiener filter processing. + // Hence, allocate the same when Wiener filter is enabled. if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; @@ -2221,7 +2224,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { best_luma_unit_size); } -#if HAVE_AVX || HAVE_NEON +#if HAVE_AVX2 if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { aom_free(cpi->pick_lr_ctxt.dgd_avg); cpi->pick_lr_ctxt.dgd_avg = NULL; diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c index 256b6fc9eb..9a00042520 100644 --- a/third_party/aom/av1/encoder/speed_features.c +++ b/third_party/aom/av1/encoder/speed_features.c @@ -1461,7 +1461,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, // for resolutions below 720p. 
if (speed >= 11 && !is_720p_or_larger && cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { - sf->rt_sf.skip_cdef_sb = 2; + sf->rt_sf.skip_cdef_sb = 1; sf->rt_sf.force_only_last_ref = 1; sf->rt_sf.selective_cdf_update = 1; sf->rt_sf.use_nonrd_filter_search = 0; diff --git a/third_party/aom/av1/encoder/tune_vmaf.c b/third_party/aom/av1/encoder/tune_vmaf.c index 91db3db726..fdb7c77ebc 100644 --- a/third_party/aom/av1/encoder/tune_vmaf.c +++ b/third_party/aom/av1/encoder/tune_vmaf.c @@ -247,7 +247,9 @@ static AOM_INLINE void unsharp(const AV1_COMP *const cpi, // 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128, // all co-efficients must be even. -DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52, +// The array is of size 9 to allow passing gauss_filter + 1 to +// _mm_loadu_si128() in prepare_coeffs_6t(). +DECLARE_ALIGNED(16, static const int16_t, gauss_filter[9]) = { 0, 8, 30, 52, 30, 8, 0, 0 }; static AOM_INLINE void gaussian_blur(const int bit_depth, const YV12_BUFFER_CONFIG *source, diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c index 6658ed39a8..1f76576c9e 100644 --- a/third_party/aom/av1/encoder/x86/pickrst_avx2.c +++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c @@ -345,21 +345,27 @@ static INLINE void compute_stats_highbd_win5_opt_avx2( } void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win5_opt_avx2(dgd8, src8, 
h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M, H, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c index 50db305802..3617d33fef 100644 --- a/third_party/aom/av1/encoder/x86/pickrst_sse4.c +++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c @@ -524,21 +524,27 @@ static INLINE void compute_stats_highbd_win5_opt_sse4_1( } void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, - int h_end, int v_start, int v_end, - int dgd_stride, int src_stride, int64_t *M, - int64_t *H, aom_bit_depth_t bit_depth) { + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M, H, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/test/aom_image_test.cc b/third_party/aom/test/aom_image_test.cc index 
03f4373f35..0dfb912215 100644 --- a/third_party/aom/test/aom_image_test.cc +++ b/third_party/aom/test/aom_image_test.cc @@ -9,6 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include + #include "aom/aom_image.h" #include "third_party/googletest/src/googletest/include/gtest/gtest.h" @@ -70,3 +72,66 @@ TEST(AomImageTest, AomImgAllocNv12) { EXPECT_EQ(img.planes[AOM_PLANE_V], nullptr); aom_img_free(&img); } + +TEST(AomImageTest, AomImgAllocHugeWidth) { + // The stride (0x80000000 * 2) would overflow unsigned int. + aom_image_t *image = + aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The stride (0x80000000) would overflow int. + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The aligned width (UINT_MAX + 1) would overflow unsigned int. + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, UINT_MAX, 1, 1); + ASSERT_EQ(image, nullptr); + + image = aom_img_alloc_with_border(nullptr, AOM_IMG_FMT_I422, 1, INT_MAX, 1, + 0x40000000, 0); + if (image) { + uint16_t *y_plane = + reinterpret_cast(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x7ffffffe, 1, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_NV12, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_YV12, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 65536, 2, 1); + if (image) { + uint16_t *y_plane = + reinterpret_cast(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 285245883, 2, 1); 
+ if (image) { + uint16_t *y_plane = + reinterpret_cast(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } +} diff --git a/third_party/aom/test/disflow_test.cc b/third_party/aom/test/disflow_test.cc index 4f004480e2..bee9e1261c 100644 --- a/third_party/aom/test/disflow_test.cc +++ b/third_party/aom/test/disflow_test.cc @@ -124,4 +124,9 @@ INSTANTIATE_TEST_SUITE_P(NEON, ComputeFlowTest, ::testing::Values(aom_compute_flow_at_point_neon)); #endif +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P(SVE, ComputeFlowTest, + ::testing::Values(aom_compute_flow_at_point_sve)); +#endif + } // namespace diff --git a/third_party/aom/test/ethread_test.cc b/third_party/aom/test/ethread_test.cc index ce45394eb8..415f5de269 100644 --- a/third_party/aom/test/ethread_test.cc +++ b/third_party/aom/test/ethread_test.cc @@ -18,6 +18,7 @@ #include "test/util.h" #include "test/y4m_video_source.h" #include "test/yuv_video_source.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/firstpass.h" namespace { @@ -411,9 +412,7 @@ class AVxEncoderThreadTest const std::vector ref_size_enc, const std::vector ref_md5_enc, const std::vector ref_md5_dec) { - // This value should be kept the same as MAX_NUM_THREADS - // in aom_thread.h - cfg_.g_threads = 64; + cfg_.g_threads = MAX_NUM_THREADS; ASSERT_NO_FATAL_FAILURE(RunLoop(video)); std::vector multi_thr_max_row_mt_size_enc; std::vector multi_thr_max_row_mt_md5_enc; diff --git a/third_party/aom/test/frame_resize_test.cc b/third_party/aom/test/frame_resize_test.cc new file mode 100644 index 0000000000..8891304192 --- /dev/null +++ b/third_party/aom/test/frame_resize_test.cc @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" +#include "test/acm_random.h" +#include "test/util.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/bitops.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +using ::testing::Combine; +using ::testing::Values; +using ::testing::ValuesIn; + +using std::make_tuple; +using std::tuple; + +const int kIters = 1000; + +typedef tuple FrameDimension; + +// Resolutions (width x height) to be tested for resizing. +const FrameDimension kFrameDim[] = { + make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080), + make_tuple(1280, 720), make_tuple(640, 480), make_tuple(640, 360), + make_tuple(256, 256), +}; + +// Check that two 8-bit output buffers are identical. 
+void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width, + int height) { + ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations"; + for (int j = 0; j < height; ++j) { + if (memcmp(p1, p2, sizeof(*p1) * width) == 0) { + p1 += width; + p2 += width; + continue; + } + for (int i = 0; i < width; ++i) { + ASSERT_EQ(p1[i], p2[i]) + << width << "x" << height << " Pixel mismatch at (" << i << ", " << j + << ")"; + } + } +} + +typedef bool (*LowBDResizeFunc)(uint8_t *intbuf, uint8_t *output, + int out_stride, int height, int height2, + int stride, int start_wd); +// Test parameter list: +// +typedef tuple ResizeTestParams; + +class AV1ResizeYTest : public ::testing::TestWithParam { + public: + void SetUp() { + test_fun_ = GET_PARAM(0); + frame_dim_ = GET_PARAM(1); + width_ = std::get<0>(frame_dim_); + height_ = std::get<1>(frame_dim_); + const int msb = get_msb(AOMMIN(width_, height_)); + n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); + + src_ = (uint8_t *)aom_malloc((width_ / 2) * height_ * sizeof(*src_)); + ref_dest_ = + (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*ref_dest_)); + test_dest_ = + (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*test_dest_)); + } + + void RunTest() { + int width2 = width_, height2 = height_; + + for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8(); + for (int level = 1; level < n_levels_; level++) { + width2 = (width_ >> level); + height2 = (height_ >> level); + resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, width2, + 0); + test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); + + AssertOutputBufferEq(ref_dest_, test_dest_, width2, height2); + } + } + + void SpeedTest() { + int width2 = width_, height2 = height_; + + for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8(); + for (int level = 1; level < n_levels_; level++) { + width2 = (width_ >> level); + height2 = (height_ >> level); + aom_usec_timer ref_timer; + 
aom_usec_timer_start(&ref_timer); + for (int j = 0; j < kIters; j++) { + resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, + width2, 0); + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (int j = 0; j < kIters; j++) { + test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "level: " << level << " [" << width2 << " x " << height2 + << "] C time = " << ref_time << " , SIMD time = " << tst_time + << " scaling=" << float(1.00) * ref_time / tst_time << "x \n"; + } + } + + void TearDown() { + aom_free(src_); + aom_free(ref_dest_); + aom_free(test_dest_); + } + + private: + LowBDResizeFunc test_fun_; + FrameDimension frame_dim_; + int width_; + int height_; + int n_levels_; + uint8_t *src_; + uint8_t *ref_dest_; + uint8_t *test_dest_; + libaom_test::ACMRandom rng_; +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1ResizeYTest); + +TEST_P(AV1ResizeYTest, RunTest) { RunTest(); } + +TEST_P(AV1ResizeYTest, DISABLED_SpeedTest) { SpeedTest(); } + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1ResizeYTest, + ::testing::Combine(::testing::Values(resize_vert_dir_avx2), + ::testing::ValuesIn(kFrameDim))); +#endif + +} // namespace diff --git a/third_party/aom/test/test.cmake b/third_party/aom/test/test.cmake index e2f5da570d..2631c9fb39 100644 --- a/third_party/aom/test/test.cmake +++ b/third_party/aom/test/test.cmake @@ -209,6 +209,7 @@ if(NOT BUILD_SHARED_LIBS) "${AOM_ROOT}/test/fdct4x4_test.cc" "${AOM_ROOT}/test/fft_test.cc" "${AOM_ROOT}/test/firstpass_test.cc" + "${AOM_ROOT}/test/frame_resize_test.cc" "${AOM_ROOT}/test/fwht4x4_test.cc" "${AOM_ROOT}/test/hadamard_test.cc" "${AOM_ROOT}/test/horver_correlation_test.cc" diff --git a/third_party/aom/test/wiener_test.cc 
b/third_party/aom/test/wiener_test.cc index b995c84d8f..c38e10e3c2 100644 --- a/third_party/aom/test/wiener_test.cc +++ b/third_party/aom/test/wiener_test.cc @@ -397,6 +397,12 @@ INSTANTIATE_TEST_SUITE_P(NEON, WienerTest, ::testing::Values(av1_compute_stats_neon)); #endif // HAVE_NEON +#if HAVE_SVE + +INSTANTIATE_TEST_SUITE_P(SVE, WienerTest, + ::testing::Values(av1_compute_stats_sve)); +#endif // HAVE_SVE + } // namespace wiener_lowbd #if CONFIG_AV1_HIGHBITDEPTH @@ -514,25 +520,27 @@ static void compute_stats_highbd_win_opt_c(int wiener_win, const uint8_t *dgd8, } void compute_stats_highbd_opt_c(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, - aom_bit_depth_t bit_depth) { + const uint8_t *src, int16_t *d, int16_t *s, + int h_start, int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) { compute_stats_highbd_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd, src, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd, src, d, s, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M, H, + bit_depth); } } static const int kIterations = 100; typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, + const uint8_t *src, int16_t *d, int16_t *s, + int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, + int64_t *M, int64_t *H, aom_bit_depth_t bit_depth); typedef std::tuple WienerTestParam; @@ -546,11 +554,17 @@ class WienerTestHighbd : public ::testing::TestWithParam { 
dgd_buf = (uint16_t *)aom_memalign( 32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf)); ASSERT_NE(dgd_buf, nullptr); + const size_t buf_size = + sizeof(*buf) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; + buf = (int16_t *)aom_memalign(32, buf_size); + ASSERT_NE(buf, nullptr); + memset(buf, 0, buf_size); target_func_ = GET_PARAM(0); } void TearDown() override { aom_free(src_buf); aom_free(dgd_buf); + aom_free(buf); } void RunWienerTest(const int32_t wiener_win, int32_t run_times, aom_bit_depth_t bit_depth); @@ -562,6 +576,7 @@ class WienerTestHighbd : public ::testing::TestWithParam { libaom_test::ACMRandom rng_; uint16_t *src_buf; uint16_t *dgd_buf; + int16_t *buf; }; void WienerTestHighbd::RunWienerTest(const int32_t wiener_win, @@ -589,6 +604,9 @@ void WienerTestHighbd::RunWienerTest(const int32_t wiener_win, const int dgd_stride = h_end; const int src_stride = MAX_DATA_BLOCK; const int iters = run_times == 1 ? kIterations : 2; + int16_t *dgd_avg = buf; + int16_t *src_avg = + buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX); for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { dgd_buf[i] = rng_.Rand16() % (1 << bit_depth); @@ -601,16 +619,17 @@ void WienerTestHighbd::RunWienerTest(const int32_t wiener_win, aom_usec_timer timer; aom_usec_timer_start(&timer); for (int i = 0; i < run_times; ++i) { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, - v_start, v_end, dgd_stride, src_stride, M_ref, - H_ref, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M_ref, H_ref, bit_depth); } aom_usec_timer_mark(&timer); const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); aom_usec_timer_start(&timer); for (int i = 0; i < run_times; ++i) { - target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end, - dgd_stride, src_stride, M_test, H_test, 
bit_depth); + target_func_(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M_test, H_test, + bit_depth); } aom_usec_timer_mark(&timer); const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); @@ -657,6 +676,9 @@ void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win, const int dgd_stride = h_end; const int src_stride = MAX_DATA_BLOCK; const int iters = 1; + int16_t *dgd_avg = buf; + int16_t *src_avg = + buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX); for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { // Fill with alternating extreme values to maximize difference with // the average. @@ -668,12 +690,13 @@ void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win, dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin); const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf); - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M_ref, H_ref, - bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M_ref, H_ref, bit_depth); - target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end, - dgd_stride, src_stride, M_test, H_test, bit_depth); + target_func_(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M_test, H_test, + bit_depth); int failed = 0; for (int i = 0; i < wiener_win2; ++i) { diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build index e371415d53..a2637ed797 100644 --- a/third_party/dav1d/meson.build +++ b/third_party/dav1d/meson.build @@ -81,6 +81,8 @@ cdata.set10('TRIM_DSP_FUNCTIONS', get_option('trim_dsp') == 'true' or # Logging option cdata.set10('CONFIG_LOG', get_option('logging')) +cdata.set10('CONFIG_MACOS_KPERF', get_option('macos_kperf')) + # # OS/Compiler checks and defines # diff --git 
a/third_party/dav1d/meson_options.txt b/third_party/dav1d/meson_options.txt index c04deffd73..b0b45b474d 100644 --- a/third_party/dav1d/meson_options.txt +++ b/third_party/dav1d/meson_options.txt @@ -68,3 +68,8 @@ option('trim_dsp', choices: ['true', 'false', 'if-release'], value: 'if-release', description: 'Eliminate redundant DSP functions where possible') + +option('macos_kperf', + type: 'boolean', + value: false, + description: 'Use the private macOS kperf API for benchmarking') diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S index 3df0393c3a..5b493be82d 100644 --- a/third_party/dav1d/src/arm/64/mc.S +++ b/third_party/dav1d/src/arm/64/mc.S @@ -837,7 +837,7 @@ endfunc // This has got the same signature as the put_8tap functions, // and assumes that x8 is set to (clz(w)-24). -function put_neon +function put_neon, export=1 adr x9, L(put_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw @@ -939,7 +939,7 @@ endfunc // This has got the same signature as the prep_8tap functions, // and assumes that x8 is set to (clz(w)-24), and x7 to w*2. -function prep_neon +function prep_neon, export=1 adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw diff --git a/third_party/dav1d/src/arm/64/mc_dotprod.S b/third_party/dav1d/src/arm/64/mc_dotprod.S new file mode 100644 index 0000000000..fcf04ee4d0 --- /dev/null +++ b/third_party/dav1d/src/arm/64/mc_dotprod.S @@ -0,0 +1,1413 @@ +/* + * Copyright © 2024, VideoLAN and dav1d authors + * Copyright © 2024, Janne Grunau + * Copyright © 2024, Martin Storsjo + * Copyright © 2024, Arm Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + + +#if HAVE_DOTPROD +ENABLE_DOTPROD + +// No spaces in these expressions, due to gas-preprocessor. It is translated by +// -1 to save the negative offset at getting the address of `mc_subpel_filters`. +#define REGULAR1 (((0*15-1)<<7)|(3*15-1)) +#define SMOOTH1 (((1*15-1)<<7)|(4*15-1)) +#define SHARP1 (((2*15-1)<<7)|(3*15-1)) + +#define FUNC_ALIGN 2 +#define JUMP_ALIGN 2 +#define LOOP_ALIGN 2 + + +// Lookup table used to help conversion of shifted 32-bit values to 8-bit. + .align 4 +L(hv_tbl_neon_dotprod): + .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + +// Shuffle indices to permute horizontal samples in preparation for input to +// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the +// interval of [-3, 4] relative to the current sample position. We load samples +// from index value -4 to keep loads word aligned, so the shuffle bytes are +// translated by 1 to handle this. 
+ .align 4 +L(h_tbl_neon_dotprod): + .byte 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7 + .byte 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11 + .byte 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15 + .byte 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19 + +// Vertical convolutions are also using SDOT instructions, where a 128-bit +// register contains a transposed 4x4 matrix of values. Subsequent iterations of +// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop +// iteration. These shuffle indices shift and merge this 4x4 matrix with the +// values of a new line. + .align 4 +L(v_tbl_neon_dotprod): + .byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28 + .byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19 + .byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23 + .byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27 + .byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31 + + +.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1 +function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN + mov x9, \type_h + mov x10, \type_v + .if \jump + b \op\()_8tap_\isa + .endif +endfunc +.endm + +.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd +make_8tap_fn \type, sharp, SHARP1, SHARP1, \isa +make_8tap_fn \type, sharp_smooth, SHARP1, SMOOTH1, \isa +make_8tap_fn \type, sharp_regular, SHARP1, REGULAR1, \isa +make_8tap_fn \type, smooth_sharp, SMOOTH1, SHARP1, \isa +make_8tap_fn \type, smooth, SMOOTH1, SMOOTH1, \isa +make_8tap_fn \type, smooth_regular, SMOOTH1, REGULAR1, \isa +make_8tap_fn \type, regular_sharp, REGULAR1, SHARP1, \isa +make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1, \isa +make_8tap_fn \type, regular, REGULAR1, REGULAR1, \isa, jump=0 + +function \type\()_8tap_\isa, align=FUNC_ALIGN + clz w8, \w + mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + sub w8, w8, #24 // for jump tables + 
movrel x12, X(mc_subpel_filters) + cbnz \mx, L(\type\()_8tap_h_hv_\isa) + cbnz \my, L(\type\()_8tap_v_\isa) +.ifc \type, prep + add \wd_strd, \w, \w // prep_neon needs w * 2 as stride +.endif + b X(\type\()_neon) + + .align JUMP_ALIGN +L(\type\()_8tap_v_\isa): + madd \my, \my, w11, w10 +.ifc \type, prep + mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding +.endif + sub \src, \src, \s_strd + ldr q6, L(v_tbl_neon_dotprod) +.ifc \type, prep + dup v4.4s, w8 +.endif + ubfx w11, \my, #7, #7 + and \my, \my, #0x7F + ldr q28, L(v_tbl_neon_dotprod) + 16 + cmp \h, #4 + csel \my, \my, w11, le + sub \src, \src, \s_strd, lsl #1 // src - src_stride * 3 + ldr q29, L(v_tbl_neon_dotprod) + 32 + add \xmy, x12, \xmy, lsl #3 // subpel V filter address + movi v5.16b, #128 + ldr d7, [\xmy] + cmp \w, #8 + b.eq 80f + b.lt 40f + + // .align JUMP_ALIGN // fallthrough +160: // V - 16xN+ + ldr q30, L(v_tbl_neon_dotprod) + 48 + ldr q31, L(v_tbl_neon_dotprod) + 64 +.ifc \type, prep + add \wd_strd, \w, \w +.endif + .align LOOP_ALIGN +161: + mov \lsrc, \src + mov \ldst, \dst + sub w8, \h, #1 + + ldr q16, [\lsrc] + ldr q17, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + ldr q18, [\lsrc] + ldr q19, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + + zip1 v0.16b, v16.16b, v17.16b + zip2 v1.16b, v16.16b, v17.16b + zip1 v2.16b, v18.16b, v19.16b + zip2 v3.16b, v18.16b, v19.16b + + ldr q20, [\lsrc] + ldr q21, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + ldr q22, [\lsrc] + ldr q23, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + + zip1 v18.16b, v20.16b, v21.16b + zip2 v21.16b, v20.16b, v21.16b + zip1 v24.16b, v22.16b, v23.16b + zip2 v27.16b, v22.16b, v23.16b + + zip1 v16.8h, v0.8h, v2.8h + zip2 v19.8h, v0.8h, v2.8h + zip1 v22.8h, v1.8h, v3.8h + zip2 v25.8h, v1.8h, v3.8h + + zip1 v17.8h, v18.8h, v24.8h + zip2 v20.8h, v18.8h, v24.8h + zip1 v23.8h, v21.8h, v27.8h + zip2 v26.8h, v21.8h, v27.8h + + sub v16.16b, v16.16b, v5.16b + sub v19.16b, v19.16b, v5.16b + sub v22.16b, v22.16b, 
v5.16b + sub v25.16b, v25.16b, v5.16b + + sub v17.16b, v17.16b, v5.16b + sub v20.16b, v20.16b, v5.16b + sub v23.16b, v23.16b, v5.16b + sub v26.16b, v26.16b, v5.16b + + .align LOOP_ALIGN +16: + ldr q27, [\lsrc] + add \lsrc, \lsrc, \s_strd +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sub v18.16b, v27.16b, v5.16b + sub v21.16b, v27.16b, v5.16b + sub v24.16b, v27.16b, v5.16b + sub v27.16b, v27.16b, v5.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v2.4s, v22.16b, v7.4b[0] + sdot v3.4s, v25.16b, v7.4b[0] + + tbl v16.16b, {v16.16b, v17.16b}, v6.16b + tbl v19.16b, {v19.16b, v20.16b}, v6.16b + tbl v22.16b, {v22.16b, v23.16b}, v6.16b + tbl v25.16b, {v25.16b, v26.16b}, v6.16b + + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v20.16b, v7.4b[1] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v26.16b, v7.4b[1] + + tbl v17.16b, {v17.16b, v18.16b}, v28.16b + tbl v20.16b, {v20.16b, v21.16b}, v29.16b + tbl v23.16b, {v23.16b, v24.16b}, v30.16b + tbl v26.16b, {v26.16b, v27.16b}, v31.16b + + subs w8, w8, #1 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + st1 {v0.8h, v1.8h}, [\ldst], \d_strd +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun2 v0.16b, v2.8h, #6 + st1 {v0.16b}, [\ldst], \d_strd +.endif + b.gt 16b + +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sdot v0.4s, v16.16b, v7.4b[0] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v2.4s, v22.16b, v7.4b[0] + sdot v3.4s, v25.16b, v7.4b[0] + + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v20.16b, v7.4b[1] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, 
v26.16b, v7.4b[1] + + subs \w, \w, #16 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\ldst] + add \dst, \dst, #32 +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun2 v0.16b, v2.8h, #6 + str q0, [\ldst] + add \dst, \dst, #16 +.endif + add \src, \src, #16 + b.gt 161b + ret + + .align JUMP_ALIGN +80: // V - 8xN + ldr d16, [\src] + ldr d17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d18, [\src] + ldr d19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr d20, [\src] + ldr d21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d22, [\src] + ldr d23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 // for prep: sub is enough + + zip1 v0.16b, v16.16b, v17.16b + zip1 v2.16b, v18.16b, v19.16b + zip1 v18.16b, v20.16b, v21.16b + zip1 v24.16b, v22.16b, v23.16b + + zip1 v16.8h, v0.8h, v2.8h + zip2 v19.8h, v0.8h, v2.8h + zip1 v17.8h, v18.8h, v24.8h + zip2 v20.8h, v18.8h, v24.8h + + sub v16.16b, v16.16b, v5.16b + sub v19.16b, v19.16b, v5.16b + sub v17.16b, v17.16b, v5.16b + sub v20.16b, v20.16b, v5.16b +.ifc \type, put + b.eq 82f +.endif + + .align LOOP_ALIGN +8: + ldr d21, [\src] + ldr d27, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sub v18.16b, v21.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + sub v24.16b, v27.16b, v5.16b + sub v27.16b, v27.16b, v5.16b + + tbl v22.16b, {v16.16b, v17.16b}, v6.16b + tbl v25.16b, {v19.16b, v20.16b}, v6.16b + tbl v23.16b, {v17.16b, v18.16b}, v28.16b + tbl v26.16b, {v20.16b, v21.16b}, v29.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + tbl v16.16b, {v22.16b, v23.16b}, v6.16b + tbl 
v19.16b, {v25.16b, v26.16b}, v6.16b + tbl v17.16b, {v23.16b, v24.16b}, v28.16b + tbl v20.16b, {v26.16b, v27.16b}, v29.16b + + sdot v2.4s, v22.16b, v7.4b[0] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v25.16b, v7.4b[0] + sdot v3.4s, v26.16b, v7.4b[1] + + subs \h, \h, #2 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\dst], #32 +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun v1.8b, v2.8h, #6 + str d0, [\dst] + str d1, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 8b + +.ifc \type, put + .align JUMP_ALIGN +82: + ldr d21, [\src] + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.else + ldr d21, [\src] + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.endif + sub v18.16b, v21.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + + tbl v22.16b, {v16.16b, v17.16b}, v6.16b + tbl v25.16b, {v19.16b, v20.16b}, v6.16b + tbl v23.16b, {v17.16b, v18.16b}, v28.16b + tbl v26.16b, {v20.16b, v21.16b}, v29.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + sdot v2.4s, v22.16b, v7.4b[0] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v25.16b, v7.4b[0] + sdot v3.4s, v26.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\dst] +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun v1.8b, v2.8h, #6 + str d0, [\dst] + str d1, [\dst, \d_strd] +.endif + ret + + .align JUMP_ALIGN +40: // V - 4xN or 2xN (put only) +.ifc \type, put + cmp \w, #2 + b.eq 20f +.endif + ldr s16, [\src] + ldr s17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr s18, [\src] + ldr s19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr s20, [\src] + ldr s21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr s22, 
[\src] + ldr s23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 // for prep: sub is enough + + zip1 v0.8b, v16.8b, v17.8b + zip1 v2.8b, v18.8b, v19.8b + zip1 v18.8b, v20.8b, v21.8b + zip1 v24.8b, v22.8b, v23.8b + + zip1 v16.8h, v0.8h, v2.8h + zip1 v17.8h, v18.8h, v24.8h + + sub v16.16b, v16.16b, v5.16b + sub v17.16b, v17.16b, v5.16b +.ifc \type, put + b.eq 42f +.endif + + .align LOOP_ALIGN +4: + ldr s18, [\src] + ldr s21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 +.endif + sub v18.16b, v18.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + tbl v16.16b, {v19.16b, v20.16b}, v6.16b + tbl v17.16b, {v20.16b, v21.16b}, v28.16b + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] +.ifc \type, prep + subs \h, \h, #2 + shrn v0.4h, v0.4s, #2 + shrn2 v0.8h, v1.4s, #2 + str q0, [\dst], #16 +.else + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + subs \h, \h, #2 + fmov x8, d0 + lsr x9, x8, #32 + str w8, [\dst] + str w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 4b + +.ifc \type, put + .align JUMP_ALIGN +42: + ldr s18, [\src] + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 +.else + ldr s18, [\src] + mov v0.16b, v4.16b + mov v1.16b, v4.16b +.endif + sub v18.16b, v18.16b, v5.16b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] +.ifc \type, prep + shrn v0.4h, v0.4s, #2 + shrn2 v0.8h, v1.4s, #2 + str q0, [\dst] + ret +.else + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + fmov x8, d0 + lsr x9, x8, #32 + str w8, [\dst] + 
str w9, [\dst, \d_strd] + ret + + .align JUMP_ALIGN +20: // V - 2xN + ldr h16, [\src] + ldr h17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr h18, [\src] + ldr h19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr h20, [\src] + ldr h21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr h22, [\src] + ldr h23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 + + zip1 v0.8b, v16.8b, v17.8b + zip1 v2.8b, v18.8b, v19.8b + zip1 v18.8b, v20.8b, v21.8b + zip1 v24.8b, v22.8b, v23.8b + + zip1 v16.4h, v0.4h, v2.4h + zip1 v17.4h, v18.4h, v24.4h + + sub v16.8b, v16.8b, v5.8b + sub v17.8b, v17.8b, v5.8b + + b.eq 22f + + .align LOOP_ALIGN +2: + ldr h18, [\src] + ldr h21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + + sub v18.8b, v18.8b, v5.8b + sub v21.8b, v21.8b, v5.8b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + tbl v16.16b, {v19.16b, v20.16b}, v6.16b + tbl v17.16b, {v20.16b, v21.16b}, v28.16b + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + + subs \h, \h, #2 + fmov x8, d0 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 + b.gt 2b + + .align JUMP_ALIGN +22: + ldr h18, [\src] + + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + + sub v18.8b, v18.8b, v5.8b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + + fmov x8, d0 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + ret +.endif + + .align JUMP_ALIGN +L(\type\()_8tap_h_hv_\isa): + madd \mx, \mx, w11, w9 
+ madd w14, \my, w11, w10 // for HV + ldr q28, L(h_tbl_neon_dotprod) + mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding + sub \src, \src, #4 // src - 4 + dup v27.4s, w13 + ubfx w9, \mx, #7, #7 + and \mx, \mx, #0x7F + ubfx w11, w14, #7, #7 // for HV + and w14, w14, #0x7F // for HV + cmp \w, #4 + csel \mx, \mx, w9, le + add \xmx, x12, \xmx, lsl #3 // subpel H filter address + movi v24.16b, #128 + cbz \my, L(\type\()_8tap_h_\isa) + + // HV cases + cmp \h, #4 + csel w14, w14, w11, le + sub \src, \src, \s_strd, lsl #1 // src - src_stride * 2 - 4 + add \xmy, x12, x14, lsl #3 // subpel V filter address + mov x15, x30 + ldr d7, [\xmy] +.ifc \type, put + ldr q25, L(hv_tbl_neon_dotprod) +.endif + sxtl v7.8h, v7.8b + cmp w10, SHARP1 + b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 + + // HV 8-tap cases + sub \src, \src, \s_strd // src - src_stride * 3 - 4 + cmp \w, #4 + b.eq 40f +.ifc \type, put + b.lt 20f +.endif + + // .align JUMP_ALIGN // fallthrough +80: // HV8 - 8xN+ + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, [\xmx] +.ifc \type, prep + add \wd_strd, \w, \w +.endif + + .align LOOP_ALIGN +81: + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + + .align LOOP_ALIGN +8: + ldr q23, [\lsrc] + add \lsrc, \lsrc, \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smull2 v1.4s, v16.8h, v7.h[0] + mov v16.16b, v17.16b + + sub v23.16b, v23.16b, v24.16b + + mov v5.16b, v27.16b + mov v6.16b, v27.16b + + smlal v0.4s, v17.4h, v7.h[1] + smlal2 v1.4s, v17.8h, v7.h[1] + mov v17.16b, v18.16b + + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b + tbl v4.16b, {v23.16b}, 
v30.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal2 v1.4s, v18.8h, v7.h[2] + mov v18.16b, v19.16b + + sdot v5.4s, v2.16b, v26.4b[0] + sdot v6.4s, v3.16b, v26.4b[0] + + smlal v0.4s, v19.4h, v7.h[3] + smlal2 v1.4s, v19.8h, v7.h[3] + mov v19.16b, v20.16b + + sdot v5.4s, v3.16b, v26.4b[1] + sdot v6.4s, v4.16b, v26.4b[1] + + smlal v0.4s, v20.4h, v7.h[4] + smlal2 v1.4s, v20.8h, v7.h[4] + mov v20.16b, v21.16b + + smlal v0.4s, v21.4h, v7.h[5] + smlal2 v1.4s, v21.8h, v7.h[5] +.ifc \type, prep + uzp1 v23.8h, v5.8h, v6.8h +.endif + mov v21.16b, v22.16b + + smlal v0.4s, v22.4h, v7.h[6] + smlal2 v1.4s, v22.8h, v7.h[6] +.ifc \type, prep + sshr v22.8h, v23.8h, #2 + smlal v0.4s, v22.4h, v7.h[7] + smlal2 v1.4s, v22.8h, v7.h[7] + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + subs w8, w8, #1 + st1 {v0.8h}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #16 +.else + shrn v22.4h, v5.4s, #2 + shrn2 v22.8h, v6.4s, #2 + smlal v0.4s, v22.4h, v7.h[7] + smlal2 v1.4s, v22.8h, v7.h[7] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + subs w8, w8, #1 + sqrshrun v0.8b, v0.8h, #2 + st1 {v0.8b}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #8 +.endif + add \src, \src, #8 + subs \w, \w, #8 + b.gt 81b + ret x15 + + .align JUMP_ALIGN +40: // HV8 - 4xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + + .align LOOP_ALIGN +4: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smlal v0.4s, v17.4h, v7.h[1] + mov v16.16b, v17.16b + mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal v0.4s, v19.4h, v7.h[3] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, 
v20.16b + + smlal v0.4s, v20.4h, v7.h[4] + smlal v0.4s, v21.4h, v7.h[5] + + sdot v5.4s, v2.16b, v26.4b[0] + mov v20.16b, v21.16b + mov v21.16b, v22.16b +.ifc \type, put + subs \h, \h, #1 +.endif + smlal v0.4s, v22.4h, v7.h[6] + shrn v22.4h, v5.4s, #2 + + smlal v0.4s, v22.4h, v7.h[7] +.ifc \type, prep + rshrn v0.4h, v0.4s, #6 + str d0, [\dst], #8 + subs \h, \h, #1 +.else + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + str s0, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 4b + ret x15 + +.ifc \type, put + .align JUMP_ALIGN +20: // HV8 - 2xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + + .align LOOP_ALIGN +2: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smlal v0.4s, v17.4h, v7.h[1] + mov v16.16b, v17.16b + mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal v0.4s, v19.4h, v7.h[3] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + + smlal v0.4s, v20.4h, v7.h[4] + smlal v0.4s, v21.4h, v7.h[5] + + sdot v5.4s, v2.16b, v26.4b[0] + mov v20.16b, v21.16b + mov v21.16b, v22.16b + + subs \h, \h, #1 + smlal v0.4s, v22.4h, v7.h[6] + shrn v22.4h, v5.4s, #2 + + smlal v0.4s, v22.4h, v7.h[7] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + + str h0, [\dst] + add \dst, \dst, \d_strd + b.gt 2b + ret x15 +.endif + + .align JUMP_ALIGN +L(\type\()_6tap_hv_\isa): + cmp \w, #4 + b.eq 40f +.ifc \type, put + b.lt 20f +.endif + + // .align JUMP_ALIGN // fallthrough +80: // HV6 - 8xN+ + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, 
[\xmx] +.ifc \type, prep + add \wd_strd, \w, \w +.endif + + .align LOOP_ALIGN +81: + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +8: + ldr q23, [\xmy] + add \xmy, \xmy, \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smull2 v1.4s, v16.8h, v7.h[1] + sub v23.16b, v23.16b, v24.16b + mov v16.16b, v17.16b + + mov v5.16b, v27.16b + mov v6.16b, v27.16b + + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b + + smlal v0.4s, v17.4h, v7.h[2] + smlal2 v1.4s, v17.8h, v7.h[2] + tbl v4.16b, {v23.16b}, v30.16b + mov v17.16b, v18.16b + + sdot v5.4s, v2.16b, v26.4b[0] + sdot v6.4s, v3.16b, v26.4b[0] + smlal v0.4s, v18.4h, v7.h[3] + smlal2 v1.4s, v18.8h, v7.h[3] + mov v18.16b, v19.16b + + sdot v5.4s, v3.16b, v26.4b[1] + sdot v6.4s, v4.16b, v26.4b[1] + smlal v0.4s, v19.4h, v7.h[4] + smlal2 v1.4s, v19.8h, v7.h[4] + mov v19.16b, v20.16b + uzp1 v23.8h, v5.8h, v6.8h + + smlal v0.4s, v20.4h, v7.h[5] + smlal2 v1.4s, v20.8h, v7.h[5] + sshr v20.8h, v23.8h, #2 +.ifc \type, prep + smlal v0.4s, v20.4h, v7.h[6] + smlal2 v1.4s, v20.8h, v7.h[6] + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + st1 {v0.8h}, [\ldst], \d_strd + subs w8, w8, #1 + b.gt 8b + add \dst, \dst, #16 +.else + subs w8, w8, #1 + smlal v0.4s, v20.4h, v7.h[6] + smlal2 v1.4s, v20.8h, v7.h[6] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + st1 {v0.8b}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #8 +.endif + add \src, \src, #8 + subs \w, \w, #8 + b.gt 81b + ret x15 + + .align FUNC_ALIGN +L(\type\()_hv_filter8_\isa): + ldr q4, [\lsrc] + add \lsrc, \lsrc, \s_strd + sub v4.16b, v4.16b, v24.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + tbl v2.16b, {v4.16b}, v28.16b + tbl v3.16b, {v4.16b}, 
v29.16b + tbl v4.16b, {v4.16b}, v30.16b + sdot v22.4s, v2.16b, v26.4b[0] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v23.4s, v4.16b, v26.4b[1] + shrn v22.4h, v22.4s, #2 + shrn2 v22.8h, v23.4s, #2 + ret + + .align FUNC_ALIGN +L(\type\()_hv_filter4_\isa): + mov v22.16b, v27.16b + ld1 {v4.8b}, [\src], \s_strd + sub v4.16b, v4.16b, v24.16b + tbl v2.16b, {v4.16b}, v28.16b + sdot v22.4s, v2.16b, v26.4b[0] + shrn v22.4h, v22.4s, #2 + ret + + .align JUMP_ALIGN +40: // HV6 - 4xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +4: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smlal v0.4s, v17.4h, v7.h[2] + sub v4.16b, v4.16b, v24.16b + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + smlal v0.4s, v18.4h, v7.h[3] + smlal v0.4s, v19.4h, v7.h[4] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + sdot v5.4s, v2.16b, v26.4b[0] + + smlal v0.4s, v20.4h, v7.h[5] + shrn v20.4h, v5.4s, #2 +.ifc \type, prep + smlal v0.4s, v20.4h, v7.h[6] + rshrn v0.4h, v0.4s, #6 + str d0, [\dst], #8 + subs \h, \h, #1 +.else + subs \h, \h, #1 + smlal v0.4s, v20.4h, v7.h[6] + tbl v0.16b, {v0.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + str s0, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 4b + ret x15 + +.ifc \type, put + .align JUMP_ALIGN +20: // HV6 - 2xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +2: + ld1 
{v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smlal v0.4s, v17.4h, v7.h[2] + sub v4.16b, v4.16b, v24.16b + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + smlal v0.4s, v18.4h, v7.h[3] + smlal v0.4s, v19.4h, v7.h[4] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + sdot v5.4s, v2.16b, v26.4b[0] + + smlal v0.4s, v20.4h, v7.h[5] + shrn v20.4h, v5.4s, #2 + + subs \h, \h, #1 + smlal v0.4s, v20.4h, v7.h[6] + + tbl v0.16b, {v0.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + + str h0, [\dst] + add \dst, \dst, \d_strd + b.gt 2b + ret x15 +.endif + + .align JUMP_ALIGN +L(\type\()_8tap_h_\isa): + adr x9, L(\type\()_8tap_h_\isa\()_tbl) + ldrh w8, [x9, x8, lsl #1] +.ifc \type, put + mov w10, #0x2022 // 64 * 128 + 34, bias and rounding for SDOT + dup v27.4s, w10 +.endif + sub x9, x9, x8 + br x9 + +.ifc \type, put + .align JUMP_ALIGN +20: // H - 2xN + AARCH64_VALID_JUMP_TARGET + add \src, \src, #2 + ldr s6, [\xmx, #2] + + .align LOOP_ALIGN +2: + ldr d0, [\src] + ldr d1, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.8b, v0.8b, v24.8b + sub v1.8b, v1.8b, v24.8b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + + tbl v2.16b, {v0.16b}, v28.16b + tbl v3.16b, {v1.16b}, v28.16b + + sdot v4.4s, v2.16b, v6.4b[0] + sdot v5.4s, v3.16b, v6.4b[0] + + uzp1 v4.8h, v4.8h, v5.8h + sqshrun v4.8b, v4.8h, #6 + + subs \h, \h, #2 + fmov x8, d4 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 + b.gt 2b + ret + +.endif + + .align JUMP_ALIGN +40: // H - 4xN + AARCH64_VALID_JUMP_TARGET + add \src, \src, #2 + ldr s26, [\xmx, #2] + + .align LOOP_ALIGN +4: + ldr d0, [\src] + ldr d1, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.8b, v0.8b, v24.8b + sub v1.8b, v1.8b, v24.8b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + + tbl v2.16b, {v0.16b}, v28.16b + tbl v3.16b, {v1.16b}, v28.16b + + sdot v4.4s, v2.16b, v26.4b[0] + sdot v5.4s, v3.16b, v26.4b[0] +.ifc \type, prep + subs 
\h, \h, #2 + shrn v4.4h, v4.4s, #2 + shrn2 v4.8h, v5.4s, #2 + str q4, [\dst], #16 +.else + uzp1 v4.8h, v4.8h, v5.8h + sqshrun v4.8b, v4.8h, #6 + subs \h, \h, #2 + fmov x8, d4 + lsr x9, x8, #32 + str w8, [\dst] + str w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 4b + ret + + .align JUMP_ALIGN +80: // H - 8xN + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, [\xmx] + + .align LOOP_ALIGN +8: + ldr q0, [\src] + ldr q16, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.16b, v0.16b, v24.16b + sub v16.16b, v16.16b, v24.16b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + mov v20.16b, v27.16b + mov v21.16b, v27.16b + + tbl v1.16b, {v0.16b}, v28.16b + tbl v2.16b, {v0.16b}, v29.16b + tbl v3.16b, {v0.16b}, v30.16b + tbl v17.16b, {v16.16b}, v28.16b + tbl v18.16b, {v16.16b}, v29.16b + tbl v19.16b, {v16.16b}, v30.16b + + sdot v4.4s, v1.16b, v26.4b[0] + sdot v5.4s, v2.16b, v26.4b[0] + sdot v20.4s, v17.16b, v26.4b[0] + sdot v21.4s, v18.16b, v26.4b[0] + sdot v4.4s, v2.16b, v26.4b[1] + sdot v5.4s, v3.16b, v26.4b[1] + sdot v20.4s, v18.16b, v26.4b[1] + sdot v21.4s, v19.16b, v26.4b[1] + + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v20.8h, v20.8h, v21.8h +.ifc \type, prep + sshr v4.8h, v4.8h, #2 + sshr v20.8h, v20.8h, #2 + subs \h, \h, #2 + stp q4, q20, [\dst], #32 +.else + sqshrun v4.8b, v4.8h, #6 + sqshrun v20.8b, v20.8h, #6 + subs \h, \h, #2 + str d4, [\dst] + str d20, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 8b + ret + + .align JUMP_ALIGN +160: // H - 16xN + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr q31, L(h_tbl_neon_dotprod) + 48 + ldr d26, [\xmx] + + .align LOOP_ALIGN +16: + ldp q16, q17, [\src] + add \src, \src, \s_strd + + sub v16.16b, v16.16b, v24.16b + sub v17.16b, v17.16b, v24.16b + + mov v6.16b, v27.16b + mov v7.16b, v27.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + + tbl v0.16b, {v16.16b}, 
v28.16b + tbl v1.16b, {v16.16b}, v29.16b + tbl v2.16b, {v16.16b}, v30.16b + tbl v3.16b, {v16.16b, v17.16b}, v31.16b + tbl v4.16b, {v17.16b}, v28.16b + + sdot v6.4s, v0.16b, v26.4b[0] + sdot v7.4s, v1.16b, v26.4b[0] + sdot v22.4s, v2.16b, v26.4b[0] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v6.4s, v1.16b, v26.4b[1] + sdot v7.4s, v2.16b, v26.4b[1] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v4.16b, v26.4b[1] + + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v22.8h, v22.8h, v23.8h +.ifc \type, prep + sshr v6.8h, v6.8h, #2 + sshr v22.8h, v22.8h, #2 + subs \h, \h, #1 + stp q6, q22, [\dst], #32 +.else + sqshrun v6.8b, v6.8h, #6 + sqshrun2 v6.16b, v22.8h, #6 + subs \h, \h, #1 + str q6, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 16b + ret + + .align JUMP_ALIGN +320: // H - 32xN+ +640: +1280: + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr q31, L(h_tbl_neon_dotprod) + 48 + ldr d26, [\xmx] +.ifc \type, put + sub \d_strd, \d_strd, \w, uxtw +.endif + sub \s_strd, \s_strd, \w, uxtw + mov w8, \w + + .align LOOP_ALIGN +32: + ldp q16, q17, [\src], #16 + + sub v16.16b, v16.16b, v24.16b + sub v17.16b, v17.16b, v24.16b + + mov v6.16b, v27.16b + mov v7.16b, v27.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + + tbl v0.16b, {v16.16b}, v28.16b + tbl v1.16b, {v16.16b}, v29.16b + tbl v2.16b, {v16.16b}, v30.16b + tbl v3.16b, {v16.16b, v17.16b}, v31.16b + tbl v4.16b, {v17.16b}, v28.16b + + sdot v6.4s, v0.16b, v26.4b[0] + sdot v7.4s, v1.16b, v26.4b[0] + sdot v22.4s, v2.16b, v26.4b[0] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v6.4s, v1.16b, v26.4b[1] + sdot v7.4s, v2.16b, v26.4b[1] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v4.16b, v26.4b[1] + + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v22.8h, v22.8h, v23.8h +.ifc \type, prep + sshr v6.8h, v6.8h, #2 + sshr v22.8h, v22.8h, #2 + subs w8, w8, #16 + stp q6, q22, [\dst], #32 +.else + sqshrun v6.8b, v6.8h, #6 + sqshrun2 v6.16b, v22.8h, #6 + subs w8, w8, #16 + str q6, [\dst], #16 +.endif + b.gt 32b 
+ + add \src, \src, \s_strd +.ifc \type, put + add \dst, \dst, \d_strd +.endif + mov w8, \w + subs \h, \h, #1 + b.gt 32b + ret + +L(\type\()_8tap_h_\isa\()_tbl): + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b) +.ifc \type, put + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b) +.endif +endfunc +.endm + +// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) +// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) +filter_8tap_fn prep, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 + +// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) +// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) +filter_8tap_fn put, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 + +DISABLE_DOTPROD +#endif // HAVE_DOTPROD diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S index 7bef9243fb..9033072a82 100644 --- a/third_party/dav1d/src/arm/64/msac.S +++ b/third_party/dav1d/src/arm/64/msac.S @@ -288,10 +288,8 @@ function msac_decode_hi_tok_neon, export=1 mvni v30.4h, #0x3f // 0xffc0 ldrh w9, [x1, #6] // count = cdf[n_symbols] ld1r {v3.4h}, [x16] // rng - movrel x16, bits ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret) add x17, x0, #DIF + 6 - ld1 {v16.8h}, [x16] mov w13, #-24 and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 ldr w10, [x0, #ALLOW_UPDATE_CDF] @@ -305,30 +303,27 @@ function msac_decode_hi_tok_neon, export=1 add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret) add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) str h3, [sp, #14] // store original u = s->rng - cmhs v2.8h, v1.8h, v4.8h // c >= v + cmhs v2.4h, v1.4h, v4.4h // c >= v str q4, [sp, #16] // store v values to allow 
indexed access - and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask - addv h6, v6.8h // Aggregate mask bits - umov w3, v6.h[0] + addv h6, v2.4h // -4 + ret add w13, w13, #5 - rbit w3, w3 + smov w15, v6.h[0] add x8, sp, #16 - clz w15, w3 // ret + add w15, w15, #4 // ret cbz w10, 2f // update_cdf - movi v5.8b, #0xff + sub v5.4h, v0.4h, v2.4h // cdf[i] + (i >= val ? 1 : 0) mov w4, #-5 - urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768 + orr v2.4h, #0x80, lsl #8 // i >= val ? -1 : 32768 sub w4, w4, w9, lsr #4 // -((count >> 4) + 5) - sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) + sub v4.4h, v2.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) dup v6.4h, w4 // -rate sub w9, w9, w9, lsr #5 // count - (count == 32) - sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0) sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate add w9, w9, #1 // count + (count < 32) - add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate + add v0.4h, v5.4h, v4.4h // cdf[i] + (32768 - cdf[i]) >> rate st1 {v0.4h}, [x1] and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 strh w9, [x1, #6] diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h index 17234e027a..2a58a31322 100644 --- a/third_party/dav1d/src/arm/itx.h +++ b/third_party/dav1d/src/arm/itx.h @@ -28,34 +28,6 @@ #include "src/cpu.h" #include "src/itx.h" -#define decl_itx2_fns(w, h, opt) \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) - -#define decl_itx12_fns(w, h, opt) \ -decl_itx2_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ 
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) - -#define decl_itx16_fns(w, h, opt) \ -decl_itx12_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) - -#define decl_itx17_fns(w, h, opt) \ -decl_itx16_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) - decl_itx17_fns( 4, 4, neon); decl_itx16_fns( 4, 8, neon); decl_itx16_fns( 4, 16, neon); @@ -78,41 +50,6 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon)); static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) { -#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ - c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ - BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) - -#define assign_itx1_fn(pfx, w, h, ext) \ - assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) - -#define assign_itx2_fn(pfx, w, h, ext) \ - assign_itx1_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) - -#define assign_itx12_fn(pfx, w, h, ext) \ - assign_itx2_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ - assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_adst, 
ADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) - -#define assign_itx16_fn(pfx, w, h, ext) \ - assign_itx12_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ - assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) - -#define assign_itx17_fn(pfx, w, h, ext) \ - assign_itx16_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) - const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; diff --git a/third_party/dav1d/src/arm/mc.h b/third_party/dav1d/src/arm/mc.h index 06cd533a9b..7e57fd37cb 100644 --- a/third_party/dav1d/src/arm/mc.h +++ b/third_party/dav1d/src/arm/mc.h @@ -30,26 +30,40 @@ #include "src/mc.h" #include "src/cpu.h" -decl_mc_fn(BF(dav1d_put_8tap_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon)); -decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon)); -decl_mc_fn(BF(dav1d_put_bilin, neon)); +#define decl_8tap_gen(decl_name, fn_name, opt) \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_sharp, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_regular, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_sharp, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_regular, 
opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp, opt)) + +#define decl_8tap_fns(opt) \ + decl_8tap_gen(mc, put, opt); \ + decl_8tap_gen(mct, prep, opt) + +#define init_8tap_gen(name, opt) \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, opt); \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, opt) + +#define init_8tap_fns(opt) \ + init_8tap_gen(mc, opt); \ + init_8tap_gen(mct, opt) + +decl_8tap_fns(neon); +decl_8tap_fns(neon_dotprod); -decl_mct_fn(BF(dav1d_prep_8tap_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon)); +decl_mc_fn(BF(dav1d_put_bilin, neon)); decl_mct_fn(BF(dav1d_prep_bilin, neon)); decl_avg_fn(BF(dav1d_avg, neon)); @@ -77,27 +91,10 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; - init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); 
- init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); - init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); - init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); - - init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); - init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); - init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); + init_8tap_fns(neon); + + init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); + init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); c->avg = BF(dav1d_avg, neon); c->w_avg = BF(dav1d_w_avg, neon); @@ -111,4 +108,12 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { c->warp8x8 = BF(dav1d_warp_affine_8x8, neon); c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); c->emu_edge = BF(dav1d_emu_edge, neon); + +#if ARCH_AARCH64 +#if HAVE_DOTPROD && BITDEPTH == 8 + if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return; + + init_8tap_fns(neon_dotprod); +#endif // HAVE_DOTPROD && BITDEPTH == 8 +#endif // ARCH_AARCH64 } diff --git a/third_party/dav1d/src/cdf.c b/third_party/dav1d/src/cdf.c index e0f2132e00..d9721dad46 100644 --- a/third_party/dav1d/src/cdf.c +++ b/third_party/dav1d/src/cdf.c @@ -65,631 +65,638 @@ #define CDF15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \ CDF1(a), CDF14(b,c,d,e,f,g,h,i,j,k,l,m,n,o) 
-static const CdfModeContext av1_default_cdf = { - .y_mode = { - { CDF12(22801, 23489, 24293, 24756, 25601, 26123, - 26606, 27418, 27945, 29228, 29685, 30349) }, - { CDF12(18673, 19845, 22631, 23318, 23950, 24649, - 25527, 27364, 28152, 29701, 29984, 30852) }, - { CDF12(19770, 20979, 23396, 23939, 24241, 24654, - 25136, 27073, 27830, 29360, 29730, 30659) }, - { CDF12(20155, 21301, 22838, 23178, 23261, 23533, - 23703, 24804, 25352, 26575, 27016, 28049) }, - }, .use_filter_intra = { - [BS_4x4] = { CDF1( 4621) }, - [BS_4x8] = { CDF1( 6743) }, - [BS_8x4] = { CDF1( 5893) }, - [BS_8x8] = { CDF1( 7866) }, - [BS_8x16] = { CDF1(12551) }, - [BS_16x8] = { CDF1( 9394) }, - [BS_16x16] = { CDF1(12408) }, - [BS_16x32] = { CDF1(14301) }, - [BS_32x16] = { CDF1(12756) }, - [BS_32x32] = { CDF1(22343) }, - [BS_32x64] = { CDF1(16384) }, - [BS_64x32] = { CDF1(16384) }, - [BS_64x64] = { CDF1(16384) }, - [BS_64x128] = { CDF1(16384) }, - [BS_128x64] = { CDF1(16384) }, - [BS_128x128] = { CDF1(16384) }, - [BS_4x16] = { CDF1(12770) }, - [BS_16x4] = { CDF1(10368) }, - [BS_8x32] = { CDF1(20229) }, - [BS_32x8] = { CDF1(18101) }, - [BS_16x64] = { CDF1(16384) }, - [BS_64x16] = { CDF1(16384) }, - }, .filter_intra = { - CDF4(8949, 12776, 17211, 29558), - }, .uv_mode = { - { - { CDF12(22631, 24152, 25378, 25661, 25986, 26520, - 27055, 27923, 28244, 30059, 30941, 31961) }, - { CDF12( 9513, 26881, 26973, 27046, 27118, 27664, - 27739, 27824, 28359, 29505, 29800, 31796) }, - { CDF12( 9845, 9915, 28663, 28704, 28757, 28780, - 29198, 29822, 29854, 30764, 31777, 32029) }, - { CDF12(13639, 13897, 14171, 25331, 25606, 25727, - 25953, 27148, 28577, 30612, 31355, 32493) }, - { CDF12( 9764, 9835, 9930, 9954, 25386, 27053, - 27958, 28148, 28243, 31101, 31744, 32363) }, - { CDF12(11825, 13589, 13677, 13720, 15048, 29213, - 29301, 29458, 29711, 31161, 31441, 32550) }, - { CDF12(14175, 14399, 16608, 16821, 17718, 17775, - 28551, 30200, 30245, 31837, 32342, 32667) }, - { CDF12(12885, 13038, 14978, 15590, 15673, 
15748, - 16176, 29128, 29267, 30643, 31961, 32461) }, - { CDF12(12026, 13661, 13874, 15305, 15490, 15726, - 15995, 16273, 28443, 30388, 30767, 32416) }, - { CDF12(19052, 19840, 20579, 20916, 21150, 21467, - 21885, 22719, 23174, 28861, 30379, 32175) }, - { CDF12(18627, 19649, 20974, 21219, 21492, 21816, - 22199, 23119, 23527, 27053, 31397, 32148) }, - { CDF12(17026, 19004, 19997, 20339, 20586, 21103, - 21349, 21907, 22482, 25896, 26541, 31819) }, - { CDF12(12124, 13759, 14959, 14992, 15007, 15051, - 15078, 15166, 15255, 15753, 16039, 16606) }, - }, { - { CDF13(10407, 11208, 12900, 13181, 13823, 14175, 14899, - 15656, 15986, 20086, 20995, 22455, 24212) }, - { CDF13( 4532, 19780, 20057, 20215, 20428, 21071, 21199, - 21451, 22099, 24228, 24693, 27032, 29472) }, - { CDF13( 5273, 5379, 20177, 20270, 20385, 20439, 20949, - 21695, 21774, 23138, 24256, 24703, 26679) }, - { CDF13( 6740, 7167, 7662, 14152, 14536, 14785, 15034, - 16741, 18371, 21520, 22206, 23389, 24182) }, - { CDF13( 4987, 5368, 5928, 6068, 19114, 20315, 21857, - 22253, 22411, 24911, 25380, 26027, 26376) }, - { CDF13( 5370, 6889, 7247, 7393, 9498, 21114, 21402, - 21753, 21981, 24780, 25386, 26517, 27176) }, - { CDF13( 4816, 4961, 7204, 7326, 8765, 8930, 20169, - 20682, 20803, 23188, 23763, 24455, 24940) }, - { CDF13( 6608, 6740, 8529, 9049, 9257, 9356, 9735, - 18827, 19059, 22336, 23204, 23964, 24793) }, - { CDF13( 5998, 7419, 7781, 8933, 9255, 9549, 9753, - 10417, 18898, 22494, 23139, 24764, 25989) }, - { CDF13(10660, 11298, 12550, 12957, 13322, 13624, 14040, - 15004, 15534, 20714, 21789, 23443, 24861) }, - { CDF13(10522, 11530, 12552, 12963, 13378, 13779, 14245, - 15235, 15902, 20102, 22696, 23774, 25838) }, - { CDF13(10099, 10691, 12639, 13049, 13386, 13665, 14125, - 15163, 15636, 19676, 20474, 23519, 25208) }, - { CDF13( 3144, 5087, 7382, 7504, 7593, 7690, 7801, - 8064, 8232, 9248, 9875, 10521, 29048) }, - }, - }, .angle_delta = { - { CDF6( 2180, 5032, 7567, 22776, 26989, 30217) }, - { CDF6( 2301, 5608, 
8801, 23487, 26974, 30330) }, - { CDF6( 3780, 11018, 13699, 19354, 23083, 31286) }, - { CDF6( 4581, 11226, 15147, 17138, 21834, 28397) }, - { CDF6( 1737, 10927, 14509, 19588, 22745, 28823) }, - { CDF6( 2664, 10176, 12485, 17650, 21600, 30495) }, - { CDF6( 2240, 11096, 15453, 20341, 22561, 28917) }, - { CDF6( 3605, 10428, 12459, 17676, 21244, 30655) }, - }, .filter = { - { - { CDF2(31935, 32720) }, { CDF2( 5568, 32719) }, - { CDF2( 422, 2938) }, { CDF2(28244, 32608) }, - { CDF2(31206, 31953) }, { CDF2( 4862, 32121) }, - { CDF2( 770, 1152) }, { CDF2(20889, 25637) }, - }, { - { CDF2(31910, 32724) }, { CDF2( 4120, 32712) }, - { CDF2( 305, 2247) }, { CDF2(27403, 32636) }, - { CDF2(31022, 32009) }, { CDF2( 2963, 32093) }, - { CDF2( 601, 943) }, { CDF2(14969, 21398) }, - }, - }, .newmv_mode = { - { CDF1(24035) }, { CDF1(16630) }, { CDF1(15339) }, - { CDF1( 8386) }, { CDF1(12222) }, { CDF1( 4676) }, - }, .globalmv_mode = { - { CDF1( 2175) }, { CDF1( 1054) }, - }, .refmv_mode = { - { CDF1(23974) }, { CDF1(24188) }, { CDF1(17848) }, - { CDF1(28622) }, { CDF1(24312) }, { CDF1(19923) }, - }, .drl_bit = { - { CDF1(13104) }, { CDF1(24560) }, { CDF1(18945) }, - }, .comp_inter_mode = { - { CDF7( 7760, 13823, 15808, 17641, 19156, 20666, 26891) }, - { CDF7(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, - { CDF7(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, - { CDF7(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, - { CDF7(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, - { CDF7(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, - { CDF7(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, - { CDF7(13046, 23214, 24505, 25942, 27435, 28442, 29330) }, - }, .intra = { - { CDF1( 806) }, { CDF1(16662) }, { CDF1(20186) }, - { CDF1(26538) }, - }, .comp = { - { CDF1(26828) }, { CDF1(24035) }, { CDF1(12031) }, - { CDF1(10640) }, { CDF1( 2901) }, - }, .comp_dir = { - { CDF1( 1198) }, { CDF1( 2070) }, { CDF1( 9166) }, - { CDF1( 7499) }, { CDF1(22475) }, - }, .jnt_comp = { - { 
CDF1(18244) }, { CDF1(12865) }, { CDF1( 7053) }, - { CDF1(13259) }, { CDF1( 9334) }, { CDF1( 4644) }, - }, .mask_comp = { - { CDF1(26607) }, { CDF1(22891) }, { CDF1(18840) }, - { CDF1(24594) }, { CDF1(19934) }, { CDF1(22674) }, - }, .wedge_comp = { - { CDF1(23431) }, { CDF1(13171) }, { CDF1(11470) }, - { CDF1( 9770) }, { CDF1( 9100) }, { CDF1( 8233) }, - { CDF1( 6172) }, { CDF1(11820) }, { CDF1( 7701) }, - }, .wedge_idx = { - { CDF15( 2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, - 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, - { CDF15( 806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, - 16323, 17367, 18452, 19422, 22839, 26127, 29629) }, - { CDF15( 2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, - 17939, 21332, 24520, 27470, 29456, 30529, 31656) }, - { CDF15( 1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, - 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, - { CDF15( 1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, - 15369, 16730, 18114, 19313, 22521, 26012, 29550) }, - { CDF15( 2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, - 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, - { CDF15( 1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, - 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, - { CDF15( 154, 987, 1925, 2051, 2088, 2111, 2151, 23033, - 23703, 24284, 24985, 25684, 27259, 28883, 30911) }, - { CDF15( 1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, - 22935, 25057, 27251, 29173, 30089, 30960, 31933) }, - }, .interintra = { - { CDF1(16384) }, { CDF1(26887) }, { CDF1(27597) }, - { CDF1(30237) }, - }, .interintra_mode = { - { CDF3(8192, 16384, 24576) }, - { CDF3(1875, 11082, 27332) }, - { CDF3(2473, 9996, 26388) }, - { CDF3(4238, 11537, 25926) }, - }, .interintra_wedge = { - { CDF1(20036) }, { CDF1(24957) }, { CDF1(26704) }, - { CDF1(27530) }, { CDF1(29564) }, { CDF1(29444) }, - { CDF1(26872) }, - }, .ref = { - { { CDF1( 4897) }, { CDF1(16973) }, { CDF1(29744) } }, - { { CDF1( 1555) }, { CDF1(16751) }, { CDF1(30279) } }, - { { CDF1( 
4236) }, { CDF1(19647) }, { CDF1(31194) } }, - { { CDF1( 8650) }, { CDF1(24773) }, { CDF1(31895) } }, - { { CDF1( 904) }, { CDF1(11014) }, { CDF1(26875) } }, - { { CDF1( 1444) }, { CDF1(15087) }, { CDF1(30304) } }, - }, .comp_fwd_ref = { - { { CDF1( 4946) }, { CDF1(19891) }, { CDF1(30731) } }, - { { CDF1( 9468) }, { CDF1(22441) }, { CDF1(31059) } }, - { { CDF1( 1503) }, { CDF1(15160) }, { CDF1(27544) } }, - }, .comp_bwd_ref = { - { { CDF1( 2235) }, { CDF1(17182) }, { CDF1(30606) } }, - { { CDF1( 1423) }, { CDF1(15175) }, { CDF1(30489) } }, - }, .comp_uni_ref = { - { { CDF1( 5284) }, { CDF1(23152) }, { CDF1(31774) } }, - { { CDF1( 3865) }, { CDF1(14173) }, { CDF1(25120) } }, - { { CDF1( 3128) }, { CDF1(15270) }, { CDF1(26710) } }, - }, .txsz = { - { - { CDF1(19968) }, { CDF1(19968) }, { CDF1(24320) }, - }, { - { CDF2(12272, 30172) }, { CDF2(12272, 30172) }, - { CDF2(18677, 30848) }, - }, { - { CDF2(12986, 15180) }, { CDF2(12986, 15180) }, - { CDF2(24302, 25602) }, - }, { - { CDF2( 5782, 11475) }, { CDF2( 5782, 11475) }, - { CDF2(16803, 22759) }, - }, - }, .txpart = { - { { CDF1(28581) }, { CDF1(23846) }, { CDF1(20847) } }, - { { CDF1(24315) }, { CDF1(18196) }, { CDF1(12133) } }, - { { CDF1(18791) }, { CDF1(10887) }, { CDF1(11005) } }, - { { CDF1(27179) }, { CDF1(20004) }, { CDF1(11281) } }, - { { CDF1(26549) }, { CDF1(19308) }, { CDF1(14224) } }, - { { CDF1(28015) }, { CDF1(21546) }, { CDF1(14400) } }, - { { CDF1(28165) }, { CDF1(22401) }, { CDF1(16088) } }, - }, .txtp_inter1 = { - { CDF15( 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, - 21504, 22848, 23934, 25474, 27727, 28915, 30631) }, - { CDF15( 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, - 17674, 20408, 22517, 25010, 27116, 28856, 30749) }, - }, .txtp_inter2 = { - CDF11( 770, 2421, 5225, 12907, 15819, 18927, - 21561, 24089, 26595, 28526, 30529) - }, .txtp_inter3 = { - { CDF1(16384) }, { CDF1( 4167) }, { CDF1( 1998) }, { CDF1( 748) }, - }, .txtp_intra1 = { - { - { CDF6( 1535, 8035, 9461, 12751, 
23467, 27825) }, - { CDF6( 564, 3335, 9709, 10870, 18143, 28094) }, - { CDF6( 672, 3247, 3676, 11982, 19415, 23127) }, - { CDF6( 5279, 13885, 15487, 18044, 23527, 30252) }, - { CDF6( 4423, 6074, 7985, 10416, 25693, 29298) }, - { CDF6( 1486, 4241, 9460, 10662, 16456, 27694) }, - { CDF6( 439, 2838, 3522, 6737, 18058, 23754) }, - { CDF6( 1190, 4233, 4855, 11670, 20281, 24377) }, - { CDF6( 1045, 4312, 8647, 10159, 18644, 29335) }, - { CDF6( 202, 3734, 4747, 7298, 17127, 24016) }, - { CDF6( 447, 4312, 6819, 8884, 16010, 23858) }, - { CDF6( 277, 4369, 5255, 8905, 16465, 22271) }, - { CDF6( 3409, 5436, 10599, 15599, 19687, 24040) }, - }, { - { CDF6( 1870, 13742, 14530, 16498, 23770, 27698) }, - { CDF6( 326, 8796, 14632, 15079, 19272, 27486) }, - { CDF6( 484, 7576, 7712, 14443, 19159, 22591) }, - { CDF6( 1126, 15340, 15895, 17023, 20896, 30279) }, - { CDF6( 655, 4854, 5249, 5913, 22099, 27138) }, - { CDF6( 1299, 6458, 8885, 9290, 14851, 25497) }, - { CDF6( 311, 5295, 5552, 6885, 16107, 22672) }, - { CDF6( 883, 8059, 8270, 11258, 17289, 21549) }, - { CDF6( 741, 7580, 9318, 10345, 16688, 29046) }, - { CDF6( 110, 7406, 7915, 9195, 16041, 23329) }, - { CDF6( 363, 7974, 9357, 10673, 15629, 24474) }, - { CDF6( 153, 7647, 8112, 9936, 15307, 19996) }, - { CDF6( 3511, 6332, 11165, 15335, 19323, 23594) }, - }, - }, .txtp_intra2 = { - { - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - }, { - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 
6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - { CDF4( 6554, 13107, 19661, 26214) }, - }, { - { CDF4( 1127, 12814, 22772, 27483) }, - { CDF4( 145, 6761, 11980, 26667) }, - { CDF4( 362, 5887, 11678, 16725) }, - { CDF4( 385, 15213, 18587, 30693) }, - { CDF4( 25, 2914, 23134, 27903) }, - { CDF4( 60, 4470, 11749, 23991) }, - { CDF4( 37, 3332, 14511, 21448) }, - { CDF4( 157, 6320, 13036, 17439) }, - { CDF4( 119, 6719, 12906, 29396) }, - { CDF4( 47, 5537, 12576, 21499) }, - { CDF4( 269, 6076, 11258, 23115) }, - { CDF4( 83, 5615, 12001, 17228) }, - { CDF4( 1968, 5556, 12023, 18547) }, - }, - }, .skip = { - { CDF1(31671) }, { CDF1(16515) }, { CDF1( 4576) }, - }, .skip_mode = { - { CDF1(32621) }, { CDF1(20708) }, { CDF1( 8127) }, - }, .partition = { - { - // 128x128 -> 64x64 - { CDF7(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, - { CDF7( 6607, 6990, 8268, 32060, 32219, 32338, 32371) }, - { CDF7( 5429, 6676, 7122, 32027, 32227, 32531, 32582) }, - { CDF7( 711, 966, 1172, 32448, 32538, 32617, 32664) }, - }, { - // 64x64 -> 32x32 - { CDF9(20137, 21547, 23078, 29566, 29837, - 30261, 30524, 30892, 31724) }, - { CDF9( 6732, 7490, 9497, 27944, 28250, - 28515, 28969, 29630, 30104) }, - { CDF9( 5945, 7663, 8348, 28683, 29117, - 29749, 30064, 30298, 32238) }, - { CDF9( 870, 1212, 1487, 31198, 31394, - 31574, 31743, 31881, 32332) }, - }, { - // 32x32 -> 16x16 - { CDF9(18462, 20920, 23124, 27647, 28227, - 29049, 29519, 30178, 31544) }, - { CDF9( 7689, 9060, 12056, 24992, 25660, - 26182, 26951, 28041, 29052) }, - { CDF9( 6015, 9009, 10062, 24544, 25409, - 26545, 27071, 27526, 32047) }, - { CDF9( 1394, 2208, 2796, 28614, 29061, - 29466, 29840, 30185, 31899) }, - }, { - // 16x16 -> 
8x8 - { CDF9(15597, 20929, 24571, 26706, 27664, - 28821, 29601, 30571, 31902) }, - { CDF9( 7925, 11043, 16785, 22470, 23971, - 25043, 26651, 28701, 29834) }, - { CDF9( 5414, 13269, 15111, 20488, 22360, - 24500, 25537, 26336, 32117) }, - { CDF9( 2662, 6362, 8614, 20860, 23053, - 24778, 26436, 27829, 31171) }, - }, { - // 8x8 -> 4x4 only supports the four legacy partition types - { CDF3(19132, 25510, 30392) }, - { CDF3(13928, 19855, 28540) }, - { CDF3(12522, 23679, 28629) }, - { CDF3( 9896, 18783, 25853) }, - }, - }, .seg_pred = { - { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, - }, .seg_id = { - { CDF7( 5622, 7893, 16093, 18233, 27809, 28373, 32533) }, - { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) }, - { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) }, - }, .cfl_sign = { - CDF7( 1418, 2123, 13340, 18405, 26972, 28343, 32294) - }, .cfl_alpha = { - { CDF15( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, - 32700, 32704, 32708, 32712, 32716, 32720, 32724) }, - { CDF15(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, - 32620, 32647, 32668, 32672, 32676, 32680, 32684) }, - { CDF15(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, - 32673, 32677, 32681, 32685, 32689, 32693, 32697) }, - { CDF15(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, - 32708, 32712, 32716, 32720, 32724, 32728, 32732) }, - { CDF15(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, - 32394, 32464, 32516, 32560, 32576, 32593, 32622) }, - { CDF15(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, - 32144, 32413, 32520, 32594, 32622, 32656, 32660) }, - }, .restore_wiener = { - CDF1(11570) - }, .restore_sgrproj = { - CDF1(16855) - }, .restore_switchable = { - CDF2( 9413, 22581) - }, .delta_q = { - CDF3(28160, 32120, 32677) - }, .delta_lf = { - { CDF3(28160, 32120, 32677) }, - { CDF3(28160, 32120, 32677) }, - { CDF3(28160, 32120, 32677) }, - { CDF3(28160, 32120, 32677) }, - { CDF3(28160, 32120, 32677) }, - }, .motion_mode = { - [BS_8x8] = { 
CDF2( 7651, 24760) }, - [BS_8x16] = { CDF2( 4738, 24765) }, - [BS_8x32] = { CDF2(28799, 31390) }, - [BS_16x8] = { CDF2( 5391, 25528) }, - [BS_16x16] = { CDF2(19419, 26810) }, - [BS_16x32] = { CDF2( 5123, 23606) }, - [BS_16x64] = { CDF2(28973, 31594) }, - [BS_32x8] = { CDF2(26431, 30774) }, - [BS_32x16] = { CDF2(11606, 24308) }, - [BS_32x32] = { CDF2(26260, 29116) }, - [BS_32x64] = { CDF2(20360, 28062) }, - [BS_64x16] = { CDF2(29742, 31203) }, - [BS_64x32] = { CDF2(21679, 26830) }, - [BS_64x64] = { CDF2(29516, 30701) }, - [BS_64x128] = { CDF2(28898, 30397) }, - [BS_128x64] = { CDF2(30878, 31335) }, - [BS_128x128] = { CDF2(32507, 32558) }, - }, .obmc = { - [BS_8x8] = { CDF1(10437) }, - [BS_8x16] = { CDF1( 9371) }, - [BS_8x32] = { CDF1(23664) }, - [BS_16x8] = { CDF1( 9301) }, - [BS_16x16] = { CDF1(17432) }, - [BS_16x32] = { CDF1(14423) }, - [BS_16x64] = { CDF1(24008) }, - [BS_32x8] = { CDF1(20901) }, - [BS_32x16] = { CDF1(15142) }, - [BS_32x32] = { CDF1(25817) }, - [BS_32x64] = { CDF1(22823) }, - [BS_64x16] = { CDF1(26879) }, - [BS_64x32] = { CDF1(22083) }, - [BS_64x64] = { CDF1(30128) }, - [BS_64x128] = { CDF1(31014) }, - [BS_128x64] = { CDF1(31560) }, - [BS_128x128] = { CDF1(32638) }, - }, .pal_y = { - { { CDF1(31676) }, { CDF1( 3419) }, { CDF1( 1261) } }, - { { CDF1(31912) }, { CDF1( 2859) }, { CDF1( 980) } }, - { { CDF1(31823) }, { CDF1( 3400) }, { CDF1( 781) } }, - { { CDF1(32030) }, { CDF1( 3561) }, { CDF1( 904) } }, - { { CDF1(32309) }, { CDF1( 7337) }, { CDF1( 1462) } }, - { { CDF1(32265) }, { CDF1( 4015) }, { CDF1( 1521) } }, - { { CDF1(32450) }, { CDF1( 7946) }, { CDF1( 129) } }, - }, .pal_sz = { - { - { CDF6( 7952, 13000, 18149, 21478, 25527, 29241) }, - { CDF6( 7139, 11421, 16195, 19544, 23666, 28073) }, - { CDF6( 7788, 12741, 17325, 20500, 24315, 28530) }, - { CDF6( 8271, 14064, 18246, 21564, 25071, 28533) }, - { CDF6(12725, 19180, 21863, 24839, 27535, 30120) }, - { CDF6( 9711, 14888, 16923, 21052, 25661, 27875) }, - { CDF6(14940, 20797, 21678, 24186, 
27033, 28999) }, - }, { - { CDF6( 8713, 19979, 27128, 29609, 31331, 32272) }, - { CDF6( 5839, 15573, 23581, 26947, 29848, 31700) }, - { CDF6( 4426, 11260, 17999, 21483, 25863, 29430) }, - { CDF6( 3228, 9464, 14993, 18089, 22523, 27420) }, - { CDF6( 3768, 8886, 13091, 17852, 22495, 27207) }, - { CDF6( 2464, 8451, 12861, 21632, 25525, 28555) }, - { CDF6( 1269, 5435, 10433, 18963, 21700, 25865) }, - }, - }, .pal_uv = { - { CDF1(32461) }, { CDF1(21488) }, - }, .color_map = { - { /* y */ +typedef struct CdfDefaultContext { + CdfModeContext m; + struct { + CdfMvComponent comp; + ALIGN(uint16_t joint[N_MV_JOINTS], 8); + } mv; + ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32); +} CdfDefaultContext; + +static const CdfDefaultContext default_cdf = { + .m = { + .y_mode = { + { CDF12(22801, 23489, 24293, 24756, 25601, 26123, + 26606, 27418, 27945, 29228, 29685, 30349) }, + { CDF12(18673, 19845, 22631, 23318, 23950, 24649, + 25527, 27364, 28152, 29701, 29984, 30852) }, + { CDF12(19770, 20979, 23396, 23939, 24241, 24654, + 25136, 27073, 27830, 29360, 29730, 30659) }, + { CDF12(20155, 21301, 22838, 23178, 23261, 23533, + 23703, 24804, 25352, 26575, 27016, 28049) }, + }, .use_filter_intra = { + [BS_4x4] = { CDF1( 4621) }, + [BS_4x8] = { CDF1( 6743) }, + [BS_8x4] = { CDF1( 5893) }, + [BS_8x8] = { CDF1( 7866) }, + [BS_8x16] = { CDF1(12551) }, + [BS_16x8] = { CDF1( 9394) }, + [BS_16x16] = { CDF1(12408) }, + [BS_16x32] = { CDF1(14301) }, + [BS_32x16] = { CDF1(12756) }, + [BS_32x32] = { CDF1(22343) }, + [BS_32x64] = { CDF1(16384) }, + [BS_64x32] = { CDF1(16384) }, + [BS_64x64] = { CDF1(16384) }, + [BS_64x128] = { CDF1(16384) }, + [BS_128x64] = { CDF1(16384) }, + [BS_128x128] = { CDF1(16384) }, + [BS_4x16] = { CDF1(12770) }, + [BS_16x4] = { CDF1(10368) }, + [BS_8x32] = { CDF1(20229) }, + [BS_32x8] = { CDF1(18101) }, + [BS_16x64] = { CDF1(16384) }, + [BS_64x16] = { CDF1(16384) }, + }, .filter_intra = { + CDF4(8949, 12776, 17211, 29558), + }, .uv_mode = { { - { CDF1(28710) }, { 
CDF1(16384) }, { CDF1(10553) }, - { CDF1(27036) }, { CDF1(31603) }, + { CDF12(22631, 24152, 25378, 25661, 25986, 26520, + 27055, 27923, 28244, 30059, 30941, 31961) }, + { CDF12( 9513, 26881, 26973, 27046, 27118, 27664, + 27739, 27824, 28359, 29505, 29800, 31796) }, + { CDF12( 9845, 9915, 28663, 28704, 28757, 28780, + 29198, 29822, 29854, 30764, 31777, 32029) }, + { CDF12(13639, 13897, 14171, 25331, 25606, 25727, + 25953, 27148, 28577, 30612, 31355, 32493) }, + { CDF12( 9764, 9835, 9930, 9954, 25386, 27053, + 27958, 28148, 28243, 31101, 31744, 32363) }, + { CDF12(11825, 13589, 13677, 13720, 15048, 29213, + 29301, 29458, 29711, 31161, 31441, 32550) }, + { CDF12(14175, 14399, 16608, 16821, 17718, 17775, + 28551, 30200, 30245, 31837, 32342, 32667) }, + { CDF12(12885, 13038, 14978, 15590, 15673, 15748, + 16176, 29128, 29267, 30643, 31961, 32461) }, + { CDF12(12026, 13661, 13874, 15305, 15490, 15726, + 15995, 16273, 28443, 30388, 30767, 32416) }, + { CDF12(19052, 19840, 20579, 20916, 21150, 21467, + 21885, 22719, 23174, 28861, 30379, 32175) }, + { CDF12(18627, 19649, 20974, 21219, 21492, 21816, + 22199, 23119, 23527, 27053, 31397, 32148) }, + { CDF12(17026, 19004, 19997, 20339, 20586, 21103, + 21349, 21907, 22482, 25896, 26541, 31819) }, + { CDF12(12124, 13759, 14959, 14992, 15007, 15051, + 15078, 15166, 15255, 15753, 16039, 16606) }, }, { - { CDF2(27877, 30490) }, { CDF2(11532, 25697) }, - { CDF2( 6544, 30234) }, { CDF2(23018, 28072) }, - { CDF2(31915, 32385) }, + { CDF13(10407, 11208, 12900, 13181, 13823, 14175, 14899, + 15656, 15986, 20086, 20995, 22455, 24212) }, + { CDF13( 4532, 19780, 20057, 20215, 20428, 21071, 21199, + 21451, 22099, 24228, 24693, 27032, 29472) }, + { CDF13( 5273, 5379, 20177, 20270, 20385, 20439, 20949, + 21695, 21774, 23138, 24256, 24703, 26679) }, + { CDF13( 6740, 7167, 7662, 14152, 14536, 14785, 15034, + 16741, 18371, 21520, 22206, 23389, 24182) }, + { CDF13( 4987, 5368, 5928, 6068, 19114, 20315, 21857, + 22253, 22411, 24911, 25380, 26027, 
26376) }, + { CDF13( 5370, 6889, 7247, 7393, 9498, 21114, 21402, + 21753, 21981, 24780, 25386, 26517, 27176) }, + { CDF13( 4816, 4961, 7204, 7326, 8765, 8930, 20169, + 20682, 20803, 23188, 23763, 24455, 24940) }, + { CDF13( 6608, 6740, 8529, 9049, 9257, 9356, 9735, + 18827, 19059, 22336, 23204, 23964, 24793) }, + { CDF13( 5998, 7419, 7781, 8933, 9255, 9549, 9753, + 10417, 18898, 22494, 23139, 24764, 25989) }, + { CDF13(10660, 11298, 12550, 12957, 13322, 13624, 14040, + 15004, 15534, 20714, 21789, 23443, 24861) }, + { CDF13(10522, 11530, 12552, 12963, 13378, 13779, 14245, + 15235, 15902, 20102, 22696, 23774, 25838) }, + { CDF13(10099, 10691, 12639, 13049, 13386, 13665, 14125, + 15163, 15636, 19676, 20474, 23519, 25208) }, + { CDF13( 3144, 5087, 7382, 7504, 7593, 7690, 7801, + 8064, 8232, 9248, 9875, 10521, 29048) }, + }, + }, .angle_delta = { + { CDF6( 2180, 5032, 7567, 22776, 26989, 30217) }, + { CDF6( 2301, 5608, 8801, 23487, 26974, 30330) }, + { CDF6( 3780, 11018, 13699, 19354, 23083, 31286) }, + { CDF6( 4581, 11226, 15147, 17138, 21834, 28397) }, + { CDF6( 1737, 10927, 14509, 19588, 22745, 28823) }, + { CDF6( 2664, 10176, 12485, 17650, 21600, 30495) }, + { CDF6( 2240, 11096, 15453, 20341, 22561, 28917) }, + { CDF6( 3605, 10428, 12459, 17676, 21244, 30655) }, + }, .filter = { + { + { CDF2(31935, 32720) }, { CDF2( 5568, 32719) }, + { CDF2( 422, 2938) }, { CDF2(28244, 32608) }, + { CDF2(31206, 31953) }, { CDF2( 4862, 32121) }, + { CDF2( 770, 1152) }, { CDF2(20889, 25637) }, }, { - { CDF3(25572, 28046, 30045) }, - { CDF3( 9478, 21590, 27256) }, - { CDF3( 7248, 26837, 29824) }, - { CDF3(19167, 24486, 28349) }, - { CDF3(31400, 31825, 32250) }, + { CDF2(31910, 32724) }, { CDF2( 4120, 32712) }, + { CDF2( 305, 2247) }, { CDF2(27403, 32636) }, + { CDF2(31022, 32009) }, { CDF2( 2963, 32093) }, + { CDF2( 601, 943) }, { CDF2(14969, 21398) }, + }, + }, .newmv_mode = { + { CDF1(24035) }, { CDF1(16630) }, { CDF1(15339) }, + { CDF1( 8386) }, { CDF1(12222) }, { CDF1( 4676) }, + 
}, .globalmv_mode = { + { CDF1( 2175) }, { CDF1( 1054) }, + }, .refmv_mode = { + { CDF1(23974) }, { CDF1(24188) }, { CDF1(17848) }, + { CDF1(28622) }, { CDF1(24312) }, { CDF1(19923) }, + }, .drl_bit = { + { CDF1(13104) }, { CDF1(24560) }, { CDF1(18945) }, + }, .comp_inter_mode = { + { CDF7( 7760, 13823, 15808, 17641, 19156, 20666, 26891) }, + { CDF7(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, + { CDF7(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, + { CDF7(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, + { CDF7(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, + { CDF7(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, + { CDF7(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, + { CDF7(13046, 23214, 24505, 25942, 27435, 28442, 29330) }, + }, .intra = { + { CDF1( 806) }, { CDF1(16662) }, { CDF1(20186) }, + { CDF1(26538) }, + }, .comp = { + { CDF1(26828) }, { CDF1(24035) }, { CDF1(12031) }, + { CDF1(10640) }, { CDF1( 2901) }, + }, .comp_dir = { + { CDF1( 1198) }, { CDF1( 2070) }, { CDF1( 9166) }, + { CDF1( 7499) }, { CDF1(22475) }, + }, .jnt_comp = { + { CDF1(18244) }, { CDF1(12865) }, { CDF1( 7053) }, + { CDF1(13259) }, { CDF1( 9334) }, { CDF1( 4644) }, + }, .mask_comp = { + { CDF1(26607) }, { CDF1(22891) }, { CDF1(18840) }, + { CDF1(24594) }, { CDF1(19934) }, { CDF1(22674) }, + }, .wedge_comp = { + { CDF1(23431) }, { CDF1(13171) }, { CDF1(11470) }, + { CDF1( 9770) }, { CDF1( 9100) }, { CDF1( 8233) }, + { CDF1( 6172) }, { CDF1(11820) }, { CDF1( 7701) }, + }, .wedge_idx = { + { CDF15( 2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, + 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, + { CDF15( 806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, + 16323, 17367, 18452, 19422, 22839, 26127, 29629) }, + { CDF15( 2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, + 17939, 21332, 24520, 27470, 29456, 30529, 31656) }, + { CDF15( 1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, + 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, + { CDF15( 1142, 
3491, 6277, 7314, 8089, 8355, 9023, 13624, + 15369, 16730, 18114, 19313, 22521, 26012, 29550) }, + { CDF15( 2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, + 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, + { CDF15( 1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, + 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, + { CDF15( 154, 987, 1925, 2051, 2088, 2111, 2151, 23033, + 23703, 24284, 24985, 25684, 27259, 28883, 30911) }, + { CDF15( 1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, + 22935, 25057, 27251, 29173, 30089, 30960, 31933) }, + }, .interintra = { + { CDF1(16384) }, { CDF1(26887) }, { CDF1(27597) }, + { CDF1(30237) }, + }, .interintra_mode = { + { CDF3(8192, 16384, 24576) }, + { CDF3(1875, 11082, 27332) }, + { CDF3(2473, 9996, 26388) }, + { CDF3(4238, 11537, 25926) }, + }, .interintra_wedge = { + { CDF1(20036) }, { CDF1(24957) }, { CDF1(26704) }, + { CDF1(27530) }, { CDF1(29564) }, { CDF1(29444) }, + { CDF1(26872) }, + }, .ref = { + { { CDF1( 4897) }, { CDF1(16973) }, { CDF1(29744) } }, + { { CDF1( 1555) }, { CDF1(16751) }, { CDF1(30279) } }, + { { CDF1( 4236) }, { CDF1(19647) }, { CDF1(31194) } }, + { { CDF1( 8650) }, { CDF1(24773) }, { CDF1(31895) } }, + { { CDF1( 904) }, { CDF1(11014) }, { CDF1(26875) } }, + { { CDF1( 1444) }, { CDF1(15087) }, { CDF1(30304) } }, + }, .comp_fwd_ref = { + { { CDF1( 4946) }, { CDF1(19891) }, { CDF1(30731) } }, + { { CDF1( 9468) }, { CDF1(22441) }, { CDF1(31059) } }, + { { CDF1( 1503) }, { CDF1(15160) }, { CDF1(27544) } }, + }, .comp_bwd_ref = { + { { CDF1( 2235) }, { CDF1(17182) }, { CDF1(30606) } }, + { { CDF1( 1423) }, { CDF1(15175) }, { CDF1(30489) } }, + }, .comp_uni_ref = { + { { CDF1( 5284) }, { CDF1(23152) }, { CDF1(31774) } }, + { { CDF1( 3865) }, { CDF1(14173) }, { CDF1(25120) } }, + { { CDF1( 3128) }, { CDF1(15270) }, { CDF1(26710) } }, + }, .txsz = { + { + { CDF1(19968) }, { CDF1(19968) }, { CDF1(24320) }, }, { - { CDF4(24779, 26955, 28576, 30282) }, - { CDF4( 8669, 20364, 24073, 28093) }, - { CDF4( 
4255, 27565, 29377, 31067) }, - { CDF4(19864, 23674, 26716, 29530) }, - { CDF4(31646, 31893, 32147, 32426) }, + { CDF2(12272, 30172) }, { CDF2(12272, 30172) }, + { CDF2(18677, 30848) }, }, { - { CDF5(23132, 25407, 26970, 28435, 30073) }, - { CDF5( 7443, 17242, 20717, 24762, 27982) }, - { CDF5( 6300, 24862, 26944, 28784, 30671) }, - { CDF5(18916, 22895, 25267, 27435, 29652) }, - { CDF5(31270, 31550, 31808, 32059, 32353) }, + { CDF2(12986, 15180) }, { CDF2(12986, 15180) }, + { CDF2(24302, 25602) }, }, { - { CDF6(23105, 25199, 26464, 27684, 28931, 30318) }, - { CDF6( 6950, 15447, 18952, 22681, 25567, 28563) }, - { CDF6( 7560, 23474, 25490, 27203, 28921, 30708) }, - { CDF6(18544, 22373, 24457, 26195, 28119, 30045) }, - { CDF6(31198, 31451, 31670, 31882, 32123, 32391) }, + { CDF2( 5782, 11475) }, { CDF2( 5782, 11475) }, + { CDF2(16803, 22759) }, + }, + }, .txpart = { + { { CDF1(28581) }, { CDF1(23846) }, { CDF1(20847) } }, + { { CDF1(24315) }, { CDF1(18196) }, { CDF1(12133) } }, + { { CDF1(18791) }, { CDF1(10887) }, { CDF1(11005) } }, + { { CDF1(27179) }, { CDF1(20004) }, { CDF1(11281) } }, + { { CDF1(26549) }, { CDF1(19308) }, { CDF1(14224) } }, + { { CDF1(28015) }, { CDF1(21546) }, { CDF1(14400) } }, + { { CDF1(28165) }, { CDF1(22401) }, { CDF1(16088) } }, + }, .txtp_inter1 = { + { CDF15( 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, + 21504, 22848, 23934, 25474, 27727, 28915, 30631) }, + { CDF15( 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, + 17674, 20408, 22517, 25010, 27116, 28856, 30749) }, + }, .txtp_inter2 = { + CDF11( 770, 2421, 5225, 12907, 15819, 18927, + 21561, 24089, 26595, 28526, 30529) + }, .txtp_inter3 = { + { CDF1(16384) }, { CDF1( 4167) }, { CDF1( 1998) }, { CDF1( 748) }, + }, .txtp_intra1 = { + { + { CDF6( 1535, 8035, 9461, 12751, 23467, 27825) }, + { CDF6( 564, 3335, 9709, 10870, 18143, 28094) }, + { CDF6( 672, 3247, 3676, 11982, 19415, 23127) }, + { CDF6( 5279, 13885, 15487, 18044, 23527, 30252) }, + { CDF6( 4423, 6074, 7985, 10416, 
25693, 29298) }, + { CDF6( 1486, 4241, 9460, 10662, 16456, 27694) }, + { CDF6( 439, 2838, 3522, 6737, 18058, 23754) }, + { CDF6( 1190, 4233, 4855, 11670, 20281, 24377) }, + { CDF6( 1045, 4312, 8647, 10159, 18644, 29335) }, + { CDF6( 202, 3734, 4747, 7298, 17127, 24016) }, + { CDF6( 447, 4312, 6819, 8884, 16010, 23858) }, + { CDF6( 277, 4369, 5255, 8905, 16465, 22271) }, + { CDF6( 3409, 5436, 10599, 15599, 19687, 24040) }, }, { - { CDF7(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, - { CDF7( 6892, 15385, 17840, 21606, 24287, 26753, 29204) }, - { CDF7( 5651, 23182, 25042, 26518, 27982, 29392, 30900) }, - { CDF7(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, - { CDF7(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, + { CDF6( 1870, 13742, 14530, 16498, 23770, 27698) }, + { CDF6( 326, 8796, 14632, 15079, 19272, 27486) }, + { CDF6( 484, 7576, 7712, 14443, 19159, 22591) }, + { CDF6( 1126, 15340, 15895, 17023, 20896, 30279) }, + { CDF6( 655, 4854, 5249, 5913, 22099, 27138) }, + { CDF6( 1299, 6458, 8885, 9290, 14851, 25497) }, + { CDF6( 311, 5295, 5552, 6885, 16107, 22672) }, + { CDF6( 883, 8059, 8270, 11258, 17289, 21549) }, + { CDF6( 741, 7580, 9318, 10345, 16688, 29046) }, + { CDF6( 110, 7406, 7915, 9195, 16041, 23329) }, + { CDF6( 363, 7974, 9357, 10673, 15629, 24474) }, + { CDF6( 153, 7647, 8112, 9936, 15307, 19996) }, + { CDF6( 3511, 6332, 11165, 15335, 19323, 23594) }, }, - }, { /* uv */ + }, .txtp_intra2 = { { - { CDF1(29089) }, { CDF1(16384) }, { CDF1( 8713) }, - { CDF1(29257) }, { CDF1(31610) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 
19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + }, { + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, + { CDF4( 6554, 13107, 19661, 26214) }, }, { - { CDF2(25257, 29145) }, { CDF2(12287, 27293) }, - { CDF2( 7033, 27960) }, { CDF2(20145, 25405) }, - { CDF2(30608, 31639) }, + { CDF4( 1127, 12814, 22772, 27483) }, + { CDF4( 145, 6761, 11980, 26667) }, + { CDF4( 362, 5887, 11678, 16725) }, + { CDF4( 385, 15213, 18587, 30693) }, + { CDF4( 25, 2914, 23134, 27903) }, + { CDF4( 60, 4470, 11749, 23991) }, + { CDF4( 37, 3332, 14511, 21448) }, + { CDF4( 157, 6320, 13036, 17439) }, + { CDF4( 119, 6719, 12906, 29396) }, + { CDF4( 47, 5537, 12576, 21499) }, + { CDF4( 269, 6076, 11258, 23115) }, + { CDF4( 83, 5615, 12001, 17228) }, + { CDF4( 1968, 5556, 12023, 18547) }, + }, + }, .skip = { + { CDF1(31671) }, { CDF1(16515) }, { CDF1( 4576) }, + }, .skip_mode = { + { CDF1(32621) }, { CDF1(20708) }, { CDF1( 8127) }, + }, .partition = { + { + // 128x128 -> 64x64 + { CDF7(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, + { CDF7( 6607, 6990, 8268, 32060, 32219, 32338, 32371) }, + { CDF7( 5429, 6676, 7122, 32027, 32227, 32531, 32582) }, + { CDF7( 711, 966, 1172, 32448, 32538, 32617, 32664) }, }, { - { CDF3(24210, 27175, 29903) }, - { CDF3( 9888, 22386, 27214) }, - { CDF3( 5901, 26053, 29293) }, - { CDF3(18318, 22152, 28333) }, - { CDF3(30459, 31136, 31926) }, + // 64x64 -> 32x32 + { CDF9(20137, 21547, 23078, 29566, 29837, + 30261, 30524, 30892, 31724) }, + { CDF9( 6732, 7490, 9497, 27944, 28250, + 28515, 28969, 29630, 30104) }, + { CDF9( 5945, 7663, 
8348, 28683, 29117, + 29749, 30064, 30298, 32238) }, + { CDF9( 870, 1212, 1487, 31198, 31394, + 31574, 31743, 31881, 32332) }, }, { - { CDF4(22980, 25479, 27781, 29986) }, - { CDF4( 8413, 21408, 24859, 28874) }, - { CDF4( 2257, 29449, 30594, 31598) }, - { CDF4(19189, 21202, 25915, 28620) }, - { CDF4(31844, 32044, 32281, 32518) }, + // 32x32 -> 16x16 + { CDF9(18462, 20920, 23124, 27647, 28227, + 29049, 29519, 30178, 31544) }, + { CDF9( 7689, 9060, 12056, 24992, 25660, + 26182, 26951, 28041, 29052) }, + { CDF9( 6015, 9009, 10062, 24544, 25409, + 26545, 27071, 27526, 32047) }, + { CDF9( 1394, 2208, 2796, 28614, 29061, + 29466, 29840, 30185, 31899) }, }, { - { CDF5(22217, 24567, 26637, 28683, 30548) }, - { CDF5( 7307, 16406, 19636, 24632, 28424) }, - { CDF5( 4441, 25064, 26879, 28942, 30919) }, - { CDF5(17210, 20528, 23319, 26750, 29582) }, - { CDF5(30674, 30953, 31396, 31735, 32207) }, + // 16x16 -> 8x8 + { CDF9(15597, 20929, 24571, 26706, 27664, + 28821, 29601, 30571, 31902) }, + { CDF9( 7925, 11043, 16785, 22470, 23971, + 25043, 26651, 28701, 29834) }, + { CDF9( 5414, 13269, 15111, 20488, 22360, + 24500, 25537, 26336, 32117) }, + { CDF9( 2662, 6362, 8614, 20860, 23053, + 24778, 26436, 27829, 31171) }, }, { - { CDF6(21239, 23168, 25044, 26962, 28705, 30506) }, - { CDF6( 6545, 15012, 18004, 21817, 25503, 28701) }, - { CDF6( 3448, 26295, 27437, 28704, 30126, 31442) }, - { CDF6(15889, 18323, 21704, 24698, 26976, 29690) }, - { CDF6(30988, 31204, 31479, 31734, 31983, 32325) }, + // 8x8 -> 4x4 only supports the four legacy partition types + { CDF3(19132, 25510, 30392) }, + { CDF3(13928, 19855, 28540) }, + { CDF3(12522, 23679, 28629) }, + { CDF3( 9896, 18783, 25853) }, + }, + }, .seg_pred = { + { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, + }, .seg_id = { + { CDF7( 5622, 7893, 16093, 18233, 27809, 28373, 32533) }, + { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) }, + { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) }, + }, .cfl_sign = { + CDF7( 1418, 
2123, 13340, 18405, 26972, 28343, 32294) + }, .cfl_alpha = { + { CDF15( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, + 32700, 32704, 32708, 32712, 32716, 32720, 32724) }, + { CDF15(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, + 32620, 32647, 32668, 32672, 32676, 32680, 32684) }, + { CDF15(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, + 32673, 32677, 32681, 32685, 32689, 32693, 32697) }, + { CDF15(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, + 32708, 32712, 32716, 32720, 32724, 32728, 32732) }, + { CDF15(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, + 32394, 32464, 32516, 32560, 32576, 32593, 32622) }, + { CDF15(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, + 32144, 32413, 32520, 32594, 32622, 32656, 32660) }, + }, .restore_wiener = { + CDF1(11570) + }, .restore_sgrproj = { + CDF1(16855) + }, .restore_switchable = { + CDF2( 9413, 22581) + }, .delta_q = { + CDF3(28160, 32120, 32677) + }, .delta_lf = { + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + { CDF3(28160, 32120, 32677) }, + }, .motion_mode = { + [BS_8x8] = { CDF2( 7651, 24760) }, + [BS_8x16] = { CDF2( 4738, 24765) }, + [BS_8x32] = { CDF2(28799, 31390) }, + [BS_16x8] = { CDF2( 5391, 25528) }, + [BS_16x16] = { CDF2(19419, 26810) }, + [BS_16x32] = { CDF2( 5123, 23606) }, + [BS_16x64] = { CDF2(28973, 31594) }, + [BS_32x8] = { CDF2(26431, 30774) }, + [BS_32x16] = { CDF2(11606, 24308) }, + [BS_32x32] = { CDF2(26260, 29116) }, + [BS_32x64] = { CDF2(20360, 28062) }, + [BS_64x16] = { CDF2(29742, 31203) }, + [BS_64x32] = { CDF2(21679, 26830) }, + [BS_64x64] = { CDF2(29516, 30701) }, + [BS_64x128] = { CDF2(28898, 30397) }, + [BS_128x64] = { CDF2(30878, 31335) }, + [BS_128x128] = { CDF2(32507, 32558) }, + }, .obmc = { + [BS_8x8] = { CDF1(10437) }, + [BS_8x16] = { CDF1( 9371) }, + [BS_8x32] = { CDF1(23664) }, + [BS_16x8] = { CDF1( 9301) }, + [BS_16x16] = { CDF1(17432) }, + 
[BS_16x32] = { CDF1(14423) }, + [BS_16x64] = { CDF1(24008) }, + [BS_32x8] = { CDF1(20901) }, + [BS_32x16] = { CDF1(15142) }, + [BS_32x32] = { CDF1(25817) }, + [BS_32x64] = { CDF1(22823) }, + [BS_64x16] = { CDF1(26879) }, + [BS_64x32] = { CDF1(22083) }, + [BS_64x64] = { CDF1(30128) }, + [BS_64x128] = { CDF1(31014) }, + [BS_128x64] = { CDF1(31560) }, + [BS_128x128] = { CDF1(32638) }, + }, .pal_y = { + { { CDF1(31676) }, { CDF1( 3419) }, { CDF1( 1261) } }, + { { CDF1(31912) }, { CDF1( 2859) }, { CDF1( 980) } }, + { { CDF1(31823) }, { CDF1( 3400) }, { CDF1( 781) } }, + { { CDF1(32030) }, { CDF1( 3561) }, { CDF1( 904) } }, + { { CDF1(32309) }, { CDF1( 7337) }, { CDF1( 1462) } }, + { { CDF1(32265) }, { CDF1( 4015) }, { CDF1( 1521) } }, + { { CDF1(32450) }, { CDF1( 7946) }, { CDF1( 129) } }, + }, .pal_sz = { + { + { CDF6( 7952, 13000, 18149, 21478, 25527, 29241) }, + { CDF6( 7139, 11421, 16195, 19544, 23666, 28073) }, + { CDF6( 7788, 12741, 17325, 20500, 24315, 28530) }, + { CDF6( 8271, 14064, 18246, 21564, 25071, 28533) }, + { CDF6(12725, 19180, 21863, 24839, 27535, 30120) }, + { CDF6( 9711, 14888, 16923, 21052, 25661, 27875) }, + { CDF6(14940, 20797, 21678, 24186, 27033, 28999) }, }, { - { CDF7(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, - { CDF7( 5863, 14933, 17552, 20668, 23683, 26411, 29273) }, - { CDF7( 3415, 25810, 26877, 27990, 29223, 30394, 31618) }, - { CDF7(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, - { CDF7(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, + { CDF6( 8713, 19979, 27128, 29609, 31331, 32272) }, + { CDF6( 5839, 15573, 23581, 26947, 29848, 31700) }, + { CDF6( 4426, 11260, 17999, 21483, 25863, 29430) }, + { CDF6( 3228, 9464, 14993, 18089, 22523, 27420) }, + { CDF6( 3768, 8886, 13091, 17852, 22495, 27207) }, + { CDF6( 2464, 8451, 12861, 21632, 25525, 28555) }, + { CDF6( 1269, 5435, 10433, 18963, 21700, 25865) }, + }, + }, .pal_uv = { + { CDF1(32461) }, { CDF1(21488) }, + }, .color_map = { + { /* y */ + { + { CDF1(28710) }, { 
CDF1(16384) }, { CDF1(10553) }, + { CDF1(27036) }, { CDF1(31603) }, + }, { + { CDF2(27877, 30490) }, { CDF2(11532, 25697) }, + { CDF2( 6544, 30234) }, { CDF2(23018, 28072) }, + { CDF2(31915, 32385) }, + }, { + { CDF3(25572, 28046, 30045) }, + { CDF3( 9478, 21590, 27256) }, + { CDF3( 7248, 26837, 29824) }, + { CDF3(19167, 24486, 28349) }, + { CDF3(31400, 31825, 32250) }, + }, { + { CDF4(24779, 26955, 28576, 30282) }, + { CDF4( 8669, 20364, 24073, 28093) }, + { CDF4( 4255, 27565, 29377, 31067) }, + { CDF4(19864, 23674, 26716, 29530) }, + { CDF4(31646, 31893, 32147, 32426) }, + }, { + { CDF5(23132, 25407, 26970, 28435, 30073) }, + { CDF5( 7443, 17242, 20717, 24762, 27982) }, + { CDF5( 6300, 24862, 26944, 28784, 30671) }, + { CDF5(18916, 22895, 25267, 27435, 29652) }, + { CDF5(31270, 31550, 31808, 32059, 32353) }, + }, { + { CDF6(23105, 25199, 26464, 27684, 28931, 30318) }, + { CDF6( 6950, 15447, 18952, 22681, 25567, 28563) }, + { CDF6( 7560, 23474, 25490, 27203, 28921, 30708) }, + { CDF6(18544, 22373, 24457, 26195, 28119, 30045) }, + { CDF6(31198, 31451, 31670, 31882, 32123, 32391) }, + }, { + { CDF7(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, + { CDF7( 6892, 15385, 17840, 21606, 24287, 26753, 29204) }, + { CDF7( 5651, 23182, 25042, 26518, 27982, 29392, 30900) }, + { CDF7(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, + { CDF7(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, + }, + }, { /* uv */ + { + { CDF1(29089) }, { CDF1(16384) }, { CDF1( 8713) }, + { CDF1(29257) }, { CDF1(31610) }, + }, { + { CDF2(25257, 29145) }, { CDF2(12287, 27293) }, + { CDF2( 7033, 27960) }, { CDF2(20145, 25405) }, + { CDF2(30608, 31639) }, + }, { + { CDF3(24210, 27175, 29903) }, + { CDF3( 9888, 22386, 27214) }, + { CDF3( 5901, 26053, 29293) }, + { CDF3(18318, 22152, 28333) }, + { CDF3(30459, 31136, 31926) }, + }, { + { CDF4(22980, 25479, 27781, 29986) }, + { CDF4( 8413, 21408, 24859, 28874) }, + { CDF4( 2257, 29449, 30594, 31598) }, + { CDF4(19189, 21202, 25915, 28620) }, + 
{ CDF4(31844, 32044, 32281, 32518) }, + }, { + { CDF5(22217, 24567, 26637, 28683, 30548) }, + { CDF5( 7307, 16406, 19636, 24632, 28424) }, + { CDF5( 4441, 25064, 26879, 28942, 30919) }, + { CDF5(17210, 20528, 23319, 26750, 29582) }, + { CDF5(30674, 30953, 31396, 31735, 32207) }, + }, { + { CDF6(21239, 23168, 25044, 26962, 28705, 30506) }, + { CDF6( 6545, 15012, 18004, 21817, 25503, 28701) }, + { CDF6( 3448, 26295, 27437, 28704, 30126, 31442) }, + { CDF6(15889, 18323, 21704, 24698, 26976, 29690) }, + { CDF6(30988, 31204, 31479, 31734, 31983, 32325) }, + }, { + { CDF7(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, + { CDF7( 5863, 14933, 17552, 20668, 23683, 26411, 29273) }, + { CDF7( 3415, 25810, 26877, 27990, 29223, 30394, 31618) }, + { CDF7(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, + { CDF7(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, + }, }, + }, .intrabc = { + CDF1(30531) + }, + }, .mv = { + .comp = { + .classes = { + CDF10(28672, 30976, 31858, 32320, 32551, + 32656, 32740, 32757, 32762, 32767) + }, .class0 = { + CDF1(27648) + }, .classN = { + { CDF1(17408) }, { CDF1(17920) }, { CDF1(18944) }, + { CDF1(20480) }, { CDF1(22528) }, { CDF1(24576) }, + { CDF1(28672) }, { CDF1(29952) }, { CDF1(29952) }, + { CDF1(30720) }, + }, .class0_fp = { + { CDF3(16384, 24576, 26624) }, + { CDF3(12288, 21248, 24128) }, + }, .classN_fp = { + CDF3( 8192, 17408, 21248) + }, .class0_hp = { + CDF1(20480) + }, .classN_hp = { + CDF1(16384) + }, .sign = { + CDF1(16384) + }, + }, .joint = { + CDF3( 4096, 11264, 19328) + }, + }, .kfym = { + { + { CDF12(15588, 17027, 19338, 20218, 20682, 21110, + 21825, 23244, 24189, 28165, 29093, 30466) }, + { CDF12(12016, 18066, 19516, 20303, 20719, 21444, + 21888, 23032, 24434, 28658, 30172, 31409) }, + { CDF12(10052, 10771, 22296, 22788, 23055, 23239, + 24133, 25620, 26160, 29336, 29929, 31567) }, + { CDF12(14091, 15406, 16442, 18808, 19136, 19546, + 19998, 22096, 24746, 29585, 30958, 32462) }, + { CDF12(12122, 13265, 15603, 16501, 
18609, 20033, + 22391, 25583, 26437, 30261, 31073, 32475) }, + }, { + { CDF12(10023, 19585, 20848, 21440, 21832, 22760, + 23089, 24023, 25381, 29014, 30482, 31436) }, + { CDF12( 5983, 24099, 24560, 24886, 25066, 25795, + 25913, 26423, 27610, 29905, 31276, 31794) }, + { CDF12( 7444, 12781, 20177, 20728, 21077, 21607, + 22170, 23405, 24469, 27915, 29090, 30492) }, + { CDF12( 8537, 14689, 15432, 17087, 17408, 18172, + 18408, 19825, 24649, 29153, 31096, 32210) }, + { CDF12( 7543, 14231, 15496, 16195, 17905, 20717, + 21984, 24516, 26001, 29675, 30981, 31994) }, + }, { + { CDF12(12613, 13591, 21383, 22004, 22312, 22577, + 23401, 25055, 25729, 29538, 30305, 32077) }, + { CDF12( 9687, 13470, 18506, 19230, 19604, 20147, + 20695, 22062, 23219, 27743, 29211, 30907) }, + { CDF12( 6183, 6505, 26024, 26252, 26366, 26434, + 27082, 28354, 28555, 30467, 30794, 32086) }, + { CDF12(10718, 11734, 14954, 17224, 17565, 17924, + 18561, 21523, 23878, 28975, 30287, 32252) }, + { CDF12( 9194, 9858, 16501, 17263, 18424, 19171, + 21563, 25961, 26561, 30072, 30737, 32463) }, + }, { + { CDF12(12602, 14399, 15488, 18381, 18778, 19315, + 19724, 21419, 25060, 29696, 30917, 32409) }, + { CDF12( 8203, 13821, 14524, 17105, 17439, 18131, + 18404, 19468, 25225, 29485, 31158, 32342) }, + { CDF12( 8451, 9731, 15004, 17643, 18012, 18425, + 19070, 21538, 24605, 29118, 30078, 32018) }, + { CDF12( 7714, 9048, 9516, 16667, 16817, 16994, + 17153, 18767, 26743, 30389, 31536, 32528) }, + { CDF12( 8843, 10280, 11496, 15317, 16652, 17943, + 19108, 22718, 25769, 29953, 30983, 32485) }, + }, { + { CDF12(12578, 13671, 15979, 16834, 19075, 20913, + 22989, 25449, 26219, 30214, 31150, 32477) }, + { CDF12( 9563, 13626, 15080, 15892, 17756, 20863, + 22207, 24236, 25380, 29653, 31143, 32277) }, + { CDF12( 8356, 8901, 17616, 18256, 19350, 20106, + 22598, 25947, 26466, 29900, 30523, 32261) }, + { CDF12(10835, 11815, 13124, 16042, 17018, 18039, + 18947, 22753, 24615, 29489, 30883, 32482) }, + { CDF12( 7618, 8288, 9859, 10509, 
15386, 18657, + 22903, 28776, 29180, 31355, 31802, 32593) }, }, - }, .intrabc = { - CDF1(30531) - }, -}; - -static const CdfMvComponent default_mv_component_cdf = { - .classes = { - CDF10(28672, 30976, 31858, 32320, 32551, - 32656, 32740, 32757, 32762, 32767) - }, .class0 = { - CDF1(27648) - }, .classN = { - { CDF1(17408) }, { CDF1(17920) }, { CDF1(18944) }, - { CDF1(20480) }, { CDF1(22528) }, { CDF1(24576) }, - { CDF1(28672) }, { CDF1(29952) }, { CDF1(29952) }, - { CDF1(30720) }, - }, .class0_fp = { - { CDF3(16384, 24576, 26624) }, - { CDF3(12288, 21248, 24128) }, - }, .classN_fp = { - CDF3( 8192, 17408, 21248) - }, .class0_hp = { - CDF1(20480) - }, .classN_hp = { - CDF1(16384) - }, .sign = { - CDF1(16384) - }, -}; - -static const uint16_t ALIGN(default_mv_joint_cdf[N_MV_JOINTS], 8) = { - CDF3( 4096, 11264, 19328) -}; - -static const uint16_t ALIGN(default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 3], 32) = { - { - { CDF12(15588, 17027, 19338, 20218, 20682, 21110, - 21825, 23244, 24189, 28165, 29093, 30466) }, - { CDF12(12016, 18066, 19516, 20303, 20719, 21444, - 21888, 23032, 24434, 28658, 30172, 31409) }, - { CDF12(10052, 10771, 22296, 22788, 23055, 23239, - 24133, 25620, 26160, 29336, 29929, 31567) }, - { CDF12(14091, 15406, 16442, 18808, 19136, 19546, - 19998, 22096, 24746, 29585, 30958, 32462) }, - { CDF12(12122, 13265, 15603, 16501, 18609, 20033, - 22391, 25583, 26437, 30261, 31073, 32475) }, - }, { - { CDF12(10023, 19585, 20848, 21440, 21832, 22760, - 23089, 24023, 25381, 29014, 30482, 31436) }, - { CDF12( 5983, 24099, 24560, 24886, 25066, 25795, - 25913, 26423, 27610, 29905, 31276, 31794) }, - { CDF12( 7444, 12781, 20177, 20728, 21077, 21607, - 22170, 23405, 24469, 27915, 29090, 30492) }, - { CDF12( 8537, 14689, 15432, 17087, 17408, 18172, - 18408, 19825, 24649, 29153, 31096, 32210) }, - { CDF12( 7543, 14231, 15496, 16195, 17905, 20717, - 21984, 24516, 26001, 29675, 30981, 31994) }, - }, { - { CDF12(12613, 13591, 21383, 22004, 22312, 22577, - 23401, 25055, 
25729, 29538, 30305, 32077) }, - { CDF12( 9687, 13470, 18506, 19230, 19604, 20147, - 20695, 22062, 23219, 27743, 29211, 30907) }, - { CDF12( 6183, 6505, 26024, 26252, 26366, 26434, - 27082, 28354, 28555, 30467, 30794, 32086) }, - { CDF12(10718, 11734, 14954, 17224, 17565, 17924, - 18561, 21523, 23878, 28975, 30287, 32252) }, - { CDF12( 9194, 9858, 16501, 17263, 18424, 19171, - 21563, 25961, 26561, 30072, 30737, 32463) }, - }, { - { CDF12(12602, 14399, 15488, 18381, 18778, 19315, - 19724, 21419, 25060, 29696, 30917, 32409) }, - { CDF12( 8203, 13821, 14524, 17105, 17439, 18131, - 18404, 19468, 25225, 29485, 31158, 32342) }, - { CDF12( 8451, 9731, 15004, 17643, 18012, 18425, - 19070, 21538, 24605, 29118, 30078, 32018) }, - { CDF12( 7714, 9048, 9516, 16667, 16817, 16994, - 17153, 18767, 26743, 30389, 31536, 32528) }, - { CDF12( 8843, 10280, 11496, 15317, 16652, 17943, - 19108, 22718, 25769, 29953, 30983, 32485) }, - }, { - { CDF12(12578, 13671, 15979, 16834, 19075, 20913, - 22989, 25449, 26219, 30214, 31150, 32477) }, - { CDF12( 9563, 13626, 15080, 15892, 17756, 20863, - 22207, 24236, 25380, 29653, 31143, 32277) }, - { CDF12( 8356, 8901, 17616, 18256, 19350, 20106, - 22598, 25947, 26466, 29900, 30523, 32261) }, - { CDF12(10835, 11815, 13124, 16042, 17018, 18039, - 18947, 22753, 24615, 29489, 30883, 32482) }, - { CDF12( 7618, 8288, 9859, 10509, 15386, 18657, - 22903, 28776, 29180, 31355, 31802, 32593) }, }, }; -static const CdfCoefContext av1_default_coef_cdf[4] = { +static const CdfCoefContext default_coef_cdf[4] = { [0] = { .skip = { { @@ -3951,10 +3958,8 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr, { #define update_cdf_1d(n1d, name) \ do { \ - memcpy(dst->name, src->name, sizeof(dst->name)); \ dst->name[n1d] = 0; \ } while (0) - #define update_cdf_2d(n1d, n2d, name) \ for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j]) #define update_cdf_3d(n1d, n2d, n3d, name) \ @@ -3962,29 +3967,8 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader 
*const hdr, #define update_cdf_4d(n1d, n2d, n3d, n4d, name) \ for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l]) -#define update_bit_0d(name) \ - do { \ - dst->name[0] = src->name[0]; \ - dst->name[1] = 0; \ - } while (0) - -#define update_bit_1d(n1d, name) \ - for (int i = 0; i < (n1d); i++) update_bit_0d(name[i]) -#define update_bit_2d(n1d, n2d, name) \ - for (int j = 0; j < (n1d); j++) update_bit_1d(n2d, name[j]) -#define update_bit_3d(n1d, n2d, n3d, name) \ - for (int k = 0; k < (n1d); k++) update_bit_2d(n2d, n3d, name[k]) + memcpy(dst, src, offsetof(CdfContext, m.intrabc)); - update_bit_1d(N_BS_SIZES, m.use_filter_intra); - update_cdf_1d(4, m.filter_intra); - update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode); - update_cdf_2d(8, 6, m.angle_delta); - update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz); - update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1); - update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2); - update_bit_1d(3, m.skip); - update_cdf_3d(N_BL_LEVELS, 4, dav1d_partition_type_count[k], m.partition); - update_bit_2d(N_TX_SIZES, 13, coef.skip); update_cdf_3d(2, 2, 4, coef.eob_bin_16); update_cdf_3d(2, 2, 5, coef.eob_bin_32); update_cdf_3d(2, 2, 6, coef.eob_bin_64); @@ -3992,106 +3976,104 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr, update_cdf_3d(2, 2, 8, coef.eob_bin_256); update_cdf_2d(2, 9, coef.eob_bin_512); update_cdf_2d(2, 10, coef.eob_bin_1024); - update_bit_3d(N_TX_SIZES, 2, 11 /*22*/, coef.eob_hi_bit); update_cdf_4d(N_TX_SIZES, 2, 4, 2, coef.eob_base_tok); update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 3, coef.base_tok); - update_bit_2d(2, 3, coef.dc_sign); update_cdf_4d(4, 2, 21, 3, coef.br_tok); - update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id); - update_cdf_1d(7, m.cfl_sign); + update_cdf_4d(N_TX_SIZES, 2, 11 /*22*/, 1, coef.eob_hi_bit); + update_cdf_3d(N_TX_SIZES, 13, 1, coef.skip); + update_cdf_3d(2, 3, 1, coef.dc_sign); + + update_cdf_3d(2, N_INTRA_PRED_MODES, 
N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode); + update_cdf_2d(4, N_PARTITIONS - 3, m.partition[BL_128X128]); + for (int k = BL_64X64; k < BL_8X8; k++) + update_cdf_2d(4, N_PARTITIONS - 1, m.partition[k]); + update_cdf_2d(4, N_SUB8X8_PARTITIONS - 1, m.partition[BL_8X8]); update_cdf_2d(6, 15, m.cfl_alpha); - update_bit_0d(m.restore_wiener); - update_bit_0d(m.restore_sgrproj); - update_cdf_1d(2, m.restore_switchable); - update_cdf_1d(3, m.delta_q); - update_cdf_2d(5, 3, m.delta_lf); - update_bit_2d(7, 3, m.pal_y); - update_bit_1d(2, m.pal_uv); - update_cdf_3d(2, 7, 6, m.pal_sz); - update_cdf_4d(2, 7, 5, k + 1, m.color_map); - update_bit_2d(7, 3, m.txpart); update_cdf_2d(2, 15, m.txtp_inter1); update_cdf_1d(11, m.txtp_inter2); - update_bit_1d(4, m.txtp_inter3); - - if (IS_KEY_OR_INTRA(hdr)) { - update_bit_0d(m.intrabc); + update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1); + update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2); + update_cdf_1d(7, m.cfl_sign); + update_cdf_2d(8, 6, m.angle_delta); + update_cdf_1d(4, m.filter_intra); + update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id); + update_cdf_3d(2, 7, 6, m.pal_sz); + update_cdf_4d(2, 7, 5, k + 1, m.color_map); + update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz); + update_cdf_1d(3, m.delta_q); + update_cdf_2d(5, 3, m.delta_lf); + update_cdf_1d(2, m.restore_switchable); + update_cdf_1d(1, m.restore_wiener); + update_cdf_1d(1, m.restore_sgrproj); + update_cdf_2d(4, 1, m.txtp_inter3); + update_cdf_2d(N_BS_SIZES, 1, m.use_filter_intra); + update_cdf_3d(7, 3, 1, m.txpart); + update_cdf_2d(3, 1, m.skip); + update_cdf_3d(7, 3, 1, m.pal_y); + update_cdf_2d(2, 1, m.pal_uv); - update_cdf_1d(N_MV_JOINTS - 1, dmv.joint); - for (int k = 0; k < 2; k++) { - update_cdf_1d(10, dmv.comp[k].classes); - update_bit_0d(dmv.comp[k].class0); - update_bit_1d(10, dmv.comp[k].classN); - update_bit_0d(dmv.comp[k].sign); - } + if (IS_KEY_OR_INTRA(hdr)) return; - } - update_bit_1d(3, m.skip_mode); + memcpy(dst->m.y_mode, 
src->m.y_mode, + offsetof(CdfContext, kfym) - offsetof(CdfContext, m.y_mode)); + update_cdf_2d(4, N_INTRA_PRED_MODES - 1, m.y_mode); - update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter); - update_bit_1d(6, m.newmv_mode); - update_bit_1d(2, m.globalmv_mode); - update_bit_1d(6, m.refmv_mode); - update_bit_1d(3, m.drl_bit); - update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode); - update_bit_1d(4, m.intra); - update_bit_1d(5, m.comp); - update_bit_1d(5, m.comp_dir); - update_bit_1d(6, m.jnt_comp); - update_bit_1d(6, m.mask_comp); - update_bit_1d(9, m.wedge_comp); update_cdf_2d(9, 15, m.wedge_idx); - update_bit_2d(6, 3, m.ref); - update_bit_2d(3, 3, m.comp_fwd_ref); - update_bit_2d(2, 3, m.comp_bwd_ref); - update_bit_2d(3, 3, m.comp_uni_ref); - update_bit_1d(3, m.seg_pred); - update_bit_1d(4, m.interintra); - update_bit_1d(7, m.interintra_wedge); + update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode); + update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter); update_cdf_2d(4, 3, m.interintra_mode); update_cdf_2d(N_BS_SIZES, 2, m.motion_mode); - update_bit_1d(N_BS_SIZES, m.obmc); + update_cdf_2d(3, 1, m.skip_mode); + update_cdf_2d(6, 1, m.newmv_mode); + update_cdf_2d(2, 1, m.globalmv_mode); + update_cdf_2d(6, 1, m.refmv_mode); + update_cdf_2d(3, 1, m.drl_bit); + update_cdf_2d(4, 1, m.intra); + update_cdf_2d(5, 1, m.comp); + update_cdf_2d(5, 1, m.comp_dir); + update_cdf_2d(6, 1, m.jnt_comp); + update_cdf_2d(6, 1, m.mask_comp); + update_cdf_2d(9, 1, m.wedge_comp); + update_cdf_3d(6, 3, 1, m.ref); + update_cdf_3d(3, 3, 1, m.comp_fwd_ref); + update_cdf_3d(2, 3, 1, m.comp_bwd_ref); + update_cdf_3d(3, 3, 1, m.comp_uni_ref); + update_cdf_2d(3, 1, m.seg_pred); + update_cdf_2d(4, 1, m.interintra); + update_cdf_2d(7, 1, m.interintra_wedge); + update_cdf_2d(N_BS_SIZES, 1, m.obmc); - update_cdf_1d(N_MV_JOINTS - 1, mv.joint); for (int k = 0; k < 2; k++) { update_cdf_1d(10, mv.comp[k].classes); - update_bit_0d(mv.comp[k].class0); - 
update_bit_1d(10, mv.comp[k].classN); + update_cdf_1d(1, mv.comp[k].sign); + update_cdf_1d(1, mv.comp[k].class0); update_cdf_2d(2, 3, mv.comp[k].class0_fp); + update_cdf_1d(1, mv.comp[k].class0_hp); + update_cdf_2d(10, 1, mv.comp[k].classN); update_cdf_1d(3, mv.comp[k].classN_fp); - update_bit_0d(mv.comp[k].class0_hp); - update_bit_0d(mv.comp[k].classN_hp); - update_bit_0d(mv.comp[k].sign); + update_cdf_1d(1, mv.comp[k].classN_hp); } + update_cdf_1d(N_MV_JOINTS - 1, mv.joint); } /* * CDF threading wrappers. */ -static inline int get_qcat_idx(const int q) { - if (q <= 20) return 0; - if (q <= 60) return 1; - if (q <= 120) return 2; - return 3; -} - -void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const int qidx) { +void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const unsigned qidx) { cdf->ref = NULL; - cdf->data.qcat = get_qcat_idx(qidx); + cdf->data.qcat = (qidx > 20) + (qidx > 60) + (qidx > 120); } void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const src) { if (src->ref) { memcpy(dst, src->data.cdf, sizeof(*dst)); } else { - dst->m = av1_default_cdf; - memcpy(dst->kfym, default_kf_y_mode_cdf, sizeof(default_kf_y_mode_cdf)); - dst->coef = av1_default_coef_cdf[src->data.qcat]; - memcpy(dst->mv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf)); - memcpy(dst->dmv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf)); - dst->mv.comp[0] = dst->mv.comp[1] = dst->dmv.comp[0] = dst->dmv.comp[1] = - default_mv_component_cdf; + dst->coef = default_coef_cdf[src->data.qcat]; + memcpy(&dst->m, &default_cdf.m, + offsetof(CdfDefaultContext, mv.joint)); + memcpy(&dst->mv.comp[1], &default_cdf.mv.comp, + sizeof(default_cdf) - offsetof(CdfDefaultContext, mv.comp)); } } diff --git a/third_party/dav1d/src/cdf.h b/third_party/dav1d/src/cdf.h index 4b30474baa..c9b516dc72 100644 --- a/third_party/dav1d/src/cdf.h +++ b/third_party/dav1d/src/cdf.h @@ -34,12 +34,10 @@ #include "src/ref.h" #include "src/thread_data.h" 
-/* Buffers padded to [8] or [16] for SIMD where needed. */ +/* Buffers padded to [4]/[8]/[16] for SIMD where needed. */ typedef struct CdfModeContext { - ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32); ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32); - ALIGN(uint16_t wedge_idx[9][16], 32); ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32); ALIGN(uint16_t cfl_alpha[6][16], 32); ALIGN(uint16_t txtp_inter1[2][16], 32); @@ -49,23 +47,33 @@ typedef struct CdfModeContext { ALIGN(uint16_t cfl_sign[8], 16); ALIGN(uint16_t angle_delta[8][8], 16); ALIGN(uint16_t filter_intra[5 + 3], 16); - ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16); ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16); ALIGN(uint16_t pal_sz[2][7][7 + 1], 16); ALIGN(uint16_t color_map[2][7][5][8], 16); - ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8); ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8); - ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8); ALIGN(uint16_t delta_q[4], 8); ALIGN(uint16_t delta_lf[5][4], 8); - ALIGN(uint16_t interintra_mode[4][4], 8); ALIGN(uint16_t restore_switchable[3 + 1], 8); ALIGN(uint16_t restore_wiener[2], 4); ALIGN(uint16_t restore_sgrproj[2], 4); - ALIGN(uint16_t interintra[7][2], 4); - ALIGN(uint16_t interintra_wedge[7][2], 4); ALIGN(uint16_t txtp_inter3[4][2], 4); ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4); + ALIGN(uint16_t txpart[7][3][2], 4); + ALIGN(uint16_t skip[3][2], 4); + ALIGN(uint16_t pal_y[7][3][2], 4); + ALIGN(uint16_t pal_uv[2][2], 4); + + /* key/intra */ + ALIGN(uint16_t intrabc[2], 4); + + /* inter/switch */ + ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32); + ALIGN(uint16_t wedge_idx[9][16], 32); + ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16); + ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8); + ALIGN(uint16_t interintra_mode[4][4], 8); + ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8); + ALIGN(uint16_t skip_mode[3][2], 4); 
ALIGN(uint16_t newmv_mode[6][2], 4); ALIGN(uint16_t globalmv_mode[2][2], 4); ALIGN(uint16_t refmv_mode[6][2], 4); @@ -80,14 +88,10 @@ typedef struct CdfModeContext { ALIGN(uint16_t comp_fwd_ref[3][3][2], 4); ALIGN(uint16_t comp_bwd_ref[2][3][2], 4); ALIGN(uint16_t comp_uni_ref[3][3][2], 4); - ALIGN(uint16_t txpart[7][3][2], 4); - ALIGN(uint16_t skip[3][2], 4); - ALIGN(uint16_t skip_mode[3][2], 4); ALIGN(uint16_t seg_pred[3][2], 4); + ALIGN(uint16_t interintra[7][2], 4); + ALIGN(uint16_t interintra_wedge[7][2], 4); ALIGN(uint16_t obmc[N_BS_SIZES][2], 4); - ALIGN(uint16_t pal_y[7][3][2], 4); - ALIGN(uint16_t pal_uv[2][2], 4); - ALIGN(uint16_t intrabc[2], 4); } CdfModeContext; typedef struct CdfCoefContext { @@ -108,13 +112,13 @@ typedef struct CdfCoefContext { typedef struct CdfMvComponent { ALIGN(uint16_t classes[11 + 5], 32); + ALIGN(uint16_t sign[2], 4); + ALIGN(uint16_t class0[2], 4); ALIGN(uint16_t class0_fp[2][4], 8); - ALIGN(uint16_t classN_fp[4], 8); ALIGN(uint16_t class0_hp[2], 4); - ALIGN(uint16_t classN_hp[2], 4); - ALIGN(uint16_t class0[2], 4); ALIGN(uint16_t classN[10][2], 4); - ALIGN(uint16_t sign[2], 4); + ALIGN(uint16_t classN_fp[4], 8); + ALIGN(uint16_t classN_hp[2], 4); } CdfMvComponent; typedef struct CdfMvContext { @@ -123,10 +127,10 @@ typedef struct CdfMvContext { } CdfMvContext; typedef struct CdfContext { + CdfCoefContext coef; CdfModeContext m; + CdfMvContext mv; ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32); - CdfCoefContext coef; - CdfMvContext mv, dmv; } CdfContext; typedef struct CdfThreadContext { @@ -138,7 +142,7 @@ typedef struct CdfThreadContext { atomic_uint *progress; } CdfThreadContext; -void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx); +void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, unsigned qidx); int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf, const int have_frame_mt); void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src); diff --git 
a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c index eed9dfb756..7427c35592 100644 --- a/third_party/dav1d/src/decode.c +++ b/third_party/dav1d/src/decode.c @@ -73,42 +73,29 @@ static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr, } } -static int read_mv_component_diff(Dav1dTaskContext *const t, +static int read_mv_component_diff(MsacContext *const msac, CdfMvComponent *const mv_comp, - const int have_fp) + const int mv_prec) { - Dav1dTileState *const ts = t->ts; - const Dav1dFrameContext *const f = t->f; - const int have_hp = f->frame_hdr->hp; - const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign); - const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac, - mv_comp->classes, 10); - int up, fp, hp; + const int sign = dav1d_msac_decode_bool_adapt(msac, mv_comp->sign); + const int cl = dav1d_msac_decode_symbol_adapt16(msac, mv_comp->classes, 10); + int up, fp = 3, hp = 1; if (!cl) { - up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0); - if (have_fp) { - fp = dav1d_msac_decode_symbol_adapt4(&ts->msac, - mv_comp->class0_fp[up], 3); - hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac, - mv_comp->class0_hp) : 1; - } else { - fp = 3; - hp = 1; + up = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0); + if (mv_prec >= 0) { // !force_integer_mv + fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->class0_fp[up], 3); + if (mv_prec > 0) // allow_high_precision_mv + hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0_hp); } } else { up = 1 << cl; for (int n = 0; n < cl; n++) - up |= dav1d_msac_decode_bool_adapt(&ts->msac, - mv_comp->classN[n]) << n; - if (have_fp) { - fp = dav1d_msac_decode_symbol_adapt4(&ts->msac, - mv_comp->classN_fp, 3); - hp = have_hp ? 
dav1d_msac_decode_bool_adapt(&ts->msac, - mv_comp->classN_hp) : 1; - } else { - fp = 3; - hp = 1; + up |= dav1d_msac_decode_bool_adapt(msac, mv_comp->classN[n]) << n; + if (mv_prec >= 0) { // !force_integer_mv + fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->classN_fp, 3); + if (mv_prec > 0) // allow_high_precision_mv + hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->classN_hp); } } @@ -117,25 +104,16 @@ static int read_mv_component_diff(Dav1dTaskContext *const t, return sign ? -diff : diff; } -static void read_mv_residual(Dav1dTaskContext *const t, mv *const ref_mv, - CdfMvContext *const mv_cdf, const int have_fp) +static void read_mv_residual(Dav1dTileState *const ts, mv *const ref_mv, + const int mv_prec) { - switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint, - N_MV_JOINTS - 1)) - { - case MV_JOINT_HV: - ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp); - ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp); - break; - case MV_JOINT_H: - ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp); - break; - case MV_JOINT_V: - ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp); - break; - default: - break; - } + MsacContext *const msac = &ts->msac; + const enum MVJoint mv_joint = + dav1d_msac_decode_symbol_adapt4(msac, ts->cdf.mv.joint, N_MV_JOINTS - 1); + if (mv_joint & MV_JOINT_V) + ref_mv->y += read_mv_component_diff(msac, &ts->cdf.mv.comp[0], mv_prec); + if (mv_joint & MV_JOINT_H) + ref_mv->x += read_mv_component_diff(msac, &ts->cdf.mv.comp[1], mv_prec); } static void read_tx_tree(Dav1dTaskContext *const t, @@ -1001,8 +979,7 @@ static int decode_b(Dav1dTaskContext *const t, const int have_delta_q = f->frame_hdr->delta.q.present && (bs != (f->seq_hdr->sb128 ? 
BS_128x128 : BS_64x64) || !b->skip); - int8_t prev_delta_lf[4]; - memcpy(prev_delta_lf, ts->last_delta_lf, 4); + uint32_t prev_delta_lf = ts->last_delta_lf.u32; if (have_delta_q) { int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac, @@ -1038,8 +1015,8 @@ static int decode_b(Dav1dTaskContext *const t, delta_lf = -delta_lf; delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2; } - ts->last_delta_lf[i] = - iclip(ts->last_delta_lf[i] + delta_lf, -63, 63); + ts->last_delta_lf.i8[i] = + iclip(ts->last_delta_lf.i8[i] + delta_lf, -63, 63); if (have_delta_q && DEBUG_BLOCK_INFO) printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf, ts->msac.rng); @@ -1054,13 +1031,13 @@ static int decode_b(Dav1dTaskContext *const t, init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem); ts->dq = ts->dqmem; } - if (!memcmp(ts->last_delta_lf, (int8_t[4]) { 0, 0, 0, 0 }, 4)) { + if (!ts->last_delta_lf.u32) { // assign frame-wide lf values to this sb ts->lflvl = f->lf.lvl; - } else if (memcmp(ts->last_delta_lf, prev_delta_lf, 4)) { + } else if (ts->last_delta_lf.u32 != prev_delta_lf) { // find sb-specific lf lvl parameters - dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf); ts->lflvl = ts->lflvlmem; + dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf.i8); } } @@ -1324,7 +1301,7 @@ static int decode_b(Dav1dTaskContext *const t, } const union mv ref = b->mv[0]; - read_mv_residual(t, &b->mv[0], &ts->cdf.dmv, 0); + read_mv_residual(ts, &b->mv[0], -1); // clip intrabc motion vector to decoded parts of current tile int border_left = ts->tiling.col_start * 4; @@ -1586,8 +1563,8 @@ static int decode_b(Dav1dTaskContext *const t, break; \ case NEWMV: \ b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \ - read_mv_residual(t, &b->mv[idx], &ts->cdf.mv, \ - !f->frame_hdr->force_integer_mv); \ + const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv; \ + read_mv_residual(ts, &b->mv[idx], mv_prec); \ break; \ } has_subpel_filter = imin(bw4, bh4) == 1 
|| @@ -1775,8 +1752,8 @@ static int decode_b(Dav1dTaskContext *const t, if (DEBUG_BLOCK_INFO) printf("Post-intermode[%d,drl=%d]: r=%d\n", b->inter_mode, b->drl_idx, ts->msac.rng); - read_mv_residual(t, &b->mv[0], &ts->cdf.mv, - !f->frame_hdr->force_integer_mv); + const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv; + read_mv_residual(ts, &b->mv[0], mv_prec); if (DEBUG_BLOCK_INFO) printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n", b->mv[0].y, b->mv[0].x, ts->msac.rng); @@ -2495,7 +2472,7 @@ static void setup_tile(Dav1dTileState *const ts, dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf); ts->last_qidx = f->frame_hdr->quant.yac; - memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf)); + ts->last_delta_lf.u32 = 0; dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update); diff --git a/third_party/dav1d/src/internal.h b/third_party/dav1d/src/internal.h index 72f65607ed..96bf409c6c 100644 --- a/third_party/dav1d/src/internal.h +++ b/third_party/dav1d/src/internal.h @@ -303,8 +303,8 @@ struct Dav1dFrameContext { int lr_buf_plane_sz[2]; /* (stride*sbh*4) << sb128 if n_tc > 1, else stride*4 */ int re_sz /* h */; ALIGN(Av1FilterLUT lim_lut, 16); + ALIGN(uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */], 16); int last_sharpness; - uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */]; uint8_t *tx_lpf_right_edge[2]; uint8_t *cdef_line_buf, *lr_line_buf; pixel *cdef_line[2 /* pre, post */][3 /* plane */]; @@ -376,8 +376,11 @@ struct Dav1dTileState { const uint16_t (*dq)[3][2]; int last_qidx; - int8_t last_delta_lf[4]; - uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */]; + union { + int8_t i8[4]; + uint32_t u32; + } last_delta_lf; + ALIGN(uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */], 16); const uint8_t (*lflvl)[4][8][2]; Av1RestorationUnit *lr_ref[3]; diff --git a/third_party/dav1d/src/itx.h b/third_party/dav1d/src/itx.h index d522079907..8ef4f4df48 100644 --- 
a/third_party/dav1d/src/itx.h +++ b/third_party/dav1d/src/itx.h @@ -39,10 +39,73 @@ void (name)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob \ HIGHBD_DECL_SUFFIX) typedef decl_itx_fn(*itxfm_fn); +#define decl_itx2_fns(w, h, opt) \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) + +#define decl_itx12_fns(w, h, opt) \ +decl_itx2_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) + +#define decl_itx16_fns(w, h, opt) \ +decl_itx12_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) + +#define decl_itx17_fns(w, h, opt) \ +decl_itx16_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) + typedef struct Dav1dInvTxfmDSPContext { itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL]; } Dav1dInvTxfmDSPContext; bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc); +#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) + +#define assign_itx1_fn(pfx, w, h, ext) \ + 
assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) + +#define assign_itx2_fn(pfx, w, h, ext) \ + assign_itx1_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) + +#define assign_itx12_fn(pfx, w, h, ext) \ + assign_itx2_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ + assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) + +#define assign_itx16_fn(pfx, w, h, ext) \ + assign_itx12_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ + assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) + +#define assign_itx17_fn(pfx, w, h, ext) \ + assign_itx16_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) + #endif /* DAV1D_SRC_ITX_H */ diff --git a/third_party/dav1d/src/lf_mask.c b/third_party/dav1d/src/lf_mask.c index 062ba67371..09a5c532c4 100644 --- a/third_party/dav1d/src/lf_mask.c +++ b/third_party/dav1d/src/lf_mask.c @@ -436,7 +436,7 @@ static void calc_lf_value(uint8_t (*const lflvl_values)[2], const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63); if (!mr_delta) { - memset(lflvl_values, base, 8 * 2); + memset(lflvl_values, base, sizeof(*lflvl_values) * 8); } else { const int sh = base >= 32; lflvl_values[0][0] = lflvl_values[0][1] = @@ -457,7 +457,7 @@ static inline void calc_lf_value_chroma(uint8_t (*const lflvl_values)[2], const 
Dav1dLoopfilterModeRefDeltas *const mr_delta) { if (!base_lvl) - memset(lflvl_values, 0, 8 * 2); + memset(lflvl_values, 0, sizeof(*lflvl_values) * 8); else calc_lf_value(lflvl_values, base_lvl, lf_delta, seg_delta, mr_delta); } @@ -469,7 +469,7 @@ void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2], const int n_seg = hdr->segmentation.enabled ? 8 : 1; if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1]) { - memset(lflvl_values, 0, 8 * 4 * 2 * n_seg); + memset(lflvl_values, 0, sizeof(*lflvl_values) * n_seg); return; } diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build index dc4be5fd6f..cd19b70c38 100644 --- a/third_party/dav1d/src/meson.build +++ b/third_party/dav1d/src/meson.build @@ -106,6 +106,7 @@ if is_asm_enabled 'arm/64/loopfilter.S', 'arm/64/looprestoration.S', 'arm/64/mc.S', + 'arm/64/mc_dotprod.S', ) endif diff --git a/third_party/dav1d/src/refmvs.c b/third_party/dav1d/src/refmvs.c index 200afebde7..1da024b630 100644 --- a/third_party/dav1d/src/refmvs.c +++ b/third_party/dav1d/src/refmvs.c @@ -817,7 +817,9 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf, if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) { if (rf->r) dav1d_freep_aligned(&rf->r); const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1; - rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64); + /* sizeof(refmvs_block) == 12 but it's accessed using 16-byte loads in asm, + * so add 4 bytes of padding to avoid buffer overreads. 
*/ + rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass) + 4, 64); if (!rf->r) return DAV1D_ERR(ENOMEM); rf->r_stride = r_stride; } diff --git a/third_party/dav1d/src/riscv/itx.h b/third_party/dav1d/src/riscv/itx.h index d3f9a03a03..e11b138348 100644 --- a/third_party/dav1d/src/riscv/itx.h +++ b/third_party/dav1d/src/riscv/itx.h @@ -28,34 +28,6 @@ #include "src/cpu.h" #include "src/itx.h" -#define decl_itx2_fns(w, h, opt) \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) - -#define decl_itx12_fns(w, h, opt) \ -decl_itx2_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) - -#define decl_itx16_fns(w, h, opt) \ -decl_itx12_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) - -#define decl_itx17_fns(w, h, opt) \ -decl_itx16_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) - #define decl_itx_fns(ext) \ decl_itx17_fns( 4, 4, ext); \ decl_itx16_fns( 4, 8, ext); \ @@ -70,41 +42,6 @@ decl_itx16_fns(16, 16, ext) decl_itx_fns(rvv); static 
ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, int const bpc) { -#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ - c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ - BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) - -#define assign_itx1_fn(pfx, w, h, ext) \ - assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) - -#define assign_itx2_fn(pfx, w, h, ext) \ - assign_itx1_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) - -#define assign_itx12_fn(pfx, w, h, ext) \ - assign_itx2_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ - assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) - -#define assign_itx16_fn(pfx, w, h, ext) \ - assign_itx12_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ - assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) - -#define assign_itx17_fn(pfx, w, h, ext) \ - assign_itx16_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) - const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return; diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm index 35738e7c0b..2956ffaf29 100644 --- a/third_party/dav1d/src/x86/ipred_avx2.asm +++ b/third_party/dav1d/src/x86/ipred_avx2.asm @@ -66,7 +66,8 @@ z_filter_wh: db 7, 7, 11, 11, 15, 15, 
19, 19, 19, 23, 23, 23, 31, 31, 31, 39 z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 -z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 +const \ +z_filter_s, db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line pb_128: times 4 db 128 ; those are just placed here for alignment. diff --git a/third_party/dav1d/src/x86/itx.h b/third_party/dav1d/src/x86/itx.h index 346fde7d90..23d7a73806 100644 --- a/third_party/dav1d/src/x86/itx.h +++ b/third_party/dav1d/src/x86/itx.h @@ -30,34 +30,6 @@ #define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix -#define decl_itx2_fns(w, h, opt) \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) - -#define decl_itx12_fns(w, h, opt) \ -decl_itx2_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) - -#define decl_itx16_fns(w, h, opt) \ -decl_itx12_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ 
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) - -#define decl_itx17_fns(w, h, opt) \ -decl_itx16_fns(w, h, opt); \ -decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) - #define decl_itx_fns(ext) \ decl_itx17_fns( 4, 4, ext); \ decl_itx16_fns( 4, 8, ext); \ @@ -136,42 +108,6 @@ decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2); decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2)); static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) { -#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ - c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ - BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) - -#define assign_itx1_fn(pfx, w, h, ext) \ - assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) - -#define assign_itx2_fn(pfx, w, h, ext) \ - assign_itx1_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) - -#define assign_itx12_fn(pfx, w, h, ext) \ - assign_itx2_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ - assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ - assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) - -#define assign_itx16_fn(pfx, w, h, ext) \ - assign_itx12_fn(pfx, w, h, ext); \ - assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ - assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ - assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ - assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) - -#define assign_itx17_fn(pfx, w, h, ext) \ - assign_itx16_fn(pfx, w, h, ext); \ - 
assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) - - #define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext) diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm index 42e2a5525e..6b4424946b 100644 --- a/third_party/dav1d/src/x86/mc16_avx2.asm +++ b/third_party/dav1d/src/x86/mc16_avx2.asm @@ -1222,7 +1222,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro FN 4 ; prefix, type, type_h, type_v +%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 @@ -1230,8 +1230,8 @@ cglobal %1_%2_16bpc %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1242,22 +1242,17 @@ DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc PUT_8TAP_FN regular, REGULAR, REGULAR -cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx2 imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v 
+ add myd, t1d ; 6tap_v, my, 4tap_v lea r8, [put_avx2] movifnidn wd, wm movifnidn hd, hm @@ -1265,6 +1260,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my jnz .h test myd, 0xf00 jnz .v +.put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 @@ -1337,43 +1333,36 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my cmp wd, 4 je .h_w4 jl .h_w2 - WIN64_SPILL_XMM 13 + WIN64_SPILL_XMM 11 shr mxd, 16 - sub srcq, 6 - vpbroadcastq m0, [base+subpel_filters+mxq*8] - vbroadcasti128 m6, [subpel_h_shufA] - vbroadcasti128 m7, [subpel_h_shufB] + sub srcq, 4 + vpbroadcastq m0, [base+subpel_filters+1+mxq*8] + vbroadcasti128 m6, [base+subpel_h_shufA] punpcklbw m0, m0 psraw m0, 8 ; sign-extend - pshufd m8, m0, q0000 - pshufd m9, m0, q1111 - pshufd m10, m0, q2222 - pshufd m11, m0, q3333 - cmp wd, 8 - jg .h_w16 + pshufd m7, m0, q0000 + pshufd m8, m0, q1111 + pshufd m9, m0, q2222 + sub wd, 16 + jge .h_w16 .h_w8: -%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] - pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 - pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 - pmaddwd m%5, m9, m%4 ; abcd1 - pmaddwd m%1, m8 ; abcd0 - pshufb m%2, m7 ; 6 7 7 8 8 9 9 a - shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 - paddd m%5, m4 - paddd m%1, m%5 - pmaddwd m%5, m11, m%2 ; abcd3 - paddd m%1, m%5 - pmaddwd m%5, m10, m%4 ; abcd2 - pshufb m%3, m7 ; a b b c c d d e - pmaddwd m%4, m8 ; efgh0 - paddd m%1, m%5 - pmaddwd m%5, m9, m%2 ; efgh1 - shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c - pmaddwd m%3, m11 ; efgh3 - pmaddwd m%2, m10 ; efgh2 +%macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%1, m6 ; 01 12 23 34 + pshufb m%2, m6 ; 45 56 67 78 + pmaddwd m%4, m7, m%1 ; a0 + pshufb m%3, m6 ; 89 9a ab bc + pmaddwd m%5, m9, m%2 ; a2 + shufpd m%1, m%2, 0x05 ; 23 34 45 56 + paddd m%4, m%5 ; a0+a2 + pmaddwd m%5, m7, m%2 ; b0 + shufpd m%2, m%3, 0x05 ; 67 78 89 9a + pmaddwd m%3, m9 ; b2 + pmaddwd m%1, m8 ; a1 + pmaddwd m%2, m8 ; b1 + paddd m%3, m%5 ; b0+b2 paddd m%4, m4 - paddd m%4, 
m%5 - paddd m%3, m%4 + paddd m%3, m4 + paddd m%1, m%4 paddd m%2, m%3 psrad m%1, 6 psrad m%2, 6 @@ -1384,9 +1373,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my vinserti128 m0, [srcq+ssq*1+ 0], 1 movu xm2, [srcq+ssq*0+16] vinserti128 m2, [srcq+ssq*1+16], 1 - lea srcq, [srcq+ssq*2] shufpd m1, m0, m2, 0x05 - PUT_8TAP_H 0, 1, 2, 3, 12 + lea srcq, [srcq+ssq*2] + PUT_6TAP_H 0, 1, 2, 3, 10 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] @@ -1396,13 +1385,13 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my .h_w16: mov r6d, wd .h_w16_loop: - movu m0, [srcq+r6*2-32] - movu m1, [srcq+r6*2-24] - movu m2, [srcq+r6*2-16] - PUT_8TAP_H 0, 1, 2, 3, 12 - mova [dstq+r6*2-32], m0 + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 8] + movu m2, [srcq+r6*2+16] + PUT_6TAP_H 0, 1, 2, 3, 10 + mova [dstq+r6*2], m0 sub r6d, 16 - jg .h_w16_loop + jge .h_w16_loop add srcq, ssq add dstq, dsq dec hd @@ -1411,10 +1400,449 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my .v: movzx mxd, myb shr myd, 16 - cmp hd, 4 - cmovle myd, mxd + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [base+subpel_filters+1+myq*8] + WIN64_SPILL_XMM 10, 12 + vpbroadcastd m5, [pd_32] + vpbroadcastw m6, r8m + punpcklbw m0, m0 + mov r6, ssq + psraw m0, 8 ; sign-extend + neg r6 + pshufd m7, m0, q0000 + pshufd m8, m0, q1111 + pshufd m9, m0, q2222 + cmp wd, 4 + jg .v_w8 + je .v_w4 +.v_w2: + movd xm2, [srcq+r6 *2] + pinsrd xm2, [srcq+r6 *1], 1 + pinsrd xm2, [srcq+ssq*0], 2 + pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] + movd xm0, [srcq+ssq*0] + palignr xm3, xm0, xm2, 4 ; 1 2 3 4 + punpcklwd xm1, xm2, xm3 ; 01 12 + punpckhwd xm2, xm3 ; 23 34 +.v_w2_loop: + movd xm3, [srcq+ssq*1] + pmaddwd xm4, xm7, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm8 ; a1 b1 + lea srcq, [srcq+ssq*2] + paddd xm4, xm2 + punpckldq xm2, xm0, xm3 ; 4 5 + movd xm0, [srcq+ssq*0] + punpckldq xm3, xm0 ; 5 6 + punpcklwd xm2, xm3 ; 45 56 + pmaddwd xm3, xm9, 
xm2 ; a2 b2 + paddd xm4, xm5 + paddd xm4, xm3 + psrad xm4, 6 + packusdw xm4, xm4 + pminsw xm4, xm6 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xm1, [srcq+r6 *2] + vpbroadcastq m3, [srcq+r6 *1] + vpbroadcastq m2, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m1, m3, 0x30 + vpblendd m3, m2, 0x30 + punpcklwd m1, m3 ; 01 12 + vpblendd m2, m4, 0x30 + vpblendd m4, m0, 0x30 + punpcklwd m2, m4 ; 23 34 +.v_w4_loop: + vpbroadcastq m3, [srcq+ssq*1] + pmaddwd m4, m7, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m8 ; a1 b1 + lea srcq, [srcq+ssq*2] + paddd m4, m2 + vpblendd m2, m0, m3, 0x30 + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m3, m0, 0x30 + punpcklwd m2, m3 ; 45 56 + pmaddwd m3, m9, m2 ; a2 b2 + paddd m4, m5 + paddd m4, m3 + psrad m4, 6 + vextracti128 xm3, m4, 1 + packusdw xm4, xm3 + pminsw xm4, xm6 + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + shl wd, 5 + WIN64_PUSH_XMM 12 + lea wd, [hq+wq-256] +.v_w8_loop0: + vbroadcasti128 m3, [srcq+r6 *2] + vbroadcasti128 m4, [srcq+r6 *1] + lea r7, [srcq+ssq*2] + vbroadcasti128 m0, [srcq+ssq*0] + vbroadcasti128 m1, [srcq+ssq*1] + mov r8, dstq + vbroadcasti128 m2, [r7+ssq*0] + shufpd m3, m0, 0x0c + shufpd m4, m1, 0x0c + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + shufpd m0, m2, 0x0c + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.v_w8_loop: + vbroadcasti128 m5, [r7+ssq*1] + pmaddwd m10, m7, m1 ; a0 + lea r7, [r7+ssq*2] + pmaddwd m11, m7, m2 ; b0 + mova m1, m3 + pmaddwd m3, m8 ; a1 + mova m2, m4 + pmaddwd m4, m8 ; b1 + paddd m10, m3 + vbroadcasti128 m3, [r7+ssq*0] + paddd m11, m4 + shufpd m4, m0, m5, 0x0d + shufpd m0, m5, m3, 0x0c + punpcklwd m3, m4, m0 ; 45 + punpckhwd m4, m0 ; 56 + pmaddwd m5, m9, m3 ; a2 + paddd m10, m5 + pmaddwd m5, m9, m4 ; b2 + paddd m5, m11 + psrad m10, 5 + psrad m5, 5 + packusdw 
m10, m5 + pxor m5, m5 + pavgw m5, m10 + pminsw m5, m6 + vpermq m5, m5, q3120 + mova [r8+dsq*0], xm5 + vextracti128 [r8+dsq*1], m5, 1 + lea r8, [r8+dsq*2] + sub hd, 2 + jg .v_w8_loop + add srcq, 16 + add dstq, 16 + movzx hd, wb + sub wd, 1<<8 + jg .v_w8_loop0 + RET +.hv: + WIN64_SPILL_XMM 12, 16 + vpbroadcastd m10, [pd_512] + vpbroadcastw m11, r8m + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m1, [base+subpel_filters+1+myq*8] + mov r6, ssq + sub srcq, 2 + neg r6 + pxor m6, m6 + punpcklbw m6, m0 + punpcklbw m1, m1 + psraw m1, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_10bit + psraw m6, 2 + psllw m1, 2 +.hv_10bit: + pshufd m7, m1, q0000 + pshufd m8, m1, q1111 + pshufd m9, m1, q2222 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 m5, [subpel_h_shuf2] + vbroadcasti128 m0, [srcq+ssq*0] + vinserti128 m2, m0, [srcq+r6*2], 1 ; 2 0 + movu xm1, [srcq+ssq*1] + vinserti128 m1, [srcq+r6 *1], 1 ; 3 1 + lea srcq, [srcq+ssq*2] + vinserti128 m0, [srcq+ssq*0], 0 ; 4 2 + REPX {pshufb x, m5}, m2, m1, m0 + REPX {pmaddwd x, m6}, m2, m1, m0 + phaddd m2, m1 + phaddd m1, m0 + paddd m2, m10 + paddd m1, m10 + psrad m2, 10 + psrad m1, 10 + packssdw m2, m1 ; 2 3 3 4 0 1 1 2 + punpckhqdq m0, m2, m2 + punpcklwd m2, m0 ; 23 34 + vextracti128 xm1, m2, 1 ; 01 12 +.hv_w2_loop: + movu xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu xm4, [srcq+ssq*0] + pshufb xm3, xm5 + pshufb xm4, xm5 + pmaddwd xm3, xm6 + pmaddwd xm4, xm6 + phaddd xm3, xm4 + pmaddwd xm4, xm7, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm8 ; a1 b1 + paddd xm4, xm2 + paddd xm3, xm10 + psrad xm3, 10 + packssdw xm3, xm3 + palignr xm2, xm3, xm0, 12 + mova xm0, xm3 + punpcklwd xm2, xm0 ; 45 56 + pmaddwd xm3, xm9, xm2 ; a2 b2 + paddd xm4, xm10 + paddd xm4, xm3 + psrad xm4, 10 + packusdw xm4, xm4 + pminsw xm4, xm11 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + 
RET +.hv_w4: + WIN64_PUSH_XMM 14 + vbroadcasti128 m12, [subpel_h_shufA] + pshufd m5, m6, q0000 + vbroadcasti128 m13, [subpel_h_shufB] + pshufd m6, m6, q1111 + movu xm2, [srcq+r6 *2] + vinserti128 m2, [srcq+r6 *1], 1 ; 0 1 + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 ; 2 3 + lea srcq, [srcq+ssq*2] + movu xm3, [srcq+ssq*0] ; 4 + pshufb m1, m2, m12 + pmaddwd m1, m5 + pshufb m2, m13 + pmaddwd m2, m6 + pshufb m4, m0, m12 + pmaddwd m4, m5 + pshufb m0, m13 + pmaddwd m0, m6 + paddd m2, m1 + pshufb xm1, xm3, xm12 + pmaddwd xm1, xm5 + pshufb xm3, xm13 + pmaddwd xm3, xm6 + paddd m0, m4 + paddd m2, m10 + paddd xm1, xm10 + paddd m0, m10 + paddd xm3, xm1 + REPX {psrad x, 10}, m2, m0, xm3 + packssdw m2, m0 ; 0 2 1 3 + packssdw xm0, xm3 ; 2 4 + vperm2i128 m0, m2, 0x03 + punpcklwd m1, m2, m0 ; 01 12 + punpckhwd m2, m0 ; 23 34 +.hv_w4_loop: + movu xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 m3, [srcq+ssq*0], 1 + pmaddwd m4, m7, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m8 ; a1 b1 + paddd m4, m2 + pshufb m2, m3, m12 + pmaddwd m2, m5 + pshufb m3, m13 + pmaddwd m3, m6 + paddd m2, m10 + paddd m3, m2 + psrad m3, 10 + packssdw m3, m3 ; 5 5 6 6 + vperm2i128 m2, m0, m3, 0x21 + mova m0, m3 + punpckhwd m2, m3 ; 45 56 + pmaddwd m3, m9, m2 ; a2 b2 + paddd m4, m10 + paddd m4, m3 + psrad m4, 10 + vextracti128 xm3, m4, 1 + packusdw xm4, xm3 + pminsw xm4, xm11 + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + WIN64_PUSH_XMM 16, 12 + shr mxd, 16 + vbroadcasti128 m12, [subpel_h_shufA] + vpbroadcastq m2, [base+subpel_filters+1+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + pmovsxbw xm1, [base+subpel_filters+1+myq*8] + shl wd, 5 + mov r6, ssq + sub srcq, 4 + pxor m0, m0 + neg r6 + punpcklbw m0, m2 + lea wd, [hq+wq-256] + test dword r8m, 0x800 + jz .hv_w8_10bit + psraw m0, 2 + psllw xm1, 2 +.hv_w8_10bit: + pshufd m7, m0, q0000 + pshufd m8, m0, q1111 +%if WIN64 + %define v_mul 
(rsp+stack_offset+40) ; r4m +%else + %define v_mul (rsp+stack_offset+ 8) ; r6m +%endif + mova [v_mul], xm1 + pshufd m9, m0, q2222 +.hv_w8_loop0: + vbroadcasti128 m0, [srcq+ssq*0+ 0] + vinserti128 m3, m0, [srcq+r6*2+ 0], 0 + lea r7, [srcq+ssq*2] + vbroadcasti128 m2, [srcq+ssq*0+16] + vinserti128 m1, m2, [srcq+r6*2+16], 0 + mov r8, dstq + vinserti128 m0, [r7 +ssq*0+ 0], 1 + vinserti128 m2, [r7 +ssq*0+16], 1 + shufpd m4, m3, m1, 0x05 +%macro PUT_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%1, m12 ; 01 12 23 34 + pshufb m%2, m12 ; 45 56 67 78 + pmaddwd m%4, m7, m%1 ; a0 + pshufb m%3, m12 ; 89 9a ab bc + pmaddwd m%5, m9, m%2 ; a2 + shufpd m%1, m%2, 0x05 ; 23 34 45 56 + paddd m%4, m%5 ; a0+a2 + pmaddwd m%5, m7, m%2 ; b0 + shufpd m%2, m%3, 0x05 ; 67 78 89 9a + pmaddwd m%3, m9 ; b2 + pmaddwd m%1, m8 ; a1 + pmaddwd m%2, m8 ; b1 + paddd m%3, m%5 ; b0+b2 + paddd m%4, m10 + paddd m%3, m10 + paddd m%1, m%4 + paddd m%2, m%3 + psrad m%1, 10 + psrad m%2, 10 + packssdw m%1, m%2 +%endmacro + PUT_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2 + movu xm4, [srcq+r6 *1+ 0] + vinserti128 m4, [srcq+ssq*1+ 0], 1 + shufpd m1, m0, m2, 0x05 + PUT_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4 + movu xm2, [srcq+r6 *1+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + shufpd m1, m4, m2, 0x05 + PUT_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m0, m0, q3120 + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w8_loop: + vpbroadcastd m15, [v_mul+4*0] + vpbroadcastd m13, [v_mul+4*1] + movu xm5, [r7+ssq*1+ 0] + movu xm6, [r7+ssq*1+16] + lea r7, [r7+ssq*2] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + vinserti128 m5, [r7+ssq*0+ 0], 1 + vinserti128 m6, [r7+ssq*0+16], 1 + mova m1, m3 + pmaddwd m3, m13 ; a1 + mova m2, m4 + pmaddwd m4, m13 ; b1 + paddd m14, m3 + shufpd m3, m5, m6, 0x05 + paddd m15, m4 + PUT_6TAP_HV_H 5, 3, 6, 4, 13 ; 5 6 + vpbroadcastd m6, [v_mul+4*2] + vpermq m5, m5, q3120 + shufpd m4, m0, m5, 0x05 + mova m0, m5 
+ punpcklwd m3, m4, m5 ; 45 + punpckhwd m4, m5 ; 56 + pmaddwd m5, m6, m3 ; a2 + pmaddwd m6, m4 ; b2 + paddd m14, m10 + paddd m15, m10 + paddd m5, m14 + paddd m6, m15 + psrad m5, 10 + psrad m6, 10 + packusdw m5, m6 + pminsw m5, m11 + vpermq m5, m5, q3120 + mova [r8+dsq*0], xm5 + vextracti128 [r8+dsq*1], m5, 1 + lea r8, [r8+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add srcq, 16 + add dstq, 16 + movzx hd, wb + sub wd, 1<<8 + jg .hv_w8_loop0 + RET + +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc +PUT_8TAP_FN sharp, SHARP, SHARP + +cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +%define base r8-put_avx2 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx2] + movifnidn wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] - WIN64_SPILL_XMM 15 + WIN64_SPILL_XMM 12, 15 vpbroadcastd m6, [pd_32] vpbroadcastw m7, r8m lea r6, [ssq*3] @@ -1518,19 +1946,19 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my RET .v_w8: shl wd, 5 - mov r7, srcq - mov r8, dstq + WIN64_PUSH_XMM 15 lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] + lea r7, [srcq+ssq*4] vbroadcasti128 m0, [srcq+r6 ] vbroadcasti128 m6, [srcq+ssq*2] - lea srcq, [srcq+ssq*4] - vbroadcasti128 m1, [srcq+ssq*0] - vbroadcasti128 m2, [srcq+ssq*1] - vbroadcasti128 m3, [srcq+ssq*2] - add srcq, r6 + mov r8, dstq + vbroadcasti128 m1, [r7+ssq*0] + vbroadcasti128 m2, [r7+ssq*1] + vbroadcasti128 m3, [r7+ssq*2] + add r7, r6 shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklwd m1, m4, m5 ; 01 @@ -1542,7 +1970,7 
@@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my punpcklwd m3, m6, m0 ; 23 punpckhwd m6, m0 ; 56 .v_w8_loop: - vbroadcasti128 m14, [srcq+ssq*0] + vbroadcasti128 m14, [r7+ssq*0] pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 @@ -1556,8 +1984,8 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 - vbroadcasti128 m5, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] + vbroadcasti128 m5, [r7+ssq*1] + lea r7, [r7+ssq*2] paddd m13, m6 shufpd m6, m0, m14, 0x0d shufpd m0, m14, m5, 0x0c @@ -1574,41 +2002,121 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pavgw m12, m13 pminsw m12, m7 vpermq m12, m12, q3120 - mova [dstq+dsq*0], xm12 - vextracti128 [dstq+dsq*1], m12, 1 - lea dstq, [dstq+dsq*2] + mova [r8+dsq*0], xm12 + vextracti128 [r8+dsq*1], m12, 1 + lea r8, [r8+dsq*2] sub hd, 2 jg .v_w8_loop - add r7, 16 - add r8, 16 + add srcq, 16 + add dstq, 16 movzx hd, wb - mov srcq, r7 - mov dstq, r8 sub wd, 1<<8 jg .v_w8_loop0 RET -.hv: - WIN64_SPILL_XMM 16 - vpbroadcastw m15, r8m +.h: + RESET_STACK_STATE + test myd, 0xf00 + jnz .hv + mov r7d, r8m + vpbroadcastw m5, r8m + shr r7d, 11 + vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 - jg .hv_w8 - movzx mxd, mxb - vpbroadcastd m0, [base+subpel_filters+mxq*8+2] - movzx mxd, myb - shr myd, 16 - cmp hd, 4 - cmovle myd, mxd - vpbroadcastq m1, [base+subpel_filters+myq*8] - vpbroadcastd m6, [pd_512] - lea r6, [ssq*3] - sub srcq, 2 - sub srcq, r6 - pxor m7, m7 - punpcklbw m7, m0 - punpcklbw m1, m1 - psraw m1, 8 ; sign-extend - test dword r8m, 0x800 - jz .hv_10bit + jl mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w2 + je mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w4 + WIN64_SPILL_XMM 13 + shr mxd, 16 + sub srcq, 6 + vpbroadcastq m0, [base+subpel_filters+mxq*8] + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m8, m0, q0000 + pshufd m9, 
m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + sub wd, 16 + jge .h_w16 +.h_w8: +%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 + pmaddwd m%5, m9, m%4 ; abcd1 + pmaddwd m%1, m8 ; abcd0 + pshufb m%2, m7 ; 6 7 7 8 8 9 9 a + shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m%5, m4 + paddd m%1, m%5 + pmaddwd m%5, m11, m%2 ; abcd3 + paddd m%1, m%5 + pmaddwd m%5, m10, m%4 ; abcd2 + pshufb m%3, m7 ; a b b c c d d e + pmaddwd m%4, m8 ; efgh0 + paddd m%1, m%5 + pmaddwd m%5, m9, m%2 ; efgh1 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m11 ; efgh3 + pmaddwd m%2, m10 ; efgh2 + paddd m%4, m4 + paddd m%4, m%5 + paddd m%3, m%4 + paddd m%2, m%3 + psrad m%1, 6 + psrad m%2, 6 + packusdw m%1, m%2 + pminsw m%1, m5 +%endmacro + movu xm0, [srcq+ssq*0+ 0] + vinserti128 m0, [srcq+ssq*1+ 0], 1 + movu xm2, [srcq+ssq*0+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + lea srcq, [srcq+ssq*2] + shufpd m1, m0, m2, 0x05 + PUT_8TAP_H 0, 1, 2, 3, 12 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 8] + movu m2, [srcq+r6*2+16] + PUT_8TAP_H 0, 1, 2, 3, 12 + mova [dstq+r6*2], m0 + sub r6d, 16 + jge .h_w16_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w16 + RET +.hv: + WIN64_SPILL_XMM 16 + vpbroadcastw m15, r8m + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m1, [base+subpel_filters+myq*8] + vpbroadcastd m6, [pd_512] + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m1, m1 + psraw m1, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_10bit psraw m7, 2 psllw m1, 2 .hv_10bit: @@ -1773,17 +2281,15 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my vpbroadcastq m2, 
[base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 - cmp hd, 4 - cmovle myd, mxd + cmp hd, 6 + cmovs myd, mxd pmovsxbw xm1, [base+subpel_filters+myq*8] shl wd, 5 lea r6, [ssq*3] sub srcq, 6 - sub srcq, r6 pxor m0, m0 + sub srcq, r6 punpcklbw m0, m2 - mov r7, srcq - mov r8, dstq lea wd, [hq+wq-256] test dword r8m, 0x800 jz .hv_w8_10bit @@ -1792,14 +2298,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my .hv_w8_10bit: pshufd m11, m0, q0000 pshufd m12, m0, q1111 + mova [v_mul], xm1 pshufd m13, m0, q2222 pshufd m14, m0, q3333 -%if WIN64 - %define v_mul (rsp+stack_offset+40) ; r4m -%else - %define v_mul (rsp-24) ; red zone -%endif - mova [v_mul], xm1 .hv_w8_loop0: %macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 @@ -1830,14 +2331,16 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %endmacro movu xm4, [srcq+r6 *1+ 0] vbroadcasti128 m8, [subpel_h_shufA] + lea r7, [srcq+ssq*4] movu xm6, [srcq+r6 *1+ 8] vbroadcasti128 m9, [subpel_h_shufB] + mov r8, dstq movu xm0, [srcq+r6 *1+16] vpbroadcastd m10, [pd_512] movu xm5, [srcq+ssq*0+ 0] - vinserti128 m5, [srcq+ssq*4+ 0], 1 + vinserti128 m5, [r7 +ssq*0+ 0], 1 movu xm1, [srcq+ssq*0+16] - vinserti128 m1, [srcq+ssq*4+16], 1 + vinserti128 m1, [r7 +ssq*0+16], 1 shufpd m7, m5, m1, 0x05 INIT_XMM avx2 PUT_8TAP_HV_H 4, 6, 0 ; 3 @@ -1851,10 +2354,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my PUT_8TAP_HV_H 0, 7, 1 ; 2 6 movu xm6, [srcq+ssq*1+ 0] movu xm1, [srcq+ssq*1+16] - lea srcq, [srcq+ssq*4] - vinserti128 m6, [srcq+ssq*1+ 0], 1 - vinserti128 m1, [srcq+ssq*1+16], 1 - add srcq, r6 + vinserti128 m6, [r7 +ssq*1+ 0], 1 + vinserti128 m1, [r7 +ssq*1+16], 1 + add r7, r6 shufpd m7, m6, m1, 0x05 PUT_8TAP_HV_H 6, 7, 1 ; 1 5 vpermq m4, m4, q1100 @@ -1885,13 +2387,13 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pmaddwd m6, m10 ; b2 paddd m8, m5 paddd m9, m6 - movu xm5, [srcq+ssq*0] - vinserti128 m5, [srcq+ssq*1], 1 + movu xm5, 
[r7+ssq*0] + vinserti128 m5, [r7+ssq*1], 1 vbroadcasti128 m7, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] - movu xm6, [srcq+ssq*0+16] - vinserti128 m6, [srcq+ssq*1+16], 1 - vextracti128 [dstq], m0, 1 + movu xm6, [r7+ssq*0+16] + vinserti128 m6, [r7+ssq*1+16], 1 + vextracti128 [r8], m0, 1 pshufb m0, m5, m7 ; 01 pshufb m5, m10 ; 23 pmaddwd m0, m11 @@ -1902,9 +2404,9 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pmaddwd m5, m13 pmaddwd m6, m14 paddd m6, m5 - movu xm5, [srcq+ssq*0+8] - vinserti128 m5, [srcq+ssq*1+8], 1 - lea srcq, [srcq+ssq*2] + movu xm5, [r7+ssq*0+8] + vinserti128 m5, [r7+ssq*1+8], 1 + lea r7, [r7+ssq*2] pshufb m7, m5, m7 pshufb m5, m10 pmaddwd m10, m13, m7 @@ -1916,7 +2418,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my pmaddwd m5, m12 paddd m0, m7 paddd m5, m6 - vbroadcasti128 m6, [dstq] + vbroadcasti128 m6, [r8] paddd m8, m10 paddd m9, m10 paddd m0, m10 @@ -1938,36 +2440,512 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my packusdw m7, m9 pminsw m7, m15 vpermq m7, m7, q3120 - mova [dstq+dsq*0], xm7 - vextracti128 [dstq+dsq*1], m7, 1 - lea dstq, [dstq+dsq*2] + mova [r8+dsq*0], xm7 + vextracti128 [r8+dsq*1], m7, 1 + lea r8, [r8+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add srcq, 16 + add dstq, 16 + movzx hd, wb + sub wd, 1<<8 + jg .hv_w8_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my +%define base r7-prep_avx2 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 6tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 6tap_v, my, 4tap_v + lea r7, [prep_avx2] + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v +.prep: + 
tzcnt wd, wd + mov r6d, r7m ; bitdepth_max + movzx wd, word [r7+wq*2+table_offset(prep,)] + vpbroadcastd m5, [r7-prep_avx2+pw_8192] + shr r6d, 11 + add wq, r7 + vpbroadcastd m4, [base+prep_mul+r6*4] + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xm0, [base+subpel_filters+mxq*8] + vbroadcasti128 m3, [subpel_h_shufA] + lea r6, [ssq*3] + vbroadcasti128 m4, [subpel_h_shufB] + WIN64_SPILL_XMM 8 + pshufd xm0, xm0, q2211 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw xm0, 2 +.h_w4_12bpc: + vpbroadcastq m6, xm0 + vpermq m7, m0, q1111 +.h_w4_loop: + movu xm1, [srcq+ssq*0] + vinserti128 m1, [srcq+ssq*2], 1 + movu xm2, [srcq+ssq*1] + vinserti128 m2, [srcq+r6 *1], 1 + lea srcq, [srcq+ssq*4] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) + cmp wd, 4 + je .h_w4 + shr mxd, 16 + sub srcq, 4 + vpbroadcastq m0, [base+subpel_filters+1+mxq*8] + WIN64_SPILL_XMM 10 + vbroadcasti128 m6, [subpel_h_shufA] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .h_12bpc + psllw m0, 2 +.h_12bpc: + pshufd m7, m0, q0000 + pshufd m8, m0, q1111 + pshufd m9, m0, q2222 + cmp wd, 8 + jg .h_w16 +.h_w8: + movu xm0, [srcq+ssq*0+ 0] + vinserti128 m0, [srcq+ssq*1+ 0], 1 + movu xm2, [srcq+ssq*0+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + lea srcq, [srcq+ssq*2] + shufpd m1, m0, m2, 0x05 +%macro PREP_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%1, m6 ; 01 12 23 34 + pshufb m%2, m6 ; 45 56 67 78 + pmaddwd m%4, m7, m%1 ; a0 + pshufb m%3, m6 ; 89 9a ab bc + pmaddwd m%5, m9, m%2 ; a2 + shufpd m%1, m%2, 0x05 ; 23 34 45 56 + paddd m%4, 
m%5 ; a0+a2 + pmaddwd m%5, m7, m%2 ; b0 + shufpd m%2, m%3, 0x05 ; 67 78 89 9a + pmaddwd m%3, m9 ; b2 + pmaddwd m%1, m8 ; a1 + pmaddwd m%2, m8 ; b1 + paddd m%3, m%5 ; b0+b2 + paddd m%4, m5 + paddd m%3, m5 + paddd m%1, m%4 + paddd m%2, m%3 + psrad m%1, 4 + psrad m%2, 4 + packssdw m%1, m%2 +%endmacro + PREP_6TAP_H 0, 1, 2, 3, 4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + add wd, wd +.h_w16_loop0: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6-32] + movu m1, [srcq+r6-24] + movu m2, [srcq+r6-16] + PREP_6TAP_H 0, 1, 2, 3, 4 + mova [tmpq+r6-32], m0 + sub r6d, 32 + jg .h_w16_loop + add srcq, ssq + add tmpq, wq + dec hd + jg .h_w16_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [base+subpel_filters+1+myq*8] + WIN64_SPILL_XMM 9, 12 + vpbroadcastd m5, [prep_8tap_1d_rnd] + mov r6, ssq + punpcklbw m0, m0 + neg r6 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m0, 2 +.v_12bpc: + pshufd m6, m0, q0000 + pshufd m7, m0, q1111 + pshufd m8, m0, q2222 + cmp wd, 4 + jg .v_w8 +.v_w4: + movq xm1, [srcq+r6 *2] + vpbroadcastq m3, [srcq+r6 *1] + vpbroadcastq m2, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m1, m3, 0x30 + vpblendd m3, m2, 0x30 + punpcklwd m1, m3 ; 01 12 + vpblendd m2, m4, 0x30 + vpblendd m4, m0, 0x30 + punpcklwd m2, m4 ; 23 34 +.v_w4_loop: + vpbroadcastq m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m4, m6, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m7 ; a1 b1 + paddd m4, m2 + vpblendd m2, m0, m3, 0x30 + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m3, m0, 0x30 + punpcklwd m2, m3 ; 45 56 + pmaddwd m3, m8, m2 ; a2 b2 + paddd m4, m5 + paddd m4, m3 + psrad m4, 4 + vextracti128 xm3, m4, 1 + packssdw xm4, xm3 + mova [tmpq], xm4 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + WIN64_PUSH_XMM 12 +%if WIN64 + push r8 +%endif + mov r8d, wd + shl wd, 5 + lea wd, [hq+wq-256] +.v_w8_loop0: + vbroadcasti128 
m3, [srcq+r6 *2] + vbroadcasti128 m4, [srcq+r6 *1] + lea r5, [srcq+ssq*2] + vbroadcasti128 m0, [srcq+ssq*0] + vbroadcasti128 m1, [srcq+ssq*1] + mov r7, tmpq + vbroadcasti128 m2, [r5+ssq*0] + shufpd m3, m0, 0x0c + shufpd m4, m1, 0x0c + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + shufpd m0, m2, 0x0c + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.v_w8_loop: + vbroadcasti128 m9, [r5+ssq*1] + pmaddwd m10, m6, m1 ; a0 + lea r5, [r5+ssq*2] + pmaddwd m11, m6, m2 ; b0 + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + paddd m10, m5 + paddd m11, m5 + paddd m10, m3 + vbroadcasti128 m3, [r5+ssq*0] + paddd m11, m4 + shufpd m4, m0, m9, 0x0d + shufpd m0, m9, m3, 0x0c + punpcklwd m3, m4, m0 ; 45 + punpckhwd m4, m0 ; 56 + pmaddwd m9, m8, m3 ; a2 + paddd m10, m9 + pmaddwd m9, m8, m4 ; b2 + paddd m11, m9 + psrad m10, 4 + psrad m11, 4 + packssdw m10, m11 + vpermq m10, m10, q3120 + mova [r7+r8*0], xm10 + vextracti128 [r7+r8*2], m10, 1 + lea r7, [r7+r8*4] + sub hd, 2 + jg .v_w8_loop + add srcq, 16 + add tmpq, 16 + movzx hd, wb + sub wd, 1<<8 + jg .v_w8_loop0 +%if WIN64 + pop r8 +%endif + RET +.hv: + WIN64_SPILL_XMM 13, 15 + vpbroadcastd m7, [prep_8tap_2d_rnd] + vbroadcasti128 m8, [subpel_h_shufA] + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m1, [base+subpel_filters+1+myq*8] + mov r6, ssq + sub srcq, 2 + pxor m6, m6 + neg r6 + punpcklbw m6, m0 + punpcklbw m1, m1 + psraw m6, 4 + psraw m1, 8 + test dword r7m, 0x800 + jz .hv_w4_10bit + psraw m6, 2 +.hv_w4_10bit: + pshufd m10, m1, q0000 + pshufd m11, m1, q1111 + pshufd m12, m1, q2222 +.hv_w4: + movu xm2, [srcq+r6 *2] + vinserti128 m2, [srcq+r6 *1], 1 ; 0 1 + pshufd m5, m6, q0000 + vbroadcasti128 m9, [base+subpel_h_shufB] + movu xm0, [srcq+ssq*0] + pshufd m6, m6, q1111 + vinserti128 m0, [srcq+ssq*1], 1 ; 2 3 + lea srcq, [srcq+ssq*2] + movu xm3, [srcq+ssq*0] ; 4 + pshufb m1, m2, m8 + 
pmaddwd m1, m5 + pshufb m2, m9 + pmaddwd m2, m6 + pshufb m4, m0, m8 + pmaddwd m4, m5 + pshufb m0, m9 + pmaddwd m0, m6 + paddd m2, m1 + pshufb xm1, xm3, xm8 + pmaddwd xm1, xm5 + pshufb xm3, xm9 + pmaddwd xm3, xm6 + paddd m0, m4 + paddd m2, m7 + paddd xm1, xm7 + paddd m0, m7 + paddd xm3, xm1 + REPX {psrad x, 6}, m2, m0, xm3 + packssdw m2, m0 ; 0 2 1 3 + packssdw xm0, xm3 ; 2 4 + vperm2i128 m0, m2, 0x03 + punpcklwd m1, m2, m0 ; 01 12 + punpckhwd m2, m0 ; 23 34 +.hv_w4_loop: + movu xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 m3, [srcq+ssq*0], 1 + pmaddwd m4, m10, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m11 ; a1 b1 + paddd m4, m2 + pshufb m2, m3, m8 + pmaddwd m2, m5 + pshufb m3, m9 + pmaddwd m3, m6 + paddd m2, m7 + paddd m3, m2 + psrad m3, 6 + packssdw m3, m3 ; 5 5 6 6 + vperm2i128 m2, m0, m3, 0x21 + mova m0, m3 + punpckhwd m2, m3 ; 45 56 + pmaddwd m3, m12, m2 ; a2 b2 + paddd m4, m7 + paddd m4, m3 + psrad m4, 6 + vextracti128 xm3, m4, 1 + packssdw xm4, xm3 + mova [tmpq], xm4 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastq m2, [base+subpel_filters+1+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + pmovsxbw xm1, [base+subpel_filters+1+myq*8] + WIN64_PUSH_XMM 15 +%if WIN64 + PUSH r8 +%endif + mov r8d, wd + shl wd, 5 + mov r6, ssq + sub srcq, 4 + neg r6 + lea wd, [hq+wq-256] + pxor m0, m0 + punpcklbw m0, m2 + psraw m0, 4 + test dword r7m, 0x800 + jz .hv_w8_10bit + psraw m0, 2 +.hv_w8_10bit: + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + mova [v_mul], xm1 + pshufd m12, m0, q2222 +.hv_w8_loop0: + vbroadcasti128 m0, [srcq+ssq*0+ 0] + vinserti128 m3, m0, [srcq+r6*2+ 0], 0 + lea r5, [srcq+ssq*2] + vbroadcasti128 m2, [srcq+ssq*0+16] + vinserti128 m1, m2, [srcq+r6*2+16], 0 + mov r7, tmpq + vinserti128 m0, [r5 +ssq*0+ 0], 1 + vinserti128 m2, [r5 +ssq*0+16], 1 + shufpd m4, m3, m1, 0x05 +%macro PREP_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%1, m8 ; 01 12 23 34 + pshufb m%2, m8 ; 45 56 67 78 + 
pmaddwd m%4, m10, m%1 ; a0 + pshufb m%3, m8 ; 89 9a ab bc + pmaddwd m%5, m12, m%2 ; a2 + shufpd m%1, m%2, 0x05 ; 23 34 45 56 + paddd m%4, m%5 ; a0+a2 + pmaddwd m%5, m10, m%2 ; b0 + shufpd m%2, m%3, 0x05 ; 67 78 89 9a + pmaddwd m%3, m12 ; b2 + pmaddwd m%1, m11 ; a1 + pmaddwd m%2, m11 ; b1 + paddd m%3, m%5 ; b0+b2 + paddd m%4, m7 + paddd m%3, m7 + paddd m%1, m%4 + paddd m%2, m%3 + psrad m%1, 6 + psrad m%2, 6 + packssdw m%1, m%2 +%endmacro + PREP_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2 + movu xm4, [srcq+r6 *1+ 0] + vinserti128 m4, [srcq+ssq*1+ 0], 1 + shufpd m1, m0, m2, 0x05 + PREP_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4 + movu xm2, [srcq+r6 *1+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + shufpd m1, m4, m2, 0x05 + PREP_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m0, m0, q3120 + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w8_loop: + vpbroadcastd m14, [v_mul+4*0] + vpbroadcastd m9, [v_mul+4*1] + movu xm5, [r5+ssq*1+ 0] + movu xm6, [r5+ssq*1+16] + lea r5, [r5+ssq*2] + pmaddwd m13, m14, m1 ; a0 + pmaddwd m14, m2 ; b0 + vinserti128 m5, [r5+ssq*0+ 0], 1 + vinserti128 m6, [r5+ssq*0+16], 1 + mova m1, m3 + pmaddwd m3, m9 ; a1 + mova m2, m4 + pmaddwd m4, m9 ; b1 + paddd m13, m3 + shufpd m3, m5, m6, 0x05 + paddd m14, m4 + PREP_6TAP_HV_H 5, 3, 6, 4, 9 ; 5 6 + vpbroadcastd m6, [v_mul+4*2] + vpermq m5, m5, q3120 + shufpd m4, m0, m5, 0x05 + mova m0, m5 + punpcklwd m3, m4, m5 ; 45 + punpckhwd m4, m5 ; 56 + pmaddwd m5, m6, m3 ; a2 + pmaddwd m6, m4 ; b2 + paddd m13, m7 + paddd m14, m7 + paddd m5, m13 + paddd m6, m14 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + vpermq m5, m5, q3120 + mova [r7+r8*0], xm5 + vextracti128 [r7+r8*2], m5, 1 + lea r7, [r7+r8*4] sub hd, 2 jg .hv_w8_loop - add r7, 16 - add r8, 16 + add srcq, 16 + add tmpq, 16 movzx hd, wb - mov srcq, r7 - mov dstq, r8 sub wd, 1<<8 jg .hv_w8_loop0 - RET - %if WIN64 -DECLARE_REG_TMP 6, 4 -%else -DECLARE_REG_TMP 6, 7 + POP r8 %endif + RET -%define 
PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc +PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc +PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH -PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %define base r7-prep_avx2 @@ -1980,152 +2958,18 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my test mxd, 0xf00 jnz .h test myd, 0xf00 - jnz .v - tzcnt wd, wd - mov r6d, r7m ; bitdepth_max - movzx wd, word [r7+wq*2+table_offset(prep,)] - vpbroadcastd m5, [r7-prep_avx2+pw_8192] - shr r6d, 11 - add wq, r7 - vpbroadcastd m4, [base+prep_mul+r6*4] - lea r6, [strideq*3] -%if WIN64 - pop r7 -%endif - jmp wq -.h_w4: - movzx mxd, mxb - sub srcq, 2 - pmovsxbw xm0, [base+subpel_filters+mxq*8] - vbroadcasti128 m3, [subpel_h_shufA] - vbroadcasti128 m4, [subpel_h_shufB] - WIN64_SPILL_XMM 8 - pshufd xm0, xm0, q2211 - test dword r7m, 0x800 - jnz .h_w4_12bpc - psllw xm0, 2 -.h_w4_12bpc: - vpbroadcastq m6, xm0 - vpermq m7, m0, q1111 -.h_w4_loop: - movu xm1, [srcq+strideq*0] - vinserti128 m1, [srcq+strideq*2], 1 - movu xm2, [srcq+strideq*1] - vinserti128 m2, [srcq+r6 ], 1 - lea srcq, [srcq+strideq*4] - pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 - pshufb m1, m4 ; 2 3 3 4 4 5 5 6 - pmaddwd m0, m6 - pmaddwd m1, m7 - paddd m0, m5 - paddd m0, m1 - pshufb m1, m2, m3 - pshufb m2, m4 - pmaddwd m1, m6 - pmaddwd m2, m7 - paddd m1, m5 - paddd m1, m2 - psrad m0, 4 - psrad m1, 4 - packssdw m0, m1 - mova [tmpq], m0 - add tmpq, 32 - sub hd, 4 - jg .h_w4_loop - RET -.h: - test myd, 0xf00 - jnz 
.hv - vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) - lea r6, [strideq*3] - cmp wd, 4 - je .h_w4 - shr mxd, 16 - sub srcq, 6 - vpbroadcastq m0, [base+subpel_filters+mxq*8] - WIN64_SPILL_XMM 12 - vbroadcasti128 m6, [subpel_h_shufA] - vbroadcasti128 m7, [subpel_h_shufB] - punpcklbw m0, m0 - psraw m0, 8 ; sign-extend - test dword r7m, 0x800 - jnz .h_12bpc - psllw m0, 2 -.h_12bpc: - pshufd m8, m0, q0000 - pshufd m9, m0, q1111 - pshufd m10, m0, q2222 - pshufd m11, m0, q3333 - cmp wd, 8 - jg .h_w16 -.h_w8: -%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] - pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 - pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 - pmaddwd m%5, m9, m%4 ; abcd1 - pmaddwd m%1, m8 ; abcd0 - pshufb m%2, m7 ; 6 7 7 8 8 9 9 a - shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 - paddd m%5, m5 - paddd m%1, m%5 - pmaddwd m%5, m11, m%2 ; abcd3 - paddd m%1, m%5 - pmaddwd m%5, m10, m%4 ; abcd2 - pshufb m%3, m7 ; a b b c c d d e - pmaddwd m%4, m8 ; efgh0 - paddd m%1, m%5 - pmaddwd m%5, m9, m%2 ; efgh1 - shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c - pmaddwd m%3, m11 ; efgh3 - pmaddwd m%2, m10 ; efgh2 - paddd m%4, m5 - paddd m%4, m%5 - paddd m%3, m%4 - paddd m%2, m%3 - psrad m%1, 4 - psrad m%2, 4 - packssdw m%1, m%2 -%endmacro - movu xm0, [srcq+strideq*0+ 0] - vinserti128 m0, [srcq+strideq*1+ 0], 1 - movu xm2, [srcq+strideq*0+16] - vinserti128 m2, [srcq+strideq*1+16], 1 - lea srcq, [srcq+strideq*2] - shufpd m1, m0, m2, 0x05 - PREP_8TAP_H 0, 1, 2, 3, 4 - mova [tmpq], m0 - add tmpq, 32 - sub hd, 2 - jg .h_w8 - RET -.h_w16: - add wd, wd -.h_w16_loop0: - mov r6d, wd -.h_w16_loop: - movu m0, [srcq+r6-32] - movu m1, [srcq+r6-24] - movu m2, [srcq+r6-16] - PREP_8TAP_H 0, 1, 2, 3, 4 - mova [tmpq+r6-32], m0 - sub r6d, 32 - jg .h_w16_loop - add srcq, strideq - add tmpq, wq - dec hd - jg .h_w16_loop0 - RET + jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep .v: movzx mxd, myb shr myd, 16 cmp hd, 4 - cmovle myd, mxd + cmove myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] - 
WIN64_SPILL_XMM 15 + WIN64_SPILL_XMM 12, 15 vpbroadcastd m7, [prep_8tap_1d_rnd] lea r6, [strideq*3] - sub srcq, r6 punpcklbw m0, m0 + sub srcq, r6 psraw m0, 8 ; sign-extend test dword r7m, 0x800 jnz .v_12bpc @@ -2183,23 +3027,23 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my RET .v_w8: %if WIN64 + WIN64_PUSH_XMM 15 push r8 %endif mov r8d, wd shl wd, 5 - mov r5, srcq - mov r7, tmpq lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] + lea r5, [srcq+strideq*4] vbroadcasti128 m0, [srcq+r6 ] vbroadcasti128 m6, [srcq+strideq*2] - lea srcq, [srcq+strideq*4] - vbroadcasti128 m1, [srcq+strideq*0] - vbroadcasti128 m2, [srcq+strideq*1] - vbroadcasti128 m3, [srcq+strideq*2] - add srcq, r6 + mov r7, tmpq + vbroadcasti128 m1, [r5+strideq*0] + vbroadcasti128 m2, [r5+strideq*1] + vbroadcasti128 m3, [r5+strideq*2] + add r5, r6 shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklwd m1, m4, m5 ; 01 @@ -2211,7 +3055,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my punpcklwd m3, m6, m0 ; 23 punpckhwd m6, m0 ; 56 .v_w8_loop: - vbroadcasti128 m14, [srcq+strideq*0] + vbroadcasti128 m14, [r5+strideq*0] pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 @@ -2227,8 +3071,8 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 - vbroadcasti128 m5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] + vbroadcasti128 m5, [r5+strideq*1] + lea r5, [r5+strideq*2] paddd m13, m6 shufpd m6, m0, m14, 0x0d shufpd m0, m14, m5, 0x0c @@ -2242,22 +3086,101 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my psrad m13, 4 packssdw m12, m13 vpermq m12, m12, q3120 - mova [tmpq+r8*0], xm12 - vextracti128 [tmpq+r8*2], m12, 1 - lea tmpq, [tmpq+r8*4] + mova [r7+r8*0], xm12 + vextracti128 [r7+r8*2], m12, 1 + lea r7, [r7+r8*4] sub hd, 2 jg .v_w8_loop - add r5, 16 - add r7, 16 + add srcq, 16 + add tmpq, 16 movzx hd, wb - mov srcq, r5 - 
mov tmpq, r7 sub wd, 1<<8 jg .v_w8_loop0 %if WIN64 pop r8 %endif RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) + cmp wd, 4 + je mangle(private_prefix %+ _prep_6tap_16bpc_avx2).h_w4 + shr mxd, 16 + sub srcq, 6 + vpbroadcastq m0, [base+subpel_filters+mxq*8] + WIN64_SPILL_XMM 12 + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .h_12bpc + psllw m0, 2 +.h_12bpc: + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 8 + jg .h_w16 +.h_w8: +%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 + pmaddwd m%5, m9, m%4 ; abcd1 + pmaddwd m%1, m8 ; abcd0 + pshufb m%2, m7 ; 6 7 7 8 8 9 9 a + shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m%5, m5 + paddd m%1, m%5 + pmaddwd m%5, m11, m%2 ; abcd3 + paddd m%1, m%5 + pmaddwd m%5, m10, m%4 ; abcd2 + pshufb m%3, m7 ; a b b c c d d e + pmaddwd m%4, m8 ; efgh0 + paddd m%1, m%5 + pmaddwd m%5, m9, m%2 ; efgh1 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m11 ; efgh3 + pmaddwd m%2, m10 ; efgh2 + paddd m%4, m5 + paddd m%4, m%5 + paddd m%3, m%4 + paddd m%2, m%3 + psrad m%1, 4 + psrad m%2, 4 + packssdw m%1, m%2 +%endmacro + movu xm0, [srcq+strideq*0+ 0] + vinserti128 m0, [srcq+strideq*1+ 0], 1 + movu xm2, [srcq+strideq*0+16] + vinserti128 m2, [srcq+strideq*1+16], 1 + lea srcq, [srcq+strideq*2] + shufpd m1, m0, m2, 0x05 + PREP_8TAP_H 0, 1, 2, 3, 4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + add wd, wd +.h_w16_loop0: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6-32] + movu m1, [srcq+r6-24] + movu m2, [srcq+r6-16] + PREP_8TAP_H 0, 1, 2, 3, 4 + mova [tmpq+r6-32], m0 + sub r6d, 32 + jg .h_w16_loop + add srcq, strideq + add tmpq, wq + dec hd + jg .h_w16_loop0 + RET .hv: WIN64_SPILL_XMM 16 vpbroadcastd m15, [prep_8tap_2d_rnd] @@ 
-2268,12 +3191,12 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my movzx mxd, myb shr myd, 16 cmp hd, 4 - cmovle myd, mxd + cmove myd, mxd vpbroadcastq m1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 2 - sub srcq, r6 pxor m7, m7 + sub srcq, r6 punpcklbw m7, m0 punpcklbw m1, m1 psraw m7, 4 @@ -2375,7 +3298,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my movzx mxd, myb shr myd, 16 cmp hd, 4 - cmovle myd, mxd + cmove myd, mxd pmovsxbw xm1, [base+subpel_filters+myq*8] %if WIN64 PUSH r8 @@ -2385,12 +3308,9 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my lea r6, [strideq*3] sub srcq, 6 sub srcq, r6 - mov r5, srcq - mov r7, tmpq lea wd, [hq+wq-256] pxor m0, m0 punpcklbw m0, m2 - mova [v_mul], xm1 psraw m0, 4 test dword r7m, 0x800 jz .hv_w8_10bit @@ -2398,6 +3318,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my .hv_w8_10bit: pshufd m11, m0, q0000 pshufd m12, m0, q1111 + mova [v_mul], xm1 pshufd m13, m0, q2222 pshufd m14, m0, q3333 .hv_w8_loop0: @@ -2430,13 +3351,15 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %endmacro movu xm4, [srcq+r6 + 0] vbroadcasti128 m8, [subpel_h_shufA] + lea r5, [srcq+strideq*4] movu xm6, [srcq+r6 + 8] vbroadcasti128 m9, [subpel_h_shufB] + mov r7, tmpq movu xm0, [srcq+r6 +16] movu xm5, [srcq+strideq*0+ 0] - vinserti128 m5, [srcq+strideq*4+ 0], 1 + vinserti128 m5, [r5 +strideq*0+ 0], 1 movu xm1, [srcq+strideq*0+16] - vinserti128 m1, [srcq+strideq*4+16], 1 + vinserti128 m1, [r5 +strideq*0+16], 1 shufpd m7, m5, m1, 0x05 INIT_XMM avx2 PREP_8TAP_HV_H 4, 6, 0 ; 3 @@ -2450,10 +3373,9 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my PREP_8TAP_HV_H 0, 7, 1 ; 2 6 movu xm6, [srcq+strideq*1+ 0] movu xm1, [srcq+strideq*1+16] - lea srcq, [srcq+strideq*4] - vinserti128 m6, [srcq+strideq*1+ 0], 1 - vinserti128 m1, [srcq+strideq*1+16], 1 - add srcq, r6 + vinserti128 m6, [r5 +strideq*1+ 0], 1 + vinserti128 m1, [r5 +strideq*1+16], 
1 + add r5, r6 shufpd m7, m6, m1, 0x05 PREP_8TAP_HV_H 6, 7, 1 ; 1 5 vpermq m4, m4, q1100 @@ -2486,13 +3408,13 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my pmaddwd m6, m10 ; b2 paddd m8, m5 paddd m9, m6 - movu xm5, [srcq+strideq*0] - vinserti128 m5, [srcq+strideq*1], 1 + movu xm5, [r5+strideq*0] + vinserti128 m5, [r5+strideq*1], 1 vbroadcasti128 m7, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] - movu xm6, [srcq+strideq*0+16] - vinserti128 m6, [srcq+strideq*1+16], 1 - vextracti128 [tmpq], m0, 1 + movu xm6, [r5+strideq*0+16] + vinserti128 m6, [r5+strideq*1+16], 1 + vextracti128 [r7], m0, 1 pshufb m0, m5, m7 ; 01 pshufb m5, m10 ; 23 pmaddwd m0, m11 @@ -2505,9 +3427,9 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my pmaddwd m6, m14 paddd m5, m15 paddd m6, m5 - movu xm5, [srcq+strideq*0+8] - vinserti128 m5, [srcq+strideq*1+8], 1 - lea srcq, [srcq+strideq*2] + movu xm5, [r5+strideq*0+8] + vinserti128 m5, [r5+strideq*1+8], 1 + lea r5, [r5+strideq*2] pshufb m7, m5, m7 pshufb m5, m10 pmaddwd m10, m13, m7 @@ -2518,7 +3440,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my pmaddwd m5, m12 paddd m0, m7 paddd m5, m6 - vbroadcasti128 m6, [tmpq] + vbroadcasti128 m6, [r7] vpbroadcastd m10, [v_mul+4*3] psrad m0, 6 psrad m5, 6 @@ -2535,16 +3457,14 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my psrad m9, 6 packssdw m7, m9 vpermq m7, m7, q3120 - mova [tmpq+r8*0], xm7 - vextracti128 [tmpq+r8*2], m7, 1 - lea tmpq, [tmpq+r8*4] + mova [r7+r8*0], xm7 + vextracti128 [r7+r8*2], m7, 1 + lea r7, [r7+r8*4] sub hd, 2 jg .hv_w8_loop - add r5, 16 - add r7, 16 + add srcq, 16 + add tmpq, 16 movzx hd, wb - mov srcq, r5 - mov tmpq, r7 sub wd, 1<<8 jg .hv_w8_loop0 %if WIN64 @@ -4223,14 +5143,14 @@ DECLARE_REG_TMP 6, 8 %define PUT_8TAP_SCALED_FN FN put_8tap_scaled, BILIN_SCALED_FN put -PUT_8TAP_SCALED_FN sharp, SHARP, SHARP -PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, 
SHARP -PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_16bpc +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_16bpc PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put @@ -4242,14 +5162,14 @@ DECLARE_REG_TMP 6, 7 %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN prep -PREP_8TAP_SCALED_FN sharp, SHARP, SHARP -PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_16bpc +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_16bpc PREP_8TAP_SCALED_FN 
regular, REGULAR, REGULAR MC_8TAP_SCALED prep diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm index 58e3cb5af1..df8bebb1cb 100644 --- a/third_party/dav1d/src/x86/mc_avx2.asm +++ b/third_party/dav1d/src/x86/mc_avx2.asm @@ -60,15 +60,14 @@ subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 1 subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 -bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 wm_420_sign: dd 0x01020102, 0x01010101 wm_422_sign: dd 0x80808080, 0x7f7f7f7f @@ -95,6 +94,7 @@ pq_0x40000000: dq 0x40000000 cextern mc_subpel_filters cextern mc_warp_filter2 cextern resize_filter +cextern z_filter_s %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) @@ -184,7 +184,9 @@ BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 6tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 6tap, avx2, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE 
prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 @@ -298,7 +300,7 @@ INIT_YMM avx2 ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 255 - vbroadcasti128 m4, [bilin_h_shuf8] + vbroadcasti128 m4, [z_filter_s+2] add mxyd, 16 movd xm5, mxyd mov mxyd, r7m ; my @@ -900,7 +902,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 255 - vbroadcasti128 m4, [bilin_h_shuf8] + vbroadcasti128 m4, [z_filter_s+2] add mxyd, 16 movd xm5, mxyd mov mxyd, r6m ; my @@ -1436,7 +1438,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro FN 4 ; fn, type, type_h, type_v +%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 @@ -1444,8 +1446,8 @@ cglobal %1_%2_8bpc %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1456,28 +1458,24 @@ DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc PUT_8TAP_FN regular, REGULAR, REGULAR -cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 +cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, 
ns imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v lea r8, [put_avx2] - movsxd wq, wm + mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v +.put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 @@ -1487,36 +1485,18 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pop r8 %endif jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) - WIN64_SPILL_XMM 11 - cmp wd, 4 - jl .h_w2 - vbroadcasti128 m6, [subpel_h_shufA] - je .h_w4 - tzcnt wd, wd - vbroadcasti128 m7, [subpel_h_shufB] - vbroadcasti128 m8, [subpel_h_shufC] - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] - vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] - vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] - add wq, r8 - jmp wq .h_w2: movzx mxd, mxb - dec srcq - mova xm4, [subpel_h_shuf4] - vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] + lea srcq, [srcq-1] + vpbroadcastd xm4, [r8+mxq*8+subpel_filters-put_avx2+2] + je .h_w4 + mova xm3, [subpel_h_shuf4] .h_w2_loop: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pshufb xm0, xm4 - pmaddubsw xm0, xm3 + pshufb xm0, xm3 + pmaddubsw xm0, xm4 phaddw xm0, xm0 paddw xm0, xm5 psraw xm0, 6 @@ -1528,17 +1508,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_w2_loop RET .h_w4: - movzx mxd, mxb - dec srcq - vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] + mova xm3, [subpel_h_shufA] .h_w4_loop: movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pshufb xm0, xm6 - pshufb xm1, xm6 - pmaddubsw xm0, xm3 - pmaddubsw xm1, xm3 + pshufb xm0, xm3 + pshufb xm1, xm3 + pmaddubsw xm0, xm4 + pmaddubsw xm1, xm4 phaddw xm0, xm1 paddw xm0, xm5 psraw xm0, 6 @@ -1549,25 +1527,43 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, 
w, h, mx, my, ss3 sub hd, 2 jg .h_w4_loop RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) + cmp wd, 4 + jle .h_w2 + WIN64_SPILL_XMM 11 + tzcnt wd, wd + vbroadcasti128 m4, [z_filter_s+ 2] ; 01 + shr mxd, 16 + vbroadcasti128 m6, [z_filter_s+ 6] ; 23 + sub srcq, 2 + vbroadcasti128 m7, [z_filter_s+10] ; 45 + lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2] + movzx wd, word [r8+wq*2+table_offset(put, _6tap_h)] + vpbroadcastw m8, [mxq+0] + vpbroadcastw m9, [mxq+2] + add wq, r8 + vpbroadcastw m10, [mxq+4] + jmp wq .h_w8: -%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] - pshufb m%2, m%1, m7 - pshufb m%3, m%1, m8 - pshufb m%1, m6 - pmaddubsw m%4, m%2, m9 - pmaddubsw m%2, m10 - pmaddubsw m%3, m10 - pmaddubsw m%1, m9 - paddw m%3, m%4 +%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2] + pshufb m%2, m%1, m4 + pmaddubsw m%2, m8 + pshufb m%3, m%1, m6 + pmaddubsw m%3, m9 + pshufb m%1, m7 + pmaddubsw m%1, m10 + paddw m%2, m5 + paddw m%1, m%3 paddw m%1, m%2 - phaddw m%1, m%3 - paddw m%1, m5 psraw m%1, 6 %endmacro movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] - PUT_8TAP_H 0, 1, 2, 3 + PUT_6TAP_H 0, 1, 2 vextracti128 xm1, m0, 1 packuswb xm0, xm1 movq [dstq+dsq*0], xm0 @@ -1581,9 +1577,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vinserti128 m0, [srcq+ssq*1+8*0], 1 movu xm1, [srcq+ssq*0+8*1] vinserti128 m1, [srcq+ssq*1+8*1], 1 - PUT_8TAP_H 0, 2, 3, 4 + PUT_6TAP_H 0, 2, 3 lea srcq, [srcq+ssq*2] - PUT_8TAP_H 1, 2, 3, 4 + PUT_6TAP_H 1, 2, 3 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 @@ -1606,8 +1602,8 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 .h_loop: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] - PUT_8TAP_H 0, 2, 3, 4 - PUT_8TAP_H 1, 2, 3, 4 + PUT_6TAP_H 0, 2, 3 + PUT_6TAP_H 1, 2, 3 packuswb m0, m1 mova [dstq+r6], m0 add r6, 32 @@ -1619,7 +1615,421 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_loop RET .v: - WIN64_SPILL_XMM 16 + 
WIN64_SPILL_XMM 9, 12 + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + tzcnt r6d, wd + movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)] + vpbroadcastd m8, [pw_512] + lea myq, [r8+myq*8+subpel_filters+1-put_avx2] + vpbroadcastw m5, [myq+0] + vpbroadcastw m6, [myq+2] + vpbroadcastw m7, [myq+4] + add r6, r8 + mov nsq, ssq + neg nsq + jmp r6 +.v_w2: + movd xm2, [srcq+nsq*2] + pinsrw xm2, [srcq+nsq*1], 2 + pinsrw xm2, [srcq+ssq*0], 4 + pinsrw xm2, [srcq+ssq*1], 6 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] + vpbroadcastd xm0, [srcq+ssq*0] + palignr xm3, xm0, xm2, 4 ; 1 2 3 4 + punpcklbw xm1, xm2, xm3 ; 01 12 + punpckhbw xm2, xm3 ; 23 34 +.v_w2_loop: + vpbroadcastd xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xm3, xm1, xm5 ; a0 b0 + mova xm1, xm2 + pmaddubsw xm2, xm6 ; a1 b1 + paddw xm3, xm2 + vpblendd xm2, xm0, xm4, 0x02 ; 4 5 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm4, xm0, 0x02 ; 5 6 + punpcklbw xm2, xm4 ; 67 78 + pmaddubsw xm4, xm2, xm7 ; a3 b3 + paddw xm3, xm4 + pmulhrsw xm3, xm8 + packuswb xm3, xm3 + pextrw [dstq+dsq*0], xm3, 0 + pextrw [dstq+dsq*1], xm3, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xm2, [srcq+nsq*2] + pinsrd xm2, [srcq+nsq*1], 1 + pinsrd xm2, [srcq+ssq*0], 2 + pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] + vpbroadcastd xm0, [srcq+ssq*0] + palignr xm3, xm0, xm2, 4 ; 1 2 3 4 + punpcklbw xm1, xm2, xm3 ; 01 12 + punpckhbw xm2, xm3 ; 23 34 +.v_w4_loop: + vpbroadcastd xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xm3, xm1, xm5 ; a0 b0 + mova xm1, xm2 + pmaddubsw xm2, xm6 ; a1 b1 + paddw xm3, xm2 + vpblendd xm2, xm0, xm4, 0x02 ; 4 5 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm4, xm0, 0x02 ; 5 6 + punpcklbw xm2, xm4 ; 45 56 + pmaddubsw xm4, xm2, xm7 ; a2 b2 + paddw xm3, xm4 + pmulhrsw xm3, xm8 + packuswb xm3, xm3 + movd [dstq+dsq*0], xm3 + pextrd [dstq+dsq*1], xm3, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xm1, [srcq+nsq*2] + vpbroadcastq 
m3, [srcq+nsq*1] + vpbroadcastq m2, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m1, m3, 0x30 + vpblendd m3, m2, 0x30 + punpcklbw m1, m3 ; 01 12 + vpblendd m2, m4, 0x30 + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 23 34 +.v_w8_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m3, m1, m5 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m6 ; a1 b1 + paddw m3, m2 + vpblendd m2, m0, m4, 0x30 + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 45 56 + pmaddubsw m4, m2, m7 ; a2 b2 + paddw m3, m4 + pmulhrsw m3, m8 + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + movq [dstq+dsq*0], xm3 + movhps [dstq+dsq*1], xm3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: +.v_w32: +.v_w64: +.v_w128: + lea r6d, [wq*8-128] + WIN64_PUSH_XMM 12 + lea r6d, [hq+r6*2] +.v_w16_loop0: + vbroadcasti128 m3, [srcq+nsq*2] + vbroadcasti128 m4, [srcq+nsq*1] + lea r4, [srcq+ssq*2] + vbroadcasti128 m0, [srcq+ssq*0] + vbroadcasti128 m1, [srcq+ssq*1] + mov r7, dstq + vbroadcasti128 m2, [r4+ssq*0] + shufpd m3, m0, 0x0c + shufpd m4, m1, 0x0c + punpcklbw m1, m3, m4 ; 01 + punpckhbw m3, m4 ; 23 + shufpd m0, m2, 0x0c + punpcklbw m2, m4, m0 ; 12 + punpckhbw m4, m0 ; 34 +.v_w16_loop: + vbroadcasti128 m9, [r4+ssq*1] + pmaddubsw m10, m1, m5 ; a0 + lea r4, [r4+ssq*2] + pmaddubsw m11, m2, m5 ; b0 + mova m1, m3 + pmaddubsw m3, m6 ; a1 + mova m2, m4 + pmaddubsw m4, m6 ; b1 + paddw m10, m3 + vbroadcasti128 m3, [r4+ssq*0] + paddw m11, m4 + shufpd m4, m0, m9, 0x0d + shufpd m0, m9, m3, 0x0c + punpcklbw m3, m4, m0 ; 45 + punpckhbw m4, m0 ; 56 + pmaddubsw m9, m3, m7 ; a2 + paddw m10, m9 + pmaddubsw m9, m4, m7 ; b2 + paddw m11, m9 + pmulhrsw m10, m8 + pmulhrsw m11, m8 + packuswb m10, m11 + vpermq m10, m10, q3120 + mova [r7+dsq*0], xm10 + vextracti128 [r7+dsq*1], m10, 1 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .v_w16_loop + add srcq, 16 + add dstq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_w16_loop0 + 
RET +.hv: + WIN64_SPILL_XMM 12, 16 + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + dec srcq + vpbroadcastd m6, [r8+mxq*8+subpel_filters-put_avx2+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2] + vpbroadcastd m7, [pw_8192] + punpcklbw m0, m0 + vpbroadcastd m8, [pd_512] + psraw m0, 8 ; sign-extend + mov nsq, ssq + pshufd m9, m0, q0000 + neg nsq + pshufd m10, m0, q1111 + pshufd m11, m0, q2222 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 m5, [subpel_h_shuf4] + movq xm2, [srcq+nsq*2] + movhps xm2, [srcq+nsq*1] + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m1, [srcq+ssq*0] + vpblendd m2, m1, 0x30 + pshufb m2, m5 + pshufb xm0, xm5 + pmaddubsw m2, m6 + pmaddubsw xm0, xm6 + phaddw m2, m0 + pmulhrsw m2, m7 + vextracti128 xm0, m2, 1 + palignr xm0, xm2, 4 + punpcklwd xm1, xm2, xm0 ; 01 12 + punpckhwd xm2, xm0 ; 23 34 +.hv_w2_loop: + movq xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm4, [srcq+ssq*0] + pshufb xm4, xm5 + pmaddubsw xm4, xm6 + pmaddwd xm3, xm9, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm10 ; a1 b1 + phaddw xm4, xm4 + paddd xm3, xm2 + pmulhrsw xm4, xm7 + palignr xm2, xm4, xm0, 12 + mova xm0, xm4 + punpcklwd xm2, xm4 ; 45 56 + pmaddwd xm4, xm11, xm2 ; a2 b2 + paddd xm3, xm8 + paddd xm3, xm4 + psrad xm3, 10 + packssdw xm3, xm3 + packuswb xm3, xm3 + pextrw [dstq+dsq*0], xm3, 0 + pextrw [dstq+dsq*1], xm3, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova m5, [subpel_h_shuf4] + vpbroadcastq m2, [srcq+nsq*2] + vpbroadcastq m4, [srcq+nsq*1] + vpbroadcastq m1, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m2, m4, 0xcc ; 0 1 + vpblendd m1, m3, 0xcc ; 2 3 + pshufb m2, m5 + pshufb m1, m5 + pshufb m0, m5 + pmaddubsw m2, m6 + pmaddubsw m1, m6 + pmaddubsw m0, m6 + phaddw m2, m1 + phaddw m0, m0 + pmulhrsw m2, m7 + pmulhrsw m0, m7 + palignr m3, m0, m2, 4 + punpcklwd m1, 
m2, m3 ; 01 12 + punpckhwd m2, m3 ; 23 34 +.hv_w4_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m3, m9, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m10 ; a1 b1 + paddd m3, m2 + vpbroadcastq m2, [srcq+ssq*0] + vpblendd m4, m2, 0xcc ; 5 6 + pshufb m4, m5 + pmaddubsw m4, m6 + phaddw m4, m4 + pmulhrsw m4, m7 + palignr m2, m4, m0, 12 + mova m0, m4 + punpcklwd m2, m4 ; 45 56 + pmaddwd m4, m11, m2 ; a2 b2 + paddd m3, m8 + paddd m3, m4 + psrad m3, 10 + vextracti128 xm4, m3, 1 + packssdw xm3, xm4 + packuswb xm3, xm3 + pshuflw xm3, xm3, q3120 + movd [dstq+dsq*0], xm3 + pextrd [dstq+dsq*1], xm3, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + sub srcq, 2 + lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2] + WIN64_PUSH_XMM 16 + vpbroadcastw m10, [mxq+0] + vpbroadcastw m11, [mxq+2] + vpbroadcastw m12, [mxq+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2] + lea r6d, [wq*8-64] + vbroadcasti128 m8, [z_filter_s+ 6] + punpcklbw m0, m0 + vbroadcasti128 m9, [z_filter_s+10] + psraw m0, 8 ; sign-extend + mov nsq, ssq + pshufd m13, m0, q0000 + neg nsq + pshufd m14, m0, q1111 + lea r6d, [hq+r6*4] + pshufd m15, m0, q2222 +.hv_w8_loop0: + vbroadcasti128 m7, [z_filter_s+2] + movu xm3, [srcq+nsq*2] + lea r4, [srcq+ssq*2] + movu xm4, [srcq+nsq*1] + vbroadcasti128 m0, [srcq+ssq*0] + mov r7, dstq + vinserti128 m4, [srcq+ssq*1], 1 ; 1 3 + vpblendd m3, m0, 0xf0 ; 0 2 + vinserti128 m0, [r4+ssq*0], 1 ; 2 4 + vpbroadcastd m5, [pw_8192] +%macro HV_H_6TAP_W8 6 ; src/dst, tmp[1-2], shuf[1-3] + pshufb %2, %1, %4 + pmaddubsw %2, m10 + pshufb %3, %1, %5 + pmaddubsw %3, m11 + pshufb %1, %6 + pmaddubsw %1, m12 + paddw %2, %3 + paddw %1, %2 +%endmacro + HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9 + HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9 + HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m0, m0, q3120 + pmulhrsw m3, m5 + pmulhrsw m4, m5 + pmulhrsw m0, m5 
+ punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w8_loop: + movu xm7, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti128 m7, [r4+ssq*0], 1 ; 5 6 + pmaddwd m5, m13, m1 ; a0 + mova m1, m3 + pmaddwd m6, m13, m2 ; b0 + mova m2, m4 + pmaddwd m3, m14 ; a1 + pmaddwd m4, m14 ; b1 + paddd m5, m3 + vbroadcasti128 m3, [z_filter_s+2] + paddd m6, m4 + HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9 + vpbroadcastd m3, [pw_8192] + vpbroadcastd m4, [pd_512] + pmulhrsw m7, m3 + paddd m5, m4 + paddd m6, m4 + mova m4, m0 + vpermq m0, m7, q3120 + shufpd m4, m0, 0x05 + punpcklwd m3, m4, m0 ; 45 + pmaddwd m7, m15, m3 ; a2 + punpckhwd m4, m0 ; 67 + paddd m5, m7 + pmaddwd m7, m15, m4 ; b2 + paddd m6, m7 + psrad m5, 10 + psrad m6, 10 + packssdw m5, m6 + vextracti128 xm6, m5, 1 + packuswb xm5, xm6 + pshufd xm5, xm5, q3120 + movq [r7+dsq*0], xm5 + movhps [r7+dsq*1], xm5 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add srcq, 8 + add dstq, 8 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w8_loop0 + RET + +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_8bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_8bpc +PUT_8TAP_FN sharp, SHARP, SHARP + +cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx2] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put +.v: + WIN64_SPILL_XMM 12, 15 movzx mxd, myb shr myd, 16 cmp hd, 6 @@ -1765,19 +2175,19 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 .v_w64: .v_w128: lea r6d, [wq*8-128] - mov r4, srcq - mov r7, dstq + WIN64_PUSH_XMM 15 lea r6d, [hq+r6*2] .v_w16_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] + lea r4, 
[srcq+ss3q] vbroadcasti128 m6, [srcq+ssq*2] - add srcq, ss3q - vbroadcasti128 m0, [srcq+ssq*0] - vbroadcasti128 m1, [srcq+ssq*1] - vbroadcasti128 m2, [srcq+ssq*2] - add srcq, ss3q - vbroadcasti128 m3, [srcq+ssq*0] + vbroadcasti128 m0, [r4+ssq*0] + mov r7, dstq + vbroadcasti128 m1, [r4+ssq*1] + vbroadcasti128 m2, [r4+ssq*2] + add r4, ss3q + vbroadcasti128 m3, [r4+ssq*0] shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 @@ -1789,51 +2199,138 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: - vbroadcasti128 m12, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vbroadcasti128 m13, [srcq+ssq*0] - pmaddubsw m14, m1, m8 ; a0 - pmaddubsw m15, m2, m8 ; b0 + vbroadcasti128 m12, [r4+ssq*1] + lea r4, [r4+ssq*2] + pmaddubsw m13, m1, m8 ; a0 + pmaddubsw m14, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 - paddw m14, m3 - paddw m15, m4 + paddw m13, m3 + paddw m14, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, m10 ; a2 pmaddubsw m6, m10 ; b2 - paddw m14, m5 - paddw m15, m6 + paddw m13, m5 + vbroadcasti128 m5, [r4+ssq*0] + paddw m14, m6 shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c + shufpd m0, m12, m5, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 - pmaddubsw m13, m6, m11 ; b3 + paddw m13, m12 + pmaddubsw m12, m6, m11 ; b3 paddw m14, m12 - paddw m15, m13 + pmulhrsw m13, m7 pmulhrsw m14, m7 - pmulhrsw m15, m7 - packuswb m14, m15 - vpermq m14, m14, q3120 - mova [dstq+dsq*0], xm14 - vextracti128 [dstq+dsq*1], m14, 1 - lea dstq, [dstq+dsq*2] + packuswb m13, m14 + vpermq m13, m13, q3120 + mova [r7+dsq*0], xm13 + vextracti128 [r7+dsq*1], m13, 1 + lea r7, [r7+dsq*2] sub hd, 2 jg .v_w16_loop - add r4, 16 - add r7, 16 + add srcq, 16 + add dstq, 16 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET -.hv: - WIN64_SPILL_XMM 16 - cmp wd, 4 +.h: +.h_w2: +.h_w4: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, 
[pw_34] ; 2 + (8 << 2) + cmp wd, 4 + jle mangle(private_prefix %+ _put_6tap_8bpc_avx2).h_w2 + WIN64_SPILL_XMM 11 + tzcnt wd, wd + vbroadcasti128 m6, [subpel_h_shufA] + shr mxd, 16 + vbroadcasti128 m7, [subpel_h_shufB] + sub srcq, 3 + vbroadcasti128 m8, [subpel_h_shufC] + movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] + vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] + vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] + add wq, r8 + jmp wq +.h_w8: +%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] + pshufb m%2, m%1, m7 + pshufb m%3, m%1, m8 + pshufb m%1, m6 + pmaddubsw m%4, m%2, m9 + pmaddubsw m%2, m10 + pmaddubsw m%3, m10 + pmaddubsw m%1, m9 + paddw m%3, m%4 + paddw m%1, m%2 + phaddw m%1, m%3 + paddw m%1, m5 + psraw m%1, 6 +%endmacro + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*1+8*0], 1 + movu xm1, [srcq+ssq*0+8*1] + vinserti128 m1, [srcq+ssq*1+8*1], 1 + PUT_8TAP_H 0, 2, 3, 4 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 1, 2, 3, 4 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + xor r6d, r6d + jmp .h_start +.h_w64: + mov r6, -32*1 + jmp .h_start +.h_w128: + mov r6, -32*3 +.h_start: + sub srcq, r6 + sub dstq, r6 + mov r4, r6 +.h_loop: + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 2, 3, 4 + packuswb m0, m1 + mova [dstq+r6], m0 + add r6, 32 + jle .h_loop + add srcq, ssq + add dstq, dsq + mov r6, r4 + dec hd + jg .h_loop + RET +.hv: + WIN64_SPILL_XMM 14, 16 + cmp wd, 4 jg .hv_w8 movzx mxd, mxb dec srcq @@ -1975,6 +2472,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .hv_w4_loop RET .hv_w8: + WIN64_PUSH_XMM 16 shr mxd, 16 sub 
srcq, 3 vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0] @@ -1993,24 +2491,23 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pshufd m14, m0, q2222 pshufd m15, m0, q3333 lea r6d, [wq*8-64] - mov r4, srcq - mov r7, dstq lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+ssq*0] + lea r4, [srcq+ss3q] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+ssq*1] + mov r7, dstq vbroadcasti128 m9, [subpel_h_shufC] movu xm6, [srcq+ssq*2] - add srcq, ss3q - vbroadcasti128 m0, [srcq+ssq*0] - vpblendd m4, m0, 0xf0 ; 0 3 - vinserti128 m5, [srcq+ssq*1], 1 ; 1 4 - vinserti128 m6, [srcq+ssq*2], 1 ; 2 5 - add srcq, ss3q - vinserti128 m0, [srcq+ssq*0], 1 ; 3 6 -%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] + vbroadcasti128 m0, [r4+ssq*0] + vpblendd m4, m0, 0xf0 ; 0 3 + vinserti128 m5, [r4+ssq*1], 1 ; 1 4 + vinserti128 m6, [r4+ssq*2], 1 ; 2 5 + add r4, ss3q + vinserti128 m0, [r4+ssq*0], 1 ; 3 6 +%macro HV_H_8TAP_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] pshufb %3, %1, %6 pshufb %4, %1, %7 pshufb %1, %5 @@ -2022,10 +2519,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 paddw %1, %3 phaddw %1, %2 %endmacro - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 - HV_H_W8 m0, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 @@ -2043,9 +2540,9 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 punpckhwd m6, m7 ; 56 .hv_w8_loop: vextracti128 r6m, m0, 1 ; not enough registers - movu xm0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vinserti128 m0, [srcq+ssq*0], 1 ; 7 8 + movu xm0, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti128 m0, [r4+ssq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 @@ -2063,15 
+2560,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] - HV_H_W8 m0, m5, m6, m7, m5, m6, m7 + HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_512] vbroadcasti128 m6, r6m pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 - vpermq m7, m0, q3120 ; 7 8 - shufpd m6, m6, m7, 0x04 ; 6 7 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 @@ -2084,34 +2581,18 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vextracti128 xm7, m8, 1 packuswb xm8, xm7 pshufd xm7, xm8, q3120 - movq [dstq+dsq*0], xm7 - movhps [dstq+dsq*1], xm7 - lea dstq, [dstq+dsq*2] + movq [r7+dsq*0], xm7 + movhps [r7+dsq*1], xm7 + lea r7, [r7+dsq*2] sub hd, 2 jg .hv_w8_loop - add r4, 8 - add r7, 8 + add srcq, 8 + add dstq, 8 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET -%macro PREP_8TAP_H 0 - pshufb m1, m0, m5 - pshufb m2, m0, m6 - pshufb m3, m0, m7 - pmaddubsw m1, m8 - pmaddubsw m0, m2, m8 - pmaddubsw m2, m9 - pmaddubsw m3, m9 - paddw m1, m2 - paddw m0, m3 - phaddw m0, m1, m0 - pmulhrsw m0, m4 -%endmacro - %if WIN64 DECLARE_REG_TMP 6, 4 %else @@ -2119,71 +2600,197 @@ DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, -PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc PREP_8TAP_FN regular, REGULAR, REGULAR -cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, 
stride, w, h, mx, my, stride3 +cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my, ns imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v lea r7, [prep%+SUFFIX] - movsxd wq, wm + mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v +.prep: tzcnt wd, wd movzx wd, word [r7+wq*2+table_offset(prep,)] add wq, r7 - lea r6, [strideq*3] + lea r6, [ssq*3] %if WIN64 pop r7 %endif jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m4, [pw_8192] - vbroadcasti128 m5, [subpel_h_shufA] - WIN64_SPILL_XMM 10 - cmp wd, 4 - je .h_w4 - tzcnt wd, wd - vbroadcasti128 m6, [subpel_h_shufB] - vbroadcasti128 m7, [subpel_h_shufC] - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] - vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] - add wq, r7 - jmp wq +.v: + WIN64_SPILL_XMM 10, 12 + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + lea myq, [r7+myq*8+subpel_filters+1-prep%+SUFFIX] + vpbroadcastd m9, [pw_8192] + vpbroadcastw m6, [myq+0] + mov nsq, ssq + vpbroadcastw m7, [myq+2] + neg nsq + vpbroadcastw m8, [myq+4] + cmp wd, 8 + jg .v_w16 + je .v_w8 +.v_w4: + movd xm2, [srcq+nsq*2] + pinsrd xm2, [srcq+nsq*1], 1 + vpbroadcastd m1, [srcq+ssq*0] + vpbroadcastd m3, [srcq+ssq*1] + vpbroadcastd m0, [srcq+ssq*2] + vbroadcasti128 m5, [deint_shuf4] + vpblendd m1, m2, 0xeb + punpcklqdq m3, m0 + vpblendd m1, m3, 0x60 ; 0 1 2 _ 2 3 4 _ + pshufb m1, m5 ; 01 12 23 34 +.v_w4_loop: + lea srcq, [srcq+ssq*4] + pinsrd xm0, [srcq+nsq*1], 1 + vpbroadcastd m2, [srcq+ssq*0] + vpbroadcastd m3, [srcq+ssq*1] + vpblendd m2, m0, 0xeb + vpbroadcastd m0, [srcq+ssq*2] + punpcklqdq m3, m0 + vpblendd m2, m3, 0x60 ; 4 5 6 _ 6 7 8 _ + pshufb m2, m5 ; 45 56 67 78 + pmaddubsw m3, m1, m6 ; a0 b0 c0 d0 + vperm2i128 m1, m2, 0x21 ; 23 34 45 56 + 
pmaddubsw m4, m2, m8 ; a2 b2 c2 d2 + pmaddubsw m1, m7 ; a1 b1 c1 d1 + paddw m3, m4 + paddw m3, m1 + pmulhrsw m3, m9 + mova m1, m2 + mova [tmpq], m3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movq xm1, [srcq+nsq*2] + vpbroadcastq m3, [srcq+nsq*1] + vpbroadcastq m2, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + vpbroadcastq m0, [srcq+ssq*2] + vpblendd m1, m3, 0x30 + vpblendd m3, m2, 0x30 + punpcklbw m1, m3 ; 01 12 + vpblendd m2, m4, 0x30 + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 23 34 +.v_w8_loop: + lea srcq, [srcq+ssq*4] + pmaddubsw m1, m6 ; a0 + vpbroadcastq m3, [srcq+nsq*1] + pmaddubsw m4, m2, m7 ; a1 + pmaddubsw m5, m2, m6 ; b0 + vpbroadcastq m2, [srcq+ssq*0] + vpblendd m0, m3, 0x30 + vpblendd m3, m2, 0x30 + paddw m4, m1 + punpcklbw m1, m0, m3 ; 45 56 + vpbroadcastq m3, [srcq+ssq*1] + vpbroadcastq m0, [srcq+ssq*2] + vpblendd m2, m3, 0x30 + vpblendd m3, m0, 0x30 + punpcklbw m2, m3 ; 67 78 + pmaddubsw m3, m1, m7 ; b1 + paddw m5, m3 + pmaddubsw m3, m1, m8 ; a2 + paddw m4, m3 + pmaddubsw m3, m2, m8 ; b2 + paddw m5, m3 + pmulhrsw m4, m9 + pmulhrsw m5, m9 + mova [tmpq+32*0], m4 + mova [tmpq+32*1], m5 + add tmpq, 32*2 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + lea r6d, [wq*2-32] + lea srcq, [srcq+nsq*2] + WIN64_PUSH_XMM 12 + lea r6d, [hq+r6*8] +.v_w16_loop0: + vbroadcasti128 m3, [srcq+ssq*0] + lea r5, [srcq+ssq*2] + vbroadcasti128 m4, [srcq+ssq*1] + mov r7, tmpq + vbroadcasti128 m0, [r5+ssq*0] + vbroadcasti128 m1, [r5+ssq*1] + lea r5, [r5+ssq*2] + vbroadcasti128 m2, [r5+ssq*0] + shufpd m3, m0, 0x0c + shufpd m4, m1, 0x0c + punpcklbw m1, m3, m4 ; 01 + punpckhbw m3, m4 ; 23 + shufpd m0, m2, 0x0c + punpcklbw m2, m4, m0 ; 12 + punpckhbw m4, m0 ; 34 +.v_w16_loop: + vbroadcasti128 m5, [r5+ssq*1] + pmaddubsw m10, m1, m6 ; a0 + lea r5, [r5+ssq*2] + pmaddubsw m11, m2, m6 ; b0 + mova m1, m3 + pmaddubsw m3, m7 ; a1 + mova m2, m4 + pmaddubsw m4, m7 ; b1 + paddw m10, m3 + vbroadcasti128 m3, [r5+ssq*0] + paddw m11, m4 + shufpd m4, m0, m5, 0x0d + shufpd m0, m5, 
m3, 0x0c + punpcklbw m3, m4, m0 ; 45 + punpckhbw m4, m0 ; 56 + pmaddubsw m5, m3, m8 ; a2 + paddw m10, m5 + pmaddubsw m5, m4, m8 ; b2 + paddw m11, m5 + pmulhrsw m10, m9 + pmulhrsw m11, m9 + mova [r7+wq*0], m10 + mova [r7+wq*2], m11 + lea r7, [r7+wq*4] + sub hd, 2 + jg .v_w16_loop + add srcq, 16 + add tmpq, 32 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_w16_loop0 + RET .h_w4: + RESET_STACK_STATE movzx mxd, mxb + vbroadcasti128 m3, [subpel_h_shufA] dec srcq - vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] - lea stride3q, [strideq*3] + vpbroadcastd m5, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] + lea r3, [ssq*3] .h_w4_loop: - movq xm0, [srcq+strideq*0] - vpbroadcastq m2, [srcq+strideq*2] - movq xm1, [srcq+strideq*1] - vpblendd m0, m2, 0xf0 - vpbroadcastq m2, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpblendd m1, m2, 0xf0 - pshufb m0, m5 - pshufb m1, m5 - pmaddubsw m0, m6 - pmaddubsw m1, m6 + movq xm0, [srcq+ssq*0] + vpbroadcastq m2, [srcq+ssq*2] + movq xm1, [srcq+ssq*1] + vpblendd m0, m2, 0x30 + vpbroadcastq m2, [srcq+r3 ] + lea srcq, [srcq+ssq*4] + vpblendd m1, m2, 0x30 + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m5 + pmaddubsw m1, m5 phaddw m0, m1 pmulhrsw m0, m4 mova [tmpq], m0 @@ -2191,25 +2798,56 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 sub hd, 4 jg .h_w4_loop RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pw_8192] + cmp wd, 4 + je .h_w4 + WIN64_SPILL_XMM 10 + tzcnt wd, wd + vbroadcasti128 m3, [z_filter_s+ 2] + shr mxd, 16 + vbroadcasti128 m5, [z_filter_s+ 6] + sub srcq, 2 + vbroadcasti128 m6, [z_filter_s+10] + lea mxq, [r7+mxq*8+subpel_filters+1-prep%+SUFFIX] + movzx wd, word [r7+wq*2+table_offset(prep, _6tap_h)] + vpbroadcastw m7, [mxq+0] + vpbroadcastw m8, [mxq+2] + add wq, r7 + vpbroadcastw m9, [mxq+4] + jmp wq .h_w8: - movu xm0, [srcq+strideq*0] - vinserti128 m0, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - PREP_8TAP_H + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + lea srcq, 
[srcq+ssq*2] +%macro PREP_6TAP_H 0 + pshufb m1, m0, m3 + pmaddubsw m1, m7 + pshufb m2, m0, m5 + pmaddubsw m2, m8 + pshufb m0, m6 + pmaddubsw m0, m9 + paddw m1, m2 + paddw m0, m1 + pmulhrsw m0, m4 +%endmacro + PREP_6TAP_H mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: - movu xm0, [srcq+strideq*0+8*0] - vinserti128 m0, [srcq+strideq*0+8*1], 1 - PREP_8TAP_H + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*0+8*1], 1 + PREP_6TAP_H mova [tmpq+32*0], m0 - movu xm0, [srcq+strideq*1+8*0] - vinserti128 m0, [srcq+strideq*1+8*1], 1 - lea srcq, [srcq+strideq*2] - PREP_8TAP_H + movu xm0, [srcq+ssq*1+8*0] + vinserti128 m0, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + PREP_6TAP_H mova [tmpq+32*1], m0 add tmpq, 32*2 sub hd, 2 @@ -2229,27 +2867,219 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 .h_loop: movu xm0, [srcq+r6+8*0] vinserti128 m0, [srcq+r6+8*1], 1 - PREP_8TAP_H + PREP_6TAP_H mova [tmpq+32*0], m0 movu xm0, [srcq+r6+8*2] vinserti128 m0, [srcq+r6+8*3], 1 - PREP_8TAP_H + PREP_6TAP_H mova [tmpq+32*1], m0 add tmpq, 32*2 add r6, 32 jle .h_loop - add srcq, strideq + add srcq, ssq mov r6, r5 dec hd jg .h_loop RET +.hv: + WIN64_SPILL_XMM 14, 16 + cmp wd, 4 + jne .hv_w8 +.hv_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd m7, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + mova m6, [subpel_h_shuf4] + vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep%+SUFFIX] + mov nsq, ssq + pmovzxbd m13, [deint_shuf4] + neg nsq + vpbroadcastd m8, [pw_8192] + vpbroadcastd m9, [pd_32] + punpcklbw m0, m0 + vpbroadcastq m2, [srcq+nsq*2] + psraw m0, 8 ; sign-extend + vpbroadcastq m4, [srcq+nsq*1] + pshufd m10, m0, q0000 + vpbroadcastq m1, [srcq+ssq*0] + pshufd m11, m0, q1111 + vpbroadcastq m3, [srcq+ssq*1] + pshufd m12, m0, q2222 + vpbroadcastq m0, [srcq+ssq*2] + vpblendd m2, m4, 0xcc ; 0 1 + vpblendd m1, m3, 0xcc ; 2 3 + pshufb m2, m6 + pshufb m1, m6 + pshufb m0, m6 + pmaddubsw m2, m7 + pmaddubsw 
m1, m7 + pmaddubsw m0, m7 + phaddw m2, m1 ; 0 1 2 3 + phaddw m0, m0 ; 4 + pmulhrsw m2, m8 + pmulhrsw m0, m8 + palignr m0, m2, 4 + punpcklwd m1, m2, m0 ; 01 12 + punpckhwd m2, m0 ; 23 34 +.hv_w4_loop: + pmaddwd m4, m10, m1 ; a0 b0 + lea srcq, [srcq+ssq*4] + pmaddwd m5, m2, m10 ; c0 d0 + vpbroadcastq m1, [srcq+nsq*1] + pmaddwd m2, m11 ; a1 b1 + vpbroadcastq m3, [srcq+ssq*0] + paddd m4, m2 + vpbroadcastq m2, [srcq+ssq*1] + vpblendd m1, m3, 0xcc ; 5 6 + vpbroadcastq m3, [srcq+ssq*2] + vpblendd m2, m3, 0xcc ; 7 8 + pshufb m1, m6 + pshufb m2, m6 + pmaddubsw m1, m7 + pmaddubsw m2, m7 + phaddw m1, m2 ; 5 6 7 8 + pmulhrsw m1, m8 + paddd m5, m9 + paddd m4, m9 + palignr m2, m1, m0, 12 + mova m0, m1 + punpcklwd m1, m2, m0 ; 45 56 + punpckhwd m2, m0 ; 67 78 + pmaddwd m3, m11, m1 ; c1 d1 + paddd m5, m3 + pmaddwd m3, m12, m1 ; a2 b2 + paddd m4, m3 + pmaddwd m3, m12, m2 ; c2 d2 + paddd m5, m3 + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + vpermd m4, m13, m4 + mova [tmpq], m4 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + lea mxq, [r7+mxq*8+subpel_filters+1-prep_avx2] + WIN64_PUSH_XMM 16 + vpbroadcastw m10, [mxq+0] + vpbroadcastw m11, [mxq+2] + vpbroadcastw m12, [mxq+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep_avx2] + lea r7, [ssq*2+2] + vbroadcasti128 m8, [z_filter_s+ 6] + punpcklbw m0, m0 + vbroadcasti128 m9, [z_filter_s+10] + psraw m0, 8 ; sign-extend + lea r6d, [wq*8-64] + pshufd m13, m0, q0000 + sub srcq, r7 + pshufd m14, m0, q1111 + lea r6d, [hq+r6*4] + pshufd m15, m0, q2222 +.hv_w8_loop0: + vbroadcasti128 m7, [z_filter_s+2] + movu xm3, [srcq+ssq*0] + lea r5, [srcq+ssq*2] + movu xm4, [srcq+ssq*1] + vbroadcasti128 m0, [r5+ssq*0] + mov r7, tmpq + vinserti128 m4, [r5+ssq*1], 1 ; 1 3 + lea r5, [r5+ssq*2] + vpblendd m3, m0, 0xf0 ; 0 2 + vinserti128 m0, [r5+ssq*0], 1 ; 2 4 + vpbroadcastd m5, [pw_8192] + HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9 + HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9 + 
HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m0, m0, q3120 + pmulhrsw m3, m5 + pmulhrsw m4, m5 + pmulhrsw m0, m5 + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w8_loop: + movu xm7, [r5+ssq*1] + lea r5, [r5+ssq*2] + vinserti128 m7, [r5+ssq*0], 1 ; 5 6 + pmaddwd m5, m13, m1 ; a0 + mova m1, m3 + pmaddwd m6, m13, m2 ; b0 + mova m2, m4 + pmaddwd m3, m14 ; a1 + pmaddwd m4, m14 ; b1 + paddd m5, m3 + vbroadcasti128 m3, [z_filter_s+2] + paddd m6, m4 + HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9 + vpbroadcastd m3, [pw_8192] + vpbroadcastd m4, [pd_32] + pmulhrsw m7, m3 + paddd m5, m4 + paddd m6, m4 + mova m4, m0 + vpermq m0, m7, q3120 + shufpd m4, m0, 0x05 + punpcklwd m3, m4, m0 ; 45 + pmaddwd m7, m15, m3 ; a2 + punpckhwd m4, m0 ; 67 + paddd m5, m7 + pmaddwd m7, m15, m4 ; b2 + paddd m6, m7 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + vpermq m5, m5, q3120 + mova [r7+wq*0], xm5 + vextracti128 [r7+wq*2], m5, 1 + lea r7, [r7+wq*4] + sub hd, 2 + jg .hv_w8_loop + add srcq, 8 + add tmpq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w8_loop0 + RET + +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_8bpc +PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc +PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_8bpc +PREP_8TAP_FN sharp, SHARP, SHARP + +cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep%+SUFFIX] + mov wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep .v: - WIN64_SPILL_XMM 16 + WIN64_SPILL_XMM 12, 15 movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. 
shr myd, 16 ; Note that the code is 8-tap only, having cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 cmove myd, mxd ; had a negligible effect on performance. - ; TODO: Would a 6-tap code path be worth it? lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] lea stride3q, [strideq*3] sub srcq, stride3q @@ -2359,72 +3189,154 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_w8_loop RET .v_w16: - add wd, wd - mov r5, srcq - mov r7, tmpq - lea r6d, [hq+wq*8-256] + lea r6d, [wq*2-32] + WIN64_PUSH_XMM 15 + lea r6d, [hq+r6*8] .v_w16_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m0, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*0] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m1, [srcq+strideq*0] - vbroadcasti128 m2, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m3, [srcq+strideq*0] - shufpd m4, m4, m0, 0x0c - shufpd m5, m5, m1, 0x0c + lea r5, [srcq+strideq*2] + vbroadcasti128 m0, [r5+strideq*1] + vbroadcasti128 m6, [r5+strideq*0] + lea r5, [r5+strideq*2] + vbroadcasti128 m1, [r5+strideq*0] + vbroadcasti128 m2, [r5+strideq*1] + lea r5, [r5+strideq*2] + vbroadcasti128 m3, [r5+strideq*0] + mov r7, tmpq + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 punpckhbw m4, m5 ; 34 - shufpd m6, m6, m2, 0x0c + shufpd m6, m2, 0x0c punpcklbw m2, m5, m6 ; 12 punpckhbw m5, m6 ; 45 - shufpd m0, m0, m3, 0x0c + shufpd m0, m3, 0x0c punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: - vbroadcasti128 m12, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m13, [srcq+strideq*0] - pmaddubsw m14, m1, m8 ; a0 - pmaddubsw m15, m2, m8 ; b0 + vbroadcasti128 m12, [r5+strideq*1] + lea r5, [r5+strideq*2] + pmaddubsw m13, m1, m8 ; a0 + pmaddubsw m14, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 - paddw m14, m3 - paddw m15, m4 + paddw m13, m3 + paddw m14, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, 
m10 ; a2 pmaddubsw m6, m10 ; b2 - paddw m14, m5 - paddw m15, m6 + paddw m13, m5 + vbroadcasti128 m5, [r5+strideq*0] + paddw m14, m6 shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c + shufpd m0, m12, m5, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 - pmaddubsw m13, m6, m11 ; b3 + paddw m13, m12 + pmaddubsw m12, m6, m11 ; b3 paddw m14, m12 - paddw m15, m13 + pmulhrsw m13, m7 pmulhrsw m14, m7 - pmulhrsw m15, m7 - mova [tmpq+wq*0], m14 - mova [tmpq+wq*1], m15 - lea tmpq, [tmpq+wq*2] + mova [r7+wq*0], m13 + mova [r7+wq*2], m14 + lea r7, [r7+wq*4] sub hd, 2 jg .v_w16_loop - add r5, 16 - add r7, 32 + add srcq, 16 + add tmpq, 32 movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET +.h: +.h_w4: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pw_8192] + cmp wd, 4 + je mangle(private_prefix %+ _prep_6tap_8bpc_avx2).h_w4 + WIN64_SPILL_XMM 10 + vbroadcasti128 m5, [subpel_h_shufA] + tzcnt wd, wd + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] + vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] + add wq, r7 + jmp wq +.h_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] +%macro PREP_8TAP_H 0 + pshufb m1, m0, m5 + pshufb m2, m0, m6 + pshufb m3, m0, m7 + pmaddubsw m1, m8 + pmaddubsw m0, m2, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + paddw m1, m2 + paddw m0, m3 + phaddw m0, m1, m0 + pmulhrsw m0, m4 +%endmacro + PREP_8TAP_H + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + PREP_8TAP_H + mova [tmpq+32*0], m0 + movu xm0, [srcq+strideq*1+8*0] + vinserti128 m0, [srcq+strideq*1+8*1], 1 + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + mova [tmpq+32*1], m0 + add tmpq, 32*2 + sub hd, 2 + jg .h_w16 + RET +.h_w32: 
+ xor r6d, r6d + jmp .h_start +.h_w64: + mov r6, -32*1 + jmp .h_start +.h_w128: + mov r6, -32*3 +.h_start: + sub srcq, r6 + mov r5, r6 +.h_loop: + movu xm0, [srcq+r6+8*0] + vinserti128 m0, [srcq+r6+8*1], 1 + PREP_8TAP_H + mova [tmpq+32*0], m0 + movu xm0, [srcq+r6+8*2] + vinserti128 m0, [srcq+r6+8*3], 1 + PREP_8TAP_H + mova [tmpq+32*1], m0 + add tmpq, 32*2 + add r6, 32 + jle .h_loop + add srcq, strideq + mov r6, r5 + dec hd + jg .h_loop + RET .hv: WIN64_SPILL_XMM 16 cmp wd, 4 @@ -2542,28 +3454,27 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 RET .hv_w8: lea r6d, [wq*8-64] - mov r5, srcq - mov r7, tmpq lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+strideq*0] + lea r5, [srcq+strideq*2] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] + mov r7, tmpq vbroadcasti128 m9, [subpel_h_shufC] - movu xm6, [srcq+strideq*0] - vbroadcasti128 m0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpblendd m4, m0, 0xf0 ; 0 3 - vinserti128 m5, [srcq+strideq*0], 1 ; 1 4 - vinserti128 m6, [srcq+strideq*1], 1 ; 2 5 - lea srcq, [srcq+strideq*2] - vinserti128 m0, [srcq+strideq*0], 1 ; 3 6 - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 - HV_H_W8 m0, m1, m2, m3, m7, m8, m9 + movu xm6, [r5+strideq*0] + vbroadcasti128 m0, [r5+strideq*1] + lea r5, [r5+strideq*2] + vpblendd m4, m0, 0xf0 ; 0 3 + vinserti128 m5, [r5+strideq*0], 1 ; 1 4 + vinserti128 m6, [r5+strideq*1], 1 ; 2 5 + lea r5, [r5+strideq*2] + vinserti128 m0, [r5+strideq*0], 1 ; 3 6 + HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9 + HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 @@ -2580,10 +3491,10 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 
.hv_w8_loop: - vextracti128 [tmpq], m0, 1 ; not enough registers - movu xm0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vinserti128 m0, [srcq+strideq*0], 1 ; 7 8 + vextracti128 [r7], m0, 1 ; not enough registers + movu xm0, [r5+strideq*1] + lea r5, [r5+strideq*2] + vinserti128 m0, [r5+strideq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 @@ -2601,15 +3512,15 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] - HV_H_W8 m0, m5, m6, m7, m5, m6, m7 + HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_32] - vbroadcasti128 m6, [tmpq] + vbroadcasti128 m6, [r7] pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 - vpermq m7, m0, q3120 ; 7 8 - shufpd m6, m6, m7, 0x04 ; 6 7 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 @@ -2620,16 +3531,14 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 psrad m7, 6 packssdw m8, m7 vpermq m7, m8, q3120 - mova [tmpq+wq*0], xm7 - vextracti128 [tmpq+wq*2], m7, 1 - lea tmpq, [tmpq+wq*4] + mova [r7+wq*0], xm7 + vextracti128 [r7+wq*2], m7, 1 + lea r7, [r7+wq*4] sub hd, 2 jg .hv_w8_loop - add r5, 8 - add r7, 16 + add srcq, 8 + add tmpq, 16 movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET @@ -4008,14 +4917,14 @@ DECLARE_REG_TMP 6, 8 %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN put -PUT_8TAP_SCALED_FN sharp, SHARP, SHARP -PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, 
put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_8bpc PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put @@ -4026,14 +4935,14 @@ DECLARE_REG_TMP 6, 7 %endif BILIN_SCALED_FN prep -PREP_8TAP_SCALED_FN sharp, SHARP, SHARP -PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_8bpc PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED prep diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm index f9043f1ad3..50e670ec25 100644 --- a/third_party/dav1d/src/x86/mc_avx512.asm +++ b/third_party/dav1d/src/x86/mc_avx512.asm @@ -89,55 +89,47 @@ wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 
29, 3 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 -bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 - db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 - db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39 - db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47 -bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 - db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 - db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23 - db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31 -bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 - db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 - db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87 - db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39 -bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 - db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 - db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23 - db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31 -bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7 - db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15 - db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 - db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 -bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7 -spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 +bilin_h_perm16: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 + db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 + db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40 + db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48 +bilin_h_perm32: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 
6, 6, 7, 7, 8 + db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 + db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24 + db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32 +bilin_v_perm8: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 + db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 + db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39 + db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71 +bilin_v_perm16: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 + db 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 + db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71 + db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79 +bilin_v_perm32: db 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71 + db 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79 + db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 + db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 +bilin_v_perm64: dd 0, 0, 4, 8, 1, 1, 5, 9, 2, 2, 6, 10, 3, 3, 7, 11 +spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 -spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 - db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42 - db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50 -spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 - db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 - db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54 -spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 
+spel_h_perm32: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 -spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 - db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26 - db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34 -spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 - db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 - db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 -spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 +spel_v_perm8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 + db 8, 16, 9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23 + db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 + db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39 +spel_v_perm16a: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 +spel_v_perm16b: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 + db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 + db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 + db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 @@ -154,34 +146,20 @@ spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 2 db 8, 
9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 -spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55 - db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63 - db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71 - db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79 -spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 +spel_hv_perm8b: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29 db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45 db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61 -spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 - db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36 db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38 -spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 + db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46 - db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52 - db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54 -spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40 - db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42 - db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48 - db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50 -spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 +spel_hv_perm16b:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16 db 10, 11, 12, 13, 11, 12, 13, 
14, 14, 15, 16, 17, 15, 16, 17, 18 -spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12 - db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14 - db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20 - db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22 +spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 + db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 @@ -189,15 +167,14 @@ subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 1 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 -bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 -bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 +bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +bilin_v_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 resize_permC: dd 0, 4, 8, 12 +resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 wm_420_perm64: dq 0xfedcba9876543210 @@ -205,6 +182,8 @@ wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040 pb_8x0_8x8: times 8 db 0 times 8 db 8 +pb_4: times 4 db 4 +pb_32: times 4 db 32 pb_127: times 4 db 127 pw_m128 
times 2 dw -128 pw_m256: times 2 dw -256 @@ -216,7 +195,6 @@ pd_32: dd 32 pd_34: dd 34 pd_63: dd 63 pd_512: dd 512 -pd_32768: dd 32768 %define pb_m64 (wm_sign+4) %define pb_64 (wm_sign+8) @@ -289,8 +267,10 @@ BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx512icl, 3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 @@ -401,9 +381,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 - imul mxyd, 0xff01 - vbroadcasti128 m4, [bilin_h_shuf8] - add mxyd, 16 << 8 + imul mxyd, 255 + vbroadcasti128 m4, [bilin_h_perm16] + add mxyd, 16 vpbroadcastw m5, mxyd mov mxyd, r7m ; my test mxyd, mxyd @@ -526,9 +506,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] - imul mxyd, 0xff01 + imul mxyd, 255 vpbroadcastd m5, [pw_2048] - add mxyd, 16 << 8 + add mxyd, 16 add wq, r7 vpbroadcastw m4, mxyd jmp wq @@ -539,7 +519,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy lea srcq, [srcq+ssq*2] pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1 pshuflw xmm1, xmm1, q2301 ; 1 0 - punpcklbw xmm1, xmm0, xmm1 + punpcklbw xmm1, xmm0 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 packuswb xmm1, xmm1 @@ -552,11 +532,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w4: movd xmm0, 
[srcq+ssq*0] .v_w4_loop: - vpbroadcastd xmm1, [srcq+ssq*1] + vpbroadcastd xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1 + vpblendd xmm1, xmm2, xmm0, 0x01 ; 0 1 vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm1, xmm0, 0x02 ; 1 2 + vpblendd xmm2, xmm0, 0x02 ; 1 2 punpcklbw xmm1, xmm2 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 @@ -570,11 +550,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w8: movq xmm0, [srcq+ssq*0] .v_w8_loop: - movq xmm3, [srcq+ssq*1] + movq xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw xmm1, xmm3, xmm0 + punpcklbw xmm1, xmm0, xmm2 movq xmm0, [srcq+ssq*0] - punpcklbw xmm2, xmm0, xmm3 + punpcklbw xmm2, xmm0 pmaddubsw xmm1, xm4 pmaddubsw xmm2, xm4 pmulhrsw xmm1, xm5 @@ -589,11 +569,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w16: movu xmm0, [srcq+ssq*0] .v_w16_loop: - vbroadcasti128 ymm2, [srcq+ssq*1] + vbroadcasti128 ymm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1 + vpblendd ymm2, ymm3, ymm0, 0x0f ; 0 1 vbroadcasti128 ymm0, [srcq+ssq*0] - vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2 + vpblendd ymm3, ymm0, 0xf0 ; 1 2 punpcklbw ymm1, ymm2, ymm3 punpckhbw ymm2, ymm3 pmaddubsw ymm1, ym4 @@ -612,11 +592,11 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy movu ym0, [srcq+ssq*0] kxnorb k1, k1, k1 .v_w32_loop: - vbroadcasti32x8 m2, [srcq+ssq*1] + vbroadcasti32x8 m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendmd m3{k1}, m2, m0 ; 0 1 + vpblendmd m2{k1}, m3, m0 ; 0 1 vbroadcasti32x8 m0, [srcq+ssq*0] - vpblendmd m2{k1}, m0, m2 ; 1 2 + vpblendmd m3{k1}, m0, m3 ; 1 2 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m4 @@ -635,18 +615,18 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy .v_w64_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw m1, m3, m0 - punpckhbw m6, m3, m0 + punpcklbw m1, m0, m3 + punpckhbw m6, m0, m3 movu m0, [srcq+ssq*0] pmaddubsw m1, m4 pmaddubsw m6, m4 - punpcklbw m2, m0, m3 - 
punpckhbw m7, m0, m3 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 pmaddubsw m2, m4 - pmaddubsw m7, m4 - REPX {pmulhrsw x, m5}, m1, m6, m2, m7 + pmaddubsw m3, m4 + REPX {pmulhrsw x, m5}, m1, m6, m2, m3 packuswb m1, m6 - packuswb m2, m7 + packuswb m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] @@ -660,13 +640,13 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy add srcq, ssq movu m2, [srcq+64*0] movu m3, [srcq+64*1] - punpcklbw m6, m2, m0 + punpcklbw m6, m0, m2 pmaddubsw m6, m4 - punpckhbw m0, m2, m0 + punpckhbw m0, m2 pmaddubsw m0, m4 - punpcklbw m7, m3, m1 + punpcklbw m7, m1, m3 pmaddubsw m7, m4 - punpckhbw m1, m3, m1 + punpckhbw m1, m3 pmaddubsw m1, m4 REPX {pmulhrsw x, m5}, m6, m0, m7, m1 packuswb m6, m0 @@ -1005,8 +985,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] - imul mxyd, 0xff01 - add mxyd, 16 << 8 + imul mxyd, 255 + add mxyd, 16 vpbroadcastw m5, mxyd mov mxyd, r6m ; my test mxyd, mxyd @@ -1032,7 +1012,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .h_w4_loop RET .h_w8: - vbroadcasti32x4 m4, [bilin_h_shuf8] + vbroadcasti32x4 m4, [bilin_h_perm16] .h_w8_loop: movu xmm0, [srcq+strideq*0] vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1 @@ -1127,8 +1107,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .v: WIN64_SPILL_XMM 7 movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] - imul mxyd, 0xff01 - add mxyd, 16 << 8 + imul mxyd, 255 + add mxyd, 16 add wq, t2 lea stride3q, [strideq*3] vpbroadcastw m6, mxyd @@ -1218,11 +1198,11 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .v_w64_loop: vpermq m1, m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - punpcklbw m4, m1, m0 - punpckhbw m2, m1, m0 + punpcklbw m4, m0, m1 + punpckhbw m2, m0, m1 vpermq m0, m5, [srcq+strideq*0] - punpcklbw m3, m0, m1 - punpckhbw m1, m0, m1 + punpcklbw m3, 
m1, m0 + punpckhbw m1, m0 pmaddubsw m4, m6 pmaddubsw m2, m6 pmaddubsw m3, m6 @@ -1243,28 +1223,28 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 vpermq m2, m5, [srcq+strideq*1+ 0] vpermq m3, m5, [srcq+strideq*1+64] lea srcq, [srcq+strideq*2] - punpcklbw m4, m2, m0 - punpckhbw m0, m2, m0 + punpcklbw m4, m0, m2 + punpckhbw m0, m2 pmaddubsw m4, m6 pmaddubsw m0, m6 mova [tmpq+64*0], m4 mova [tmpq+64*1], m0 - punpcklbw m4, m3, m1 - punpckhbw m1, m3, m1 + punpcklbw m4, m1, m3 + punpckhbw m1, m3 pmaddubsw m4, m6 pmaddubsw m1, m6 mova [tmpq+64*2], m4 mova [tmpq+64*3], m1 vpermq m0, m5, [srcq+strideq*0+ 0] vpermq m1, m5, [srcq+strideq*0+64] - punpcklbw m4, m0, m2 - punpckhbw m2, m0, m2 + punpcklbw m4, m2, m0 + punpckhbw m2, m0 pmaddubsw m4, m6 pmaddubsw m2, m6 mova [tmpq+64*4], m4 mova [tmpq+64*5], m2 - punpcklbw m4, m1, m3 - punpckhbw m3, m1, m3 + punpcklbw m4, m3, m1 + punpckhbw m3, m1 pmaddubsw m4, m6 pmaddubsw m3, m6 mova [tmpq+64*6], m4 @@ -1308,7 +1288,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .hv_w4_loop RET .hv_w8: - vbroadcasti32x4 m4, [bilin_h_shuf8] + vbroadcasti32x4 m4, [bilin_h_perm16] vbroadcasti32x4 m0, [srcq+strideq*0] pshufb m0, m4 pmaddubsw m0, m5 @@ -1448,7 +1428,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro FN 4 ; fn, type, type_h, type_v +%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 @@ -1456,8 +1436,8 @@ cglobal %1_%2_8bpc %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1489,24 +1469,22 @@ DECLARE_REG_TMP 4, 5 DECLARE_REG_TMP 7, 8 %endif +; Due to the use of vpdpbusd (which does 4 pixels per instruction) in +; the 
horizontal filter, 6-tap is only used for the vertical filter. %define PUT_8TAP_FN FN put_8tap, - -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_6tap_8bpc +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc PUT_8TAP_FN regular, REGULAR, REGULAR -cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 +cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns %define base r8-put_avx512icl imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v lea r8, [put_avx512icl] movsxd wq, wm movifnidn hd, hm @@ -1514,6 +1492,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jnz .h test myd, 0xf00 jnz .v +.put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 @@ -1523,474 +1502,273 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pop r8 %endif jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) - WIN64_SPILL_XMM 11 - cmp wd, 4 - jl .h_w2 - vbroadcasti128 m6, [subpel_h_shufA] - je .h_w4 - tzcnt wd, wd - vbroadcasti128 m7, [subpel_h_shufB] - vbroadcasti128 m8, [subpel_h_shufC] - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] - vpbroadcastd m9, [base+mxq*8+subpel_filters+0] - vpbroadcastd m10, [base+mxq*8+subpel_filters+4] - add wq, r8 - jmp wq -.h_w2: - movzx mxd, mxb - dec srcq - mova xmm4, [subpel_h_shuf4] 
- vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] -.h_w2_loop: - movq xmm0, [srcq+ssq*0] - movhps xmm0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb xmm0, xmm4 - mova xmm1, xm5 - vpdpbusd xmm1, xmm0, xmm3 - packssdw xmm0, xmm1, xmm1 - psraw xmm0, 6 - packuswb xmm0, xm0 - pextrw [dstq+dsq*0], xmm0, 0 - pextrw [dstq+dsq*1], xmm0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w2_loop - RET -.h_w4: - movzx mxd, mxb - dec srcq - vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] -.h_w4_loop: - movq xmm0, [srcq+ssq*0] - movq xmm1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb xmm0, xm6 - pshufb xmm1, xm6 - mova xmm2, xm5 - vpdpbusd xmm2, xmm0, xmm3 - mova xmm0, xm5 - vpdpbusd xmm0, xmm1, xmm3 - packssdw xmm0, xmm2, xmm0 - psraw xmm0, 6 - packuswb xmm0, xmm0 - movd [dstq+dsq*0], xmm0 - pextrd [dstq+dsq*1], xmm0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w4_loop - RET -.h_w8: - movu xm0, [srcq+ssq*0] - vinserti32x4 ym0, [srcq+ssq*1], 1 - lea srcq, [srcq+ssq*2] - WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 - vpmovuswb xm0, ym0 - movq [dstq+dsq*0], xm0 - movhps [dstq+dsq*1], xm0 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w8 - RET -.h_w16: - mova m6, [spel_h_perm16a] - mova m7, [spel_h_perm16b] - mova m8, [spel_h_perm16c] -.h_w16_loop: - movu ym0, [srcq+ssq*0] - vinserti32x8 m0, [srcq+ssq*1], 1 - lea srcq, [srcq+ssq*2] - PUT_8TAP_H 0, 1, 2, 3, 1 - vpmovuswb ym0, m0 - mova [dstq+dsq*0], xm0 - vextracti128 [dstq+dsq*1], ym0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w16_loop - RET -.h_w32: - movu ym0, [srcq+ssq*0+8*0] - vinserti32x8 m0, [srcq+ssq*1+8*0], 1 - movu ym1, [srcq+ssq*0+8*1] - vinserti32x8 m1, [srcq+ssq*1+8*1], 1 - lea srcq, [srcq+ssq*2] - PUT_8TAP_H 0, 2, 3, 4 - PUT_8TAP_H 1, 4, 3, 2 - packuswb m0, m1 - mova [dstq+dsq*0], ym0 - vextracti32x8 [dstq+dsq*1], m0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w32 - RET -.h_w64: - movu m0, [srcq+8*0] - movu m1, [srcq+8*1] - add srcq, ssq - PUT_8TAP_H 0, 2, 3, 4 - PUT_8TAP_H 1, 4, 3, 2 - packuswb m0, m1 - mova [dstq], m0 
- add dstq, dsq - dec hd - jg .h_w64 - RET -.h_w128: - movu m0, [srcq+8*0] - movu m2, [srcq+8*1] - movu m1, [srcq+8*8] - movu m3, [srcq+8*9] - add srcq, ssq - PUT_8TAP_H 0, 4, 11, 12 - PUT_8TAP_H 2, 12, 11, 4 - PUT_8TAP_H 1, 4, 11, 12 - PUT_8TAP_H 3, 12, 11, 4 - packuswb m0, m2 - packuswb m1, m3 - mova [dstq+64*0], m0 - mova [dstq+64*1], m1 - add dstq, dsq - dec hd - jg .h_w128 - RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd tzcnt r6d, wd - movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] - vpbroadcastd m7, [pw_512] - lea myq, [base+subpel_filters+myq*8] - vpbroadcastw m8, [myq+0] - vpbroadcastw m9, [myq+2] - vpbroadcastw m10, [myq+4] - vpbroadcastw m11, [myq+6] + movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)] + vpbroadcastd m6, [pw_512] + lea myq, [base+subpel_filters+1+myq*8] + vpbroadcastw m7, [myq+0] add r6, r8 - lea ss3q, [ssq*3] - sub srcq, ss3q + vpbroadcastw m8, [myq+2] + mov nsq, ssq + vpbroadcastw m9, [myq+4] + neg nsq jmp r6 .v_w2: - movd xmm2, [srcq+ssq*0] - pinsrw xmm2, [srcq+ssq*1], 2 - pinsrw xmm2, [srcq+ssq*2], 4 - add srcq, ss3q - pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3 - movd xmm3, [srcq+ssq*1] - vpbroadcastd xmm1, [srcq+ssq*2] - add srcq, ss3q + movd xmm2, [srcq+nsq*2] + pinsrw xmm2, [srcq+nsq*1], 2 + pinsrw xmm2, [srcq+ssq*0], 4 + pinsrw xmm2, [srcq+ssq*1], 6 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 - vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 - palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 - punpcklbw xmm3, xmm1 ; 45 56 - punpcklbw xmm1, xmm2, xmm4 ; 01 12 - punpckhbw xmm2, xmm4 ; 23 34 + palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm1, xmm2, xmm3 ; 01 12 + punpckhbw xmm2, xmm3 ; 23 34 .v_w2_loop: - pmaddubsw xmm5, xmm1, xm8 ; a0 b0 - mova xmm1, xmm2 - pmaddubsw xmm2, xm9 ; a1 b1 - paddw xmm5, xmm2 - mova xmm2, xmm3 - pmaddubsw xmm3, xm10 ; a2 b2 - paddw xmm5, xmm3 vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + 
pmaddubsw xmm3, xmm1, xm7 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm8 ; a1 b1 + paddw xmm3, xmm2 + vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5 vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 - punpcklbw xmm3, xmm4 ; 67 78 - pmaddubsw xmm4, xmm3, xm11 ; a3 b3 - paddw xmm5, xmm4 - pmulhrsw xmm5, xm7 - packuswb xmm5, xmm5 - pextrw [dstq+dsq*0], xmm5, 0 - pextrw [dstq+dsq*1], xmm5, 2 + vpblendd xmm4, xmm0, 0x02 ; 5 6 + punpcklbw xmm2, xmm4 ; 67 78 + pmaddubsw xmm4, xmm2, xm9 ; a3 b3 + paddw xmm3, xmm4 + pmulhrsw xmm3, xm6 + packuswb xmm3, xmm3 + pextrw [dstq+dsq*0], xmm3, 0 + pextrw [dstq+dsq*1], xmm3, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: - movd xmm2, [srcq+ssq*0] - pinsrd xmm2, [srcq+ssq*1], 1 - pinsrd xmm2, [srcq+ssq*2], 2 - add srcq, ss3q - pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 - movd xmm3, [srcq+ssq*1] - vpbroadcastd xmm1, [srcq+ssq*2] - add srcq, ss3q + movd xmm2, [srcq+nsq*2] + pinsrd xmm2, [srcq+nsq*1], 1 + pinsrd xmm2, [srcq+ssq*0], 2 + pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 - vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 - palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 - punpcklbw xmm3, xmm1 ; 45 56 - punpcklbw xmm1, xmm2, xmm4 ; 01 12 - punpckhbw xmm2, xmm4 ; 23 34 + palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm1, xmm2, xmm3 ; 01 12 + punpckhbw xmm2, xmm3 ; 23 34 .v_w4_loop: vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + pmaddubsw xmm3, xmm1, xm7 ; a0 b0 mova xmm1, xmm2 - pmaddubsw xmm2, xm9 ; a1 b1 - paddw xmm5, xmm2 - mova xmm2, xmm3 - pmaddubsw xmm3, xm10 ; a2 b2 - paddw xmm5, xmm3 - vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + pmaddubsw xmm2, xm8 ; a1 b1 + paddw xmm3, xmm2 + vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5 vpbroadcastd xmm0, [srcq+ssq*0] - vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 - punpcklbw xmm3, xmm4 ; 67 78 - pmaddubsw xmm4, xmm3, xm11 ; a3 b3 - paddw xmm5, xmm4 - pmulhrsw 
xmm5, xm7 - packuswb xmm5, xmm5 - movd [dstq+dsq*0], xmm5 - pextrd [dstq+dsq*1], xmm5, 1 + vpblendd xmm4, xmm0, 0x02 ; 5 6 + punpcklbw xmm2, xmm4 ; 45 56 + pmaddubsw xmm4, xmm2, xm9 ; a2 b2 + paddw xmm3, xmm4 + pmulhrsw xmm3, xm6 + packuswb xmm3, xmm3 + movd [dstq+dsq*0], xmm3 + pextrd [dstq+dsq*1], xmm3, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: - movq xmm1, [srcq+ssq*0] - vpbroadcastq ymm0, [srcq+ssq*1] - vpbroadcastq ymm2, [srcq+ssq*2] - add srcq, ss3q - vpbroadcastq ymm5, [srcq+ssq*0] - vpbroadcastq ymm3, [srcq+ssq*1] - vpbroadcastq ymm4, [srcq+ssq*2] - add srcq, ss3q - vpblendd ymm1, ymm0, 0x30 - vpblendd ymm0, ymm2, 0x30 - punpcklbw ymm1, ymm0 ; 01 12 + movq xmm1, [srcq+nsq*2] + vpbroadcastq ymm3, [srcq+nsq*1] + vpbroadcastq ymm2, [srcq+ssq*0] + vpbroadcastq ymm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] vpbroadcastq ymm0, [srcq+ssq*0] - vpblendd ymm2, ymm5, 0x30 - vpblendd ymm5, ymm3, 0x30 - punpcklbw ymm2, ymm5 ; 23 34 - vpblendd ymm3, ymm4, 0x30 + vpblendd ymm1, ymm3, 0x30 + vpblendd ymm3, ymm2, 0x30 + punpcklbw ymm1, ymm3 ; 01 12 + vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm0, 0x30 - punpcklbw ymm3, ymm4 ; 45 56 + punpcklbw ymm2, ymm4 ; 23 34 .v_w8_loop: vpbroadcastq ymm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pmaddubsw ymm5, ymm1, ym8 ; a0 b0 + pmaddubsw ymm3, ymm1, ym7 ; a0 b0 mova ymm1, ymm2 - pmaddubsw ymm2, ym9 ; a1 b1 - paddw ymm5, ymm2 - mova ymm2, ymm3 - pmaddubsw ymm3, ym10 ; a2 b2 - paddw ymm5, ymm3 - vpblendd ymm3, ymm0, ymm4, 0x30 + pmaddubsw ymm2, ym8 ; a1 b1 + paddw ymm3, ymm2 + vpblendd ymm2, ymm0, ymm4, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] - vpblendd ymm4, ymm4, ymm0, 0x30 - punpcklbw ymm3, ymm4 ; 67 78 - pmaddubsw ymm4, ymm3, ym11 ; a3 b3 - paddw ymm5, ymm4 - pmulhrsw ymm5, ym7 - vextracti128 xmm4, ymm5, 1 - packuswb xmm5, xmm4 - movq [dstq+dsq*0], xmm5 - movhps [dstq+dsq*1], xmm5 + vpblendd ymm4, ymm0, 0x30 + punpcklbw ymm2, ymm4 ; 45 56 + pmaddubsw ymm4, ymm2, ym9 ; a2 b2 + paddw ymm3, ymm4 + pmulhrsw ymm3, ym6 + 
vextracti128 xmm4, ymm3, 1 + packuswb xmm3, xmm4 + movq [dstq+dsq*0], xmm3 + movhps [dstq+dsq*1], xmm3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop vzeroupper RET .v_w16: - mova m12, [spel_v_perm16] - vbroadcasti32x4 m1, [srcq+ssq*0] - vbroadcasti32x4 ym4, [srcq+ssq*1] + mova m5, [spel_v_perm16a] + vbroadcasti32x4 m1, [srcq+nsq*2] + vbroadcasti32x4 ym3, [srcq+nsq*1] mov r6d, 0x0f - vbroadcasti32x4 m2, [srcq+ssq*2] - add srcq, ss3q - vbroadcasti32x4 ym5, [srcq+ssq*0] + vbroadcasti32x4 m2, [srcq+ssq*0] kmovb k1, r6d - vbroadcasti32x4 m3, [srcq+ssq*1] - vbroadcasti32x4 ym6, [srcq+ssq*2] - add srcq, ss3q + vbroadcasti32x4 ym4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] vbroadcasti32x4 m0, [srcq+ssq*0] - vshufpd m1{k1}, m4, m2, 0xcc - vshufpd m2{k1}, m5, m3, 0xcc - vshufpd m3{k1}, m6, m0, 0xcc - vpermb m1, m12, m1 ; 01 12 - vpermb m2, m12, m2 ; 23 34 - vpermb m3, m12, m3 ; 45 56 + vshufpd m1{k1}, m3, m2, 0xcc + vshufpd m2{k1}, m4, m0, 0xcc + vpermb m1, m5, m1 ; 01 12 + vpermb m2, m5, m2 ; 23 34 .v_w16_loop: - pmaddubsw m4, m1, m8 ; a0 b0 - mova m1, m2 - pmaddubsw m5, m2, m9 ; a1 b1 - mova m2, m3 - pmaddubsw m6, m3, m10 ; a2 b2 - mova m3, m0 - paddw m4, m5 - vbroadcasti32x4 ym5, [srcq+ssq*1] + vbroadcasti32x4 ym4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] + pmaddubsw m3, m1, m7 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m8 ; a1 b1 + paddw m3, m2 + mova m2, m0 vbroadcasti32x4 m0, [srcq+ssq*0] - vshufpd m3{k1}, m5, m0, 0xcc - vpermb m3, m12, m3 ; 67 78 - pmaddubsw m5, m3, m11 ; a3 b3 - paddw m4, m6 - paddw m4, m5 - pmulhrsw m4, m7 - vextracti32x8 ym5, m4, 1 - packuswb ym4, ym5 - mova [dstq+dsq*0], xm4 - vextracti32x4 [dstq+dsq*1], ym4, 1 + vshufpd m2{k1}, m4, m0, 0xcc + vpermb m2, m5, m2 ; 45 56 + pmaddubsw m4, m2, m9 ; a2 b2 + paddw m3, m4 + pmulhrsw m3, m6 + vextracti32x8 ym4, m3, 1 + packuswb ym3, ym4 + mova [dstq+dsq*0], xm3 + vextracti32x4 [dstq+dsq*1], ym3, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: - mova m12, [spel_v_perm32] - pmovzxbq m14, [pb_02461357] - 
vpshrdw m13, m12, m12, 8 - movu ym0, [srcq+ssq*0] + mova m10, [spel_v_perm32] + pmovzxbq m5, [pb_02461357] + vpshrdw m11, m10, m10, 8 + movu ym0, [srcq+nsq*2] + vinserti32x8 m0, [srcq+nsq*1], 1 + vpermb m1, m10, m0 ; 01 + vinserti32x8 m0, [srcq+ssq*0], 0 + vpermb m2, m11, m0 ; 12 vinserti32x8 m0, [srcq+ssq*1], 1 - vpermb m1, m12, m0 ; 01 - vinserti32x8 m0, [srcq+ssq*2], 0 - add srcq, ss3q - vpermb m2, m13, m0 ; 12 - vinserti32x8 m0, [srcq+ssq*0], 1 - vpermb m3, m12, m0 ; 23 - vinserti32x8 m0, [srcq+ssq*1], 0 - vpermb m4, m13, m0 ; 34 - vinserti32x8 m0, [srcq+ssq*2], 1 - add srcq, ss3q - vpermb m5, m12, m0 ; 45 + lea srcq, [srcq+ssq*2] + vpermb m3, m10, m0 ; 23 vinserti32x8 m0, [srcq+ssq*0], 0 - vpermb m6, m13, m0 ; 56 + vpermb m4, m11, m0 ; 34 .v_w32_loop: vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] - pmaddubsw m15, m1, m8 + pmaddubsw m12, m1, m7 mova m1, m3 - pmaddubsw m16, m2, m8 + pmaddubsw m13, m2, m7 mova m2, m4 - pmaddubsw m17, m3, m9 - mova m3, m5 - pmaddubsw m18, m4, m9 - mova m4, m6 - pmaddubsw m19, m5, m10 - vpermb m5, m12, m0 ; 67 + pmaddubsw m14, m3, m8 + vpermb m3, m10, m0 ; 45 vinserti32x8 m0, [srcq+ssq*0], 0 - pmaddubsw m20, m6, m10 - vpermb m6, m13, m0 ; 78 - paddw m15, m17 - pmaddubsw m17, m5, m11 - paddw m16, m18 - pmaddubsw m18, m6, m11 - paddw m15, m19 - paddw m16, m20 - paddw m15, m17 - paddw m16, m18 - pmulhrsw m15, m7 - pmulhrsw m16, m7 - packuswb m15, m16 - vpermq m15, m14, m15 - mova [dstq+dsq*0], ym15 - vextracti32x8 [dstq+dsq*1], m15, 1 + pmaddubsw m15, m4, m8 + vpermb m4, m11, m0 ; 56 + paddw m12, m14 + pmaddubsw m14, m3, m9 + paddw m13, m15 + pmaddubsw m15, m4, m9 + paddw m12, m14 + paddw m13, m15 + pmulhrsw m12, m6 + pmulhrsw m13, m6 + packuswb m12, m13 + vpermq m12, m5, m12 + mova [dstq+dsq*0], ym12 + vextracti32x8 [dstq+dsq*1], m12, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop - vzeroupper RET .v_w64: .v_w128: lea r6d, [hq+wq*4-256] - mov r4, srcq - mov r7, dstq .v_loop0: - movu m2, [srcq+ssq*0] - movu m4, 
[srcq+ssq*1] - movu m6, [srcq+ssq*2] - add srcq, ss3q - movu m13, [srcq+ssq*0] - movu m15, [srcq+ssq*1] - movu m17, [srcq+ssq*2] - add srcq, ss3q - movu m0, [srcq+ssq*0] - punpcklbw m1, m2, m4 ; 01l - punpckhbw m2, m4 ; 01h - punpcklbw m3, m4, m6 ; 12l - punpckhbw m4, m6 ; 12h - punpcklbw m5, m6, m13 ; 23l - punpckhbw m6, m13 ; 23h - punpcklbw m12, m13, m15 ; 34l - punpckhbw m13, m15 ; 34h - punpcklbw m14, m15, m17 ; 45l - punpckhbw m15, m17 ; 45h - punpcklbw m16, m17, m0 ; 56l - punpckhbw m17, m0 ; 56h + movu m2, [srcq+nsq*2] + movu m4, [srcq+nsq*1] + lea r4, [srcq+ssq*2] + movu m11, [srcq+ssq*0] + movu m13, [srcq+ssq*1] + mov r7, dstq + movu m0, [r4 +ssq*0] + punpcklbw m1, m2, m4 ; 01l + punpckhbw m2, m4 ; 01h + punpcklbw m3, m4, m11 ; 12l + punpckhbw m4, m11 ; 12h + punpcklbw m10, m11, m13 ; 23l + punpckhbw m11, m13 ; 23h + punpcklbw m12, m13, m0 ; 34l + punpckhbw m13, m0 ; 34h .v_loop: - pmaddubsw m18, m1, m8 ; a0l - mova m1, m5 - pmaddubsw m19, m2, m8 ; a0h - mova m2, m6 - pmaddubsw m20, m3, m8 ; b0l + movu m5, [r4+ssq*1] + pmaddubsw m14, m1, m7 ; a0l + mova m1, m10 + pmaddubsw m10, m8 ; a1l + lea r4, [r4+ssq*2] + pmaddubsw m15, m2, m7 ; a0h + mova m2, m11 + pmaddubsw m11, m8 ; a1h + paddw m14, m10 + punpcklbw m10, m0, m5 ; 45l + paddw m15, m11 + punpckhbw m11, m0, m5 ; 45h + pmaddubsw m0, m10, m9 ; a2l + paddw m14, m0 + pmaddubsw m0, m11, m9 ; a2h + paddw m15, m0 + movu m0, [r4+ssq*0] + pmulhrsw m14, m6 + pmulhrsw m15, m6 + packuswb m14, m15 + pmaddubsw m15, m3, m7 ; b0l mova m3, m12 - pmaddubsw m21, m4, m8 ; b0h + pmaddubsw m12, m8 ; b1l + mova [r7+dsq*0], m14 + pmaddubsw m14, m4, m7 ; b0h mova m4, m13 - pmaddubsw m5, m9 ; a1l - pmaddubsw m6, m9 ; a1h - pmaddubsw m12, m9 ; b1l - pmaddubsw m13, m9 ; b1h - paddw m18, m5 - mova m5, m14 - pmaddubsw m14, m10 ; a2l - paddw m19, m6 - mova m6, m15 - pmaddubsw m15, m10 ; a2h - paddw m20, m12 - mova m12, m16 - pmaddubsw m16, m10 ; b2l - paddw m21, m13 - mova m13, m17 - pmaddubsw m17, m10 ; b2h - paddw m18, m14 - paddw 
m19, m15 - paddw m20, m16 - paddw m21, m17 - movu m17, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - punpcklbw m14, m0, m17 ; 67l - punpckhbw m15, m0, m17 ; 67h - pmaddubsw m16, m14, m11 ; a3l - pmaddubsw m0, m15, m11 ; a3h - paddw m18, m16 - paddw m19, m0 - movu m0, [srcq+ssq*0] - punpcklbw m16, m17, m0 ; 78l - punpckhbw m17, m0 ; 78h - pmulhrsw m18, m7 - pmulhrsw m19, m7 - packuswb m18, m19 - mova [dstq+dsq*0], m18 - pmaddubsw m18, m16, m11 ; b3l - pmaddubsw m19, m17, m11 ; b3h - paddw m18, m20 - paddw m19, m21 - pmulhrsw m18, m7 - pmulhrsw m19, m7 - packuswb m18, m19 - mova [dstq+dsq*1], m18 - lea dstq, [dstq+dsq*2] + pmaddubsw m13, m8 ; b1h + paddw m15, m12 + punpcklbw m12, m5, m0 ; 56l + paddw m14, m13 + punpckhbw m13, m5, m0 ; 56h + pmaddubsw m5, m12, m9 ; b2l + paddw m15, m5 + pmaddubsw m5, m13, m9 ; b2h + paddw m14, m5 + pmulhrsw m15, m6 + pmulhrsw m14, m6 + packuswb m15, m14 + mova [r7+dsq*1], m15 + lea r7, [r7+dsq*2] sub hd, 2 jg .v_loop - add r4, 64 - add r7, 64 + add srcq, 64 + add dstq, 64 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 256 jg .v_loop0 - vzeroupper RET +.h: + test myd, 0xf00 + jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2 .hv: + vpbroadcastd m9, [pd_34] + mova xm10, [spel_hv_end] + pxor xm0, xm0 cmp wd, 4 jg .hv_w8 movzx mxd, mxb @@ -2000,94 +1778,850 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - vpbroadcastd m8, [pd_2] - vpbroadcastq ym0, [base+subpel_filters+myq*8] - lea ss3q, [ssq*3] - vpbroadcastd ym9, [pd_32768] - mov r6, srcq - punpcklbw ym0, ym8, ym0 - sub r6, ss3q + vpbroadcastq ym1, [base+subpel_filters+1+myq*8] + mov nsq, ssq + punpcklbw ym0, ym1 + neg nsq psraw ym0, 2 ; << 6 - mova xm14, [spel_hv_end] - pshufd ym10, ym0, q0000 - pshufd ym11, ym0, q1111 - pshufd ym12, ym0, q2222 - pshufd ym13, ym0, q3333 + pshufd ym11, ym0, q0000 + pshufd ym12, ym0, q1111 + pshufd ym13, ym0, q2222 cmp wd, 4 je .hv_w4 - vbroadcasti128 ym6, [subpel_h_shuf4] - movq xmm2, 
[r6+ssq*0] - movhps xmm2, [r6+ssq*1] - movq xmm0, [r6+ssq*2] - movhps xmm0, [srcq+ssq*0] - vpbroadcastq ymm3, [srcq+ssq*1] - vpbroadcastq ymm4, [srcq+ssq*2] - add srcq, ss3q + vbroadcasti128 ym5, [subpel_h_shuf4] + movq xmm0, [srcq+nsq*2] + movhps xmm0, [srcq+nsq*1] + movq xmm2, [srcq+ssq*0] + movhps xmm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] vpbroadcastq ymm1, [srcq+ssq*0] - vpblendd ymm2, ymm3, 0x30 - vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ - vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 - pshufb ymm2, ym6 - pshufb ymm0, ym6 - mova ymm1, ym8 - vpdpbusd ymm1, ymm2, ym7 - mova ymm2, ym8 + vpblendd ymm0, ymm1, 0x30 + pshufb xmm2, xm5 ; 2 3 + pshufb ymm0, ym5 ; 0 1 4 + mova xmm1, xm9 + vpdpbusd xmm1, xmm2, xm7 + mova ymm2, ym9 vpdpbusd ymm2, ymm0, ym7 - packssdw ymm2, ymm1, ymm2 + packssdw ymm2, ymm1 psraw ymm2, 2 - vextracti128 xmm3, ymm2, 1 - palignr xmm4, xmm3, xmm2, 4 - punpcklwd xmm1, xmm2, xmm4 ; 01 12 - punpckhwd xmm2, xmm4 ; 23 34 - pshufd xmm0, xmm3, q2121 - punpcklwd xmm3, xmm0 ; 45 56 + vextracti128 xmm0, ymm2, 1 + vzeroupper + palignr xmm0, xmm2, 4 + punpcklwd xmm1, xmm2, xmm0 ; 01 12 + punpckhwd xmm2, xmm0 ; 23 34 .hv_w2_loop: - movq xmm4, [srcq+ssq*1] + movq xmm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - movhps xmm4, [srcq+ssq*0] - mova xmm5, xm9 - vpdpwssd xmm5, xmm1, xm10 ; a0 b0 + movhps xmm3, [srcq+ssq*0] + pmaddwd xmm4, xmm1, xm11 ; a0 b0 mova xmm1, xmm2 - vpdpwssd xmm5, xmm2, xm11 ; a1 b1 - pshufb xmm4, xm6 - mova xmm2, xmm3 - vpdpwssd xmm5, xmm3, xm12 ; a2 b2 - mova xmm3, xm8 - vpdpbusd xmm3, xmm4, xm7 - packssdw xmm4, xmm3, xmm3 - psraw xmm4, 2 - palignr xmm3, xmm4, xmm0, 12 - mova xmm0, xmm4 - punpcklwd xmm3, xmm4 ; 67 78 - vpdpwssd xmm5, xmm3, xm13 ; a3 b3 - packuswb xmm5, xmm5 - pshufb xmm5, xm14 - pextrw [dstq+dsq*0], xmm5, 0 - pextrw [dstq+dsq*1], xmm5, 1 + vpdpwssd xmm4, xmm2, xm12 ; a1 b1 + pshufb xmm3, xm5 + mova xmm2, xm9 + vpdpbusd xmm2, xmm3, xm7 + packssdw xmm3, xmm2, xmm2 + psraw xmm3, 2 + palignr xmm2, xmm3, xmm0, 12 + mova xmm0, xmm3 + punpcklwd 
xmm2, xmm3 ; 45 56 + vpdpwssd xmm4, xmm2, xm13 ; a2 b2 + packuswb xmm4, xmm4 + pshufb xmm4, xm10 + pextrw [dstq+dsq*0], xmm4, 0 + pextrw [dstq+dsq*1], xmm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop - vzeroupper RET .hv_w4: - movq xmm1, [r6+ssq*0] - vpbroadcastq ym2, [r6+ssq*1] - vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 - vinserti32x4 m2, [srcq+ssq*0], 2 - vinserti32x4 m1, [srcq+ssq*1], 2 - vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 - vbroadcasti32x4 m6, [subpel_h_shufA] - add srcq, ss3q - vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 - pshufb m2, m6 - pshufb m1, m6 - mova m0, m8 - vpdpbusd m0, m2, m7 - mova m4, m8 - vpdpbusd m4, m1, m7 + movq xm2, [srcq+nsq*2] + vpbroadcastq ym1, [srcq+nsq*1] + vinserti32x4 ym2, [srcq+ssq*0], 1 + vinserti32x4 m1, [srcq+ssq*1], 2 ; _ 1 3 + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m5, [subpel_h_shufA] + vinserti32x4 m2, [srcq+ssq*0], 2 ; 0 2 4 + pshufb m1, m5 + mova m0, m9 + pshufb m2, m5 + mova m3, m9 + vpdpbusd m0, m1, m7 mova ym1, [spel_hv_perm4a] + vpdpbusd m3, m2, m7 mova ym2, [spel_hv_perm4b] - mova ym3, [spel_hv_perm4c] + mov r6d, 0x5555 + mova ym6, [spel_hv_perm4d] + packssdw m0, m3 + kmovw k1, r6d + psraw m0, 2 ; _ 0 1 2 3 4 5 6 + vpermb ym1, ym1, ym0 ; 01 12 + vpermb m2, m2, m0 ; 23 34 +.hv_w4_loop: + movq xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym3, [srcq+ssq*0], 1 + pmaddwd ym4, ym1, ym11 ; a0 b0 + mova ym1, ym2 + pshufb ym3, ym5 + mova ym0, ym9 + vpdpbusd ym0, ym3, ym7 + vpdpwssd ym4, ym2, ym12 ; a1 b1 + vpsraw ym2{k1}, ym0, 2 ; 5 6 + vpermb ym2, ym6, ym2 ; 45 56 + vpdpwssd ym4, ym2, ym13 ; a2 b2 + packuswb ym4, ym4 + vpermb ym4, ym10, ym4 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m11, [base+subpel_filters+mxq*8+0] + vpbroadcastd m12, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m1, [base+subpel_filters+1+myq*8] + mov 
nsq, ssq + punpcklbw m0, m1 + neg nsq + psraw m0, 2 ; << 6 + pshufd m13, m0, q0000 + pshufd m14, m0, q1111 + pshufd m15, m0, q2222 + cmp wd, 8 + jne .hv_w16 + movu xm0, [srcq+nsq*2] + vinserti32x4 ym0, [srcq+nsq*1], 1 + vbroadcasti32x4 m1, [subpel_h_shufA] + vinserti32x4 m0, [srcq+ssq*0], 2 + vbroadcasti32x4 m4, [subpel_h_shufB] + vinserti32x4 m0, [srcq+ssq*1], 3 + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m7, [subpel_h_shufC] + vbroadcasti32x4 ym5, [srcq+ssq*0] + vbroadcasti32x8 m6, [subpel_h_shufA] + pshufb m1, m0, m1 ; 0 1 2 3 0123 + mova m2, m9 + vpdpbusd m2, m1, m11 + pshufb m4, m0, m4 ; 0 1 2 3 4567 + mova m1, m9 + vpdpbusd m1, m4, m11 + pshufb m0, m7 ; 0 1 2 3 89ab + pshufb ym7, ym5, ym6 ; 4 0123 4567 + mova ym3, ym9 + vpdpbusd ym3, ym7, ym11 + vbroadcasti32x8 m7, [subpel_h_shufB] + vpdpbusd m2, m4, m12 + mova m4, [spel_hv_perm8a] + pshufb ym5, ym7 ; 4 4567 89ab + vpdpbusd m1, m0, m12 + vpaddd m0, m4, [pb_32] {1to16} + vpdpbusd ym3, ym5, ym12 + mova m5, [spel_hv_perm8b] + mov r6, 0x55555555ff00 + packssdw m2, m1 + vpmovsdw xm3, ym3 + kmovq k1, r6 + psraw m2, 2 ; 0 1 2 3 + psraw xm3, 2 ; 4 + vpermb m1, m4, m2 ; 01 12 + kshiftrq k2, k1, 16 + vpermt2b m2, m0, m3 ; 23 34 +.hv_w8_loop: + vbroadcasti32x4 ym3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m3{k1}, [srcq+ssq*0] + pmaddwd m0, m1, m13 ; a0 b0 + pshufb m1, m3, m6 ; 5 6 0123 4567 + mova m4, m9 + vpdpbusd m4, m1, m11 + pshufb m3, m7 ; 5 6 4567 89ab + vpdpwssd m0, m2, m14 ; a1 b1 + mova m1, m2 + vpdpbusd m4, m3, m12 + psraw m2{k2}, m4, 2 ; 53 64 + vpermb m2, m5, m2 ; 45 56 + vpdpwssd m0, m2, m15 ; a2 b2 + packuswb m0, m0 + vpermb m0, m10, m0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + movu m19, [spel_hv_perm16a] + vpbroadcastd m7, [pb_4] + lea r6d, [wq*2-32] + mova m6, [spel_hv_perm16b] + paddb m20, m7, m19 + lea r6d, [hq+r6*8] + paddb m21, m7, m20 + mova ym10, [spel_hv_end16] + paddb m7, m6 +.hv_w16_loop0: + movu 
ym16, [srcq+nsq*2] + vinserti32x8 m16, [srcq+nsq*1], 1 + lea r4, [srcq+ssq*2] + movu ym17, [srcq+ssq*0] + vinserti32x8 m17, [srcq+ssq*1], 1 + mov r7, dstq + movu ym18, [r4 +ssq*0] + vpermb m2, m19, m16 ; 0 1 0123 89ab + mova m1, m9 + vpermb m3, m21, m16 ; 0 1 89ab ghij + vpdpbusd m1, m2, m11 + mova m2, m9 + vpermb m4, m19, m17 ; 2 3 0123 89ab + vpdpbusd m2, m3, m12 + mova m3, m9 + vpermb m5, m21, m17 ; 2 3 89ab ghij + vpdpbusd m3, m4, m11 + mova m4, m9 + vpermb m0, m6, m18 ; 4 0145 2367 89cd abef + vpdpbusd m4, m5, m12 + mova m5, m9 + vpermb m16, m20, m16 ; 0 1 4567 cdef + vpdpbusd m5, m0, m11 + vpermb m17, m20, m17 ; 2 3 4567 cdef + vpdpbusd m1, m16, m12 + vpermb m18, m7, m18 ; 4 4589 67ab cdgh efij + vpdpbusd m2, m16, m11 + vpdpbusd m3, m17, m12 + vpdpbusd m4, m17, m11 + vpdpbusd m5, m18, m12 + packssdw m1, m2 ; 01 + packssdw m3, m4 ; 23 + REPX {psraw x, 2}, m1, m3, m5 + vpshrdd m2, m1, m3, 16 ; 12 + vpshrdd m4, m3, m5, 16 ; 34 +.hv_w16_loop: + movu ym18, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti32x8 m18, [r4+ssq*0], 1 + pmaddwd m16, m1, m13 ; a0 + vpermb m1, m19, m18 ; 5 6 0123 89ab + pmaddwd m17, m2, m13 ; b0 + vpermb m2, m20, m18 ; 5 6 4567 cdef + mova m0, m9 + vpdpbusd m0, m1, m11 + vpermb m18, m21, m18 + mova m1, m9 + vpdpbusd m1, m2, m11 + vpdpwssd m16, m3, m14 ; a1 + vpdpwssd m17, m4, m14 ; b1 + vpdpbusd m0, m2, m12 + mova m2, m4 + vpdpbusd m1, m18, m12 + packssdw m0, m1 + mova m1, m3 + psraw m4, m0, 2 ; 5 6 + vpshrdd m3, m2, m4, 16 ; 4 5 + vpdpwssd m17, m4, m15 ; b2 + vpdpwssd m16, m3, m15 ; a2 + packuswb m16, m17 + vpermb m16, m10, m16 + mova [r7+dsq*0], xm16 + vextracti128 [r7+dsq*1], ym16, 1 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add srcq, 16 + add dstq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w16_loop0 + vzeroupper + RET + +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp, SHARP, SHARP + +cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, 
ss3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx512icl] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + tzcnt r6d, wd + lea myq, [base+subpel_filters+myq*8] + movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] + vpbroadcastd m7, [pw_512] + vpbroadcastw m8, [myq+0] + add r6, r8 + vpbroadcastw m9, [myq+2] + lea ss3q, [ssq*3] + vpbroadcastw m10, [myq+4] + sub srcq, ss3q + vpbroadcastw m11, [myq+6] + jmp r6 +.v_w2: + movd xmm2, [srcq+ssq*0] + pinsrw xmm2, [srcq+ssq*1], 2 + pinsrw xmm2, [srcq+ssq*2], 4 + add srcq, ss3q + pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm3, xmm1 ; 45 56 + punpcklbw xmm1, xmm2, xmm4 ; 01 12 + punpckhbw xmm2, xmm4 ; 23 34 +.v_w2_loop: + pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm9 ; a1 b1 + paddw xmm5, xmm2 + mova xmm2, xmm3 + pmaddubsw xmm3, xm10 ; a2 b2 + paddw xmm5, xmm3 + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 + punpcklbw xmm3, xmm4 ; 67 78 + pmaddubsw xmm4, xmm3, xm11 ; a3 b3 + paddw xmm5, xmm4 + pmulhrsw xmm5, xm7 + packuswb xmm5, xmm5 + pextrw [dstq+dsq*0], xmm5, 0 + pextrw [dstq+dsq*1], xmm5, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + pinsrd xmm2, [srcq+ssq*2], 2 + add srcq, ss3q + pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, ss3q + 
vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm3, xmm1 ; 45 56 + punpcklbw xmm1, xmm2, xmm4 ; 01 12 + punpckhbw xmm2, xmm4 ; 23 34 +.v_w4_loop: + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm9 ; a1 b1 + paddw xmm5, xmm2 + mova xmm2, xmm3 + pmaddubsw xmm3, xm10 ; a2 b2 + paddw xmm5, xmm3 + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 + punpcklbw xmm3, xmm4 ; 67 78 + pmaddubsw xmm4, xmm3, xm11 ; a3 b3 + paddw xmm5, xmm4 + pmulhrsw xmm5, xm7 + packuswb xmm5, xmm5 + movd [dstq+dsq*0], xmm5 + pextrd [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xmm1, [srcq+ssq*0] + vpbroadcastq ymm0, [srcq+ssq*1] + vpbroadcastq ymm2, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq ymm5, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm4, [srcq+ssq*2] + add srcq, ss3q + vpblendd ymm1, ymm0, 0x30 + vpblendd ymm0, ymm2, 0x30 + punpcklbw ymm1, ymm0 ; 01 12 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm2, ymm5, 0x30 + vpblendd ymm5, ymm3, 0x30 + punpcklbw ymm2, ymm5 ; 23 34 + vpblendd ymm3, ymm4, 0x30 + vpblendd ymm4, ymm0, 0x30 + punpcklbw ymm3, ymm4 ; 45 56 +.v_w8_loop: + vpbroadcastq ymm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw ymm5, ymm1, ym8 ; a0 b0 + mova ymm1, ymm2 + pmaddubsw ymm2, ym9 ; a1 b1 + paddw ymm5, ymm2 + mova ymm2, ymm3 + pmaddubsw ymm3, ym10 ; a2 b2 + paddw ymm5, ymm3 + vpblendd ymm3, ymm0, ymm4, 0x30 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm4, ymm4, ymm0, 0x30 + punpcklbw ymm3, ymm4 ; 67 78 + pmaddubsw ymm4, ymm3, ym11 ; a3 b3 + paddw ymm5, ymm4 + pmulhrsw ymm5, ym7 + vextracti128 xmm4, ymm5, 1 + packuswb xmm5, xmm4 + movq [dstq+dsq*0], xmm5 + movhps [dstq+dsq*1], xmm5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + 
vzeroupper + RET +.v_w16: + mova m12, [spel_v_perm16a] + vbroadcasti32x4 m1, [srcq+ssq*0] + vbroadcasti32x4 ym4, [srcq+ssq*1] + mov r6d, 0x0f + vbroadcasti32x4 m2, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti32x4 ym5, [srcq+ssq*0] + kmovb k1, r6d + vbroadcasti32x4 m3, [srcq+ssq*1] + vbroadcasti32x4 ym6, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m1{k1}, m4, m2, 0xcc + vshufpd m2{k1}, m5, m3, 0xcc + vshufpd m3{k1}, m6, m0, 0xcc + vpermb m1, m12, m1 ; 01 12 + vpermb m2, m12, m2 ; 23 34 + vpermb m3, m12, m3 ; 45 56 +.v_w16_loop: + pmaddubsw m4, m1, m8 ; a0 b0 + mova m1, m2 + pmaddubsw m5, m2, m9 ; a1 b1 + mova m2, m3 + pmaddubsw m6, m3, m10 ; a2 b2 + mova m3, m0 + paddw m4, m5 + vbroadcasti32x4 ym5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m3{k1}, m5, m0, 0xcc + vpermb m3, m12, m3 ; 67 78 + pmaddubsw m5, m3, m11 ; a3 b3 + paddw m4, m6 + paddw m4, m5 + pmulhrsw m4, m7 + vextracti32x8 ym5, m4, 1 + packuswb ym4, ym5 + mova [dstq+dsq*0], xm4 + vextracti32x4 [dstq+dsq*1], ym4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: + mova m12, [spel_v_perm32] + pmovzxbq m14, [pb_02461357] + vpshrdw m13, m12, m12, 8 + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + vpermb m1, m12, m0 ; 01 + vinserti32x8 m0, [srcq+ssq*2], 0 + add srcq, ss3q + vpermb m2, m13, m0 ; 12 + vinserti32x8 m0, [srcq+ssq*0], 1 + vpermb m3, m12, m0 ; 23 + vinserti32x8 m0, [srcq+ssq*1], 0 + vpermb m4, m13, m0 ; 34 + vinserti32x8 m0, [srcq+ssq*2], 1 + add srcq, ss3q + vpermb m5, m12, m0 ; 45 + vinserti32x8 m0, [srcq+ssq*0], 0 + vpermb m6, m13, m0 ; 56 +.v_w32_loop: + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddubsw m15, m1, m8 + mova m1, m3 + pmaddubsw m16, m2, m8 + mova m2, m4 + pmaddubsw m17, m3, m9 + mova m3, m5 + pmaddubsw m18, m4, m9 + mova m4, m6 + pmaddubsw m19, m5, m10 + vpermb m5, m12, m0 ; 67 + vinserti32x8 m0, [srcq+ssq*0], 0 + pmaddubsw m20, m6, m10 + vpermb m6, m13, m0 ; 
78 + paddw m15, m17 + pmaddubsw m17, m5, m11 + paddw m16, m18 + pmaddubsw m18, m6, m11 + paddw m15, m19 + paddw m16, m20 + paddw m15, m17 + paddw m16, m18 + pmulhrsw m15, m7 + pmulhrsw m16, m7 + packuswb m15, m16 + vpermq m15, m14, m15 + mova [dstq+dsq*0], ym15 + vextracti32x8 [dstq+dsq*1], m15, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: +.v_w128: + lea r6d, [hq+wq*4-256] + mov r4, srcq + mov r7, dstq +.v_loop0: + movu m2, [srcq+ssq*0] + movu m4, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + movu m13, [srcq+ssq*0] + movu m15, [srcq+ssq*1] + movu m17, [srcq+ssq*2] + add srcq, ss3q + movu m0, [srcq+ssq*0] + punpcklbw m1, m2, m4 ; 01l + punpckhbw m2, m4 ; 01h + punpcklbw m3, m4, m6 ; 12l + punpckhbw m4, m6 ; 12h + punpcklbw m5, m6, m13 ; 23l + punpckhbw m6, m13 ; 23h + punpcklbw m12, m13, m15 ; 34l + punpckhbw m13, m15 ; 34h + punpcklbw m14, m15, m17 ; 45l + punpckhbw m15, m17 ; 45h + punpcklbw m16, m17, m0 ; 56l + punpckhbw m17, m0 ; 56h +.v_loop: + pmaddubsw m18, m1, m8 ; a0l + mova m1, m5 + pmaddubsw m19, m2, m8 ; a0h + mova m2, m6 + pmaddubsw m20, m3, m8 ; b0l + mova m3, m12 + pmaddubsw m21, m4, m8 ; b0h + mova m4, m13 + pmaddubsw m5, m9 ; a1l + pmaddubsw m6, m9 ; a1h + pmaddubsw m12, m9 ; b1l + pmaddubsw m13, m9 ; b1h + paddw m18, m5 + mova m5, m14 + pmaddubsw m14, m10 ; a2l + paddw m19, m6 + mova m6, m15 + pmaddubsw m15, m10 ; a2h + paddw m20, m12 + mova m12, m16 + pmaddubsw m16, m10 ; b2l + paddw m21, m13 + mova m13, m17 + pmaddubsw m17, m10 ; b2h + paddw m18, m14 + paddw m19, m15 + paddw m20, m16 + paddw m21, m17 + movu m17, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw m14, m0, m17 ; 67l + punpckhbw m15, m0, m17 ; 67h + pmaddubsw m16, m14, m11 ; a3l + pmaddubsw m0, m15, m11 ; a3h + paddw m18, m16 + paddw m19, m0 + movu m0, [srcq+ssq*0] + punpcklbw m16, m17, m0 ; 78l + punpckhbw m17, m0 ; 78h + pmulhrsw m18, m7 + pmulhrsw m19, m7 + packuswb m18, m19 + mova [dstq+dsq*0], m18 + pmaddubsw m18, m16, m11 ; b3l 
+ pmaddubsw m19, m17, m11 ; b3h + paddw m18, m20 + paddw m19, m21 + pmulhrsw m18, m7 + pmulhrsw m19, m7 + packuswb m18, m19 + mova [dstq+dsq*1], m18 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_loop + add r4, 64 + add r7, 64 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 256 + jg .v_loop0 + vzeroupper + RET +.h: + test myd, 0xf00 + jnz .hv +.h2: + vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) + cmp wd, 4 + jl .h_w2 + vbroadcasti128 m6, [subpel_h_shufA] + je .h_w4 + tzcnt wd, wd + vbroadcasti128 m7, [subpel_h_shufB] + vbroadcasti128 m8, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] + vpbroadcastd m9, [base+mxq*8+subpel_filters+0] + vpbroadcastd m10, [base+mxq*8+subpel_filters+4] + add wq, r8 + jmp wq +.h_w2: + movzx mxd, mxb + dec srcq + mova xmm4, [subpel_h_shuf4] + vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] +.h_w2_loop: + movq xmm0, [srcq+ssq*0] + movhps xmm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xmm4 + mova xmm1, xm5 + vpdpbusd xmm1, xmm0, xmm3 + packssdw xmm0, xmm1, xmm1 + psraw xmm0, 6 + packuswb xmm0, xm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] +.h_w4_loop: + movq xmm0, [srcq+ssq*0] + movq xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xm6 + pshufb xmm1, xm6 + mova xmm2, xm5 + vpdpbusd xmm2, xmm0, xmm3 + mova xmm0, xm5 + vpdpbusd xmm0, xmm1, xmm3 + packssdw xmm0, xmm2, xmm0 + psraw xmm0, 6 + packuswb xmm0, xmm0 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mova m6, 
[spel_h_perm16] + vpbroadcastd m8, [pb_4] + paddb m7, m8, m6 + paddb m8, m7 +.h_w16_loop: + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3, 1 + vpmovuswb ym0, m0 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + movu ym0, [srcq+ssq*0+8*0] + vinserti32x8 m0, [srcq+ssq*1+8*0], 1 + movu ym1, [srcq+ssq*0+8*1] + vinserti32x8 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32 + RET +.h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + movu m0, [srcq+8*0] + movu m2, [srcq+8*1] + movu m1, [srcq+8*8] + movu m3, [srcq+8*9] + add srcq, ssq + PUT_8TAP_H 0, 4, 11, 12 + PUT_8TAP_H 2, 12, 11, 4 + PUT_8TAP_H 1, 4, 11, 12 + PUT_8TAP_H 3, 12, 11, 4 + packuswb m0, m2 + packuswb m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + dec hd + jg .h_w128 + RET +.hv: + vpbroadcastd m9, [pd_34] + pxor xm0, xm0 + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + dec srcq + vpbroadcastd m7, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq ym1, [base+subpel_filters+myq*8] + lea ss3q, [ssq*3] + mov r6, srcq + punpcklbw ym0, ym1 + sub r6, ss3q + psraw ym0, 2 ; << 6 + mova xm14, [spel_hv_end] + pshufd ym10, ym0, q0000 + pshufd ym11, ym0, q1111 + pshufd ym12, ym0, q2222 + pshufd ym13, ym0, q3333 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 ym6, [subpel_h_shuf4] + movq xmm2, [r6+ssq*0] + movhps xmm2, [r6+ssq*1] + movq xmm0, [r6+ssq*2] + movhps xmm0, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm4, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq ymm1, 
[srcq+ssq*0] + vpblendd ymm2, ymm3, 0x30 + vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ + vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 + pshufb ymm2, ym6 + pshufb ymm0, ym6 + mova ymm1, ym9 + vpdpbusd ymm1, ymm2, ym7 + mova ymm2, ym9 + vpdpbusd ymm2, ymm0, ym7 + packssdw ymm2, ymm1, ymm2 + psraw ymm2, 2 + vextracti128 xmm3, ymm2, 1 + palignr xmm4, xmm3, xmm2, 4 + punpcklwd xmm1, xmm2, xmm4 ; 01 12 + punpckhwd xmm2, xmm4 ; 23 34 + pshufd xmm0, xmm3, q2121 + punpcklwd xmm3, xmm0 ; 45 56 +.hv_w2_loop: + movq xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xmm4, [srcq+ssq*0] + pmaddwd xmm5, xmm1, xm10 ; a0 b0 + mova xmm1, xmm2 + vpdpwssd xmm5, xmm2, xm11 ; a1 b1 + pshufb xmm4, xm6 + mova xmm2, xmm3 + vpdpwssd xmm5, xmm3, xm12 ; a2 b2 + mova xmm3, xm9 + vpdpbusd xmm3, xmm4, xm7 + packssdw xmm4, xmm3, xmm3 + psraw xmm4, 2 + palignr xmm3, xmm4, xmm0, 12 + mova xmm0, xmm4 + punpcklwd xmm3, xmm4 ; 67 78 + vpdpwssd xmm5, xmm3, xm13 ; a3 b3 + packuswb xmm5, xmm5 + pshufb xmm5, xm14 + pextrw [dstq+dsq*0], xmm5, 0 + pextrw [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + vzeroupper + RET +.hv_w4: + movq xmm1, [r6+ssq*0] + vpbroadcastq ym2, [r6+ssq*1] + vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 + vinserti32x4 m2, [srcq+ssq*0], 2 + vinserti32x4 m1, [srcq+ssq*1], 2 + vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 + vbroadcasti32x4 m6, [subpel_h_shufA] + add srcq, ss3q + vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 + pshufb m2, m6 + pshufb m1, m6 + mova m0, m9 + vpdpbusd m0, m2, m7 + mova m4, m9 + vpdpbusd m4, m1, m7 + mova ym1, [spel_hv_perm4a] + mova ym2, [spel_hv_perm4b] + mova ym3, [spel_hv_perm4c] packssdw m0, m4 psraw m0, 2 ; _ 0 1 2 3 4 5 6 mov r6d, 0x5555 @@ -2100,11 +2634,10 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movq xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1 - mova ym5, ym9 - vpdpwssd ym5, ym1, ym10 ; a0 b0 + pmaddwd ym5, ym1, ym10 ; a0 b0 mova ym1, ym2 pshufb ym4, ym6 - mova ym0, ym8 + mova 
ym0, ym9 vpdpbusd ym0, ym4, ym7 vpdpwssd ym5, ym2, ym11 ; a1 b1 mova ym2, ym3 @@ -2129,10 +2662,8 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - vpbroadcastd m8, [pd_2] - vpbroadcastq m0, [base+subpel_filters+myq*8] - vpbroadcastd m9, [pd_32768] - punpcklbw m0, m8, m0 + vpbroadcastq m1, [base+subpel_filters+myq*8] + punpcklbw m0, m1 lea ss3q, [ssq*3] psraw m0, 2 ; << 6 pshufd m12, m0, q0000 @@ -2150,177 +2681,717 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 vinserti128 ymm2, [srcq+ssq*2], 1 vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3 add srcq, ss3q - vbroadcasti32x4 m4, [subpel_h_shufA] - vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ - vbroadcasti32x4 m7, [subpel_h_shufB] - vbroadcasti32x4 m17, [subpel_h_shufC] - pshufb m1, m6, m4 ; 0 1 2 3 0123 + vbroadcasti32x4 m4, [subpel_h_shufA] + vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ + vbroadcasti32x4 m7, [subpel_h_shufB] + vbroadcasti32x4 m8, [subpel_h_shufC] + pshufb m1, m6, m4 ; 0 1 2 3 0123 + mova m2, m9 + vpdpbusd m2, m1, m10 + pshufb m5, m6, m7 ; 0 1 2 3 4567 + mova m1, m9 + vpdpbusd m1, m5, m10 + pshufb m4, m0, m4 ; 4 5 6 _ 0123 + mova m3, m9 + vpdpbusd m3, m4, m10 + pshufb m7, m0, m7 ; 4 5 6 _ 4567 + mova m4, m9 + vpdpbusd m4, m7, m10 + pshufb m6, m8 + vpdpbusd m2, m5, m11 + vpdpbusd m1, m6, m11 + pshufb m6, m0, m8 + vpdpbusd m3, m7, m11 + vpdpbusd m4, m6, m11 + mova m5, [spel_hv_perm8a] + vpaddd m0, m5, [pb_32] {1to16} + mov r6, 0x55555555ff00 + packssdw m2, m1 + packssdw m3, m4 + mova m8, [spel_hv_perm8b] + psraw m2, 2 ; 0 1 2 3 + psraw m3, 2 ; 4 5 6 _ + vpermb m1, m5, m2 ; 01 12 + vbroadcasti32x8 m6, [subpel_h_shufA] + kmovq k1, r6 + vpermt2b m2, m0, m3 ; 23 34 + vbroadcasti32x8 m7, [subpel_h_shufB] + kshiftrq k2, k1, 16 + mova xm16, [spel_hv_end] + vpermb m3, m5, m3 ; 45 56 +.hv_w8_loop: + vbroadcasti32x4 ym4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m4{k1}, [srcq+ssq*0] + pmaddwd m0, m1, m12 ; a0 
b0 + pshufb m1, m4, m6 ; 7 8 0123 4567 + mova m5, m9 + vpdpbusd m5, m1, m10 + pshufb m4, m7 ; 7 8 4567 89ab + vpdpwssd m0, m2, m13 ; a1 b1 + mova m1, m2 + vpdpbusd m5, m4, m11 + mova m2, m3 + vpdpwssd m0, m3, m14 ; a2 b2 + psraw m3{k2}, m5, 2 ; 75 86 + vpermb m3, m8, m3 ; 67 78 + vpdpwssd m0, m3, m15 ; a3 b3 + packuswb m0, m0 + vpermb zmm1, m16, m0 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + vzeroupper + RET +.hv_w16: + WIN64_SPILL_XMM 23 + movu m22, [spel_hv_perm16a] + sub srcq, ss3q + vpbroadcastd m8, [pb_4] + lea r6d, [wq*2-32] + mova m7, [spel_hv_perm16b] + paddb m20, m8, m22 + mova ym16, [spel_hv_end16] + paddb m21, m8, m20 + lea r6d, [hq+r6*8] + paddb m8, m7 +.hv_w16_loop0: + movu ym17, [srcq+ssq*0] + vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 + lea r4, [srcq+ss3q] + movu ym18, [srcq+ssq*2] + vinserti32x8 m18, [r4 +ssq*0], 1 ; 2 3 + mov r7, dstq + movu ym19, [r4 +ssq*1] + vinserti32x8 m19, [r4 +ssq*2], 1 ; 4 5 + add r4, ss3q + vpermb m2, m22, m17 ; 0 1 0123 89ab + mova m1, m9 + vpermb m3, m21, m17 ; 0 1 89ab ghij + vpdpbusd m1, m2, m10 + mova m2, m9 + vpermb m4, m22, m18 ; 2 3 0123 89ab + vpdpbusd m2, m3, m11 + mova m3, m9 + vpermb m5, m21, m18 ; 2 3 89ab ghij + vpdpbusd m3, m4, m10 + mova m4, m9 + vpermb m6, m22, m19 ; 4 5 0123 89ab + vpdpbusd m4, m5, m11 + mova m5, m9 + vpermb m17, m20, m17 ; 0 1 4567 cdef + vpdpbusd m5, m6, m10 + mova m6, m9 + vpermb m0, m21, m19 ; 4 5 89ab ghij + vpdpbusd m1, m17, m11 + vpdpbusd m2, m17, m10 + movu ym17, [r4+ssq*0] ; 6 + vpermb m18, m20, m18 ; 2 3 4567 cdef + vpdpbusd m6, m0, m11 + vpermb m0, m7, m17 ; 6 0145 2367 89cd abef + vpdpbusd m3, m18, m11 + vpermb m19, m20, m19 ; 4 5 4567 cdef + vpdpbusd m4, m18, m10 + mova m18, m9 + vpermb m17, m8, m17 ; 6 4589 67ab cdgh efij + vpdpbusd m18, m0, m10 + packssdw m1, m2 + vpdpbusd m5, m19, m11 + vpdpbusd m6, m19, m10 + packssdw m3, m4 + vpdpbusd m18, m17, m11 + psraw m1, 2 ; 01 + psraw m3, 2 ; 23 + packssdw m5, m6 + 
vpshrdd m2, m1, m3, 16 ; 12 + psraw m5, 2 ; 45 + vpshrdd m4, m3, m5, 16 ; 34 + psraw m18, 2 + vpshrdd m6, m5, m18, 16 ; 56 +.hv_w16_loop: + movu ym19, [r4+ssq*1] + lea r4, [r4+ssq*2] + vinserti32x8 m19, [r4+ssq*0], 1 + pmaddwd m17, m1, m12 ; a0 + vpermb m1, m22, m19 ; 7 8 0123 89ab + pmaddwd m18, m2, m12 ; b0 + mova m0, m9 + vpermb m2, m21, m19 ; 7 8 89ab ghij + vpdpbusd m0, m1, m10 + mova m1, m9 + vpermb m19, m20, m19 ; 7 8 4567 cdef + vpdpbusd m1, m2, m11 + mova m2, m4 + vpdpwssd m17, m3, m13 ; a1 + vpdpwssd m18, m4, m13 ; b1 + mova m4, m6 + vpdpbusd m0, m19, m11 + vpdpbusd m1, m19, m10 + vpdpwssd m17, m5, m14 ; a2 + vpdpwssd m18, m6, m14 ; b2 + packssdw m0, m1 + mova m1, m3 + psraw m6, m0, 2 ; 78 + mova m3, m5 + vpshrdd m5, m4, m6, 16 ; 67 + vpdpwssd m18, m6, m15 ; b3 + vpdpwssd m17, m5, m15 ; a3 + packuswb m17, m18 + vpermb m17, m16, m17 + mova [r7+dsq*0], xm17 + vextracti128 [r7+dsq*1], ym17, 1 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add srcq, 16 + add dstq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .hv_w16_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_6tap_8bpc +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3 +%define base r7-prep_avx512icl + imul mxd, mxm, 0x010101 + add mxd, t0d ; 6tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 6tap_v, my, 4tap_v + lea r7, [prep_avx512icl] + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v +.prep: + tzcnt wd, wd + movzx wd, word [r7+wq*2+table_offset(prep,)] + add wq, r7 + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.v: + movzx mxd, myb + shr myd, 16 + cmp 
hd, 4 + cmove myd, mxd + tzcnt r5d, wd + lea myq, [base+subpel_filters+1+myq*8] + movzx r5d, word [r7+r5*2+table_offset(prep, _6tap_v)] + vpbroadcastd m7, [pw_8192] + sub srcq, ssq + vpbroadcastw m8, [myq+0] + add r5, r7 + vpbroadcastw m9, [myq+2] + lea ss3q, [ssq*3] + vpbroadcastw m10, [myq+4] + sub srcq, ssq + jmp r5 +.v_w4: + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + vpbroadcastd ymm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd ymm3, [srcq+ssq*0] + vpbroadcastd ymm0, [srcq+ssq*1] + vbroadcasti128 ymm5, [deint_shuf4] + vpblendd ymm1, ymm2, 0xeb + punpcklqdq ymm3, ymm0 + vpblendd ymm1, ymm3, 0x60 ; 0 1 2 _ 2 3 4 _ + pshufb ymm1, ymm5 ; 01 12 23 34 +.v_w4_loop: + pinsrd xmm0, [srcq+ssq*2], 1 + vpbroadcastd ymm2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpbroadcastd ymm3, [srcq+ssq*0] + vpblendd ymm2, ymm0, 0xeb + vpbroadcastd ymm0, [srcq+ssq*1] + punpcklqdq ymm3, ymm0 + vpblendd ymm2, ymm3, 0x60 ; 4 5 6 _ 6 7 8 _ + pshufb ymm2, ymm5 ; 45 56 67 78 + pmaddubsw ymm3, ymm1, ym8 ; a0 b0 c0 d0 + vperm2i128 ymm1, ymm2, 0x21 ; 23 34 45 56 + pmaddubsw ymm4, ymm2, ym10 ; a2 b2 c2 d2 + pmaddubsw ymm1, ym9 ; a1 b1 c1 d1 + paddw ymm3, ymm4 + paddw ymm3, ymm1 + pmulhrsw ymm3, ym7 + mova ymm1, ymm2 + mova [tmpq], ymm3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + mova m6, [spel_v_perm8] + movq xm1, [srcq+ssq*0] + mov r6d, 0x3e + movq xm2, [srcq+ssq*1] + kmovb k1, r6d + vpbroadcastq ym3, [srcq+ssq*2] + add srcq, ss3q + vpunpcklqdq ym2, [srcq+ssq*0] {1to4} + vpunpcklqdq m1{k1}, m3, [srcq+ssq*1] {1to8} + movq xm0, [srcq+ssq*1] + kshiftlb k2, k1, 2 + shufpd m1, m2, 0x18 ; 0 1 2 3 4 + vpermb m1, m6, m1 ; 01 12 23 34 +.v_w8_loop: + vpbroadcastq ym3, [srcq+ss3q ] + vpunpcklqdq ym0{k1}, ym3, [srcq+ssq*2] {1to4} + lea srcq, [srcq+ssq*4] + vpbroadcastq m3, [srcq+ssq*1] + vpunpcklqdq m0{k2}, m3, [srcq+ssq*0] {1to8} + pmaddubsw m4, m1, m8 ; a0 b0 c0 d0 + vpermb m2, m6, m0 ; 45 56 67 78 + mova xm0, xm3 + vshufi32x4 m1, m2, q1032 ; 23 34 45 56 + 
pmaddubsw m3, m2, m10 ; a3 b3 c3 d3 + pmaddubsw m5, m1, m9 ; a2 b2 c2 d2 + mova m1, m2 + paddw m4, m3 + paddw m4, m5 + pmulhrsw m4, m7 + mova [tmpq], m4 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mova m11, [spel_v_perm16b] + vbroadcasti32x4 m1, [srcq+ssq*0] + mov r6d, 0x0f + vbroadcasti32x4 ym3, [srcq+ssq*1] + vbroadcasti32x4 m2, [srcq+ssq*2] + kmovb k1, r6d + add srcq, ss3q + vbroadcasti32x4 ym4, [srcq+ssq*0] + vbroadcasti32x4 m0, [srcq+ssq*1] + vshufpd m1{k1}, m3, m2, 0xcc + vshufpd m2{k1}, m4, m0, 0xcc + vpermb m1, m11, m1 ; 01 12 + vpermb m2, m11, m2 ; 23 34 +.v_w16_loop: + pmaddubsw m3, m1, m8 ; a0 b0 + pmaddubsw m5, m2, m9 ; a1 b1 + vbroadcasti32x4 ym6, [srcq+ssq*2] + pmaddubsw m4, m2, m8 ; c0 d0 + vbroadcasti32x4 m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vshufpd m0{k1}, m6, m2, 0xcc + vbroadcasti32x4 ym6, [srcq+ssq*0] + vpermb m1, m11, m0 ; 45 56 + vbroadcasti32x4 m0, [srcq+ssq*1] + vshufpd m2{k1}, m6, m0, 0xcc + pmaddubsw m6, m1, m9 ; c1 d1 + vpermb m2, m11, m2 ; 67 78 + paddw m3, m5 + pmaddubsw m5, m1, m10 ; a2 b2 + paddw m4, m6 + pmaddubsw m6, m2, m10 ; c2 d2 + paddw m3, m5 + paddw m4, m6 + pmulhrsw m3, m7 + pmulhrsw m4, m7 + mova [tmpq+ 0], m3 + mova [tmpq+64], m4 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + movshdup m6, [bilin_v_perm64] + movu ym16, [srcq+ssq*0] + movu ym17, [srcq+ssq*1] + movu ym18, [srcq+ssq*2] + add srcq, ss3q + movu ym19, [srcq+ssq*0] + add srcq, ssq + movu ym20, [srcq+ssq*0] + vpermt2q m16, m6, m18 ; 0 2 + vpermt2q m17, m6, m19 ; 1 3 + vpermt2q m18, m6, m20 ; 2 4 + punpcklbw m0, m16, m17 ; 01 + punpcklbw m1, m17, m18 ; 12 + punpckhbw m2, m16, m17 ; 23 + punpckhbw m3, m17, m18 ; 34 +.v_w32_loop: + movu ym16, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu ym17, [srcq+ssq*0] + pmaddubsw m4, m0, m8 ; a0 + mova m0, m2 + pmaddubsw m2, m9 ; a1 + vpermt2q m16, m6, m17 ; 5 6 + pmaddubsw m5, m1, m8 ; b0 + mova m1, m3 + pmaddubsw m3, m9 ; b1 + shufpd m18, m16, 0x55 ; 4 5 + paddw m4, m2 + punpcklbw m2, m18, 
m16 ; 45 + paddw m5, m3 + punpckhbw m3, m18, m16 ; 56 + mova m18, m16 + pmaddubsw m16, m2, m10 ; a2 + pmaddubsw m17, m3, m10 ; b2 + paddw m4, m16 + paddw m5, m17 + pmulhrsw m4, m7 + pmulhrsw m5, m7 + mova [tmpq+ 0], m4 + mova [tmpq+64], m5 + add tmpq, 64*2 + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: +.v_w128: + mova m6, [bilin_v_perm64] + add wd, wd + lea r6d, [hq+wq] +.v_loop0: + vpermq m12, m6, [srcq+ssq*0] + vpermq m13, m6, [srcq+ssq*1] + lea r5, [srcq+ssq*2] + vpermq m14, m6, [r5 +ssq*0] + vpermq m15, m6, [r5 +ssq*1] + lea r5, [r5+ssq*2] + vpermq m16, m6, [r5 +ssq*0] + mov r7, tmpq + punpcklbw m0, m12, m13 ; 01 + punpckhbw m12, m13 + punpcklbw m1, m13, m14 ; 12 + punpckhbw m13, m14 + punpcklbw m2, m14, m15 ; 23 + punpckhbw m14, m15 + punpcklbw m3, m15, m16 ; 34 + punpckhbw m15, m16 +.v_loop: + pmaddubsw m17, m0, m8 ; a0 + vpermq m5, m6, [r5+ssq*1] + pmaddubsw m18, m12, m8 + mova m0, m2 + pmaddubsw m2, m9 ; a1 + mova m12, m14 + pmaddubsw m14, m9 + lea r5, [r5+ssq*2] + pmaddubsw m19, m1, m8 ; b0 + pmaddubsw m20, m13, m8 + mova m1, m3 + pmaddubsw m3, m9 ; b1 + mova m13, m15 + pmaddubsw m15, m9 + paddw m17, m2 + punpcklbw m2, m16, m5 ; 67 + paddw m18, m14 + punpckhbw m14, m16, m5 + vpermq m16, m6, [r5+ssq*0] + paddw m19, m3 + pmaddubsw m3, m2, m10 ; a3 + paddw m20, m15 + pmaddubsw m15, m14, m10 + paddw m17, m3 + punpcklbw m3, m5, m16 ; 78 + pmaddubsw m4, m3, m10 ; b3 + paddw m18, m15 + punpckhbw m15, m5, m16 + pmaddubsw m5, m15, m10 + paddw m19, m4 + paddw m20, m5 + REPX {pmulhrsw x, m7}, m17, m18, m19, m20 + mova [r7+wq*0+ 0], m17 + mova [r7+wq*0+64], m18 + mova [r7+wq*1+ 0], m19 + mova [r7+wq*1+64], m20 + lea r7, [r7+wq*2] + sub hd, 2 + jg .v_loop + add srcq, 64 + add tmpq, 128 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_loop0 + vzeroupper + RET +.h: + test myd, 0xf00 + jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2 +.hv: + vpbroadcastd m8, [pd_2] + vpbroadcastd m9, [pd_32] + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m11, 
[base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m3, [base+subpel_filters+1+myq*8] + vbroadcasti128 m10, [subpel_h_shufA] + lea r6, [ssq*2+1] + mov r3d, 0x30 + sub srcq, r6 + kmovb k1, r3d + vpbroadcastq ym2, [srcq+ssq*0] + lea ss3q, [ssq*3] + vpbroadcastq m1, [srcq+ssq*1] + kaddb k2, k1, k1 + vpbroadcastq m2{k1}, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m1{k2}, [srcq+ssq*0] ; _ _ 1 3 + punpcklbw m3, m3 + vpbroadcastq m2{k2}, [srcq+ssq*1] ; _ 0 2 4 + psraw m3, 8 ; sign-extend + mova m6, [spel_hv_perm4a] + kshiftrb k1, k1, 2 + movu m7, [spel_hv_perm4b] + pshufb m1, m10 + mova m0, m8 + vpdpbusd m0, m1, m11 + pshufb m2, m10 + mova m1, m8 + vpdpbusd m1, m2, m11 + pshufd m12, m3, q0000 + pshufd m13, m3, q1111 + pshufd m14, m3, q2222 + packssdw m0, m1 ; _ _ _ 0 1 2 3 4 + psraw m0, 2 + vpermb m1, m7, m0 ; 01 12 23 34 +.hv_w4_loop: + movq xm3, [srcq+ssq*2] + movq xm4, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpbroadcastq ym3{k1}, [srcq+ssq*0] ; 5 7 + vpbroadcastq ym4{k1}, [srcq+ssq*1] ; 6 8 + pshufb ym3, ym10 + mova ym2, ym8 + vpdpbusd ym2, ym3, ym11 + pshufb ym4, ym10 + mova ym3, ym8 + vpdpbusd ym3, ym4, ym11 + mova m4, m9 + vpdpwssd m4, m1, m12 ; a0 b0 c0 d0 + packssdw ym2, ym3 ; 5 6 7 8 + psraw ym2, 2 + vshufi32x4 m0, m2, q1032 ; _ 2 3 4 5 6 7 8 + vpermb m2, m6, m0 ; 23 34 45 56 + vpermb m1, m7, m0 ; 45 56 67 78 + vpdpwssd m4, m2, m13 ; a1 b1 c1 d1 + vpdpwssd m4, m1, m14 ; a2 b2 c2 d2 + psrad m4, 6 + vpmovdw [tmpq], m4 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastd m10, [base+subpel_filters+mxq*8+0] + vpbroadcastd m11, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [base+subpel_filters+1+myq*8] + lea r6, [ssq*2+3] + punpcklbw m0, m0 + sub srcq, r6 + psraw m0, 8 ; sign-extend + lea ss3q, [ssq*3] + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + cmp wd, 8 + jg .hv_w16 + movu xm16, 
[srcq+ssq*0] + vbroadcasti32x4 m19, [subpel_h_shufA] + vinserti128 ym16, [srcq+ssq*1], 1 + vbroadcasti32x4 m21, [subpel_h_shufC] + vinserti32x4 m16, [srcq+ssq*2], 2 + add srcq, ss3q + vinserti32x4 m16, [srcq+ssq*0], 3 + movu xm17, [srcq+ssq*1] + vbroadcasti32x4 m20, [subpel_h_shufB] + pshufb m3, m16, m19 ; 0 1 2 3 0123 mova m2, m8 - vpdpbusd m2, m1, m10 - pshufb m5, m6, m7 ; 0 1 2 3 4567 - mova m1, m8 - vpdpbusd m1, m5, m10 - pshufb m4, m0, m4 ; 4 5 6 _ 0123 + pshufb m0, m16, m21 ; 0 1 2 3 89ab + vpdpbusd m2, m3, m10 mova m3, m8 - vpdpbusd m3, m4, m10 - pshufb m7, m0, m7 ; 4 5 6 _ 4567 - mova m4, m8 - vpdpbusd m4, m7, m10 - pshufb m6, m17 - vpdpbusd m2, m5, m11 - vpdpbusd m1, m6, m11 - pshufb m6, m0, m17 - vpdpbusd m3, m7, m11 - vpdpbusd m4, m6, m11 - mova m5, [spel_hv_perm8a] - mova m0, [spel_hv_perm8b] - mov r6, 0x55555555ff00 - packssdw m2, m1 - packssdw m3, m4 - mova m18, [spel_hv_perm8c] - psraw m2, 2 ; 0 1 2 3 - psraw m3, 2 ; 4 5 6 _ - vpermb m1, m5, m2 ; 01 12 - vbroadcasti32x8 m6, [subpel_h_shufA] - kmovq k1, r6 - vpermt2b m2, m0, m3 ; 23 34 - vbroadcasti32x8 m7, [subpel_h_shufB] - kshiftrq k2, k1, 16 - mova xm16, [spel_hv_end] - vpermb m3, m5, m3 ; 45 56 + pshufb xm1, xm17, xm19 ; 3 4 5 6 0123 + vpdpbusd m3, m0, m11 + mova xm0, xm8 + pshufb xm18, xm17, xm21 ; 3 4 5 6 89ab + vpdpbusd xm0, xm1, xm10 + mova xm1, xm8 + pshufb m16, m20 ; 0 1 2 3 4567 + vpdpbusd xm1, xm18, xm11 + pshufb xm17, xm20 ; 3 4 5 6 4567 + vpdpbusd m2, m16, m11 + vpdpbusd m3, m16, m10 + vpdpbusd xm0, xm17, xm11 + vpdpbusd xm1, xm17, xm10 + packssdw m2, m3 + packssdw xm0, xm1 + psraw m2, 2 ; 0 1 2 3 + psraw xm0, 2 ; 4 + valignq m0, m2, 2 ; 1 2 3 4 + punpcklwd m1, m2, m0 ; 01 12 23 34 + punpckhwd m2, m0 .hv_w8_loop: - vbroadcasti32x4 ym4, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vbroadcasti32x4 m4{k1}, [srcq+ssq*0] - mova m0, m9 - vpdpwssd m0, m1, m12 ; a0 b0 - pshufb m1, m4, m6 ; 7 8 0123 4567 + movu xm16, [srcq+ssq*2] + vinserti128 ym16, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + 
vinserti32x4 m16, [srcq+ssq*0], 2 + vinserti32x4 m16, [srcq+ssq*1], 3 + pshufb m6, m16, m19 ; 5 6 7 8 0123 mova m5, m8 - vpdpbusd m5, m1, m10 - pshufb m4, m7 ; 7 8 4567 89ab - vpdpwssd m0, m2, m13 ; a1 b1 - mova m1, m2 - vpdpbusd m5, m4, m11 - mova m2, m3 - vpdpwssd m0, m3, m14 ; a2 b2 - psraw m3{k2}, m5, 2 ; 75 86 - vpermb m3, m18, m3 ; 67 78 - vpdpwssd m0, m3, m15 ; a3 b3 - packuswb m0, m0 - vpermb zmm1, m16, m0 - movq [dstq+dsq*0], xmm1 - movhps [dstq+dsq*1], xmm1 - lea dstq, [dstq+dsq*2] - sub hd, 2 + pshufb m3, m16, m21 ; 5 6 7 8 89ab + vpdpbusd m5, m6, m10 + mova m6, m8 + pshufb m16, m20 ; 5 6 7 8 4567 + vpdpbusd m6, m3, m11 + mova m3, m9 + vpdpwssd m3, m1, m12 ; a0 b0 c0 d0 + mova m4, m9 + vpdpwssd m4, m2, m12 + vpdpbusd m5, m16, m11 + vpdpbusd m6, m16, m10 + mova m16, m1 + packssdw m5, m6 + mova m6, m2 + psraw m5, 2 ; 5 6 7 8 + valignq m2, m5, m0, 6 ; 4 5 6 7 + mova m0, m5 + punpcklwd m1, m2, m5 ; 45 56 67 78 + punpckhwd m2, m5 + vpdpwssd m3, m1, m14 ; a2 b2 c2 d2 + vpdpwssd m4, m2, m14 + vshufi32x4 m16, m1, q1032 ; 23 34 45 56 + vshufi32x4 m6, m2, q1032 + vpdpwssd m3, m16, m13 ; a1 b1 c1 d1 + vpdpwssd m4, m6, m13 + psrad m3, 6 + psrad m4, 6 + packssdw m3, m4 + mova [tmpq], m3 + add tmpq, 64 + sub hd, 4 jg .hv_w8_loop vzeroupper RET .hv_w16: - movu m7, [spel_hv_perm16a] - sub srcq, ss3q - mova m20, [spel_hv_perm16b] - lea r6d, [wq*2-32] - mova m21, [spel_hv_perm16c] - mov r4, srcq - mov r7, dstq - mova ym16, [spel_hv_end16] - lea r6d, [hq+r6*8] + mova m16, [spel_h_perm16] + vpbroadcastd m18, [pb_4] + add wd, wd + paddb m17, m18, m16 + lea r6d, [hq+wq*8-256] + paddb m18, m17 .hv_w16_loop0: - movu ym17, [srcq+ssq*0] - vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 - movu ym18, [srcq+ssq*2] - add srcq, ss3q - vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3 - movu ym19, [srcq+ssq*1] - vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5 - add srcq, ss3q - vpermb m2, m7, m17 ; 0 1 0123 89ab - vpermb m0, m20, m17 ; 0 1 4567 cdef - vpermb m4, m7, m18 ; 2 3 0123 89ab - mova m1, m8 - vpdpbusd 
m1, m2, m10 - vpermb m5, m20, m18 ; 2 3 4567 cdef + movu ym19, [srcq+ssq*0] + vinserti32x8 m19, [srcq+ssq*1], 1 + lea r5, [srcq+ssq*2] + movu ym20, [r5 +ssq*0] + vinserti32x8 m20, [r5 +ssq*1], 1 + lea r5, [r5 +ssq*2] + movu ym21, [r5 +ssq*0] + mov r7, tmpq + vpermb m3, m16, m19 ; 0 1 0123 89ab mova m2, m8 - vpdpbusd m2, m0, m10 - vpermb m17, m21, m17 ; 0 1 89ab ghij + vpermb m4, m18, m19 ; 0 1 89ab ghij + vpdpbusd m2, m3, m10 mova m3, m8 - vpdpbusd m3, m4, m10 - vpermb m6, m7, m19 ; 4 5 0123 89ab + vpermb m5, m16, m20 ; 2 3 0123 89ab + vpdpbusd m3, m4, m11 mova m4, m8 + vpermb m0, m18, m20 ; 2 3 89ab ghij vpdpbusd m4, m5, m10 - vpermb m18, m21, m18 ; 2 3 89ab ghij - vpdpbusd m1, m0, m11 - movu ym0, [srcq+ssq*0] ; 6 - vpdpbusd m2, m17, m11 - vpermb m17, m20, m19 ; 4 5 4567 cdef - vpdpbusd m3, m5, m11 mova m5, m8 + vpermb ym1, ym16, ym21 ; 4 0123 89ab + vpdpbusd m5, m0, m11 + mova ym0, ym8 + vpermb ym6, ym18, ym21 ; 4 89ab ghij + vpdpbusd ym0, ym1, ym10 + mova ym1, ym8 + vpermb m19, m17, m19 ; 0 1 4567 cdef + vpdpbusd ym1, ym6, ym11 + vpermb m20, m17, m20 ; 2 3 4567 cdef + vpdpbusd m2, m19, m11 + vpdpbusd m3, m19, m10 + vpermb ym21, ym17, ym21 ; 4 4567 cdef + vpdpbusd m4, m20, m11 + vpdpbusd m5, m20, m10 + vpdpbusd ym0, ym21, ym11 + vpdpbusd ym1, ym21, ym10 + packssdw m2, m3 ; 0 1 + packssdw m4, m5 ; 2 3 + packssdw ym0, ym1 ; 4 + REPX {psraw x, 2}, m2, m4, ym0 + vshufi32x4 m3, m2, m4, q1032 ; 1 2 + vshufi32x4 m0, m4, m0, q1032 ; 3 4 + punpcklwd m1, m2, m3 ; 01 12 + punpckhwd m2, m3 + punpcklwd m3, m4, m0 ; 23 34 + punpckhwd m4, m0 +.hv_w16_loop: + movu ym19, [r5+ssq*1] + lea r5, [r5+ssq*2] + vinserti32x8 m19, [r5+ssq*0], 1 + vpermb m6, m16, m19 ; 5 6 0123 89ab + mova m5, m8 + vpermb m20, m18, m19 ; 5 6 89ab ghij vpdpbusd m5, m6, m10 mova m6, m8 - vpdpbusd m6, m17, m10 - vpdpbusd m4, m18, m11 - mova m18, [spel_hv_perm16d] - vpermb m18, m18, m0 ; 6 0145 2367 89cd abef - vpdpbusd m5, m17, m11 - vpermb m19, m21, m19 ; 4 5 89ab ghij - mova m17, m8 - vpdpbusd m17, m18, m10 
- mova m18, [spel_hv_perm16e] - vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij - packssdw m1, m2 ; 01 - vpdpbusd m6, m19, m11 - packssdw m3, m4 ; 23 - vpdpbusd m17, m0, m11 - psraw m1, 2 - packssdw m5, m6 ; 45 - psraw m3, 2 - vpshrdd m2, m1, m3, 16 ; 12 - psraw m5, 2 - vpshrdd m4, m3, m5, 16 ; 34 - psraw m17, 2 - vpshrdd m6, m5, m17, 16 ; 56 -.hv_w16_loop: - movu ym18, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vinserti32x8 m18, [srcq+ssq*0], 1 - mova m0, m9 - vpdpwssd m0, m1, m12 ; a0 - vpermb m1, m7, m18 ; 7 8 0123 89ab - mova m17, m9 - vpdpwssd m17, m2, m12 ; b0 - vpermb m2, m20, m18 ; 7 8 4567 cdef - mova m19, m8 - vpdpbusd m19, m1, m10 - vpermb m18, m21, m18 - mova m1, m8 - vpdpbusd m1, m2, m10 - vpdpwssd m0, m3, m13 ; a1 - vpdpwssd m17, m4, m13 ; b1 - vpdpbusd m19, m2, m11 - mova m2, m4 - vpdpbusd m1, m18, m11 - mova m4, m6 - vpdpwssd m0, m5, m14 ; a2 - vpdpwssd m17, m6, m14 ; b2 - packssdw m19, m1 + vpermb m19, m17, m19 ; 5 6 4567 cdef + vpdpbusd m6, m20, m11 + mova m20, m9 + vpdpwssd m20, m1, m12 ; a0 b0 + mova m21, m9 + vpdpwssd m21, m2, m12 + vpdpbusd m5, m19, m11 + vpdpbusd m6, m19, m10 + vpdpwssd m20, m3, m13 ; a1 b1 + vpdpwssd m21, m4, m13 + packssdw m5, m6 mova m1, m3 - mova m3, m5 - psraw m6, m19, 2 ; 7 8 - vpshrdd m5, m4, m6, 16 ; 6 7 - vpdpwssd m17, m6, m15 ; b3 - vpdpwssd m0, m5, m15 ; a3 - packuswb m0, m17 - vpermb zmm1, m16, m0 - mova [dstq+dsq*0], xmm1 - vextracti128 [dstq+dsq*1], ymm1, 1 - lea dstq, [dstq+dsq*2] + psraw m5, 2 ; 5 6 + mova m2, m4 + vshufi32x4 m4, m0, m5, q1032 ; 4 5 + mova m0, m5 + punpcklwd m3, m4, m0 ; 45 56 + punpckhwd m4, m0 + vpdpwssd m20, m3, m14 ; a2 b2 + vpdpwssd m21, m4, m14 + psrad m20, 6 + psrad m21, 6 + packssdw m20, m21 + mova [r7+wq*0], ym20 + vextracti32x8 [r7+wq*1], m20, 1 + lea r7, [r7+wq*2] sub hd, 2 jg .hv_w16_loop - add r4, 16 - add r7, 16 + add srcq, 16 + add tmpq, 32 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 1<<8 jg .hv_w16_loop0 vzeroupper @@ -2353,57 +3424,333 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, 
ds, src, ss, w, h, mx, my, ss3 mova [tmpq+64*1], m1 %endmacro -%if WIN64 -DECLARE_REG_TMP 6, 4 -%else -DECLARE_REG_TMP 6, 7 -%endif - -%define PREP_8TAP_FN FN prep_8tap, - +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc +PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH -PREP_8TAP_FN regular, REGULAR, REGULAR -cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 +cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx512icl] - movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 - jnz .v - tzcnt wd, wd - movzx wd, word [r7+wq*2+table_offset(prep,)] - add wq, r7 - lea r6, [strideq*3] -%if WIN64 - pop r7 -%endif - jmp wq + jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep +.v: + movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. + shr myd, 16 ; Note that the code is 8-tap only, having + cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 + cmove myd, mxd ; had a negligible effect on performance. 
+ tzcnt r5d, wd + lea myq, [base+subpel_filters+myq*8] + movzx r5d, word [r7+r5*2+table_offset(prep, _8tap_v)] + vpbroadcastd m7, [pw_8192] + vpbroadcastw m8, [myq+0] + add r5, r7 + vpbroadcastw m9, [myq+2] + lea stride3q, [strideq*3] + vpbroadcastw m10, [myq+4] + sub srcq, stride3q + vpbroadcastw m11, [myq+6] + jmp r5 +.v_w4: + movd xmm0, [srcq+strideq*0] + vpbroadcastd ymm1, [srcq+strideq*2] + vpbroadcastd xmm2, [srcq+strideq*1] + vpbroadcastd ymm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _ + vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _ + vpbroadcastd ymm0, [srcq+strideq*0] + vpbroadcastd ymm2, [srcq+strideq*1] + vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _ + vpbroadcastd ymm0, [srcq+strideq*2] + vbroadcasti128 ymm5, [deint_shuf4] + vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5 + vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5 + vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _ + punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34 + vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6 + punpckhbw ymm2, ymm3 ; 23 34 45 56 +.v_w4_loop: + pinsrd xmm0, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vpbroadcastd ymm3, [srcq+strideq*0] + vpbroadcastd ymm4, [srcq+strideq*1] + vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _ + vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _ + vpbroadcastd ymm0, [srcq+strideq*2] + vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _ + pshufb ymm3, ymm5 ; 67 78 89 9a + pmaddubsw ymm4, ymm1, ym8 + vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78 + pmaddubsw ymm2, ym9 + paddw ymm4, ymm2 + mova ymm2, ymm3 + pmaddubsw ymm3, ym11 + paddw ymm3, ymm4 + pmaddubsw ymm4, ymm1, ym10 + paddw ymm3, ymm4 + pmulhrsw ymm3, ym7 + mova [tmpq], ymm3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + mova m6, [spel_v_perm8] + movq xm1, [srcq+strideq*0] + mov r6d, 0x3e + movq xm2, [srcq+strideq*1] + vpbroadcastq ym3, [srcq+strideq*2] + kmovb k1, r6d + vpbroadcastq ym4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpunpcklqdq 
m1{k1}, m3, [srcq+strideq*0] {1to8} + vpunpcklqdq m2{k1}, m4, [srcq+strideq*1] {1to8} + movq xm0, [srcq+strideq*2] + kshiftlb k2, k1, 2 + shufpd m1, m2, 0x30 ; 0 1 2 3 4 5 + vshufi32x4 m2, m1, m0, q0021 ; 2 3 4 5 6 _ + vpermb m1, m6, m1 ; 01 12 23 34 + vpermb m2, m6, m2 ; 23 34 45 56 +.v_w8_loop: + vpbroadcastq ym3, [srcq+strideq*4] + vpunpcklqdq ym0{k1}, ym3, [srcq+stride3q] {1to4} + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*2] + vpunpcklqdq m0{k2}, m3, [srcq+strideq*1] {1to8} + pmaddubsw m4, m1, m8 ; a0 b0 c0 d0 + mova m1, m2 + pmaddubsw m5, m2, m9 ; a1 b1 c1 d1 + vpermb m2, m6, m0 ; 67 78 89 9a + mova xm0, xm3 + vshufi32x4 m1, m2, q1032 ; 45 56 67 78 + pmaddubsw m3, m2, m11 ; a3 b3 c3 d3 + paddw m4, m5 + pmaddubsw m5, m1, m10 ; a2 b2 c2 d2 + paddw m4, m3 + paddw m4, m5 + pmulhrsw m4, m7 + mova [tmpq], m4 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mova m12, [spel_v_perm16b] + vbroadcasti32x4 m1, [srcq+strideq*0] + mov r6d, 0x0f + vbroadcasti32x4 ym4, [srcq+strideq*1] + vbroadcasti32x4 m2, [srcq+strideq*2] + kmovb k1, r6d + vbroadcasti32x4 ym5, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vbroadcasti32x4 m3, [srcq+strideq*0] + vbroadcasti32x4 ym6, [srcq+strideq*1] + vbroadcasti32x4 m0, [srcq+strideq*2] + vshufpd m1{k1}, m4, m2, 0xcc + vshufpd m2{k1}, m5, m3, 0xcc + vshufpd m3{k1}, m6, m0, 0xcc + vpermb m1, m12, m1 ; 01 12 + vpermb m2, m12, m2 ; 23 34 + vpermb m3, m12, m3 ; 45 56 +.v_w16_loop: + pmaddubsw m4, m1, m8 ; a0 b0 + mova m1, m3 + pmaddubsw m13, m2, m9 ; a1 b1 + vbroadcasti32x4 ym6, [srcq+stride3q ] + pmaddubsw m5, m2, m8 ; c0 d0 + lea srcq, [srcq+strideq*4] + pmaddubsw m14, m3, m9 ; c1 d1 + vbroadcasti32x4 m3, [srcq+strideq*0] + vshufpd m0{k1}, m6, m3, 0xcc + vbroadcasti32x4 ym6, [srcq+strideq*1] + vpermb m2, m12, m0 ; 67 78 + vbroadcasti32x4 m0, [srcq+strideq*2] + vshufpd m3{k1}, m6, m0, 0xcc + paddw m4, m13 + pmaddubsw m13, m1, m10 ; a2 b2 + vpermb m3, m12, m3 ; 89 9a + paddw m5, m14 + pmaddubsw m14, m2, m10 ; c2 d2 
+ pmaddubsw m15, m2, m11 ; a3 b3 + pmaddubsw m6, m3, m11 ; c3 d3 + paddw m4, m13 + paddw m5, m14 + paddw m4, m15 + paddw m5, m6 + pmulhrsw m4, m7 + pmulhrsw m5, m7 + mova [tmpq+ 0], m4 + mova [tmpq+64], m5 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + movshdup m21, [bilin_v_perm64] + movu ym16, [srcq+strideq*0] + movu ym17, [srcq+strideq*1] + movu ym18, [srcq+strideq*2] + add srcq, stride3q + movu ym19, [srcq+strideq*0] + vpermt2q m16, m21, m19 ; 0 3 + movu ym20, [srcq+strideq*1] + vpermt2q m17, m21, m20 ; 1 4 + movu ym20, [srcq+strideq*2] + add srcq, stride3q + vpermt2q m18, m21, m20 ; 2 5 + movu ym20, [srcq+strideq*0] + vpermt2q m19, m21, m20 ; 3 6 + punpcklbw m0, m16, m17 ; 01 + punpcklbw m1, m17, m18 ; 12 + punpcklbw m2, m18, m19 ; 23 + punpckhbw m3, m16, m17 ; 34 + punpckhbw m4, m17, m18 ; 45 + punpckhbw m5, m18, m19 ; 56 +.v_w32_loop: + movu ym16, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym17, [srcq+strideq*0] + pmaddubsw m14, m0, m8 + mova m0, m2 + pmaddubsw m15, m1, m8 + mova m1, m3 + pmaddubsw m2, m9 + vpermt2q m16, m21, m17 ; 7 8 + pmaddubsw m3, m9 + pmaddubsw m12, m4, m10 + pmaddubsw m13, m5, m10 + shufpd m19, m16, 0x55 ; 6 7 + paddw m14, m2 + mova m2, m4 + punpcklbw m4, m19, m16 ; 67 + paddw m15, m3 + mova m3, m5 + punpckhbw m5, m19, m16 ; 78 + paddw m14, m12 + paddw m15, m13 + pmaddubsw m12, m4, m11 + pmaddubsw m13, m5, m11 + mova m19, m16 + paddw m14, m12 + paddw m15, m13 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + mova [tmpq+ 0], m14 + mova [tmpq+64], m15 + add tmpq, 64*2 + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: +.v_w128: + WIN64_SPILL_XMM 24 + mova m23, [bilin_v_perm64] + add wd, wd + lea r6d, [hq+wq] +.v_loop0: + vpermq m12, m23, [srcq+strideq*0] + vpermq m13, m23, [srcq+strideq*1] + lea r5, [srcq+strideq*2] + vpermq m14, m23, [r5 +strideq*0] + vpermq m15, m23, [r5 +strideq*1] + lea r5, [r5+strideq*2] + vpermq m16, m23, [r5 +strideq*0] + vpermq m17, m23, [r5 +strideq*1] + lea r5, [r5+strideq*2] + vpermq m18, 
m23, [r5 +strideq*0] + mov r7, tmpq + punpcklbw m0, m12, m13 ; 01 + punpckhbw m12, m13 + punpcklbw m1, m13, m14 ; 12 + punpckhbw m13, m14 + punpcklbw m2, m14, m15 ; 23 + punpckhbw m14, m15 + punpcklbw m3, m15, m16 ; 34 + punpckhbw m15, m16 + punpcklbw m4, m16, m17 ; 45 + punpckhbw m16, m17 + punpcklbw m5, m17, m18 ; 56 + punpckhbw m17, m18 +.v_loop: + pmaddubsw m19, m0, m8 ; a0 + vpermq m6, m23, [r5+strideq*1] + pmaddubsw m20, m12, m8 + mova m0, m2 + pmaddubsw m2, m9 ; a1 + mova m12, m14 + pmaddubsw m14, m9 + lea r5, [r5+strideq*2] + pmaddubsw m21, m1, m8 ; b0 + pmaddubsw m22, m13, m8 + mova m1, m3 + pmaddubsw m3, m9 ; b1 + mova m13, m15 + pmaddubsw m15, m9 + paddw m19, m2 + mova m2, m4 + pmaddubsw m4, m10 ; a2 + paddw m20, m14 + mova m14, m16 + pmaddubsw m16, m10 + paddw m21, m3 + mova m3, m5 + pmaddubsw m5, m10 ; b2 + paddw m22, m15 + mova m15, m17 + pmaddubsw m17, m10 + paddw m19, m4 + punpcklbw m4, m18, m6 ; 67 + paddw m20, m16 + punpckhbw m16, m18, m6 + vpermq m18, m23, [r5+strideq*0] + paddw m21, m5 + pmaddubsw m5, m4, m11 ; a3 + paddw m22, m17 + pmaddubsw m17, m16, m11 + paddw m19, m5 + punpcklbw m5, m6, m18 ; 78 + paddw m20, m17 + punpckhbw m17, m6, m18 + pmaddubsw m6, m5, m11 ; b3 + paddw m21, m6 + pmaddubsw m6, m17, m11 + paddw m22, m6 + REPX {pmulhrsw x, m7}, m19, m20, m21, m22 + mova [r7+wq*0+ 0], m19 + mova [r7+wq*0+64], m20 + mova [r7+wq*1+ 0], m21 + mova [r7+wq*1+64], m22 + lea r7, [r7+wq*2] + sub hd, 2 + jg .v_loop + add srcq, 64 + add tmpq, 128 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_loop0 + RET .h: + RESET_STACK_STATE test myd, 0xf00 jnz .hv +.h2: vpbroadcastd m4, [pd_2] - WIN64_SPILL_XMM 10 cmp wd, 4 je .h_w4 tzcnt wd, wd shr mxd, 16 sub srcq, 3 movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0] - vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4] + vpbroadcastd m8, [base+subpel_filters+mxq*8+0] + vpbroadcastd m9, [base+subpel_filters+mxq*8+4] add wq, r7 jmp wq .h_w4: @@ 
-2411,7 +3758,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 vbroadcasti128 ym5, [subpel_h_shufA] mov r3d, 0x4 dec srcq - vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + vpbroadcastd ym6, [base+subpel_filters+mxq*8+2] kmovb k1, r3d lea stride3q, [strideq*3] .h_w4_loop: @@ -2461,10 +3808,11 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .h_w8_loop RET .h_w16: - mova m5, [spel_h_perm16a] - mova m6, [spel_h_perm16b] - mova m7, [spel_h_perm16c] + mova m5, [spel_h_perm16] + vpbroadcastd m7, [pb_4] lea stride3q, [strideq*3] + paddb m6, m7, m5 + paddb m7, m6 .h_w16_loop: movu ym0, [srcq+strideq*0] movu ym1, [srcq+strideq*2] @@ -2477,9 +3825,10 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .h_w16_loop RET .h_w32: - mova m5, [spel_h_perm32a] - mova m6, [spel_h_perm32b] - mova m7, [spel_h_perm32c] + mova m5, [spel_h_perm32] + vpbroadcastd m7, [pb_4] + paddb m6, m7, m5 + paddb m7, m6 .h_w32_loop: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] @@ -2495,409 +3844,47 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 .h_w128: mov r6, -64*1 .h_start: - mova m5, [spel_h_perm32a] - mova m6, [spel_h_perm32b] - mova m7, [spel_h_perm32c] + mova m5, [spel_h_perm32] + vpbroadcastd m7, [pb_4] sub srcq, r6 + paddb m6, m7, m5 + paddb m7, m6 +.h_loop0: mov r5, r6 .h_loop: - movu m0, [srcq+r6+32*0] - movu m1, [srcq+r6+32*1] + movu m0, [srcq+r5+32*0] + movu m1, [srcq+r5+32*1] PREP_8TAP_H add tmpq, 64*2 - add r6, 64 + add r5, 64 jle .h_loop add srcq, strideq - mov r6, r5 dec hd - jg .h_loop - RET -.v: - movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. - shr myd, 16 ; Note that the code is 8-tap only, having - tzcnt wd, wd - cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 - cmove myd, mxd ; had a negligible effect on performance. - ; TODO: Would a 6-tap code path be worth it? 
- lea myq, [r7+myq*8+subpel_filters-prep_avx512icl] - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)] - add wq, r7 - lea stride3q, [strideq*3] - sub srcq, stride3q - vpbroadcastd m7, [pw_8192] - vpbroadcastw m8, [myq+0] - vpbroadcastw m9, [myq+2] - vpbroadcastw m10, [myq+4] - vpbroadcastw m11, [myq+6] - jmp wq -.v_w4: - movd xmm0, [srcq+strideq*0] - vpbroadcastd ymm1, [srcq+strideq*2] - vpbroadcastd xmm2, [srcq+strideq*1] - vpbroadcastd ymm3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _ - vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _ - vpbroadcastd ymm0, [srcq+strideq*0] - vpbroadcastd ymm2, [srcq+strideq*1] - vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _ - vpbroadcastd ymm0, [srcq+strideq*2] - vbroadcasti128 ymm5, [deint_shuf4] - vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5 - vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5 - vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _ - punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34 - vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6 - punpckhbw ymm2, ymm3 ; 23 34 45 56 -.v_w4_loop: - pinsrd xmm0, [srcq+stride3q ], 1 - lea srcq, [srcq+strideq*4] - vpbroadcastd ymm3, [srcq+strideq*0] - vpbroadcastd ymm4, [srcq+strideq*1] - vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _ - vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _ - vpbroadcastd ymm0, [srcq+strideq*2] - vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _ - pshufb ymm3, ymm5 ; 67 78 89 9a - pmaddubsw ymm4, ymm1, ym8 - vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78 - pmaddubsw ymm2, ym9 - paddw ymm4, ymm2 - mova ymm2, ymm3 - pmaddubsw ymm3, ym11 - paddw ymm3, ymm4 - pmaddubsw ymm4, ymm1, ym10 - paddw ymm3, ymm4 - pmulhrsw ymm3, ym7 - mova [tmpq], ymm3 - add tmpq, 32 - sub hd, 4 - jg .v_w4_loop - vzeroupper - RET -.v_w8: - mov r3d, 0xf044 - kmovw k1, r3d - kshiftrw k2, k1, 8 - movq xm0, [srcq+strideq*0] - vpbroadcastq ym1, [srcq+strideq*1] - vpbroadcastq m2, [srcq+strideq*2] - vpbroadcastq m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpbroadcastq m4, 
[srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - vpbroadcastq m6, [srcq+strideq*2] - vmovdqa64 ym0{k1}, ym1 - vmovdqa64 ym1{k1}, ym2 - vmovdqa64 m2{k1}, m3 - vmovdqa64 m3{k1}, m4 - vmovdqa64 m4{k1}, m5 - vmovdqa64 m5{k1}, m6 - punpcklbw ym0, ym1 ; 01 12 __ __ - punpcklbw m2, m3 ; 23 34 23 34 - punpcklbw m4, m5 ; 45 56 45 56 - vmovdqa64 m0{k2}, m2 ; 01 12 23 34 - vmovdqa64 m2{k2}, m4 ; 23 34 45 56 -.v_w8_loop: - vpbroadcastq m1, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpbroadcastq m3, [srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - pmaddubsw m14, m0, m8 - pmaddubsw m15, m2, m9 - vpblendmq m0{k1}, m6, m1 - vpblendmq m2{k1}, m1, m3 - vpbroadcastq m6, [srcq+strideq*2] - paddw m14, m15 - punpcklbw m2, m0, m2 ; 67 78 67 78 - vpblendmq m12{k1}, m3, m5 - vpblendmq m13{k1}, m5, m6 - vpblendmq m0{k2}, m4, m2 ; 45 56 67 78 - punpcklbw m4, m12, m13 ; 89 9a 89 9a - vmovdqa64 m2{k2}, m4 ; 67 78 89 9a - pmaddubsw m12, m0, m10 - pmaddubsw m13, m2, m11 - paddw m14, m12 - paddw m14, m13 - pmulhrsw m14, m7 - mova [tmpq], m14 - add tmpq, 64 - sub hd, 4 - jg .v_w8_loop - RET -.v_w16: - mov r3d, 0xf0 - kmovb k1, r3d - vbroadcasti128 m0, [srcq+strideq*0] - vbroadcasti128 m1, [srcq+strideq*1] - vbroadcasti128 m2, [srcq+strideq*2] - vbroadcasti128 m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vbroadcasti128 m4, [srcq+strideq*0] - vbroadcasti128 m5, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*2] - vmovdqa64 m0{k1}, m1 - vmovdqa64 m1{k1}, m2 - vmovdqa64 m2{k1}, m3 - vmovdqa64 m3{k1}, m4 - vmovdqa64 m4{k1}, m5 - vmovdqa64 m5{k1}, m6 - shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b - shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b - shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_-- - shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_-- - punpckhbw m2, m0, m1 ; 23a 23b 34a 34b - punpcklbw m0, m1 ; 01a 01b 12a 12b - punpcklbw m4, m5 ; 45a 45b 56a 56b -.v_w16_loop: - vbroadcasti128 m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vbroadcasti128 m5, [srcq+strideq*0] - 
vpblendmq m1{k1}, m6, m3 - vmovdqa64 m3{k1}, m5 - pmaddubsw m12, m0, m8 - pmaddubsw m13, m2, m8 - pmaddubsw m14, m2, m9 - pmaddubsw m15, m4, m9 - pmaddubsw m0, m4, m10 - vbroadcasti128 m2, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*2] - paddw m12, m14 - paddw m13, m15 - paddw m12, m0 - vmovdqa64 m5{k1}, m2 - vmovdqa64 m2{k1}, m6 - mova m0, m4 - shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b - shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab - punpcklbw m2, m1, m3 ; 67a 67b 78a 78b - punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab - pmaddubsw m14, m2, m10 - pmaddubsw m15, m2, m11 - paddw m13, m14 - paddw m12, m15 - pmaddubsw m14, m4, m11 - paddw m13, m14 - pmulhrsw m12, m7 - pmulhrsw m13, m7 - mova [tmpq+ 0], m12 - mova [tmpq+64], m13 - add tmpq, 64*2 - sub hd, 4 - jg .v_w16_loop - RET -.v_w32: - mova m18, [bilin_v_perm64] - movu ym0, [srcq+strideq*0] - movu ym1, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym2, [srcq+strideq*0] - movu ym3, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym4, [srcq+strideq*0] - movu ym5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym6, [srcq+strideq*0] - vpermq m0, m18, m0 - vpermq m1, m18, m1 - vpermq m2, m18, m2 - vpermq m3, m18, m3 - vpermq m4, m18, m4 - vpermq m5, m18, m5 - vpermq m6, m18, m6 - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - punpcklbw m3, m4 - punpcklbw m4, m5 - punpcklbw m5, m6 -.v_w32_loop: - movu ym12, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym13, [srcq+strideq*0] - pmaddubsw m14, m0, m8 - pmaddubsw m16, m2, m9 - pmaddubsw m15, m1, m8 - pmaddubsw m17, m3, m9 - mova m0, m2 - mova m1, m3 - vpermq m12, m18, m12 - vpermq m13, m18, m13 - paddw m14, m16 - paddw m15, m17 - pmaddubsw m16, m4, m10 - pmaddubsw m17, m5, m10 - punpcklbw m6, m12 - punpcklbw m12, m13 - mova m2, m4 - mova m3, m5 - paddw m14, m16 - paddw m15, m17 - pmaddubsw m16, m6, m11 - pmaddubsw m17, m12, m11 - mova m4, m6 - mova m5, m12 - paddw m14, m16 - paddw m15, m17 - pmulhrsw m14, m7 - pmulhrsw m15, m7 - mova 
m6, m13 - mova [tmpq+ 0], m14 - mova [tmpq+64], m15 - add tmpq, 64*2 - sub hd, 2 - jg .v_w32_loop - vzeroupper - RET -.v_w64: - mov wd, 64 - jmp .v_start -.v_w128: - mov wd, 128 -.v_start: - WIN64_SPILL_XMM 27 - mova m26, [bilin_v_perm64] - lea r6d, [hq+wq*2] - mov r5, srcq - mov r7, tmpq -.v_loop0: - vpermq m0, m26, [srcq+strideq*0] - vpermq m1, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m2, m26, [srcq+strideq*0] - vpermq m3, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m4, m26, [srcq+strideq*0] - vpermq m5, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m6, m26, [srcq+strideq*0] - punpckhbw m12, m0, m1 - punpcklbw m0, m1 - punpckhbw m13, m1, m2 - punpcklbw m1, m2 - punpckhbw m14, m2, m3 - punpcklbw m2, m3 - punpckhbw m15, m3, m4 - punpcklbw m3, m4 - punpckhbw m16, m4, m5 - punpcklbw m4, m5 - punpckhbw m17, m5, m6 - punpcklbw m5, m6 -.v_loop: - vpermq m18, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m19, m26, [srcq+strideq*0] - pmaddubsw m20, m0, m8 - pmaddubsw m21, m12, m8 - pmaddubsw m22, m1, m8 - pmaddubsw m23, m13, m8 - mova m0, m2 - mova m12, m14 - mova m1, m3 - mova m13, m15 - pmaddubsw m2, m9 - pmaddubsw m14, m9 - pmaddubsw m3, m9 - pmaddubsw m15, m9 - punpckhbw m24, m6, m18 - punpcklbw m6, m18 - paddw m20, m2 - paddw m21, m14 - paddw m22, m3 - paddw m23, m15 - mova m2, m4 - mova m14, m16 - mova m3, m5 - mova m15, m17 - pmaddubsw m4, m10 - pmaddubsw m16, m10 - pmaddubsw m5, m10 - pmaddubsw m17, m10 - punpckhbw m25, m18, m19 - punpcklbw m18, m19 - paddw m20, m4 - paddw m21, m16 - paddw m22, m5 - paddw m23, m17 - mova m4, m6 - mova m16, m24 - mova m5, m18 - mova m17, m25 - pmaddubsw m6, m11 - pmaddubsw m24, m11 - pmaddubsw m18, m11 - pmaddubsw m25, m11 - paddw m20, m6 - paddw m21, m24 - paddw m22, m18 - paddw m23, m25 - pmulhrsw m20, m7 - pmulhrsw m21, m7 - pmulhrsw m22, m7 - pmulhrsw m23, m7 - mova m6, m19 - mova [tmpq+wq*0+ 0], m20 - mova [tmpq+wq*0+64], m21 - mova [tmpq+wq*2+ 0], m22 - mova 
[tmpq+wq*2+64], m23 - lea tmpq, [tmpq+wq*4] - sub hd, 2 - jg .v_loop - add r5, 64 - add r7, 128 - movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 - sub r6d, 1<<8 - jg .v_loop0 + jg .h_loop0 RET .hv: - WIN64_SPILL_XMM 16 - cmp wd, 4 - je .hv_w4 - shr mxd, 16 - sub srcq, 3 - vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0] - vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4] - movzx mxd, myb - shr myd, 16 - cmp hd, 4 - cmove myd, mxd - tzcnt wd, wd + RESET_STACK_STATE vpbroadcastd m8, [pd_2] - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)] vpbroadcastd m9, [pd_32] - add wq, r7 - vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] - lea stride3q, [strideq*3] - sub srcq, stride3q - punpcklbw m0, m0 - psraw m0, 8 ; sign-extend - pshufd m12, m0, q0000 - pshufd m13, m0, q1111 - pshufd m14, m0, q2222 - pshufd m15, m0, q3333 - jmp wq -.hv_w4: + cmp wd, 4 + jg .hv_w8 movzx mxd, mxb dec srcq - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + vpbroadcastd m11, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd - vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] + vpbroadcastq m0, [base+subpel_filters+myq*8] lea stride3q, [strideq*3] sub srcq, stride3q mov r3d, 0x04 kmovb k1, r3d kshiftlb k2, k1, 2 kshiftlb k3, k1, 4 - vpbroadcastd m10, [pd_2] - vbroadcasti128 m16, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufA] punpcklbw m0, m0 psraw m0, 8 ; sign-extend - vpbroadcastd m11, [pd_32] pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 @@ -2910,263 +3897,265 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 vpbroadcastq m3{k2}, [srcq+strideq*0] vpbroadcastq m2{k3}, [srcq+strideq*1] vpbroadcastq m3{k3}, [srcq+strideq*2] - mova m17, [spel_hv_perm4a] - movu m18, [spel_hv_perm4b] - mova m0, m10 - mova m1, m10 - pshufb m2, m16 - pshufb m3, m16 - vpdpbusd m0, m2, m8 - vpdpbusd m1, m3, m8 + mova m6, [spel_hv_perm4a] + movu m7, [spel_hv_perm4b] + mova m0, m8 + 
mova m1, m8 + pshufb m2, m10 + pshufb m3, m10 + vpdpbusd m0, m2, m11 + vpdpbusd m1, m3, m11 packssdw m0, m1 ; _ 0 1 2 3 4 5 6 psraw m0, 2 - vpermb m1, m17, m0 ; 01 12 23 34 - vpermb m2, m18, m0 ; 23 34 45 56 + vpermb m1, m6, m0 ; 01 12 23 34 + vpermb m2, m7, m0 ; 23 34 45 56 .hv_w4_loop: movq xm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] movq xm4, [srcq+strideq*0] vpbroadcastq ym3{k1}, [srcq+strideq*1] vpbroadcastq ym4{k1}, [srcq+strideq*2] - mova ym5, ym10 - mova ym6, ym10 - pshufb ym3, ym16 - pshufb ym4, ym16 - vpdpbusd ym5, ym3, ym8 - vpdpbusd ym6, ym4, ym8 - mova m7, m11 - packssdw ym5, ym6 ; 7 8 9 a _ _ _ _ - psraw ym5, 2 - valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a - vpdpwssd m7, m1, m12 - vpdpwssd m7, m2, m13 - vpermb m1, m17, m0 ; 45 56 67 78 - vpermb m2, m18, m0 ; 67 78 89 9a - vpdpwssd m7, m1, m14 - vpdpwssd m7, m2, m15 - psrad m7, 6 - vpmovdw [tmpq], m7 + mova m5, m9 + pshufb ym3, ym10 + vpdpwssd m5, m1, m12 ; a0 b0 c0 d0 + mova ym1, ym8 + pshufb ym4, ym10 + vpdpbusd ym1, ym3, ym11 + mova ym3, ym8 + vpdpbusd ym3, ym4, ym11 + vpdpwssd m5, m2, m13 ; a1 b1 c1 d1 + packssdw ym1, ym3 ; 7 8 9 a + psraw ym1, 2 + vshufi32x4 m0, m1, q1032 ; _ 4 5 6 7 8 9 a + vpermb m1, m6, m0 ; 45 56 67 78 + vpermb m2, m7, m0 ; 67 78 89 9a + vpdpwssd m5, m1, m14 ; a2 b2 c2 d2 + vpdpwssd m5, m2, m15 ; a3 b3 c3 d3 + psrad m5, 6 + vpmovdw [tmpq], m5 add tmpq, 32 sub hd, 4 jg .hv_w4_loop - vzeroupper RET .hv_w8: - WIN64_SPILL_XMM 24 - vbroadcasti128 m16, [subpel_h_shufA] - vbroadcasti128 m17, [subpel_h_shufB] - vbroadcasti128 m18, [subpel_h_shufC] - vinserti128 ym0, [srcq+strideq*0], 1 - vinserti128 m0, [srcq+strideq*1], 2 - vinserti128 m0, [srcq+strideq*2], 3 - movu xm1, [srcq+stride3q ] + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [base+subpel_filters+mxq*8+0] + vpbroadcastd m11, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [base+subpel_filters+myq*8] + lea stride3q, [strideq*3] + sub srcq, stride3q + punpcklbw m0, 
m0 + psraw m0, 8 ; sign-extend + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + cmp wd, 8 + jg .hv_w16 + vbroadcasti32x4 m17, [srcq+stride3q ] + vinserti32x4 m16, m17, [srcq+strideq*0], 0 + vbroadcasti32x4 m19, [subpel_h_shufA] + vinserti32x4 m16, [srcq+strideq*1], 1 + vbroadcasti32x4 m21, [subpel_h_shufC] + vinserti32x4 m16, [srcq+strideq*2], 2 lea srcq, [srcq+strideq*4] - vinserti128 ym1, [srcq+strideq*0], 1 - vinserti128 m1, [srcq+strideq*1], 2 - vinserti128 m1, [srcq+strideq*2], 3 + vinserti128 ym17, [srcq+strideq*0], 1 + vbroadcasti32x4 m20, [subpel_h_shufB] + vinserti32x4 m17, [srcq+strideq*1], 2 + vinserti32x4 m17, [srcq+strideq*2], 3 + pshufb m3, m16, m19 ; 0 1 2 3 0123 mova m2, m8 - mova m4, m8 + pshufb m0, m16, m21 ; 0 1 2 3 89ab + vpdpbusd m2, m3, m10 mova m3, m8 - mova m5, m8 - pshufb m20, m0, m16 - pshufb m21, m0, m17 - pshufb m22, m0, m18 - pshufb m23, m1, m16 - pshufb m6, m1, m17 - pshufb m7, m1, m18 - vpdpbusd m2, m20, m10 - vpdpbusd m4, m21, m10 - vpdpbusd m2, m21, m11 - vpdpbusd m4, m22, m11 - vpdpbusd m3, m23, m10 - vpdpbusd m5, m6, m10 - vpdpbusd m3, m6, m11 - vpdpbusd m5, m7, m11 - packssdw m2, m4 - packssdw m3, m5 - psraw m2, 2 ; _ 0 1 2 - psraw m3, 2 ; 3 4 5 6 - valignq m0, m3, m2, 2 ; 0 1 2 3 - valignq m1, m3, m2, 4 ; 1 2 3 4 - valignq m2, m3, m2, 6 ; 2 3 4 5 - punpcklwd m4, m0, m1 ; 01a 12a 23a 34a - punpckhwd m5, m0, m1 ; 01b 12b 23b 34b - punpcklwd m6, m2, m3 ; 23a 34a 45a 56a - punpckhwd m7, m2, m3 ; 23b 34b 45b 56b + pshufb m1, m17, m19 ; 3 4 5 6 0123 + vpdpbusd m3, m0, m11 + mova m0, m8 + pshufb m4, m17, m21 ; 3 4 5 6 89ab + vpdpbusd m0, m1, m10 + mova m1, m8 + pshufb m16, m20 ; 0 1 2 3 4567 + vpdpbusd m1, m4, m11 + pshufb m17, m20 ; 3 4 5 6 4567 + vpdpbusd m2, m16, m11 + vpdpbusd m3, m16, m10 + vpdpbusd m0, m17, m11 + vpdpbusd m1, m17, m10 + packssdw m2, m3 + packssdw m0, m1 + psraw m2, 2 ; 0 1 2 3 + psraw m0, 2 ; 3 4 5 6 + vshufi32x4 m4, m2, m0, q2132 ; 2 3 4 5 + vshufi32x4 m5, m2, m0, 
q1021 ; 1 2 3 4 + punpcklwd m3, m4, m0 ; 23 34 45 56 + punpckhwd m4, m0 + punpcklwd m1, m2, m5 ; 01 12 23 34 + punpckhwd m2, m5 .hv_w8_loop: - movu xm19, [srcq+stride3q ] + movu xm18, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - vinserti128 ym19, [srcq+strideq*0], 1 - vinserti128 m19, [srcq+strideq*1], 2 - vinserti128 m19, [srcq+strideq*2], 3 - mova m20, m9 - mova m21, m9 - mova m22, m8 - mova m23, m8 - vpdpwssd m20, m4, m12 - vpdpwssd m21, m5, m12 - vpdpwssd m20, m6, m13 - vpdpwssd m21, m7, m13 - pshufb m0, m19, m16 - pshufb m1, m19, m17 - pshufb m2, m19, m18 - vpdpbusd m22, m0, m10 - vpdpbusd m23, m1, m10 - vpdpbusd m22, m1, m11 - vpdpbusd m23, m2, m11 - packssdw m22, m23 - psraw m22, 2 ; 7 8 9 A - valignq m0, m22, m3, 2 ; 4 5 6 7 - valignq m1, m22, m3, 4 ; 5 6 7 8 - valignq m2, m22, m3, 6 ; 6 7 8 9 - mova m3, m22 - punpcklwd m4, m0, m1 ; 45a 56a 67a 78a - punpckhwd m5, m0, m1 ; 45b 56b 67b 78b - punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa - punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab - vpdpwssd m20, m4, m14 - vpdpwssd m21, m5, m14 - vpdpwssd m20, m6, m15 - vpdpwssd m21, m7, m15 - psrad m20, 6 - psrad m21, 6 - packssdw m20, m21 - mova [tmpq], m20 + vinserti128 ym18, [srcq+strideq*0], 1 + vinserti32x4 m18, [srcq+strideq*1], 2 + vinserti32x4 m18, [srcq+strideq*2], 3 + pshufb m17, m18, m19 ; 7 8 9 a 0123 + mova m16, m8 + pshufb m5, m18, m21 ; 7 8 9 a 89ab + vpdpbusd m16, m17, m10 + mova m17, m8 + pshufb m18, m20 ; 7 8 9 a 4567 + vpdpbusd m17, m5, m11 + mova m5, m9 + vpdpwssd m5, m3, m13 ; a1 b1 c1 d1 + mova m6, m9 + vpdpwssd m6, m4, m13 + vpdpbusd m16, m18, m11 + vpdpbusd m17, m18, m10 + vpdpwssd m5, m1, m12 ; a0 b0 c0 d0 + mova m1, m3 + vpdpwssd m6, m2, m12 + mova m2, m4 + packssdw m16, m17 + psraw m16, 2 ; 7 8 9 a + valignq m4, m16, m0, 6 ; 6 7 8 9 + mova m0, m16 + punpcklwd m3, m4, m16 ; 67 78 89 9a + punpckhwd m4, m16 + vpdpwssd m5, m3, m15 ; a3 b3 c3 d3 + vpdpwssd m6, m4, m15 + vshufi32x4 m1, m3, q1032 ; 45 56 67 78 + vshufi32x4 m2, m4, q1032 + vpdpwssd m5, m1, m14 ; a2 
b2 c2 d2 + vpdpwssd m6, m2, m14 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + mova [tmpq], m5 add tmpq, 64 sub hd, 4 jg .hv_w8_loop + vzeroupper RET .hv_w16: - mov wd, 16*2 - jmp .hv_start -.hv_w32: - mov wd, 32*2 - jmp .hv_start -.hv_w64: - mov wd, 64*2 - jmp .hv_start -.hv_w128: - mov wd, 128*2 -.hv_start: - WIN64_SPILL_XMM 31 - mova m16, [spel_h_perm16a] - mova m17, [spel_h_perm16b] - mova m18, [spel_h_perm16c] + WIN64_SPILL_XMM 23 + mova m16, [spel_h_perm16] + vpbroadcastd m18, [pb_4] + add wd, wd + paddb m17, m18, m16 lea r6d, [hq+wq*8-256] - mov r5, srcq + paddb m18, m17 +.hv_w16_loop0: + movu ym19, [srcq+strideq*0] + vinserti32x8 m19, [srcq+strideq*1], 1 + lea r5, [srcq+strideq*2] + movu ym20, [r5 +strideq*0] + vinserti32x8 m20, [r5 +strideq*1], 1 + lea r5, [r5 +strideq*2] + movu ym21, [r5 +strideq*0] + vinserti32x8 m21, [r5 +strideq*1], 1 + lea r5, [r5 +strideq*2] + movu ym22, [r5 +strideq*0] mov r7, tmpq -.hv_loop0: - movu ym0, [srcq+strideq*0] - vinserti32x8 m0, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym1, [srcq+strideq*0] - vinserti32x8 m1, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym2, [srcq+strideq*0] - vinserti32x8 m2, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym3, [srcq+strideq*0] + vpermb m3, m16, m19 ; 0 1 0123 89ab + mova m2, m8 + vpermb m4, m18, m19 ; 0 1 89ab ghij + vpdpbusd m2, m3, m10 + mova m3, m8 + vpermb m5, m16, m20 ; 2 3 0123 89ab + vpdpbusd m3, m4, m11 mova m4, m8 + vpermb m6, m18, m20 ; 2 3 89ab ghij + vpdpbusd m4, m5, m10 mova m5, m8 + vpermb m7, m16, m21 ; 4 5 0123 89ab + vpdpbusd m5, m6, m11 mova m6, m8 + vpermb m0, m18, m21 ; 4 5 89ab ghij + vpdpbusd m6, m7, m10 mova m7, m8 - vpermb m19, m16, m0 - vpermb m20, m17, m0 - vpermb m21, m18, m0 - vpermb m22, m16, m1 - vpermb m23, m17, m1 - vpermb m24, m18, m1 - vpermb m25, m16, m2 - vpermb m26, m17, m2 - vpermb m27, m18, m2 - vpermb ym28, ym16, ym3 - vpermb ym29, ym17, ym3 - vpermb ym30, ym18, ym3 - mova m0, m8 - mova m1, m8 - mova ym2, ym8 - 
mova ym3, ym8 - vpdpbusd m4, m19, m10 - vpdpbusd m5, m20, m10 - vpdpbusd m6, m22, m10 - vpdpbusd m7, m23, m10 - vpdpbusd m0, m25, m10 - vpdpbusd m1, m26, m10 - vpdpbusd ym2, ym28, ym10 - vpdpbusd ym3, ym29, ym10 + vpermb ym1, ym16, ym22 ; 6 0123 89ab + vpdpbusd m7, m0, m11 + mova ym0, ym8 + vpermb m19, m17, m19 ; 0 1 4567 cdef + vpdpbusd ym0, ym1, ym10 + vpermb ym1, ym18, ym22 ; 6 89ab ghij + vpdpbusd m2, m19, m11 + vpdpbusd m3, m19, m10 + mova ym19, ym8 + vpermb m20, m17, m20 ; 2 3 4567 cdef + vpdpbusd ym19, ym1, ym11 + vpermb m21, m17, m21 ; 4 5 4567 cdef vpdpbusd m4, m20, m11 - vpdpbusd m5, m21, m11 - vpdpbusd m6, m23, m11 - vpdpbusd m7, m24, m11 - vpdpbusd m0, m26, m11 - vpdpbusd m1, m27, m11 - vpdpbusd ym2, ym29, ym11 - vpdpbusd ym3, ym30, ym11 - packssdw m4, m5 - packssdw m6, m7 - packssdw m0, m1 - packssdw ym2, ym3 - psraw m4, 2 ; 0a 0b 1a 1b - psraw m6, 2 ; 2a 2b 3a 3b - psraw m0, 2 ; 4a 4b 5a 5b - psraw ym2, 2 ; 6a 6b __ __ - vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b - vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b - vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b - punpcklwd m2, m4, m5 ; 01a 01c 12a 12c - punpckhwd m3, m4, m5 ; 01b 01d 12b 12d - punpcklwd m4, m6, m7 ; 23a 23c 34a 34c - punpckhwd m5, m6, m7 ; 23b 23d 34b 34d - punpcklwd m6, m0, m1 ; 45a 45c 56a 56c - punpckhwd m7, m0, m1 ; 45b 45d 56b 56d -.hv_loop: - movu ym19, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vinserti32x8 m19, [srcq+strideq*0], 1 + vpdpbusd m5, m20, m10 + vpermb ym22, ym17, ym22 ; 6 4567 cdef + vpdpbusd m6, m21, m11 + vpdpbusd m7, m21, m10 + packssdw m2, m3 ; 0 1 + vpdpbusd ym0, ym22, ym11 + packssdw m4, m5 ; 2 3 + vpdpbusd ym19, ym22, ym10 + packssdw m6, m7 ; 4 5 + packssdw ym0, ym19 ; 6 + REPX {psraw x, 2}, m2, m4, m6, ym0 + vshufi32x4 m3, m2, m4, q1032 ; 1 2 + vshufi32x4 m5, m4, m6, q1032 ; 3 4 + vshufi32x4 m0, m6, m0, q1032 ; 5 6 + punpcklwd m1, m2, m3 ; 01 12 + punpckhwd m2, m3 + punpcklwd m3, m4, m5 ; 23 34 + punpckhwd m4, m5 + punpcklwd m5, m6, m0 ; 45 56 + punpckhwd m6, m0 
+.hv_w16_loop: + movu ym19, [r5+strideq*1] + lea r5, [r5+strideq*2] + vinserti32x8 m19, [r5+strideq*0], 1 mova m20, m9 + vpdpwssd m20, m1, m12 ; a0 + vpermb m1, m16, m19 mova m21, m9 + vpdpwssd m21, m2, m12 ; b0 + vpermb m2, m17, m19 mova m22, m8 - mova m23, m8 - vpdpwssd m20, m2, m12 - vpdpwssd m21, m3, m12 - vpdpwssd m20, m4, m13 - vpdpwssd m21, m5, m13 - vpermb m24, m16, m19 - vpermb m25, m17, m19 - vpermb m26, m18, m19 - vpdpbusd m22, m24, m10 - vpdpbusd m23, m25, m10 - vpdpbusd m22, m25, m11 - vpdpbusd m23, m26, m11 - packssdw m22, m23 - psraw m22, 2 ; 7a 7b 8a 8b - vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b + vpdpbusd m22, m1, m10 + mova m1, m8 + vpermb m19, m18, m19 + vpdpbusd m1, m2, m10 + vpdpwssd m20, m3, m13 ; a1 + vpdpwssd m21, m4, m13 ; b1 + vpdpbusd m22, m2, m11 mova m2, m4 - mova m3, m5 - mova m1, m22 + vpdpbusd m1, m19, m11 mova m4, m6 - mova m5, m7 - punpcklwd m6, m0, m1 ; 67a 67c 78a 78c - punpckhwd m7, m0, m1 ; 67b 67d 78b 78d - vpdpwssd m20, m4, m14 - vpdpwssd m21, m5, m14 - vpdpwssd m20, m6, m15 - vpdpwssd m21, m7, m15 + vpdpwssd m20, m5, m14 ; a2 + vpdpwssd m21, m6, m14 ; b2 + packssdw m22, m1 + mova m1, m3 + psraw m22, 2 ; 7 8 + mova m3, m5 + vshufi32x4 m6, m0, m22, q1032 ; 6 7 + mova m0, m22 + punpcklwd m5, m6, m0 ; 67 78 + punpckhwd m6, m0 + vpdpwssd m20, m5, m15 ; a3 + vpdpwssd m21, m6, m15 ; b3 psrad m20, 6 psrad m21, 6 packssdw m20, m21 - mova [tmpq+wq*0], ym20 - vextracti32x8 [tmpq+wq*1], m20, 1 - lea tmpq, [tmpq+wq*2] + mova [r7+wq*0], ym20 + vextracti32x8 [r7+wq*1], m20, 1 + lea r7, [r7+wq*2] sub hd, 2 - jg .hv_loop - add r5, 16 - add r7, 32 + jg .hv_w16_loop + add srcq, 16 + add tmpq, 32 movzx hd, r6b - mov srcq, r5 - mov tmpq, r7 sub r6d, 1<<8 - jg .hv_loop0 + jg .hv_w16_loop0 RET cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts diff --git a/third_party/dav1d/tests/meson.build b/third_party/dav1d/tests/meson.build index 11db0a56e9..38a591b5b4 100644 --- a/third_party/dav1d/tests/meson.build +++ b/third_party/dav1d/tests/meson.build 
@@ -100,7 +100,7 @@ if is_asm_enabled ], ) - test('checkasm', checkasm, suite: 'checkasm', timeout: 180, is_parallel: false) + test('checkasm', checkasm, suite: 'checkasm', timeout: 180) benchmark('checkasm', checkasm, suite: 'checkasm', timeout: 3600, args: '--bench') endif diff --git a/third_party/gemmology/gemmology.h b/third_party/gemmology/gemmology.h index eb5ebed3b4..21004d2709 100644 --- a/third_party/gemmology/gemmology.h +++ b/third_party/gemmology/gemmology.h @@ -786,7 +786,6 @@ public: using batch8 = xsimd::batch; using batch16 = xsimd::batch; using batch32 = xsimd::batch; - using ubatch32 = xsimd::batch; // Put higher rows in the second half of the register. These will jumble // around in the same way then conveniently land in the right place. @@ -814,7 +813,7 @@ public: return xsimd::bitwise_cast( xsimd::swizzle(xsimd::bitwise_cast(packed), - xsimd::make_batch_constant>())); + xsimd::make_batch_constant>())); } else if constexpr (batchf32::size == 8) return Tile(quant_mult, input, input + 2 * cols, input + 16 * cols, input + 18 * cols); @@ -833,7 +832,6 @@ public: using batch8 = xsimd::batch; using batch16 = xsimd::batch; using batch32 = xsimd::batch; - using ubatch32 = xsimd::batch; const batch8 neg127(-127); // Grab 4 registers at a time in 32-bit format. @@ -857,7 +855,7 @@ public: // and the values are only used for GEMM. return xsimd::bitwise_cast( xsimd::swizzle(xsimd::bitwise_cast(packed), - xsimd::make_batch_constant>())); + xsimd::make_batch_constant>())); } private: @@ -869,7 +867,6 @@ private: using batch8 = xsimd::batch; using batch16 = xsimd::batch; using batch32 = xsimd::batch; - using ubatch32 = xsimd::batch; const batch8 neg127 = -127; const batch8 pos127 = +127; @@ -894,7 +891,7 @@ private: // and the values are only used for GEMM. 
return xsimd::bitwise_cast( xsimd::swizzle(xsimd::bitwise_cast(packed), - xsimd::make_batch_constant>())); + xsimd::make_batch_constant>())); } }; diff --git a/third_party/gemmology/moz.yaml b/third_party/gemmology/moz.yaml index 749227e2ee..bb99db1a39 100644 --- a/third_party/gemmology/moz.yaml +++ b/third_party/gemmology/moz.yaml @@ -10,8 +10,8 @@ origin: url: https://github.com/mozilla/gemmology - release: dbcd029c3bc6e183355ea597216d379677ff9b19 (2024-02-20T12:36:14Z). - revision: dbcd029c3bc6e183355ea597216d379677ff9b19 + release: a37d922a88ded67a0b741f16390821000fde99ee (2024-04-17T11:18:45Z). + revision: a37d922a88ded67a0b741f16390821000fde99ee license: MIT diff --git a/third_party/jpeg-xl/AUTHORS b/third_party/jpeg-xl/AUTHORS index ed6d72db66..3f3675858d 100644 --- a/third_party/jpeg-xl/AUTHORS +++ b/third_party/jpeg-xl/AUTHORS @@ -39,6 +39,7 @@ Alistair Barrow Andrius Lukas Narbutas Aous Naman Artem Selishchev +Aryan Pingle Biswapriyo Nath CanadianBaconBoi Damiano Albani @@ -53,6 +54,7 @@ Dong Xu estrogently <41487185+estrogently@users.noreply.github.com> Even Rouault Fred Brennan +Gerhard Huber gi-man Gilles Devillers (GilDev) Heiko Becker diff --git a/third_party/jpeg-xl/CMakeLists.txt b/third_party/jpeg-xl/CMakeLists.txt index 9b74537f1c..ea8ccc43ba 100644 --- a/third_party/jpeg-xl/CMakeLists.txt +++ b/third_party/jpeg-xl/CMakeLists.txt @@ -160,7 +160,7 @@ set(JPEGXL_ENABLE_AVX512_SPR false CACHE BOOL "Build with AVX-512FP16 support (faster on CPUs that support it, but larger binary size).") set(JPEGXL_ENABLE_AVX512_ZEN4 false CACHE BOOL "Build with Zen4-optimized AVX512 support (faster on CPUs that support it, but larger binary size).") -set(JPEGXL_ENABLE_WASM_TRHEADS true CACHE BOOL +set(JPEGXL_ENABLE_WASM_THREADS true CACHE BOOL "Builds WASM modules with threads support") # Force system dependencies. 
@@ -211,10 +211,12 @@ if(JPEGXL_STATIC) # Clang developers say that in case to use "static" we have to build stdlib # ourselves; for real use case we don't care about stdlib, as it is "granted", # so just linking all other libraries is fine. - if (NOT MSVC AND NOT APPLE) + if (NOT MSVC) + string(APPEND CMAKE_EXE_LINKER_FLAGS " -static") + endif() + if ((NOT WIN32 AND NOT APPLE) OR CYGWIN OR MINGW) set(CMAKE_FIND_LIBRARY_SUFFIXES .a) - set(CMAKE_EXE_LINKER_FLAGS - "${CMAKE_EXE_LINKER_FLAGS} -static -static-libgcc -static-libstdc++") + string(APPEND CMAKE_EXE_LINKER_FLAGS " -static-libgcc -static-libstdc++") endif() endif() # JPEGXL_STATIC @@ -265,7 +267,7 @@ if(JPEGXL_STATIC) endif() endif() # JPEGXL_STATIC -if (EMSCRIPTEN AND JPEGXL_ENABLE_WASM_TRHEADS) +if (EMSCRIPTEN AND JPEGXL_ENABLE_WASM_THREADS) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pthread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread") diff --git a/third_party/jpeg-xl/examples/decode_progressive.cc b/third_party/jpeg-xl/examples/decode_progressive.cc index 2cdc175e8a..7a3a9aa33b 100644 --- a/third_party/jpeg-xl/examples/decode_progressive.cc +++ b/third_party/jpeg-xl/examples/decode_progressive.cc @@ -10,16 +10,16 @@ #define __STDC_FORMAT_MACROS #endif -#include #include #include #include #include #include -#include -#include -#include +#include // PRIu64 +#include +#include +#include #include bool WritePAM(const char* filename, const uint8_t* buffer, size_t w, size_t h) { diff --git a/third_party/jpeg-xl/lib/extras/dec/apng.cc b/third_party/jpeg-xl/lib/extras/dec/apng.cc index c607a71d08..824a6f47ee 100644 --- a/third_party/jpeg-xl/lib/extras/dec/apng.cc +++ b/third_party/jpeg-xl/lib/extras/dec/apng.cc @@ -38,8 +38,10 @@ #include #include -#include +#include +#include +#include #include #include #include @@ -49,8 +51,7 @@ #include "lib/jxl/base/common.h" #include "lib/jxl/base/compiler_specific.h" #include "lib/jxl/base/printf_macros.h" 
-#include "lib/jxl/base/scope_guard.h" -#include "lib/jxl/sanitizers.h" +#include "lib/jxl/base/span.h" #if JPEGXL_ENABLE_APNG #include "png.h" /* original (unpatched) libpng is ok */ #endif @@ -58,39 +59,49 @@ namespace jxl { namespace extras { -#if JPEGXL_ENABLE_APNG +#if !JPEGXL_ENABLE_APNG + +bool CanDecodeAPNG() { return false; } +Status DecodeImageAPNG(const Span bytes, + const ColorHints& color_hints, PackedPixelFile* ppf, + const SizeConstraints* constraints) { + return false; +} + +#else // JPEGXL_ENABLE_APNG + namespace { -constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69, - 0x66, 0x00, 0x00}; +constexpr uint8_t kExifSignature[6] = {0x45, 0x78, 0x69, 0x66, 0x00, 0x00}; /* hIST chunk tail is not proccesed properly; skip this chunk completely; see https://github.com/glennrp/libpng/pull/413 */ -const png_byte kIgnoredPngChunks[] = { - 104, 73, 83, 84, '\0' /* hIST */ -}; +const uint8_t kIgnoredPngChunks[] = {'h', 'I', 'S', 'T', '\0'}; // Returns floating-point value from the PNG encoding (times 10^5). double F64FromU32(const uint32_t x) { return static_cast(x) * 1E-5; } -Status DecodeSRGB(const unsigned char* payload, const size_t payload_size, - JxlColorEncoding* color_encoding) { - if (payload_size != 1) return JXL_FAILURE("Wrong sRGB size"); +/** Extract information from 'sRGB' chunk. */ +Status DecodeSrgbChunk(const Bytes payload, JxlColorEncoding* color_encoding) { + if (payload.size() != 1) return JXL_FAILURE("Wrong sRGB size"); + uint8_t ri = payload[0]; // (PNG uses the same values as ICC.) 
- if (payload[0] >= 4) return JXL_FAILURE("Invalid Rendering Intent"); + if (ri >= 4) return JXL_FAILURE("Invalid Rendering Intent"); color_encoding->white_point = JXL_WHITE_POINT_D65; color_encoding->primaries = JXL_PRIMARIES_SRGB; color_encoding->transfer_function = JXL_TRANSFER_FUNCTION_SRGB; - color_encoding->rendering_intent = - static_cast(payload[0]); + color_encoding->rendering_intent = static_cast(ri); return true; } -// If the cICP profile is not fully supported, return false and leave -// color_encoding unmodified. -Status DecodeCICP(const unsigned char* payload, const size_t payload_size, - JxlColorEncoding* color_encoding) { - if (payload_size != 4) return JXL_FAILURE("Wrong cICP size"); +/** + * Extract information from 'cICP' chunk. + * + * If the cICP profile is not fully supported, return `false` and leave + * `color_encoding` unmodified. + */ +Status DecodeCicpChunk(const Bytes payload, JxlColorEncoding* color_encoding) { + if (payload.size() != 4) return JXL_FAILURE("Wrong cICP size"); JxlColorEncoding color_enc = *color_encoding; // From https://www.itu.int/rec/T-REC-H.273-202107-I/en @@ -217,257 +228,279 @@ Status DecodeCICP(const unsigned char* payload, const size_t payload_size, return true; } -Status DecodeGAMA(const unsigned char* payload, const size_t payload_size, - JxlColorEncoding* color_encoding) { - if (payload_size != 4) return JXL_FAILURE("Wrong gAMA size"); +/** Extract information from 'gAMA' chunk. 
*/ +Status DecodeGamaChunk(Bytes payload, JxlColorEncoding* color_encoding) { + if (payload.size() != 4) return JXL_FAILURE("Wrong gAMA size"); color_encoding->transfer_function = JXL_TRANSFER_FUNCTION_GAMMA; - color_encoding->gamma = F64FromU32(LoadBE32(payload)); + color_encoding->gamma = F64FromU32(LoadBE32(payload.data())); return true; } -Status DecodeCHRM(const unsigned char* payload, const size_t payload_size, - JxlColorEncoding* color_encoding) { - if (payload_size != 32) return JXL_FAILURE("Wrong cHRM size"); - +/** Extract information from 'cHTM' chunk. */ +Status DecodeChrmChunk(Bytes payload, JxlColorEncoding* color_encoding) { + if (payload.size() != 32) return JXL_FAILURE("Wrong cHRM size"); + const uint8_t* data = payload.data(); color_encoding->white_point = JXL_WHITE_POINT_CUSTOM; - color_encoding->white_point_xy[0] = F64FromU32(LoadBE32(payload + 0)); - color_encoding->white_point_xy[1] = F64FromU32(LoadBE32(payload + 4)); + color_encoding->white_point_xy[0] = F64FromU32(LoadBE32(data + 0)); + color_encoding->white_point_xy[1] = F64FromU32(LoadBE32(data + 4)); color_encoding->primaries = JXL_PRIMARIES_CUSTOM; - color_encoding->primaries_red_xy[0] = F64FromU32(LoadBE32(payload + 8)); - color_encoding->primaries_red_xy[1] = F64FromU32(LoadBE32(payload + 12)); - color_encoding->primaries_green_xy[0] = F64FromU32(LoadBE32(payload + 16)); - color_encoding->primaries_green_xy[1] = F64FromU32(LoadBE32(payload + 20)); - color_encoding->primaries_blue_xy[0] = F64FromU32(LoadBE32(payload + 24)); - color_encoding->primaries_blue_xy[1] = F64FromU32(LoadBE32(payload + 28)); + color_encoding->primaries_red_xy[0] = F64FromU32(LoadBE32(data + 8)); + color_encoding->primaries_red_xy[1] = F64FromU32(LoadBE32(data + 12)); + color_encoding->primaries_green_xy[0] = F64FromU32(LoadBE32(data + 16)); + color_encoding->primaries_green_xy[1] = F64FromU32(LoadBE32(data + 20)); + color_encoding->primaries_blue_xy[0] = F64FromU32(LoadBE32(data + 24)); + 
color_encoding->primaries_blue_xy[1] = F64FromU32(LoadBE32(data + 28)); return true; } -// Retrieves XMP and EXIF/IPTC from itext and text. -class BlobsReaderPNG { - public: - static Status Decode(const png_text_struct& info, PackedMetadata* metadata) { - // We trust these are properly null-terminated by libpng. - const char* key = info.key; - const char* value = info.text; - if (strstr(key, "XML:com.adobe.xmp")) { - metadata->xmp.resize(strlen(value)); // safe, see above - memcpy(metadata->xmp.data(), value, metadata->xmp.size()); - } - - std::string type; - std::vector bytes; - - // Handle text chunks annotated with key "Raw profile type ####", with - // #### a type, which may contain metadata. - const char* kKey = "Raw profile type "; - if (strncmp(key, kKey, strlen(kKey)) != 0) return false; +/** Returns false if invalid. */ +JXL_INLINE Status DecodeHexNibble(const char c, uint32_t* JXL_RESTRICT nibble) { + if ('a' <= c && c <= 'f') { + *nibble = 10 + c - 'a'; + } else if ('0' <= c && c <= '9') { + *nibble = c - '0'; + } else { + *nibble = 0; + return JXL_FAILURE("Invalid metadata nibble"); + } + JXL_ASSERT(*nibble < 16); + return true; +} - if (!MaybeDecodeBase16(key, value, &type, &bytes)) { - JXL_WARNING("Couldn't parse 'Raw format type' text chunk"); - return false; - } - if (type == "exif") { - // Remove "Exif\0\0" prefix if present - if (bytes.size() >= sizeof kExifSignature && - memcmp(bytes.data(), kExifSignature, sizeof kExifSignature) == 0) { - bytes.erase(bytes.begin(), bytes.begin() + sizeof kExifSignature); - } - if (!metadata->exif.empty()) { - JXL_WARNING("overwriting EXIF (%" PRIuS " bytes) with base16 (%" PRIuS - " bytes)", - metadata->exif.size(), bytes.size()); - } - metadata->exif = std::move(bytes); - } else if (type == "iptc") { - // TODO(jon): Deal with IPTC in some way - } else if (type == "8bim") { - // TODO(jon): Deal with 8bim in some way - } else if (type == "xmp") { - if (!metadata->xmp.empty()) { - JXL_WARNING("overwriting XMP (%" 
PRIuS " bytes) with base16 (%" PRIuS - " bytes)", - metadata->xmp.size(), bytes.size()); +/** Returns false if invalid. */ +JXL_INLINE Status DecodeDecimal(const char** pos, const char* end, + uint32_t* JXL_RESTRICT value) { + size_t len = 0; + *value = 0; + while (*pos < end) { + char next = **pos; + if (next >= '0' && next <= '9') { + *value = (*value * 10) + static_cast(next - '0'); + len++; + if (len > 8) { + break; } - metadata->xmp = std::move(bytes); } else { - JXL_WARNING("Unknown type in 'Raw format type' text chunk: %s: %" PRIuS - " bytes", - type.c_str(), bytes.size()); + // Do not consume terminator (non-decimal digit). + break; } - return true; + (*pos)++; + } + if (len == 0 || len > 8) { + return JXL_FAILURE("Failed to parse decimal"); } + return true; +} - private: - // Returns false if invalid. - static JXL_INLINE Status DecodeNibble(const char c, - uint32_t* JXL_RESTRICT nibble) { - if ('a' <= c && c <= 'f') { - *nibble = 10 + c - 'a'; - } else if ('0' <= c && c <= '9') { - *nibble = c - '0'; - } else { - *nibble = 0; - return JXL_FAILURE("Invalid metadata nibble"); +/** + * Parses a PNG text chunk with key of the form "Raw profile type ####", with + * #### a type. + * + * Returns whether it could successfully parse the content. + * We trust key and encoded are null-terminated because they come from + * libpng. + */ +Status MaybeDecodeBase16(const char* key, const char* encoded, + std::string* type, std::vector* bytes) { + const char* encoded_end = encoded + strlen(encoded); + + const char* kKey = "Raw profile type "; + if (strncmp(key, kKey, strlen(kKey)) != 0) return false; + *type = key + strlen(kKey); + const size_t kMaxTypeLen = 20; + if (type->length() > kMaxTypeLen) return false; // Type too long + + // Header: freeform string and number of bytes + // Expected format is: + // \n + // profile name/description\n + // 40\n (the number of bytes after hex-decoding) + // 01234566789abcdef....\n (72 bytes per line max). 
+ // 012345667\n (last line) + const char* pos = encoded; + + if (*(pos++) != '\n') return false; + while (pos < encoded_end && *pos != '\n') { + pos++; + } + if (pos == encoded_end) return false; + // We parsed so far a \n, some number of non \n characters and are now + // pointing at a \n. + if (*(pos++) != '\n') return false; + // Skip leading spaces + while (pos < encoded_end && *pos == ' ') { + pos++; + } + uint32_t bytes_to_decode = 0; + JXL_RETURN_IF_ERROR(DecodeDecimal(&pos, encoded_end, &bytes_to_decode)); + + // We need 2*bytes for the hex values plus 1 byte every 36 values, + // plus terminal \n for length. + size_t tail = static_cast(encoded_end - pos); + bool ok = ((tail / 2) >= bytes_to_decode); + if (ok) tail -= 2 * static_cast(bytes_to_decode); + ok = ok && (tail == 1 + DivCeil(bytes_to_decode, 36)); + if (!ok) { + return JXL_FAILURE("Not enough bytes to parse %d bytes in hex", + bytes_to_decode); + } + JXL_ASSERT(bytes->empty()); + bytes->reserve(bytes_to_decode); + + // Encoding: base16 with newline after 72 chars. + // pos points to the \n before the first line of hex values. + for (size_t i = 0; i < bytes_to_decode; ++i) { + if (i % 36 == 0) { + if (pos + 1 >= encoded_end) return false; // Truncated base16 1 + if (*pos != '\n') return false; // Expected newline + ++pos; } - JXL_ASSERT(*nibble < 16); - return true; + + if (pos + 2 >= encoded_end) return false; // Truncated base16 2; + uint32_t nibble0; + uint32_t nibble1; + JXL_RETURN_IF_ERROR(DecodeHexNibble(pos[0], &nibble0)); + JXL_RETURN_IF_ERROR(DecodeHexNibble(pos[1], &nibble1)); + bytes->push_back(static_cast((nibble0 << 4) + nibble1)); + pos += 2; } + if (pos + 1 != encoded_end) return false; // Too many encoded bytes + if (pos[0] != '\n') return false; // Incorrect metadata terminator + return true; +} - // Returns false if invalid. 
- static JXL_INLINE Status DecodeDecimal(const char** pos, const char* end, - uint32_t* JXL_RESTRICT value) { - size_t len = 0; - *value = 0; - while (*pos < end) { - char next = **pos; - if (next >= '0' && next <= '9') { - *value = (*value * 10) + static_cast(next - '0'); - len++; - if (len > 8) { - break; - } - } else { - // Do not consume terminator (non-decimal digit). - break; - } - (*pos)++; - } - if (len == 0 || len > 8) { - return JXL_FAILURE("Failed to parse decimal"); - } - return true; +/** Retrieves XMP and EXIF/IPTC from itext and text. */ +Status DecodeBlob(const png_text_struct& info, PackedMetadata* metadata) { + // We trust these are properly null-terminated by libpng. + const char* key = info.key; + const char* value = info.text; + if (strstr(key, "XML:com.adobe.xmp")) { + metadata->xmp.resize(strlen(value)); // safe, see above + memcpy(metadata->xmp.data(), value, metadata->xmp.size()); } - // Parses a PNG text chunk with key of the form "Raw profile type ####", with - // #### a type. - // Returns whether it could successfully parse the content. - // We trust key and encoded are null-terminated because they come from - // libpng. - static Status MaybeDecodeBase16(const char* key, const char* encoded, - std::string* type, - std::vector* bytes) { - const char* encoded_end = encoded + strlen(encoded); - - const char* kKey = "Raw profile type "; - if (strncmp(key, kKey, strlen(kKey)) != 0) return false; - *type = key + strlen(kKey); - const size_t kMaxTypeLen = 20; - if (type->length() > kMaxTypeLen) return false; // Type too long - - // Header: freeform string and number of bytes - // Expected format is: - // \n - // profile name/description\n - // 40\n (the number of bytes after hex-decoding) - // 01234566789abcdef....\n (72 bytes per line max). 
- // 012345667\n (last line) - const char* pos = encoded; - - if (*(pos++) != '\n') return false; - while (pos < encoded_end && *pos != '\n') { - pos++; - } - if (pos == encoded_end) return false; - // We parsed so far a \n, some number of non \n characters and are now - // pointing at a \n. - if (*(pos++) != '\n') return false; - // Skip leading spaces - while (pos < encoded_end && *pos == ' ') { - pos++; + std::string type; + std::vector bytes; + + // Handle text chunks annotated with key "Raw profile type ####", with + // #### a type, which may contain metadata. + const char* kKey = "Raw profile type "; + if (strncmp(key, kKey, strlen(kKey)) != 0) return false; + + if (!MaybeDecodeBase16(key, value, &type, &bytes)) { + JXL_WARNING("Couldn't parse 'Raw format type' text chunk"); + return false; + } + if (type == "exif") { + // Remove "Exif\0\0" prefix if present + if (bytes.size() >= sizeof kExifSignature && + memcmp(bytes.data(), kExifSignature, sizeof kExifSignature) == 0) { + bytes.erase(bytes.begin(), bytes.begin() + sizeof kExifSignature); } - uint32_t bytes_to_decode = 0; - JXL_RETURN_IF_ERROR(DecodeDecimal(&pos, encoded_end, &bytes_to_decode)); - - // We need 2*bytes for the hex values plus 1 byte every 36 values, - // plus terminal \n for length. - size_t tail = static_cast(encoded_end - pos); - bool ok = ((tail / 2) >= bytes_to_decode); - if (ok) tail -= 2 * static_cast(bytes_to_decode); - ok = ok && (tail == 1 + DivCeil(bytes_to_decode, 36)); - if (!ok) { - return JXL_FAILURE("Not enough bytes to parse %d bytes in hex", - bytes_to_decode); + if (!metadata->exif.empty()) { + JXL_WARNING("overwriting EXIF (%" PRIuS " bytes) with base16 (%" PRIuS + " bytes)", + metadata->exif.size(), bytes.size()); } - JXL_ASSERT(bytes->empty()); - bytes->reserve(bytes_to_decode); - - // Encoding: base16 with newline after 72 chars. - // pos points to the \n before the first line of hex values. 
- for (size_t i = 0; i < bytes_to_decode; ++i) { - if (i % 36 == 0) { - if (pos + 1 >= encoded_end) return false; // Truncated base16 1 - if (*pos != '\n') return false; // Expected newline - ++pos; - } - - if (pos + 2 >= encoded_end) return false; // Truncated base16 2; - uint32_t nibble0; - uint32_t nibble1; - JXL_RETURN_IF_ERROR(DecodeNibble(pos[0], &nibble0)); - JXL_RETURN_IF_ERROR(DecodeNibble(pos[1], &nibble1)); - bytes->push_back(static_cast((nibble0 << 4) + nibble1)); - pos += 2; + metadata->exif = std::move(bytes); + } else if (type == "iptc") { + // TODO(jon): Deal with IPTC in some way + } else if (type == "8bim") { + // TODO(jon): Deal with 8bim in some way + } else if (type == "xmp") { + if (!metadata->xmp.empty()) { + JXL_WARNING("overwriting XMP (%" PRIuS " bytes) with base16 (%" PRIuS + " bytes)", + metadata->xmp.size(), bytes.size()); } - if (pos + 1 != encoded_end) return false; // Too many encoded bytes - if (pos[0] != '\n') return false; // Incorrect metadata terminator - return true; + metadata->xmp = std::move(bytes); + } else { + JXL_WARNING("Unknown type in 'Raw format type' text chunk: %s: %" PRIuS + " bytes", + type.c_str(), bytes.size()); } -}; + return true; +} constexpr bool isAbc(char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } -constexpr uint32_t kId_IHDR = 0x52444849; -constexpr uint32_t kId_acTL = 0x4C546361; -constexpr uint32_t kId_fcTL = 0x4C546366; -constexpr uint32_t kId_IDAT = 0x54414449; -constexpr uint32_t kId_fdAT = 0x54416466; -constexpr uint32_t kId_IEND = 0x444E4549; -constexpr uint32_t kId_cICP = 0x50434963; -constexpr uint32_t kId_iCCP = 0x50434369; -constexpr uint32_t kId_sRGB = 0x42475273; -constexpr uint32_t kId_gAMA = 0x414D4167; -constexpr uint32_t kId_cHRM = 0x4D524863; -constexpr uint32_t kId_eXIf = 0x66495865; - -struct APNGFrame { - APNGFrame() : pixels(nullptr, free) {} - std::unique_ptr pixels; +/** Wrap 4-char tag name into ID. 
*/ +constexpr uint32_t MakeTag(uint8_t a, uint8_t b, uint8_t c, uint8_t d) { + return a | (b << 8) | (c << 16) | (d << 24); +} + +/** Reusable image data container. */ +struct Pixels { + // Use array instead of vector to avoid memory initialization. + std::unique_ptr pixels; size_t pixels_size = 0; std::vector rows; - unsigned int w, h, delay_num, delay_den; - Status Resize(size_t new_size) { + + Status Resize(size_t row_bytes, size_t num_rows) { + size_t new_size = row_bytes * num_rows; // it is assumed size is sane if (new_size > pixels_size) { - pixels.reset(malloc(new_size)); + pixels.reset(new uint8_t[new_size]); if (!pixels) { // TODO(szabadka): use specialized OOM error code return JXL_FAILURE("Failed to allocate memory for image buffer"); } pixels_size = new_size; } + rows.resize(num_rows); + for (size_t y = 0; y < num_rows; y++) { + rows[y] = pixels.get() + y * row_bytes; + } return true; } }; +/** + * Helper that chunks in-memory input. + */ struct Reader { - const uint8_t* next; - const uint8_t* last; - bool Read(void* data, size_t len) { - size_t cap = last - next; + explicit Reader(Span data) : data_(data) {} + + const Span data_; + size_t offset_ = 0; + + Bytes Peek(size_t len) const { + size_t cap = data_.size() - offset_; size_t to_copy = std::min(cap, len); - memcpy(data, next, to_copy); - next += to_copy; - return (len == to_copy); + return {data_.data() + offset_, to_copy}; } - bool Eof() const { return next == last; } -}; -const uint32_t cMaxPNGSize = 1000000UL; -const size_t kMaxPNGChunkSize = 1lu << 30; // 1 GB + Bytes Read(size_t len) { + Bytes result = Peek(len); + offset_ += result.size(); + return result; + } -void info_fn(png_structp png_ptr, png_infop info_ptr) { + /* Returns empty Span on errror. 
*/ + Bytes ReadChunk() { + Bytes len = Peek(4); + if (len.size() != 4) { + return Bytes(); + } + const auto size = png_get_uint_32(len.data()); + // NB: specification allows 2^31 - 1 + constexpr size_t kMaxPNGChunkSize = 1u << 30; // 1 GB + // Check first, to avoid overflow. + if (size > kMaxPNGChunkSize) { + JXL_WARNING("APNG chunk size is too big"); + return Bytes(); + } + size_t full_size = size + 12; // size does not include itself, tag and CRC. + Bytes result = Read(full_size); + return (result.size() == full_size) ? result : Bytes(); + } + + bool Eof() const { return offset_ == data_.size(); } +}; + +void ProgressiveRead_OnInfo(png_structp png_ptr, png_infop info_ptr) { png_set_expand(png_ptr); png_set_palette_to_rgb(png_ptr); png_set_tRNS_to_alpha(png_ptr); @@ -475,432 +508,437 @@ void info_fn(png_structp png_ptr, png_infop info_ptr) { png_read_update_info(png_ptr, info_ptr); } -void row_fn(png_structp png_ptr, png_bytep new_row, png_uint_32 row_num, - int pass) { - APNGFrame* frame = - reinterpret_cast(png_get_progressive_ptr(png_ptr)); +void ProgressiveRead_OnRow(png_structp png_ptr, png_bytep new_row, + png_uint_32 row_num, int pass) { + Pixels* frame = reinterpret_cast(png_get_progressive_ptr(png_ptr)); JXL_CHECK(frame); JXL_CHECK(row_num < frame->rows.size()); JXL_CHECK(frame->rows[row_num] < frame->rows[0] + frame->pixels_size); png_progressive_combine_row(png_ptr, frame->rows[row_num], new_row); } -inline unsigned int read_chunk(Reader* r, std::vector* pChunk) { - unsigned char len[4]; - if (r->Read(&len, 4)) { - const auto size = png_get_uint_32(len); - // Check first, to avoid overflow. - if (size > kMaxPNGChunkSize) { - JXL_WARNING("APNG chunk size is too big"); - return 0; - } - pChunk->resize(size + 12); - memcpy(pChunk->data(), len, 4); - if (r->Read(pChunk->data() + 4, pChunk->size() - 4)) { - return LoadLE32(pChunk->data() + 4); - } +// Holds intermediate state during parsing APNG file. 
+struct Context { + ~Context() { + // Make sure png memory is released in any case. + ResetPngDecoder(); } - return 0; -} - -int processing_start(png_structp& png_ptr, png_infop& info_ptr, void* frame_ptr, - bool hasInfo, std::vector& chunkIHDR, - std::vector>& chunksInfo) { - unsigned char header[8] = {137, 80, 78, 71, 13, 10, 26, 10}; - // Cleanup prior decoder, if any. - png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); - // Just in case. Not all versions on libpng wipe-out the pointers. - png_ptr = nullptr; - info_ptr = nullptr; + bool CreatePngDecoder() { + png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, + nullptr); + info_ptr = png_create_info_struct(png_ptr); + return (png_ptr != nullptr && info_ptr != nullptr); + } - png_ptr = - png_create_read_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr); - info_ptr = png_create_info_struct(png_ptr); - if (!png_ptr || !info_ptr) return 1; + /** + * Initialize PNG decoder. + * + * TODO(eustas): add details + */ + bool InitPngDecoder(bool hasInfo, std::vector& chunksInfo) { + ResetPngDecoder(); + + png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, + nullptr); + info_ptr = png_create_info_struct(png_ptr); + if (png_ptr == nullptr || info_ptr == nullptr) { + return false; + } - if (setjmp(png_jmpbuf(png_ptr))) { - return 1; - } + if (setjmp(png_jmpbuf(png_ptr))) { + return false; + } - png_set_keep_unknown_chunks(png_ptr, 1, kIgnoredPngChunks, - static_cast(sizeof(kIgnoredPngChunks) / 5)); + png_set_keep_unknown_chunks( + png_ptr, 1, kIgnoredPngChunks, + static_cast(sizeof(kIgnoredPngChunks) / 5)); - png_set_crc_action(png_ptr, PNG_CRC_QUIET_USE, PNG_CRC_QUIET_USE); - png_set_progressive_read_fn(png_ptr, frame_ptr, info_fn, row_fn, nullptr); + png_set_crc_action(png_ptr, PNG_CRC_QUIET_USE, PNG_CRC_QUIET_USE); + png_set_progressive_read_fn(png_ptr, static_cast(&frameRaw), + ProgressiveRead_OnInfo, ProgressiveRead_OnRow, + nullptr); - png_process_data(png_ptr, 
info_ptr, header, 8); - png_process_data(png_ptr, info_ptr, chunkIHDR.data(), chunkIHDR.size()); + std::array header = {137, 80, 78, 71, 13, 10, 26, 10}; + png_process_data(png_ptr, info_ptr, header.data(), header.size()); + png_process_data(png_ptr, info_ptr, ihdr.data(), ihdr.size()); - if (hasInfo) { - for (auto& chunk : chunksInfo) { - png_process_data(png_ptr, info_ptr, chunk.data(), chunk.size()); + if (hasInfo) { + for (auto& chunk : chunksInfo) { + png_process_data(png_ptr, info_ptr, const_cast(chunk.data()), + chunk.size()); + } } + return true; } - return 0; -} -int processing_data(png_structp png_ptr, png_infop info_ptr, unsigned char* p, - unsigned int size) { - if (!png_ptr || !info_ptr) return 1; + /** + * Pass chunk to PNG decoder. + */ + bool FeedChunks(const Bytes& chunk1, const Bytes& chunk2 = Bytes()) { + // TODO(eustas): turn to DCHECK + if (!png_ptr || !info_ptr) return false; + + if (setjmp(png_jmpbuf(png_ptr))) { + return false; + } - if (setjmp(png_jmpbuf(png_ptr))) { - return 1; + for (const auto& chunk : {chunk1, chunk2}) { + if (!chunk.empty()) { + png_process_data(png_ptr, info_ptr, const_cast(chunk.data()), + chunk.size()); + } + } + return true; } - png_process_data(png_ptr, info_ptr, p, size); - return 0; -} + bool FinalizeStream(PackedMetadata* metadata) { + // TODO(eustas): turn to DCHECK + if (!png_ptr || !info_ptr) return false; -int processing_finish(png_structp png_ptr, png_infop info_ptr, - PackedMetadata* metadata) { - unsigned char footer[12] = {0, 0, 0, 0, 73, 69, 78, 68, 174, 66, 96, 130}; + if (setjmp(png_jmpbuf(png_ptr))) { + return false; + } - if (!png_ptr || !info_ptr) return 1; + const std::array kFooter = {0, 0, 0, 0, 73, 69, + 78, 68, 174, 66, 96, 130}; + png_process_data(png_ptr, info_ptr, const_cast(kFooter.data()), + kFooter.size()); + // before destroying: check if we encountered any metadata chunks + png_textp text_ptr; + int num_text; + png_get_text(png_ptr, info_ptr, &text_ptr, &num_text); + for (int i = 0; i 
< num_text; i++) { + Status result = DecodeBlob(text_ptr[i], metadata); + // Ignore unknown / malformed blob. + (void)result; + } - if (setjmp(png_jmpbuf(png_ptr))) { - return 1; + return true; } - png_process_data(png_ptr, info_ptr, footer, 12); - // before destroying: check if we encountered any metadata chunks - png_textp text_ptr; - int num_text; - png_get_text(png_ptr, info_ptr, &text_ptr, &num_text); - for (int i = 0; i < num_text; i++) { - (void)BlobsReaderPNG::Decode(text_ptr[i], metadata); + void ResetPngDecoder() { + png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); + // Just in case. Not all versions on libpng wipe-out the pointers. + png_ptr = nullptr; + info_ptr = nullptr; } - return 0; -} + std::vector ihdr; // (modified) copy of file IHDR chunk + png_structp png_ptr = nullptr; + png_infop info_ptr = nullptr; + Pixels frameRaw = {}; +}; + +constexpr uint32_t kMaxPNGSize = 1000000UL; + +struct FrameInfo { + PackedImage data; + uint32_t duration; + size_t x0, xsize; + size_t y0, ysize; + uint32_t dispose_op; + uint32_t blend_op; +}; } // namespace -#endif -bool CanDecodeAPNG() { -#if JPEGXL_ENABLE_APNG - return true; -#else - return false; -#endif -} +bool CanDecodeAPNG() { return true; } Status DecodeImageAPNG(const Span bytes, const ColorHints& color_hints, PackedPixelFile* ppf, const SizeConstraints* constraints) { -#if JPEGXL_ENABLE_APNG - Reader r; - unsigned char sig[8]; - png_structp png_ptr = nullptr; - png_infop info_ptr = nullptr; - std::vector chunk; - std::vector chunkIHDR; - std::vector> chunksInfo; - bool isAnimated = false; - bool hasInfo = false; - bool seenFctl = false; - APNGFrame frameRaw = {}; - uint32_t num_channels; - JxlPixelFormat format = {}; - unsigned int bytes_per_pixel = 0; - - struct FrameInfo { - PackedImage data; - uint32_t duration; - size_t x0, xsize; - size_t y0, ysize; - uint32_t dispose_op; - uint32_t blend_op; - }; - - std::vector frames; + // Initialize output (default settings in case e.g. 
only gAMA is given). + ppf->frames.clear(); + ppf->info.exponent_bits_per_sample = 0; + ppf->info.alpha_exponent_bits = 0; + ppf->info.orientation = JXL_ORIENT_IDENTITY; + ppf->color_encoding.color_space = JXL_COLOR_SPACE_RGB; + ppf->color_encoding.white_point = JXL_WHITE_POINT_D65; + ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB; + ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB; + ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_RELATIVE; - // Make sure png memory is released in any case. - auto scope_guard = MakeScopeGuard([&]() { - png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); - // Just in case. Not all versions on libpng wipe-out the pointers. - png_ptr = nullptr; - info_ptr = nullptr; - }); + Reader input(bytes); - r = {bytes.data(), bytes.data() + bytes.size()}; // Not a PNG => not an error unsigned char png_signature[8] = {137, 80, 78, 71, 13, 10, 26, 10}; - if (!r.Read(sig, 8) || memcmp(sig, png_signature, 8) != 0) { + Bytes sig = input.Read(8); + if (sig.size() != 8 || memcmp(sig.data(), png_signature, 8) != 0) { return false; } - unsigned int id = read_chunk(&r, &chunkIHDR); - ppf->info.exponent_bits_per_sample = 0; - ppf->info.alpha_exponent_bits = 0; - ppf->info.orientation = JXL_ORIENT_IDENTITY; + Bytes chunk_ihdr = input.ReadChunk(); + if (chunk_ihdr.empty()) { + return false; + } + uint32_t id = LoadLE32(chunk_ihdr.data() + 4); + if (id != MakeTag('I', 'H', 'D', 'R') || chunk_ihdr.size() != 25) { + return false; + } + const uint32_t w = png_get_uint_32(chunk_ihdr.data() + 8); + const uint32_t h = png_get_uint_32(chunk_ihdr.data() + 12); + if (w > kMaxPNGSize || h > kMaxPNGSize) { + return false; + } - ppf->frames.clear(); + Context ctx; + ctx.ihdr = chunk_ihdr.Copy(); + + std::vector chunksInfo; + if (!ctx.InitPngDecoder(false, chunksInfo)) { + return false; + } + bool isAnimated = false; + bool hasInfo = false; + uint32_t num_channels; + JxlPixelFormat format = {}; + unsigned int bytes_per_pixel = 0; + 
std::vector frames; bool have_color = false; bool have_cicp = false; bool have_iccp = false; bool have_srgb = false; - bool errorstate = true; - if (id == kId_IHDR && chunkIHDR.size() == 25) { - uint32_t x0 = 0; - uint32_t y0 = 0; - uint32_t delay_num = 1; - uint32_t delay_den = 10; - uint32_t dop = 0; - uint32_t bop = 0; - - uint32_t w = png_get_uint_32(chunkIHDR.data() + 8); - uint32_t h = png_get_uint_32(chunkIHDR.data() + 12); - uint32_t w0 = w; - uint32_t h0 = h; - if (w > cMaxPNGSize || h > cMaxPNGSize) { + uint32_t x0 = 0; + uint32_t y0 = 0; + uint32_t delay_num = 1; + uint32_t delay_den = 10; + uint32_t dop = 0; + uint32_t bop = 0; + uint32_t w0 = w; + uint32_t h0 = h; + + while (!input.Eof()) { + Bytes chunk = input.ReadChunk(); + if (chunk.empty()) { return false; } - - // default settings in case e.g. only gAMA is given - ppf->color_encoding.color_space = JXL_COLOR_SPACE_RGB; - ppf->color_encoding.white_point = JXL_WHITE_POINT_D65; - ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB; - ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB; - ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_RELATIVE; - - if (!processing_start(png_ptr, info_ptr, static_cast(&frameRaw), - hasInfo, chunkIHDR, chunksInfo)) { - while (!r.Eof()) { - id = read_chunk(&r, &chunk); - if (!id) break; - seenFctl |= (id == kId_fcTL); - - if (id == kId_acTL && !hasInfo && !isAnimated) { - isAnimated = true; - ppf->info.have_animation = JXL_TRUE; - ppf->info.animation.tps_numerator = 1000; - ppf->info.animation.tps_denominator = 1; - } else if (id == kId_IEND || - (id == kId_fcTL && (!hasInfo || isAnimated))) { - if (hasInfo) { - if (!processing_finish(png_ptr, info_ptr, &ppf->metadata)) { - // Allocates the frame buffer. 
- uint32_t duration = delay_num * 1000 / delay_den; - JXL_ASSIGN_OR_RETURN(PackedImage image, - PackedImage::Create(w0, h0, format)); - frames.push_back(FrameInfo{std::move(image), duration, x0, w0, y0, - h0, dop, bop}); - auto& frame = frames.back().data; - for (size_t y = 0; y < h0; ++y) { - memcpy(static_cast(frame.pixels()) + frame.stride * y, - frameRaw.rows[y], bytes_per_pixel * w0); - } - } else { - break; - } + id = LoadLE32(chunk.data() + 4); + + if (id == MakeTag('a', 'c', 'T', 'L') && !hasInfo && !isAnimated) { + isAnimated = true; + ppf->info.have_animation = JXL_TRUE; + ppf->info.animation.tps_numerator = 1000; + ppf->info.animation.tps_denominator = 1; + } else if (id == MakeTag('I', 'E', 'N', 'D') || + (id == MakeTag('f', 'c', 'T', 'L') && + (!hasInfo || isAnimated))) { + if (hasInfo) { + if (ctx.FinalizeStream(&ppf->metadata)) { + // Allocates the frame buffer. + uint32_t duration = delay_num * 1000 / delay_den; + JXL_ASSIGN_OR_RETURN(PackedImage image, + PackedImage::Create(w0, h0, format)); + frames.push_back( + FrameInfo{std::move(image), duration, x0, w0, y0, h0, dop, bop}); + auto& frame = frames.back().data; + for (size_t y = 0; y < h0; ++y) { + // TODO(eustas): ensure multiplication is safe + memcpy(static_cast(frame.pixels()) + frame.stride * y, + ctx.frameRaw.rows[y], bytes_per_pixel * w0); } + } else { + return false; + } + } - if (id == kId_IEND) { - errorstate = false; - break; - } - if (chunk.size() < 34) { - return JXL_FAILURE("Received a chunk that is too small (%" PRIuS - "B)", - chunk.size()); - } - // At this point the old frame is done. Let's start a new one. 
- w0 = png_get_uint_32(chunk.data() + 12); - h0 = png_get_uint_32(chunk.data() + 16); - x0 = png_get_uint_32(chunk.data() + 20); - y0 = png_get_uint_32(chunk.data() + 24); - delay_num = png_get_uint_16(chunk.data() + 28); - delay_den = png_get_uint_16(chunk.data() + 30); - dop = chunk[32]; - bop = chunk[33]; - - if (!delay_den) delay_den = 100; - - if (w0 > cMaxPNGSize || h0 > cMaxPNGSize || x0 > cMaxPNGSize || - y0 > cMaxPNGSize || x0 + w0 > w || y0 + h0 > h || dop > 2 || - bop > 1) { - break; - } + if (id == MakeTag('I', 'E', 'N', 'D')) { + break; + } + if (chunk.size() < 34) { + return JXL_FAILURE("Received a chunk that is too small (%" PRIuS "B)", + chunk.size()); + } + // At this point the old frame is done. Let's start a new one. + w0 = png_get_uint_32(chunk.data() + 12); + h0 = png_get_uint_32(chunk.data() + 16); + x0 = png_get_uint_32(chunk.data() + 20); + y0 = png_get_uint_32(chunk.data() + 24); + delay_num = png_get_uint_16(chunk.data() + 28); + delay_den = png_get_uint_16(chunk.data() + 30); + dop = chunk[32]; + bop = chunk[33]; + + if (!delay_den) delay_den = 100; + + if (w0 > kMaxPNGSize || h0 > kMaxPNGSize || x0 > kMaxPNGSize || + y0 > kMaxPNGSize || x0 + w0 > w || y0 + h0 > h || dop > 2 || + bop > 1) { + return false; + } - if (hasInfo) { - memcpy(chunkIHDR.data() + 8, chunk.data() + 12, 8); - if (processing_start(png_ptr, info_ptr, - static_cast(&frameRaw), hasInfo, - chunkIHDR, chunksInfo)) { - break; - } - } - } else if (id == kId_IDAT) { - // First IDAT chunk means we now have all header info - if (seenFctl) { - // `fcTL` chunk must appear after all `IDAT` chunks - return JXL_FAILURE("IDAT chunk after fcTL chunk"); - } - hasInfo = true; - JXL_CHECK(w == png_get_image_width(png_ptr, info_ptr)); - JXL_CHECK(h == png_get_image_height(png_ptr, info_ptr)); - int colortype = png_get_color_type(png_ptr, info_ptr); - int png_bit_depth = png_get_bit_depth(png_ptr, info_ptr); - ppf->info.bits_per_sample = png_bit_depth; - png_color_8p sigbits = nullptr; - 
png_get_sBIT(png_ptr, info_ptr, &sigbits); - if (colortype & 1) { - // palette will actually be 8-bit regardless of the index bitdepth - ppf->info.bits_per_sample = 8; - } - if (colortype & 2) { - ppf->info.num_color_channels = 3; - ppf->color_encoding.color_space = JXL_COLOR_SPACE_RGB; - if (sigbits && sigbits->red == sigbits->green && - sigbits->green == sigbits->blue) { - ppf->info.bits_per_sample = sigbits->red; - } else if (sigbits) { - int maxbps = std::max(sigbits->red, - std::max(sigbits->green, sigbits->blue)); - JXL_WARNING( - "sBIT chunk: bit depths for R, G, and B are not the same (%i " - "%i %i), while in JPEG XL they have to be the same. Setting " - "RGB bit depth to %i.", - sigbits->red, sigbits->green, sigbits->blue, maxbps); - ppf->info.bits_per_sample = maxbps; - } - } else { - ppf->info.num_color_channels = 1; - ppf->color_encoding.color_space = JXL_COLOR_SPACE_GRAY; - if (sigbits) ppf->info.bits_per_sample = sigbits->gray; - } - if (colortype & 4 || - png_get_valid(png_ptr, info_ptr, PNG_INFO_tRNS)) { - ppf->info.alpha_bits = ppf->info.bits_per_sample; - if (sigbits && sigbits->alpha != ppf->info.bits_per_sample) { - JXL_WARNING( - "sBIT chunk: bit depths for RGBA are inconsistent " - "(%i %i %i %i). Setting A bitdepth to %i.", - sigbits->red, sigbits->green, sigbits->blue, sigbits->alpha, - ppf->info.bits_per_sample); - } - } else { - ppf->info.alpha_bits = 0; - } - ppf->color_encoding.color_space = - (ppf->info.num_color_channels == 1 ? JXL_COLOR_SPACE_GRAY - : JXL_COLOR_SPACE_RGB); - ppf->info.xsize = w; - ppf->info.ysize = h; - JXL_RETURN_IF_ERROR(VerifyDimensions(constraints, w, h)); - num_channels = - ppf->info.num_color_channels + (ppf->info.alpha_bits ? 1 : 0); - format = { - /*num_channels=*/num_channels, - /*data_type=*/ppf->info.bits_per_sample > 8 ? 
JXL_TYPE_UINT16 - : JXL_TYPE_UINT8, - /*endianness=*/JXL_BIG_ENDIAN, - /*align=*/0, - }; - if (png_bit_depth > 8 && format.data_type == JXL_TYPE_UINT8) { - png_set_strip_16(png_ptr); - } - bytes_per_pixel = - num_channels * (format.data_type == JXL_TYPE_UINT16 ? 2 : 1); - size_t rowbytes = w * bytes_per_pixel; - if (h > std::numeric_limits::max() / rowbytes) { - return JXL_FAILURE("Image too big."); - } - size_t imagesize = h * rowbytes; - JXL_RETURN_IF_ERROR(frameRaw.Resize(imagesize)); - frameRaw.rows.resize(h); - for (size_t j = 0; j < h; j++) { - frameRaw.rows[j] = - reinterpret_cast(frameRaw.pixels.get()) + - j * rowbytes; - } + if (hasInfo) { + // Copy dimensions. + memcpy(ctx.ihdr.data() + 8, chunk.data() + 12, 8); + if (!ctx.InitPngDecoder(hasInfo, chunksInfo)) { + return false; + } + } + } else if (id == MakeTag('I', 'D', 'A', 'T')) { + // First IDAT chunk means we now have all header info + hasInfo = true; + JXL_CHECK(w == png_get_image_width(ctx.png_ptr, ctx.info_ptr)); + JXL_CHECK(h == png_get_image_height(ctx.png_ptr, ctx.info_ptr)); + int colortype = png_get_color_type(ctx.png_ptr, ctx.info_ptr); + int png_bit_depth = png_get_bit_depth(ctx.png_ptr, ctx.info_ptr); + ppf->info.bits_per_sample = png_bit_depth; + png_color_8p sigbits = nullptr; + png_get_sBIT(ctx.png_ptr, ctx.info_ptr, &sigbits); + if (colortype & 1) { + // palette will actually be 8-bit regardless of the index bitdepth + ppf->info.bits_per_sample = 8; + } + if (colortype & 2) { + ppf->info.num_color_channels = 3; + ppf->color_encoding.color_space = JXL_COLOR_SPACE_RGB; + if (sigbits && sigbits->red == sigbits->green && + sigbits->green == sigbits->blue) { + ppf->info.bits_per_sample = sigbits->red; + } else if (sigbits) { + int maxbps = + std::max(sigbits->red, std::max(sigbits->green, sigbits->blue)); + JXL_WARNING( + "sBIT chunk: bit depths for R, G, and B are not the same (%i " + "%i %i), while in JPEG XL they have to be the same. 
Setting " + "RGB bit depth to %i.", + sigbits->red, sigbits->green, sigbits->blue, maxbps); + ppf->info.bits_per_sample = maxbps; + } + } else { + ppf->info.num_color_channels = 1; + ppf->color_encoding.color_space = JXL_COLOR_SPACE_GRAY; + if (sigbits) ppf->info.bits_per_sample = sigbits->gray; + } + if (colortype & 4 || + png_get_valid(ctx.png_ptr, ctx.info_ptr, PNG_INFO_tRNS)) { + ppf->info.alpha_bits = ppf->info.bits_per_sample; + if (sigbits && sigbits->alpha != ppf->info.bits_per_sample) { + JXL_WARNING( + "sBIT chunk: bit depths for RGBA are inconsistent " + "(%i %i %i %i). Setting A bitdepth to %i.", + sigbits->red, sigbits->green, sigbits->blue, sigbits->alpha, + ppf->info.bits_per_sample); + } + } else { + ppf->info.alpha_bits = 0; + } + ppf->color_encoding.color_space = + (ppf->info.num_color_channels == 1 ? JXL_COLOR_SPACE_GRAY + : JXL_COLOR_SPACE_RGB); + ppf->info.xsize = w; + ppf->info.ysize = h; + JXL_RETURN_IF_ERROR(VerifyDimensions(constraints, w, h)); + num_channels = + ppf->info.num_color_channels + (ppf->info.alpha_bits ? 1 : 0); + format = { + /*num_channels=*/num_channels, + /*data_type=*/ppf->info.bits_per_sample > 8 ? JXL_TYPE_UINT16 + : JXL_TYPE_UINT8, + /*endianness=*/JXL_BIG_ENDIAN, + /*align=*/0, + }; + if (png_bit_depth > 8 && format.data_type == JXL_TYPE_UINT8) { + png_set_strip_16(ctx.png_ptr); + } + bytes_per_pixel = + num_channels * (format.data_type == JXL_TYPE_UINT16 ? 
2 : 1); + // TODO(eustas): ensure multiplication is safe + size_t rowbytes = w * bytes_per_pixel; + if (h > std::numeric_limits::max() / rowbytes) { + return JXL_FAILURE("Image too big."); + } + JXL_RETURN_IF_ERROR(ctx.frameRaw.Resize(rowbytes, h)); - if (processing_data(png_ptr, info_ptr, chunk.data(), chunk.size())) { - break; - } - } else if (id == kId_fdAT && isAnimated) { - if (!hasInfo) { - return JXL_FAILURE("fDAT chunk before iDAT"); - } - png_save_uint_32(chunk.data() + 4, chunk.size() - 16); - memcpy(chunk.data() + 8, "IDAT", 4); - if (processing_data(png_ptr, info_ptr, chunk.data() + 4, - chunk.size() - 4)) { - break; - } - } else if (id == kId_cICP) { - // Color profile chunks: cICP has the highest priority, followed by - // iCCP and sRGB (which shouldn't co-exist, but if they do, we use - // iCCP), followed finally by gAMA and cHRM. - if (DecodeCICP(chunk.data() + 8, chunk.size() - 12, - &ppf->color_encoding)) { - have_cicp = true; - have_color = true; - ppf->icc.clear(); - ppf->primary_color_representation = - PackedPixelFile::kColorEncodingIsPrimary; - } - } else if (!have_cicp && id == kId_iCCP) { - if (processing_data(png_ptr, info_ptr, chunk.data(), chunk.size())) { - JXL_WARNING("Corrupt iCCP chunk"); - break; - } + if (!ctx.FeedChunks(chunk)) { + return false; + } + } else if (id == MakeTag('f', 'd', 'A', 'T') && isAnimated) { + if (!hasInfo) { + return JXL_FAILURE("fdAT chunk before IDAT"); + } + /* The 'fdAT' chunk has... the same structure as an 'IDAT' chunk, + * except preceded by a sequence number. */ + size_t payload_size = chunk.size() - 12; + if (payload_size < 4) { + return JXL_FAILURE("Corrupted fdAT chunk"); + } + // Turn 'fdAT' to 'IDAT' by cutting sequence number and replacing tag. 
+ std::array preamble; + png_save_uint_32(preamble.data(), payload_size - 4); + memcpy(preamble.data() + 4, "IDAT", 4); + if (!ctx.FeedChunks(Bytes(preamble), + Bytes(chunk.data() + 12, chunk.size() - 12))) { + return false; + } + } else if (id == MakeTag('c', 'I', 'C', 'P')) { + // Color profile chunks: cICP has the highest priority, followed by + // iCCP and sRGB (which shouldn't co-exist, but if they do, we use + // iCCP), followed finally by gAMA and cHRM. + if (DecodeCicpChunk(Bytes(chunk.data() + 8, chunk.size() - 12), + &ppf->color_encoding)) { + have_cicp = true; + have_color = true; + ppf->icc.clear(); + ppf->primary_color_representation = + PackedPixelFile::kColorEncodingIsPrimary; + } + } else if (!have_cicp && id == MakeTag('i', 'C', 'C', 'P')) { + if (!ctx.FeedChunks(chunk)) { + JXL_WARNING("Corrupt iCCP chunk"); + return false; + } - // TODO(jon): catch special case of PQ and synthesize color encoding - // in that case - int compression_type; - png_bytep profile; - png_charp name; - png_uint_32 proflen = 0; - auto ok = png_get_iCCP(png_ptr, info_ptr, &name, &compression_type, - &profile, &proflen); - if (ok && proflen) { - ppf->icc.assign(profile, profile + proflen); - ppf->primary_color_representation = PackedPixelFile::kIccIsPrimary; - have_color = true; - have_iccp = true; - } else { - // TODO(eustas): JXL_WARNING? 
- } - } else if (!have_cicp && !have_iccp && id == kId_sRGB) { - JXL_RETURN_IF_ERROR(DecodeSRGB(chunk.data() + 8, chunk.size() - 12, - &ppf->color_encoding)); - have_srgb = true; - have_color = true; - } else if (!have_cicp && !have_srgb && !have_iccp && id == kId_gAMA) { - JXL_RETURN_IF_ERROR(DecodeGAMA(chunk.data() + 8, chunk.size() - 12, - &ppf->color_encoding)); - have_color = true; - } else if (!have_cicp && !have_srgb && !have_iccp && id == kId_cHRM) { - JXL_RETURN_IF_ERROR(DecodeCHRM(chunk.data() + 8, chunk.size() - 12, - &ppf->color_encoding)); - have_color = true; - } else if (id == kId_eXIf) { - ppf->metadata.exif.resize(chunk.size() - 12); - memcpy(ppf->metadata.exif.data(), chunk.data() + 8, - chunk.size() - 12); - } else if (!isAbc(chunk[4]) || !isAbc(chunk[5]) || !isAbc(chunk[6]) || - !isAbc(chunk[7])) { - break; - } else { - if (processing_data(png_ptr, info_ptr, chunk.data(), chunk.size())) { - break; - } - if (!hasInfo) { - chunksInfo.push_back(chunk); - continue; - } - } + // TODO(jon): catch special case of PQ and synthesize color encoding + // in that case + int compression_type; + png_bytep profile; + png_charp name; + png_uint_32 proflen = 0; + auto ok = png_get_iCCP(ctx.png_ptr, ctx.info_ptr, &name, + &compression_type, &profile, &proflen); + if (ok && proflen) { + ppf->icc.assign(profile, profile + proflen); + ppf->primary_color_representation = PackedPixelFile::kIccIsPrimary; + have_color = true; + have_iccp = true; + } else { + // TODO(eustas): JXL_WARNING? 
+ } + } else if (!have_cicp && !have_iccp && id == MakeTag('s', 'R', 'G', 'B')) { + JXL_RETURN_IF_ERROR(DecodeSrgbChunk( + Bytes(chunk.data() + 8, chunk.size() - 12), &ppf->color_encoding)); + have_srgb = true; + have_color = true; + } else if (!have_cicp && !have_srgb && !have_iccp && + id == MakeTag('g', 'A', 'M', 'A')) { + JXL_RETURN_IF_ERROR(DecodeGamaChunk( + Bytes(chunk.data() + 8, chunk.size() - 12), &ppf->color_encoding)); + have_color = true; + } else if (!have_cicp && !have_srgb && !have_iccp && + id == MakeTag('c', 'H', 'R', 'M')) { + JXL_RETURN_IF_ERROR(DecodeChrmChunk( + Bytes(chunk.data() + 8, chunk.size() - 12), &ppf->color_encoding)); + have_color = true; + } else if (id == MakeTag('e', 'X', 'I', 'f')) { + ppf->metadata.exif.resize(chunk.size() - 12); + memcpy(ppf->metadata.exif.data(), chunk.data() + 8, chunk.size() - 12); + } else if (!isAbc(chunk[4]) || !isAbc(chunk[5]) || !isAbc(chunk[6]) || + !isAbc(chunk[7])) { + return false; + } else { + if (!ctx.FeedChunks(chunk)) { + return false; + } + if (!hasInfo) { + chunksInfo.push_back(chunk); + continue; } } - - JXL_RETURN_IF_ERROR(ApplyColorHints( - color_hints, have_color, ppf->info.num_color_channels == 1, ppf)); } - if (errorstate) return false; + JXL_RETURN_IF_ERROR(ApplyColorHints(color_hints, have_color, + ppf->info.num_color_channels == 1, ppf)); bool has_nontrivial_background = false; bool previous_frame_should_be_cleared = false; @@ -1014,14 +1052,14 @@ Status DecodeImageAPNG(const Span bytes, previous_frame_should_be_cleared = has_nontrivial_background && frame.dispose_op == DISPOSE_OP_BACKGROUND; } + if (ppf->frames.empty()) return JXL_FAILURE("No frames decoded"); ppf->frames.back().frame_info.is_last = JXL_TRUE; return true; -#else - return false; -#endif } +#endif // JPEGXL_ENABLE_APNG + } // namespace extras } // namespace jxl diff --git a/third_party/jpeg-xl/lib/extras/dec/apng.h b/third_party/jpeg-xl/lib/extras/dec/apng.h index d91364b1e6..7ebc2ee7c8 100644 --- 
a/third_party/jpeg-xl/lib/extras/dec/apng.h +++ b/third_party/jpeg-xl/lib/extras/dec/apng.h @@ -8,11 +8,10 @@ // Decodes APNG images in memory. -#include +#include #include "lib/extras/dec/color_hints.h" #include "lib/extras/packed_image.h" -#include "lib/jxl/base/data_parallel.h" #include "lib/jxl/base/span.h" #include "lib/jxl/base/status.h" diff --git a/third_party/jpeg-xl/lib/extras/dec/color_description.cc b/third_party/jpeg-xl/lib/extras/dec/color_description.cc index bf229632d0..87fff6e54a 100644 --- a/third_party/jpeg-xl/lib/extras/dec/color_description.cc +++ b/third_party/jpeg-xl/lib/extras/dec/color_description.cc @@ -203,12 +203,38 @@ Status ParseTransferFunction(Tokenizer* tokenizer, JxlColorEncoding* c) { Status ParseDescription(const std::string& description, JxlColorEncoding* c) { *c = {}; - Tokenizer tokenizer(&description, '_'); - JXL_RETURN_IF_ERROR(ParseColorSpace(&tokenizer, c)); - JXL_RETURN_IF_ERROR(ParseWhitePoint(&tokenizer, c)); - JXL_RETURN_IF_ERROR(ParsePrimaries(&tokenizer, c)); - JXL_RETURN_IF_ERROR(ParseRenderingIntent(&tokenizer, c)); - JXL_RETURN_IF_ERROR(ParseTransferFunction(&tokenizer, c)); + if (description == "sRGB") { + c->color_space = JXL_COLOR_SPACE_RGB; + c->white_point = JXL_WHITE_POINT_D65; + c->primaries = JXL_PRIMARIES_SRGB; + c->transfer_function = JXL_TRANSFER_FUNCTION_SRGB; + c->rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL; + } else if (description == "DisplayP3") { + c->color_space = JXL_COLOR_SPACE_RGB; + c->white_point = JXL_WHITE_POINT_D65; + c->primaries = JXL_PRIMARIES_P3; + c->transfer_function = JXL_TRANSFER_FUNCTION_SRGB; + c->rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL; + } else if (description == "Rec2100PQ") { + c->color_space = JXL_COLOR_SPACE_RGB; + c->white_point = JXL_WHITE_POINT_D65; + c->primaries = JXL_PRIMARIES_2100; + c->transfer_function = JXL_TRANSFER_FUNCTION_PQ; + c->rendering_intent = JXL_RENDERING_INTENT_RELATIVE; + } else if (description == "Rec2100HLG") { + c->color_space = 
JXL_COLOR_SPACE_RGB; + c->white_point = JXL_WHITE_POINT_D65; + c->primaries = JXL_PRIMARIES_2100; + c->transfer_function = JXL_TRANSFER_FUNCTION_HLG; + c->rendering_intent = JXL_RENDERING_INTENT_RELATIVE; + } else { + Tokenizer tokenizer(&description, '_'); + JXL_RETURN_IF_ERROR(ParseColorSpace(&tokenizer, c)); + JXL_RETURN_IF_ERROR(ParseWhitePoint(&tokenizer, c)); + JXL_RETURN_IF_ERROR(ParsePrimaries(&tokenizer, c)); + JXL_RETURN_IF_ERROR(ParseRenderingIntent(&tokenizer, c)); + JXL_RETURN_IF_ERROR(ParseTransferFunction(&tokenizer, c)); + } return true; } diff --git a/third_party/jpeg-xl/lib/extras/dec/decode.cc b/third_party/jpeg-xl/lib/extras/dec/decode.cc index 3546cb65c0..2581d53f63 100644 --- a/third_party/jpeg-xl/lib/extras/dec/decode.cc +++ b/third_party/jpeg-xl/lib/extras/dec/decode.cc @@ -91,6 +91,15 @@ bool CanDecode(Codec codec) { } } +std::string ListOfDecodeCodecs() { + std::string list_of_codecs("JXL, PPM, PNM, PFM, PAM, PGX"); + if (CanDecode(Codec::kPNG)) list_of_codecs.append(", PNG, APNG"); + if (CanDecode(Codec::kGIF)) list_of_codecs.append(", GIF"); + if (CanDecode(Codec::kJPG)) list_of_codecs.append(", JPEG"); + if (CanDecode(Codec::kEXR)) list_of_codecs.append(", EXR"); + return list_of_codecs; +} + Status DecodeBytes(const Span bytes, const ColorHints& color_hints, extras::PackedPixelFile* ppf, const SizeConstraints* constraints, Codec* orig_codec) { diff --git a/third_party/jpeg-xl/lib/extras/dec/decode.h b/third_party/jpeg-xl/lib/extras/dec/decode.h index 1a90f4c6a3..26dc1409df 100644 --- a/third_party/jpeg-xl/lib/extras/dec/decode.h +++ b/third_party/jpeg-xl/lib/extras/dec/decode.h @@ -38,6 +38,8 @@ enum class Codec : uint32_t { bool CanDecode(Codec codec); +std::string ListOfDecodeCodecs(); + // If and only if extension is ".pfm", *bits_per_sample is updated to 32 so // that Encode() would encode to PFM instead of PPM. 
Codec CodecFromPath(const std::string& path, diff --git a/third_party/jpeg-xl/lib/extras/dec/gif.cc b/third_party/jpeg-xl/lib/extras/dec/gif.cc index 3f89d460b8..243d8b5103 100644 --- a/third_party/jpeg-xl/lib/extras/dec/gif.cc +++ b/third_party/jpeg-xl/lib/extras/dec/gif.cc @@ -17,6 +17,7 @@ #include "lib/extras/size_constraints.h" #include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/rect.h" #include "lib/jxl/sanitizers.h" namespace jxl { diff --git a/third_party/jpeg-xl/lib/extras/dec/jpg.cc b/third_party/jpeg-xl/lib/extras/dec/jpg.cc index 4a3e0d3b21..a65b46b4c8 100644 --- a/third_party/jpeg-xl/lib/extras/dec/jpg.cc +++ b/third_party/jpeg-xl/lib/extras/dec/jpg.cc @@ -6,8 +6,7 @@ #include "lib/extras/dec/jpg.h" #if JPEGXL_ENABLE_JPEG -#include -#include +#include "lib/jxl/base/include_jpeglib.h" // NOLINT #endif #include diff --git a/third_party/jpeg-xl/lib/extras/dec/jxl.cc b/third_party/jpeg-xl/lib/extras/dec/jxl.cc index 5b7fa03f02..e2534fa745 100644 --- a/third_party/jpeg-xl/lib/extras/dec/jxl.cc +++ b/third_party/jpeg-xl/lib/extras/dec/jxl.cc @@ -10,7 +10,7 @@ #include #include -#include +#include // PRIu32 #include "lib/extras/common.h" #include "lib/extras/dec/color_description.h" @@ -211,7 +211,7 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size, return false; } uint32_t progression_index = 0; - bool codestream_done = accepted_formats.empty(); + bool codestream_done = jpeg_bytes == nullptr && accepted_formats.empty(); BoxProcessor boxes(dec); for (;;) { JxlDecoderStatus status = JxlDecoderProcessInput(dec); diff --git a/third_party/jpeg-xl/lib/extras/dec/pnm.cc b/third_party/jpeg-xl/lib/extras/dec/pnm.cc index e64d7e95f9..b3f9cd1206 100644 --- a/third_party/jpeg-xl/lib/extras/dec/pnm.cc +++ b/third_party/jpeg-xl/lib/extras/dec/pnm.cc @@ -5,17 +5,17 @@ #include "lib/extras/dec/pnm.h" -#include -#include +#include #include +#include #include -#include +#include +#include -#include "jxl/encode.h" #include 
"lib/extras/size_constraints.h" #include "lib/jxl/base/bits.h" -#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/span.h" #include "lib/jxl/base/status.h" namespace jxl { diff --git a/third_party/jpeg-xl/lib/extras/enc/encode.cc b/third_party/jpeg-xl/lib/extras/enc/encode.cc index c5e22d8c7e..71be78e36c 100644 --- a/third_party/jpeg-xl/lib/extras/enc/encode.cc +++ b/third_party/jpeg-xl/lib/extras/enc/encode.cc @@ -134,5 +134,13 @@ std::unique_ptr Encoder::FromExtension(std::string extension) { return nullptr; } +std::string ListOfEncodeCodecs() { + std::string list_of_codecs("PPM, PNM, PFM, PAM, PGX"); + if (GetAPNGEncoder()) list_of_codecs.append(", PNG, APNG"); + if (GetJPEGEncoder()) list_of_codecs.append(", JPEG"); + if (GetEXREncoder()) list_of_codecs.append(", EXR"); + return list_of_codecs; +} + } // namespace extras } // namespace jxl diff --git a/third_party/jpeg-xl/lib/extras/enc/encode.h b/third_party/jpeg-xl/lib/extras/enc/encode.h index 2502d9976b..a71f3b220f 100644 --- a/third_party/jpeg-xl/lib/extras/enc/encode.h +++ b/third_party/jpeg-xl/lib/extras/enc/encode.h @@ -82,6 +82,8 @@ class Encoder { std::unordered_map options_; }; +std::string ListOfEncodeCodecs(); + } // namespace extras } // namespace jxl diff --git a/third_party/jpeg-xl/lib/extras/enc/jpegli.cc b/third_party/jpeg-xl/lib/extras/enc/jpegli.cc index cb473a1290..9735cd8cb9 100644 --- a/third_party/jpeg-xl/lib/extras/enc/jpegli.cc +++ b/third_party/jpeg-xl/lib/extras/enc/jpegli.cc @@ -454,6 +454,10 @@ Status EncodeJpeg(const PackedPixelFile& ppf, const JpegSettings& jpeg_settings, cinfo.comp_info[i].h_samp_factor = 1; cinfo.comp_info[i].v_samp_factor = 1; } + } else if (!jpeg_settings.xyb) { + // Default is no chroma subsampling. 
+ cinfo.comp_info[0].h_samp_factor = 1; + cinfo.comp_info[0].v_samp_factor = 1; } jpegli_enable_adaptive_quantization( &cinfo, TO_JXL_BOOL(jpeg_settings.use_adaptive_quantization)); diff --git a/third_party/jpeg-xl/lib/extras/enc/jpg.cc b/third_party/jpeg-xl/lib/extras/enc/jpg.cc index 0095ac9294..a2ef4a9fc4 100644 --- a/third_party/jpeg-xl/lib/extras/enc/jpg.cc +++ b/third_party/jpeg-xl/lib/extras/enc/jpg.cc @@ -6,18 +6,15 @@ #include "lib/extras/enc/jpg.h" #if JPEGXL_ENABLE_JPEG -#include -#include +#include "lib/jxl/base/include_jpeglib.h" // NOLINT #endif -#include #include #include #include +#include #include -#include #include -#include #include #include #include diff --git a/third_party/jpeg-xl/lib/extras/jpegli_test.cc b/third_party/jpeg-xl/lib/extras/jpegli_test.cc index 3049049a64..96b546755c 100644 --- a/third_party/jpeg-xl/lib/extras/jpegli_test.cc +++ b/third_party/jpeg-xl/lib/extras/jpegli_test.cc @@ -255,7 +255,7 @@ TEST(JpegliTest, JpegliHDRRoundtripTest) { std::string testimage = "jxl/hdr_room.png"; PackedPixelFile ppf_in; ASSERT_TRUE(ReadTestImage(testimage, &ppf_in)); - EXPECT_EQ("RGB_D65_202_Rel_HLG", Description(ppf_in.color_encoding)); + EXPECT_EQ("Rec2100HLG", Description(ppf_in.color_encoding)); EXPECT_EQ(16, ppf_in.info.bits_per_sample); std::vector compressed; diff --git a/third_party/jpeg-xl/lib/extras/metrics.cc b/third_party/jpeg-xl/lib/extras/metrics.cc index 4259d3c375..f70ab0a61d 100644 --- a/third_party/jpeg-xl/lib/extras/metrics.cc +++ b/third_party/jpeg-xl/lib/extras/metrics.cc @@ -16,6 +16,7 @@ #include #include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/rect.h" #include "lib/jxl/base/status.h" #include "lib/jxl/color_encoding_internal.h" HWY_BEFORE_NAMESPACE(); diff --git a/third_party/jpeg-xl/lib/extras/packed_image_convert.cc b/third_party/jpeg-xl/lib/extras/packed_image_convert.cc index 2ad001bf09..7e4b592fc4 100644 --- a/third_party/jpeg-xl/lib/extras/packed_image_convert.cc +++ 
b/third_party/jpeg-xl/lib/extras/packed_image_convert.cc @@ -11,6 +11,7 @@ #include +#include "lib/jxl/base/rect.h" #include "lib/jxl/base/status.h" #include "lib/jxl/color_encoding_internal.h" #include "lib/jxl/dec_external_image.h" diff --git a/third_party/jpeg-xl/lib/include/jxl/color_encoding.h b/third_party/jpeg-xl/lib/include/jxl/color_encoding.h index e6325dcb30..f5de188223 100644 --- a/third_party/jpeg-xl/lib/include/jxl/color_encoding.h +++ b/third_party/jpeg-xl/lib/include/jxl/color_encoding.h @@ -14,8 +14,6 @@ #ifndef JXL_COLOR_ENCODING_H_ #define JXL_COLOR_ENCODING_H_ -#include - #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif diff --git a/third_party/jpeg-xl/lib/jpegli/color_transform.cc b/third_party/jpeg-xl/lib/jpegli/color_transform.cc index 60a0dc83bb..ec906bedce 100644 --- a/third_party/jpeg-xl/lib/jpegli/color_transform.cc +++ b/third_party/jpeg-xl/lib/jpegli/color_transform.cc @@ -26,11 +26,16 @@ using hwy::HWY_NAMESPACE::Mul; using hwy::HWY_NAMESPACE::MulAdd; using hwy::HWY_NAMESPACE::Sub; -void YCbCrToRGB(float* row[kMaxComponents], size_t xsize) { +template +void YCbCrToExtRGB(float* row[kMaxComponents], size_t xsize) { const HWY_CAPPED(float, 8) df; - float* JXL_RESTRICT row0 = row[0]; - float* JXL_RESTRICT row1 = row[1]; - float* JXL_RESTRICT row2 = row[2]; + const float* row_y = row[0]; + const float* row_cb = row[1]; + const float* row_cr = row[2]; + float* row_r = row[kRed]; + float* row_g = row[kGreen]; + float* row_b = row[kBlue]; + float* row_a = row[kAlpha]; // Full-range BT.601 as defined by JFIF Clause 7: // https://www.itu.int/rec/T-REC-T.871-201105-I/en @@ -38,20 +43,48 @@ void YCbCrToRGB(float* row[kMaxComponents], size_t xsize) { const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f); const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f); const auto cbcb = Set(df, 1.772f); + const auto alpha_opaque = Set(df, 127.0f / 255.0f); for (size_t x = 0; x < xsize; x += Lanes(df)) { - const auto y_vec = Load(df, row0 + 
x); - const auto cb_vec = Load(df, row1 + x); - const auto cr_vec = Load(df, row2 + x); + const auto y_vec = Load(df, row_y + x); + const auto cb_vec = Load(df, row_cb + x); + const auto cr_vec = Load(df, row_cr + x); const auto r_vec = MulAdd(crcr, cr_vec, y_vec); const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec)); const auto b_vec = MulAdd(cbcb, cb_vec, y_vec); - Store(r_vec, df, row0 + x); - Store(g_vec, df, row1 + x); - Store(b_vec, df, row2 + x); + Store(r_vec, df, row_r + x); + Store(g_vec, df, row_g + x); + Store(b_vec, df, row_b + x); + if (kAlpha >= 0) { + Store(alpha_opaque, df, row_a + x); + } } } +void YCbCrToRGB(float* row[kMaxComponents], size_t xsize) { + YCbCrToExtRGB<0, 1, 2, -1>(row, xsize); +} + +void YCbCrToBGR(float* row[kMaxComponents], size_t xsize) { + YCbCrToExtRGB<2, 1, 0, -1>(row, xsize); +} + +void YCbCrToRGBA(float* row[kMaxComponents], size_t xsize) { + YCbCrToExtRGB<0, 1, 2, 3>(row, xsize); +} + +void YCbCrToBGRA(float* row[kMaxComponents], size_t xsize) { + YCbCrToExtRGB<2, 1, 0, 3>(row, xsize); +} + +void YCbCrToARGB(float* row[kMaxComponents], size_t xsize) { + YCbCrToExtRGB<1, 2, 3, 0>(row, xsize); +} + +void YCbCrToABGR(float* row[kMaxComponents], size_t xsize) { + YCbCrToExtRGB<3, 2, 1, 0>(row, xsize); +} + void YCCKToCMYK(float* row[kMaxComponents], size_t xsize) { const HWY_CAPPED(float, 8) df; float* JXL_RESTRICT row0 = row[0]; @@ -66,11 +99,15 @@ void YCCKToCMYK(float* row[kMaxComponents], size_t xsize) { } } -void RGBToYCbCr(float* row[kMaxComponents], size_t xsize) { +template +void ExtRGBToYCbCr(float* row[kMaxComponents], size_t xsize) { const HWY_CAPPED(float, 8) df; - float* JXL_RESTRICT row0 = row[0]; - float* JXL_RESTRICT row1 = row[1]; - float* JXL_RESTRICT row2 = row[2]; + const float* row_r = row[kRed]; + const float* row_g = row[kGreen]; + const float* row_b = row[kBlue]; + float* row_y = row[0]; + float* row_cb = row[1]; + float* row_cr = row[2]; // Full-range BT.601 as defined by JFIF Clause 
7: // https://www.itu.int/rec/T-REC-T.871-201105-I/en const auto c128 = Set(df, 128.0f); @@ -85,9 +122,9 @@ void RGBToYCbCr(float* row[kMaxComponents], size_t xsize) { const auto kNormB = Div(Set(df, 1.0f), (Add(kR, Add(kG, kAmpB)))); for (size_t x = 0; x < xsize; x += Lanes(df)) { - const auto r = Load(df, row0 + x); - const auto g = Load(df, row1 + x); - const auto b = Load(df, row2 + x); + const auto r = Load(df, row_r + x); + const auto g = Load(df, row_g + x); + const auto b = Load(df, row_b + x); const auto r_base = Mul(r, kR); const auto r_diff = Mul(r, kDiffR); const auto g_base = Mul(g, kG); @@ -96,12 +133,28 @@ void RGBToYCbCr(float* row[kMaxComponents], size_t xsize) { const auto y_base = Add(r_base, Add(g_base, b_base)); const auto cb_vec = MulAdd(Sub(b_diff, y_base), kNormB, c128); const auto cr_vec = MulAdd(Sub(r_diff, y_base), kNormR, c128); - Store(y_base, df, row0 + x); - Store(cb_vec, df, row1 + x); - Store(cr_vec, df, row2 + x); + Store(y_base, df, row_y + x); + Store(cb_vec, df, row_cb + x); + Store(cr_vec, df, row_cr + x); } } +void RGBToYCbCr(float* row[kMaxComponents], size_t xsize) { + ExtRGBToYCbCr<0, 1, 2>(row, xsize); +} + +void BGRToYCbCr(float* row[kMaxComponents], size_t xsize) { + ExtRGBToYCbCr<2, 1, 0>(row, xsize); +} + +void ARGBToYCbCr(float* row[kMaxComponents], size_t xsize) { + ExtRGBToYCbCr<1, 2, 3>(row, xsize); +} + +void ABGRToYCbCr(float* row[kMaxComponents], size_t xsize) { + ExtRGBToYCbCr<3, 2, 1>(row, xsize); +} + void CMYKToYCCK(float* row[kMaxComponents], size_t xsize) { const HWY_CAPPED(float, 8) df; float* JXL_RESTRICT row0 = row[0]; @@ -127,7 +180,15 @@ namespace jpegli { HWY_EXPORT(CMYKToYCCK); HWY_EXPORT(YCCKToCMYK); HWY_EXPORT(YCbCrToRGB); +HWY_EXPORT(YCbCrToBGR); +HWY_EXPORT(YCbCrToRGBA); +HWY_EXPORT(YCbCrToBGRA); +HWY_EXPORT(YCbCrToARGB); +HWY_EXPORT(YCbCrToABGR); HWY_EXPORT(RGBToYCbCr); +HWY_EXPORT(BGRToYCbCr); +HWY_EXPORT(ARGBToYCbCr); +HWY_EXPORT(ABGRToYCbCr); bool CheckColorSpaceComponents(int 
num_components, J_COLOR_SPACE colorspace) { switch (colorspace) { @@ -164,16 +225,73 @@ bool CheckColorSpaceComponents(int num_components, J_COLOR_SPACE colorspace) { void NullTransform(float* row[kMaxComponents], size_t len) {} +void FillAlpha(float* row, size_t len) { + static const float kAlpha = 127.0f / 255.0f; + for (size_t i = 0; i < len; ++i) { + row[i] = kAlpha; + } +} + +// Works for BGR as well. void GrayscaleToRGB(float* row[kMaxComponents], size_t len) { memcpy(row[1], row[0], len * sizeof(row[1][0])); memcpy(row[2], row[0], len * sizeof(row[2][0])); } +// Works for BGRA as well. +void GrayscaleToRGBA(float* row[kMaxComponents], size_t len) { + memcpy(row[1], row[0], len * sizeof(row[1][0])); + memcpy(row[2], row[0], len * sizeof(row[2][0])); + FillAlpha(row[3], len); +} + +// Works for ABGR as well. +void GrayscaleToARGB(float* row[kMaxComponents], size_t len) { + memcpy(row[1], row[0], len * sizeof(row[1][0])); + memcpy(row[2], row[0], len * sizeof(row[2][0])); + memcpy(row[3], row[0], len * sizeof(row[1][0])); + FillAlpha(row[0], len); +} + void GrayscaleToYCbCr(float* row[kMaxComponents], size_t len) { memset(row[1], 0, len * sizeof(row[1][0])); memset(row[2], 0, len * sizeof(row[2][0])); } +void RGBToBGR(float* row[kMaxComponents], size_t len) { + for (size_t i = 0; i < len; ++i) { + std::swap(row[0][i], row[2][i]); + } +} + +void RGBToRGBA(float* row[kMaxComponents], size_t len) { + FillAlpha(row[3], len); +} + +void RGBToBGRA(float* row[kMaxComponents], size_t len) { + static const float kAlpha = 127.0f / 255.0f; + for (size_t i = 0; i < len; ++i) { + std::swap(row[0][i], row[2][i]); + row[3][i] = kAlpha; + } +} + +void RGBToARGB(float* row[kMaxComponents], size_t len) { + memcpy(row[3], row[2], len * sizeof(row[1][0])); + memcpy(row[2], row[1], len * sizeof(row[2][0])); + memcpy(row[1], row[0], len * sizeof(row[1][0])); + FillAlpha(row[0], len); +} + +void RGBToABGR(float* row[kMaxComponents], size_t len) { + static const float kAlpha = 127.0f 
/ 255.0f; + for (size_t i = 0; i < len; ++i) { + std::swap(row[1][i], row[2][i]); + row[3][i] = row[0][i]; + row[0][i] = kAlpha; + } +} + void ChooseColorTransform(j_compress_ptr cinfo) { jpeg_comp_master* m = cinfo->master; if (!CheckColorSpaceComponents(cinfo->input_components, @@ -226,6 +344,43 @@ void ChooseColorTransform(j_compress_ptr cinfo) { } } + if (cinfo->jpeg_color_space == JCS_GRAYSCALE || + cinfo->jpeg_color_space == JCS_YCbCr) { + switch (cinfo->in_color_space) { +#ifdef JCS_EXTENSIONS + case JCS_EXT_RGB: + case JCS_EXT_RGBX: + m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr); + break; + case JCS_EXT_BGR: + case JCS_EXT_BGRX: + m->color_transform = HWY_DYNAMIC_DISPATCH(BGRToYCbCr); + break; + case JCS_EXT_XRGB: + m->color_transform = HWY_DYNAMIC_DISPATCH(ARGBToYCbCr); + break; + case JCS_EXT_XBGR: + m->color_transform = HWY_DYNAMIC_DISPATCH(ABGRToYCbCr); + break; +#endif +#ifdef JCS_ALPHA_EXTENSIONS + case JCS_EXT_RGBA: + m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr); + break; + case JCS_EXT_BGRA: + m->color_transform = HWY_DYNAMIC_DISPATCH(BGRToYCbCr); + break; + case JCS_EXT_ARGB: + m->color_transform = HWY_DYNAMIC_DISPATCH(ARGBToYCbCr); + break; + case JCS_EXT_ABGR: + m->color_transform = HWY_DYNAMIC_DISPATCH(ABGRToYCbCr); + break; +#endif + default:; // Nothing to do. + } + } + if (m->color_transform == nullptr) { // TODO(szabadka) Support more color transforms. 
JPEGLI_ERROR("Unsupported color transform %d -> %d", cinfo->in_color_space, @@ -257,18 +412,123 @@ void ChooseColorTransform(j_decompress_ptr cinfo) { m->color_transform = nullptr; if (cinfo->jpeg_color_space == JCS_GRAYSCALE) { - if (cinfo->out_color_space == JCS_RGB) { - m->color_transform = GrayscaleToRGB; + switch (cinfo->out_color_space) { + case JCS_RGB: + m->color_transform = GrayscaleToRGB; + break; +#ifdef JCS_EXTENSIONS + case JCS_EXT_RGB: + case JCS_EXT_BGR: + m->color_transform = GrayscaleToRGB; + break; + case JCS_EXT_RGBX: + case JCS_EXT_BGRX: + m->color_transform = GrayscaleToRGBA; + break; + case JCS_EXT_XRGB: + case JCS_EXT_XBGR: + m->color_transform = GrayscaleToARGB; + break; +#endif +#ifdef JCS_ALPHA_EXTENSIONS + case JCS_EXT_RGBA: + case JCS_EXT_BGRA: + m->color_transform = GrayscaleToRGBA; + break; + case JCS_EXT_ARGB: + case JCS_EXT_ABGR: + m->color_transform = GrayscaleToARGB; + break; +#endif + default: + m->color_transform = nullptr; } } else if (cinfo->jpeg_color_space == JCS_RGB) { - if (cinfo->out_color_space == JCS_GRAYSCALE) { - m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr); + switch (cinfo->out_color_space) { + case JCS_GRAYSCALE: + m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr); + break; +#ifdef JCS_EXTENSIONS + case JCS_EXT_RGB: + m->color_transform = NullTransform; + break; + case JCS_EXT_BGR: + m->color_transform = RGBToBGR; + break; + case JCS_EXT_RGBX: + m->color_transform = RGBToRGBA; + break; + case JCS_EXT_BGRX: + m->color_transform = RGBToBGRA; + break; + case JCS_EXT_XRGB: + m->color_transform = RGBToARGB; + break; + case JCS_EXT_XBGR: + m->color_transform = RGBToABGR; + break; +#endif +#ifdef JCS_ALPHA_EXTENSIONS + case JCS_EXT_RGBA: + m->color_transform = RGBToRGBA; + break; + case JCS_EXT_BGRA: + m->color_transform = RGBToBGRA; + break; + case JCS_EXT_ARGB: + m->color_transform = RGBToARGB; + break; + case JCS_EXT_ABGR: + m->color_transform = RGBToABGR; + break; +#endif + default: + m->color_transform = 
nullptr; } } else if (cinfo->jpeg_color_space == JCS_YCbCr) { - if (cinfo->out_color_space == JCS_RGB) { - m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToRGB); - } else if (cinfo->out_color_space == JCS_GRAYSCALE) { - m->color_transform = NullTransform; + switch (cinfo->out_color_space) { + case JCS_GRAYSCALE: + m->color_transform = NullTransform; + break; + case JCS_RGB: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToRGB); + break; +#ifdef JCS_EXTENSIONS + case JCS_EXT_RGB: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToRGB); + break; + case JCS_EXT_BGR: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToBGR); + break; + case JCS_EXT_RGBX: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToRGBA); + break; + case JCS_EXT_BGRX: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToBGRA); + break; + case JCS_EXT_XRGB: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToARGB); + break; + case JCS_EXT_XBGR: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToABGR); + break; +#endif +#ifdef JCS_ALPHA_EXTENSIONS + case JCS_EXT_RGBA: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToRGBA); + break; + case JCS_EXT_BGRA: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToBGRA); + break; + case JCS_EXT_ARGB: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToARGB); + break; + case JCS_EXT_ABGR: + m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToABGR); + break; +#endif + default: + m->color_transform = nullptr; } } else if (cinfo->jpeg_color_space == JCS_YCCK) { if (cinfo->out_color_space == JCS_CMYK) { diff --git a/third_party/jpeg-xl/lib/jpegli/common.h b/third_party/jpeg-xl/lib/jpegli/common.h index 42487f2b89..514483afef 100644 --- a/third_party/jpeg-xl/lib/jpegli/common.h +++ b/third_party/jpeg-xl/lib/jpegli/common.h @@ -20,12 +20,7 @@ #ifndef LIB_JPEGLI_COMMON_H_ #define LIB_JPEGLI_COMMON_H_ -/* clang-format off */ -#include -#include -/* clang-format on */ - -#include "lib/jpegli/types.h" +#include "lib/jxl/base/include_jpeglib.h" // NOLINT #if 
defined(__cplusplus) || defined(c_plusplus) extern "C" { diff --git a/third_party/jpeg-xl/lib/jpegli/decode.cc b/third_party/jpeg-xl/lib/jpegli/decode.cc index 9fdf68dd18..d967b787d3 100644 --- a/third_party/jpeg-xl/lib/jpegli/decode.cc +++ b/third_party/jpeg-xl/lib/jpegli/decode.cc @@ -54,6 +54,7 @@ void InitializeImage(j_decompress_ptr cinfo) { m->found_soi_ = false; m->found_dri_ = false; m->found_sof_ = false; + m->found_sos_ = false; m->found_eoi_ = false; m->icc_index_ = 0; m->icc_total_ = 0; @@ -243,10 +244,14 @@ void PrepareForScan(j_decompress_ptr cinfo) { // Copy quantization tables into comp_info. for (int i = 0; i < cinfo->comps_in_scan; ++i) { jpeg_component_info* comp = cinfo->cur_comp_info[i]; + int quant_tbl_idx = comp->quant_tbl_no; + JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[quant_tbl_idx]; + if (!quant_table) { + JPEGLI_ERROR("Quantization table with index %d not found", quant_tbl_idx); + } if (comp->quant_table == nullptr) { comp->quant_table = Allocate(cinfo, 1, JPOOL_IMAGE); - memcpy(comp->quant_table, cinfo->quant_tbl_ptrs[comp->quant_tbl_no], - sizeof(JQUANT_TBL)); + memcpy(comp->quant_table, quant_table, sizeof(JQUANT_TBL)); } } if (cinfo->comps_in_scan == 1) { @@ -723,16 +728,36 @@ void jpegli_calc_output_dimensions(j_decompress_ptr cinfo) { } } } - if (cinfo->out_color_space == JCS_GRAYSCALE) { - cinfo->out_color_components = 1; - } else if (cinfo->out_color_space == JCS_RGB || - cinfo->out_color_space == JCS_YCbCr) { - cinfo->out_color_components = 3; - } else if (cinfo->out_color_space == JCS_CMYK || - cinfo->out_color_space == JCS_YCCK) { - cinfo->out_color_components = 4; - } else { - cinfo->out_color_components = cinfo->num_components; + switch (cinfo->out_color_space) { + case JCS_GRAYSCALE: + cinfo->out_color_components = 1; + break; + case JCS_RGB: + case JCS_YCbCr: +#ifdef JCS_EXTENSIONS + case JCS_EXT_RGB: + case JCS_EXT_BGR: +#endif + cinfo->out_color_components = 3; + break; + case JCS_CMYK: + case JCS_YCCK: +#ifdef 
JCS_EXTENSIONS + case JCS_EXT_RGBX: + case JCS_EXT_BGRX: + case JCS_EXT_XBGR: + case JCS_EXT_XRGB: +#endif +#ifdef JCS_ALPHA_EXTENSIONS + case JCS_EXT_RGBA: + case JCS_EXT_BGRA: + case JCS_EXT_ABGR: + case JCS_EXT_ARGB: +#endif + cinfo->out_color_components = 4; + break; + default: + cinfo->out_color_components = cinfo->num_components; } cinfo->output_components = cinfo->quantize_colors ? 1 : cinfo->out_color_components; diff --git a/third_party/jpeg-xl/lib/jpegli/decode.h b/third_party/jpeg-xl/lib/jpegli/decode.h index f5b099eda3..668d630586 100644 --- a/third_party/jpeg-xl/lib/jpegli/decode.h +++ b/third_party/jpeg-xl/lib/jpegli/decode.h @@ -21,6 +21,7 @@ #define LIB_JPEGLI_DECODE_H_ #include "lib/jpegli/common.h" +#include "lib/jpegli/types.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { diff --git a/third_party/jpeg-xl/lib/jpegli/decode_api_test.cc b/third_party/jpeg-xl/lib/jpegli/decode_api_test.cc index 3ecd479951..c429f0f810 100644 --- a/third_party/jpeg-xl/lib/jpegli/decode_api_test.cc +++ b/third_party/jpeg-xl/lib/jpegli/decode_api_test.cc @@ -3,17 +3,27 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#include + +#include #include #include +#include +#include +#include +#include +#include +#include #include #include "lib/jpegli/decode.h" #include "lib/jpegli/encode.h" +#include "lib/jpegli/libjpeg_test_util.h" +#include "lib/jpegli/test_params.h" #include "lib/jpegli/test_utils.h" #include "lib/jpegli/testing.h" -#include "lib/jxl/base/byte_order.h" +#include "lib/jpegli/types.h" #include "lib/jxl/base/status.h" -#include "lib/jxl/sanitizers.h" namespace jpegli { namespace { @@ -894,7 +904,9 @@ std::vector GenerateTests(bool buffered) { all_tests.push_back(config); } // Tests for color transforms. 
- for (J_COLOR_SPACE out_color_space : {JCS_RGB, JCS_GRAYSCALE}) { + for (J_COLOR_SPACE out_color_space : + {JCS_RGB, JCS_GRAYSCALE, JCS_EXT_RGB, JCS_EXT_BGR, JCS_EXT_RGBA, + JCS_EXT_BGRA, JCS_EXT_ARGB, JCS_EXT_ABGR}) { TestConfig config; config.input.xsize = config.input.ysize = 256; config.input.color_space = JCS_GRAYSCALE; @@ -903,7 +915,9 @@ std::vector GenerateTests(bool buffered) { all_tests.push_back(config); } for (J_COLOR_SPACE jpeg_color_space : {JCS_RGB, JCS_YCbCr}) { - for (J_COLOR_SPACE out_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) { + for (J_COLOR_SPACE out_color_space : + {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE, JCS_EXT_RGB, JCS_EXT_BGR, + JCS_EXT_RGBA, JCS_EXT_BGRA, JCS_EXT_ARGB, JCS_EXT_ABGR}) { if (jpeg_color_space == JCS_RGB && out_color_space == JCS_YCbCr) continue; TestConfig config; config.input.xsize = config.input.ysize = 256; @@ -1108,6 +1122,8 @@ std::vector GenerateTests(bool buffered) { TestConfig config; config.input.xsize = xsize; config.input.ysize = ysize; + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; all_tests.push_back(config); } } diff --git a/third_party/jpeg-xl/lib/jpegli/decode_internal.h b/third_party/jpeg-xl/lib/jpegli/decode_internal.h index 37dfcc4526..8455fae392 100644 --- a/third_party/jpeg-xl/lib/jpegli/decode_internal.h +++ b/third_party/jpeg-xl/lib/jpegli/decode_internal.h @@ -6,14 +6,15 @@ #ifndef LIB_JPEGLI_DECODE_INTERNAL_H_ #define LIB_JPEGLI_DECODE_INTERNAL_H_ -#include #include +#include #include #include "lib/jpegli/common.h" #include "lib/jpegli/common_internal.h" #include "lib/jpegli/huffman.h" +#include "lib/jpegli/types.h" namespace jpegli { @@ -58,6 +59,7 @@ struct jpeg_decomp_master { bool found_soi_; bool found_dri_; bool found_sof_; + bool found_sos_; bool found_eoi_; // Whether this jpeg has multiple scans (progressive or non-interleaved diff --git a/third_party/jpeg-xl/lib/jpegli/decode_marker.cc b/third_party/jpeg-xl/lib/jpegli/decode_marker.cc index 
a9ed4df329..2621ed0867 100644 --- a/third_party/jpeg-xl/lib/jpegli/decode_marker.cc +++ b/third_party/jpeg-xl/lib/jpegli/decode_marker.cc @@ -103,9 +103,6 @@ void ProcessSOF(j_decompress_ptr cinfo, const uint8_t* data, size_t len) { int quant_tbl_idx = ReadUint8(data, &pos); JPEG_VERIFY_INPUT(quant_tbl_idx, 0, NUM_QUANT_TBLS - 1); comp->quant_tbl_no = quant_tbl_idx; - if (cinfo->quant_tbl_ptrs[quant_tbl_idx] == nullptr) { - JPEGLI_ERROR("Quantization table with index %u not found", quant_tbl_idx); - } comp->quant_table = nullptr; // will be allocated after SOS marker } JPEG_VERIFY_MARKER_END(); @@ -168,6 +165,7 @@ void ProcessSOS(j_decompress_ptr cinfo, const uint8_t* data, size_t len) { if (!m->found_sof_) { JPEGLI_ERROR("Unexpected SOS marker."); } + m->found_sos_ = true; size_t pos = 2; JPEG_VERIFY_LEN(1); cinfo->comps_in_scan = ReadUint8(data, &pos); @@ -337,7 +335,7 @@ void ProcessDHT(j_decompress_ptr cinfo, const uint8_t* data, size_t len) { void ProcessDQT(j_decompress_ptr cinfo, const uint8_t* data, size_t len) { jpeg_decomp_master* m = cinfo->master; - if (m->found_sof_) { + if (m->found_sos_) { JPEGLI_ERROR("Updating quant tables between scans is not supported."); } size_t pos = 2; diff --git a/third_party/jpeg-xl/lib/jpegli/encode.cc b/third_party/jpeg-xl/lib/jpegli/encode.cc index 5326f2cb0f..6cfd54ad30 100644 --- a/third_party/jpeg-xl/lib/jpegli/encode.cc +++ b/third_party/jpeg-xl/lib/jpegli/encode.cc @@ -283,15 +283,15 @@ void ProcessCompressionParams(j_compress_ptr cinfo) { JPEGLI_ERROR("Invalid sampling factor %d x %d", comp->h_samp_factor, comp->v_samp_factor); } + if (cinfo->num_components == 1) { + // Force samp factors to 1x1 for single-component images. 
+ comp->h_samp_factor = comp->v_samp_factor = 1; + } cinfo->max_h_samp_factor = std::max(comp->h_samp_factor, cinfo->max_h_samp_factor); cinfo->max_v_samp_factor = std::max(comp->v_samp_factor, cinfo->max_v_samp_factor); } - if (cinfo->num_components == 1 && - (cinfo->max_h_samp_factor != 1 || cinfo->max_v_samp_factor != 1)) { - JPEGLI_ERROR("Sampling is not supported for simgle component image."); - } size_t iMCU_width = DCTSIZE * cinfo->max_h_samp_factor; size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor; size_t total_iMCU_cols = DivCeil(cinfo->image_width, iMCU_width); @@ -713,18 +713,31 @@ void jpegli_set_defaults(j_compress_ptr cinfo) { void jpegli_default_colorspace(j_compress_ptr cinfo) { CheckState(cinfo, jpegli::kEncStart); + if (cinfo->in_color_space == JCS_RGB && cinfo->master->xyb_mode) { + jpegli_set_colorspace(cinfo, JCS_RGB); + return; + } switch (cinfo->in_color_space) { case JCS_GRAYSCALE: jpegli_set_colorspace(cinfo, JCS_GRAYSCALE); break; - case JCS_RGB: { - if (cinfo->master->xyb_mode) { - jpegli_set_colorspace(cinfo, JCS_RGB); - } else { - jpegli_set_colorspace(cinfo, JCS_YCbCr); - } + case JCS_RGB: +#ifdef JCS_EXTENSIONS + case JCS_EXT_RGB: + case JCS_EXT_BGR: + case JCS_EXT_RGBX: + case JCS_EXT_BGRX: + case JCS_EXT_XRGB: + case JCS_EXT_XBGR: +#endif +#if JCS_ALPHA_EXTENSIONS + case JCS_EXT_RGBA: + case JCS_EXT_BGRA: + case JCS_EXT_ARGB: + case JCS_EXT_ABGR: +#endif + jpegli_set_colorspace(cinfo, JCS_YCbCr); break; - } case JCS_YCbCr: jpegli_set_colorspace(cinfo, JCS_YCbCr); break; @@ -806,6 +819,11 @@ void jpegli_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace) { cinfo->comp_info[2].quant_tbl_no = 1; cinfo->comp_info[1].dc_tbl_no = cinfo->comp_info[1].ac_tbl_no = 1; cinfo->comp_info[2].dc_tbl_no = cinfo->comp_info[2].ac_tbl_no = 1; + // Use chroma subsampling by default + cinfo->comp_info[0].h_samp_factor = cinfo->comp_info[0].v_samp_factor = 2; + if (colorspace == JCS_YCCK) { + cinfo->comp_info[3].h_samp_factor = 
cinfo->comp_info[3].v_samp_factor = 2; + } } } diff --git a/third_party/jpeg-xl/lib/jpegli/encode.h b/third_party/jpeg-xl/lib/jpegli/encode.h index ed34838450..33de674471 100644 --- a/third_party/jpeg-xl/lib/jpegli/encode.h +++ b/third_party/jpeg-xl/lib/jpegli/encode.h @@ -21,6 +21,7 @@ #define LIB_JPEGLI_ENCODE_H_ #include "lib/jpegli/common.h" +#include "lib/jpegli/types.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { diff --git a/third_party/jpeg-xl/lib/jpegli/encode_api_test.cc b/third_party/jpeg-xl/lib/jpegli/encode_api_test.cc index 2978b3f35d..81b1b25bef 100644 --- a/third_party/jpeg-xl/lib/jpegli/encode_api_test.cc +++ b/third_party/jpeg-xl/lib/jpegli/encode_api_test.cc @@ -4,14 +4,23 @@ // license that can be found in the LICENSE file. #include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "lib/jpegli/encode.h" -#include "lib/jpegli/error.h" +#include "lib/jpegli/libjpeg_test_util.h" +#include "lib/jpegli/test_params.h" #include "lib/jpegli/test_utils.h" #include "lib/jpegli/testing.h" -#include "lib/jxl/sanitizers.h" +#include "lib/jpegli/types.h" +#include "lib/jxl/base/status.h" namespace jpegli { namespace { @@ -372,6 +381,8 @@ std::vector GenerateTests() { { TestConfig config; config.jparams.quality = 100; + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; config.max_bpp = 6.6; config.max_dist = 0.6; all_tests.push_back(config); @@ -510,17 +521,23 @@ std::vector GenerateTests() { config.jparams.libjpeg_mode = true; config.max_bpp = 2.1; config.max_dist = 1.7; + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; all_tests.push_back(config); } - for (J_COLOR_SPACE in_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) { + for (J_COLOR_SPACE in_color_space : + {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE, JCS_EXT_RGB, JCS_EXT_BGR, + JCS_EXT_RGBA, JCS_EXT_BGRA, JCS_EXT_ARGB, JCS_EXT_ABGR}) { for (J_COLOR_SPACE jpeg_color_space : 
{JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) { - if (jpeg_color_space == JCS_RGB && in_color_space == JCS_YCbCr) continue; + if (jpeg_color_space == JCS_RGB && in_color_space >= JCS_YCbCr) continue; TestConfig config; config.input.xsize = config.input.ysize = 256; config.input.color_space = in_color_space; config.jparams.set_jpeg_colorspace = true; config.jparams.jpeg_color_space = jpeg_color_space; + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; config.max_bpp = jpeg_color_space == JCS_RGB ? 4.5 : 1.85; config.max_dist = jpeg_color_space == JCS_RGB ? 1.4 : 2.05; all_tests.push_back(config); @@ -536,6 +553,8 @@ std::vector GenerateTests() { config.jparams.set_jpeg_colorspace = true; config.jparams.jpeg_color_space = jpeg_color_space; } + config.jparams.h_sampling = {1, 1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1, 1}; config.max_bpp = jpeg_color_space == JCS_CMYK ? 4.0 : 3.6; config.max_dist = jpeg_color_space == JCS_CMYK ? 1.2 : 1.5; all_tests.push_back(config); @@ -546,6 +565,8 @@ std::vector GenerateTests() { config.input.color_space = JCS_YCbCr; config.max_bpp = 1.6; config.max_dist = 1.35; + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; all_tests.push_back(config); } for (bool xyb : {false, true}) { @@ -596,6 +617,8 @@ std::vector GenerateTests() { table.add_raw = add_raw; table.Generate(); config.jparams.optimize_coding = 1; + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; config.jparams.quant_tables.push_back(table); config.jparams.quant_indexes = {0, 0, 0}; float q = (type == 0 ? 
16 : type) * scale * 0.01f; @@ -614,6 +637,8 @@ std::vector GenerateTests() { config.input.ysize = 256; config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1, (qidx >> 0) & 1}; + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; config.max_bpp = 2.25; config.max_dist = 2.8; all_tests.push_back(config); @@ -626,6 +651,8 @@ std::vector GenerateTests() { config.input.ysize = 256; config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1, (qidx >> 0) & 1}; + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; CustomQuantTable table; table.slot_idx = slot_idx; table.Generate(); @@ -643,6 +670,10 @@ std::vector GenerateTests() { config.jparams.xyb_mode = xyb; config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1, (qidx >> 0) & 1}; + if (!xyb) { + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; + } { CustomQuantTable table; table.slot_idx = 0; @@ -667,6 +698,10 @@ std::vector GenerateTests() { config.input.ysize = 256; config.jparams.xyb_mode = xyb; config.jparams.quant_indexes = {0, 1, 2}; + if (!xyb) { + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; + } { CustomQuantTable table; table.slot_idx = 0; @@ -738,6 +773,8 @@ std::vector GenerateTests() { } config.jparams.progressive_mode = 0; config.jparams.optimize_coding = 0; + config.jparams.h_sampling = {1, 1, 1}; + config.jparams.v_sampling = {1, 1, 1}; config.max_bpp = 1.85; config.max_dist = 2.05; if (input_mode == COEFFICIENTS) { diff --git a/third_party/jpeg-xl/lib/jpegli/error_handling_test.cc b/third_party/jpeg-xl/lib/jpegli/error_handling_test.cc index bcd7355124..582c6b170b 100644 --- a/third_party/jpeg-xl/lib/jpegli/error_handling_test.cc +++ b/third_party/jpeg-xl/lib/jpegli/error_handling_test.cc @@ -3,12 +3,20 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
+#include +#include +#include +#include +#include + +#include "lib/jpegli/common.h" #include "lib/jpegli/decode.h" #include "lib/jpegli/encode.h" -#include "lib/jpegli/error.h" +#include "lib/jpegli/libjpeg_test_util.h" +#include "lib/jpegli/test_params.h" #include "lib/jpegli/test_utils.h" #include "lib/jpegli/testing.h" -#include "lib/jxl/sanitizers.h" +#include "lib/jxl/base/status.h" namespace jpegli { namespace { @@ -996,6 +1004,9 @@ TEST(EncoderErrorHandlingTest, AddOnTableNoStringParam) { const uint8_t kCompressed0[] = { // SOI 0xff, 0xd8, // + // SOF + 0xff, 0xc0, 0x00, 0x0b, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01, // + 0x01, 0x11, 0x00, // // DQT 0xff, 0xdb, 0x00, 0x43, 0x00, 0x03, 0x02, 0x02, 0x03, 0x02, // 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x03, 0x03, 0x04, 0x05, // @@ -1004,9 +1015,6 @@ const uint8_t kCompressed0[] = { 0x0e, 0x12, 0x10, 0x0d, 0x0e, 0x11, 0x0e, 0x0b, 0x0b, 0x10, // 0x16, 0x10, 0x11, 0x13, 0x14, 0x15, 0x15, 0x15, 0x0c, 0x0f, // 0x17, 0x18, 0x16, 0x14, 0x18, 0x12, 0x14, 0x15, 0x14, // - // SOF - 0xff, 0xc0, 0x00, 0x0b, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01, // - 0x01, 0x11, 0x00, // // DHT 0xff, 0xc4, 0x00, 0xd2, 0x00, 0x00, 0x01, 0x05, 0x01, 0x01, // 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // @@ -1039,8 +1047,8 @@ const uint8_t kCompressed0[] = { }; const size_t kLen0 = sizeof(kCompressed0); -const size_t kDQTOffset = 2; -const size_t kSOFOffset = 71; +const size_t kSOFOffset = 2; +const size_t kDQTOffset = 15; const size_t kDHTOffset = 84; const size_t kSOSOffset = 296; diff --git a/third_party/jpeg-xl/lib/jpegli/input_suspension_test.cc b/third_party/jpeg-xl/lib/jpegli/input_suspension_test.cc index 6546b7b087..dc5aee2fc5 100644 --- a/third_party/jpeg-xl/lib/jpegli/input_suspension_test.cc +++ b/third_party/jpeg-xl/lib/jpegli/input_suspension_test.cc @@ -3,16 +3,24 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-#include +#include + +#include +#include #include +#include +#include +#include +#include +#include #include #include "lib/jpegli/decode.h" +#include "lib/jpegli/libjpeg_test_util.h" +#include "lib/jpegli/test_params.h" #include "lib/jpegli/test_utils.h" #include "lib/jpegli/testing.h" -#include "lib/jxl/base/byte_order.h" #include "lib/jxl/base/status.h" -#include "lib/jxl/sanitizers.h" namespace jpegli { namespace { diff --git a/third_party/jpeg-xl/lib/jpegli/libjpeg_test_util.cc b/third_party/jpeg-xl/lib/jpegli/libjpeg_test_util.cc index 020adf5e9e..d34ec7e999 100644 --- a/third_party/jpeg-xl/lib/jpegli/libjpeg_test_util.cc +++ b/third_party/jpeg-xl/lib/jpegli/libjpeg_test_util.cc @@ -5,12 +5,7 @@ #include "lib/jpegli/libjpeg_test_util.h" -/* clang-format off */ -#include -#include -#include -/* clang-format on */ - +#include "lib/jxl/base/include_jpeglib.h" // NOLINT #include "lib/jxl/sanitizers.h" namespace jpegli { diff --git a/third_party/jpeg-xl/lib/jpegli/output_suspension_test.cc b/third_party/jpeg-xl/lib/jpegli/output_suspension_test.cc index 3cb2fd3ee4..44d63fdcbb 100644 --- a/third_party/jpeg-xl/lib/jpegli/output_suspension_test.cc +++ b/third_party/jpeg-xl/lib/jpegli/output_suspension_test.cc @@ -3,7 +3,18 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
+#include +#include +#include +#include +#include +#include +#include +#include + #include "lib/jpegli/encode.h" +#include "lib/jpegli/libjpeg_test_util.h" +#include "lib/jpegli/test_params.h" #include "lib/jpegli/test_utils.h" #include "lib/jpegli/testing.h" @@ -130,6 +141,7 @@ TEST_P(OutputSuspensionTestParam, RawData) { cinfo.input_components = input.components; cinfo.in_color_space = JCS_YCbCr; jpegli_set_defaults(&cinfo); + cinfo.comp_info[0].h_samp_factor = config.jparams.h_sampling[0]; cinfo.comp_info[0].v_samp_factor = config.jparams.v_sampling[0]; jpegli_set_progressive_level(&cinfo, 0); cinfo.optimize_coding = FALSE; diff --git a/third_party/jpeg-xl/lib/jpegli/source_manager_test.cc b/third_party/jpeg-xl/lib/jpegli/source_manager_test.cc index a513b7063b..2d49ac16ba 100644 --- a/third_party/jpeg-xl/lib/jpegli/source_manager_test.cc +++ b/third_party/jpeg-xl/lib/jpegli/source_manager_test.cc @@ -3,14 +3,13 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include #include #include #include "lib/jpegli/decode.h" +#include "lib/jpegli/libjpeg_test_util.h" #include "lib/jpegli/test_utils.h" #include "lib/jpegli/testing.h" -#include "lib/jxl/base/status.h" namespace jpegli { namespace { diff --git a/third_party/jpeg-xl/lib/jpegli/streaming_test.cc b/third_party/jpeg-xl/lib/jpegli/streaming_test.cc index 1f19dc2045..29a224385f 100644 --- a/third_party/jpeg-xl/lib/jpegli/streaming_test.cc +++ b/third_party/jpeg-xl/lib/jpegli/streaming_test.cc @@ -3,8 +3,18 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
+#include +#include +#include +#include +#include +#include +#include +#include + #include "lib/jpegli/decode.h" #include "lib/jpegli/encode.h" +#include "lib/jpegli/test_params.h" #include "lib/jpegli/test_utils.h" #include "lib/jpegli/testing.h" diff --git a/third_party/jpeg-xl/lib/jpegli/test_utils.cc b/third_party/jpeg-xl/lib/jpegli/test_utils.cc index db5a30e8dc..5315c692a1 100644 --- a/third_party/jpeg-xl/lib/jpegli/test_utils.cc +++ b/third_party/jpeg-xl/lib/jpegli/test_utils.cc @@ -8,6 +8,7 @@ #include #include #include +#include #include "lib/jpegli/decode.h" #include "lib/jpegli/encode.h" @@ -171,6 +172,18 @@ std::string ColorSpaceName(J_COLOR_SPACE colorspace) { return "CMYK"; case JCS_YCCK: return "YCCK"; + case JCS_EXT_RGB: + return "EXT_RGB"; + case JCS_EXT_BGR: + return "EXT_BGR"; + case JCS_EXT_RGBA: + return "EXT_RGBA"; + case JCS_EXT_BGRA: + return "EXT_BGRA"; + case JCS_EXT_ARGB: + return "EXT_ARGB"; + case JCS_EXT_ABGR: + return "EXT_ABGR"; default: return ""; } @@ -301,9 +314,12 @@ std::ostream& operator<<(std::ostream& os, const CompressParams& jparams) { void SetNumChannels(J_COLOR_SPACE colorspace, size_t* channels) { if (colorspace == JCS_GRAYSCALE) { *channels = 1; - } else if (colorspace == JCS_RGB || colorspace == JCS_YCbCr) { + } else if (colorspace == JCS_RGB || colorspace == JCS_YCbCr || + colorspace == JCS_EXT_RGB || colorspace == JCS_EXT_BGR) { *channels = 3; - } else if (colorspace == JCS_CMYK || colorspace == JCS_YCCK) { + } else if (colorspace == JCS_CMYK || colorspace == JCS_YCCK || + colorspace == JCS_EXT_RGBA || colorspace == JCS_EXT_BGRA || + colorspace == JCS_EXT_ARGB || colorspace == JCS_EXT_ABGR) { *channels = 4; } else if (colorspace == JCS_UNKNOWN) { JXL_CHECK(*channels <= 4); @@ -330,7 +346,28 @@ void ConvertPixel(const uint8_t* input_rgb, uint8_t* out, if (colorspace == JCS_GRAYSCALE) { const float Y = 0.299f * r + 0.587f * g + 0.114f * b; out8[0] = static_cast(std::round(Y * kMul)); - } else if (colorspace == JCS_RGB 
|| colorspace == JCS_UNKNOWN) { + } else if (colorspace == JCS_RGB || colorspace == JCS_EXT_RGB || + colorspace == JCS_EXT_RGBA) { + out8[0] = input_rgb[0]; + out8[1] = input_rgb[1]; + out8[2] = input_rgb[2]; + if (colorspace == JCS_EXT_RGBA) out8[3] = 255; + } else if (colorspace == JCS_EXT_BGR || colorspace == JCS_EXT_BGRA) { + out8[2] = input_rgb[0]; + out8[1] = input_rgb[1]; + out8[0] = input_rgb[2]; + if (colorspace == JCS_EXT_BGRA) out8[3] = 255; + } else if (colorspace == JCS_EXT_ABGR) { + out8[0] = 255; + out8[3] = input_rgb[0]; + out8[2] = input_rgb[1]; + out8[1] = input_rgb[2]; + } else if (colorspace == JCS_EXT_ARGB) { + out8[0] = 255; + out8[1] = input_rgb[0]; + out8[2] = input_rgb[1]; + out8[3] = input_rgb[2]; + } else if (colorspace == JCS_UNKNOWN) { for (size_t c = 0; c < num_channels; ++c) { out8[c] = input_rgb[std::min(2, c)]; } @@ -390,9 +427,23 @@ void ConvertPixel(const uint8_t* input_rgb, uint8_t* out, void ConvertToGrayscale(TestImage* img) { if (img->color_space == JCS_GRAYSCALE) return; JXL_CHECK(img->data_type == JPEGLI_TYPE_UINT8); - for (size_t i = 0; i < img->pixels.size(); i += 3) { - if (img->color_space == JCS_RGB) { - ConvertPixel(&img->pixels[i], &img->pixels[i / 3], JCS_GRAYSCALE, 1); + bool rgb_pre_alpha = + img->color_space == JCS_EXT_ARGB || img->color_space == JCS_EXT_ABGR; + bool rgb_post_alpha = + img->color_space == JCS_EXT_RGBA || img->color_space == JCS_EXT_BGRA; + bool rgb_alpha = rgb_pre_alpha || rgb_post_alpha; + bool is_rgb = img->color_space == JCS_RGB || + img->color_space == JCS_EXT_RGB || + img->color_space == JCS_EXT_BGR || rgb_alpha; + bool switch_br = img->color_space == JCS_EXT_BGR || + img->color_space == JCS_EXT_ABGR || + img->color_space == JCS_EXT_BGRA; + size_t stride = rgb_alpha ? 4 : 3; + size_t offset = rgb_pre_alpha ? 
1 : 0; + for (size_t i = offset; i < img->pixels.size(); i += stride) { + if (is_rgb) { + if (switch_br) std::swap(img->pixels[i], img->pixels[i + 2]); + ConvertPixel(&img->pixels[i], &img->pixels[i / stride], JCS_GRAYSCALE, 1); } else if (img->color_space == JCS_YCbCr) { img->pixels[i / 3] = img->pixels[i]; } diff --git a/third_party/jpeg-xl/lib/jpegli/test_utils.h b/third_party/jpeg-xl/lib/jpegli/test_utils.h index 132cfd042a..22c620c46c 100644 --- a/third_party/jpeg-xl/lib/jpegli/test_utils.h +++ b/third_party/jpeg-xl/lib/jpegli/test_utils.h @@ -6,22 +6,15 @@ #ifndef LIB_JPEGLI_TEST_UTILS_H_ #define LIB_JPEGLI_TEST_UTILS_H_ -#include -#include - -#include +#include +#include #include #include -/* clang-format off */ -#include -#include -#include -/* clang-format on */ - -#include "lib/jpegli/common.h" -#include "lib/jpegli/libjpeg_test_util.h" #include "lib/jpegli/test_params.h" +#include "lib/jpegli/types.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/include_jpeglib.h" // NOLINT namespace jpegli { @@ -127,4 +120,15 @@ void VerifyOutputImage(const TestImage& input, const TestImage& output, } // namespace jpegli +#if !defined(FUZZ_TEST) +struct FuzzTestSink { + template + FuzzTestSink WithSeeds(F) { + return *this; + } +}; +#define FUZZ_TEST(A, B) \ + const JXL_MAYBE_UNUSED FuzzTestSink unused##A##B = FuzzTestSink() +#endif + #endif // LIB_JPEGLI_TEST_UTILS_H_ diff --git a/third_party/jpeg-xl/lib/jpegli/transcode_api_test.cc b/third_party/jpeg-xl/lib/jpegli/transcode_api_test.cc index 13c81a1119..413d5ae996 100644 --- a/third_party/jpeg-xl/lib/jpegli/transcode_api_test.cc +++ b/third_party/jpeg-xl/lib/jpegli/transcode_api_test.cc @@ -3,10 +3,19 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
+#include +#include +#include +#include +#include +#include +#include #include #include "lib/jpegli/decode.h" #include "lib/jpegli/encode.h" +#include "lib/jpegli/libjpeg_test_util.h" +#include "lib/jpegli/test_params.h" #include "lib/jpegli/test_utils.h" #include "lib/jpegli/testing.h" #include "lib/jxl/base/status.h" diff --git a/third_party/jpeg-xl/lib/jxl/ac_context.h b/third_party/jpeg-xl/lib/jxl/ac_context.h index a2b9e046d1..6529a9bb88 100644 --- a/third_party/jpeg-xl/lib/jxl/ac_context.h +++ b/third_party/jpeg-xl/lib/jxl/ac_context.h @@ -62,7 +62,8 @@ static JXL_INLINE size_t ZeroDensityContext(size_t nonzeros_left, size_t k, size_t covered_blocks, size_t log2_covered_blocks, size_t prev) { - JXL_DASSERT((1u << log2_covered_blocks) == covered_blocks); + JXL_DASSERT((static_cast(1) << log2_covered_blocks) == + covered_blocks); nonzeros_left = (nonzeros_left + covered_blocks - 1) >> log2_covered_blocks; k >>= log2_covered_blocks; JXL_DASSERT(k > 0); @@ -109,7 +110,8 @@ struct BlockCtxMap { // Non-zero context is based on number of non-zeros and block context. // For better clustering, contexts with same number of non-zeros are grouped. constexpr uint32_t ZeroDensityContextsOffset(uint32_t block_ctx) const { - return num_ctxs * kNonZeroBuckets + kZeroDensityContextCount * block_ctx; + return static_cast(num_ctxs * kNonZeroBuckets + + kZeroDensityContextCount * block_ctx); } // Context map for AC coefficients consists of 2 blocks: @@ -121,7 +123,8 @@ struct BlockCtxMap { // number of non-zeros left and // index in scan order constexpr uint32_t NumACContexts() const { - return num_ctxs * (kNonZeroBuckets + kZeroDensityContextCount); + return static_cast(num_ctxs * + (kNonZeroBuckets + kZeroDensityContextCount)); } // Non-zero context is based on number of non-zeros and block context. 
@@ -134,7 +137,7 @@ struct BlockCtxMap { } else { ctx = 4 + non_zeros / 2; } - return ctx * num_ctxs + block_ctx; + return static_cast(ctx * num_ctxs + block_ctx); } BlockCtxMap() { diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy.h b/third_party/jpeg-xl/lib/jxl/ac_strategy.h index 9e5917ff1b..fd40b0ced8 100644 --- a/third_party/jpeg-xl/lib/jxl/ac_strategy.h +++ b/third_party/jpeg-xl/lib/jxl/ac_strategy.h @@ -11,9 +11,12 @@ #include // kMaxVectorSize +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/rect.h" #include "lib/jxl/base/status.h" #include "lib/jxl/coeff_order_fwd.h" #include "lib/jxl/frame_dimensions.h" +#include "lib/jxl/image.h" #include "lib/jxl/image_ops.h" // Defines the different kinds of transforms, and heuristics to choose between diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc b/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc index b1d9103466..dc25c89898 100644 --- a/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc +++ b/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc @@ -5,16 +5,14 @@ #include "lib/jxl/ac_strategy.h" -#include - -#include +#include +#include #include #include // HWY_ALIGN_MAX #include -#include #include "lib/jxl/base/random.h" -#include "lib/jxl/dct_scales.h" +#include "lib/jxl/coeff_order_fwd.h" #include "lib/jxl/dec_transforms_testonly.h" #include "lib/jxl/enc_transforms.h" #include "lib/jxl/simd_util.h" diff --git a/third_party/jpeg-xl/lib/jxl/ans_common.h b/third_party/jpeg-xl/lib/jxl/ans_common.h index 44b8e3fba1..8236bb20ec 100644 --- a/third_party/jpeg-xl/lib/jxl/ans_common.h +++ b/third_party/jpeg-xl/lib/jxl/ans_common.h @@ -6,23 +6,24 @@ #ifndef LIB_JXL_ANS_COMMON_H_ #define LIB_JXL_ANS_COMMON_H_ -#include - #include +#include +#include +#include +#include #include // Prefetch #include #include "lib/jxl/ans_params.h" #include "lib/jxl/base/byte_order.h" #include "lib/jxl/base/compiler_specific.h" -#include "lib/jxl/base/status.h" namespace jxl { // Returns the precision (number of 
bits) that should be used to store // a histogram count such that Log2Floor(count) == logcount. -static JXL_INLINE uint32_t GetPopulationCountPrecision(uint32_t logcount, - uint32_t shift) { +static JXL_MAYBE_UNUSED JXL_INLINE uint32_t +GetPopulationCountPrecision(uint32_t logcount, uint32_t shift) { int32_t r = std::min( logcount, static_cast(shift) - static_cast((ANS_LOG_TAB_SIZE - logcount) >> 1)); diff --git a/third_party/jpeg-xl/lib/jxl/ans_test.cc b/third_party/jpeg-xl/lib/jxl/ans_test.cc index 5d6a5ef090..83a2e732f8 100644 --- a/third_party/jpeg-xl/lib/jxl/ans_test.cc +++ b/third_party/jpeg-xl/lib/jxl/ans_test.cc @@ -10,11 +10,10 @@ #include "lib/jxl/ans_params.h" #include "lib/jxl/base/random.h" -#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" #include "lib/jxl/dec_ans.h" #include "lib/jxl/dec_bit_reader.h" #include "lib/jxl/enc_ans.h" -#include "lib/jxl/enc_aux_out.h" #include "lib/jxl/enc_bit_writer.h" #include "lib/jxl/testing.h" diff --git a/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h index 702ff8e058..52f88c50f8 100644 --- a/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h +++ b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h @@ -8,7 +8,6 @@ // Macros for compiler version + nonstandard keywords, e.g. __builtin_expect. -#include #include #include "lib/jxl/base/sanitizer_definitions.h" @@ -97,6 +96,11 @@ #define JXL_UNLIKELY(expr) __builtin_expect(!!(expr), 0) #endif +#if JXL_COMPILER_MSVC +#include +using ssize_t = intptr_t; +#endif + // Returns a void* pointer which the compiler then assumes is N-byte aligned. 
// Example: float* JXL_RESTRICT aligned = (float*)JXL_ASSUME_ALIGNED(in, 32); // @@ -150,8 +154,4 @@ #define JXL_FORMAT(idx_fmt, idx_arg) #endif -#if JXL_COMPILER_MSVC -using ssize_t = intptr_t; -#endif - #endif // LIB_JXL_BASE_COMPILER_SPECIFIC_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/exif.h b/third_party/jpeg-xl/lib/jxl/base/exif.h index a3574a16ff..acaa1a1ce4 100644 --- a/third_party/jpeg-xl/lib/jxl/base/exif.h +++ b/third_party/jpeg-xl/lib/jxl/base/exif.h @@ -3,8 +3,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef LIB_JXL_EXIF_H_ -#define LIB_JXL_EXIF_H_ +#ifndef LIB_JXL_BASE_EXIF_H_ +#define LIB_JXL_BASE_EXIF_H_ // Basic parsing of Exif (just enough for the render-impacting things // like orientation) @@ -87,4 +87,4 @@ JXL_INLINE void InterpretExif(const std::vector& exif, } // namespace jxl -#endif // LIB_JXL_EXIF_H_ +#endif // LIB_JXL_BASE_EXIF_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/include_jpeglib.h b/third_party/jpeg-xl/lib/jxl/base/include_jpeglib.h new file mode 100644 index 0000000000..f72d13d04b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/include_jpeglib.h @@ -0,0 +1,20 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_INCLUDE_JPEGLIB_H_ +#define LIB_JXL_BASE_INCLUDE_JPEGLIB_H_ + +// Using this header ensures that includes go in the right order, +// not alphabetically sorted. 
+ +// NOLINTBEGIN +/* clang-format off */ +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep +/* clang-format on */ +// NOLINTEND + +#endif // LIB_JXL_BASE_INCLUDE_JPEGLIB_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/matrix_ops.h b/third_party/jpeg-xl/lib/jxl/base/matrix_ops.h index cde6a64b1e..e1f8753932 100644 --- a/third_party/jpeg-xl/lib/jxl/base/matrix_ops.h +++ b/third_party/jpeg-xl/lib/jxl/base/matrix_ops.h @@ -3,8 +3,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef LIB_JXL_MATRIX_OPS_H_ -#define LIB_JXL_MATRIX_OPS_H_ +#ifndef LIB_JXL_BASE_MATRIX_OPS_H_ +#define LIB_JXL_BASE_MATRIX_OPS_H_ // 3x3 matrix operations. @@ -83,4 +83,4 @@ Status Inv3x3Matrix(Matrix& matrix) { } // namespace jxl -#endif // LIB_JXL_MATRIX_OPS_H_ +#endif // LIB_JXL_BASE_MATRIX_OPS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/rect.h b/third_party/jpeg-xl/lib/jxl/base/rect.h new file mode 100644 index 0000000000..666c3d73ec --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/rect.h @@ -0,0 +1,194 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_RECT_H_ +#define LIB_JXL_BASE_RECT_H_ + +#include +#include +#include +#include +#include +#include +#include // std::move + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Rectangular region in image(s). Factoring this out of Image instead of +// shifting the pointer by x0/y0 allows this to apply to multiple images with +// different resolutions (e.g. color transform and quantization field). +// Can compare using SameSize(rect1, rect2). +template +class RectT { + public: + // Most windows are xsize_max * ysize_max, except those on the borders where + // begin + size_max > end. 
+ constexpr RectT(T xbegin, T ybegin, size_t xsize_max, size_t ysize_max, + T xend, T yend) + : x0_(xbegin), + y0_(ybegin), + xsize_(ClampedSize(xbegin, xsize_max, xend)), + ysize_(ClampedSize(ybegin, ysize_max, yend)) {} + + // Construct with origin and known size (typically from another Rect). + constexpr RectT(T xbegin, T ybegin, size_t xsize, size_t ysize) + : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {} + + // Construct a rect that covers a whole image/plane/ImageBundle etc. + template + explicit RectT(const ImageT& image) + : RectT(0, 0, image.xsize(), image.ysize()) {} + + RectT() : RectT(0, 0, 0, 0) {} + + RectT(const RectT&) = default; + RectT& operator=(const RectT&) = default; + + // Construct a subrect that resides in an image/plane/ImageBundle etc. + template + RectT Crop(const ImageT& image) const { + return Intersection(RectT(image)); + } + + // Construct a subrect that resides in the [0, ysize) x [0, xsize) region of + // the current rect. + RectT Crop(size_t area_xsize, size_t area_ysize) const { + return Intersection(RectT(0, 0, area_xsize, area_ysize)); + } + + // Returns a rect that only contains `num` lines with offset `y` from `y0()`. + RectT Lines(size_t y, size_t num) const { + JXL_DASSERT(y + num <= ysize_); + return RectT(x0_, y0_ + y, xsize_, num); + } + + RectT Line(size_t y) const { return Lines(y, 1); } + + JXL_MUST_USE_RESULT RectT Intersection(const RectT& other) const { + return RectT(std::max(x0_, other.x0_), std::max(y0_, other.y0_), xsize_, + ysize_, std::min(x1(), other.x1()), + std::min(y1(), other.y1())); + } + + JXL_MUST_USE_RESULT RectT Translate(int64_t x_offset, + int64_t y_offset) const { + return RectT(x0_ + x_offset, y0_ + y_offset, xsize_, ysize_); + } + + template